Menu

Diff of /trunk/python/python-tokenizer.php [r6] .. [r7]  Maximize  Restore

Switch to side-by-side view

--- a/trunk/python/python-tokenizer.php
+++ b/trunk/python/python-tokenizer.php
@@ -102,6 +102,8 @@
 -->
 
 <?php
+	$python_tokens = array("for","in");
+
 	function group($re) {
 		if (is_array($re)) {
 			return "(".implode("|",$re).")";
@@ -115,6 +117,12 @@
 	function maybe($re) {
 		return group($re)."?";	
 	}
+	function token($name,$value=null) {
+		return array(
+			"name"  => $name,
+			"value" => $value
+		);
+	}
 
 	$preg_whitespace 	= "[ \f\t]*";
 	$preg_comment 		= "#[^\r\n]*";
@@ -122,36 +130,61 @@
 	$preg_name 			= "[a-zA-Z_]\w*";
 	
 	function python_tokenize($pycode) {	
+		global $python_tokens;
+		
 		$line = $pycode;
 
+		$indent = 0;
+		
 		$out = array();
 		while (strlen($line)) {
-			if (preg_match('/^[ \t]+(\.[0-9]*)?/', $line, $regs)) {
+			if (preg_match('/^[ ]+/', $line, $regs)) {
 				# ignored
 				#$out[] = $regs[0];
 				$line = substr($line, strlen($regs[0]));
-			
+	
+			} else if (preg_match('/^[\t]+/', $line, $regs)) {
+				# indent/dedent
+				#$out[] = $regs[0];
+				$d = strlen($regs[0]) - $indent;
+				switch($d) {
+					case  0: break;
+					case +1: $out[]=token("INDENT"); break;
+					case -1: $out[]=token("DEDENT"); break;
+					default: die("Expectend indented block");
+				}
+				$line = substr($line, strlen($regs[0]));
+	
+			} else if (preg_match('/^:/', $line, $regs)) {
+				# colon
+				$out[] = token("COLON");
+				$line = substr($line, strlen($regs[0]));
+	
 			} else if (preg_match('/^\n/', $line, $regs)) {
 				# newline
-				$out[] = "__NEWLINE__";
+				$out[] = token("NEWLINE");
 				$line = substr($line, strlen($regs[0]));
 
 			} else if (preg_match('/^[0-9]+(\.[0-9]*)?/', $line, $regs)) {
 				# number
-				$out[] = $regs[0];
+				$out[] = token("NUMBER",$regs[0]);
 				$line = substr($line, strlen($regs[0]));
 
-			} else if (preg_match('/^"([^"]*)"/', $line, $regs)) {
+			} else if (preg_match('/^"([^"]*((\\\\")*[^"]*))"/', $line, $regs)) {
 				# double quoted string
 				#var_dump($regs);
-				$out[] = $regs[1];
+				$out[] = token("DSTRING",$regs[1]);
 				$line = substr($line, strlen($regs[0]));
 
 			} else if (preg_match('/^[A-Za-z_][A-Za-z0-1_]*/', $line, $regs)) {
-				# name
-				$out[] = $regs[0];
+				# name or literal token
+				if (in_array($regs[0],$python_tokens)) {
+					$out[] = token(strtoupper($regs[0]));
+				} else {
+					$out[] = token("NAME",$regs[0]);
+				}
 				$line = substr($line, strlen($regs[0]));
-
+	
 			} else {
 				# rest
 				$out[] = $line[0];