Browse Source

Removed the regex-based tokenizer and finished the switch/case-based tokenizer; still need to add support for whitespace in the parser, or strip it out for now.

pull/21/head
Wesley Kerfoot 12 years ago
parent
commit
ee0d7aebcb
  1. 110
      parse.js
  2. 74
      tokenize.js

110
parse.js

@ -2,111 +2,9 @@
var typ = require("./representation.js");
var tool = require("./tools.js");
var tokenizer = require("./tokenize.js");
var fs = require("fs");
// Tokenization
var left_paren = /^\(/;
var right_paren = /^\)/;
var left_brace = /^\{/;
var right_brace = /^\}/;
var def = /^def/;
var left_square = /^\[/;
var right_square = /^\]/;
var comma = /^,/;
var truelit = /^true/;
var falselit = /^false/;
var stringlit = /^\"[^\"]*\"/;
var number = /^(\+|-)?\d+(\.\d+)?/;
var ifexp = /^if/;
var thenexp = /^then/;
var elsexp = /^else/;
var identifier = /^[^\s\.\(\)\{\}\[\]\""]+/;
var lambda = /^lambda/;
var arrow = /^->/;
function tokmatch(t) {
var ttype;
var m;
if (m = t.match(left_paren))
ttype = "left_paren";
else if (m = t.match(right_paren))
ttype = "right_paren";
else if (m = t.match(left_brace))
ttype = "left_brace";
else if (m = t.match(right_brace))
ttype = "right_brace";
else if (m = t.match(left_square))
ttype = "left_square";
else if (m = t.match(right_square))
ttype = "right_square";
else if (m = t.match(def))
ttype = "def";
else if (m = t.match(lambda))
ttype = "lambda";
else if (m = t.match(arrow))
ttype = "arrow";
else if (m = t.match(comma))
ttype = "comma";
else if (m = t.match(truelit))
ttype = "truelit";
else if (m = t.match(falselit))
ttype = "falselit";
else if (m = t.match(stringlit))
ttype = "stringlit";
else if (m = t.match(number))
if (m[0].indexOf(".") !== -1) {
ttype = "float";
return [[ttype, m[0]], m.input.slice(m[0].length)];
}
else {
ttype = "integer";
return [[ttype, m[0]], m.input.slice(m[0].length)];
}
else if (m = t.match(ifexp))
ttype = "ifexp";
else if (m = t.match(thenexp))
ttype = "thenexp";
else if (m = t.match(elsexp))
ttype = "elsexp";
else if (m = t.match(identifier))
ttype = "identifier";
else {
console.log("Error: unmatched string: " + t);
return;
}
return [[ttype, m[0]], m.input.slice(m[0].length)];
}
function tokenize(exp) {
var current, next;
var tokens = [];
while (exp != '') {
if (exp[0].match(/\s/)) {
exp = exp.slice(1);
// skip whitespace
}
else {
current = tokmatch(exp);
if (!current)
break;
exp = current[1];
tokens.push(current[0]);
}
}
return tokens;
}
function fst(ts) {
return ts[ts.length-1];
}
@ -413,9 +311,9 @@ function pprint(expr) {
var input = fs.readFileSync('/dev/stdin').toString();
//var input = process.argv.slice(2).reduce(function(acc, x) {return acc + " " + x}, "");
var tokenized = tokenize(input).reverse();
var tokenized = tokenizer.tokenize(input).reverse();
console.log(tokenized);
//console.log(parse(tokenized))
//console.log(tokenized);
console.log(parse(tokenized))
//console.log(pprint(parse(tokenized)));
//console.log(tokenized);

74
tokenize.js

@ -17,6 +17,11 @@ function isWhitespace(a) {
return (code === 9 || code === 32 || code === 10 || code === 13 || code === 11);
}
function isIdentifier(a) {
var code = a.charCodeAt();
return code !== 41 && code !== 40 && code && 125 && code && 123 && code !== 93 && code !== 91 && code !== 44;
}
function tokenizeNum(tokstream) {
var number = [];
var code = tokstream[0].charCodeAt();
@ -62,7 +67,7 @@ function tokenizeNum(tokstream) {
function tokenizeIdent(tokstream) {
var identifier = [];
var n = 0;
while (!(isWhitespace(tokstream[0]) || tokstream[0].charCodeAt() === 34)) {
while ((!isWhitespace(tokstream[0])) && isIdentifier(tokstream[0])) {
identifier.push(tokstream[0]);
tokstream = tokstream.substr(1);
n++;
@ -93,11 +98,20 @@ function tokenizeT(tokstream) {
return false;
var next4 = tokstream.substr(0,4);
if (next4 === "then")
return ["then-exp", "then"];
return ["thenexp", "then"];
else if (next4 === "true")
return ["bool", "true"];
else
return ["truelit", "true"];
return false;
}
function peek(tokstream, toktype, word) {
var n = word.length;
if (tokstream.length < n)
return false;
var nextN = tokstream.substr(0, n);
if (nextN == word)
return [toktype, word];
return false;
}
function tokenize(tokstream) {
@ -158,6 +172,12 @@ function tokenize(tokstream) {
tokstream = tokstream.substr(i);
break;
case 45: // '-'
var lambda = peek(tokstream, "arrow", "->");
if (lambda) {
tokens.push(lambda);
tokstream = tokstream.substr(2);
break;
}
var result = tokenizeNum(tokstream);
var num = result[1];
var i = result[0];
@ -167,7 +187,7 @@ function tokenize(tokstream) {
break;
case 46: // '.'
var result = tokenizeNum(tokstream);
var num = result[1];
var num = result[1];
var i = result[0];
if (num[1] !== NaN)
tokens.push(num);
@ -180,6 +200,44 @@ function tokenize(tokstream) {
tokstream = tokstream.substr(4); // 4 = length of either token
break;
}
case 105: // 'i'
var result = peek(tokstream, "ifexp", "if");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(2);
break;
}
case 100: // 'd'
var result = peek(tokstream, "def", "def");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(3);
break;
}
case 101: // e
var result = peek(tokstream, "elsexp", "else");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(4);
break;
}
case 102: // f
var result = peek(tokstream, "falselit", "false");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(5);
break;
}
case 108: // l
var result = peek(tokstream, "lambda", "lambda");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(6);
break;
}
default:
if (isDigit(tokstream[0])) {
var result = tokenizeNum(tokstream);
@ -200,8 +258,10 @@ function tokenize(tokstream) {
return tokens;
}
var tokstream = fs.readFileSync("/dev/stdin").toString();
module.exports = {tokenize : tokenize};
console.log(tokenize(tokstream));
//var tokstream = fs.readFileSync("/dev/stdin").toString();
//console.log(isIdentifier(')'));
//console.log(tokenize(tokstream));
//tokenize(tokstream);

Loading…
Cancel
Save