From ee0d7aebcbc269b6848077959db6bbf43bd2f557 Mon Sep 17 00:00:00 2001
From: Wesley Kerfoot
Date: Fri, 23 Aug 2013 20:04:23 -0400
Subject: [PATCH] removed regex based tokenizer, finished switch/case based
 tokenizer, need to add support for whitespace in the parser or strip it out
 for now

---
 parse.js    | 110 ++--------------------------------------------------
 tokenize.js |  74 +++++++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 113 deletions(-)

diff --git a/parse.js b/parse.js
index e97432f..dc49e88 100755
--- a/parse.js
+++ b/parse.js
@@ -2,111 +2,9 @@
 var typ = require("./representation.js");
 var tool = require("./tools.js");
+var tokenizer = require("./tokenize.js");
 var fs = require("fs");
 
-// Tokenization
-
-var left_paren = /^\(/;
-var right_paren = /^\)/;
-
-var left_brace = /^\{/;
-var right_brace = /^\}/;
-
-var def = /^def/;
-
-var left_square = /^\[/;
-var right_square = /^\]/;
-var comma = /^,/;
-
-var truelit = /^true/;
-var falselit = /^false/;
-
-var stringlit = /^\"[^\"]*\"/;
-
-var number = /^(\+|-)?\d+(\.\d+)?/;
-
-var ifexp = /^if/;
-var thenexp = /^then/;
-var elsexp = /^else/;
-
-var identifier = /^[^\s\.\(\)\{\}\[\]\""]+/;
-
-var lambda = /^lambda/;
-
-var arrow = /^->/;
-
-function tokmatch(t) {
-    var ttype;
-    var m;
-    if (m = t.match(left_paren))
-        ttype = "left_paren";
-    else if (m = t.match(right_paren))
-        ttype = "right_paren";
-    else if (m = t.match(left_brace))
-        ttype = "left_brace";
-    else if (m = t.match(right_brace))
-        ttype = "right_brace";
-    else if (m = t.match(left_square))
-        ttype = "left_square";
-    else if (m = t.match(right_square))
-        ttype = "right_square";
-    else if (m = t.match(def))
-        ttype = "def";
-    else if (m = t.match(lambda))
-        ttype = "lambda";
-    else if (m = t.match(arrow))
-        ttype = "arrow";
-    else if (m = t.match(comma))
-        ttype = "comma";
-    else if (m = t.match(truelit))
-        ttype = "truelit";
-    else if (m = t.match(falselit))
-        ttype = "falselit";
-    else if (m = t.match(stringlit))
-        ttype = "stringlit";
-    else if (m = t.match(number))
-        if (m[0].indexOf(".") !== -1) {
-            ttype = "float";
-            return [[ttype, m[0]], m.input.slice(m[0].length)];
-        }
-        else {
-            ttype = "integer";
-            return [[ttype, m[0]], m.input.slice(m[0].length)];
-        }
-    else if (m = t.match(ifexp))
-        ttype = "ifexp";
-    else if (m = t.match(thenexp))
-        ttype = "thenexp";
-    else if (m = t.match(elsexp))
-        ttype = "elsexp";
-    else if (m = t.match(identifier))
-        ttype = "identifier";
-    else {
-        console.log("Error: unmatched string: " + t);
-        return;
-    }
-    return [[ttype, m[0]], m.input.slice(m[0].length)];
-}
-
-function tokenize(exp) {
-    var current, next;
-    var tokens = [];
-    while (exp != '') {
-        if (exp[0].match(/\s/)) {
-            exp = exp.slice(1);
-            // skip whitespace
-        }
-        else {
-            current = tokmatch(exp);
-            if (!current)
-                break;
-            exp = current[1];
-            tokens.push(current[0]);
-        }
-    }
-    return tokens;
-}
-
 function fst(ts) {
     return ts[ts.length-1];
 }
@@ -413,9 +311,9 @@ function pprint(expr) {
 
 var input = fs.readFileSync('/dev/stdin').toString();
 //var input = process.argv.slice(2).reduce(function(acc, x) {return acc + " " + x}, "");
-var tokenized = tokenize(input).reverse();
-
+var tokenized = tokenizer.tokenize(input).reverse();
 console.log(tokenized);
-//console.log(parse(tokenized))
+//console.log(tokenized);
+console.log(parse(tokenized))
 //console.log(pprint(parse(tokenized)));
 //console.log(tokenized);
diff --git a/tokenize.js b/tokenize.js
index 2e520d0..8609501 100755
--- a/tokenize.js
+++ b/tokenize.js
@@ -17,6 +17,11 @@ function isWhitespace(a) {
     return (code === 9 || code === 32 || code === 10 || code === 13 || code === 11);
 }
 
+function isIdentifier(a) {
+    var code = a.charCodeAt();
+    return code !== 41 && code !== 40 && code !== 125 && code !== 123 && code !== 93 && code !== 91 && code !== 44;
+}
+
 function tokenizeNum(tokstream) {
     var number = [];
     var code = tokstream[0].charCodeAt();
@@ -62,7 +67,7 @@ function tokenizeNum(tokstream) {
 function tokenizeIdent(tokstream) {
     var identifier = [];
     var n = 0;
-    while (!(isWhitespace(tokstream[0]) || tokstream[0].charCodeAt() === 34)) {
+    while ((!isWhitespace(tokstream[0])) && isIdentifier(tokstream[0])) {
         identifier.push(tokstream[0]);
         tokstream = tokstream.substr(1);
         n++;
@@ -93,11 +98,20 @@ function tokenizeT(tokstream) {
         return false;
     var next4 = tokstream.substr(0,4);
     if (next4 === "then")
-        return ["then-exp", "then"];
+        return ["thenexp", "then"];
     else if (next4 === "true")
-        return ["bool", "true"];
-    else
-        return false;
+        return ["truelit", "true"];
+    return false;
+}
+
+function peek(tokstream, toktype, word) {
+    var n = word.length;
+    if (tokstream.length < n) return false;
+    var nextN = tokstream.substr(0, n);
+    if (nextN === word)
+        return [toktype, word];
+    return false;
 }
 
 function tokenize(tokstream) {
@@ -158,6 +172,12 @@ function tokenize(tokstream) {
             tokstream = tokstream.substr(i);
             break;
         case 45: // '-'
+            var arrow = peek(tokstream, "arrow", "->");
+            if (arrow) {
+                tokens.push(arrow);
+                tokstream = tokstream.substr(2);
+                break;
+            }
             var result = tokenizeNum(tokstream);
             var num = result[1];
             var i = result[0];
@@ -167,7 +187,7 @@ function tokenize(tokstream) {
             break;
         case 46: // '.'
             var result = tokenizeNum(tokstream);
-            var num = result[1];
+            var num = result[1];
             var i = result[0];
             if (num[1] !== NaN)
                 tokens.push(num);
@@ -180,6 +200,44 @@ function tokenize(tokstream) {
             tokstream = tokstream.substr(4); // 4 = length of either token
             break;
         }
+
+        case 105: // 'i'
+            var result = peek(tokstream, "ifexp", "if");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(2);
+                break;
+            }
+
+        case 100: // 'd'
+            var result = peek(tokstream, "def", "def");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(3);
+                break;
+            }
+        case 101: // 'e'
+            var result = peek(tokstream, "elsexp", "else");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(4);
+                break;
+            }
+        case 102: // 'f'
+            var result = peek(tokstream, "falselit", "false");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(5);
+                break;
+            }
+        case 108: // 'l'
+            var result = peek(tokstream, "lambda", "lambda");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(6);
+                break;
+            }
+
         default:
             if (isDigit(tokstream[0])) {
                 var result = tokenizeNum(tokstream);
@@ -200,8 +258,10 @@ function tokenize(tokstream) {
     return tokens;
 }
 
-var tokstream = fs.readFileSync("/dev/stdin").toString();
+module.exports = {tokenize : tokenize};
 
-console.log(tokenize(tokstream));
+//var tokstream = fs.readFileSync("/dev/stdin").toString();
+//console.log(isIdentifier(')'));
+//console.log(tokenize(tokstream));
 //tokenize(tokstream);
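
A minimal sketch of how the newly exported module is meant to be driven, mirroring the parse.js change above; the file name driver.js is illustrative only and is not part of this patch:

    // driver.js (hypothetical) -- exercises tokenize.js on its own
    var fs = require("fs");
    var tokenizer = require("./tokenize.js");

    // Read a program from stdin, exactly as parse.js does after this patch.
    var input = fs.readFileSync("/dev/stdin").toString();

    // tokenize() returns an array of [type, text] pairs such as ["thenexp", "then"]
    // or ["arrow", "->"]; parse.js reverses the array so the parser can read
    // tokens from the end (its fst() returns the last element).
    var tokens = tokenizer.tokenize(input).reverse();
    console.log(tokens);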