Browse Source

Removed the regex-based tokenizer and finished the switch/case-based tokenizer; still need to add support for whitespace in the parser, or strip it out for now.

pull/21/head
Wesley Kerfoot 12 years ago
parent
commit
ee0d7aebcb
  1. 110
      parse.js
  2. 74
      tokenize.js

110
parse.js

@ -2,111 +2,9 @@
var typ = require("./representation.js");
var tool = require("./tools.js");
var tokenizer = require("./tokenize.js");
var fs = require("fs");
// Tokenization
var left_paren = /^\(/;
var right_paren = /^\)/;
var left_brace = /^\{/;
var right_brace = /^\}/;
var def = /^def/;
var left_square = /^\[/;
var right_square = /^\]/;
var comma = /^,/;
var truelit = /^true/;
var falselit = /^false/;
var stringlit = /^\"[^\"]*\"/;
var number = /^(\+|-)?\d+(\.\d+)?/;
var ifexp = /^if/;
var thenexp = /^then/;
var elsexp = /^else/;
var identifier = /^[^\s\.\(\)\{\}\[\]\""]+/;
var lambda = /^lambda/;
var arrow = /^->/;
function tokmatch(t) {
var ttype;
var m;
if (m = t.match(left_paren))
ttype = "left_paren";
else if (m = t.match(right_paren))
ttype = "right_paren";
else if (m = t.match(left_brace))
ttype = "left_brace";
else if (m = t.match(right_brace))
ttype = "right_brace";
else if (m = t.match(left_square))
ttype = "left_square";
else if (m = t.match(right_square))
ttype = "right_square";
else if (m = t.match(def))
ttype = "def";
else if (m = t.match(lambda))
ttype = "lambda";
else if (m = t.match(arrow))
ttype = "arrow";
else if (m = t.match(comma))
ttype = "comma";
else if (m = t.match(truelit))
ttype = "truelit";
else if (m = t.match(falselit))
ttype = "falselit";
else if (m = t.match(stringlit))
ttype = "stringlit";
else if (m = t.match(number))
if (m[0].indexOf(".") !== -1) {
ttype = "float";
return [[ttype, m[0]], m.input.slice(m[0].length)];
}
else {
ttype = "integer";
return [[ttype, m[0]], m.input.slice(m[0].length)];
}
else if (m = t.match(ifexp))
ttype = "ifexp";
else if (m = t.match(thenexp))
ttype = "thenexp";
else if (m = t.match(elsexp))
ttype = "elsexp";
else if (m = t.match(identifier))
ttype = "identifier";
else {
console.log("Error: unmatched string: " + t);
return;
}
return [[ttype, m[0]], m.input.slice(m[0].length)];
}
function tokenize(exp) {
var current, next;
var tokens = [];
while (exp != '') {
if (exp[0].match(/\s/)) {
exp = exp.slice(1);
// skip whitespace
}
else {
current = tokmatch(exp);
if (!current)
break;
exp = current[1];
tokens.push(current[0]);
}
}
return tokens;
}
function fst(ts) {
return ts[ts.length-1];
}
@ -413,9 +311,9 @@ function pprint(expr) {
var input = fs.readFileSync('/dev/stdin').toString();
//var input = process.argv.slice(2).reduce(function(acc, x) {return acc + " " + x}, "");
var tokenized = tokenize(input).reverse();
var tokenized = tokenizer.tokenize(input).reverse();
console.log(tokenized);
//console.log(parse(tokenized))
//console.log(tokenized);
console.log(parse(tokenized))
//console.log(pprint(parse(tokenized)));
//console.log(tokenized);

74
tokenize.js

@ -17,6 +17,11 @@ function isWhitespace(a) {
return (code === 9 || code === 32 || code === 10 || code === 13 || code === 11);
}
function isIdentifier(a) {
var code = a.charCodeAt();
return code !== 41 && code !== 40 && code && 125 && code && 123 && code !== 93 && code !== 91 && code !== 44;
}
function tokenizeNum(tokstream) {
var number = [];
var code = tokstream[0].charCodeAt();
@ -62,7 +67,7 @@ function tokenizeNum(tokstream) {
function tokenizeIdent(tokstream) {
var identifier = [];
var n = 0;
while (!(isWhitespace(tokstream[0]) || tokstream[0].charCodeAt() === 34)) {
while ((!isWhitespace(tokstream[0])) && isIdentifier(tokstream[0])) {
identifier.push(tokstream[0]);
tokstream = tokstream.substr(1);
n++;
@ -93,11 +98,20 @@ function tokenizeT(tokstream) {
return false;
var next4 = tokstream.substr(0,4);
if (next4 === "then")
return ["then-exp", "then"];
return ["thenexp", "then"];
else if (next4 === "true")
return ["bool", "true"];
else
return ["truelit", "true"];
return false;
}
function peek(tokstream, toktype, word) {
var n = word.length;
if (tokstream.length < n)
return false;
var nextN = tokstream.substr(0, n);
if (nextN == word)
return [toktype, word];
return false;
}
function tokenize(tokstream) {
@ -158,6 +172,12 @@ function tokenize(tokstream) {
tokstream = tokstream.substr(i);
break;
case 45: // '-'
var lambda = peek(tokstream, "arrow", "->");
if (lambda) {
tokens.push(lambda);
tokstream = tokstream.substr(2);
break;
}
var result = tokenizeNum(tokstream);
var num = result[1];
var i = result[0];
@ -167,7 +187,7 @@ function tokenize(tokstream) {
break;
case 46: // '.'
var result = tokenizeNum(tokstream);
var num = result[1];
var num = result[1];
var i = result[0];
if (num[1] !== NaN)
tokens.push(num);
@ -180,6 +200,44 @@ function tokenize(tokstream) {
tokstream = tokstream.substr(4); // 4 = length of either token
break;
}
case 105: // 'i'
var result = peek(tokstream, "ifexp", "if");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(2);
break;
}
case 100: // 'd'
var result = peek(tokstream, "def", "def");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(3);
break;
}
case 101: // e
var result = peek(tokstream, "elsexp", "else");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(4);
break;
}
case 102: // f
var result = peek(tokstream, "falselit", "false");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(5);
break;
}
case 108: // l
var result = peek(tokstream, "lambda", "lambda");
if (result) {
tokens.push(result);
tokstream = tokstream.substr(6);
break;
}
default:
if (isDigit(tokstream[0])) {
var result = tokenizeNum(tokstream);
@ -200,8 +258,10 @@ function tokenize(tokstream) {
return tokens;
}
var tokstream = fs.readFileSync("/dev/stdin").toString();
module.exports = {tokenize : tokenize};
console.log(tokenize(tokstream));
//var tokstream = fs.readFileSync("/dev/stdin").toString();
//console.log(isIdentifier(')'));
//console.log(tokenize(tokstream));
//tokenize(tokstream);

Loading…
Cancel
Save