From ee0d7aebcbc269b6848077959db6bbf43bd2f557 Mon Sep 17 00:00:00 2001
From: Wesley Kerfoot
Date: Fri, 23 Aug 2013 20:04:23 -0400
Subject: [PATCH] removed regex based tokenizer, finished switch/case based
 tokenizer, need to add support for whitespace in the parser or strip it out
 for now

---
 parse.js    | 110 ++--------------------------------------------------
 tokenize.js |  74 +++++++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 113 deletions(-)

diff --git a/parse.js b/parse.js
index e97432f..dc49e88 100755
--- a/parse.js
+++ b/parse.js
@@ -2,111 +2,9 @@
 var typ = require("./representation.js");
 var tool = require("./tools.js");
+var tokenizer = require("./tokenize.js");
 var fs = require("fs");
 
-// Tokenization
-
-var left_paren = /^\(/;
-var right_paren = /^\)/;
-
-var left_brace = /^\{/;
-var right_brace = /^\}/;
-
-var def = /^def/;
-
-var left_square = /^\[/;
-var right_square = /^\]/;
-var comma = /^,/;
-
-var truelit = /^true/;
-var falselit = /^false/;
-
-var stringlit = /^\"[^\"]*\"/;
-
-var number = /^(\+|-)?\d+(\.\d+)?/;
-
-var ifexp = /^if/;
-var thenexp = /^then/;
-var elsexp = /^else/;
-
-var identifier = /^[^\s\.\(\)\{\}\[\]\""]+/;
-
-var lambda = /^lambda/;
-
-var arrow = /^->/;
-
-function tokmatch(t) {
-    var ttype;
-    var m;
-    if (m = t.match(left_paren))
-        ttype = "left_paren";
-    else if (m = t.match(right_paren))
-        ttype = "right_paren";
-    else if (m = t.match(left_brace))
-        ttype = "left_brace";
-    else if (m = t.match(right_brace))
-        ttype = "right_brace";
-    else if (m = t.match(left_square))
-        ttype = "left_square";
-    else if (m = t.match(right_square))
-        ttype = "right_square";
-    else if (m = t.match(def))
-        ttype = "def";
-    else if (m = t.match(lambda))
-        ttype = "lambda";
-    else if (m = t.match(arrow))
-        ttype = "arrow";
-    else if (m = t.match(comma))
-        ttype = "comma";
-    else if (m = t.match(truelit))
-        ttype = "truelit";
-    else if (m = t.match(falselit))
-        ttype = "falselit";
-    else if (m = t.match(stringlit))
-        ttype = "stringlit";
-    else if (m = t.match(number))
-        if (m[0].indexOf(".") !== -1) {
-            ttype = "float";
-            return [[ttype, m[0]], m.input.slice(m[0].length)];
-        }
-        else {
-            ttype = "integer";
-            return [[ttype, m[0]], m.input.slice(m[0].length)];
-        }
-    else if (m = t.match(ifexp))
-        ttype = "ifexp";
-    else if (m = t.match(thenexp))
-        ttype = "thenexp";
-    else if (m = t.match(elsexp))
-        ttype = "elsexp";
-    else if (m = t.match(identifier))
-        ttype = "identifier";
-    else {
-        console.log("Error: unmatched string: " + t);
-        return;
-    }
-    return [[ttype, m[0]], m.input.slice(m[0].length)];
-}
-
-function tokenize(exp) {
-    var current, next;
-    var tokens = [];
-    while (exp != '') {
-        if (exp[0].match(/\s/)) {
-            exp = exp.slice(1);
-            // skip whitespace
-        }
-        else {
-            current = tokmatch(exp);
-            if (!current)
-                break;
-            exp = current[1];
-            tokens.push(current[0]);
-        }
-    }
-    return tokens;
-}
-
 function fst(ts) {
     return ts[ts.length-1];
 }
@@ -413,9 +311,9 @@ function pprint(expr) {
 
 var input = fs.readFileSync('/dev/stdin').toString();
 //var input = process.argv.slice(2).reduce(function(acc, x) {return acc + " " + x}, "");
-var tokenized = tokenize(input).reverse();
-
+var tokenized = tokenizer.tokenize(input).reverse();
 console.log(tokenized);
-//console.log(parse(tokenized))
+//console.log(tokenized);
+console.log(parse(tokenized))
 //console.log(pprint(parse(tokenized)));
 //console.log(tokenized);
diff --git a/tokenize.js b/tokenize.js
index 2e520d0..8609501 100755
--- a/tokenize.js
+++ b/tokenize.js
@@ -17,6 +17,11 @@ function isWhitespace(a) {
     return (code === 9 || code === 32 || code === 10 || code === 13 || code === 11);
 }
 
+function isIdentifier(a) {
+    var code = a.charCodeAt();
+    return code !== 41 && code !== 40 && code !== 125 && code !== 123 && code !== 93 && code !== 91 && code !== 44;
+}
+
 function tokenizeNum(tokstream) {
     var number = [];
     var code = tokstream[0].charCodeAt();
@@ -62,7 +67,7 @@ function tokenizeNum(tokstream) {
 function tokenizeIdent(tokstream) {
     var identifier = [];
     var n = 0;
-    while (!(isWhitespace(tokstream[0]) || tokstream[0].charCodeAt() === 34)) {
+    while ((!isWhitespace(tokstream[0])) && isIdentifier(tokstream[0])) {
         identifier.push(tokstream[0]);
         tokstream = tokstream.substr(1);
         n++;
@@ -93,11 +98,20 @@ function tokenizeT(tokstream) {
         return false;
     var next4 = tokstream.substr(0,4);
     if (next4 === "then")
-        return ["then-exp", "then"];
+        return ["thenexp", "then"];
     else if (next4 === "true")
-        return ["bool", "true"];
-    else
-        return false;
+        return ["truelit", "true"];
+    return false;
+}
+
+function peek(tokstream, toktype, word) {
+    var n = word.length;
+    if (tokstream.length < n) return false;
+    var nextN = tokstream.substr(0, n);
+    if (nextN === word)
+        return [toktype, word];
+    return false;
 }
 
 function tokenize(tokstream) {
@@ -158,6 +172,12 @@ function tokenize(tokstream) {
             tokstream = tokstream.substr(i);
             break;
         case 45: // '-'
+            var arrow = peek(tokstream, "arrow", "->");
+            if (arrow) {
+                tokens.push(arrow);
+                tokstream = tokstream.substr(2);
+                break;
+            }
             var result = tokenizeNum(tokstream);
             var num = result[1];
             var i = result[0];
@@ -167,7 +187,7 @@ function tokenize(tokstream) {
             break;
         case 46: // '.'
             var result = tokenizeNum(tokstream);
-            var num = result[1];
+            var num = result[1];
             var i = result[0];
             if (num[1] !== NaN)
                 tokens.push(num);
@@ -180,6 +200,44 @@ function tokenize(tokstream) {
             tokstream = tokstream.substr(4); // 4 = length of either token
             break;
         }
+
+        case 105: // 'i'
+            var result = peek(tokstream, "ifexp", "if");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(2);
+                break;
+            }
+
+        case 100: // 'd'
+            var result = peek(tokstream, "def", "def");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(3);
+                break;
+            }
+        case 101: // 'e'
+            var result = peek(tokstream, "elsexp", "else");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(4);
+                break;
+            }
+        case 102: // 'f'
+            var result = peek(tokstream, "falselit", "false");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(5);
+                break;
+            }
+        case 108: // 'l'
+            var result = peek(tokstream, "lambda", "lambda");
+            if (result) {
+                tokens.push(result);
+                tokstream = tokstream.substr(6);
+                break;
+            }
+
         default:
             if (isDigit(tokstream[0])) {
                 var result = tokenizeNum(tokstream);
@@ -200,8 +258,10 @@ function tokenize(tokstream) {
     return tokens;
 }
 
-var tokstream = fs.readFileSync("/dev/stdin").toString();
+module.exports = {tokenize : tokenize};
 
-console.log(tokenize(tokstream));
+//var tokstream = fs.readFileSync("/dev/stdin").toString();
+//console.log(isIdentifier(')'));
+//console.log(tokenize(tokstream));
 //tokenize(tokstream);
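
A minimal sketch of how the newly exported module is meant to be driven, mirroring the parse.js change above; the file name driver.js is illustrative only and is not part of this patch:

    // driver.js (hypothetical) -- exercises tokenize.js on its own
    var fs = require("fs");
    var tokenizer = require("./tokenize.js");

    // Read a program from stdin, exactly as parse.js does after this patch.
    var input = fs.readFileSync("/dev/stdin").toString();

    // tokenize() returns an array of [type, text] pairs such as ["thenexp", "then"]
    // or ["arrow", "->"]; parse.js reverses the array so the parser can read
    // tokens from the end (its fst() returns the last element).
    var tokens = tokenizer.tokenize(input).reverse();
    console.log(tokens);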