Browse Source

added source character and line numbers to tokenizer

pull/1/head
Wesley Kerfoot 11 years ago
parent
commit
246faaad2a
  1. 110
      tokenize.js

110
tokenize.js

@ -26,20 +26,21 @@ function isIdentifier(a) {
return code !== 41 && code !== 40 && code && 125 && code && 123 && code !== 93 && code !== 91 && code !== 44; return code !== 41 && code !== 40 && code && 125 && code && 123 && code !== 93 && code !== 91 && code !== 44;
} }
function tokenizeNum(tokstream) { function tokenizeNum(tokstream, charnum, linenum) {
var number = []; var number = [];
var code = tokstream[0].charCodeAt(); var code = tokstream[0].charCodeAt();
var isFloat = false; var isFloat = false;
var n = 0; var n = 0;
// + - // + -
if (code === 43 || code === 45) { if (code === 43 || code === 45) { // + or -
number.push(tokstream[0]); number.push(tokstream[0]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
n++; n++;
} }
else if (code === 46) { else if (code === 46) { // .
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
n++; n++;
charnum++;
number.push('0'); number.push('0');
number.push('.'); number.push('.');
isFloat = true; isFloat = true;
@ -48,24 +49,27 @@ function tokenizeNum(tokstream) {
while (isDigit(tokstream[0]) && tokstream.length !== 0) { while (isDigit(tokstream[0]) && tokstream.length !== 0) {
number.push(tokstream[0]); number.push(tokstream[0]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
charnum++;
n++; n++;
} }
if (tokstream[0] === '.' && isDigit(tokstream[1])) { if (tokstream[0] === '.' && isDigit(tokstream[1])) {
number.push('.'); number.push('.');
number.push(tokstream[1]); number.push(tokstream[1]);
tokstream = tokstream.substr(2); tokstream = tokstream.substr(2);
charnum++; charnum++;
n++; n++; n++; n++;
while (isDigit(tokstream[0]) && tokstream.length !== 0) { while (isDigit(tokstream[0]) && tokstream.length !== 0) {
number.push(tokstream[0]); number.push(tokstream[0]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
n++; n++;
charnum++;
} }
return [n, ["float", parseFloat(number.join(''), 10)]]; return [n, ["float", parseFloat(number.join(''), 10), charnum, linenum]];
} }
if (!isFloat) if (!isFloat)
return [n, ["integer", parseInt(number.join(''), 10)]]; return [n, ["integer", parseInt(number.join(''), 10), charnum, linenum]];
else else
return [n, ["float", parseFloat(number.join(''), 10)]]; return [n, ["float", parseFloat(number.join(''), 10), charnum, linenum]];
} }
/* Split up the tokenized identifier if an operator appears in it /* Split up the tokenized identifier if an operator appears in it
@ -74,20 +78,21 @@ function tokenizeNum(tokstream) {
* Everything after the operator goes back on to the token stream * Everything after the operator goes back on to the token stream
*/ */
/**
 * Consume one identifier from the front of `tokstream`.
 *
 * Characters are taken for as long as the next character is not whitespace,
 * is accepted by isIdentifier, and the remaining input does not begin with
 * an operator (as reported by matchop). Scanning also stops cleanly when the
 * input runs out.
 *
 * @param {string} tokstream - Remaining source text that has not been tokenized yet.
 * @param {number} charnum - Character counter on entry; advanced once per consumed character.
 * @param {number} linenum - Current line number; copied into the token unchanged.
 * @returns {Array} A one-element list `[[n, ["identifier", text, charnum, linenum]]]`,
 *   where `n` is the number of characters consumed and `charnum` is the
 *   counter value after the last consumed character.
 */
function tokenizeIdent(tokstream, charnum, linenum) {
  var rest = tokstream;
  var chars = [];
  var n = 0;
  // The length check comes first so the helper predicates are never handed
  // `undefined` when the identifier is the last thing in the input.
  while (rest.length > 0 &&
         !isWhitespace(rest[0]) &&
         isIdentifier(rest[0]) &&
         !matchop(rest)) {
    chars.push(rest[0]);
    rest = rest.slice(1);
    n++;
    charnum++;
  }
  return [[n, ["identifier", chars.join(''), charnum, linenum]]];
}
function tokenizeStr(tokstream) { function tokenizeStr(tokstream, charnum, linenum) {
var stringlit = []; var stringlit = [];
var n = 1; var n = 1;
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
@ -95,16 +100,17 @@ function tokenizeStr(tokstream) {
stringlit.push(tokstream[0]); stringlit.push(tokstream[0]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
n++; n++;
charnum++;
if (tokstream.length < 1) { if (tokstream.length < 1) {
throw "Error: missing quotation mark"; throw "Error: missing quotation mark";
} }
} }
n++; n++;
return [n, ["stringlit", stringlit.join('')]]; return [n, ["stringlit", stringlit.join(''), charnum, linenum]];
} }
function tokenizeT(tokstream) { function tokenizeT(tokstreami, charnum, linenum) {
if (tokstream.length < 4) if (tokstream.length < 4)
return false; return false;
var next4 = tokstream.substr(0,4); var next4 = tokstream.substr(0,4);
@ -115,7 +121,7 @@ function tokenizeT(tokstream) {
return false; return false;
} }
function peek(tokstream, toktype, word) { function peek(tokstream, toktype, word, charnum, linenum) {
var n = word.length; var n = word.length;
if (tokstream.length < n) if (tokstream.length < n)
return false; return false;
@ -128,68 +134,70 @@ function peek(tokstream, toktype, word) {
function tokenize(tokstream) { function tokenize(tokstream) {
var tokens = []; var tokens = [];
var charnum = 1;
var linenum = 1;
while (tokstream) { while (tokstream) {
switch (tokstream[0].charCodeAt()) { switch (tokstream[0].charCodeAt()) {
case 9: // '\t' case 9: // '\t'
tokens.push(["whitespace", '\t']); charnum++;
tokens.push(["whitespace", '\t', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 32: // ' ' case 32: // ' '
tokens.push(["whitespace", ' ']); charnum++;
tokens.push(["whitespace", ' ', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 10: // '\n' case 10: // '\n'
tokens.push(["whitespace", '\n']); linenum++;
charnum = 1;
tokens.push(["whitespace", '\n', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 44: // ',' case 44: // ','
tokens.push(["comma", ","]); charnum++;
tokens.push(["comma", ",", charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 40: // '(' case 40: // '('
tokens.push(["left_paren", '(']); charnum++;
tokens.push(["left_paren", '(', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 41: // ')' case 41: // ')'
tokens.push(["right_paren", ')']); charnum++;
tokens.push(["right_paren", ')', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 123: // '{' case 123: // '{'
tokens.push(["left_brace", '{']); charnum++;
tokens.push(["left_brace", '{', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 125: // '}' case 125: // '}'
tokens.push(["right_brace", '}']); charnum++;
tokens.push(["right_brace", '}', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 91: // '[' case 91: // '['
tokens.push(["left_square", '[']); charnum++;
tokens.push(["left_square", '[', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 93: // ']' case 93: // ']'
tokens.push(["right_square", ']']); charnum++;
tokens.push(["right_square", ']', charnum, linenum]);
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
case 34: // '"' case 34: // '"'
var result = tokenizeStr(tokstream); var result = tokenizeStr(tokstream, charnum, linenum);
var str = result[1]; var str = result[1];
var i = result[0]; var i = result[0];
tokens.push(str); tokens.push(str);
tokstream = tokstream.substr(i); tokstream = tokstream.substr(i);
break; break;
/* case 43: // '+'
if (isDigit(tokstream[1])) {
var result = tokenizeNum(tokstream);
var num = result[1];
var i = result[0];
if (num[1] !== NaN)
tokens.push(num);
tokstream = tokstream.substr(i);
break;
}
*/
case 45: // '-' case 45: // '-'
var lambda = peek(tokstream, "arrow", "->"); var lambda = peek(tokstream, "arrow", "->");
if (lambda) { if (lambda) {
@ -198,23 +206,15 @@ function tokenize(tokstream) {
break; break;
} }
else { else {
tokens.push(["identifier", "-"]); tokens.push(["identifier", "-", charnum, linenum]);
charnum++;
tokstream = tokstream.substr(1); tokstream = tokstream.substr(1);
break; break;
} }
/* if (isDigit(tokstream[1])) {
var result = tokenizeNum(tokstream);
var num = result[1];
var i = result[0];
if (num[1] !== NaN)
tokens.push(num);
tokstream = tokstream.substr(i);
break;
}
*/
case 46: // '.' case 46: // '.'
if (isDigit(tokstream[1])) { if (isDigit(tokstream[1])) {
var result = tokenizeNum(tokstream); var result = tokenizeNum(tokstream, charnum, linenum);
var num = result[1]; var num = result[1];
var i = result[0]; var i = result[0];
if (num[1] !== NaN) if (num[1] !== NaN)
@ -281,7 +281,7 @@ function tokenize(tokstream) {
default: default:
if (isDigit(tokstream[0])) { if (isDigit(tokstream[0])) {
var result = tokenizeNum(tokstream); var result = tokenizeNum(tokstream, charnum, linenum);
var num = result[1]; var num = result[1];
var i = result[0]; var i = result[0];
if (num[1] !== NaN) if (num[1] !== NaN)
@ -292,12 +292,14 @@ function tokenize(tokstream) {
var op = matchop(tokstream); var op = matchop(tokstream);
if (op) { if (op) {
var l = op.length; var l = op.length;
charnum = charnum + l;
tokstream = tokstream.substr(l); tokstream = tokstream.substr(l);
tokens.push(["identifier", op]); tokens.push(["identifier", op, charnum, linenum]);
} }
else { else {
var result = tokenizeIdent(tokstream); var result = tokenizeIdent(tokstream, charnum, linenum);
result.map(function(x) { result.map(function(x) {
charnum++;
tokens.push(x[1]); tokens.push(x[1]);
tokstream = tokstream.substr(x[0]); tokstream = tokstream.substr(x[0]);
}); });
@ -319,14 +321,4 @@ function tokenizeFull(input) {
} }
} }
module.exports = {tokenize : tokenizeFull}; module.exports = {tokenize : tokenizeFull};
//var tokstream = fs.readFileSync("/dev/stdin").toString();
//console.log(tokenize(tokstream));
//console.log(tools.buildTrie('', operators)[1][6]);
//console.log(isIdentifier(')'));
//console.log(tools.maxBy(tools.len, operators.filter(function (x) { return "#".indexOf(x) != -1;})));
//console.log(tokenizeIdent("abc%%3"));

Loading…
Cancel
Save