JLambda/tokenize.js


								#!  /usr/bin/node


								var rep = require("./representation.js");

								var tools = require("./tools.js");

								var error = require("./errors.js");

								var operators = Object.keys(rep.OPInfo);


								function isDigit(a) {

								  if (!a)

								    return false;

								  var code = a.charCodeAt();

								  return (46 < code && code < 58 || code < 58 && code > 46);

								}


								function isWhitespace(a) {

								  if (!a)

								    return true;


								  var code = a.charCodeAt();

								  return (code === 9 || code === 32 || code === 10 || code === 13 || code === 11);

								}


								function isIdentifier(a) {

								  var code = a.charCodeAt();

								  return code !== 41 && code !== 40 && code && 125 && code && 123 && code !== 93 && code !== 91 && code !== 44;

								}


								function tokenizeNum(tokstream, charnum, linenum) {

								  var number = [];

								  var code = tokstream[0].charCodeAt();

								  var isFloat = false;

								  var n = 0;

								  // + -

								  // might want to remove this since it probably won't ever get run?

								  if (code === 43 || code === 45) { // + or -

								    number.push(tokstream[0]);

								    tokstream = tokstream.substr(1);

								    n++;

								  }

								  else if (code === 46) { // .

								    tokstream = tokstream.substr(1);

								    n++;

								    charnum++;

								    number.push('0');

								    number.push('.');

								    isFloat = true;

								  }


								  while (isDigit(tokstream[0]) && tokstream.length !== 0) {

								    number.push(tokstream[0]);

								    tokstream = tokstream.substr(1);

								    charnum++;

								    n++;

								  }

								  if (tokstream[0] === '.' && isDigit(tokstream[1])) {

								    number.push('.');

								    number.push(tokstream[1]);

								    tokstream = tokstream.substr(2);

								    charnum++; charnum++;

								    n++; n++;

								    while (isDigit(tokstream[0]) && tokstream.length !== 0) {

								      number.push(tokstream[0]);

								      tokstream = tokstream.substr(1);

								      n++;

								      charnum++;

								    }

								    return [n, ["float", parseFloat(number.join(''), 10), charnum, linenum]];

								  }

								  if (!isFloat)

								    return [n, ["integer", parseInt(number.join(''), 10), charnum, linenum]];

								  else

								    return [n, ["float", parseFloat(number.join(''), 10), charnum, linenum]];

								}


								/* Split up the tokenized identifier if an operator appears in it

								 * prefer longer identifiers that start with the same character(s) as shorter ones

								 * e.g. ++ over +

								 * Everything after the operator goes back on to the token stream

								 */


								function tokenizeIdent(tokstream, matchop, charnum, linenum) {

								  var identifier = [];

								  var n = 0;

								  while ((!isWhitespace(tokstream[0])) && isIdentifier(tokstream[0]) && !matchop(tokstream)) {

								    identifier.push(tokstream[0]);

								    tokstream = tokstream.substr(1);

								    n++;

								    charnum++;

								  }

								  identifier = identifier.join('');


								  return [[n, ["identifier", identifier, charnum, linenum]]];

								}


								function tokenizeStr(tokstream, charnum, linenum) {

								  var stringlit = [];

								  var n = 1;

								  tokstream = tokstream.substr(1);

								  while (tokstream[0].charCodeAt() !== 34) {

								    stringlit.push(tokstream[0]);

								    tokstream = tokstream.substr(1);

								    n++;

								    charnum++;

								    if (tokstream.length < 1) {

								      throw error.JSyntaxError(linenum, charnum, "Error: missing quotation mark");

								    }

								  }

								  n++;

								  return [n, ["stringlit", stringlit.join(''), charnum, linenum]];


								}


								function tokenizeT(tokstream, charnum, linenum) {

								  if (tokstream.length < 4)

								    return false;

								  var next4 = tokstream.substr(0,4);

								  if (next4 === "then")

								    return ["thenexp", "then"];

								  else if (next4 === "true")

								    return ["truelit", "true"];

								  return false;

								}


								function peek(tokstream, toktype, word, charnum, linenum) {

								  var n = word.length;

								  if (tokstream.length < n)

								    return false;

								  var nextN = tokstream.substr(0, n);

								  if (nextN == word) {

								    return [toktype, word];

								  }

								  return false;

								}


								function tokenize(tokstream, matchop) {

								  var tokens = [];

								  var charnum = 1;

								  var linenum = 1;

								  var i, result, lambda, num;


								  while (tokstream) {

								    switch (tokstream[0].charCodeAt()) {

								      /* falls through */

								      case 9: // '\t'

								        charnum++;

								        tokens.push(["whitespace", '\t', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 32: // ' '

								        charnum++;

								        tokens.push(["whitespace", ' ', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 10: // '\n'

								        linenum++;

								        charnum = 1;

								        tokens.push(["whitespace", '\n', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 44: // ','

								        charnum++;

								        tokens.push(["comma", ",", charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 40: // '('

								        charnum++;

								        tokens.push(["left_paren", '(', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 41: // ')'

								        charnum++;

								        tokens.push(["right_paren", ')', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 123: // '{'

								        charnum++;

								        tokens.push(["left_brace", '{', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 125: // '}'

								        charnum++;

								        tokens.push(["right_brace", '}', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 91: // '['

								        charnum++;

								        tokens.push(["left_square", '[', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 93: // ']'

								        charnum++;

								        tokens.push(["right_square", ']', charnum, linenum]);

								        tokstream = tokstream.substr(1);

								        break;

								      /* falls through */

								      case 34: // '"'

								        result = tokenizeStr(tokstream, charnum, linenum);

								        var str = result[1];

								        i = result[0];

								        tokens.push(str);

								        tokstream = tokstream.substr(i);

								        break;


								      /* falls through */

								      case 45: // '-'

								        lambda = peek(tokstream, "arrow", "->");

								        if (lambda) {

								          tokens.push(lambda);

								          tokstream = tokstream.substr(2);

								          break;

								        }

								        else {

								          tokens.push(["identifier", "-", charnum, linenum]);

								          charnum++;

								          tokstream = tokstream.substr(1);

								          break;

								        }


								      /* falls through */

								      case 46: // '.'

								        if (isDigit(tokstream[1])) {

								          result = tokenizeNum(tokstream, charnum, linenum);

								          num = result[1];

								          i = result[0];

								          if (!isNaN(num[1])) {

								            tokens.push(num);

								          }

								          tokstream = tokstream.substr(i);

								          break;

								        }

								      /* falls through */

								      case 116: // 't'

								        result = tokenizeT(tokstream);

								        if (result) {

								          tokens.push(result);

								          tokstream = tokstream.substr(4); // 4 = length of either token

								          break;

								        }

								      /* falls through */

								      case 105: // 'i'

								        var ifexp = peek(tokstream, "ifexp", "if");

								        if (ifexp) {

								          tokens.push(ifexp);

								          tokstream = tokstream.substr(2);

								          break;

								        }

								        var inkeyword = peek(tokstream, "in", "in ");

								        if (inkeyword) {

								          tokens.push(inkeyword);

								          tokstream = tokstream.substr(3);

								          break;

								        }


								      /* falls through */

								      case 100: // 'd'

								        var defop = peek(tokstream, "defop", "defop");

								        if (defop) {

								          tokens.push(["defop", "defop", charnum, linenum]);

								          tokstream = tokstream.substr(5);

								          break;

								        }

								        var def = peek(tokstream, "def", "def");

								        if (def) {

								          tokens.push(["def", "def", charnum, linenum]);

								          tokstream = tokstream.substr(3);

								          break;

								        }

								      /* falls through */

								      case 101: // e

								        result = peek(tokstream, "elsexp", "else");

								        if (result) {

								          tokens.push(result);

								          tokstream = tokstream.substr(4);

								          break;

								        }

								      /* falls through */

								      case 102: // f

								        result = peek(tokstream, "falselit", "false");

								        if (result) {

								          tokens.push(result);

								          tokstream = tokstream.substr(5);

								          break;

								        }

								      /* falls through */

								      case 108: // l

								        lambda = peek(tokstream, "lambda", "lambda");

								        if (lambda) {

								          tokens.push(lambda);

								          tokstream = tokstream.substr(6);

								          break;

								        }

								        var letexp = peek(tokstream, "let", "let");

								        if (letexp) {

								          tokens.push(letexp);

								          tokstream = tokstream.substr(3);

								          break;

								        }


								      /* falls through */

								      default:

								        if (isDigit(tokstream[0])) {

								          result = tokenizeNum(tokstream, charnum, linenum);

								          num = result[1];

								          i = result[0];

								          if (!isNaN(num[1])) {

								            tokens.push(num);

								          }

								          tokstream = tokstream.substr(i);

								          break;

								        }

								        var op = matchop(tokstream);

								        if (op) {

								          var l = op.length;

								          charnum = charnum + l;

								          tokstream = tokstream.substr(l);

								          tokens.push(["identifier", op, charnum, linenum]);

								        }

								        else {

								          result = tokenizeIdent(tokstream, matchop, charnum, linenum);

								          for(var index = 0; index < result.length; index++) {

								            charnum++;

								            tokens.push(result[index][1]);

								            tokstream = tokstream.substr(result[index][0]);

								          }

								        }

								     }

								  }

								  return tokens;

								}


								function tokenizeHelp(input, matchop) {

								  try {

								    return tokenize(input, matchop).reverse().filter(function(x) {

								      return x[0] !== "whitespace";

								    });

								  } catch (e) {

								    console.log(e.stxerror());

								    process.exit(1);

								  }

								}


								function tokenizeFull(input) {

								  var matchop = tools.opMatch(operators);

								  var initialPass = tokenizeHelp(input, matchop).reverse();

								  for (var i = 0; i < initialPass.length; i++) {

								    if (initialPass.slice(i, i+7).map(tools.fst) ===

								        ["defop", "integer", "identifier", "Left", "Right",

								         "left_paren", "identifier", "identifier", "identifier",

								         "right_paren"])

								      rep.OPInfo[initialPass[i+5][1]] = [parseInt(initialPass[i+1][1], 10),

								                                         initialPass[i+2][1]];


								  }

								  operators = Object.keys(rep.OPInfo);

								  matchop = tools.opMatch(operators);

								  return tokenizeHelp(input, matchop);

								}


								module.exports = {tokenize : tokenizeFull};