fix ordering, match symbols properly

10 years ago · db5446112f
2 changed files with 22 additions and 24 deletions
--- a/tokenize.c
+++ b/tokenize.c
@@ -226,7 +226,7 @@ match_identifier(source_t source, uint32_t begin, const uint32_t length) {
 static inline uint32_t
 match_symbol(source_t source, uint32_t begin, const uint32_t length) {
-  uint32_t i, identifier_match;
+  uint32_t i;
  assert(source != NULL);
  assert(length > 0);
@@ -235,13 +235,13 @@ match_symbol(source_t source, uint32_t begin, const uint32_t length) {
    return false;
  }
  i++;
-
+  while (!isspace(source[i]) && i <= length) {
-  identifier_match = match_identifier(source, i, length);
+    i++;
  if (identifier_match) {
    return identifier_match;
  }
-  assert(identifier_match <= length);
+  if (i == begin) {
-  return false;
+    return false;
  }
  return i;
 }
 static inline void
@@ -293,12 +293,7 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
      position = begin + 1;
      push_token(&token_stack, right_paren);
    }
-    else if (source[begin] == '\'') {
+   else if (isspace(source[begin])) {
      /* Matched a quote (apostrophe) */
      position = begin + 1;
      push_token(&token_stack, quote_tok);
    }
    else if (isspace(source[begin])) {
      position = begin + 1;
      push_token(&token_stack, whitespace_tok);
      /* Matched a whitespace character */
@@ -326,8 +321,8 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
      /* Matched an int */
      lookahead = source[position];
      source[position] = '\0';
-      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+      if ((current_token_val = (char *)hsh_retrieve(token_stack.memo, source+begin))) {
-        current_token.integer = (char *)current_token_val;
+        current_token.integer = current_token_val;
        source[position] = lookahead;
      }
      else {
@@ -341,15 +336,14 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
        hsh_insert(token_stack.memo, current_token_val, current_token_val);
        current_token.integer = current_token_val;
      }
      push_token(&token_stack, make_token(current_token, INTEGER));
    }
    else if ((position = match_symbol(source, begin, length))) {
      /* Matched a symbol */
      lookahead = source[position];
      source[position] = '\0';
-      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+      if ((current_token_val = (char *)hsh_retrieve(token_stack.memo, source+begin))) {
-        current_token.symbol = (char *)current_token_val;
+        current_token.symbol = current_token_val;
        source[position] = lookahead;
      }
      else {
@@ -365,16 +359,20 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
      }
      push_token(&token_stack, make_token(current_token, SYMBOL));
    }
    else if (source[begin] == '\'') {
      /* Matched a quote (apostrophe) */
      position = begin + 1;
      push_token(&token_stack, quote_tok);
    }
    else if ((position = match_identifier(source, begin, length))) {
      /* Matched an identifier */
      lookahead = source[position];
      source[position] = '\0';
-      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+      if ((current_token_val = (char *)hsh_retrieve(token_stack.memo, source+begin))) {
-        current_token.identifier = (char *)current_token_val;
+        current_token.identifier = current_token_val;
        source[position] = lookahead;
      }
      else {
        assert(position > begin);
        assert(position <= length);
@@ -385,7 +383,6 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
        hsh_insert(token_stack.memo, current_token_val, current_token_val);
        current_token.identifier = current_token_val;
      }
      push_token(&token_stack, make_token(current_token, IDENTIFIER));
      /* Matched an identifier */
    }
--- a/tokenize.py
+++ b/tokenize.py
@@ -47,5 +47,6 @@ def tokenize(source):
        tokenizer.pop_token(tp)
    tokenizer.release_tokens(tp)
-line = "(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"
+line = " '34 34"
-xs = list(tokenize(line*141500))
+xs = list(tokenize(line))
 print xs