Browse Source

fix ordering, match symbols properly

master
nisstyre56 10 years ago
parent
commit
db5446112f
  1. 41
      tokenize.c
  2. 5
      tokenize.py

41
tokenize.c

@ -226,7 +226,7 @@ match_identifier(source_t source, uint32_t begin, const uint32_t length) {
static inline uint32_t static inline uint32_t
match_symbol(source_t source, uint32_t begin, const uint32_t length) { match_symbol(source_t source, uint32_t begin, const uint32_t length) {
uint32_t i, identifier_match; uint32_t i;
assert(source != NULL); assert(source != NULL);
assert(length > 0); assert(length > 0);
@ -235,13 +235,13 @@ match_symbol(source_t source, uint32_t begin, const uint32_t length) {
return false; return false;
} }
i++; i++;
while (!isspace(source[i]) && i <= length) {
identifier_match = match_identifier(source, i, length); i++;
if (identifier_match) {
return identifier_match;
} }
assert(identifier_match <= length); if (i == begin) {
return false; return false;
}
return i;
} }
static inline void static inline void
@ -293,12 +293,7 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
position = begin + 1; position = begin + 1;
push_token(&token_stack, right_paren); push_token(&token_stack, right_paren);
} }
else if (source[begin] == '\'') { else if (isspace(source[begin])) {
/* Matched a quote (apostrophe) */
position = begin + 1;
push_token(&token_stack, quote_tok);
}
else if (isspace(source[begin])) {
position = begin + 1; position = begin + 1;
push_token(&token_stack, whitespace_tok); push_token(&token_stack, whitespace_tok);
/* Matched a whitespace character */ /* Matched a whitespace character */
@ -326,8 +321,8 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
/* Matched an int */ /* Matched an int */
lookahead = source[position]; lookahead = source[position];
source[position] = '\0'; source[position] = '\0';
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { if ((current_token_val = (char *)hsh_retrieve(token_stack.memo, source+begin))) {
current_token.integer = (char *)current_token_val; current_token.integer = current_token_val;
source[position] = lookahead; source[position] = lookahead;
} }
else { else {
@ -341,15 +336,14 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
hsh_insert(token_stack.memo, current_token_val, current_token_val); hsh_insert(token_stack.memo, current_token_val, current_token_val);
current_token.integer = current_token_val; current_token.integer = current_token_val;
} }
push_token(&token_stack, make_token(current_token, INTEGER)); push_token(&token_stack, make_token(current_token, INTEGER));
} }
else if ((position = match_symbol(source, begin, length))) { else if ((position = match_symbol(source, begin, length))) {
/* Matched a symbol */ /* Matched a symbol */
lookahead = source[position]; lookahead = source[position];
source[position] = '\0'; source[position] = '\0';
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { if ((current_token_val = (char *)hsh_retrieve(token_stack.memo, source+begin))) {
current_token.symbol = (char *)current_token_val; current_token.symbol = current_token_val;
source[position] = lookahead; source[position] = lookahead;
} }
else { else {
@ -365,16 +359,20 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
} }
push_token(&token_stack, make_token(current_token, SYMBOL)); push_token(&token_stack, make_token(current_token, SYMBOL));
} }
else if (source[begin] == '\'') {
/* Matched a quote (apostrophe) */
position = begin + 1;
push_token(&token_stack, quote_tok);
}
else if ((position = match_identifier(source, begin, length))) { else if ((position = match_identifier(source, begin, length))) {
/* Matched an identifier */ /* Matched an identifier */
lookahead = source[position]; lookahead = source[position];
source[position] = '\0'; source[position] = '\0';
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { if ((current_token_val = (char *)hsh_retrieve(token_stack.memo, source+begin))) {
current_token.identifier = (char *)current_token_val; current_token.identifier = current_token_val;
source[position] = lookahead; source[position] = lookahead;
} }
else { else {
assert(position > begin); assert(position > begin);
assert(position <= length); assert(position <= length);
@ -385,7 +383,6 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
hsh_insert(token_stack.memo, current_token_val, current_token_val); hsh_insert(token_stack.memo, current_token_val, current_token_val);
current_token.identifier = current_token_val; current_token.identifier = current_token_val;
} }
push_token(&token_stack, make_token(current_token, IDENTIFIER)); push_token(&token_stack, make_token(current_token, IDENTIFIER));
/* Matched an identifier */ /* Matched an identifier */
} }

5
tokenize.py

@ -47,5 +47,6 @@ def tokenize(source):
tokenizer.pop_token(tp) tokenizer.pop_token(tp)
tokenizer.release_tokens(tp) tokenizer.release_tokens(tp)
line = "(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2" line = " '34 34"
xs = list(tokenize(line*141500)) xs = list(tokenize(line))
print xs