diff --git a/foo.c b/foo.c
new file mode 100644
index 0000000..b521292
--- /dev/null
+++ b/foo.c
@@ -0,0 +1,11 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(void) {
+  char *foo = malloc(20);
+  snprintf(foo, 5, "%s", "1234");
+  char *bar = foo;
+  *foo++ = *bar++;
+  printf("%s\n", foo);
+  return 0;
+}
diff --git a/tokenize.c b/tokenize.c
index 7170ead..d9d85b0 100644
--- a/tokenize.c
+++ b/tokenize.c
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include "maa.h"
 #include "tokenize.h"
 
 /*
@@ -22,9 +23,7 @@ static const token_t left_paren = {.token_type=PAREN, .token={.parenthesis="("}
 
 static const token_t right_paren = {.token_type=PAREN, .token={.parenthesis=")"} };
 
-static
-inline
-char *
+static inline char *
 string_head(uint32_t n, char *in, char *out) {
   /* out must be large enough to store the number of characters
    * you want to select from in, plus a byte for the null terminator
@@ -44,9 +43,7 @@ string_head(uint32_t n, char *in, char *out) {
   return out;
 }
 
-static
-inline
-token_t
+static inline token_t
 make_token(token_val_t val, tok_t toktype) {
   token_t result;
   result.token_type = toktype;
@@ -102,29 +99,11 @@ pop_token(token_stream *tokens) {
 
   len--;
   assert(tokens->tokens != NULL);
-  switch (tokens->tokens[len].token_type) {
-    case SYMBOL:
-      free(tokens->tokens[len].token.symbol);
-      break;
-    case IDENTIFIER:
-      free(tokens->tokens[len].token.identifier);
-      break;
-    case INTEGER:
-      free(tokens->tokens[len].token.integer);
-      break;
-    case FLOATING:
-      free(tokens->tokens[len].token.floating);
-      break;
-    default:
-      break;
-  }
-
   tokens->length--;
   return true;
 }
 
-inline
-token_t
+inline token_t
 peek_token(token_stream *tokens) {
   /*
    * Check if tokens points to NULL
@@ -140,9 +119,7 @@ peek_token(token_stream *tokens) {
   return tokens->tokens[len-1];
 }
 
-static
-inline
-uint32_t
+static inline uint32_t
 match_int(source_t source, uint32_t begin, const uint32_t length) {
   /* Return false if there is no match
    * otherwise return the position of the end of the match + 1
@@ -166,9 +143,7 @@ match_int(source_t source, uint32_t begin, const uint32_t length) {
   return i;
 }
 
-static
-inline
-uint32_t
+static inline uint32_t
 match_float(source_t source, uint32_t begin, const uint32_t length) {
   /* Return false if there is no match
    * otherwise:
@@ -218,9 +193,7 @@ match_float(source_t source, uint32_t begin, const uint32_t length) {
   return false;
 }
 
-static
-inline
-uint32_t
+static inline uint32_t
 match_identifier(source_t source, uint32_t begin, const uint32_t length) {
 
   /* Return false if there is no match
@@ -251,9 +224,7 @@ match_identifier(source_t source, uint32_t begin, const uint32_t length) {
   return i;
 }
 
-static
-inline
-uint32_t
+static inline uint32_t
 match_symbol(source_t source, uint32_t begin, const uint32_t length) {
   uint32_t i, identifier_match;
   assert(source != NULL);
@@ -273,9 +244,7 @@ match_symbol(source_t source, uint32_t begin, const uint32_t length) {
   return false;
 }
 
-static
-inline
-void
+static inline void
 extract_token(uint32_t position,
               uint32_t begin,
               source_t source,
@@ -300,6 +269,8 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
   token_val_t current_token;
   token_t *tokens = calloc(STACK_SIZE, sizeof(token_t));
 
+  hsh_HashTable token_memo = hsh_create(NULL, NULL);
+
   assert(begin == 0);
   assert(length > 0);
   assert(source != NULL);
@@ -307,9 +278,10 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
   token_stack.length = 0;
   token_stack.max_length = STACK_SIZE;
   token_stack.tokens = tokens;
+  token_stack.memo = token_memo;
+  char lookahead = '\0';
 
   assert(STACK_SIZE > 0);
-
   while (begin <= length && source[begin]) {
     if (source[begin] == '(') {
       /*Matched a left paren */
@@ -333,48 +305,86 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
 
     }
     else if ((position = match_float(source, begin, length))) {
       /* Matched a float */
-      assert(position > begin);
-
-      current_token_val = calloc(((position - begin) + 1), sizeof(char));
-      assert(current_token_val != NULL);
-      extract_token(position, begin, source, current_token_val);
-      current_token.floating = current_token_val;
-
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = (char *)hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.floating = current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        source[position] = lookahead;
+        assert(position > begin);
+        current_token_val = calloc(((position - begin) + 1), sizeof(char));
+        assert(current_token_val != NULL);
+        extract_token(position, begin, source, current_token_val);
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.floating = current_token_val;
+      }
       push_token(&token_stack, make_token(current_token, FLOATING));
     }
     else if ((position = match_int(source, begin, length))) {
       /* Matched an int */
-      assert(position > begin);
-      assert(position <= length);
-      current_token_val = calloc(((position - begin) + 1), sizeof(char));
-      assert(current_token_val != NULL);
-      extract_token(position, begin, source, current_token_val);
-
-      current_token.integer = current_token_val;
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.integer = (char *)current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        assert(position > begin);
+        assert(position <= length);
+
+        source[position] = lookahead;
+        current_token_val = calloc(((position - begin) + 1), sizeof(char));
+        assert(current_token_val != NULL);
+        extract_token(position, begin, source, current_token_val);
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.integer = current_token_val;
+      }
       push_token(&token_stack, make_token(current_token, INTEGER));
     }
     else if ((position = match_symbol(source, begin, length))) {
       /* Matched a symbol */
-      assert(position > begin);
-      assert(position <= length);
-      current_token_val = calloc(((position - begin) + 1), sizeof(char));
-      assert(current_token_val != NULL);
-      extract_token(position, begin, source, current_token_val);
-
-      current_token.symbol = current_token_val;
-
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.symbol = (char *)current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        assert(position > begin);
+        assert(position <= length);
+
+        source[position] = lookahead;
+        current_token_val = calloc(((position - begin) + 1), sizeof(char));
+        assert(current_token_val != NULL);
+        extract_token(position, begin, source, current_token_val);
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.symbol = current_token_val;
+      }
       push_token(&token_stack, make_token(current_token, SYMBOL));
-
     }
     else if ((position = match_identifier(source, begin, length))) {
-      assert(position > begin);
-      assert(position <= length);
-      current_token_val = calloc(((position - begin) + 1), sizeof(char));
-      assert(current_token_val != NULL);
-      extract_token(position, begin, source, current_token_val);
-
-      current_token.identifier = current_token_val;
+      /* Matched an identifier */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.identifier = (char *)current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+
+        assert(position > begin);
+        assert(position <= length);
+
+        source[position] = lookahead;
+        current_token_val = calloc(((position - begin) + 1), sizeof(char));
+        assert(current_token_val != NULL);
+        extract_token(position, begin, source, current_token_val);
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.identifier = current_token_val;
+      }
       push_token(&token_stack, make_token(current_token, IDENTIFIER));
 
       /* Matched an identifier */
@@ -385,9 +395,17 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
     }
     begin = position;
   }
+
   return token_stack;
 }
 
+int free_token(const void *key, const void *val) {
+  /* silence warnings about unused parameters, key and val point to the same data*/
+  (void)key;
+  free((char *)val);
+  return true;
+}
+
 bool
 release_tokens(token_stream *tokens) {
   /* Iterate through the stack, release each token
@@ -396,10 +414,9 @@ release_tokens(token_stream *tokens) {
   assert(tokens != NULL);
   assert(tokens->tokens != NULL);
   assert(tokens->max_length > 0);
-
-  while(tokens->length > 0) {
-    pop_token(tokens);
-  }
   free(tokens->tokens);
+  hsh_iterate(tokens->memo, free_token);
+
+  hsh_destroy(tokens->memo);
   return true;
 }
diff --git a/tokenize.py b/tokenize.py
index 7e8dfd3..b1f6d90 100644
--- a/tokenize.py
+++ b/tokenize.py
@@ -29,7 +29,8 @@ class TokenT(Structure):
 class TokStream(Structure):
     _fields_ = [("length", c_size_t),
                 ("max_length", c_size_t),
-                ("tokens", POINTER(TokenT))]
+                ("tokens", POINTER(TokenT)),
+                ("memo", c_void_p)]
 
 tokenizer.tokenize.restype = TokStream
 tokenizer.peek_token.restype = TokenT
@@ -46,8 +47,5 @@ def tokenize(source):
         tokenizer.pop_token(tp)
     tokenizer.release_tokens(tp)
 
-tokens = tokenize("(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"*1610)
-xs = list(tokens)
-
-#print list(tokens)
-
+line = "(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"
+xs = list(tokenize(line*141500))
diff --git a/wat.c b/wat.c
new file mode 100644
index 0000000..7f2a80e
--- /dev/null
+++ b/wat.c
@@ -0,0 +1,12 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int main(void) {
+  char *blah = malloc(2);
+  blah[0] = 'a';
+  blah[1] = '\0';
+  printf("%zd\n", strlen(blah));
+  free(blah);
+  return 0;
+}
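
Note (not part of the patch): every match branch in the tokenize.c hunk above repeats the same interning pattern: save the byte just past the match, NUL-terminate the matched span in place, look the lexeme up in the libmaa hash table, and only allocate and extract a fresh copy on a miss, storing that copy as both key and value so release_tokens can free each string exactly once via hsh_iterate. The sketch below isolates that pattern using the same hsh_* calls the patch uses (hsh_create, hsh_retrieve, hsh_insert, hsh_iterate, hsh_destroy); the helper names intern_lexeme and free_entry are invented for illustration, and the assumption that returning 0 from the iterator keeps hsh_iterate going is a guess about libmaa's convention rather than something the patch establishes.

/* intern_lexeme: a minimal, self-contained sketch of the memoization used in
 * tokenize(); assumes libmaa's hsh_* API exactly as the patch uses it. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "maa.h"

static char *
intern_lexeme(hsh_HashTable memo, char *source, size_t begin, size_t position) {
  char lookahead = source[position];  /* save the byte after the match      */
  source[position] = '\0';            /* NUL-terminate the matched span     */
  char *cached = (char *)hsh_retrieve(memo, source + begin);
  source[position] = lookahead;       /* restore the source buffer          */
  if (cached)
    return cached;                    /* hit: reuse the earlier allocation  */

  size_t len = position - begin;
  char *copy = calloc(len + 1, sizeof(char));
  assert(copy != NULL);
  memcpy(copy, source + begin, len);
  hsh_insert(memo, copy, copy);       /* key and value are the same pointer */
  return copy;
}

static int
free_entry(const void *key, const void *val) {
  (void)key;                          /* key == val, so free it only once   */
  free((char *)val);
  return 0;                           /* assumed: 0 keeps hsh_iterate going */
}

int main(void) {
  hsh_HashTable memo = hsh_create(NULL, NULL);
  char source[] = "foo foo";
  char *a = intern_lexeme(memo, source, 0, 3);
  char *b = intern_lexeme(memo, source, 4, 7);
  printf("interned once: %s\n", a == b ? "yes" : "no");
  hsh_iterate(memo, free_entry);      /* free each interned string once     */
  hsh_destroy(memo);
  return 0;
}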