#include #include #include #include #include #include #include #include #include "error.h" #include "maa.h" #include "tokenize.h" /* * This is a basic s-expression tokenizer * it also tokenizes things like number, string, and symbol literals */ const token_t nulltok = { .token_type = EMPTY, { .null_token=false } }; static const token_t quote_tok = { .token_type = QUOTE, .token= { .quote=true } }; static const token_t left_paren = { .token_type = PAREN, .token = { .parenthesis="(" } }; static const token_t right_paren = { .token_type = PAREN, .token = { .parenthesis=")" } }; static inline const char * string_head(uint32_t n, const char *in, char *out) { /* out must be large enough to store the number of characters * you want to select from in, plus a byte for the null terminator */ #ifndef NDEBUG size_t in_len = strlen(in); #endif assert((n > 0 && n <= in_len)); int iserror = snprintf(out, (size_t)n+1 , "%s", in); assert((iserror != -1) && ((size_t)iserror == in_len)); if (iserror == -1) { printf("Out of memory"); exit(EXIT_FAILURE); } return (const char*)out; } static inline token_t make_token(token_val_t val, tok_t toktype) { token_t result; result.token_type = toktype; result.token = val; return result; } bool push_token(token_stream *tokens, token_t token) { /* * Check if tokens points to NULL */ size_t len; size_t max; CHECK(tokens); len = tokens->length; max = tokens->max_length; assert(len <= max); assert(max > 0); if (len == max) { /* We've reached the maximum stack size * So we must try to increase that by GROWTH_SIZE */ token_t *new_tokens = xrealloc(tokens->tokens, sizeof (token_t) * (max * GROWTH_FACTOR)); if (!new_tokens) { printf("Could not allocate enough memory for the token stack\n"); exit(EXIT_FAILURE); } tokens->tokens = new_tokens; tokens->max_length = max * GROWTH_FACTOR; tokens->tokens[len] = token; tokens->length++; return true; } tokens->tokens[len] = token; tokens->length++; return true; } bool pop_token(token_stream *tokens) { size_t len; CHECK(tokens); len = tokens->length; assert(len != 0); len--; CHECK(tokens->tokens); tokens->length--; return true; } inline token_t peek_token(token_stream *tokens) { /* * Check if tokens points to NULL */ size_t len = tokens->length; size_t max = tokens->max_length; CHECK(tokens); assert(len != 0); if (len == 0 || len > max) { return nulltok; } return tokens->tokens[len-1]; } static inline uint32_t match_int(source_t source, uint32_t begin, const uint32_t length) { /* Return false if there is no match * otherwise return the position of the end of the match + 1 */ uint32_t i = begin; uint32_t test; CHECK(source); assert(length > 0); if (source[i] == '+' || source[i] == '-') { i++; } test = i; while (i < length && isdigit(source[i])) { i++; } if (i == test) return false; return i; } static inline uint32_t match_float(source_t source, uint32_t begin, const uint32_t length) { /* Return false if there is no match * otherwise: * if there is a leading decimal point and then a valid int match: * return the position of the end of the match * if there is a leading valid int match: * but no decimal point match after that: * return false * if there is a decimal point match and then a valid int match: * return the position of the match * if there is no valid int match: * return false * ALWAYS returns the position + 1 to avoid confusion with false (which is a valid index) */ uint32_t i, leading_int_match, trailing_int_match; CHECK(source); assert(length > 0); i = begin; leading_int_match = match_int(source, i, length); if (leading_int_match) { i = leading_int_match; } assert(i <= length); if (source[i] != '.' || source[i] == '+' || source[i] == '-') { if (((i+1) <= length) && /* Make sure there is at least two characters to look at */ ((source[i] == '+') || (source[i] == '-')) && (source[i+1] == '.')) { i++; } else { return false; } } i++; trailing_int_match = match_int(source, i, length); if (trailing_int_match) { return trailing_int_match; } return false; } static inline uint32_t match_word(source_t source, uint32_t begin, const uint32_t length) { /* Return false if there is no match * if there is a match for any characters that are not: * whitespace * a parenthesis ( ) * a brace { } * a square bracket [ ] * then return the position of the match + 1 * if there is nothing else to match: * return false */ uint32_t i = begin; CHECK(source); assert(length > 0); while (i < length && !(source[i] == '(' || source[i] == ')' || isspace(source[i]))) { i++; } if (i == begin) { return false; } assert(i <= length); return i; } static inline uint32_t is_empty_string(const char *source, uint32_t length) { int allspace = false; uint32_t i = 0; if (source[i] != '\"') { return false; } for (; i < length; i++) { if (!isspace(source[i])) { allspace = true; } } /*if (allspace) { printf("Actually found an empty string! Of length %d\n", i); }*/ return allspace; } static inline uint32_t match_string(source_t source, uint32_t begin, const uint32_t length) { CHECK(source); (void)length; assert(length > 0); uint32_t i = begin; if (source[i] != '\"') { return false; } i++; while (source[i] != '\"' && (i < length) && (i < (begin + MAX_STRING_SIZE))) { i++; } if ((i != (begin+1)) && (i <= length) && (source[i] == '\"')) { return i+1; } return false; } static inline void extract_token(uint32_t position, uint32_t begin, const source_t source, const char *token_val) { assert(position > begin); string_head(position - begin, &source[begin], (char *)token_val); } token_stream tokenize(source_t source, uint32_t begin, const uint32_t length) { /* * Remember to free everything from this struct * for example, token_stack.tokens will not necessarily be * equal to tokens after this function has run * */ uint32_t position = begin; uint32_t allspace = false; const char *current_token_val; token_stream token_stack; token_val_t current_token; token_t *tokens = xcalloc(STACK_SIZE, sizeof (token_t)); hsh_HashTable token_memo = hsh_create(NULL, NULL); assert(begin == 0); assert(length > 0); CHECK(source); token_stack.length = 0; token_stack.max_length = STACK_SIZE; token_stack.tokens = tokens; token_stack.memo = token_memo; char lookahead = '\0'; assert(STACK_SIZE > 0); while (begin <= length && source[begin]) { /* Possibly matched a string * First look for closing " * Then look for a newline to close it if no " * Then stop after some large constant of characters maybe? * We're dealing with real text so people might forget to close * quotations, so we have to be clever about it and use heuristics (for performance) */ if (source[begin] == '(') { /*Matched a left paren */ position = begin + 1; push_token(&token_stack, left_paren); } else if (source[begin] == ')') { /*Matched a left paren */ position = begin + 1; push_token(&token_stack, right_paren); } else if (isspace(source[begin])) { position = begin + 1; /* Matched a whitespace character */ } else if ((position = match_string(source, begin, length))) { /* Possibly matched a string * First look for closing " * Then look for a newline to close it if no " * Then stop after some large constant of characters maybe? * We're dealing with real text so people might forget to close * quotations, so we have to be clever about it and use heuristics (for performance) */ lookahead = source[position]; source[position] = '\0'; if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { current_token.string = current_token_val; source[position] = lookahead; allspace = false; } else { assert(position > begin); current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); CHECK(current_token_val); extract_token(position, begin, source, current_token_val); if ((allspace = is_empty_string(current_token_val, (position - begin) + 1))) { source[position] = lookahead; current_token.string = current_token_val; } else { source[position] = lookahead; hsh_insert(token_stack.memo, current_token_val, current_token_val); current_token.string = current_token_val; } } if (allspace) { push_token(&token_stack, make_token(current_token, STRING)); } } else if ((position = match_float(source, begin, length))) { /* Matched a float */ lookahead = source[position]; source[position] = '\0'; if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { current_token.floating = current_token_val; source[position] = lookahead; } else { assert(position > begin); current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); CHECK(current_token_val); extract_token(position, begin, source, current_token_val); source[position] = lookahead; hsh_insert(token_stack.memo, current_token_val, current_token_val); current_token.floating = current_token_val; } push_token(&token_stack, make_token(current_token, FLOATING)); } else if ((position = match_int(source, begin, length))) { /* Matched an int */ lookahead = source[position]; source[position] = '\0'; if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { current_token.integer = current_token_val; source[position] = lookahead; } else { assert(position > begin); assert(position <= length); current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); CHECK(current_token_val); extract_token(position, begin, source, current_token_val); source[position] = lookahead; hsh_insert(token_stack.memo, current_token_val, current_token_val); current_token.integer = current_token_val; } push_token(&token_stack, make_token(current_token, INTEGER)); } else if (source[begin] == '\'') { /* Matched a quote (apostrophe) */ position = begin + 1; push_token(&token_stack, quote_tok); } else if ((position = match_word(source, begin, length))) { /* Matched a word */ lookahead = source[position]; source[position] = '\0'; if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { current_token.word = current_token_val; source[position] = lookahead; } else { assert(position > begin); assert(position <= length); current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); CHECK(current_token_val); extract_token(position, begin, source, current_token_val); source[position] = lookahead; hsh_insert(token_stack.memo, current_token_val, current_token_val); current_token.word = current_token_val; } push_token(&token_stack, make_token(current_token, WORD)); /* Matched a word */ } else if (position <= begin) { printf("Source is too large to read\n"); exit(EXIT_FAILURE); } else { printf("Unmatched token\n"); exit(EXIT_FAILURE); } begin = position; } return token_stack; } int free_token(const void *key, const void *val) { /* silence warnings about unused parameters, key and val point to the same data*/ (void)key; xfree((char *)val); return true; } bool release_tokens(token_stream *tokens) { /* Iterate through the stack, release each token * Then release the entire stack */ CHECK(tokens); CHECK(tokens->tokens); assert(tokens->max_length > 0); xfree(tokens->tokens); hsh_iterate(tokens->memo, free_token); hsh_destroy(tokens->memo); return true; } #ifndef TOK_LIB int main(void) { void *source_code = malloc(111000); size_t nbytes = read(STDIN_FILENO, source_code, 111000); if (nbytes == 0) { exit(EXIT_FAILURE); } token_stream toks = tokenize(source_code, 0, nbytes); token_t current_tok; while (toks.length > 0) { current_tok = peek_token(&toks); switch (current_tok.token_type) { case SYMBOL: printf("symbol: %s\n", current_tok.token.symbol); break; case WORD: printf("identifer: %s\n", current_tok.token.word); break; case INTEGER: printf("integer: %s\n", current_tok.token.integer); break; case FLOATING: printf("floating: %s\n", current_tok.token.floating); break; case QUOTE: printf("quote: '\n"); break; case WSPACE: printf("whitespace\n"); break; case PAREN: printf("paren: %s\n", current_tok.token.parenthesis); break; case EMPTY: printf("this should not be empty\n"); break; case STRING: printf("string: %s\n", current_tok.token.string); break; default: printf("oops, there was an unknown token, check valgrind or gdb\n"); } pop_token(&toks); } release_tokens(&toks); return 0; } #endif