From 3dc20ed2ddc7a11ff20c38fa38de5d840d22c5e7 Mon Sep 17 00:00:00 2001
From: nisstyre56
Date: Sat, 6 Aug 2016 02:40:20 +0000
Subject: [PATCH] first commit

---
 Makefile     |  13 ++
 error.h      |   1 +
 markov.c     | 400 +++++++++++++++++++++++++++++++++++++
 markov.h     |  78 ++++++++
 roadnottaken |  23 +++
 tokenize.c   | 551 +++++++++++++++++++++++++++++++++++++++++++++++++++
 tokenize.h   |  73 +++++++
 7 files changed, 1139 insertions(+)
 create mode 100644 Makefile
 create mode 100644 error.h
 create mode 100644 markov.c
 create mode 100644 markov.h
 create mode 100644 roadnottaken
 create mode 100644 tokenize.c
 create mode 100644 tokenize.h

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..31ed6aa
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
+default: markov.c markov.h # debug build (assertions on, -Werror) -> ./markov_test
+	$(MAKE) lib;
+	$(CC) -g -DTOK_LIB -Wall -Wextra -std=gnu99 -Wpointer-arith -Wmissing-prototypes -Werror -O3 ./markov.c -o markov_test -L. -ltokenize -lmaa -lm -Wl,-rpath,$(CURDIR);
+
+unsafe: markov.c markov.h # release build (-DNDEBUG removes the assert() checks) -> ./markov
+	$(MAKE) lib;
+	$(CC) -DNDEBUG -DTOK_LIB -Wall -std=gnu99 -Wextra -Wpointer-arith -Wmissing-prototypes -O3 ./markov.c -o markov -L. -ltokenize -lmaa -lm -Wl,-rpath,$(CURDIR);
+
+lib: markov.c markov.h tokenize.c tokenize.h # builds libtokenize.so and markov.so in the current directory
+	$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=c99 -O3 ./tokenize.c
+	$(CC) -shared -o libtokenize.so tokenize.o -lmaa;
+	$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=gnu99 -O3 ./markov.c -lmaa -lm
+	$(CC) -shared -o markov.so markov.o -lmaa;
diff --git a/error.h b/error.h
new file mode 100644
index 0000000..24bfcc3
--- /dev/null
+++ b/error.h
@@ -0,0 +1 @@
+#define CHECK(ptr) do { if ((ptr) == NULL) { printf("Failed to allocate memory\n"); exit(EXIT_FAILURE); } } while (0)
diff --git a/markov.c b/markov.c
new file mode 100644
index 0000000..3206182
--- /dev/null
+++ b/markov.c
@@ -0,0 +1,400 @@
+#include <assert.h> /* NOTE(review): the ten system header names were lost when this patch was extracted; they are restored from the calls made below - confirm against the repository */
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include "error.h"
+#include "maa.h"
+#include "tokenize.h"
+#include "markov.h"
+
+#define LEN 50 /* number of n-grams main() asks generate_strings() for */
+
+static char quote[] = "'"; /* text returned for QUOTE tokens; an array so that it is NUL-terminated */
+
+static inline void
+initialize_neighbours(const char *str,
+                      hsh_HashTable graph_table) {
+  /* Register str in graph_table with an empty neighbour table; str must not be present yet */
+  assert(!hsh_retrieve(graph_table, str));
+  neighbours_t *new_neighbours = xmalloc(sizeof (neighbours_t));
+  CHECK(new_neighbours);
+  new_neighbours->neighbours = hsh_create(NULL, NULL);
+  new_neighbours->number = 0;
+  new_neighbours->unique_num = 0;
+  hsh_insert(graph_table, str, new_neighbours);
+  return;
+}
+
+static inline unsigned long
+numberof_keys(graph_t graph) {
+  /* Get the number of unique keys in the graph (the entry count of the string cache) */
+  hsh_Stats stats = hsh_get_stats(graph.cache);
+  unsigned long num = stats->entries;
+  xfree(stats);
+  return num;
+}
+
+static inline unsigned long
+numberof_transitionable(graph_t graph) {
+  /* Get the number of keys with >0 neighbours */
+  /* Only call after graph has been converted (values are read as markov_trans_t) */
+  unsigned long num = 0;
+  void *p, *key;
+  markov_trans_t *val;
+  HSH_ITERATE(graph.graph, p, key, val) {
+    if (val->number > 0) {
+      num++;
+    }
+  }
+  return num;
+}
+
+static const char*
+get_ngram(const char* str,
+          graph_t graph) {
+  /* Try to get a string from the cache.
+   * If it's not already cached, allocate a private copy of it,
+   * register the copy in both the cache and the graph, then return it
+   */
+  hsh_HashTable cache = graph.cache;
+  hsh_HashTable graph_table = graph.graph;
+  const char *exists = hsh_retrieve(cache, str);
+  if (exists) {
+    return exists;
+  }
+  else {
+    /* Add it to the cache and return it */
+    size_t gram_size = strlen(str) + 1;
+    char *new_str = xmalloc(gram_size);
+    CHECK(new_str); /* check the fresh allocation, not the input string */
+    snprintf(new_str, gram_size, "%s", str);
+    hsh_insert(cache, new_str, new_str);
+    initialize_neighbours(new_str, graph_table);
+    return new_str;
+  }
+}
+
+static inline void
+insert_neighbour(const char *left,
+                 const char *neighbour,
+                 graph_t graph) {
+  /* Count one more transition out of left and make sure neighbour has a counter in left's table */
+  neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left);
+  neighbours->number++; /* total transitions seen, repeats included */
+  hsh_HashTable neighbours_table = neighbours->neighbours;
+  CHECK(neighbours->neighbours);
+  if (hsh_retrieve(neighbours_table, neighbour)) {
+    return;
+  }
+  neighbours->unique_num++; /* distinct neighbours only */
+  const char *new_neighbour = get_ngram(neighbour, graph);
+  CHECK(new_neighbour);
+  uint32_t *count = xmalloc(sizeof (uint32_t));
+  CHECK(count);
+  *count = 0; /* increment_neighbour() does the actual counting */
+  hsh_insert(neighbours_table, new_neighbour, count);
+}
+
+static inline void
+increment_neighbour(const char *left,
+                    const char *neighbour,
+                    graph_t graph) {
+  /* Increment the frequency of a given bi-gram.
+   * bi-gram does not necessarily mean a specific thing
+   * it could be pairs of words, pairs of letters, sequences of n letters, and so on
+   */
+  neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left);
+  hsh_HashTable neighbours_hash = neighbours->neighbours;
+  CHECK(neighbours_hash);
+  uint32_t *count = (uint32_t *)hsh_retrieve(neighbours_hash, neighbour);
+  CHECK(count);
+  (*count)++;
+}
+
+static inline neighbours_t*
+get_neighbours(graph_t graph,
+               char *gram) {
+  /* Simply return the table of neighbours corresponding to a given string (pre-conversion view) */
+  neighbours_t *neighbours;
+  neighbours = (neighbours_t *)hsh_retrieve(graph.graph, gram);
+  assert(neighbours);
+  return neighbours;
+}
+
+static inline markov_trans_t*
+get_prob_neighbours(graph_t graph,
+                    char *gram) {
+  /* Return the converted probability transitions (only valid after convert_neighbours) */
+  markov_trans_t *neighbours;
+  neighbours = (markov_trans_t *)hsh_retrieve(graph.graph, gram);
+  assert(neighbours);
+  return neighbours;
+}
+
+static inline void /* Replace gram's neighbour counts in the graph with an array of cumulative probability buckets. */
+convert_neighbours(graph_t graph,
+                   char *gram) {
+  neighbours_t *neighbours = get_neighbours(graph, gram);
+
+  markov_trans_t *result = xmalloc(sizeof (markov_trans_t));
+  CHECK(result);
+  size_t nb_size = neighbours->number; /* total transitions, so at least unique_num slots */
+  hsh_HashTable neighbours_hash = neighbours->neighbours;
+
+  void *key;
+  uint32_t *frequency;
+  void *p;
+  uint32_t index = 0;
+  probability_t transition;
+  probability_t *neighbour_array = xcalloc(nb_size, sizeof (probability_t));
+  CHECK(neighbour_array);
+  HSH_ITERATE(neighbours_hash, p, key, frequency) {
+    transition.frequent.frequency = *frequency;
+    xfree(frequency); /* the counter is no longer needed once copied */
+    transition.frequent.token = key;
+    neighbour_array[index] = transition;
+    index++;
+  }
+  float lower = 0.0;
+  probability_t current;
+  for (uint32_t i = 0; i < neighbours->unique_num; i++) {
+    current.frequent = neighbour_array[i].frequent; /* save before the union member is overwritten */
+    neighbour_array[i].bucket.token = current.frequent.token;
+    neighbour_array[i].bucket.lower = lower;
+    neighbour_array[i].bucket.upper = lower + ((float)neighbour_array[i].frequent.frequency) /
+                                      (neighbours->number);
+    lower = neighbour_array[i].bucket.upper; /* buckets are contiguous: each starts where the previous one ends */
+  }
+  result->transitions = neighbour_array;
+  result->number = neighbours->unique_num;
+  hsh_delete(graph.graph, gram);
+  hsh_insert(graph.graph, gram, result);
+  hsh_destroy(neighbours->neighbours);
+  xfree(neighbours);
+}
+
+static inline void /* Convert every key of the graph; keys are collected first because conversion deletes and re-inserts entries. */
+convert_all_neighbours(graph_t graph) {
+  void *p, *key;
+  char *current_key;
+  unsigned long num_keys = numberof_keys(graph);
+  if (num_keys == 0) {
+    return;
+  }
+  stk_Stack keys = stk_create();
+
+  /* iterate over all keys K, in hash table T */
+  HSH_ITERATE_KEYS(graph.graph, p, key) {
+    stk_push(keys, key);
+  }
+
+  for (uint32_t i = 0; i < num_keys; i++) {
+    current_key = (char *)stk_pop(keys);
+    convert_neighbours(graph, current_key);
+  }
+  stk_destroy(keys);
+}
+
+static inline void
+relate_bigram(const char *a,
+              const char *b,
+              graph_t graph) {
+  /* Update the graph with the information that b follows a */
+  const char* str = get_ngram(a, graph);
+  insert_neighbour(str, b, graph);
+  increment_neighbour(str, b, graph);
+}
+
+static int /* bsearch() comparator: 0 when the float key lies inside the bucket, -1 below it, 1 above it. */
+transition_cmp(const void *keyval,
+               const void *datum) {
+  float chosen_number = *((float *)keyval);
+  probability_t *transition = (probability_t *)datum;
+  float lower = transition->bucket.lower;
+  float upper = transition->bucket.upper;
+  if ((chosen_number >= lower) &&
+      (chosen_number <= upper)) {
+    return 0;
+  }
+  else if (chosen_number < lower) {
+    return -1;
+  }
+  else {
+    return 1;
+  }
+}
+
+static inline char* /* Pick one of the transitionable keys uniformly at random; the key set must not be empty. */
+pick_random_transition(unique_keys_t unique_neighbours) {
+  size_t num = unique_neighbours.number;
+  char **keys = unique_neighbours.keys;
+  size_t selection = (size_t)floor(drand48() * num); /* drand48() < 1.0, so this is 0..num-1; "num - 1" never reached the last key */
+  return keys[selection];
+}
+
+
+static inline char* /* Choose the n-gram that follows start, weighted by the observed frequencies. */
+next_ngram(graph_t graph,
+           char *start,
+           unique_keys_t unique_neighbours) {
+  markov_trans_t *transitions = get_prob_neighbours(graph, start);
+  if (transitions->number == 0) { /* dead end: restart from a random key */
+    return pick_random_transition(unique_neighbours);
+  }
+  probability_t *buckets = transitions->transitions;
+  size_t bucket_size = transitions->number;
+  float chosen = (float)drand48();
+  probability_t *result = bsearch(&chosen, buckets, bucket_size,
+                                  sizeof (probability_t), transition_cmp);
+  if (result == NULL) { /* float rounding can leave chosen above the last upper bound */
+    result = &buckets[bucket_size - 1];
+  }
+  return ((char *)result->bucket.token);
+}
+
+lst_List /* Walk the chain for n steps from start; the returned list holds borrowed string pointers and must be destroyed by the caller. */
+generate_strings(markov_chain_t markov_chain,
+                 char *start,
+                 uint32_t n) {
+  unique_keys_t unique_neighbours = markov_chain.unique;
+  graph_t graph = markov_chain.graph;
+  lst_List result = lst_create();
+  char *current = start;
+  for (uint32_t i = 0; i < n; i++) {
+    lst_append(result, current);
+    current = next_ngram(graph, current, unique_neighbours);
+  }
+  return result;
+}
+
+static inline unique_keys_t
+get_all_keys(graph_t graph) {
+  /* Gets all unique keys with neighbours */
+  /* Should only be called after graph generation */
+  unsigned long number = numberof_transitionable(graph);
+  char **keys = xcalloc(number, sizeof (char *));
+  CHECK(keys);
+  void *p, *key;
+  unique_keys_t result;
+  markov_trans_t *val;
+  uint32_t i = 0;
+  HSH_ITERATE(graph.graph, p, key, val) {
+    if (val->number > 0) {
+      keys[i] = key;
+      i++;
+    }
+  }
+  result.keys = keys;
+  result.number = i;
+  return result;
+}
+
+static inline graph_t
+make_graph(void) {
+  /* Make an initial empty graph */
+  graph_t result;
+  result.cache = hsh_create(NULL, NULL);
+  result.graph = hsh_create(NULL, NULL);
+  return result;
+}
+
+static inline void /* Free a converted graph: bucket arrays, transition records, key strings, then both tables. */
+release_converted_graph(graph_t graph) {
+  void *p, *key;
+  markov_trans_t *datum;
+  /* iterate over all keys K, in hash table and xfree them*/
+  HSH_ITERATE(graph.graph, p, key, datum) {
+    xfree(datum->transitions);
+    xfree(datum);
+    xfree(key); /* the same pointer is stored in the cache, so it is freed once here */
+  }
+  hsh_destroy(graph.cache);
+  hsh_destroy(graph.graph);
+}
+
+markov_chain_t /* Build the chain from a token stream; tokens is a by-value copy, so the caller's stream length is unchanged. */
+build_markov_chain(token_stream tokens) {
+  markov_chain_t result;
+  graph_t graph = make_graph();
+  token_t current;
+  token_t next;
+  while (tokens.length > 1) {
+    current = peek_token(&tokens);
+    pop_token(&tokens);
+    next = peek_token(&tokens); /* the stack top is the later token, so "next" precedes "current" in the input */
+    relate_bigram(token_to_string(next), token_to_string(current), graph);
+  }
+  convert_all_neighbours(graph);
+  result.graph = graph;
+  result.unique = get_all_keys(graph);
+  return result;
+}
+
+char * /* Return the text of a token; exits the process for EMPTY or unknown token types. */
+token_to_string(token_t token) {
+  switch (token.token_type) {
+    case WORD:
+      return (char*)token.token.word;
+      break;
+    case INTEGER:
+      return (char*)token.token.integer;
+      break;
+    case FLOATING:
+      return (char*)token.token.floating;
+      break;
+    case QUOTE:
+      return quote;
+      break;
+    case PAREN:
+      return (char*)token.token.parenthesis;
+      break;
+    case EMPTY:
+      printf("should not be here\n");
+      exit(EXIT_FAILURE);
+      break;
+    case STRING:
+      return (char*)token.token.string;
+      break;
+    default:
+      printf("oops, there was an unknown token, check valgrind or gdb\n");
+      exit(EXIT_FAILURE);
+  }
+}
+
+void /* Release everything build_markov_chain() allocated. */
+release_markov_chain(markov_chain_t chain) {
+  release_converted_graph(chain.graph);
+  xfree(chain.unique.keys);
+  return;
+}
+
+int /* Read text from stdin, build a chain from it and print LEN-1 generated tokens. */
+main (void) {
+  char *test_input = xcalloc(555000 + 1, 1); /* zero-filled with one spare byte, so the input stays NUL-terminated */
+  ssize_t nbytes = read(STDIN_FILENO, test_input, 555000);
+
+  if (nbytes <= 0) { /* read error (-1) or empty input */
+    exit(EXIT_FAILURE);
+  }
+  token_stream test_bigrams_stack = tokenize(test_input, 0, (uint32_t)nbytes);
+  markov_chain_t chain = build_markov_chain(test_bigrams_stack);
+  srand48(time(NULL));
+  lst_List test = generate_strings(chain, token_to_string(peek_token(&test_bigrams_stack)), LEN);
+  lst_pop(test);
+  for (uint32_t i = 0; i < LEN-1; i++) {
+    printf("%s ", (char *)lst_pop(test));
+  }
+  printf("\n");
+  lst_destroy(test);
+  _lst_shutdown();
+  release_markov_chain(chain);
+  xfree(test_input);
+  release_tokens(&test_bigrams_stack);
+  return EXIT_SUCCESS;
+}
diff --git a/markov.h b/markov.h
new file mode 100644
index 0000000..25f5239
--- /dev/null
+++ b/markov.h
@@ -0,0 +1,78 @@
+typedef /* The graph: "cache" interns each n-gram string, "graph" maps an n-gram to its neighbour data. */
+  struct {
+    hsh_HashTable cache;
+    hsh_HashTable graph;
+  }
+  graph_t;
+
+typedef /* Array of the keys that have at least one outgoing transition. */
+  struct {
+    size_t number;
+    char **keys;
+  }
+  unique_keys_t;
+
+typedef /* A converted graph plus its list of transitionable keys. */
+  struct {
+    graph_t graph;
+    unique_keys_t unique;
+  }
+  markov_chain_t;
+
+
+typedef /* Pre-conversion value stored in graph_t.graph. */
+  struct {
+    hsh_HashTable neighbours; /* neighbour string -> uint32_t occurrence counter */
+    size_t number;            /* total transitions recorded, repeats included */
+    size_t unique_num;        /* number of distinct neighbours */
+  }
+  neighbours_t;
+
+/*
+ * Transition types: a raw frequency first, then a probability bucket
+ */
+
+typedef /* Half of probability_t used after conversion: token is chosen when lower <= draw <= upper. */
+  struct {
+    float upper;
+    float lower;
+    const char *token;
+  }
+  bucket_t;
+
+typedef /* Half of probability_t used while counting. */
+  struct {
+    uint32_t frequency;
+    const char *token;
+  }
+  transition_t;
+
+
+typedef /* One array slot; convert_neighbours() rewrites it in place from "frequent" to "bucket". */
+  union {
+    transition_t frequent;
+    bucket_t bucket;
+  }
+  probability_t;
+
+typedef /* Post-conversion value stored in graph_t.graph. */
+  struct {
+    size_t number;              /* number of buckets in transitions */
+    probability_t *transitions;
+  }
+  markov_trans_t;
+
+
+markov_chain_t
+build_markov_chain(token_stream);
+
+char *
+token_to_string(token_t);
+
+void
+release_markov_chain(markov_chain_t);
+
+lst_List
+generate_strings(markov_chain_t,
+                 char *,
+                 uint32_t);
diff --git a/roadnottaken b/roadnottaken
new file mode 100644
index 0000000..6558308
--- /dev/null
+++ b/roadnottaken
@@ -0,0 +1,23 @@
+Two roads diverged in a yellow wood,
+And sorry I could not travel both
+And be one traveler, long I stood
+And looked down one as far as I could
+To where it bent in the undergrowth;
+
+Then took the other, as just as fair,
+And having perhaps the better claim,
+Because it was grassy and wanted wear;
+Though as for that the passing there
+Had worn them really about the same,
+
+And both that morning equally lay
+In leaves no step had trodden black.
+Oh, I kept the first for another day!
+Yet knowing how way leads on to way,
+I doubted if I should ever come back.
+
+I shall be telling this with a sigh
+Somewhere ages and ages hence:
+Two roads diverged in a wood, and I—
+I took the one less traveled by,
+And that has made all the difference.
diff --git a/tokenize.c b/tokenize.c
new file mode 100644
index 0000000..680fe95
--- /dev/null
+++ b/tokenize.c
@@ -0,0 +1,551 @@
+#include <assert.h> /* NOTE(review): the eight system header names were lost when this patch was extracted; they are restored from the calls made below - confirm against the repository */
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "error.h"
+#include "maa.h"
+#include "tokenize.h"
+
+/*
+ * This is a basic s-expression tokenizer
+ * it also tokenizes things like number, string, and symbol literals
+ */
+
+const token_t nulltok = {
+  .token_type = EMPTY,
+  {
+    .null_token=false
+  }
+};
+
+static const token_t quote_tok = {
+  .token_type = QUOTE,
+  .token= {
+    .quote=true
+  }
+};
+
+static const token_t left_paren = {
+  .token_type = PAREN,
+  .token = {
+    .parenthesis="("
+  }
+};
+
+static const token_t right_paren = {
+  .token_type = PAREN,
+  .token = {
+    .parenthesis=")"
+  }
+};
+
+static inline const char *
+string_head(uint32_t n,
+            const char *in,
+            char *out) {
+  /* Copy the first n characters of in into out and NUL-terminate it.
+   * out must be large enough to store n characters plus the terminator
+   */
+#ifndef NDEBUG
+  size_t in_len = strlen(in);
+#endif
+  assert((n > 0 && n <= in_len));
+  int iserror = snprintf(out, (size_t)n+1 , "%s", in);
+
+  assert((iserror != -1) && ((size_t)iserror == in_len));
+
+  if (iserror == -1) {
+    printf("Out of memory");
+    exit(EXIT_FAILURE);
+  }
+  return (const char*)out;
+}
+
+static inline token_t /* Bundle a value and its type tag into a token_t. */
+make_token(token_val_t val,
+           tok_t toktype) {
+  token_t result;
+  result.token_type = toktype;
+  result.token = val;
+  return result;
+}
+
+bool
+push_token(token_stream *tokens,
+           token_t token) {
+  /*
+   * Append token to the stream, growing the array when it is full
+   */
+
+  size_t len;
+  size_t max;
+
+  CHECK(tokens);
+
+  len = tokens->length;
+  max = tokens->max_length;
+
+  assert(len <= max);
+  assert(max > 0);
+
+  if (len == max) {
+    /* We've reached the maximum stack size
+     * So we must multiply the capacity by GROWTH_FACTOR
+     */
+    token_t *new_tokens = xrealloc(tokens->tokens, sizeof (token_t) * (max * GROWTH_FACTOR));
+    if (!new_tokens) {
+      printf("Could not allocate enough memory for the token stack\n");
+      exit(EXIT_FAILURE);
+    }
+    tokens->tokens = new_tokens;
+    tokens->max_length = max * GROWTH_FACTOR;
+    tokens->tokens[len] = token;
+    tokens->length++;
+    return true;
+  }
+  tokens->tokens[len] = token;
+  tokens->length++;
+  return true;
+}
+
+bool /* Drop the top token; the stream must not be empty. Token strings stay owned by the memo table. */
+pop_token(token_stream *tokens) {
+  size_t len;
+  CHECK(tokens);
+
+  len = tokens->length;
+
+  assert(len != 0);
+  len--;
+  CHECK(tokens->tokens);
+
+  tokens->length--;
+  return true;
+}
+
+inline token_t
+peek_token(token_stream *tokens) {
+  /*
+   * Return the top token without removing it (nulltok if the stream is empty or inconsistent)
+   */
+  CHECK(tokens); /* must run before tokens is dereferenced */
+  size_t len = tokens->length;
+  size_t max = tokens->max_length;
+  assert(len != 0);
+
+  if (len == 0 || len > max) {
+    return nulltok;
+  }
+  return tokens->tokens[len-1];
+}
+
+static inline uint32_t
+match_int(source_t source,
+          uint32_t begin,
+          const uint32_t length) {
+  /* Return false if there is no match
+   * otherwise return the position of the end of the match + 1
+   */
+  uint32_t i = begin;
+  uint32_t test;
+  CHECK(source);
+  assert(length > 0);
+
+  if (source[i] == '+' ||
+      source[i] == '-') {
+    i++;
+  }
+  test = i;
+  while (i < length &&
+         isdigit((unsigned char)source[i])) { /* <ctype.h> needs an unsigned char value */
+    i++;
+  }
+  if (i == test)
+    return false;
+  return i;
+}
+
+static inline uint32_t
+match_float(source_t source,
+            uint32_t begin,
+            const uint32_t length) {
+  /* Return false if there is no match
+   * otherwise:
+   * if there is a leading decimal point and then a valid int match:
+   * return the position of the end of the match
+   * if there is a leading valid int match:
+   * but no decimal point match after that:
+   * return false
+   * if there is a decimal point match and then a valid int match:
+   * return the position of the match
+   * if there is no valid int match:
+   * return false
+   * ALWAYS returns the position + 1 to avoid confusion with false (which is a valid index)
+   */
+  uint32_t i, leading_int_match, trailing_int_match;
+  CHECK(source);
+  assert(length > 0);
+
+  i = begin;
+  leading_int_match = match_int(source, i, length);
+
+  if (leading_int_match) {
+    i = leading_int_match;
+  }
+
+  assert(i <= length);
+
+  if (source[i] != '.' ||
+      source[i] == '+' ||
+      source[i] == '-') {
+    if (((i+1) < length) && /* Make sure there is at least two characters to look at */
+        ((source[i] == '+') ||
+         (source[i] == '-'))
+        && (source[i+1] == '.')) {
+      i++;
+    }
+    else {
+      return false;
+    }
+  }
+  i++;
+
+  trailing_int_match = match_int(source, i, length);
+  if (trailing_int_match) {
+    return trailing_int_match;
+  }
+  return false;
+}
+
+static inline uint32_t
+match_word(source_t source,
+           uint32_t begin,
+           const uint32_t length) {
+
+  /* Return false if there is no match
+   * if there is a match for any characters that are not:
+   * whitespace
+   * a parenthesis ( )
+   * (braces { } and square brackets [ ] are NOT excluded by the
+   * loop below, so they currently end up inside words)
+   * then return the position of the match + 1
+   * if there is nothing else to match:
+   * return false
+   */
+  uint32_t i = begin;
+  CHECK(source);
+  assert(length > 0);
+
+  while (i < length &&
+         !(source[i] == '(' ||
+           source[i] == ')' ||
+           isspace((unsigned char)source[i]))) {
+    i++;
+  }
+
+  if (i == begin) {
+    return false;
+  }
+  assert(i <= length);
+  return i;
+}
+
+static inline uint32_t /* NOTE(review): despite the name this returns true as soon as any non-space byte is seen, and the opening quote itself counts - confirm the intent */
+is_empty_string(const char *source,
+                uint32_t length) {
+  int allspace = false;
+  uint32_t i = 0;
+  if (source[i] != '\"') {
+    return false;
+  }
+  for (; i < length; i++) {
+    if (!isspace((unsigned char)source[i])) {
+      allspace = true;
+    }
+  }
+  /*if (allspace) {
+    printf("Actually found an empty string! Of length %d\n", i);
+  }*/
+  return allspace;
+}
+
+
+static inline uint32_t /* Match a "..." literal of at most MAX_STRING_SIZE characters; returns the index after the closing quote, or false. */
+match_string(source_t source,
+             uint32_t begin,
+             const uint32_t length) {
+  CHECK(source);
+  (void)length;
+  assert(length > 0);
+  uint32_t i = begin;
+  if (source[i] != '\"') {
+    return false;
+  }
+  i++;
+  while ((i < length) && /* bounds test first, then look at the byte */
+         (source[i] != '\"') &&
+         (i < (begin + MAX_STRING_SIZE))) {
+    i++;
+  }
+  if ((i != (begin+1)) &&
+      (i < length) &&
+      (source[i] == '\"')) {
+    return i+1;
+  }
+  return false;
+}
+
+static inline void /* Copy source[begin..position) into token_val, which must hold position - begin + 1 bytes. */
+extract_token(uint32_t position,
+              uint32_t begin,
+              const source_t source,
+              const char *token_val) {
+  assert(position > begin);
+  string_head(position - begin,
+              &source[begin],
+              (char *)token_val);
+}
+
+token_stream
+tokenize(source_t source,
+         uint32_t begin,
+         const uint32_t length) {
+  /*
+   * Split source[0..length) into a stack of tokens.
+   * Remember to free everything from this struct with release_tokens():
+   * token_stack.tokens will not necessarily be equal to tokens after
+   * this function has run, because push_token() may reallocate it
+   */
+  uint32_t position = begin;
+  uint32_t allspace = false;
+  const char *current_token_val;
+  token_stream token_stack;
+  token_val_t current_token;
+  token_t *tokens = xcalloc(STACK_SIZE, sizeof (token_t));
+
+  hsh_HashTable token_memo = hsh_create(NULL, NULL);
+
+  assert(begin == 0);
+  assert(length > 0);
+  CHECK(source);
+
+  token_stack.length = 0;
+  token_stack.max_length = STACK_SIZE;
+  token_stack.tokens = tokens;
+  token_stack.memo = token_memo;
+  char lookahead = '\0';
+  assert(STACK_SIZE > 0);
+
+  while (begin < length && source[begin]) {
+    /* Classify the text at source[begin] and advance begin past it.
+     * The matchers are tried in a fixed order: parentheses, whitespace,
+     * string, float, int, apostrophe, word. For string/number/word matches
+     * the byte at source[position] is temporarily overwritten with NUL so
+     * that source+begin can be looked up in the memo table, so the buffer
+     * must be writable and hold at least length + 1 bytes.
+     */
+    if (source[begin] == '(') {
+      /*Matched a left paren */
+      position = begin + 1;
+      push_token(&token_stack, left_paren);
+    }
+    else if (source[begin] == ')') {
+      /*Matched a right paren */
+      position = begin + 1;
+      push_token(&token_stack, right_paren);
+    }
+    else if (isspace((unsigned char)source[begin])) {
+      position = begin + 1;
+      /* Matched a whitespace character */
+    }
+    else if ((position = match_string(source, begin, length))) {
+      /* Possibly matched a string
+       * First look for closing "
+       * Then look for a newline to close it if no "
+       * Then stop after some large constant of characters maybe?
+       * We're dealing with real text so people might forget to close
+       * quotations, so we have to be clever about it and use heuristics (for performance)
+       */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.string = current_token_val;
+        source[position] = lookahead;
+        allspace = is_empty_string(current_token_val, (position - begin) + 1); /* same decision as on first sight */
+      }
+      else {
+        assert(position > begin);
+        current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
+        CHECK(current_token_val);
+        extract_token(position, begin, source, current_token_val);
+        source[position] = lookahead;
+        current_token.string = current_token_val;
+        /* Always register the copy in the memo: release_tokens() frees only
+         * memoized strings and the lookup above can only hit inserted ones.
+         * Skipping the insert would leak the copy and defeat the
+         * deduplication the other token kinds get.
+         */
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        allspace = is_empty_string(current_token_val, (position - begin) + 1);
+      }
+      if (allspace) {
+        push_token(&token_stack, make_token(current_token, STRING));
+      }
+    }
+    else if ((position = match_float(source, begin, length))) {
+      /* Matched a float */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.floating = current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        assert(position > begin);
+        current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
+        CHECK(current_token_val);
+        extract_token(position, begin, source, current_token_val);
+        source[position] = lookahead;
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.floating = current_token_val;
+      }
+      push_token(&token_stack, make_token(current_token, FLOATING));
+    }
+    else if ((position = match_int(source, begin, length))) {
+      /* Matched an int */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.integer = current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        assert(position > begin);
+        assert(position <= length);
+
+        current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
+        CHECK(current_token_val);
+        extract_token(position, begin, source, current_token_val);
+        source[position] = lookahead;
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.integer = current_token_val;
+      }
+      push_token(&token_stack, make_token(current_token, INTEGER));
+    }
+    else if (source[begin] == '\'') {
+      /* Matched a quote (apostrophe) */
+      position = begin + 1;
+      push_token(&token_stack, quote_tok);
+    }
+    else if ((position = match_word(source, begin, length))) {
+      /* Matched a word */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.word = current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        assert(position > begin);
+        assert(position <= length);
+
+        current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
+        CHECK(current_token_val);
+        extract_token(position, begin, source, current_token_val);
+        source[position] = lookahead;
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.word = current_token_val;
+      }
+      push_token(&token_stack, make_token(current_token, WORD));
+      /* Matched a word */
+    }
+    else if (position <= begin) {
+      printf("Source is too large to read\n");
+      exit(EXIT_FAILURE);
+    }
+    else {
+      printf("Unmatched token\n");
+      exit(EXIT_FAILURE);
+    }
+    begin = position;
+  }
+
+  return token_stack;
+}
+
+int
+free_token(const void *key,
+           const void *val) {
+  /* hsh_iterate() callback: frees one memoized string. key is unused because key and val point to the same data */
+  (void)key;
+  xfree((char *)val);
+  return true;
+}
+
+bool
+release_tokens(token_stream *tokens) {
+  /* Release the token array, then every memoized token string,
+   * then the memo table itself
+   */
+  CHECK(tokens);
+  CHECK(tokens->tokens);
+  assert(tokens->max_length > 0);
+  xfree(tokens->tokens);
+  hsh_iterate(tokens->memo, free_token);
+
+  hsh_destroy(tokens->memo);
+  return true;
+}
+
+#ifndef TOK_LIB
+int main(void) {
+  char *source_code = calloc(111000 + 1, 1); /* zero-filled with one spare byte, so the input stays NUL-terminated */
+  ssize_t nbytes = source_code ? read(STDIN_FILENO, source_code, 111000) : -1;
+  if (nbytes <= 0) { /* allocation failure, read error or empty input */
+    exit(EXIT_FAILURE);
+  }
+  token_stream toks = tokenize(source_code, 0, (uint32_t)nbytes);
+  token_t current_tok;
+  while (toks.length > 0) {
+    current_tok = peek_token(&toks);
+    switch (current_tok.token_type) {
+      /* There is no SYMBOL token type: tok_t and token_val_t in tokenize.h
+       * declare neither SYMBOL nor a .symbol member, so a case for it
+       * cannot compile and is intentionally absent here. */
+      case WORD:
+        printf("identifer: %s\n", current_tok.token.word);
+        break;
+      case INTEGER:
+        printf("integer: %s\n", current_tok.token.integer);
+        break;
+      case FLOATING:
+        printf("floating: %s\n", current_tok.token.floating);
+        break;
+      case QUOTE:
+        printf("quote: '\n");
+        break;
+      case WSPACE:
+        printf("whitespace\n");
+        break;
+      case PAREN:
+        printf("paren: %s\n", current_tok.token.parenthesis);
+        break;
+      case EMPTY:
+        printf("this should not be empty\n");
+        break;
+      case STRING:
+        printf("string: %s\n", current_tok.token.string);
+        break;
+      default:
+        printf("oops, there was an unknown token, check valgrind or gdb\n");
+    }
+    pop_token(&toks);
+  }
+  release_tokens(&toks);
+  return 0;
+}
+#endif
diff --git a/tokenize.h b/tokenize.h
new file mode 100644
index 0000000..1844944
--- /dev/null
+++ b/tokenize.h
@@ -0,0 +1,73 @@
+#define STACK_SIZE 4096 /* initial capacity of a token_stream, in tokens */
+#define GROWTH_FACTOR 2 /* capacity multiplier used by push_token() when the array is full */
+#define MAX_STRING_SIZE 30 /* longest "..." literal match_string() will scan for */
+
+typedef char* source_t; /* writable input buffer; tokenize() temporarily patches bytes in it */
+
+typedef enum {
+  WORD = 1,
+  INTEGER = 2,
+  FLOATING = 3,
+  QUOTE = 4,
+  WSPACE = 5,
+  PAREN = 6 ,
+  EMPTY = 7,
+  STRING = 8
+} tok_t;
+
+typedef union { /* payload of a token; which member is valid depends on token_t.token_type */
+  const char *word;
+  const char *integer;
+  const char *floating;
+  const char *parenthesis;
+  const char *string;
+  bool quote;
+  bool null_token;
+} token_val_t;
+
+typedef struct {
+  tok_t token_type;
+  token_val_t token;
+} token_t;
+
+typedef struct {
+  size_t length; /* Number of current elements */
+  size_t max_length; /* Maximum length of the stack */
+  token_t *tokens;
+  hsh_HashTable memo; /* token text -> heap copy of it; release_tokens() frees the copies */
+} token_stream;
+
+bool
+push_token(token_stream*, token_t);
+
+bool
+pop_token(token_stream*);
+
+token_t
+peek_token(token_stream*);
+
+token_stream
+tokenize(source_t, uint32_t, const uint32_t);
+
+bool
+release_tokens(token_stream*);
+
+#ifndef TOK_LIB
+static uint32_t
+match_int(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_float(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_word(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_string(source_t, uint32_t, const uint32_t);
+#endif
+
+int
+free_token(const void *,
+           const void *);
+token_t
+testfunc(void); /* NOTE(review): declared here, but no definition appears anywhere in this patch */