commit
3dc20ed2dd
7 changed files with 1139 additions and 0 deletions
@ -0,0 +1,13 @@ |
|||||
|
default: markov.c markov.h |
||||
|
$(MAKE) lib; |
||||
|
$(CC) -g -DTOK_LIB -Wall -Wextra -std=gnu99 -Wpointer-arith -Wmissing-prototypes -Werror -lmaa -lm -L. -ltokenize -O3 ./markov.c -o markov_test -Wl,-rpath,/home/wes/markov; |
||||
|
|
||||
|
unsafe: markov.c markov.h |
||||
|
$(MAKE) lib; |
||||
|
$(CC) -DNDEBUG -DTOK_LIB -Wall -std=gnu99 -Wextra -Wpointer-arith -Wmissing-prototypes -lmaa -lm -L. -ltokenize -O3 ./markov.c -o markov -Wl,-rpath,/home/wes/markov; |
||||
|
|
||||
|
lib: markov.c markov.h tokenize.c tokenize.h |
||||
|
$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=c99 -O3 ./tokenize.c |
||||
|
$(CC) -shared -o libtokenize.so tokenize.o -lmaa; |
||||
|
$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=gnu99 -O3 ./markov.c -lmaa -lm |
||||
|
$(CC) -shared -o markov.so markov.o -lmaa; |
@ -0,0 +1 @@ |
|||||
|
/* Abort with a diagnostic when ptr is NULL.
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (the old bare-if form broke unbraced if/else at call sites). */
#define CHECK(ptr) do { if ((ptr) == NULL) { printf("Failed to allocate memory\n"); exit(EXIT_FAILURE); } } while (0)
@ -0,0 +1,400 @@ |
|||||
|
#include <time.h> |
||||
|
#include <math.h> |
||||
|
#include <stdint.h> |
||||
|
#include <stdio.h> |
||||
|
#include <unistd.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <ctype.h> |
||||
|
#include <stdbool.h> |
||||
|
#include <string.h> |
||||
|
#include <assert.h> |
||||
|
#include "error.h" |
||||
|
#include "maa.h" |
||||
|
#include "tokenize.h" |
||||
|
#include "markov.h" |
||||
|
|
||||
|
#define LEN 50 |
||||
|
|
||||
|
static char quote = '\''; |
||||
|
|
||||
|
static inline void |
||||
|
initialize_neighbours(const char *str, |
||||
|
hsh_HashTable graph_table) { |
||||
|
/* Initialize the table of neighbours corresponding to some string in the markov graph */ |
||||
|
assert(!hsh_retrieve(graph_table, str)); |
||||
|
neighbours_t *new_neighbours = xmalloc(sizeof (neighbours_t)); |
||||
|
CHECK(new_neighbours); |
||||
|
new_neighbours->neighbours = hsh_create(NULL, NULL); |
||||
|
new_neighbours->number = 0; |
||||
|
new_neighbours->unique_num = 0; |
||||
|
hsh_insert(graph_table, str, new_neighbours); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
static inline unsigned long |
||||
|
numberof_keys(graph_t graph) { |
||||
|
/* Get the number of unique keys in the graph */ |
||||
|
hsh_Stats stats = hsh_get_stats(graph.cache); |
||||
|
unsigned long num = stats->entries; |
||||
|
xfree(stats); |
||||
|
return num; |
||||
|
} |
||||
|
|
||||
|
static inline unsigned long |
||||
|
numberof_transitionable(graph_t graph) { |
||||
|
/* Get the number of keys with >0 neighbours */ |
||||
|
/* Only call after graph has been converted */ |
||||
|
unsigned long num = 0; |
||||
|
void *p, *key; |
||||
|
markov_trans_t *val; |
||||
|
HSH_ITERATE(graph.graph, p, key, val) { |
||||
|
if (val->number > 0) { |
||||
|
num++; |
||||
|
} |
||||
|
} |
||||
|
return num; |
||||
|
} |
||||
|
|
||||
|
static const char* |
||||
|
get_ngram(const char* str, |
||||
|
graph_t graph) { |
||||
|
/* Try to get a string from the cache.
|
||||
|
* If it's not already cached, allocate the memory for it |
||||
|
* then return the freshly cached string |
||||
|
*/ |
||||
|
hsh_HashTable cache = graph.cache; |
||||
|
hsh_HashTable graph_table = graph.graph; |
||||
|
const char *exists = hsh_retrieve(cache, str); |
||||
|
if (exists) { |
||||
|
return exists; |
||||
|
} |
||||
|
else { |
||||
|
/* Add it to the cache and return it */ |
||||
|
size_t gram_size = strlen(str) + 1; |
||||
|
char *new_str = xmalloc(gram_size); |
||||
|
CHECK(str); |
||||
|
snprintf(new_str, gram_size, "%s", str); |
||||
|
hsh_insert(cache, new_str, new_str); |
||||
|
initialize_neighbours(new_str, graph_table); |
||||
|
return new_str; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
insert_neighbour(const char *left, |
||||
|
const char *neighbour, |
||||
|
graph_t graph) { |
||||
|
/* Insert a neighbour into the table of neighbours for a given key */ |
||||
|
neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left); |
||||
|
neighbours->number++; |
||||
|
hsh_HashTable neighbours_table = neighbours->neighbours; |
||||
|
CHECK(neighbours->neighbours); |
||||
|
if (hsh_retrieve(neighbours_table, neighbour)) { |
||||
|
return; |
||||
|
} |
||||
|
neighbours->unique_num++; |
||||
|
const char *new_neighbour = get_ngram(neighbour, graph); |
||||
|
CHECK(new_neighbour); |
||||
|
uint32_t *count = xmalloc(sizeof (uint32_t)); |
||||
|
CHECK(count); |
||||
|
*count = 0; |
||||
|
hsh_insert(neighbours_table, new_neighbour, count); |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
increment_neighbour(const char *left, |
||||
|
const char *neighbour, |
||||
|
graph_t graph) { |
||||
|
/* Increment the frequency of a given bi-gram.
|
||||
|
* bi-gram does not necessarily mean a specific thing |
||||
|
* it could be pairs of words, pairs of letters, sequences of n letters, and so on |
||||
|
*/ |
||||
|
neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left); |
||||
|
hsh_HashTable neighbours_hash = neighbours->neighbours; |
||||
|
CHECK(neighbours_hash); |
||||
|
uint32_t *count = (uint32_t *)hsh_retrieve(neighbours_hash, neighbour); |
||||
|
CHECK(count); |
||||
|
(*count)++; |
||||
|
} |
||||
|
|
||||
|
static inline neighbours_t* |
||||
|
get_neighbours(graph_t graph, |
||||
|
char *gram) { |
||||
|
/* Simply return the table of neighbours corresponding to a given string */ |
||||
|
neighbours_t *neighbours; |
||||
|
neighbours = (neighbours_t *)hsh_retrieve(graph.graph, gram); |
||||
|
assert(neighbours); |
||||
|
return neighbours; |
||||
|
} |
||||
|
|
||||
|
static inline markov_trans_t* |
||||
|
get_prob_neighbours(graph_t graph, |
||||
|
char *gram) { |
||||
|
/* Return the converted probability transitions */ |
||||
|
markov_trans_t *neighbours; |
||||
|
neighbours = (markov_trans_t *)hsh_retrieve(graph.graph, gram); |
||||
|
assert(neighbours); |
||||
|
return neighbours; |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
convert_neighbours(graph_t graph, |
||||
|
char *gram) { |
||||
|
neighbours_t *neighbours = get_neighbours(graph, gram); |
||||
|
|
||||
|
markov_trans_t *result = xmalloc(sizeof (markov_trans_t)); |
||||
|
CHECK(result); |
||||
|
size_t nb_size = neighbours->number; |
||||
|
hsh_HashTable neighbours_hash = neighbours->neighbours; |
||||
|
|
||||
|
void *key; |
||||
|
uint32_t *frequency; |
||||
|
void *p; |
||||
|
uint32_t index = 0; |
||||
|
probability_t transition; |
||||
|
probability_t *neighbour_array = xcalloc(sizeof (probability_t), nb_size); |
||||
|
CHECK(neighbour_array); |
||||
|
HSH_ITERATE(neighbours_hash, p, key, frequency) { |
||||
|
transition.frequent.frequency = *frequency; |
||||
|
xfree(frequency); |
||||
|
transition.frequent.token = key; |
||||
|
neighbour_array[index] = transition; |
||||
|
index++; |
||||
|
} |
||||
|
float lower = 0.0; |
||||
|
probability_t current; |
||||
|
for (uint32_t i = 0; i < neighbours->unique_num; i++) { |
||||
|
current.frequent = neighbour_array[i].frequent; |
||||
|
neighbour_array[i].bucket.token = current.frequent.token; |
||||
|
neighbour_array[i].bucket.lower = lower; |
||||
|
neighbour_array[i].bucket.upper = lower + ((float)neighbour_array[i].frequent.frequency) / |
||||
|
(neighbours->number); |
||||
|
lower = neighbour_array[i].bucket.upper; |
||||
|
} |
||||
|
result->transitions = neighbour_array; |
||||
|
result->number = neighbours->unique_num; |
||||
|
hsh_delete(graph.graph, gram); |
||||
|
hsh_insert(graph.graph, gram, result); |
||||
|
hsh_destroy(neighbours->neighbours); |
||||
|
xfree(neighbours); |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
convert_all_neighbours(graph_t graph) { |
||||
|
void *p, *key; |
||||
|
char *current_key; |
||||
|
unsigned long num_keys = numberof_keys(graph); |
||||
|
if (num_keys == 0) { |
||||
|
return; |
||||
|
} |
||||
|
stk_Stack keys = stk_create(); |
||||
|
|
||||
|
/* iterate over all keys K, in hash table T */ |
||||
|
HSH_ITERATE_KEYS(graph.graph, p, key) { |
||||
|
stk_push(keys, key); |
||||
|
} |
||||
|
|
||||
|
for (uint32_t i = 0; i < num_keys; i++) { |
||||
|
current_key = (char *)stk_pop(keys); |
||||
|
convert_neighbours(graph, current_key); |
||||
|
} |
||||
|
stk_destroy(keys); |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
relate_bigram(const char *a, |
||||
|
const char *b, |
||||
|
graph_t graph) { |
||||
|
/* Update the graph with the information that b follows a */ |
||||
|
const char* str = get_ngram(a, graph); |
||||
|
insert_neighbour(str, b, graph); |
||||
|
increment_neighbour(str, b, graph); |
||||
|
} |
||||
|
|
||||
|
static int |
||||
|
transition_cmp(const void *keyval, |
||||
|
const void *datum) { |
||||
|
float chosen_number = *((float *)keyval); |
||||
|
probability_t *transition = (probability_t *)datum; |
||||
|
float lower = transition->bucket.lower; |
||||
|
float upper = transition->bucket.upper; |
||||
|
if ((chosen_number >= lower) && |
||||
|
(chosen_number <= upper)) { |
||||
|
return 0; |
||||
|
} |
||||
|
else if (chosen_number < lower) { |
||||
|
return -1; |
||||
|
} |
||||
|
else { |
||||
|
return 1; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static inline char* |
||||
|
pick_random_transition(unique_keys_t unique_neighbours) { |
||||
|
size_t num = unique_neighbours.number; |
||||
|
char **keys = unique_neighbours.keys; |
||||
|
size_t selection = (size_t)floor(drand48() * (num - 1)); |
||||
|
return keys[selection]; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
static inline char* |
||||
|
next_ngram(graph_t graph, |
||||
|
char *start, |
||||
|
unique_keys_t unique_neighbours) { |
||||
|
markov_trans_t *transitions = get_prob_neighbours(graph, start); |
||||
|
if (transitions->number == 0) { |
||||
|
return pick_random_transition(unique_neighbours); |
||||
|
} |
||||
|
probability_t *buckets = transitions->transitions; |
||||
|
size_t bucket_size = transitions->number; |
||||
|
float chosen = (float)drand48(); |
||||
|
probability_t *result = bsearch(&chosen, |
||||
|
buckets, |
||||
|
bucket_size, |
||||
|
sizeof (probability_t), |
||||
|
transition_cmp); |
||||
|
return ((char *)result->bucket.token); |
||||
|
} |
||||
|
|
||||
|
lst_List |
||||
|
generate_strings(markov_chain_t markov_chain, |
||||
|
char *start, |
||||
|
uint32_t n) { |
||||
|
unique_keys_t unique_neighbours = markov_chain.unique; |
||||
|
graph_t graph = markov_chain.graph; |
||||
|
lst_List result = lst_create(); |
||||
|
char *current = start; |
||||
|
for (uint32_t i = 0; i < n; i++) { |
||||
|
lst_append(result, current); |
||||
|
current = next_ngram(graph, current, unique_neighbours); |
||||
|
} |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
static inline unique_keys_t |
||||
|
get_all_keys(graph_t graph) { |
||||
|
/* Gets all unique keys with neighbours */ |
||||
|
/* Should only be called after graph generation */ |
||||
|
unsigned long number = numberof_transitionable(graph); |
||||
|
char **keys = xcalloc(sizeof (char *), number); |
||||
|
CHECK(keys); |
||||
|
void *p, *key; |
||||
|
unique_keys_t result; |
||||
|
markov_trans_t *val; |
||||
|
uint32_t i = 0; |
||||
|
HSH_ITERATE(graph.graph, p, key, val) { |
||||
|
if (val->number > 0) { |
||||
|
keys[i] = key; |
||||
|
i++; |
||||
|
} |
||||
|
} |
||||
|
result.keys = keys; |
||||
|
result.number = i; |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
static inline graph_t |
||||
|
make_graph(void) { |
||||
|
/* Make an initial empty graph */ |
||||
|
graph_t result; |
||||
|
result.cache = hsh_create(NULL, NULL); |
||||
|
result.graph = hsh_create(NULL, NULL); |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
release_converted_graph(graph_t graph) { |
||||
|
void *p, *key; |
||||
|
markov_trans_t *datum; |
||||
|
/* iterate over all keys K, in hash table and xfree them*/ |
||||
|
HSH_ITERATE(graph.graph, p, key, datum) { |
||||
|
xfree(datum->transitions); |
||||
|
xfree(datum); |
||||
|
xfree(key); |
||||
|
} |
||||
|
hsh_destroy(graph.cache); |
||||
|
hsh_destroy(graph.graph); |
||||
|
} |
||||
|
|
||||
|
markov_chain_t |
||||
|
build_markov_chain(token_stream tokens) { |
||||
|
markov_chain_t result; |
||||
|
graph_t graph = make_graph(); |
||||
|
token_t current; |
||||
|
token_t next; |
||||
|
while (tokens.length > 1) { |
||||
|
current = peek_token(&tokens); |
||||
|
pop_token(&tokens); |
||||
|
next = peek_token(&tokens); |
||||
|
relate_bigram(token_to_string(next), token_to_string(current), graph); |
||||
|
} |
||||
|
convert_all_neighbours(graph); |
||||
|
result.graph = graph; |
||||
|
result.unique = get_all_keys(graph); |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
char * |
||||
|
token_to_string(token_t token) { |
||||
|
switch (token.token_type) { |
||||
|
case WORD: |
||||
|
return (char*)token.token.word; |
||||
|
break; |
||||
|
case INTEGER: |
||||
|
return (char*)token.token.integer; |
||||
|
break; |
||||
|
case FLOATING: |
||||
|
return (char*)token.token.floating; |
||||
|
break; |
||||
|
case QUOTE: |
||||
|
return "e; |
||||
|
break; |
||||
|
case PAREN: |
||||
|
return (char*)token.token.parenthesis; |
||||
|
break; |
||||
|
case EMPTY: |
||||
|
printf("should not be here\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
break; |
||||
|
case STRING: |
||||
|
return (char*)token.token.string; |
||||
|
break; |
||||
|
default: |
||||
|
printf("oops, there was an unknown token, check valgrind or gdb\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
void |
||||
|
release_markov_chain(markov_chain_t chain) { |
||||
|
release_converted_graph(chain.graph); |
||||
|
xfree(chain.unique.keys); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
int |
||||
|
main (void) { |
||||
|
void *test_input = xmalloc(555000); |
||||
|
size_t nbytes = read(STDIN_FILENO, test_input, 555000); |
||||
|
|
||||
|
if (nbytes == 0) { |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
token_stream test_bigrams_stack = tokenize(test_input, 0, nbytes); |
||||
|
markov_chain_t chain = build_markov_chain(test_bigrams_stack); |
||||
|
srand48(time(NULL)); |
||||
|
lst_List test = generate_strings(chain, token_to_string(peek_token(&test_bigrams_stack)), LEN); |
||||
|
lst_pop(test); |
||||
|
for (uint32_t i = 0; i < LEN-1; i++) { |
||||
|
printf("%s ", (char *)lst_pop(test)); |
||||
|
} |
||||
|
printf("\n"); |
||||
|
lst_destroy(test); |
||||
|
_lst_shutdown(); |
||||
|
release_markov_chain(chain); |
||||
|
xfree(test_input); |
||||
|
release_tokens(&test_bigrams_stack); |
||||
|
return EXIT_SUCCESS; |
||||
|
} |
@ -0,0 +1,78 @@ |
|||||
|
typedef |
||||
|
struct { |
||||
|
hsh_HashTable cache; |
||||
|
hsh_HashTable graph; |
||||
|
} |
||||
|
graph_t; |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
size_t number; |
||||
|
char **keys; |
||||
|
} |
||||
|
unique_keys_t; |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
graph_t graph; |
||||
|
unique_keys_t unique; |
||||
|
} |
||||
|
markov_chain_t; |
||||
|
|
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
hsh_HashTable neighbours; |
||||
|
size_t number; |
||||
|
size_t unique_num; |
||||
|
} |
||||
|
neighbours_t; |
||||
|
|
||||
|
/*
|
||||
|
* Transition types for various reasons |
||||
|
*/ |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
float upper; |
||||
|
float lower; |
||||
|
const char *token; |
||||
|
} |
||||
|
bucket_t; |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
uint32_t frequency; |
||||
|
const char *token; |
||||
|
} |
||||
|
transition_t; |
||||
|
|
||||
|
|
||||
|
typedef |
||||
|
union { |
||||
|
transition_t frequent; |
||||
|
bucket_t bucket; |
||||
|
} |
||||
|
probability_t; |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
size_t number; |
||||
|
probability_t *transitions; |
||||
|
} |
||||
|
markov_trans_t; |
||||
|
|
||||
|
|
||||
|
markov_chain_t |
||||
|
build_markov_chain(token_stream); |
||||
|
|
||||
|
char * |
||||
|
token_to_string(token_t); |
||||
|
|
||||
|
void |
||||
|
release_markov_chain(markov_chain_t); |
||||
|
|
||||
|
lst_List |
||||
|
generate_strings(markov_chain_t, |
||||
|
char *, |
||||
|
uint32_t); |
@ -0,0 +1,23 @@ |
|||||
|
Two roads diverged in a yellow wood, |
||||
|
And sorry I could not travel both |
||||
|
And be one traveler, long I stood |
||||
|
And looked down one as far as I could |
||||
|
To where it bent in the undergrowth; |
||||
|
|
||||
|
Then took the other, as just as fair, |
||||
|
And having perhaps the better claim, |
||||
|
Because it was grassy and wanted wear; |
||||
|
Though as for that the passing there |
||||
|
Had worn them really about the same, |
||||
|
|
||||
|
And both that morning equally lay |
||||
|
In leaves no step had trodden black. |
||||
|
Oh, I kept the first for another day! |
||||
|
Yet knowing how way leads on to way, |
||||
|
I doubted if I should ever come back. |
||||
|
|
||||
|
I shall be telling this with a sigh |
||||
|
Somewhere ages and ages hence: |
||||
|
Two roads diverged in a wood, and I— |
||||
|
I took the one less traveled by, |
||||
|
And that has made all the difference. |
@ -0,0 +1,551 @@ |
|||||
|
#include <stdint.h> |
||||
|
#include <stdio.h> |
||||
|
#include <unistd.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <ctype.h> |
||||
|
#include <stdbool.h> |
||||
|
#include <string.h> |
||||
|
#include <assert.h> |
||||
|
#include "error.h" |
||||
|
#include "maa.h" |
||||
|
#include "tokenize.h" |
||||
|
|
||||
|
/*
|
||||
|
* This is a basic s-expression tokenizer |
||||
|
* it also tokenizes things like number, string, and symbol literals |
||||
|
*/ |
||||
|
|
||||
|
const token_t nulltok = { |
||||
|
.token_type = EMPTY, |
||||
|
{ |
||||
|
.null_token=false |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
static const token_t quote_tok = { |
||||
|
.token_type = QUOTE, |
||||
|
.token= { |
||||
|
.quote=true |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
static const token_t left_paren = { |
||||
|
.token_type = PAREN, |
||||
|
.token = { |
||||
|
.parenthesis="(" |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
static const token_t right_paren = { |
||||
|
.token_type = PAREN, |
||||
|
.token = { |
||||
|
.parenthesis=")" |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
static inline const char *
string_head(uint32_t n,
            const char *in,
            char *out) {
  /* Copy the first n characters of `in` into `out` and NUL-terminate.
   * `out` must have room for n + 1 bytes. Returns `out`. */
#ifndef NDEBUG
  size_t in_len = strlen(in);
#endif
  assert((n > 0 && n <= in_len));
  /* snprintf reports the length it *would* have written, i.e. strlen(in),
   * even when truncating to n characters. */
  int written = snprintf(out, (size_t)n + 1, "%s", in);

  assert((written != -1) && ((size_t)written == in_len));

  if (written == -1) {
    printf("Out of memory");
    exit(EXIT_FAILURE);
  }
  return (const char *)out;
}
||||
|
|
||||
|
static inline token_t |
||||
|
make_token(token_val_t val, |
||||
|
tok_t toktype) { |
||||
|
token_t result; |
||||
|
result.token_type = toktype; |
||||
|
result.token = val; |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
bool |
||||
|
push_token(token_stream *tokens, |
||||
|
token_t token) { |
||||
|
/*
|
||||
|
* Check if tokens points to NULL |
||||
|
*/ |
||||
|
|
||||
|
size_t len; |
||||
|
size_t max; |
||||
|
|
||||
|
CHECK(tokens); |
||||
|
|
||||
|
len = tokens->length; |
||||
|
max = tokens->max_length; |
||||
|
|
||||
|
assert(len <= max); |
||||
|
assert(max > 0); |
||||
|
|
||||
|
if (len == max) { |
||||
|
/* We've reached the maximum stack size
|
||||
|
* So we must try to increase that by GROWTH_SIZE |
||||
|
*/ |
||||
|
token_t *new_tokens = xrealloc(tokens->tokens, sizeof (token_t) * (max * GROWTH_FACTOR)); |
||||
|
if (!new_tokens) { |
||||
|
printf("Could not allocate enough memory for the token stack\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
tokens->tokens = new_tokens; |
||||
|
tokens->max_length = max * GROWTH_FACTOR; |
||||
|
tokens->tokens[len] = token; |
||||
|
tokens->length++; |
||||
|
return true; |
||||
|
} |
||||
|
tokens->tokens[len] = token; |
||||
|
tokens->length++; |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
bool |
||||
|
pop_token(token_stream *tokens) { |
||||
|
size_t len; |
||||
|
CHECK(tokens); |
||||
|
|
||||
|
len = tokens->length; |
||||
|
|
||||
|
assert(len != 0); |
||||
|
len--; |
||||
|
CHECK(tokens->tokens); |
||||
|
|
||||
|
tokens->length--; |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
inline token_t |
||||
|
peek_token(token_stream *tokens) { |
||||
|
/*
|
||||
|
* Check if tokens points to NULL |
||||
|
*/ |
||||
|
size_t len = tokens->length; |
||||
|
size_t max = tokens->max_length; |
||||
|
CHECK(tokens); |
||||
|
assert(len != 0); |
||||
|
|
||||
|
if (len == 0 || len > max) { |
||||
|
return nulltok; |
||||
|
} |
||||
|
return tokens->tokens[len-1]; |
||||
|
} |
||||
|
|
||||
|
static inline uint32_t |
||||
|
match_int(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
/* Return false if there is no match
|
||||
|
* otherwise return the position of the end of the match + 1 |
||||
|
*/ |
||||
|
uint32_t i = begin; |
||||
|
uint32_t test; |
||||
|
CHECK(source); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
if (source[i] == '+' || |
||||
|
source[i] == '-') { |
||||
|
i++; |
||||
|
} |
||||
|
test = i; |
||||
|
while (i < length && |
||||
|
isdigit(source[i])) { |
||||
|
i++; |
||||
|
} |
||||
|
if (i == test) |
||||
|
return false; |
||||
|
return i; |
||||
|
} |
||||
|
|
||||
|
static inline uint32_t |
||||
|
match_float(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
/* Return false if there is no match
|
||||
|
* otherwise: |
||||
|
* if there is a leading decimal point and then a valid int match: |
||||
|
* return the position of the end of the match |
||||
|
* if there is a leading valid int match: |
||||
|
* but no decimal point match after that: |
||||
|
* return false |
||||
|
* if there is a decimal point match and then a valid int match: |
||||
|
* return the position of the match |
||||
|
* if there is no valid int match: |
||||
|
* return false |
||||
|
* ALWAYS returns the position + 1 to avoid confusion with false (which is a valid index) |
||||
|
*/ |
||||
|
uint32_t i, leading_int_match, trailing_int_match; |
||||
|
CHECK(source); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
i = begin; |
||||
|
leading_int_match = match_int(source, i, length); |
||||
|
|
||||
|
if (leading_int_match) { |
||||
|
i = leading_int_match; |
||||
|
} |
||||
|
|
||||
|
assert(i <= length); |
||||
|
|
||||
|
if (source[i] != '.' || |
||||
|
source[i] == '+' || |
||||
|
source[i] == '-') { |
||||
|
if (((i+1) <= length) && /* Make sure there is at least two characters to look at */ |
||||
|
((source[i] == '+') || |
||||
|
(source[i] == '-')) |
||||
|
&& (source[i+1] == '.')) { |
||||
|
i++; |
||||
|
} |
||||
|
else { |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
|
i++; |
||||
|
|
||||
|
trailing_int_match = match_int(source, i, length); |
||||
|
if (trailing_int_match) { |
||||
|
return trailing_int_match; |
||||
|
} |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
static inline uint32_t |
||||
|
match_word(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
|
||||
|
/* Return false if there is no match
|
||||
|
* if there is a match for any characters that are not: |
||||
|
* whitespace |
||||
|
* a parenthesis ( ) |
||||
|
* a brace { } |
||||
|
* a square bracket [ ] |
||||
|
* then return the position of the match + 1 |
||||
|
* if there is nothing else to match: |
||||
|
* return false |
||||
|
*/ |
||||
|
uint32_t i = begin; |
||||
|
CHECK(source); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
while (i < length && |
||||
|
!(source[i] == '(' || |
||||
|
source[i] == ')' || |
||||
|
isspace(source[i]))) { |
||||
|
i++; |
||||
|
} |
||||
|
|
||||
|
if (i == begin) { |
||||
|
return false; |
||||
|
} |
||||
|
assert(i <= length); |
||||
|
return i; |
||||
|
} |
||||
|
|
||||
|
static inline uint32_t
is_empty_string(const char *source,
                uint32_t length) {
  /* NOTE(review): despite the name, this returns true when `source` starts
   * with a double quote AND any of its first `length` bytes is
   * non-whitespace; callers in tokenize() rely on this inverted meaning to
   * decide whether a string token gets pushed — confirm before renaming. */
  uint32_t i = 0;
  int has_nonspace = false;
  if (source[i] != '\"') {
    return false;
  }
  for (; i < length; i++) {
    /* BUGFIX: ctype functions require an unsigned-char-range argument;
     * passing a plain (possibly negative) char is undefined behavior. */
    if (!isspace((unsigned char)source[i])) {
      has_nonspace = true;
    }
  }
  return has_nonspace;
}
||||
|
|
||||
|
|
||||
|
static inline uint32_t |
||||
|
match_string(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
CHECK(source); |
||||
|
(void)length; |
||||
|
assert(length > 0); |
||||
|
uint32_t i = begin; |
||||
|
if (source[i] != '\"') { |
||||
|
return false; |
||||
|
} |
||||
|
i++; |
||||
|
while (source[i] != '\"' && |
||||
|
(i < length) && |
||||
|
(i < (begin + MAX_STRING_SIZE))) { |
||||
|
i++; |
||||
|
} |
||||
|
if ((i != (begin+1)) && |
||||
|
(i <= length) && |
||||
|
(source[i] == '\"')) { |
||||
|
return i+1; |
||||
|
} |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
extract_token(uint32_t position, |
||||
|
uint32_t begin, |
||||
|
const source_t source, |
||||
|
const char *token_val) { |
||||
|
assert(position > begin); |
||||
|
string_head(position - begin, |
||||
|
&source[begin], |
||||
|
(char *)token_val); |
||||
|
} |
||||
|
|
||||
|
token_stream |
||||
|
tokenize(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
/*
|
||||
|
* Remember to free everything from this struct |
||||
|
* for example, token_stack.tokens will not necessarily be |
||||
|
* equal to tokens after this function has run |
||||
|
* |
||||
|
*/ |
||||
|
uint32_t position = begin; |
||||
|
uint32_t allspace = false; |
||||
|
const char *current_token_val; |
||||
|
token_stream token_stack; |
||||
|
token_val_t current_token; |
||||
|
token_t *tokens = xcalloc(STACK_SIZE, sizeof (token_t)); |
||||
|
|
||||
|
hsh_HashTable token_memo = hsh_create(NULL, NULL); |
||||
|
|
||||
|
assert(begin == 0); |
||||
|
assert(length > 0); |
||||
|
CHECK(source); |
||||
|
|
||||
|
token_stack.length = 0; |
||||
|
token_stack.max_length = STACK_SIZE; |
||||
|
token_stack.tokens = tokens; |
||||
|
token_stack.memo = token_memo; |
||||
|
char lookahead = '\0'; |
||||
|
assert(STACK_SIZE > 0); |
||||
|
|
||||
|
while (begin <= length && source[begin]) { |
||||
|
/* Possibly matched a string
|
||||
|
* First look for closing " |
||||
|
* Then look for a newline to close it if no " |
||||
|
* Then stop after some large constant of characters maybe? |
||||
|
* We're dealing with real text so people might forget to close |
||||
|
* quotations, so we have to be clever about it and use heuristics (for performance) |
||||
|
*/ |
||||
|
if (source[begin] == '(') { |
||||
|
/*Matched a left paren */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, left_paren); |
||||
|
} |
||||
|
else if (source[begin] == ')') { |
||||
|
/*Matched a left paren */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, right_paren); |
||||
|
} |
||||
|
else if (isspace(source[begin])) { |
||||
|
position = begin + 1; |
||||
|
/* Matched a whitespace character */ |
||||
|
} |
||||
|
else if ((position = match_string(source, begin, length))) { |
||||
|
/* Possibly matched a string
|
||||
|
* First look for closing " |
||||
|
* Then look for a newline to close it if no " |
||||
|
* Then stop after some large constant of characters maybe? |
||||
|
* We're dealing with real text so people might forget to close |
||||
|
* quotations, so we have to be clever about it and use heuristics (for performance) |
||||
|
*/ |
||||
|
lookahead = source[position]; |
||||
|
source[position] = '\0'; |
||||
|
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { |
||||
|
current_token.string = current_token_val; |
||||
|
source[position] = lookahead; |
||||
|
allspace = false; |
||||
|
} |
||||
|
else { |
||||
|
assert(position > begin); |
||||
|
current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); |
||||
|
CHECK(current_token_val); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
if ((allspace = is_empty_string(current_token_val, (position - begin) + 1))) { |
||||
|
source[position] = lookahead; |
||||
|
current_token.string = current_token_val; |
||||
|
} |
||||
|
else { |
||||
|
source[position] = lookahead; |
||||
|
hsh_insert(token_stack.memo, current_token_val, current_token_val); |
||||
|
current_token.string = current_token_val; |
||||
|
} |
||||
|
} |
||||
|
if (allspace) { |
||||
|
push_token(&token_stack, make_token(current_token, STRING)); |
||||
|
} |
||||
|
} |
||||
|
else if ((position = match_float(source, begin, length))) { |
||||
|
/* Matched a float */ |
||||
|
lookahead = source[position]; |
||||
|
source[position] = '\0'; |
||||
|
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { |
||||
|
current_token.floating = current_token_val; |
||||
|
source[position] = lookahead; |
||||
|
} |
||||
|
else { |
||||
|
assert(position > begin); |
||||
|
current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); |
||||
|
CHECK(current_token_val); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
source[position] = lookahead; |
||||
|
hsh_insert(token_stack.memo, current_token_val, current_token_val); |
||||
|
current_token.floating = current_token_val; |
||||
|
} |
||||
|
push_token(&token_stack, make_token(current_token, FLOATING)); |
||||
|
} |
||||
|
else if ((position = match_int(source, begin, length))) { |
||||
|
/* Matched an int */ |
||||
|
lookahead = source[position]; |
||||
|
source[position] = '\0'; |
||||
|
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { |
||||
|
current_token.integer = current_token_val; |
||||
|
source[position] = lookahead; |
||||
|
} |
||||
|
else { |
||||
|
assert(position > begin); |
||||
|
assert(position <= length); |
||||
|
|
||||
|
current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); |
||||
|
CHECK(current_token_val); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
source[position] = lookahead; |
||||
|
hsh_insert(token_stack.memo, current_token_val, current_token_val); |
||||
|
current_token.integer = current_token_val; |
||||
|
} |
||||
|
push_token(&token_stack, make_token(current_token, INTEGER)); |
||||
|
} |
||||
|
else if (source[begin] == '\'') { |
||||
|
/* Matched a quote (apostrophe) */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, quote_tok); |
||||
|
} |
||||
|
else if ((position = match_word(source, begin, length))) { |
||||
|
/* Matched a word */ |
||||
|
lookahead = source[position]; |
||||
|
source[position] = '\0'; |
||||
|
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { |
||||
|
current_token.word = current_token_val; |
||||
|
source[position] = lookahead; |
||||
|
} |
||||
|
else { |
||||
|
assert(position > begin); |
||||
|
assert(position <= length); |
||||
|
|
||||
|
current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); |
||||
|
CHECK(current_token_val); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
source[position] = lookahead; |
||||
|
hsh_insert(token_stack.memo, current_token_val, current_token_val); |
||||
|
current_token.word = current_token_val; |
||||
|
} |
||||
|
push_token(&token_stack, make_token(current_token, WORD)); |
||||
|
/* Matched a word */ |
||||
|
} |
||||
|
else if (position <= begin) { |
||||
|
printf("Source is too large to read\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
else { |
||||
|
printf("Unmatched token\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
begin = position; |
||||
|
} |
||||
|
|
||||
|
return token_stack; |
||||
|
} |
||||
|
|
||||
|
/* Hash-table iteration callback used by release_tokens: frees one
 * memoized token string.  The memo table stores each string as both
 * key and value, so the two parameters alias the same allocation and
 * only one of them may be freed.
 */
int
free_token(const void *key, const void *val) {
  char *owned = (char *)val; /* cast drops const for the deallocator */
  (void)key;                 /* key aliases val; freeing it too would double free */
  xfree(owned);
  return true;
}
||||
|
|
||||
|
bool |
||||
|
release_tokens(token_stream *tokens) { |
||||
|
/* Iterate through the stack, release each token
|
||||
|
* Then release the entire stack |
||||
|
*/ |
||||
|
CHECK(tokens); |
||||
|
CHECK(tokens->tokens); |
||||
|
assert(tokens->max_length > 0); |
||||
|
xfree(tokens->tokens); |
||||
|
hsh_iterate(tokens->memo, free_token); |
||||
|
|
||||
|
hsh_destroy(tokens->memo); |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
#ifndef TOK_LIB |
||||
|
int main(void) { |
||||
|
void *source_code = malloc(111000); |
||||
|
size_t nbytes = read(STDIN_FILENO, source_code, 111000); |
||||
|
if (nbytes == 0) { |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
token_stream toks = tokenize(source_code, 0, nbytes); |
||||
|
token_t current_tok; |
||||
|
while (toks.length > 0) { |
||||
|
current_tok = peek_token(&toks); |
||||
|
switch (current_tok.token_type) { |
||||
|
case SYMBOL: |
||||
|
printf("symbol: %s\n", current_tok.token.symbol); |
||||
|
break; |
||||
|
case WORD: |
||||
|
printf("identifer: %s\n", current_tok.token.word); |
||||
|
break; |
||||
|
case INTEGER: |
||||
|
printf("integer: %s\n", current_tok.token.integer); |
||||
|
break; |
||||
|
case FLOATING: |
||||
|
printf("floating: %s\n", current_tok.token.floating); |
||||
|
break; |
||||
|
case QUOTE: |
||||
|
printf("quote: '\n"); |
||||
|
break; |
||||
|
case WSPACE: |
||||
|
printf("whitespace\n"); |
||||
|
break; |
||||
|
case PAREN: |
||||
|
printf("paren: %s\n", current_tok.token.parenthesis); |
||||
|
break; |
||||
|
case EMPTY: |
||||
|
printf("this should not be empty\n"); |
||||
|
break; |
||||
|
case STRING: |
||||
|
printf("string: %s\n", current_tok.token.string); |
||||
|
break; |
||||
|
default: |
||||
|
printf("oops, there was an unknown token, check valgrind or gdb\n"); |
||||
|
} |
||||
|
pop_token(&toks); |
||||
|
} |
||||
|
release_tokens(&toks); |
||||
|
return 0; |
||||
|
} |
||||
|
#endif |
@ -0,0 +1,73 @@ |
|||||
|
/* Initial capacity (in tokens) of a token stream's stack. */
#define STACK_SIZE 4096
/* Multiplier applied to the capacity when the stack must grow —
 * presumably used by push_token; TODO confirm against its definition. */
#define GROWTH_FACTOR 2
/* NOTE(review): not referenced in the visible code — presumably a cap
 * related to string lexemes; confirm before relying on it. */
#define MAX_STRING_SIZE 30

/* Mutable buffer of source text; the tokenizer temporarily writes NUL
 * bytes into it while extracting lexemes, then restores the byte. */
typedef char* source_t;

/* Tag telling which member of token_val_t is meaningful. */
typedef enum {
  WORD = 1,     /* identifier-like lexeme */
  INTEGER = 2,  /* integer literal, kept as text */
  FLOATING = 3, /* floating-point literal, kept as text */
  QUOTE = 4,    /* a single apostrophe */
  WSPACE = 5,   /* whitespace */
  PAREN = 6,    /* parenthesis */
  EMPTY = 7,    /* no token */
  STRING = 8    /* string literal */
} tok_t;

/* Token payload.  The pointer members all alias the same storage and
 * point at NUL-terminated text owned by the stream's memo table; the
 * valid member is selected by the tok_t tag. */
typedef union {
  const char *word;
  const char *integer;     /* digits stored as text, not converted */
  const char *floating;
  const char *parenthesis;
  const char *string;
  bool quote;              /* presumably set for QUOTE tokens — TODO confirm */
  bool null_token;         /* presumably set for EMPTY tokens — TODO confirm */
} token_val_t;

/* One lexeme: a type tag plus its payload. */
typedef struct {
  tok_t token_type;
  token_val_t token;
} token_t;

/* A growable stack of tokens plus a memo table interning the token
 * strings (each string is stored as both key and value; see free_token). */
typedef struct {
  size_t length;      /* Number of current elements */
  size_t max_length;  /* Maximum length of the stack */
  token_t *tokens;    /* heap-allocated token array */
  hsh_HashTable memo; /* owns the memoized token strings */
} token_stream;
||||
|
|
||||
|
/* Push a token onto the stream's stack. */
bool
push_token(token_stream*, token_t);

/* Remove the token on top of the stream's stack. */
bool
pop_token(token_stream*);

/* Return the token on top of the stack without removing it. */
token_t
peek_token(token_stream*);

/* Tokenize source from the given start offset up to length bytes and
 * return the resulting stream; exits the process on unmatched input. */
token_stream
tokenize(source_t, uint32_t, const uint32_t);

/* Free the stream's token array, its memoized strings, and the memo
 * table itself. */
bool
release_tokens(token_stream*);

/* NOTE(review): `static` prototypes in a header only make sense if the
 * header is included solely by the translation unit defining them. */
#ifndef TOK_LIB
/* Each matcher scans the source starting at the second argument
 * (bounded by the third) and returns the index one past the matched
 * lexeme, or 0 when nothing matched — confirmed for match_word by its
 * use in tokenize; presumably the same for the others. */
static uint32_t
match_int(source_t, uint32_t, const uint32_t);

static uint32_t
match_float(source_t, uint32_t, const uint32_t);

static uint32_t
match_word(source_t, uint32_t, const uint32_t);

static uint32_t
match_string(source_t, uint32_t, const uint32_t);
#endif

/* hsh_iterate callback that frees one memoized token string; key and
 * value alias the same allocation. */
int
free_token(const void *,
           const void *);

token_t
testfunc(void);
Reference in new issue