From 3dc20ed2ddc7a11ff20c38fa38de5d840d22c5e7 Mon Sep 17 00:00:00 2001
From: nisstyre56 <wjak56@gmail.com>
Date: Sat, 6 Aug 2016 02:40:20 +0000
Subject: [PATCH] first commit

---
 Makefile     |  13 ++
 error.h      |   1 +
 markov.c     | 400 +++++++++++++++++++++++++++++++++++++
 markov.h     |  78 ++++++++
 roadnottaken |  23 +++
 tokenize.c   | 551 +++++++++++++++++++++++++++++++++++++++++++++++++++
 tokenize.h   |  73 +++++++
 7 files changed, 1139 insertions(+)
 create mode 100644 Makefile
 create mode 100644 error.h
 create mode 100644 markov.c
 create mode 100644 markov.h
 create mode 100644 roadnottaken
 create mode 100644 tokenize.c
 create mode 100644 tokenize.h

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..31ed6aa
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
+default: markov.c markov.h
+	$(MAKE) lib;
+	$(CC) -g -DTOK_LIB -Wall -Wextra -std=gnu99 -Wpointer-arith -Wmissing-prototypes -Werror -O3 ./markov.c -o markov_test -L. -ltokenize -lmaa -lm -Wl,-rpath,$(CURDIR);
+# unsafe: same program built with -DNDEBUG (asserts compiled out) and without -g/-Werror
+unsafe: markov.c  markov.h
+	$(MAKE) lib;
+	$(CC) -DNDEBUG -DTOK_LIB -Wall -std=gnu99 -Wextra -Wpointer-arith -Wmissing-prototypes -O3 ./markov.c -o markov -L. -ltokenize -lmaa -lm -Wl,-rpath,$(CURDIR);
+# lib: build libtokenize.so and markov.so from position-independent objects
+lib: markov.c markov.h tokenize.c tokenize.h
+	$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=c99 -O3 ./tokenize.c
+	$(CC) -shared -o libtokenize.so tokenize.o -lmaa;
+	$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=gnu99 -O3 ./markov.c -lmaa -lm
+	$(CC) -shared -o markov.so markov.o -lmaa;
diff --git a/error.h b/error.h
new file mode 100644
index 0000000..24bfcc3
--- /dev/null
+++ b/error.h
@@ -0,0 +1 @@
+#define CHECK(ptr) do { if ((ptr) == NULL) { printf("Failed to allocate memory\n"); exit(EXIT_FAILURE); } } while (0) /* one statement, so it is safe in an unbraced if/else */
diff --git a/markov.c b/markov.c
new file mode 100644
index 0000000..3206182
--- /dev/null
+++ b/markov.c
@@ -0,0 +1,400 @@
+#include <time.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include "error.h"
+#include "maa.h"
+#include "tokenize.h"
+#include "markov.h"
+
+#define LEN 50
+
+static char quote = '\'';
+
+static inline void
+initialize_neighbours(const char *str,
+                     hsh_HashTable graph_table) {
+  /* Register str in graph_table with an empty neighbour table and zeroed counters.
+   * str must not already be a key of graph_table. */
+  assert(!hsh_retrieve(graph_table, str));
+  neighbours_t *fresh = xmalloc(sizeof (neighbours_t));
+  CHECK(fresh);
+  fresh->number = 0;
+  fresh->unique_num = 0;
+  fresh->neighbours = hsh_create(NULL, NULL);
+  hsh_insert(graph_table, str, fresh);
+}
+
+static inline unsigned long
+numberof_keys(graph_t graph) {
+  /* Get the number of unique keys in the graph */
+  hsh_Stats stats = hsh_get_stats(graph.cache);
+  unsigned long num = stats->entries;
+  xfree(stats);
+  return num;
+}
+
+static inline unsigned long
+numberof_transitionable(graph_t graph) {
+  /* Get the number of keys with >0 neighbours */
+  /* Only call after graph has been converted */
+  unsigned long num = 0;
+  void *p, *key;
+  markov_trans_t *val;
+  HSH_ITERATE(graph.graph, p, key, val) {
+    if (val->number > 0) {
+      num++;
+    }
+  }
+  return num;
+}
+
+static const char*
+get_ngram(const char* str,
+          graph_t graph) {
+  /* Return the canonical cached copy of str.
+   * On a cache miss, copy str onto the heap, register the copy in the cache
+   * and give it an empty neighbour table in the graph before returning it.
+   */
+  hsh_HashTable cache = graph.cache;
+  hsh_HashTable graph_table = graph.graph;
+  const char *exists = hsh_retrieve(cache, str);
+  if (exists) {
+    return exists;
+  }
+  else {
+    /* Add it to the cache and return it */
+    size_t gram_size = strlen(str) + 1;
+    char *new_str = xmalloc(gram_size);
+    CHECK(new_str); /* check the fresh allocation, not the (already used) input */
+    snprintf(new_str, gram_size, "%s", str);
+    hsh_insert(cache, new_str, new_str);
+    initialize_neighbours(new_str, graph_table);
+    return new_str;
+  }
+}
+
+static inline void
+insert_neighbour(const char *left,
+                 const char *neighbour,
+                 graph_t graph) {
+  /* Insert a neighbour into the table of neighbours for a given key */
+  neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left);
+  neighbours->number++;
+  hsh_HashTable neighbours_table = neighbours->neighbours;
+  CHECK(neighbours->neighbours);
+  if (hsh_retrieve(neighbours_table, neighbour)) {
+    return;
+  }
+  neighbours->unique_num++;
+  const char *new_neighbour = get_ngram(neighbour, graph);
+  CHECK(new_neighbour);
+  uint32_t *count = xmalloc(sizeof (uint32_t));
+  CHECK(count);
+  *count = 0;
+  hsh_insert(neighbours_table, new_neighbour, count);
+}
+
+static inline void
+increment_neighbour(const char *left,
+                    const char *neighbour,
+                    graph_t graph) {
+  /* Increment the frequency of a given bi-gram.
+   * bi-gram does not necessarily mean a specific thing
+   * it could be pairs of words, pairs of letters, sequences of n letters, and so on
+   */
+  neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left);
+  hsh_HashTable neighbours_hash = neighbours->neighbours;
+  CHECK(neighbours_hash);
+  uint32_t *count = (uint32_t *)hsh_retrieve(neighbours_hash, neighbour);
+  CHECK(count);
+  (*count)++;
+}
+
+static inline neighbours_t*
+get_neighbours(graph_t graph,
+               char *gram) {
+  /* Simply return the table of neighbours corresponding to a given string */
+  neighbours_t *neighbours;
+  neighbours = (neighbours_t *)hsh_retrieve(graph.graph, gram);
+  assert(neighbours);
+  return neighbours;
+}
+
+static inline markov_trans_t*
+get_prob_neighbours(graph_t graph,
+                    char *gram) {
+  /* Return the converted probability transitions */
+  markov_trans_t *neighbours;
+  neighbours = (markov_trans_t *)hsh_retrieve(graph.graph, gram);
+  assert(neighbours);
+  return neighbours;
+}
+
+static inline void
+convert_neighbours(graph_t graph,
+                   char *gram) {
+  neighbours_t *neighbours = get_neighbours(graph, gram);
+
+  markov_trans_t *result = xmalloc(sizeof (markov_trans_t));
+  CHECK(result);
+  size_t nb_size = neighbours->number;
+  hsh_HashTable neighbours_hash = neighbours->neighbours;
+
+  void *key;
+  uint32_t *frequency;
+  void *p;
+  uint32_t index = 0;
+  probability_t transition;
+  probability_t *neighbour_array = xcalloc(sizeof (probability_t), nb_size);
+  CHECK(neighbour_array);
+  HSH_ITERATE(neighbours_hash, p, key, frequency) {
+    transition.frequent.frequency = *frequency;
+    xfree(frequency);
+    transition.frequent.token = key;
+    neighbour_array[index] = transition;
+    index++;
+  }
+ float lower = 0.0;
+  probability_t current;
+  for (uint32_t i = 0; i < neighbours->unique_num; i++) {
+    current.frequent = neighbour_array[i].frequent;
+    neighbour_array[i].bucket.token = current.frequent.token;
+    neighbour_array[i].bucket.lower = lower;
+    neighbour_array[i].bucket.upper = lower + ((float)neighbour_array[i].frequent.frequency) /
+                                              (neighbours->number);
+    lower = neighbour_array[i].bucket.upper;
+  }
+  result->transitions = neighbour_array;
+  result->number = neighbours->unique_num;
+  hsh_delete(graph.graph, gram);
+  hsh_insert(graph.graph, gram, result);
+  hsh_destroy(neighbours->neighbours);
+  xfree(neighbours);
+}
+
+static inline void
+convert_all_neighbours(graph_t graph) {
+  void *p, *key;
+  char *current_key;
+  unsigned long num_keys = numberof_keys(graph);
+  if (num_keys == 0) {
+    return;
+  }
+  stk_Stack keys = stk_create();
+
+  /* iterate over all keys K, in hash table T */
+  HSH_ITERATE_KEYS(graph.graph, p, key) {
+    stk_push(keys, key);
+  }
+
+  for (uint32_t i = 0; i < num_keys; i++) {
+    current_key = (char *)stk_pop(keys);
+    convert_neighbours(graph, current_key);
+  }
+  stk_destroy(keys);
+}
+
+static inline void
+relate_bigram(const char *a,
+              const char *b,
+              graph_t graph) {
+  /* Update the graph with the information that b follows a */
+  const char* str = get_ngram(a, graph);
+  insert_neighbour(str, b, graph);
+  increment_neighbour(str, b, graph);
+}
+
+static int
+transition_cmp(const void *keyval,
+               const void *datum) {
+  /* bsearch() comparator: keyval points at the drawn float, datum at a bucket.
+   * 0 if the draw lies in [lower, upper], -1 if it is below lower, 1 otherwise.
+   */
+  const float draw = *((const float *)keyval);
+  const probability_t *candidate = (const probability_t *)datum;
+  const float floor_edge = candidate->bucket.lower;
+  const float ceil_edge = candidate->bucket.upper;
+  const bool inside = (draw >= floor_edge) && (draw <= ceil_edge);
+
+  if (inside) {
+    return 0;
+  }
+  /* Outside the bucket: report which side the draw fell on */
+  return (draw < floor_edge) ? -1 : 1;
+}
+
+static inline char*
+pick_random_transition(unique_keys_t unique_neighbours) {
+  /* Uniformly pick a key: drand48() is in [0.0, 1.0), so the index is in [0, number-1] */
+  size_t num = unique_neighbours.number;
+  if (num == 0) { printf("No transitionable keys to pick from\n"); exit(EXIT_FAILURE); }
+  return unique_neighbours.keys[(size_t)(drand48() * num)];
+}
+
+
+static inline char*
+next_ngram(graph_t graph,
+           char *start,
+           unique_keys_t unique_neighbours) {
+  /* Successor of start: a random key if it has no transitions, else a weighted random neighbour */
+  markov_trans_t *transitions = get_prob_neighbours(graph, start);
+  if (transitions->number == 0) {
+    return pick_random_transition(unique_neighbours);
+  }
+  probability_t *buckets = transitions->transitions;
+  size_t bucket_size = transitions->number;
+  float chosen = (float)drand48();
+  probability_t *result = bsearch(&chosen, buckets, bucket_size,
+                                  sizeof (probability_t), transition_cmp);
+  /* Accumulated float error can leave the draw above the last bucket; use that bucket */
+  if (!result) { result = &buckets[bucket_size - 1]; }
+  return ((char *)result->bucket.token);
+}
+
+lst_List
+generate_strings(markov_chain_t markov_chain,
+                 char *start,
+                 uint32_t n) {
+  unique_keys_t unique_neighbours = markov_chain.unique;
+  graph_t graph = markov_chain.graph;
+  lst_List result = lst_create();
+  char *current = start;
+  for (uint32_t i = 0; i < n; i++) {
+    lst_append(result, current);
+    current = next_ngram(graph, current, unique_neighbours);
+  }
+  return result;
+}
+
+static inline unique_keys_t
+get_all_keys(graph_t graph) {
+  /* Gets all unique keys with neighbours */
+  /* Should only be called after graph generation */
+  unsigned long number = numberof_transitionable(graph);
+  char **keys = xcalloc(sizeof (char *), number);
+  CHECK(keys);
+  void *p, *key;
+  unique_keys_t result;
+  markov_trans_t *val;
+  uint32_t i = 0;
+  HSH_ITERATE(graph.graph, p, key, val) {
+    if (val->number > 0) {
+      keys[i] = key;
+      i++;
+    }
+  }
+  result.keys = keys;
+  result.number = i;
+  return result;
+}
+
+static inline graph_t
+make_graph(void) {
+  /* Make an initial empty graph */
+  graph_t result;
+  result.cache = hsh_create(NULL, NULL);
+  result.graph = hsh_create(NULL, NULL);
+  return result;
+}
+
+static inline void
+release_converted_graph(graph_t graph) {
+  void *p, *key;
+  markov_trans_t *datum;
+  /* iterate over all keys K, in hash table and xfree them*/
+  HSH_ITERATE(graph.graph, p, key, datum) {
+    xfree(datum->transitions);
+    xfree(datum);
+    xfree(key);
+  }
+  hsh_destroy(graph.cache);
+  hsh_destroy(graph.graph);
+}
+
+markov_chain_t
+build_markov_chain(token_stream tokens) {
+  markov_chain_t result;
+  graph_t graph = make_graph();
+  token_t current;
+  token_t next;
+  while (tokens.length > 1) {
+    current = peek_token(&tokens);
+    pop_token(&tokens);
+    next = peek_token(&tokens);
+    relate_bigram(token_to_string(next), token_to_string(current), graph);
+  }
+  convert_all_neighbours(graph);
+  result.graph = graph;
+  result.unique = get_all_keys(graph);
+  return result;
+}
+
+char *
+token_to_string(token_t token) {
+    /* Return the text of a token as a NUL-terminated string.
+     * The result aliases the token's own string (static storage for QUOTE);
+     * EMPTY and unknown token types terminate the program. */
+    static char quote_str[2]; /* zero-initialised, so quote_str[1] is the terminator */
+    switch (token.token_type) {
+      case WORD:
+        return (char*)token.token.word;
+      case INTEGER:
+        return (char*)token.token.integer;
+      case FLOATING:
+        return (char*)token.token.floating;
+      case QUOTE:
+        /* &quote points at one char with no terminator; callers treat the result as a string */
+        quote_str[0] = quote;
+        return quote_str;
+      case PAREN:
+        return (char*)token.token.parenthesis;
+      case EMPTY:
+        printf("should not be here\n");
+        exit(EXIT_FAILURE);
+        break;
+      case STRING:
+        return (char*)token.token.string;
+      default:
+        printf("oops, there was an unknown token, check valgrind or gdb\n");
+        exit(EXIT_FAILURE);
+    }
+}
+
+void
+release_markov_chain(markov_chain_t chain) {
+  /* Free the converted graph first, then the array of transitionable keys */
+  release_converted_graph(chain.graph);
+  xfree(chain.unique.keys);
+}
+
+int
+main (void) {
+  /* Build a chain from the text on stdin and print LEN-1 generated tokens */
+  char *test_input = xmalloc(555000);
+  ssize_t nbytes = read(STDIN_FILENO, test_input, 555000 - 1); /* keep room for '\0' */
+  if (nbytes <= 0) { exit(EXIT_FAILURE); } /* -1: read error, 0: no input */
+  /* tokenize() reads, and temporarily overwrites, source[length], so terminate the buffer */
+  test_input[nbytes] = '\0';
+  token_stream test_bigrams_stack = tokenize(test_input, 0, (uint32_t)nbytes);
+  markov_chain_t chain = build_markov_chain(test_bigrams_stack);
+  srand48(time(NULL));
+  lst_List test = generate_strings(chain, token_to_string(peek_token(&test_bigrams_stack)), LEN);
+  lst_pop(test);
+  for (uint32_t i = 0; i < LEN-1; i++) {
+    printf("%s ", (char *)lst_pop(test));
+  }
+  printf("\n");
+  lst_destroy(test);
+  _lst_shutdown();
+  release_markov_chain(chain);
+  xfree(test_input);
+  release_tokens(&test_bigrams_stack);
+  return EXIT_SUCCESS;
+}
diff --git a/markov.h b/markov.h
new file mode 100644
index 0000000..25f5239
--- /dev/null
+++ b/markov.h
@@ -0,0 +1,78 @@
+typedef
+  struct {
+    hsh_HashTable cache;
+    hsh_HashTable graph;
+  }
+  graph_t;
+
+typedef
+  struct {
+    size_t number;
+    char **keys;
+  }
+  unique_keys_t;
+
+typedef
+  struct {
+    graph_t graph;
+    unique_keys_t unique;
+  }
+  markov_chain_t;
+
+
+typedef
+  struct {
+    hsh_HashTable neighbours;
+    size_t number;
+    size_t unique_num;
+  }
+  neighbours_t;
+
+/*
+ * Transition types for various reasons
+ */
+
+typedef
+  struct {
+    float upper;
+    float lower;
+    const char *token;
+  }
+  bucket_t;
+
+typedef
+  struct {
+    uint32_t frequency;
+    const char *token;
+  }
+  transition_t;
+
+
+typedef
+  union {
+    transition_t frequent;
+    bucket_t bucket;
+  }
+  probability_t;
+
+typedef
+  struct {
+    size_t number;
+    probability_t *transitions;
+  }
+  markov_trans_t;
+
+
+markov_chain_t
+build_markov_chain(token_stream);
+
+char *
+token_to_string(token_t);
+
+void
+release_markov_chain(markov_chain_t);
+
+lst_List
+generate_strings(markov_chain_t,
+                 char *,
+                 uint32_t);
diff --git a/roadnottaken b/roadnottaken
new file mode 100644
index 0000000..6558308
--- /dev/null
+++ b/roadnottaken
@@ -0,0 +1,23 @@
+Two roads diverged in a yellow wood,
+And sorry I could not travel both
+And be one traveler, long I stood
+And looked down one as far as I could
+To where it bent in the undergrowth;
+
+Then took the other, as just as fair,
+And having perhaps the better claim,
+Because it was grassy and wanted wear;
+Though as for that the passing there
+Had worn them really about the same,
+
+And both that morning equally lay
+In leaves no step had trodden black.
+Oh, I kept the first for another day!
+Yet knowing how way leads on to way,
+I doubted if I should ever come back.
+
+I shall be telling this with a sigh
+Somewhere ages and ages hence:
+Two roads diverged in a wood, and I—
+I took the one less traveled by,
+And that has made all the difference.
diff --git a/tokenize.c b/tokenize.c
new file mode 100644
index 0000000..680fe95
--- /dev/null
+++ b/tokenize.c
@@ -0,0 +1,551 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include "error.h"
+#include "maa.h"
+#include "tokenize.h"
+
+/*
+ * This is a basic s-expression tokenizer
+ * it also tokenizes things like number, string, and symbol literals
+ */
+
+const token_t nulltok = {
+  .token_type = EMPTY,
+    {
+      .null_token=false
+    }
+};
+
+static const token_t quote_tok = {
+  .token_type = QUOTE,
+  .token= {
+    .quote=true
+  }
+};
+
+static const token_t left_paren = {
+  .token_type = PAREN,
+                .token = {
+                  .parenthesis="("
+                }
+};
+
+static const token_t right_paren = {
+  .token_type = PAREN,
+  .token = {
+    .parenthesis=")"
+  }
+};
+
+static inline const char *
+string_head(uint32_t n,
+            const char *in,
+            char *out) {
+  /* out must be large enough to store the number of characters
+   * you want to select from in, plus a byte for the null terminator
+   */
+#ifndef NDEBUG
+  size_t in_len = strlen(in);
+#endif
+  assert((n > 0 && n <= in_len));
+  int iserror = snprintf(out, (size_t)n+1 , "%s", in);
+
+  assert((iserror != -1) && ((size_t)iserror == in_len));
+
+  if (iserror == -1) {
+    printf("Out of memory");
+    exit(EXIT_FAILURE);
+  }
+  return (const char*)out;
+}
+
+static inline token_t
+make_token(token_val_t val,
+           tok_t toktype) {
+  token_t result;
+  result.token_type = toktype;
+  result.token = val;
+  return result;
+}
+
+bool
+push_token(token_stream *tokens,
+           token_t token) {
+  /*
+   * Append token to the top of the stack, growing the backing array
+   * by GROWTH_FACTOR when it is full. Always returns true; a NULL
+   * stream or a failed reallocation terminates the program instead.
+   */
+
+  CHECK(tokens);
+
+  const size_t used = tokens->length;
+  const size_t capacity = tokens->max_length;
+
+  assert(used <= capacity);
+  assert(capacity > 0);
+
+  if (used == capacity) {
+    /* Every slot is taken, so the backing array
+     * is resized to GROWTH_FACTOR times its capacity
+     */
+    const size_t grown = capacity * GROWTH_FACTOR;
+    token_t *resized = xrealloc(tokens->tokens, sizeof (token_t) * grown);
+    if (resized == NULL) {
+      printf("Could not allocate enough memory for the token stack\n");
+      exit(EXIT_FAILURE);
+    }
+    tokens->tokens = resized;
+    tokens->max_length = grown;
+  }
+
+  /* Store the token in the first free slot */
+  tokens->tokens[used] = token;
+  tokens->length = used + 1;
+
+  return true;
+}
+
+bool
+pop_token(token_stream *tokens) {
+  /* Discard the top token by decrementing the stack length.
+   * The slot itself is left untouched; always returns true.
+   * An empty stack trips the assertion below.
+   */
+  CHECK(tokens);
+  assert(tokens->length != 0);
+  CHECK(tokens->tokens);
+
+  tokens->length -= 1;
+
+  return true;
+}
+
+inline token_t
+peek_token(token_stream *tokens) {
+  /*
+   * Return the top token without removing it (nulltok if length is 0 or above max_length)
+   */
+  CHECK(tokens); /* must run before tokens is dereferenced */
+  size_t len = tokens->length;
+  size_t max = tokens->max_length;
+  assert(len != 0);
+
+  if (len == 0 || len > max) {
+    return nulltok;
+  }
+  return tokens->tokens[len-1];
+}
+
+static inline uint32_t
+match_int(source_t source,
+          uint32_t begin,
+          const uint32_t length) {
+  /* Match an optional sign followed by one or more decimal digits,
+   * without reading source at or beyond index length.
+   * Return false (0) if there is no match, else the index one past the last digit.
+   */
+  uint32_t i = begin;
+  uint32_t test;
+  CHECK(source);
+  assert(length > 0);
+  if (i < length && (source[i] == '+' || source[i] == '-')) {
+    i++;
+  }
+  test = i;
+  /* <ctype.h> needs an unsigned char value; plain char may be negative for bytes >= 0x80 */
+  while (i < length && isdigit((unsigned char)source[i])) {
+    i++;
+  }
+  if (i == test) {
+    return false;
+  }
+  return i;
+}
+
+static inline uint32_t
+match_float(source_t source,
+            uint32_t begin,
+            const uint32_t length) {
+  /* Return false if there is no match
+   * otherwise:
+   *  if there is a leading decimal point and then a valid int match:
+   *    return the position of the end of the match
+   *  if there is a leading valid int match:
+   *    but no decimal point match after that:
+   *      return false
+   *    if there is a decimal point match and then a valid int match:
+   *        return the position of the match
+   *    if there is no valid int match:
+   *      return false
+   * ALWAYS returns the position + 1 to avoid confusion with false (which is a valid index)
+   */
+  uint32_t i, leading_int_match, trailing_int_match;
+  CHECK(source);
+  assert(length > 0);
+
+  i = begin;
+  leading_int_match = match_int(source, i, length);
+
+  if (leading_int_match) {
+    i = leading_int_match;
+  }
+
+  assert(i <= length);
+
+  if (source[i] != '.' ||
+      source[i] == '+' ||
+      source[i] == '-') {
+    if (((i+1) <= length) && /* Make sure there is at least two characters to look at */
+        ((source[i] == '+') ||
+         (source[i] == '-'))
+        && (source[i+1] == '.')) {
+      i++;
+    }
+    else {
+      return false;
+    }
+  }
+  i++;
+
+  trailing_int_match = match_int(source, i, length);
+  if (trailing_int_match) {
+    return trailing_int_match;
+  }
+  return false;
+}
+
+static inline uint32_t
+match_word(source_t source,
+           uint32_t begin,
+           const uint32_t length) {
+
+  /* Match the longest run of characters starting at begin that are not:
+   *      whitespace
+   *      a parenthesis ( )
+   * Braces { } and square brackets [ ] are treated as ordinary word characters.
+   *
+   * Return false (0) if the run is empty (begin is at a delimiter or at length),
+   * otherwise the index one past the last character of the word.
+   * isspace() gets an unsigned char because plain char may be negative for bytes >= 0x80.
+   */
+  uint32_t i = begin;
+  CHECK(source);
+  assert(length > 0);
+
+  while (i < length &&
+         source[i] != '(' &&
+         source[i] != ')' &&
+         !isspace((unsigned char)source[i])) {
+    i++;
+  }
+
+  if (i == begin) {
+    return false;
+  }
+
+  assert(i <= length);
+  return i;
+}
+
+static inline uint32_t
+is_empty_string(const char *source,
+                uint32_t length) {
+  int allspace = false;
+  uint32_t i = 0;
+  if (source[i] != '\"') {
+    return false;
+  }
+  for (; i < length; i++) {
+    if (!isspace(source[i])) {
+      allspace = true;
+    }
+  }
+  /* NOTE(review): despite the names, this returns true when source starts with '"'
+   * and any of its first length characters is not whitespace; the scan starts at the
+   * opening quote itself, so that holds whenever length > 0. */
+  return allspace;
+}
+
+
+static inline uint32_t
+match_string(source_t source,
+             uint32_t begin,
+             const uint32_t length) {
+  /* Match a non-empty "..." literal whose closing quote lies inside the buffer and at
+   * most MAX_STRING_SIZE characters after begin; return the index after it, else false */
+  CHECK(source);
+  assert(length > 0);
+  uint32_t i = begin;
+  if (source[i] != '\"') {
+    return false;
+  }
+  i++;
+  /* Bounds are tested before source[i] is read */
+  while ((i < length) &&
+         (i < (begin + MAX_STRING_SIZE)) &&
+         source[i] != '\"') {
+    i++;
+  }
+  if ((i != (begin+1)) && (i < length) && (source[i] == '\"')) {
+    return i+1;
+  }
+  return false;
+}
+
+static inline void
+extract_token(uint32_t position,
+              uint32_t begin,
+              const source_t source,
+              const char *token_val) {
+    assert(position > begin);
+    string_head(position - begin,
+                &source[begin],
+                (char *)token_val);
+}
+
+token_stream
+tokenize(source_t source,
+         uint32_t begin,
+         const uint32_t length) {
+  /*
+   * Remember to free everything from this struct
+   * for example, token_stack.tokens will not necessarily be
+   * equal to tokens after this function has run
+   *
+   */
+  uint32_t position = begin;
+  uint32_t allspace = false;
+  const char *current_token_val;
+  token_stream token_stack;
+  token_val_t current_token;
+  token_t *tokens = xcalloc(STACK_SIZE, sizeof (token_t));
+
+  hsh_HashTable token_memo = hsh_create(NULL, NULL);
+
+  assert(begin == 0);
+  assert(length > 0);
+  CHECK(source);
+
+  token_stack.length = 0;
+  token_stack.max_length = STACK_SIZE;
+  token_stack.tokens = tokens;
+  token_stack.memo = token_memo;
+  char lookahead = '\0';
+  assert(STACK_SIZE > 0);
+
+  while (begin <= length && source[begin]) {
+      /* Possibly matched a string
+       * First look for closing "
+       * Then look for a newline to close it if no "
+       * Then stop after some large constant of characters maybe?
+       * We're dealing with real text so people might forget to close
+       * quotations, so we have to be clever about it and use heuristics (for performance)
+       */
+    if (source[begin] == '(') {
+      /*Matched a left paren */
+      position = begin + 1;
+      push_token(&token_stack, left_paren);
+    }
+    else if (source[begin] == ')') {
+      /*Matched a left paren */
+      position = begin + 1;
+      push_token(&token_stack, right_paren);
+    }
+    else if (isspace(source[begin])) {
+      position = begin + 1;
+      /* Matched a whitespace character */
+    }
+    else if ((position = match_string(source, begin, length))) {
+      /* Possibly matched a string
+       * First look for closing "
+       * Then look for a newline to close it if no "
+       * Then stop after some large constant of characters maybe?
+       * We're dealing with real text so people might forget to close
+       * quotations, so we have to be clever about it and use heuristics (for performance)
+       */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.string = current_token_val;
+        source[position] = lookahead;
+        allspace = false;
+      }
+      else {
+        assert(position > begin);
+        current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
+        CHECK(current_token_val);
+        extract_token(position, begin, source, current_token_val);
+        if ((allspace = is_empty_string(current_token_val, (position - begin) + 1))) {
+          source[position] = lookahead;
+          current_token.string = current_token_val;
+        }
+        else {
+        source[position] = lookahead;
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.string = current_token_val;
+        }
+      }
+      if (allspace) {
+        push_token(&token_stack, make_token(current_token, STRING));
+      }
+    }
+    else if ((position = match_float(source, begin, length))) {
+      /* Matched a float */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.floating = current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        assert(position > begin);
+        current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
+        CHECK(current_token_val);
+        extract_token(position, begin, source, current_token_val);
+        source[position] = lookahead;
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.floating = current_token_val;
+      }
+      push_token(&token_stack, make_token(current_token, FLOATING));
+    }
+    else if ((position = match_int(source, begin, length))) {
+      /* Matched an int */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.integer = current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        assert(position > begin);
+        assert(position <= length);
+
+        current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
+        CHECK(current_token_val);
+        extract_token(position, begin, source, current_token_val);
+        source[position] = lookahead;
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.integer = current_token_val;
+      }
+      push_token(&token_stack, make_token(current_token, INTEGER));
+    }
+    else if (source[begin] == '\'') {
+      /* Matched a quote (apostrophe) */
+      position = begin + 1;
+      push_token(&token_stack, quote_tok);
+    }
+    else if ((position = match_word(source, begin, length))) {
+      /* Matched a word */
+      lookahead = source[position];
+      source[position] = '\0';
+      if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+        current_token.word = current_token_val;
+        source[position] = lookahead;
+      }
+      else {
+        assert(position > begin);
+        assert(position <= length);
+
+        current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
+        CHECK(current_token_val);
+        extract_token(position, begin, source, current_token_val);
+        source[position] = lookahead;
+        hsh_insert(token_stack.memo, current_token_val, current_token_val);
+        current_token.word = current_token_val;
+      }
+      push_token(&token_stack, make_token(current_token, WORD));
+      /* Matched a word */
+    }
+    else if (position <= begin) {
+      printf("Source is too large to read\n");
+      exit(EXIT_FAILURE);
+    }
+    else {
+      printf("Unmatched token\n");
+      exit(EXIT_FAILURE);
+    }
+    begin = position;
+  }
+
+  return token_stack;
+}
+
+int
+free_token(const void *key,
+           const void *val) {
+  /* silence warnings about unused parameters, key and val point to the same data*/
+  (void)key;
+  xfree((char *)val);
+  return true;
+}
+
+bool
+release_tokens(token_stream *tokens) {
+  /* Iterate through the stack, release each token
+   * Then release the entire stack
+   */
+  CHECK(tokens);
+  CHECK(tokens->tokens);
+  assert(tokens->max_length > 0);
+  xfree(tokens->tokens);
+  hsh_iterate(tokens->memo, free_token);
+
+  hsh_destroy(tokens->memo);
+  return true;
+}
+
+#ifndef TOK_LIB
+int main(void) {
+  void *source_code = malloc(111000);
+  size_t nbytes = read(STDIN_FILENO, source_code, 111000);
+  if (nbytes == 0) {
+    exit(EXIT_FAILURE);
+  }
+  token_stream toks = tokenize(source_code, 0, nbytes);
+  token_t current_tok;
+  while (toks.length > 0) {
+    current_tok = peek_token(&toks);
+    switch (current_tok.token_type) {
+      case SYMBOL:
+        printf("symbol: %s\n", current_tok.token.symbol);
+        break;
+      case WORD:
+        printf("identifer: %s\n", current_tok.token.word);
+        break;
+      case INTEGER:
+        printf("integer: %s\n", current_tok.token.integer);
+        break;
+      case FLOATING:
+        printf("floating: %s\n", current_tok.token.floating);
+        break;
+      case QUOTE:
+        printf("quote: '\n");
+        break;
+      case WSPACE:
+        printf("whitespace\n");
+        break;
+      case PAREN:
+        printf("paren: %s\n", current_tok.token.parenthesis);
+        break;
+      case EMPTY:
+        printf("this should not be empty\n");
+        break;
+      case STRING:
+        printf("string: %s\n", current_tok.token.string);
+        break;
+      default:
+        printf("oops, there was an unknown token, check valgrind or gdb\n");
+    }
+    pop_token(&toks);
+  }
+  release_tokens(&toks);
+  return 0;
+}
+#endif
diff --git a/tokenize.h b/tokenize.h
new file mode 100644
index 0000000..1844944
--- /dev/null
+++ b/tokenize.h
@@ -0,0 +1,73 @@
+#define STACK_SIZE 4096
+#define GROWTH_FACTOR 2
+#define MAX_STRING_SIZE 30
+
+typedef char* source_t;
+
+typedef enum {
+  WORD = 1,
+  INTEGER = 2,
+  FLOATING = 3,
+  QUOTE = 4,
+  WSPACE = 5,
+  PAREN = 6 ,
+  EMPTY = 7,
+  STRING = 8
+} tok_t;
+
+typedef union {
+    const char *word;
+    const char *integer;
+    const char *floating;
+    const char *parenthesis;
+    const char *string;
+    bool quote;
+    bool null_token;
+} token_val_t;
+
+typedef struct {
+  tok_t token_type;
+  token_val_t token;
+} token_t;
+
+typedef struct {
+  size_t length; /* Number of current elements */
+  size_t max_length; /* Maximum length of the stack */
+  token_t *tokens;
+  hsh_HashTable memo;
+} token_stream;
+
+bool
+push_token(token_stream*, token_t);
+
+bool
+pop_token(token_stream*);
+
+token_t
+peek_token(token_stream*);
+
+token_stream
+tokenize(source_t, uint32_t, const uint32_t);
+
+bool
+release_tokens(token_stream*);
+
+#ifndef TOK_LIB
+static uint32_t
+match_int(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_float(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_word(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_string(source_t, uint32_t, const uint32_t);
+#endif
+
+int
+free_token(const void *,
+           const void *);
+token_t
+testfunc(void);