From afc4b0b1c1fd9454d3d73446536d99cf7581a0a5 Mon Sep 17 00:00:00 2001
From: wes <wjak56@gmail.com>
Date: Fri, 4 Dec 2015 21:29:49 -0500
Subject: [PATCH] first commit

---
 Makefile   |  21 +++
 error.h    |   1 +
 tokenize.c | 503 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 tokenize.h |  74 ++++++++
 4 files changed, 599 insertions(+)
 create mode 100644 Makefile
 create mode 100644 error.h
 create mode 100644 tokenize.c
 create mode 100644 tokenize.h

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cd60754
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+default: tokenize.c  tokenize.h
+	$(MAKE) lib;
+	$(CC) -Wall -Wextra -pedantic -Wpointer-arith  -Werror -O3 ./tokenize.c -lmaa -o tokenize_test;
+
+unsafe: tokenize.c  tokenize.h
+	$(CC) -DNDEBUG -O3 ./tokenize.c -lmaa;
+
+unsafelib: tokenize.c  tokenize.h
+	$(CC) -DLIB -DNDEBUG -c -fpic -O3 ./tokenize.c;
+	$(CC) -shared -o tokenize.so tokenize.o -lmaa;
+
+lib: tokenize.c  tokenize.h
+	$(CC) -DLIB -DNDEBUG -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -O3 ./tokenize.c
+	$(CC) -shared -o libtokenize.so tokenize.o -lmaa;
+
+debug: tokenize.c  tokenize.h
+	$(CC) -Wall -Wextra -pedantic -Wpointer-arith  -Werror ./tokenize.c -lmaa -o tokenize_debug;
+
+debuglib: tokenize.c  tokenize.h
+	$(CC) -g -c -fpic -Wall -Wextra -pedantic -Wpointer-arith  -Werror ./tokenize.c;
+	$(CC) -shared -o tokenize.so tokenize.o -lmaa;
diff --git a/error.h b/error.h
new file mode 100644
index 0000000..24bfcc3
--- /dev/null
+++ b/error.h
@@ -0,0 +1 @@
+#define CHECK(ptr) if ((ptr) == NULL) { printf("Failed to allocate memory\n"); exit(EXIT_FAILURE); }
diff --git a/tokenize.c b/tokenize.c
new file mode 100644
index 0000000..4bab20a
--- /dev/null
+++ b/tokenize.c
@@ -0,0 +1,503 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include "error.h"
+#include "tokenize.h"
+
+/*
+ * This is a basic s-expression tokenizer
+ * it also tokenizes things like number, string, and symbol literals
+ */
+
+token_t nulltok;
+
+static token_t whitespace_tok;
+
+static token_t quote_tok;
+
+static  token_t left_paren = {
+  PAREN,
+    {
+      "("
+    }
+};
+
+static  token_t right_paren = {
+  PAREN,
+  {
+  ")"
+  }
+};
+
+static inline  char *
+string_head(uint32_t n,
+             char *in,
+            char *out) {
+  /* out must be large enough to store the number of characters
+   * you want to select from in, plus a byte for the null terminator
+   */
+#ifndef NDEBUG
+  size_t in_len = strlen(in);
+#endif
+  assert(n > 0 && n <= in_len);
+  int iserror = snprintf(out, (size_t)n+1 , "%s", in);
+
+  assert((iserror != -1) && ((size_t)iserror == in_len));
+
+  if (iserror == -1) {
+    printf("Out of memory");
+    exit(EXIT_FAILURE);
+  }
+  return ( char*)out;
+}
+
+static inline token_t
+make_token(token_val_t val,
+           tok_t toktype) {
+  token_t result;
+  result.token_type = toktype;
+  result.token = val;
+  return result;
+}
+
+token_t
+testfunc(void) {
+  token_val_t wspace;
+  wspace.whitespace = true;
+  return make_token(wspace, QUOTE);
+}
+
+
+bool
+push_token(token_stream *tokens,
+           token_t token) {
+  /*
+   * Check if tokens points to NULL
+   */
+
+  size_t len;
+  size_t max;
+
+  CHECK(tokens);
+
+  len = tokens->length;
+  max = tokens->max_length;
+
+  assert(len <= max);
+  assert(max > 0);
+
+  if (len == max) {
+    /* We've reached the maximum stack size
+     * So we must try to increase that by GROWTH_SIZE
+     */
+    token_t *new_tokens = (token_t *)realloc(tokens->tokens, sizeof (token_t) * (max + GROWTH_SIZE));
+    if (!new_tokens) {
+      printf("Could not allocate enough memory for the token stack\n");
+      exit(EXIT_FAILURE);
+    }
+    tokens->tokens = new_tokens;
+    tokens->max_length = max + GROWTH_SIZE;
+    tokens->tokens[len] = token;
+    tokens->length++;
+    return true;
+  }
+  tokens->tokens[len] = token;
+  tokens->length++;
+  return true;
+}
+
+bool
+pop_token(token_stream *tokens) {
+  size_t len;
+  CHECK(tokens);
+
+  len = tokens->length;
+
+  assert(len != 0);
+  len--;
+  CHECK(tokens->tokens);
+
+  tokens->length--;
+  return true;
+}
+
+inline token_t
+peek_token(token_stream *tokens) {
+  /*
+   * Check if tokens points to NULL
+   */
+  size_t len = tokens->length;
+  size_t max = tokens->max_length;
+  CHECK(tokens);
+  assert(len != 0);
+
+  if (len == 0 || len > max) {
+    return nulltok;
+  }
+  return tokens->tokens[len-1];
+}
+
+static inline uint32_t
+match_int(source_t source,
+          uint32_t begin,
+           uint32_t length) {
+  /* Return false if there is no match
+   * otherwise return the position of the end of the match + 1
+   */
+  uint32_t i = begin;
+  uint32_t test;
+  CHECK(source);
+  assert(length > 0);
+
+  if (source[i] == '+' ||
+      source[i] == '-') {
+    i++;
+  }
+  test = i;
+  while (i < length &&
+         isdigit(source[i])) {
+    i++;
+  }
+  if (i == test)
+    return false;
+  return i;
+}
+
+static inline uint32_t
+match_float(source_t source,
+            uint32_t begin,
+             uint32_t length) {
+  /* Return false if there is no match
+   * otherwise:
+   *  if there is a leading decimal point and then a valid int match:
+   *    return the position of the end of the match
+   *  if there is a leading valid int match:
+   *    but no decimal point match after that:
+   *      return false
+   *    if there is a decimal point match and then a valid int match:
+   *        return the position of the match
+   *    if there is no valid int match:
+   *      return false
+   * ALWAYS returns the position + 1 to avoid confusion with false (which is a valid index)
+   */
+  uint32_t i, leading_int_match, trailing_int_match;
+  CHECK(source);
+  assert(length > 0);
+
+  i = begin;
+  leading_int_match = match_int(source, i, length);
+
+  if (leading_int_match) {
+    i = leading_int_match;
+  }
+
+  assert(i <= length);
+
+  if (source[i] != '.' ||
+      source[i] == '+' ||
+      source[i] == '-') {
+    if (((i+1) <= length) && /* Make sure there is at least two characters to look at */
+        ((source[i] == '+') ||
+         (source[i] == '-'))
+        && (source[i+1] == '.')) {
+      i++;
+    }
+    else {
+      return false;
+    }
+  }
+  i++;
+
+  trailing_int_match = match_int(source, i, length);
+  if (trailing_int_match) {
+    return trailing_int_match;
+  }
+  return false;
+}
+
+static inline uint32_t
+match_identifier(source_t source,
+                 uint32_t begin,
+                  uint32_t length) {
+
+  /* Return false if there is no match
+   *    if there is a match for any characters that are not:
+   *      whitespace
+   *      a parenthesis ( )
+   *      a brace { }
+   *      a square bracket [ ]
+   *        then return the position of the match + 1
+   *    if there is nothing else to match:
+   *      return false
+   */
+  uint32_t i = begin;
+  CHECK(source);
+  assert(length > 0);
+  if (source[i] == '*' ||
+      source[i] == '+' ||
+      source[i] == '-' ||
+      source[i] == '/') {
+    return i+1;
+  }
+
+  while (i < length &&
+         !(source[i] == '(' ||
+           source[i] == ')' ||
+           isspace(source[i]))) {
+    i++;
+  }
+
+  if (i == begin) {
+    return false;
+  }
+  assert(i <= length);
+  return i;
+}
+
+static inline uint32_t
+match_symbol(source_t source,
+             uint32_t begin,
+              uint32_t length) {
+  uint32_t i;
+  CHECK(source);
+  assert(length > 0);
+
+  i = begin;
+  if (source[i] != '\'') {
+    return false;
+  }
+  i++;
+  while ((isspace(source[i]) ||
+         (source[i] == '\'')) && i < length) { /* consume leading whitespace and quotes */
+    i++;
+  }
+  if (source[i] == ')') {
+    printf("Unexpected )\n");
+    exit(EXIT_FAILURE);
+  }
+  if (source[i] == '(') {
+   return i;
+  }
+  while (!isspace(source[i]) &&
+         source[i] != '(' &&
+         source[i] != ')' &&
+         i < length) {
+    i++;
+  }
+  if (i == begin+1) { /* if we did not increment i more than once (before the loop) */
+    return false;
+  }
+  return i;
+}
+
+static inline void
+extract_token(uint32_t position,
+              uint32_t begin,
+               source_t source,
+               char *token_val) {
+    assert(position > begin);
+    string_head(position - begin,
+                &source[begin],
+                (char *)token_val);
+}
+
+token_stream
+tokenize(source_t source,
+         uint32_t begin,
+         uint32_t length) {
+  /*
+   * Remember to free everything from this struct
+   * for example, token_stack.tokens will not necessarily be
+   * equal to tokens after this function has run
+   *
+   */
+  uint32_t position = begin;
+  char *current_token_val;
+  token_stream token_stack;
+  token_val_t current_token;
+  token_t *tokens = (token_t *)calloc(STACK_SIZE, sizeof (token_t));
+
+  assert(begin == 0);
+  assert(length > 0);
+  CHECK(source);
+
+  token_stack.length = 0;
+  token_stack.max_length = STACK_SIZE;
+  token_stack.tokens = tokens;
+  char lookahead = '\0';
+  assert(STACK_SIZE > 0);
+
+  while (begin <= length && source[begin]) {
+    if (source[begin] == '(') {
+      /*Matched a left paren */
+      position = begin + 1;
+      push_token(&token_stack, left_paren);
+    }
+    else if (source[begin] == ')') {
+      /*Matched a left paren */
+      position = begin + 1;
+      push_token(&token_stack, right_paren);
+    }
+    else if (isspace(source[begin])) {
+      position = begin + 1;
+      push_token(&token_stack, whitespace_tok);
+      /* Matched a whitespace character */
+    }
+    else if ((position = match_float(source, begin, length))) {
+      /* Matched a float */
+      lookahead = source[position];
+      source[position] = '\0';
+      source[position] = lookahead;
+      assert(position > begin);
+      current_token_val = (char *)calloc(((position - begin) + 1), sizeof (char));
+      CHECK(current_token_val);
+      extract_token(position, begin, source, current_token_val);
+      current_token.floating = current_token_val;
+      push_token(&token_stack, make_token(current_token, FLOATING));
+    }
+    else if ((position = match_int(source, begin, length))) {
+      /* Matched an int */
+      lookahead = source[position];
+      source[position] = '\0';
+      assert(position > begin);
+      assert(position <= length);
+
+      source[position] = lookahead;
+      current_token_val = (char *)calloc(((position - begin) + 1), sizeof (char));
+      CHECK(current_token_val);
+      extract_token(position, begin, source, current_token_val);
+      current_token.integer = current_token_val;
+      push_token(&token_stack, make_token(current_token, INTEGER));
+    }
+    else if ((position = match_symbol(source, begin, length))) {
+      /* Matched a symbol */
+      lookahead = source[position];
+      source[position] = '\0';
+      assert(position > begin);
+      assert(position <= length);
+
+      source[position] = lookahead;
+      current_token_val = (char *)calloc(((position - begin) + 1), sizeof (char));
+      CHECK(current_token_val);
+      extract_token(position, begin, source, current_token_val);
+      current_token.symbol = current_token_val;
+      push_token(&token_stack, make_token(current_token, SYMBOL));
+    }
+    else if (source[begin] == '\'') {
+      /* Matched a quote (apostrophe) */
+      position = begin + 1;
+      push_token(&token_stack, quote_tok);
+    }
+    else if ((position = match_identifier(source, begin, length))) {
+      /* Matched an identifier */
+      lookahead = source[position];
+      source[position] = '\0';
+      assert(position > begin);
+      assert(position <= length);
+
+      source[position] = lookahead;
+      current_token_val = (char *)calloc(((position - begin) + 1), sizeof (char));
+      CHECK(current_token_val);
+      extract_token(position, begin, source, current_token_val);
+      current_token.identifier = current_token_val;
+      push_token(&token_stack, make_token(current_token, IDENTIFIER));
+      /* Matched an identifier */
+    }
+    else if (position <= begin) {
+      printf("Source is too large to read\n");
+      exit(EXIT_FAILURE);
+    }
+    else {
+      printf("Unmatched token\n");
+      exit(EXIT_FAILURE);
+    }
+    begin = position;
+  }
+
+  return token_stack;
+}
+
+int
+free_token( void *key,
+            void *val) {
+  /* silence warnings about unused parameters, key and val point to the same data*/
+  (void)key;
+  free((char *)val);
+  return true;
+}
+
+bool
+release_tokens(token_stream *tokens) {
+  /* Iterate through the stack, release each token
+   * Then release the entire stack
+   */
+  CHECK(tokens);
+  CHECK(tokens->tokens);
+  assert(tokens->max_length > 0);
+  free(tokens->tokens);
+
+  return true;
+}
+
+#ifndef LIB
+int main(void) {
+  nulltok.token_type = EMPTY;
+  nulltok.token.null_token = false;
+
+  whitespace_tok.token_type = WSPACE;
+  whitespace_tok.token.whitespace= false;
+
+  quote_tok.token_type = QUOTE;
+  quote_tok.token.quote= false;
+
+  void *source_code = malloc(111000);
+  uint32_t nbytes = read(STDIN_FILENO, source_code, 111000);
+  if (nbytes == 0) {
+    exit(EXIT_FAILURE);
+  }
+  token_stream toks = tokenize((source_t)source_code, 0, nbytes);
+  token_t current_tok;
+  while (toks.length > 0) {
+    current_tok = peek_token(&toks);
+    switch (current_tok.token_type) {
+      case SYMBOL:
+        printf("symbol: %s\n", current_tok.token.symbol);
+        break;
+      case IDENTIFIER:
+        printf("identifer: %s\n", current_tok.token.identifier);
+        break;
+      case INTEGER:
+        printf("integer: %s\n", current_tok.token.integer);
+        break;
+      case FLOATING:
+        printf("floating: %s\n", current_tok.token.floating);
+        break;
+      case QUOTE:
+        printf("quote: '\n");
+        break;
+      case WSPACE:
+        printf("whitespace\n");
+        break;
+      case PAREN:
+        printf("paren: %s\n", current_tok.token.parenthesis);
+        break;
+      case EMPTY:
+        printf("this should not be empty\n");
+        break;
+      case STRING:
+        printf("string: %s\n", current_tok.token.string);
+        break;
+      default:
+        printf("oops, there was an unknown token, check valgrind or gdb\n");
+    }
+    pop_token(&toks);
+  }
+  release_tokens(&toks);
+  return 0;
+}
+#endif
diff --git a/tokenize.h b/tokenize.h
new file mode 100644
index 0000000..fd83054
--- /dev/null
+++ b/tokenize.h
@@ -0,0 +1,74 @@
+#define STACK_SIZE 4096
+#define GROWTH_SIZE 65536
+
+typedef char* source_t;
+
+typedef enum {
+  SYMBOL = 0,
+  IDENTIFIER = 1,
+  INTEGER = 2,
+  FLOATING = 3,
+  QUOTE = 4,
+  WSPACE = 5,
+  PAREN = 6 ,
+  EMPTY = 7,
+  STRING = 8
+} tok_t;
+
+typedef union {
+    const char *symbol;
+    const char *identifier;
+    const char *integer;
+    const char *floating;
+    const char *parenthesis;
+    const char *string;
+    bool quote;
+    bool whitespace;
+    bool null_token;
+} token_val_t;
+
+typedef struct {
+  tok_t token_type;
+  token_val_t token;
+} token_t;
+
+typedef struct {
+  size_t length; /* Number of current elements */
+  size_t max_length; /* Maximum length of the stack */
+  token_t *tokens;
+} token_stream;
+
+bool
+push_token(token_stream*, token_t);
+
+bool
+pop_token(token_stream*);
+
+token_t
+peek_token(token_stream*);
+
+token_stream
+tokenize(source_t, uint32_t, const uint32_t);
+
+bool
+release_tokens(token_stream*);
+
+#ifndef LIB
+static uint32_t
+match_int(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_float(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_identifier(source_t, uint32_t, const uint32_t);
+
+static uint32_t
+match_symbol(source_t, uint32_t, const uint32_t);
+#endif
+
+int
+free_token(const void *,
+           const void *);
+token_t
+testfunc(void);