commit
215445c08c
3 changed files with 510 additions and 0 deletions
@ -0,0 +1,405 @@ |
|||||
|
#include <stdint.h> |
||||
|
#include <stdio.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <ctype.h> |
||||
|
#include <stdbool.h> |
||||
|
#include <string.h> |
||||
|
#include <assert.h> |
||||
|
#include "tokenize.h" |
||||
|
|
||||
|
/*
|
||||
|
* This is a basic s-expression tokenizer |
||||
|
* it also tokenizes things like number, string, and symbol literals |
||||
|
*/ |
||||
|
|
||||
|
/* Sentinel token returned by peek_token when the stack state is invalid
 * (EMPTY is never produced by the matchers themselves). */
static const token_t nulltok = {.token_type=EMPTY, {.null_token=false}};

/* Shared, immutable singleton tokens for the single-character token
 * kinds; push_token copies them by value, so they are never freed. */
static const token_t whitespace_tok = {.token_type=WSPACE, .token={.whitespace=true } };

static const token_t quote_tok = {.token_type=QUOTE, .token={.quote=true} };

static const token_t left_paren = {.token_type=PAREN, .token={.parenthesis="("} };

static const token_t right_paren = {.token_type=PAREN, .token={.parenthesis=")"} };
||||
|
|
||||
|
static
inline
char *
string_head(uint32_t n, char *in, char *out) {
    /* Copy the first n characters of in into out, NUL-terminate out,
     * and return out.
     *
     * out must be large enough to store the number of characters you
     * want to select from in, plus a byte for the NUL terminator.
     * n must be at least 1 and no larger than strlen(in) (checked by
     * assert in debug builds only).
     */
#ifndef NDEBUG
    size_t in_len = strlen(in);
#endif
    assert(n > 0 && n <= in_len);

    /* snprintf always NUL-terminates (size > 0) and returns the length
     * of the *untruncated* source, so success implies == strlen(in). */
    int written = snprintf(out, (size_t)n + 1, "%s", in);

    /* cast avoids a signed/unsigned comparison; written >= 0 here */
    assert((written != -1) && ((size_t)written == in_len));

    if (written == -1) {
        /* BUG FIX: snprintf returns -1 on an output/encoding error, not
         * out-of-memory, and diagnostics belong on stderr. */
        fprintf(stderr, "string_head: snprintf failed\n");
        exit(EXIT_FAILURE);
    }
    return out;
}
||||
|
|
||||
|
static |
||||
|
inline |
||||
|
token_t |
||||
|
make_token(token_val_t val, tok_t toktype) { |
||||
|
token_t result; |
||||
|
result.token_type = toktype; |
||||
|
result.token = val; |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
bool |
||||
|
push_token(token_stream *tokens, token_t token) { |
||||
|
/*
|
||||
|
* Check if tokens points to NULL |
||||
|
*/ |
||||
|
|
||||
|
size_t len; |
||||
|
size_t max; |
||||
|
|
||||
|
assert(tokens != NULL); |
||||
|
|
||||
|
len = tokens->length; |
||||
|
max = tokens->max_length; |
||||
|
|
||||
|
assert(len <= max); |
||||
|
assert(max > 0); |
||||
|
|
||||
|
if (len == max) { |
||||
|
/* We've reached the maximum stack size
|
||||
|
* So we must try to increase that by GROWTH_SIZE |
||||
|
*/ |
||||
|
token_t *new_tokens = realloc(tokens->tokens, sizeof(token_t) * (max + GROWTH_SIZE)); |
||||
|
if (!new_tokens) { |
||||
|
printf("Could not allocate enough memory for the token stack\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
tokens->tokens = new_tokens; |
||||
|
tokens->max_length = max + GROWTH_SIZE; |
||||
|
tokens->tokens[len] = token; |
||||
|
tokens->length++; |
||||
|
return true; |
||||
|
} |
||||
|
tokens->tokens[len] = token; |
||||
|
tokens->length++; |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
bool |
||||
|
pop_token(token_stream *tokens) { |
||||
|
size_t len; |
||||
|
assert(tokens != NULL); |
||||
|
|
||||
|
len = tokens->length; |
||||
|
|
||||
|
assert(len != 0); |
||||
|
len--; |
||||
|
assert(tokens->tokens != NULL); |
||||
|
|
||||
|
switch (tokens->tokens[len].token_type) { |
||||
|
case SYMBOL: |
||||
|
free(tokens->tokens[len].token.symbol); |
||||
|
break; |
||||
|
case IDENTIFIER: |
||||
|
free(tokens->tokens[len].token.identifier); |
||||
|
break; |
||||
|
case INTEGER: |
||||
|
free(tokens->tokens[len].token.integer); |
||||
|
break; |
||||
|
case FLOATING: |
||||
|
free(tokens->tokens[len].token.floating); |
||||
|
break; |
||||
|
default: |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
tokens->length--; |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
inline |
||||
|
token_t |
||||
|
peek_token(token_stream *tokens) { |
||||
|
/*
|
||||
|
* Check if tokens points to NULL |
||||
|
*/ |
||||
|
size_t len = tokens->length; |
||||
|
size_t max = tokens->max_length; |
||||
|
assert(tokens != NULL); |
||||
|
assert(len != 0); |
||||
|
|
||||
|
if (len == 0 || len > max) { |
||||
|
return nulltok; |
||||
|
} |
||||
|
return tokens->tokens[len-1]; |
||||
|
} |
||||
|
|
||||
|
static |
||||
|
inline |
||||
|
uint32_t |
||||
|
match_int(source_t source, uint32_t begin, const uint32_t length) { |
||||
|
/* Return false if there is no match
|
||||
|
* otherwise return the position of the end of the match + 1 |
||||
|
*/ |
||||
|
uint32_t i = begin; |
||||
|
uint32_t test; |
||||
|
assert(source != NULL); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
if (source[i] == '+' || |
||||
|
source[i] == '-') { |
||||
|
i++; |
||||
|
} |
||||
|
test = i; |
||||
|
while (i < length && |
||||
|
isdigit(source[i])) { |
||||
|
i++; |
||||
|
} |
||||
|
if (i == test) |
||||
|
return false; |
||||
|
return i; |
||||
|
} |
||||
|
|
||||
|
static
inline
uint32_t
match_float(source_t source, uint32_t begin, const uint32_t length) {
    /* Match a floating-point literal at begin.
     *
     * Return false (0) if there is no match, otherwise:
     *   - a leading decimal point followed by a valid int match:
     *       return the position of the end of the match
     *   - a leading valid int match with no decimal point after it:
     *       return false
     *   - a decimal point match followed by a valid int match:
     *       return the position of the match
     *   - no valid int match at all:
     *       return false
     * ALWAYS returns the position + 1 to avoid confusion with false
     * (0 would otherwise be a valid index).
     */
    uint32_t i, leading_int_match, trailing_int_match;
    assert(source != NULL);
    assert(length > 0);

    i = begin;
    /* optional integer part before the decimal point */
    leading_int_match = match_int(source, i, length);

    if (leading_int_match) {
        i = leading_int_match;
    }

    assert(i <= length);

    /* NOTE(review): the == '+' and == '-' alternates are subsumed by
     * the != '.' test (a sign character is never '.'), so this
     * condition reduces to source[i] != '.'. */
    if (source[i] != '.' ||
        source[i] == '+' ||
        source[i] == '-') {
        if (((i+1) <= length) && /* Make sure there is at least two characters to look at */
            ((source[i] == '+') ||
             (source[i] == '-'))
            && (source[i+1] == '.')) {
            /* a bare sign directly before the point, e.g. "+.5" */
            i++;
        }
        else {
            return false;
        }
    }
    /* step over the decimal point */
    i++;

    /* digits after the decimal point are mandatory */
    trailing_int_match = match_int(source, i, length);
    if (trailing_int_match) {
        return trailing_int_match;
    }
    return false;
}
||||
|
|
||||
|
static |
||||
|
inline |
||||
|
uint32_t |
||||
|
match_identifier(source_t source, uint32_t begin, const uint32_t length) { |
||||
|
|
||||
|
/* Return false if there is no match
|
||||
|
* if there is a match for any characters that are not: |
||||
|
* whitespace |
||||
|
* a parenthesis ( ) |
||||
|
* a brace { } |
||||
|
* a square bracket [ ] |
||||
|
* then return the position of the match + 1 |
||||
|
* if there is nothing else to match: |
||||
|
* return false |
||||
|
*/ |
||||
|
uint32_t i = begin; |
||||
|
assert(source != NULL); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
while (i < length && |
||||
|
!(source[i] == '(' || |
||||
|
source[i] == ')' || |
||||
|
isspace(source[i]))) { |
||||
|
i++; |
||||
|
} |
||||
|
|
||||
|
if (i == begin) { |
||||
|
return false; |
||||
|
} |
||||
|
assert(i <= length); |
||||
|
return i; |
||||
|
} |
||||
|
|
||||
|
static |
||||
|
inline |
||||
|
uint32_t |
||||
|
match_symbol(source_t source, uint32_t begin, const uint32_t length) { |
||||
|
uint32_t i, identifier_match; |
||||
|
assert(source != NULL); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
i = begin; |
||||
|
if (source[i] != '\'') { |
||||
|
return false; |
||||
|
} |
||||
|
i++; |
||||
|
|
||||
|
identifier_match = match_identifier(source, i, length); |
||||
|
if (identifier_match) { |
||||
|
return identifier_match; |
||||
|
} |
||||
|
assert(identifier_match <= length); |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
static |
||||
|
inline |
||||
|
void |
||||
|
extract_token(uint32_t position, |
||||
|
uint32_t begin, |
||||
|
source_t source, |
||||
|
char *token_val) { |
||||
|
assert(position > begin); |
||||
|
string_head(position - begin, |
||||
|
&source[begin], |
||||
|
token_val); |
||||
|
} |
||||
|
|
||||
|
token_stream |
||||
|
tokenize(source_t source, uint32_t begin, const uint32_t length) { |
||||
|
/*
|
||||
|
* Remember to free everything from this struct |
||||
|
* for example, token_stack.tokens will not necessarily be |
||||
|
* equal to tokens after this function has run |
||||
|
* |
||||
|
*/ |
||||
|
uint32_t position = begin; |
||||
|
char *current_token_val; |
||||
|
token_stream token_stack; |
||||
|
token_val_t current_token; |
||||
|
token_t *tokens = calloc(STACK_SIZE, sizeof(token_t)); |
||||
|
|
||||
|
assert(begin == 0); |
||||
|
assert(length > 0); |
||||
|
assert(source != NULL); |
||||
|
|
||||
|
token_stack.length = 0; |
||||
|
token_stack.max_length = STACK_SIZE; |
||||
|
token_stack.tokens = tokens; |
||||
|
assert(STACK_SIZE > 0); |
||||
|
|
||||
|
|
||||
|
while (begin <= length && source[begin]) { |
||||
|
if ((position = match_float(source, begin, length))) { |
||||
|
/* Matched a float */ |
||||
|
assert(position > begin); |
||||
|
|
||||
|
current_token_val = calloc(((position - begin) + 1), sizeof(char)); |
||||
|
assert(current_token_val != NULL); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
current_token.floating = current_token_val; |
||||
|
|
||||
|
push_token(&token_stack, make_token(current_token, FLOATING)); |
||||
|
} |
||||
|
else if ((position = match_int(source, begin, length))) { |
||||
|
/* Matched an int */ |
||||
|
assert(position > begin); |
||||
|
assert(position <= length); |
||||
|
current_token_val = calloc(((position - begin) + 1), sizeof(char)); |
||||
|
assert(current_token_val != NULL); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
|
||||
|
current_token.integer = current_token_val; |
||||
|
|
||||
|
push_token(&token_stack, make_token(current_token, INTEGER)); |
||||
|
} |
||||
|
else if ((position = match_symbol(source, begin, length))) { |
||||
|
/* Matched a symbol */ |
||||
|
assert(position > begin); |
||||
|
assert(position <= length); |
||||
|
current_token_val = calloc(((position - begin) + 1), sizeof(char)); |
||||
|
assert(current_token_val != NULL); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
|
||||
|
current_token.symbol = current_token_val; |
||||
|
|
||||
|
push_token(&token_stack, make_token(current_token, SYMBOL)); |
||||
|
|
||||
|
} |
||||
|
else if ((position = match_identifier(source, begin, length))) { |
||||
|
assert(position > begin); |
||||
|
assert(position <= length); |
||||
|
current_token_val = calloc(((position - begin) + 1), sizeof(char)); |
||||
|
assert(current_token_val != NULL); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
|
||||
|
current_token.identifier = current_token_val; |
||||
|
|
||||
|
push_token(&token_stack, make_token(current_token, IDENTIFIER)); |
||||
|
/* Matched an identifier */ |
||||
|
} |
||||
|
else if (source[begin] == '(') { |
||||
|
/*Matched a left paren */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, left_paren); |
||||
|
} |
||||
|
else if (source[begin] == ')') { |
||||
|
/*Matched a left paren */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, right_paren); |
||||
|
} |
||||
|
else if (source[begin] == '\'') { |
||||
|
/* Matched a quote (apostrophe) */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, quote_tok); |
||||
|
} |
||||
|
else if (isspace(source[begin])) { |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, whitespace_tok); |
||||
|
/* Matched a whitespace character */ |
||||
|
} |
||||
|
else { |
||||
|
printf("Unmatched token\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
begin = position; |
||||
|
} |
||||
|
return token_stack; |
||||
|
} |
||||
|
|
||||
|
bool |
||||
|
release_tokens(token_stream *tokens) { |
||||
|
/* Iterate through the stack, release each token
|
||||
|
* Then release the entire stack |
||||
|
*/ |
||||
|
assert(tokens != NULL); |
||||
|
assert(tokens->tokens != NULL); |
||||
|
assert(tokens->max_length > 0); |
||||
|
|
||||
|
while(tokens->length > 0) { |
||||
|
pop_token(tokens); |
||||
|
} |
||||
|
free(tokens->tokens); |
||||
|
return true; |
||||
|
} |
@ -0,0 +1,52 @@ |
|||||
|
/* Initial token-stack capacity and the increment used when it fills
 * (see push_token in tokenize.c). */
#define STACK_SIZE 4096
#define GROWTH_SIZE 512

/* A NUL-terminated buffer of source text to tokenize. */
typedef char* source_t;

/* Discriminant tag stored alongside every token value. */
typedef enum {
    SYMBOL = 0,      /* quoted identifier, e.g. 'foo (heap payload) */
    IDENTIFIER = 1,  /* bare word (heap payload) */
    INTEGER = 2,     /* optionally signed digit run (heap payload) */
    FLOATING = 3,    /* decimal literal (heap payload) */
    QUOTE = 4,       /* a bare apostrophe */
    WSPACE = 5,      /* a single whitespace character */
    PAREN = 6,       /* "(" or ")" */
    EMPTY = 7        /* sentinel returned by peek_token on bad state */
} tok_t;

/* Token payload; the live member is selected by the tok_t tag.
 * The first four char* members own heap memory freed by pop_token. */
typedef union {
    char *symbol;
    char *identifier;
    char *integer;
    char *floating;
    char *parenthesis;
    bool quote;
    bool whitespace;
    bool null_token;
} token_val_t;

/* One token: a type tag plus its payload. */
typedef struct {
    tok_t token_type;
    token_val_t token;
} token_t;

/* Growable stack of tokens produced by tokenize(). */
typedef struct {
    size_t length;     /* Number of current elements */
    size_t max_length; /* Maximum length of the stack */
    token_t *tokens;   /* heap array; may move when the stack grows */
} token_stream;

bool push_token(token_stream*, token_t);

bool pop_token(token_stream*);

token_t peek_token(token_stream*);

token_stream tokenize(source_t, uint32_t, const uint32_t);

bool release_tokens(token_stream*);

/* NOTE(review): declaring static functions in a header is unusual --
 * every includer gets a declaration for a function it cannot define.
 * Confirm these are meant to be internal to tokenize.c only. */
static uint32_t match_int(source_t, uint32_t, const uint32_t);

static uint32_t match_float(source_t, uint32_t, const uint32_t);

static uint32_t match_identifier(source_t, uint32_t, const uint32_t);

static uint32_t match_symbol(source_t, uint32_t, const uint32_t);
@ -0,0 +1,53 @@ |
|||||
|
#! /usr/bin/python2

# ctypes driver for the C s-expression tokenizer; build tokenize.c as a
# shared library named tokenize.so in the current directory first.

from ctypes import *

tokenizer = cdll.LoadLibrary("./tokenize.so")

# Maps each C tok_t enum value to the name of the token_val_t union
# member that holds its payload.
# NOTE(review): EMPTY (7) has no entry here; a peek_token sentinel
# would raise KeyError -- the length guard in tokenize() below is what
# keeps that from happening.  Confirm that is intentional.
toktypes = { 0 : "symbol",
             1 : "identifier",
             2 : "integer",
             3 : "floating",
             4 : "quote",
             5 : "whitespace",
             6 : "parenthesis"}
||||
|
|
||||
|
class TokenValT(Union):
    # ctypes mirror of the C token_val_t union in tokenize.h; field
    # order and types must stay in sync with the header.
    _fields_ = [("symbol", c_char_p),
                ("identifier", c_char_p),
                ("integer", c_char_p),
                ("floating", c_char_p),
                ("parenthesis", c_char_p),
                ("quote", c_bool),
                ("whitespace", c_bool),
                ("null_token", c_bool)]
||||
|
|
||||
|
class TokenT(Structure):
    # ctypes mirror of the C token_t struct: a tok_t tag plus payload.
    _fields_ = [("token_type", c_int),
                ("token", TokenValT)]
||||
|
|
||||
|
class TokStream(Structure):
    # ctypes mirror of the C token_stream struct (the token stack).
    _fields_ = [("length", c_size_t),
                ("max_length", c_size_t),
                ("tokens", POINTER(TokenT))]
||||
|
|
||||
|
# Declare the C return types; without these ctypes would assume every
# function returns a C int.
tokenizer.tokenize.restype = TokStream
tokenizer.peek_token.restype = TokenT
tokenizer.pop_token.restype = c_bool
tokenizer.release_tokens.restype = c_bool
||||
|
|
||||
|
def tokenize(source):
    # Yield (type-name, payload) pairs for every token in source,
    # popping the C token stack as it goes, then release the stack.
    stack = tokenizer.tokenize(source, 0, len(source))
    stack_ptr = pointer(stack)
    while stack.length > 0:
        top = tokenizer.peek_token(stack_ptr)
        kind = toktypes[top.token_type]
        yield (kind, getattr(top.token, kind))
        tokenizer.pop_token(stack_ptr)
    tokenizer.release_tokens(stack_ptr)
||||
|
|
||||
|
# Exercise the tokenizer on a large, deliberately messy input mixing
# valid and invalid-looking lexemes, repeated to force stack growth.
tokens = tokenize("(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"*1610)
xs = list(tokens)

#print list(tokens)
||||
|
|
Reference in new issue