/*
 * S-expression tokenizer.
 *
 * Provenance: reconstructed from a flattened dump of
 *   commit 215445c08ccd9c7f64c36a876a304bb6789385eb
 *   Author: nisstyre56
 *   Date:   Wed Jul 2 00:40:10 2014 -0400
 *   Message: initial commit
 * which added tokenize.c (blob cf0bf46), tokenize.h (blob 9cdad42) and
 * tokenize.py (blob 7e8dfd3).  The three files are kept together here as
 * three clearly marked sections so the unit compiles on its own.
 *
 * The dump had lost the targets of the seven system #include lines; the
 * headers below are the ones the code actually uses.
 */
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* ===================================================================== */
/* tokenize.h -- public interface                                        */
/* ===================================================================== */
#ifndef TOKENIZE_H
#define TOKENIZE_H

#define STACK_SIZE 4096  /* initial capacity of a token stream, in tokens */
#define GROWTH_SIZE 512  /* capacity added each time the stream is full   */

typedef char* source_t;

/* Numeric values are part of the ABI: the ctypes driver indexes by them. */
typedef enum {
  SYMBOL = 0,
  IDENTIFIER = 1,
  INTEGER = 2,
  FLOATING = 3,
  QUOTE = 4,
  WSPACE = 5,
  PAREN = 6 ,
  EMPTY = 7
} tok_t;

/*
 * Payload of a token.  symbol/identifier/integer/floating point at heap
 * text owned by the token (freed by pop_token); parenthesis points at a
 * string literal and is never freed.
 */
typedef union {
  char *symbol;
  char *identifier;
  char *integer;
  char *floating;
  char *parenthesis;
  bool quote;
  bool whitespace;
  bool null_token;
} token_val_t;

typedef struct {
  tok_t token_type;
  token_val_t token;
} token_t;

typedef struct {
  size_t length; /* Number of current elements */
  size_t max_length; /* Maximum length of the stack */
  token_t *tokens;
} token_stream;

bool push_token(token_stream*, token_t);

bool pop_token(token_stream*);

token_t peek_token(token_stream*);

token_stream tokenize(source_t, uint32_t, const uint32_t);

bool release_tokens(token_stream*);

#endif /* TOKENIZE_H */

/* ===================================================================== */
/* tokenize.c -- implementation                                          */
/* ===================================================================== */

/*
 * Private matchers.  These prototypes used to sit in the public header,
 * where every includer would get a "declared static but never defined"
 * diagnostic; they belong to the implementation file.
 *
 * Each matcher returns 0 for "no match", otherwise the index one past the
 * last matched character.  A match always consumes at least one character,
 * so a successful result is never 0.
 */
static uint32_t match_int(source_t, uint32_t, const uint32_t);
static uint32_t match_float(source_t, uint32_t, const uint32_t);
static uint32_t match_identifier(source_t, uint32_t, const uint32_t);
static uint32_t match_symbol(source_t, uint32_t, const uint32_t);

static const token_t nulltok = {.token_type=EMPTY, .token={.null_token=false} };

static const token_t whitespace_tok = {.token_type=WSPACE, .token={.whitespace=true } };

static const token_t quote_tok = {.token_type=QUOTE, .token={.quote=true} };

static const token_t left_paren = {.token_type=PAREN, .token={.parenthesis="("} };

static const token_t right_paren = {.token_type=PAREN, .token={.parenthesis=")"} };

/* Report an unrecoverable error on stderr and terminate the process. */
static void
tokenizer_fatal(const char *message) {
  fputs(message, stderr);
  exit(EXIT_FAILURE);
}

/*
 * Copy the first n characters of `in` into `out` and NUL-terminate it.
 * `out` must have room for n + 1 bytes.  Returns `out`.
 *
 * Uses memcpy rather than strlen + snprintf("%s"): those scan the whole
 * remainder of the source for every token, which made tokenizing
 * quadratic in the input length.
 */
static
inline
char *
string_head(uint32_t n, char *in, char *out) {
  assert(in != NULL);
  assert(out != NULL);
  assert(n > 0);
  assert(memchr(in, '\0', n) == NULL); /* n must not run past the end of in */

  memcpy(out, in, n);
  out[n] = '\0';
  return out;
}

/* Bundle a payload and a type tag into a token. */
static
inline
token_t
make_token(token_val_t val, tok_t toktype) {
  token_t result;
  result.token_type = toktype;
  result.token = val;
  return result;
}

/*
 * Append `token` to the stream, growing the backing array by GROWTH_SIZE
 * entries when it is full.  The stream takes ownership of any heap text
 * the token carries.  Always returns true; allocation failure terminates
 * the process.
 */
bool
push_token(token_stream *tokens, token_t token) {
  assert(tokens != NULL);
  assert(tokens->length <= tokens->max_length);
  assert(tokens->max_length > 0);

  if (tokens->length == tokens->max_length) {
    const size_t grown_max = tokens->max_length + GROWTH_SIZE;
    token_t *grown = NULL;

    /* Refuse sizes that wrapped around or whose byte count would. */
    if (grown_max > tokens->max_length &&
        grown_max <= SIZE_MAX / sizeof *grown) {
      /* Keep the old pointer until realloc is known to have succeeded. */
      grown = realloc(tokens->tokens, grown_max * sizeof *grown);
    }
    if (grown == NULL) {
      tokenizer_fatal("Could not allocate enough memory for the token stack\n");
    }
    tokens->tokens = grown;
    tokens->max_length = grown_max;
  }

  tokens->tokens[tokens->length] = token;
  tokens->length++;
  return true;
}

/*
 * Remove the most recently pushed token and free the text it owns.
 * Returns true on success.  Popping an empty stream trips an assert in
 * debug builds and returns false in NDEBUG builds (it used to underflow
 * the index there).
 */
bool
pop_token(token_stream *tokens) {
  size_t last;

  assert(tokens != NULL);
  assert(tokens->length != 0);
  assert(tokens->tokens != NULL);

  if (tokens->length == 0) {
    return false;
  }
  last = tokens->length - 1;

  switch (tokens->tokens[last].token_type) {
    case SYMBOL:
      free(tokens->tokens[last].token.symbol);
      break;
    case IDENTIFIER:
      free(tokens->tokens[last].token.identifier);
      break;
    case INTEGER:
      free(tokens->tokens[last].token.integer);
      break;
    case FLOATING:
      free(tokens->tokens[last].token.floating);
      break;
    default:
      /* QUOTE, WSPACE, PAREN and EMPTY own no heap memory. */
      break;
  }

  tokens->tokens[last] = nulltok; /* do not leave a dangling pointer in the slot */
  tokens->length = last;
  return true;
}

/*
 * Return a copy of the most recently pushed token without removing it.
 * The copy shares its text pointer with the stream, so it is only valid
 * until that token is popped.  Returns the EMPTY token when the stream is
 * empty or inconsistent (NDEBUG builds; debug builds assert first).
 */
token_t
peek_token(token_stream *tokens) {
  size_t len;
  size_t max;

  assert(tokens != NULL); /* checked before the first dereference */
  len = tokens->length;
  max = tokens->max_length;
  assert(len != 0);

  if (len == 0 || len > max) {
    return nulltok;
  }
  return tokens->tokens[len-1];
}

/* Match one or more decimal digits starting at `begin`. */
static
inline
uint32_t
match_digits(source_t source, uint32_t begin, const uint32_t length) {
  uint32_t i = begin;

  /* <ctype.h> functions require a value representable as unsigned char. */
  while (i < length && isdigit((unsigned char)source[i])) {
    i++;
  }
  return (i == begin) ? 0 : i;
}

/* Match an optional '+' or '-' followed by one or more digits. */
static
inline
uint32_t
match_int(source_t source, uint32_t begin, const uint32_t length) {
  uint32_t i = begin;
  assert(source != NULL);

  if (i < length && (source[i] == '+' || source[i] == '-')) {
    i++;
  }
  return match_digits(source, i, length);
}

/*
 * Match a floating point literal:
 *     [sign] digits '.' digits      e.g.  1.5   -0.25
 *     [sign] '.' digits             e.g.  .5    +.043
 * A literal with no digits after the point ("1.") is not a float.
 *
 * Two inputs the previous version accepted by accident are rejected:
 *   - a sign between the integer part and the point ("12+.5" is now the
 *     integer 12 followed by the float +.5);
 *   - a sign inside the fraction ("1.+5", "1.-5").
 */
static
inline
uint32_t
match_float(source_t source, uint32_t begin, const uint32_t length) {
  uint32_t i = begin;
  uint32_t leading_int_match;
  assert(source != NULL);

  leading_int_match = match_int(source, begin, length);
  if (leading_int_match) {
    i = leading_int_match;
  }
  else if (i < length && (source[i] == '+' || source[i] == '-')) {
    /* No integer part: a sign may still sit directly before the point. */
    i++;
  }

  if (i >= length || source[i] != '.') {
    return 0;
  }
  i++;

  /* The fraction is digits only. */
  return match_digits(source, i, length);
}

/*
 * Match a run of one or more characters that are neither whitespace nor
 * a parenthesis.  Braces, square brackets and apostrophes are ordinary
 * identifier characters as far as this function is concerned.
 */
static
inline
uint32_t
match_identifier(source_t source, uint32_t begin, const uint32_t length) {
  uint32_t i = begin;
  assert(source != NULL);

  while (i < length &&
         source[i] != '(' &&
         source[i] != ')' &&
         !isspace((unsigned char)source[i])) {
    i++;
  }
  return (i == begin) ? 0 : i;
}

/* Match an apostrophe immediately followed by an identifier. */
static
inline
uint32_t
match_symbol(source_t source, uint32_t begin, const uint32_t length) {
  assert(source != NULL);

  if (begin >= length || source[begin] != '\'') {
    return 0;
  }
  return match_identifier(source, begin + 1, length);
}

/* Copy source[begin, position) into token_val as a NUL-terminated string. */
static
inline
void
extract_token(uint32_t position,
              uint32_t begin,
              source_t source,
              char *token_val) {
  assert(position > begin);
  string_head(position - begin,
              &source[begin],
              token_val);
}

/*
 * Copy source[begin, end) into fresh heap memory and push it as a token
 * of the given textual type (SYMBOL, IDENTIFIER, INTEGER or FLOATING).
 */
static
void
push_text_token(token_stream *stream,
                source_t source,
                uint32_t begin,
                uint32_t end,
                tok_t toktype) {
  token_val_t val;
  char *text;

  assert(end > begin);
  text = calloc((size_t)(end - begin) + 1, sizeof *text);
  if (text == NULL) {
    tokenizer_fatal("Out of memory\n");
  }
  extract_token(end, begin, source, text);

  /* The four textual members are all char * sharing the union's storage. */
  val.identifier = text;
  push_token(stream, make_token(val, toktype));
}

/*
 * Tokenize source[begin, length).  Scanning also stops at the first NUL.
 *
 * The caller owns the returned stream and must hand it to release_tokens.
 * An empty range yields an empty (but still releasable) stream.
 *
 * Order matters: float before int so "1.5" is not split, symbol before
 * the bare quote, and the bare quote before identifier.  The last point
 * is a fix: match_identifier accepts an apostrophe, so with the old order
 * the QUOTE branch could never run and "'(" produced an identifier "'".
 */
token_stream
tokenize(source_t source, uint32_t begin, const uint32_t length) {
  token_stream token_stack;
  uint32_t position;

  assert(source != NULL);
  assert(STACK_SIZE > 0);

  token_stack.length = 0;
  token_stack.max_length = STACK_SIZE;
  token_stack.tokens = calloc(STACK_SIZE, sizeof *token_stack.tokens);
  if (token_stack.tokens == NULL) {
    tokenizer_fatal("Could not allocate enough memory for the token stack\n");
  }

  /* Strictly less than: index `length` is outside the range. */
  while (begin < length && source[begin]) {
    const char c = source[begin];

    if ((position = match_float(source, begin, length))) {
      push_text_token(&token_stack, source, begin, position, FLOATING);
    }
    else if ((position = match_int(source, begin, length))) {
      push_text_token(&token_stack, source, begin, position, INTEGER);
    }
    else if ((position = match_symbol(source, begin, length))) {
      push_text_token(&token_stack, source, begin, position, SYMBOL);
    }
    else if (c == '(') {
      /* Matched a left paren */
      position = begin + 1;
      push_token(&token_stack, left_paren);
    }
    else if (c == ')') {
      /* Matched a right paren */
      position = begin + 1;
      push_token(&token_stack, right_paren);
    }
    else if (c == '\'') {
      /* Matched a quote (apostrophe) not followed by an identifier */
      position = begin + 1;
      push_token(&token_stack, quote_tok);
    }
    else if ((position = match_identifier(source, begin, length))) {
      push_text_token(&token_stack, source, begin, position, IDENTIFIER);
    }
    else if (isspace((unsigned char)c)) {
      /* Matched a whitespace character */
      position = begin + 1;
      push_token(&token_stack, whitespace_tok);
    }
    else {
      tokenizer_fatal("Unmatched token\n");
    }
    begin = position;
  }
  return token_stack;
}

/*
 * Pop (and thereby free) every remaining token, then free the backing
 * array.  The stream is left with tokens == NULL and max_length == 0 so a
 * stale pointer cannot be freed twice.  Always returns true.
 */
bool
release_tokens(token_stream *tokens) {
  assert(tokens != NULL);
  assert(tokens->tokens != NULL);
  assert(tokens->max_length > 0);

  while (tokens->length > 0) {
    pop_token(tokens);
  }
  free(tokens->tokens);
  tokens->tokens = NULL;
  tokens->max_length = 0;
  return true;
}

/* =====================================================================
 * tokenize.py -- Python 2 ctypes driver from the same commit, kept
 * verbatim for reference (line breaks restored).  It loads the code
 * above as ./tokenize.so.
 * =====================================================================

#! /usr/bin/python2

from ctypes import *

tokenizer = cdll.LoadLibrary("./tokenize.so")

toktypes = { 0 : "symbol",
             1 : "identifier",
             2 : "integer",
             3 : "floating",
             4 : "quote",
             5 : "whitespace",
             6 : "parenthesis"}

class TokenValT(Union):
    _fields_ = [("symbol", c_char_p),
                ("identifier", c_char_p),
                ("integer", c_char_p),
                ("floating", c_char_p),
                ("parenthesis", c_char_p),
                ("quote", c_bool),
                ("whitespace", c_bool),
                ("null_token", c_bool)]

class TokenT(Structure):
    _fields_ = [("token_type", c_int),
                ("token", TokenValT)]

class TokStream(Structure):
    _fields_ = [("length", c_size_t),
                ("max_length", c_size_t),
                ("tokens", POINTER(TokenT))]

tokenizer.tokenize.restype = TokStream
tokenizer.peek_token.restype = TokenT
tokenizer.pop_token.restype = c_bool
tokenizer.release_tokens.restype = c_bool

def tokenize(source):
    tokens = tokenizer.tokenize(source, 0, len(source))
    tp = pointer(tokens)
    while tokens.length > 0:
        tok = tokenizer.peek_token(tp)
        ttype = toktypes[tok.token_type]
        yield (ttype, getattr(tok.token, ttype))
        tokenizer.pop_token(tp)
    tokenizer.release_tokens(tp)

tokens = tokenize("(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"*1610)
xs = list(tokens)

#print list(tokens)

 * ===================================================================== */