From afc4b0b1c1fd9454d3d73446536d99cf7581a0a5 Mon Sep 17 00:00:00 2001 From: wes Date: Fri, 4 Dec 2015 21:29:49 -0500 Subject: [PATCH] first commit --- Makefile | 21 +++ error.h | 1 + tokenize.c | 503 +++++++++++++++++++++++++++++++++++++++++++++++++++++ tokenize.h | 74 ++++++++ 4 files changed, 599 insertions(+) create mode 100644 Makefile create mode 100644 error.h create mode 100644 tokenize.c create mode 100644 tokenize.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cd60754 --- /dev/null +++ b/Makefile @@ -0,0 +1,21 @@ +default: tokenize.c tokenize.h + $(MAKE) lib; + $(CC) -Wall -Wextra -pedantic -Wpointer-arith -Werror -O3 ./tokenize.c -lmaa -o tokenize_test; + +unsafe: tokenize.c tokenize.h + $(CC) -DNDEBUG -O3 ./tokenize.c -lmaa; + +unsafelib: tokenize.c tokenize.h + $(CC) -DLIB -DNDEBUG -c -fpic -O3 ./tokenize.c; + $(CC) -shared -o tokenize.so tokenize.o -lmaa; + +lib: tokenize.c tokenize.h + $(CC) -DLIB -DNDEBUG -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -O3 ./tokenize.c + $(CC) -shared -o libtokenize.so tokenize.o -lmaa; + +debug: tokenize.c tokenize.h + $(CC) -Wall -Wextra -pedantic -Wpointer-arith -Werror ./tokenize.c -lmaa -o tokenize_debug; + +debuglib: tokenize.c tokenize.h + $(CC) -g -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror ./tokenize.c; + $(CC) -shared -o tokenize.so tokenize.o -lmaa; diff --git a/error.h b/error.h new file mode 100644 index 0000000..24bfcc3 --- /dev/null +++ b/error.h @@ -0,0 +1 @@ +#define CHECK(ptr) if ((ptr) == NULL) { printf("Failed to allocate memory\n"); exit(EXIT_FAILURE); } diff --git a/tokenize.c b/tokenize.c new file mode 100644 index 0000000..4bab20a --- /dev/null +++ b/tokenize.c @@ -0,0 +1,503 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "error.h" +#include "tokenize.h" + +/* + * This is a basic s-expression tokenizer + * it also tokenizes things like number, string, and symbol literals + */ + +token_t nulltok; + +static token_t whitespace_tok; + +static token_t quote_tok; + +static token_t left_paren = { + PAREN, + { + "(" + } +}; + +static token_t right_paren = { + PAREN, + { + ")" + } +}; + +static inline char * +string_head(uint32_t n, + char *in, + char *out) { + /* out must be large enough to store the number of characters + * you want to select from in, plus a byte for the null terminator + */ +#ifndef NDEBUG + size_t in_len = strlen(in); +#endif + assert(n > 0 && n <= in_len); + int iserror = snprintf(out, (size_t)n+1 , "%s", in); + + assert((iserror != -1) && ((size_t)iserror == in_len)); + + if (iserror == -1) { + printf("Out of memory"); + exit(EXIT_FAILURE); + } + return ( char*)out; +} + +static inline token_t +make_token(token_val_t val, + tok_t toktype) { + token_t result; + result.token_type = toktype; + result.token = val; + return result; +} + +token_t +testfunc(void) { + token_val_t wspace; + wspace.whitespace = true; + return make_token(wspace, QUOTE); +} + + +bool +push_token(token_stream *tokens, + token_t token) { + /* + * Check if tokens points to NULL + */ + + size_t len; + size_t max; + + CHECK(tokens); + + len = tokens->length; + max = tokens->max_length; + + assert(len <= max); + assert(max > 0); + + if (len == max) { + /* We've reached the maximum stack size + * So we must try to increase that by GROWTH_SIZE + */ + token_t *new_tokens = (token_t *)realloc(tokens->tokens, sizeof (token_t) * (max + GROWTH_SIZE)); + if (!new_tokens) { + printf("Could not allocate enough memory for the token stack\n"); + exit(EXIT_FAILURE); + } + tokens->tokens = new_tokens; + tokens->max_length = max + GROWTH_SIZE; + tokens->tokens[len] = token; + tokens->length++; + return true; + } + tokens->tokens[len] = token; + tokens->length++; + return true; +} + +bool +pop_token(token_stream *tokens) { + size_t len; + CHECK(tokens); + + len = tokens->length; + + assert(len != 0); + len--; + CHECK(tokens->tokens); + + tokens->length--; + return true; +} + +inline token_t +peek_token(token_stream *tokens) { + /* + * Check if tokens points to NULL + */ + size_t len = tokens->length; + size_t max = tokens->max_length; + CHECK(tokens); + assert(len != 0); + + if (len == 0 || len > max) { + return nulltok; + } + return tokens->tokens[len-1]; +} + +static inline uint32_t +match_int(source_t source, + uint32_t begin, + uint32_t length) { + /* Return false if there is no match + * otherwise return the position of the end of the match + 1 + */ + uint32_t i = begin; + uint32_t test; + CHECK(source); + assert(length > 0); + + if (source[i] == '+' || + source[i] == '-') { + i++; + } + test = i; + while (i < length && + isdigit(source[i])) { + i++; + } + if (i == test) + return false; + return i; +} + +static inline uint32_t +match_float(source_t source, + uint32_t begin, + uint32_t length) { + /* Return false if there is no match + * otherwise: + * if there is a leading decimal point and then a valid int match: + * return the position of the end of the match + * if there is a leading valid int match: + * but no decimal point match after that: + * return false + * if there is a decimal point match and then a valid int match: + * return the position of the match + * if there is no valid int match: + * return false + * ALWAYS returns the position + 1 to avoid confusion with false (which is a valid index) + */ + uint32_t i, leading_int_match, trailing_int_match; + CHECK(source); + assert(length > 0); + + i = begin; + leading_int_match = match_int(source, i, length); + + if (leading_int_match) { + i = leading_int_match; + } + + assert(i <= length); + + if (source[i] != '.' || + source[i] == '+' || + source[i] == '-') { + if (((i+1) <= length) && /* Make sure there is at least two characters to look at */ + ((source[i] == '+') || + (source[i] == '-')) + && (source[i+1] == '.')) { + i++; + } + else { + return false; + } + } + i++; + + trailing_int_match = match_int(source, i, length); + if (trailing_int_match) { + return trailing_int_match; + } + return false; +} + +static inline uint32_t +match_identifier(source_t source, + uint32_t begin, + uint32_t length) { + + /* Return false if there is no match + * if there is a match for any characters that are not: + * whitespace + * a parenthesis ( ) + * a brace { } + * a square bracket [ ] + * then return the position of the match + 1 + * if there is nothing else to match: + * return false + */ + uint32_t i = begin; + CHECK(source); + assert(length > 0); + if (source[i] == '*' || + source[i] == '+' || + source[i] == '-' || + source[i] == '/') { + return i+1; + } + + while (i < length && + !(source[i] == '(' || + source[i] == ')' || + isspace(source[i]))) { + i++; + } + + if (i == begin) { + return false; + } + assert(i <= length); + return i; +} + +static inline uint32_t +match_symbol(source_t source, + uint32_t begin, + uint32_t length) { + uint32_t i; + CHECK(source); + assert(length > 0); + + i = begin; + if (source[i] != '\'') { + return false; + } + i++; + while ((isspace(source[i]) || + (source[i] == '\'')) && i < length) { /* consume leading whitespace and quotes */ + i++; + } + if (source[i] == ')') { + printf("Unexpected )\n"); + exit(EXIT_FAILURE); + } + if (source[i] == '(') { + return i; + } + while (!isspace(source[i]) && + source[i] != '(' && + source[i] != ')' && + i < length) { + i++; + } + if (i == begin+1) { /* if we did not increment i more than once (before the loop) */ + return false; + } + return i; +} + +static inline void +extract_token(uint32_t position, + uint32_t begin, + source_t source, + char *token_val) { + assert(position > begin); + string_head(position - begin, + &source[begin], + (char *)token_val); +} + +token_stream +tokenize(source_t source, + uint32_t begin, + uint32_t length) { + /* + * Remember to free everything from this struct + * for example, token_stack.tokens will not necessarily be + * equal to tokens after this function has run + * + */ + uint32_t position = begin; + char *current_token_val; + token_stream token_stack; + token_val_t current_token; + token_t *tokens = (token_t *)calloc(STACK_SIZE, sizeof (token_t)); + + assert(begin == 0); + assert(length > 0); + CHECK(source); + + token_stack.length = 0; + token_stack.max_length = STACK_SIZE; + token_stack.tokens = tokens; + char lookahead = '\0'; + assert(STACK_SIZE > 0); + + while (begin <= length && source[begin]) { + if (source[begin] == '(') { + /*Matched a left paren */ + position = begin + 1; + push_token(&token_stack, left_paren); + } + else if (source[begin] == ')') { + /*Matched a left paren */ + position = begin + 1; + push_token(&token_stack, right_paren); + } + else if (isspace(source[begin])) { + position = begin + 1; + push_token(&token_stack, whitespace_tok); + /* Matched a whitespace character */ + } + else if ((position = match_float(source, begin, length))) { + /* Matched a float */ + lookahead = source[position]; + source[position] = '\0'; + source[position] = lookahead; + assert(position > begin); + current_token_val = (char *)calloc(((position - begin) + 1), sizeof (char)); + CHECK(current_token_val); + extract_token(position, begin, source, current_token_val); + current_token.floating = current_token_val; + push_token(&token_stack, make_token(current_token, FLOATING)); + } + else if ((position = match_int(source, begin, length))) { + /* Matched an int */ + lookahead = source[position]; + source[position] = '\0'; + assert(position > begin); + assert(position <= length); + + source[position] = lookahead; + current_token_val = (char *)calloc(((position - begin) + 1), sizeof (char)); + CHECK(current_token_val); + extract_token(position, begin, source, current_token_val); + current_token.integer = current_token_val; + push_token(&token_stack, make_token(current_token, INTEGER)); + } + else if ((position = match_symbol(source, begin, length))) { + /* Matched a symbol */ + lookahead = source[position]; + source[position] = '\0'; + assert(position > begin); + assert(position <= length); + + source[position] = lookahead; + current_token_val = (char *)calloc(((position - begin) + 1), sizeof (char)); + CHECK(current_token_val); + extract_token(position, begin, source, current_token_val); + current_token.symbol = current_token_val; + push_token(&token_stack, make_token(current_token, SYMBOL)); + } + else if (source[begin] == '\'') { + /* Matched a quote (apostrophe) */ + position = begin + 1; + push_token(&token_stack, quote_tok); + } + else if ((position = match_identifier(source, begin, length))) { + /* Matched an identifier */ + lookahead = source[position]; + source[position] = '\0'; + assert(position > begin); + assert(position <= length); + + source[position] = lookahead; + current_token_val = (char *)calloc(((position - begin) + 1), sizeof (char)); + CHECK(current_token_val); + extract_token(position, begin, source, current_token_val); + current_token.identifier = current_token_val; + push_token(&token_stack, make_token(current_token, IDENTIFIER)); + /* Matched an identifier */ + } + else if (position <= begin) { + printf("Source is too large to read\n"); + exit(EXIT_FAILURE); + } + else { + printf("Unmatched token\n"); + exit(EXIT_FAILURE); + } + begin = position; + } + + return token_stack; +} + +int +free_token( void *key, + void *val) { + /* silence warnings about unused parameters, key and val point to the same data*/ + (void)key; + free((char *)val); + return true; +} + +bool +release_tokens(token_stream *tokens) { + /* Iterate through the stack, release each token + * Then release the entire stack + */ + CHECK(tokens); + CHECK(tokens->tokens); + assert(tokens->max_length > 0); + free(tokens->tokens); + + return true; +} + +#ifndef LIB +int main(void) { + nulltok.token_type = EMPTY; + nulltok.token.null_token = false; + + whitespace_tok.token_type = WSPACE; + whitespace_tok.token.whitespace= false; + + quote_tok.token_type = QUOTE; + quote_tok.token.quote= false; + + void *source_code = malloc(111000); + uint32_t nbytes = read(STDIN_FILENO, source_code, 111000); + if (nbytes == 0) { + exit(EXIT_FAILURE); + } + token_stream toks = tokenize((source_t)source_code, 0, nbytes); + token_t current_tok; + while (toks.length > 0) { + current_tok = peek_token(&toks); + switch (current_tok.token_type) { + case SYMBOL: + printf("symbol: %s\n", current_tok.token.symbol); + break; + case IDENTIFIER: + printf("identifer: %s\n", current_tok.token.identifier); + break; + case INTEGER: + printf("integer: %s\n", current_tok.token.integer); + break; + case FLOATING: + printf("floating: %s\n", current_tok.token.floating); + break; + case QUOTE: + printf("quote: '\n"); + break; + case WSPACE: + printf("whitespace\n"); + break; + case PAREN: + printf("paren: %s\n", current_tok.token.parenthesis); + break; + case EMPTY: + printf("this should not be empty\n"); + break; + case STRING: + printf("string: %s\n", current_tok.token.string); + break; + default: + printf("oops, there was an unknown token, check valgrind or gdb\n"); + } + pop_token(&toks); + } + release_tokens(&toks); + return 0; +} +#endif diff --git a/tokenize.h b/tokenize.h new file mode 100644 index 0000000..fd83054 --- /dev/null +++ b/tokenize.h @@ -0,0 +1,74 @@ +#define STACK_SIZE 4096 +#define GROWTH_SIZE 65536 + +typedef char* source_t; + +typedef enum { + SYMBOL = 0, + IDENTIFIER = 1, + INTEGER = 2, + FLOATING = 3, + QUOTE = 4, + WSPACE = 5, + PAREN = 6 , + EMPTY = 7, + STRING = 8 +} tok_t; + +typedef union { + const char *symbol; + const char *identifier; + const char *integer; + const char *floating; + const char *parenthesis; + const char *string; + bool quote; + bool whitespace; + bool null_token; +} token_val_t; + +typedef struct { + tok_t token_type; + token_val_t token; +} token_t; + +typedef struct { + size_t length; /* Number of current elements */ + size_t max_length; /* Maximum length of the stack */ + token_t *tokens; +} token_stream; + +bool +push_token(token_stream*, token_t); + +bool +pop_token(token_stream*); + +token_t +peek_token(token_stream*); + +token_stream +tokenize(source_t, uint32_t, const uint32_t); + +bool +release_tokens(token_stream*); + +#ifndef LIB +static uint32_t +match_int(source_t, uint32_t, const uint32_t); + +static uint32_t +match_float(source_t, uint32_t, const uint32_t); + +static uint32_t +match_identifier(source_t, uint32_t, const uint32_t); + +static uint32_t +match_symbol(source_t, uint32_t, const uint32_t); +#endif + +int +free_token(const void *, + const void *); +token_t +testfunc(void);