commit
215445c08c
3 changed files with 510 additions and 0 deletions
@ -0,0 +1,405 @@ |
|||
#include <stdint.h> |
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <ctype.h> |
|||
#include <stdbool.h> |
|||
#include <string.h> |
|||
#include <assert.h> |
|||
#include "tokenize.h" |
|||
|
|||
/*
|
|||
* This is a basic s-expression tokenizer |
|||
* it also tokenizes things like number, string, and symbol literals |
|||
*/ |
|||
|
|||
static const token_t nulltok = {.token_type=EMPTY, {.null_token=false}}; |
|||
|
|||
static const token_t whitespace_tok = {.token_type=WSPACE, .token={.whitespace=true } }; |
|||
|
|||
static const token_t quote_tok = {.token_type=QUOTE, .token={.quote=true} }; |
|||
|
|||
static const token_t left_paren = {.token_type=PAREN, .token={.parenthesis="("} }; |
|||
|
|||
static const token_t right_paren = {.token_type=PAREN, .token={.parenthesis=")"} }; |
|||
|
|||
static
inline
char *
string_head(uint32_t n, char *in, char *out) {
    /* Copy the first n characters of in into out and NUL-terminate it.
     *
     * out must be large enough to hold n characters plus one byte for
     * the NUL terminator.  Requires 0 < n <= strlen(in).
     * Returns out.  Exits the process if snprintf reports an error
     * (previously this path printed a misleading "Out of memory"). */
#ifndef NDEBUG
    size_t in_len = strlen(in);
#endif
    assert(n > 0 && n <= in_len);
    int written = snprintf(out, (size_t)n + 1, "%s", in);

    /* snprintf returns the length the full string would have needed,
     * i.e. strlen(in), even when the copy was truncated to n bytes. */
    assert(written >= 0 && (size_t)written == in_len);

    if (written < 0) {
        /* A negative return means an output/encoding error, not OOM. */
        fprintf(stderr, "string_head: snprintf output error\n");
        exit(EXIT_FAILURE);
    }
    return out;
}
|||
|
|||
static |
|||
inline |
|||
token_t |
|||
make_token(token_val_t val, tok_t toktype) { |
|||
token_t result; |
|||
result.token_type = toktype; |
|||
result.token = val; |
|||
return result; |
|||
} |
|||
|
|||
bool |
|||
push_token(token_stream *tokens, token_t token) { |
|||
/*
|
|||
* Check if tokens points to NULL |
|||
*/ |
|||
|
|||
size_t len; |
|||
size_t max; |
|||
|
|||
assert(tokens != NULL); |
|||
|
|||
len = tokens->length; |
|||
max = tokens->max_length; |
|||
|
|||
assert(len <= max); |
|||
assert(max > 0); |
|||
|
|||
if (len == max) { |
|||
/* We've reached the maximum stack size
|
|||
* So we must try to increase that by GROWTH_SIZE |
|||
*/ |
|||
token_t *new_tokens = realloc(tokens->tokens, sizeof(token_t) * (max + GROWTH_SIZE)); |
|||
if (!new_tokens) { |
|||
printf("Could not allocate enough memory for the token stack\n"); |
|||
exit(EXIT_FAILURE); |
|||
} |
|||
tokens->tokens = new_tokens; |
|||
tokens->max_length = max + GROWTH_SIZE; |
|||
tokens->tokens[len] = token; |
|||
tokens->length++; |
|||
return true; |
|||
} |
|||
tokens->tokens[len] = token; |
|||
tokens->length++; |
|||
return true; |
|||
} |
|||
|
|||
bool |
|||
pop_token(token_stream *tokens) { |
|||
size_t len; |
|||
assert(tokens != NULL); |
|||
|
|||
len = tokens->length; |
|||
|
|||
assert(len != 0); |
|||
len--; |
|||
assert(tokens->tokens != NULL); |
|||
|
|||
switch (tokens->tokens[len].token_type) { |
|||
case SYMBOL: |
|||
free(tokens->tokens[len].token.symbol); |
|||
break; |
|||
case IDENTIFIER: |
|||
free(tokens->tokens[len].token.identifier); |
|||
break; |
|||
case INTEGER: |
|||
free(tokens->tokens[len].token.integer); |
|||
break; |
|||
case FLOATING: |
|||
free(tokens->tokens[len].token.floating); |
|||
break; |
|||
default: |
|||
break; |
|||
} |
|||
|
|||
tokens->length--; |
|||
return true; |
|||
} |
|||
|
|||
inline |
|||
token_t |
|||
peek_token(token_stream *tokens) { |
|||
/*
|
|||
* Check if tokens points to NULL |
|||
*/ |
|||
size_t len = tokens->length; |
|||
size_t max = tokens->max_length; |
|||
assert(tokens != NULL); |
|||
assert(len != 0); |
|||
|
|||
if (len == 0 || len > max) { |
|||
return nulltok; |
|||
} |
|||
return tokens->tokens[len-1]; |
|||
} |
|||
|
|||
static |
|||
inline |
|||
uint32_t |
|||
match_int(source_t source, uint32_t begin, const uint32_t length) { |
|||
/* Return false if there is no match
|
|||
* otherwise return the position of the end of the match + 1 |
|||
*/ |
|||
uint32_t i = begin; |
|||
uint32_t test; |
|||
assert(source != NULL); |
|||
assert(length > 0); |
|||
|
|||
if (source[i] == '+' || |
|||
source[i] == '-') { |
|||
i++; |
|||
} |
|||
test = i; |
|||
while (i < length && |
|||
isdigit(source[i])) { |
|||
i++; |
|||
} |
|||
if (i == test) |
|||
return false; |
|||
return i; |
|||
} |
|||
|
|||
static
inline
uint32_t
match_float(source_t source, uint32_t begin, const uint32_t length) {
    /* Match a floating-point literal starting at begin.
     *
     * Accepted shapes (sign optional): "N.M", "N.", ".M", "+.M", "-.M".
     * Returns false (0) on no match; on a match, returns the index one
     * past the last matched character (never a valid 0 index, since a
     * match always consumes at least one character).
     *
     * NOTE(review): in the first condition below, the == '+' and == '-'
     * subterms are redundant — either character already satisfies
     * source[i] != '.'.  Left as-is to preserve exact behavior.
     *
     * NOTE(review): when the leading int match consumes up to `length`,
     * source[i] reads the byte at index length — safe only if the caller
     * guarantees source is NUL-terminated at source[length]; confirm. */
    uint32_t i, leading_int_match, trailing_int_match;
    assert(source != NULL);
    assert(length > 0);

    i = begin;
    /* Optional integer part (with optional sign), e.g. the "12" of "12.5". */
    leading_int_match = match_int(source, i, length);

    if (leading_int_match) {
        i = leading_int_match;
    }

    assert(i <= length);

    /* If the next character is not a decimal point, the only remaining
     * legal shape is a bare sign immediately followed by '.', as in "+.5";
     * in that case step over the sign, otherwise reject. */
    if (source[i] != '.' ||
        source[i] == '+' ||
        source[i] == '-') {
        if (((i+1) <= length) && /* Make sure there is at least two characters to look at */
            ((source[i] == '+') ||
             (source[i] == '-'))
            && (source[i+1] == '.')) {
            i++;
        }
        else {
            return false;
        }
    }
    /* Step over the decimal point. */
    i++;

    /* Fractional part: digits after the point are required. */
    trailing_int_match = match_int(source, i, length);
    if (trailing_int_match) {
        return trailing_int_match;
    }
    return false;
}
|||
|
|||
static
inline
uint32_t
match_identifier(source_t source, uint32_t begin, const uint32_t length) {

    /* Match a maximal run of identifier characters starting at begin.
     *
     * A character belongs to an identifier unless it is a parenthesis
     * '(' / ')' or whitespace.  Returns false (0) when the run is empty,
     * otherwise the index one past the last matched character.
     *
     * NOTE(review): the original comment also promised that braces { }
     * and square brackets [ ] terminate an identifier, but the loop below
     * only stops on parentheses and whitespace — braces and brackets are
     * currently swallowed into identifiers.  Confirm intended behavior
     * before "fixing" either the code or this contract. */
    uint32_t i = begin;
    assert(source != NULL);
    assert(length > 0);

    while (i < length &&
           !(source[i] == '(' ||
             source[i] == ')' ||
             isspace(source[i]))) {
        i++;
    }

    if (i == begin) {
        return false;
    }
    assert(i <= length);
    return i;
}
|||
|
|||
static |
|||
inline |
|||
uint32_t |
|||
match_symbol(source_t source, uint32_t begin, const uint32_t length) { |
|||
uint32_t i, identifier_match; |
|||
assert(source != NULL); |
|||
assert(length > 0); |
|||
|
|||
i = begin; |
|||
if (source[i] != '\'') { |
|||
return false; |
|||
} |
|||
i++; |
|||
|
|||
identifier_match = match_identifier(source, i, length); |
|||
if (identifier_match) { |
|||
return identifier_match; |
|||
} |
|||
assert(identifier_match <= length); |
|||
return false; |
|||
} |
|||
|
|||
static |
|||
inline |
|||
void |
|||
extract_token(uint32_t position, |
|||
uint32_t begin, |
|||
source_t source, |
|||
char *token_val) { |
|||
assert(position > begin); |
|||
string_head(position - begin, |
|||
&source[begin], |
|||
token_val); |
|||
} |
|||
|
|||
token_stream |
|||
tokenize(source_t source, uint32_t begin, const uint32_t length) { |
|||
/*
|
|||
* Remember to free everything from this struct |
|||
* for example, token_stack.tokens will not necessarily be |
|||
* equal to tokens after this function has run |
|||
* |
|||
*/ |
|||
uint32_t position = begin; |
|||
char *current_token_val; |
|||
token_stream token_stack; |
|||
token_val_t current_token; |
|||
token_t *tokens = calloc(STACK_SIZE, sizeof(token_t)); |
|||
|
|||
assert(begin == 0); |
|||
assert(length > 0); |
|||
assert(source != NULL); |
|||
|
|||
token_stack.length = 0; |
|||
token_stack.max_length = STACK_SIZE; |
|||
token_stack.tokens = tokens; |
|||
assert(STACK_SIZE > 0); |
|||
|
|||
|
|||
while (begin <= length && source[begin]) { |
|||
if ((position = match_float(source, begin, length))) { |
|||
/* Matched a float */ |
|||
assert(position > begin); |
|||
|
|||
current_token_val = calloc(((position - begin) + 1), sizeof(char)); |
|||
assert(current_token_val != NULL); |
|||
extract_token(position, begin, source, current_token_val); |
|||
current_token.floating = current_token_val; |
|||
|
|||
push_token(&token_stack, make_token(current_token, FLOATING)); |
|||
} |
|||
else if ((position = match_int(source, begin, length))) { |
|||
/* Matched an int */ |
|||
assert(position > begin); |
|||
assert(position <= length); |
|||
current_token_val = calloc(((position - begin) + 1), sizeof(char)); |
|||
assert(current_token_val != NULL); |
|||
extract_token(position, begin, source, current_token_val); |
|||
|
|||
current_token.integer = current_token_val; |
|||
|
|||
push_token(&token_stack, make_token(current_token, INTEGER)); |
|||
} |
|||
else if ((position = match_symbol(source, begin, length))) { |
|||
/* Matched a symbol */ |
|||
assert(position > begin); |
|||
assert(position <= length); |
|||
current_token_val = calloc(((position - begin) + 1), sizeof(char)); |
|||
assert(current_token_val != NULL); |
|||
extract_token(position, begin, source, current_token_val); |
|||
|
|||
current_token.symbol = current_token_val; |
|||
|
|||
push_token(&token_stack, make_token(current_token, SYMBOL)); |
|||
|
|||
} |
|||
else if ((position = match_identifier(source, begin, length))) { |
|||
assert(position > begin); |
|||
assert(position <= length); |
|||
current_token_val = calloc(((position - begin) + 1), sizeof(char)); |
|||
assert(current_token_val != NULL); |
|||
extract_token(position, begin, source, current_token_val); |
|||
|
|||
current_token.identifier = current_token_val; |
|||
|
|||
push_token(&token_stack, make_token(current_token, IDENTIFIER)); |
|||
/* Matched an identifier */ |
|||
} |
|||
else if (source[begin] == '(') { |
|||
/*Matched a left paren */ |
|||
position = begin + 1; |
|||
push_token(&token_stack, left_paren); |
|||
} |
|||
else if (source[begin] == ')') { |
|||
/*Matched a left paren */ |
|||
position = begin + 1; |
|||
push_token(&token_stack, right_paren); |
|||
} |
|||
else if (source[begin] == '\'') { |
|||
/* Matched a quote (apostrophe) */ |
|||
position = begin + 1; |
|||
push_token(&token_stack, quote_tok); |
|||
} |
|||
else if (isspace(source[begin])) { |
|||
position = begin + 1; |
|||
push_token(&token_stack, whitespace_tok); |
|||
/* Matched a whitespace character */ |
|||
} |
|||
else { |
|||
printf("Unmatched token\n"); |
|||
exit(EXIT_FAILURE); |
|||
} |
|||
begin = position; |
|||
} |
|||
return token_stack; |
|||
} |
|||
|
|||
bool |
|||
release_tokens(token_stream *tokens) { |
|||
/* Iterate through the stack, release each token
|
|||
* Then release the entire stack |
|||
*/ |
|||
assert(tokens != NULL); |
|||
assert(tokens->tokens != NULL); |
|||
assert(tokens->max_length > 0); |
|||
|
|||
while(tokens->length > 0) { |
|||
pop_token(tokens); |
|||
} |
|||
free(tokens->tokens); |
|||
return true; |
|||
} |
@ -0,0 +1,52 @@ |
|||
#ifndef TOKENIZE_H
#define TOKENIZE_H

/* Public interface for the s-expression tokenizer (tokenize.c). */

#include <stddef.h>   /* size_t */
#include <stdint.h>   /* uint32_t */
#include <stdbool.h>  /* bool */

#define STACK_SIZE 4096   /* Initial token-stack capacity (elements). */
#define GROWTH_SIZE 512   /* Elements added on each stack growth. */

typedef char* source_t;

/* Token categories; the numeric values are relied on by foreign-function
 * callers (see the ctypes test script). */
typedef enum {
    SYMBOL = 0,
    IDENTIFIER = 1,
    INTEGER = 2,
    FLOATING = 3,
    QUOTE = 4,
    WSPACE = 5,
    PAREN = 6,
    EMPTY = 7
} tok_t;

/* Payload for a token.  The char* members for SYMBOL/IDENTIFIER/INTEGER/
 * FLOATING are heap-owned and freed by pop_token; parenthesis points at a
 * string literal and is never freed. */
typedef union {
    char *symbol;
    char *identifier;
    char *integer;
    char *floating;
    char *parenthesis;
    bool quote;
    bool whitespace;
    bool null_token;
} token_val_t;

typedef struct {
    tok_t token_type;
    token_val_t token;
} token_t;

typedef struct {
    size_t length;     /* Number of current elements */
    size_t max_length; /* Maximum length of the stack */
    token_t *tokens;
} token_stream;

bool push_token(token_stream*, token_t);

bool pop_token(token_stream*);

token_t peek_token(token_stream*);

token_stream tokenize(source_t, uint32_t, const uint32_t);

bool release_tokens(token_stream*);

/* The match_* helpers are static (internal) to tokenize.c; declaring them
 * static in this public header produced "declared but never defined"
 * warnings in every other translation unit, so those prototypes were
 * removed. */

#endif /* TOKENIZE_H */
@ -0,0 +1,53 @@ |
|||
#! /usr/bin/python2

from ctypes import *

# Shared library built from tokenize.c.
tokenizer = cdll.LoadLibrary("./tokenize.so")

# Maps each C tok_t enum value to the name of the matching token_val_t
# union field.  Entry 7 (EMPTY -> null_token) was missing, so an EMPTY
# token reaching Python raised KeyError instead of being reportable.
toktypes = { 0 : "symbol",
             1 : "identifier",
             2 : "integer",
             3 : "floating",
             4 : "quote",
             5 : "whitespace",
             6 : "parenthesis",
             7 : "null_token"}
|||
|
|||
class TokenValT(Union):
    # ctypes mirror of the C token_val_t union in tokenize.h.
    # Field order and types must match the C declaration exactly.
    _fields_ = [("symbol", c_char_p),
                ("identifier", c_char_p),
                ("integer", c_char_p),
                ("floating", c_char_p),
                ("parenthesis", c_char_p),
                ("quote", c_bool),
                ("whitespace", c_bool),
                ("null_token", c_bool)]
|||
|
|||
class TokenT(Structure):
    # ctypes mirror of the C token_t struct: a tok_t tag plus the value union.
    _fields_ = [("token_type", c_int),
                ("token", TokenValT)]
|||
|
|||
class TokStream(Structure):
    # ctypes mirror of the C token_stream struct (the token stack).
    _fields_ = [("length", c_size_t),
                ("max_length", c_size_t),
                ("tokens", POINTER(TokenT))]
|||
|
|||
# Declare C return types so ctypes does not assume the default int.
tokenizer.tokenize.restype = TokStream
tokenizer.peek_token.restype = TokenT
tokenizer.pop_token.restype = c_bool
tokenizer.release_tokens.restype = c_bool
# NOTE(review): argtypes are never declared, so ctypes falls back to its
# default argument conversions -- confirm this is safe on the target ABI.
|||
|
|||
def tokenize(source):
    """Yield (type_name, value) pairs for each token found in source.

    Drives the C tokenizer: peeks the top of the C token stack, yields it,
    then pops it.  The C-side stack is released in a finally block so the
    C heap is freed even when the caller abandons the generator before
    exhausting it (the original leaked the stack in that case).
    """
    tokens = tokenizer.tokenize(source, 0, len(source))
    tp = pointer(tokens)
    try:
        while tokens.length > 0:
            tok = tokenizer.peek_token(tp)
            ttype = toktypes[tok.token_type]
            # ctypes copies c_char_p values into Python strings, so the
            # yielded value stays valid after pop_token frees the C string.
            yield (ttype, getattr(tok.token, ttype))
            tokenizer.pop_token(tp)
    finally:
        tokenizer.release_tokens(tp)
|||
|
|||
# Smoke test: tokenize a large input built by repeating an s-expression
# fragment (includes floats, symbols, and deliberately odd literals).
tokens = tokenize("(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"*1610)
# Force full consumption of the generator (and C-side cleanup).
xs = list(tokens)

#print list(tokens)
|||
|
Reference in new issue