initial commit
nisstyre56 committed 10 years ago · commit 215445c08c

 tokenize.c  | 405
 tokenize.h  |  52
 tokenize.py |  53

tokenize.c
@@ -0,0 +1,405 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
#include "tokenize.h"
/*
 * This is a basic s-expression tokenizer.
 * It also tokenizes number, symbol, and identifier literals.
 */
static const token_t nulltok = {.token_type=EMPTY, .token={.null_token=false}};
static const token_t whitespace_tok = {.token_type=WSPACE, .token={.whitespace=true}};
static const token_t quote_tok = {.token_type=QUOTE, .token={.quote=true}};
static const token_t left_paren = {.token_type=PAREN, .token={.parenthesis="("}};
static const token_t right_paren = {.token_type=PAREN, .token={.parenthesis=")"}};
static
inline
char *
string_head(uint32_t n, char *in, char *out) {
  /* out must be large enough to store the number of characters
   * you want to select from in, plus a byte for the null terminator
   */
#ifndef NDEBUG
  size_t in_len = strlen(in);
  assert(n > 0 && n <= in_len);
#endif
  int iserror = snprintf(out, (size_t)n+1, "%s", in);
  if (iserror < 0) {
    /* snprintf returns a negative value on an output error,
     * not on truncation */
    printf("Output error while extracting a token\n");
    exit(EXIT_FAILURE);
  }
  assert((size_t)iserror == in_len); /* snprintf reports the untruncated length */
  return out;
}
static
inline
token_t
make_token(token_val_t val, tok_t toktype) {
  token_t result;
  result.token_type = toktype;
  result.token = val;
  return result;
}
bool
push_token(token_stream *tokens, token_t token) {
  size_t len;
  size_t max;
  assert(tokens != NULL);
  len = tokens->length;
  max = tokens->max_length;
  assert(len <= max);
  assert(max > 0);
  if (len == max) {
    /* We've reached the maximum stack size,
     * so try to grow it by GROWTH_SIZE
     */
    token_t *new_tokens = realloc(tokens->tokens, sizeof(token_t) * (max + GROWTH_SIZE));
    if (!new_tokens) {
      printf("Could not allocate enough memory for the token stack\n");
      exit(EXIT_FAILURE);
    }
    tokens->tokens = new_tokens;
    tokens->max_length = max + GROWTH_SIZE;
  }
  tokens->tokens[len] = token;
  tokens->length++;
  return true;
}
bool
pop_token(token_stream *tokens) {
  size_t len;
  assert(tokens != NULL);
  len = tokens->length;
  assert(len != 0);
  len--;
  assert(tokens->tokens != NULL);
  /* Free the heap-allocated string owned by the token, if any */
  switch (tokens->tokens[len].token_type) {
    case SYMBOL:
      free(tokens->tokens[len].token.symbol);
      break;
    case IDENTIFIER:
      free(tokens->tokens[len].token.identifier);
      break;
    case INTEGER:
      free(tokens->tokens[len].token.integer);
      break;
    case FLOATING:
      free(tokens->tokens[len].token.floating);
      break;
    default:
      break;
  }
  tokens->length--;
  return true;
}
inline
token_t
peek_token(token_stream *tokens) {
  /* Return the top of the stack without removing it,
   * or nulltok if the stack is empty
   */
  size_t len;
  size_t max;
  assert(tokens != NULL);
  len = tokens->length;
  max = tokens->max_length;
  if (len == 0 || len > max) {
    return nulltok;
  }
  return tokens->tokens[len-1];
}
static
inline
uint32_t
match_int(source_t source, uint32_t begin, const uint32_t length) {
  /* Return false if there is no match,
   * otherwise return the position of the end of the match + 1
   */
  uint32_t i = begin;
  uint32_t test;
  assert(source != NULL);
  assert(length > 0);
  if (source[i] == '+' ||
      source[i] == '-') {
    i++;
  }
  test = i;
  while (i < length &&
         isdigit(source[i])) {
    i++;
  }
  if (i == test)
    return false;
  return i;
}
static
inline
uint32_t
match_float(source_t source, uint32_t begin, const uint32_t length) {
  /* Return false if there is no match.
   * A float is an optional leading int (or a bare sign), a decimal
   * point, and a required trailing int, e.g. "1.5", "+.5", "-2.0".
   * A leading int with no decimal point after it is not a match.
   * On success, ALWAYS return the position of the end of the match + 1,
   * to avoid confusion with false (0 is a valid index).
   */
  uint32_t i, leading_int_match, trailing_int_match;
  assert(source != NULL);
  assert(length > 0);
  i = begin;
  leading_int_match = match_int(source, i, length);
  if (leading_int_match) {
    i = leading_int_match;
  }
  assert(i <= length);
  if (source[i] != '.') {
    /* Allow a sign followed directly by the decimal point, e.g. "+.5" */
    if (((i+1) <= length) && /* Make sure there are at least two characters to look at */
        ((source[i] == '+') ||
         (source[i] == '-')) &&
        (source[i+1] == '.')) {
      i++;
    }
    else {
      return false;
    }
  }
  i++;
  trailing_int_match = match_int(source, i, length);
  if (trailing_int_match) {
    return trailing_int_match;
  }
  return false;
}
static
inline
uint32_t
match_identifier(source_t source, uint32_t begin, const uint32_t length) {
  /* Match a run of characters that are not:
   *   whitespace
   *   a parenthesis ( )
   * (braces and square brackets are currently treated as
   * identifier characters)
   * Return the position of the end of the match + 1,
   * or false if nothing was matched.
   */
  uint32_t i = begin;
  assert(source != NULL);
  assert(length > 0);
  while (i < length &&
         !(source[i] == '(' ||
           source[i] == ')' ||
           isspace(source[i]))) {
    i++;
  }
  if (i == begin) {
    return false;
  }
  assert(i <= length);
  return i;
}
static
inline
uint32_t
match_symbol(source_t source, uint32_t begin, const uint32_t length) {
  /* Match a quote character followed by an identifier, e.g. 'foo.
   * Return the position of the end of the match + 1,
   * or false if nothing was matched.
   */
  uint32_t i, identifier_match;
  assert(source != NULL);
  assert(length > 0);
  i = begin;
  if (source[i] != '\'') {
    return false;
  }
  i++;
  identifier_match = match_identifier(source, i, length);
  assert(identifier_match <= length);
  if (identifier_match) {
    return identifier_match;
  }
  return false;
}
static
inline
void
extract_token(uint32_t position,
              uint32_t begin,
              source_t source,
              char *token_val) {
  /* Copy the matched characters [begin, position) into token_val */
  assert(position > begin);
  string_head(position - begin,
              &source[begin],
              token_val);
}
token_stream
tokenize(source_t source, uint32_t begin, const uint32_t length) {
  /*
   * Remember to free everything from this struct;
   * for example, token_stack.tokens will not necessarily be
   * equal to tokens after this function has run
   */
  uint32_t position = begin;
  char *current_token_val;
  token_stream token_stack;
  token_val_t current_token;
  token_t *tokens = calloc(STACK_SIZE, sizeof(token_t));
  assert(begin == 0);
  assert(length > 0);
  assert(source != NULL);
  assert(STACK_SIZE > 0);
  if (!tokens) {
    printf("Could not allocate enough memory for the token stack\n");
    exit(EXIT_FAILURE);
  }
  token_stack.length = 0;
  token_stack.max_length = STACK_SIZE;
  token_stack.tokens = tokens;
  while (begin <= length && source[begin]) {
    if ((position = match_float(source, begin, length))) {
      /* Matched a float */
      assert(position > begin);
      current_token_val = calloc(((position - begin) + 1), sizeof(char));
      assert(current_token_val != NULL);
      extract_token(position, begin, source, current_token_val);
      current_token.floating = current_token_val;
      push_token(&token_stack, make_token(current_token, FLOATING));
    }
    else if ((position = match_int(source, begin, length))) {
      /* Matched an int */
      assert(position > begin);
      assert(position <= length);
      current_token_val = calloc(((position - begin) + 1), sizeof(char));
      assert(current_token_val != NULL);
      extract_token(position, begin, source, current_token_val);
      current_token.integer = current_token_val;
      push_token(&token_stack, make_token(current_token, INTEGER));
    }
    else if ((position = match_symbol(source, begin, length))) {
      /* Matched a symbol */
      assert(position > begin);
      assert(position <= length);
      current_token_val = calloc(((position - begin) + 1), sizeof(char));
      assert(current_token_val != NULL);
      extract_token(position, begin, source, current_token_val);
      current_token.symbol = current_token_val;
      push_token(&token_stack, make_token(current_token, SYMBOL));
    }
    else if ((position = match_identifier(source, begin, length))) {
      /* Matched an identifier */
      assert(position > begin);
      assert(position <= length);
      current_token_val = calloc(((position - begin) + 1), sizeof(char));
      assert(current_token_val != NULL);
      extract_token(position, begin, source, current_token_val);
      current_token.identifier = current_token_val;
      push_token(&token_stack, make_token(current_token, IDENTIFIER));
    }
    else if (source[begin] == '(') {
      /* Matched a left paren */
      position = begin + 1;
      push_token(&token_stack, left_paren);
    }
    else if (source[begin] == ')') {
      /* Matched a right paren */
      position = begin + 1;
      push_token(&token_stack, right_paren);
    }
    else if (source[begin] == '\'') {
      /* Matched a quote (apostrophe) */
      position = begin + 1;
      push_token(&token_stack, quote_tok);
    }
    else if (isspace(source[begin])) {
      /* Matched a whitespace character */
      position = begin + 1;
      push_token(&token_stack, whitespace_tok);
    }
    else {
      printf("Unmatched token\n");
      exit(EXIT_FAILURE);
    }
    begin = position;
  }
  return token_stack;
}
bool
release_tokens(token_stream *tokens) {
  /* Iterate through the stack, releasing each token,
   * then release the underlying array
   */
  assert(tokens != NULL);
  assert(tokens->tokens != NULL);
  assert(tokens->max_length > 0);
  while (tokens->length > 0) {
    pop_token(tokens);
  }
  free(tokens->tokens);
  tokens->tokens = NULL; /* guard against accidental reuse */
  return true;
}
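
For reference, a minimal driver sketch showing how this API is meant to be used together; the file name main.c and the sample input are illustrative only, not part of this commit:

/* main.c - illustrative driver, not part of this commit */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "tokenize.h"

int main(void) {
  char *input = "(+ 1 2.5 'foo)";
  token_stream tokens = tokenize(input, 0, (uint32_t)strlen(input));
  /* peek_token reads the top of the stack, so this walks the
   * tokens in reverse source order */
  while (tokens.length > 0) {
    token_t tok = peek_token(&tokens);
    if (tok.token_type == IDENTIFIER)
      printf("identifier: %s\n", tok.token.identifier);
    else if (tok.token_type == FLOATING)
      printf("float: %s\n", tok.token.floating);
    pop_token(&tokens); /* frees the token's string payload */
  }
  release_tokens(&tokens); /* frees the stack itself */
  return 0;
}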

tokenize.h
@@ -0,0 +1,52 @@
#ifndef TOKENIZE_H
#define TOKENIZE_H

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

#define STACK_SIZE 4096
#define GROWTH_SIZE 512

typedef char* source_t;

typedef enum {
  SYMBOL = 0,
  IDENTIFIER = 1,
  INTEGER = 2,
  FLOATING = 3,
  QUOTE = 4,
  WSPACE = 5,
  PAREN = 6,
  EMPTY = 7
} tok_t;

typedef union {
  char *symbol;
  char *identifier;
  char *integer;
  char *floating;
  char *parenthesis;
  bool quote;
  bool whitespace;
  bool null_token;
} token_val_t;

typedef struct {
  tok_t token_type;
  token_val_t token;
} token_t;

typedef struct {
  size_t length; /* Number of current elements */
  size_t max_length; /* Maximum length of the stack */
  token_t *tokens;
} token_stream;

bool push_token(token_stream*, token_t);
bool pop_token(token_stream*);
token_t peek_token(token_stream*);
token_stream tokenize(source_t, uint32_t, const uint32_t);
bool release_tokens(token_stream*);

/* The match_* helpers are static inline in tokenize.c and are
 * deliberately not declared here */

#endif /* TOKENIZE_H */
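
The Python driver below loads the tokenizer as a shared object named tokenize.so; a plausible build line (the exact compiler flags are an assumption, not recorded in this commit) would be:

cc -std=c99 -Wall -fPIC -shared tokenize.c -o tokenize.so

-std=c99 matters here: peek_token is defined with plain inline, and C99 inline semantics (combined with the non-inline declaration in tokenize.h) are what guarantee an external definition for ctypes to bind against.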

tokenize.py
@@ -0,0 +1,53 @@
#! /usr/bin/python2
from ctypes import *

tokenizer = cdll.LoadLibrary("./tokenize.so")

toktypes = { 0 : "symbol",
             1 : "identifier",
             2 : "integer",
             3 : "floating",
             4 : "quote",
             5 : "whitespace",
             6 : "parenthesis"}

class TokenValT(Union):
    _fields_ = [("symbol", c_char_p),
                ("identifier", c_char_p),
                ("integer", c_char_p),
                ("floating", c_char_p),
                ("parenthesis", c_char_p),
                ("quote", c_bool),
                ("whitespace", c_bool),
                ("null_token", c_bool)]

class TokenT(Structure):
    _fields_ = [("token_type", c_int),
                ("token", TokenValT)]

class TokStream(Structure):
    _fields_ = [("length", c_size_t),
                ("max_length", c_size_t),
                ("tokens", POINTER(TokenT))]

tokenizer.tokenize.restype = TokStream
tokenizer.peek_token.restype = TokenT
tokenizer.pop_token.restype = c_bool
tokenizer.release_tokens.restype = c_bool

def tokenize(source):
    tokens = tokenizer.tokenize(source, 0, len(source))
    tp = pointer(tokens)
    while tokens.length > 0:
        tok = tokenizer.peek_token(tp)
        ttype = toktypes[tok.token_type]
        yield (ttype, getattr(tok.token, ttype))
        tokenizer.pop_token(tp)
    tokenizer.release_tokens(tp)

tokens = tokenize("(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"*1610)
xs = list(tokens)
#print list(tokens)
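
Since the generator pops from the top of the token stack, tokens arrive in reverse source order; a sketch of the expected behaviour (assuming tokenize.so has been built as above):

# list(tokenize("(a b)")) should give, in reverse source order:
#   [('parenthesis', ')'), ('identifier', 'b'), ('whitespace', True),
#    ('identifier', 'a'), ('parenthesis', '(')]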