
Merge pull request #1 from nisstyre56/stringtable

use string tables to optimize
master
Wesley Kerfoot, 10 years ago
commit ae9ea09026
  1. foo.c (11)
  2. tokenize.c (167)
  3. tokenize.py (10)
  4. wat.c (12)
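
What the commit message calls a string table is a memo hash table for the tokenizer: instead of calloc'ing a fresh copy of every float, int, symbol, and identifier lexeme, tokenize now looks the text up first and reuses the stored copy whenever the same lexeme has been seen before, as the tokenize.c diff below shows. The following is a minimal sketch of that interning pattern; the hsh_* calls and the "maa.h" header are the ones the diff itself uses (libmaa), while the helper name and the copying details are illustrative assumptions.

#include <stdlib.h>
#include <string.h>
#include "maa.h"  /* libmaa hash table, included by the tokenize.c diff below */

/* Hypothetical helper: return a canonical heap copy of lexeme, allocating it
 * only the first time this exact text is seen. Later calls with equal text
 * return the pointer stored on the first insert, so repeated tokens share
 * one allocation and can all be freed once by iterating the table. */
static char *
intern_lexeme(hsh_HashTable table, const char *lexeme) {
  char *hit = (char *)hsh_retrieve(table, lexeme);
  if (hit != NULL)
    return hit;                        /* already in the string table */
  size_t n = strlen(lexeme);
  char *copy = calloc(n + 1, sizeof(char));
  if (copy != NULL) {
    memcpy(copy, lexeme, n);
    hsh_insert(table, copy, copy);     /* key and value are the same string */
  }
  return copy;
}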

foo.c (11)

@@ -0,0 +1,11 @@
#include <stdlib.h>
#include <stdio.h>
int main(void) {
char *foo = malloc(20);
snprintf(foo, 5, "%s", "1234");
char *bar = foo;
*foo++ = *bar++;
printf("%s\n", foo);
return 0;
}
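(A scratch test, it seems: foo and bar alias the same buffer, so *foo++ = *bar++ copies the leading '1' onto itself and advances both pointers, and the program then prints "234".)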

tokenize.c (167)

@@ -5,6 +5,7 @@
 #include <stdbool.h>
 #include <string.h>
 #include <assert.h>
+#include "maa.h"
 #include "tokenize.h"
 /*
@@ -22,9 +23,7 @@ static const token_t left_paren = {.token_type=PAREN, .token={.parenthesis="("}
 static const token_t right_paren = {.token_type=PAREN, .token={.parenthesis=")"} };
-static
-inline
-char *
+static inline char *
 string_head(uint32_t n, char *in, char *out) {
 /* out must be large enough to store the number of characters
 * you want to select from in, plus a byte for the null terminator
@@ -44,9 +43,7 @@ string_head(uint32_t n, char *in, char *out) {
 return out;
 }
-static
-inline
-token_t
+static inline token_t
 make_token(token_val_t val, tok_t toktype) {
 token_t result;
 result.token_type = toktype;
@@ -102,29 +99,11 @@ pop_token(token_stream *tokens) {
 len--;
 assert(tokens->tokens != NULL);
-switch (tokens->tokens[len].token_type) {
-case SYMBOL:
-free(tokens->tokens[len].token.symbol);
-break;
-case IDENTIFIER:
-free(tokens->tokens[len].token.identifier);
-break;
-case INTEGER:
-free(tokens->tokens[len].token.integer);
-break;
-case FLOATING:
-free(tokens->tokens[len].token.floating);
-break;
-default:
-break;
-}
 tokens->length--;
 return true;
 }
-inline
-token_t
+inline token_t
 peek_token(token_stream *tokens) {
 /*
 * Check if tokens points to NULL
@@ -140,9 +119,7 @@ peek_token(token_stream *tokens) {
 return tokens->tokens[len-1];
 }
-static
-inline
-uint32_t
+static inline uint32_t
 match_int(source_t source, uint32_t begin, const uint32_t length) {
 /* Return false if there is no match
 * otherwise return the position of the end of the match + 1
@@ -166,9 +143,7 @@ match_int(source_t source, uint32_t begin, const uint32_t length) {
 return i;
 }
-static
-inline
-uint32_t
+static inline uint32_t
 match_float(source_t source, uint32_t begin, const uint32_t length) {
 /* Return false if there is no match
 * otherwise:
@@ -218,9 +193,7 @@ match_float(source_t source, uint32_t begin, const uint32_t length) {
 return false;
 }
-static
-inline
-uint32_t
+static inline uint32_t
 match_identifier(source_t source, uint32_t begin, const uint32_t length) {
 /* Return false if there is no match
@@ -251,9 +224,7 @@ match_identifier(source_t source, uint32_t begin, const uint32_t length) {
 return i;
 }
-static
-inline
-uint32_t
+static inline uint32_t
 match_symbol(source_t source, uint32_t begin, const uint32_t length) {
 uint32_t i, identifier_match;
 assert(source != NULL);
@@ -273,9 +244,7 @@ match_symbol(source_t source, uint32_t begin, const uint32_t length) {
 return false;
 }
-static
-inline
-void
+static inline void
 extract_token(uint32_t position,
 uint32_t begin,
 source_t source,
@@ -300,6 +269,8 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
 token_val_t current_token;
 token_t *tokens = calloc(STACK_SIZE, sizeof(token_t));
+hsh_HashTable token_memo = hsh_create(NULL, NULL);
 assert(begin == 0);
 assert(length > 0);
 assert(source != NULL);
@@ -307,9 +278,10 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
 token_stack.length = 0;
 token_stack.max_length = STACK_SIZE;
 token_stack.tokens = tokens;
+token_stack.memo = token_memo;
+char lookahead = '\0';
 assert(STACK_SIZE > 0);
 while (begin <= length && source[begin]) {
 if (source[begin] == '(') {
 /*Matched a left paren */
@@ -333,48 +305,86 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
 }
 else if ((position = match_float(source, begin, length))) {
 /* Matched a float */
-assert(position > begin);
-current_token_val = calloc(((position - begin) + 1), sizeof(char));
-assert(current_token_val != NULL);
-extract_token(position, begin, source, current_token_val);
-current_token.floating = current_token_val;
+lookahead = source[position];
+source[position] = '\0';
+if ((current_token_val = (char *)hsh_retrieve(token_stack.memo, source+begin))) {
+current_token.floating = current_token_val;
+source[position] = lookahead;
+}
+else {
+source[position] = lookahead;
+assert(position > begin);
+current_token_val = calloc(((position - begin) + 1), sizeof(char));
+assert(current_token_val != NULL);
+extract_token(position, begin, source, current_token_val);
+hsh_insert(token_stack.memo, current_token_val, current_token_val);
+current_token.floating = current_token_val;
+}
 push_token(&token_stack, make_token(current_token, FLOATING));
 }
 else if ((position = match_int(source, begin, length))) {
 /* Matched an int */
-assert(position > begin);
-assert(position <= length);
-current_token_val = calloc(((position - begin) + 1), sizeof(char));
-assert(current_token_val != NULL);
-extract_token(position, begin, source, current_token_val);
-current_token.integer = current_token_val;
+lookahead = source[position];
+source[position] = '\0';
+if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+current_token.integer = (char *)current_token_val;
+source[position] = lookahead;
+}
+else {
+assert(position > begin);
+assert(position <= length);
+source[position] = lookahead;
+current_token_val = calloc(((position - begin) + 1), sizeof(char));
+assert(current_token_val != NULL);
+extract_token(position, begin, source, current_token_val);
+hsh_insert(token_stack.memo, current_token_val, current_token_val);
+current_token.integer = current_token_val;
+}
 push_token(&token_stack, make_token(current_token, INTEGER));
 }
 else if ((position = match_symbol(source, begin, length))) {
 /* Matched a symbol */
-assert(position > begin);
-assert(position <= length);
-current_token_val = calloc(((position - begin) + 1), sizeof(char));
-assert(current_token_val != NULL);
-extract_token(position, begin, source, current_token_val);
-current_token.symbol = current_token_val;
+lookahead = source[position];
+source[position] = '\0';
+if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+current_token.symbol = (char *)current_token_val;
+source[position] = lookahead;
+}
+else {
+assert(position > begin);
+assert(position <= length);
+source[position] = lookahead;
+current_token_val = calloc(((position - begin) + 1), sizeof(char));
+assert(current_token_val != NULL);
+extract_token(position, begin, source, current_token_val);
+hsh_insert(token_stack.memo, current_token_val, current_token_val);
+current_token.symbol = current_token_val;
+}
 push_token(&token_stack, make_token(current_token, SYMBOL));
 }
 else if ((position = match_identifier(source, begin, length))) {
-assert(position > begin);
-assert(position <= length);
-current_token_val = calloc(((position - begin) + 1), sizeof(char));
-assert(current_token_val != NULL);
-extract_token(position, begin, source, current_token_val);
-current_token.identifier = current_token_val;
+/* Matched an identifier */
+lookahead = source[position];
+source[position] = '\0';
+if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
+current_token.identifier = (char *)current_token_val;
+source[position] = lookahead;
+}
+else {
+assert(position > begin);
+assert(position <= length);
+source[position] = lookahead;
+current_token_val = calloc(((position - begin) + 1), sizeof(char));
+assert(current_token_val != NULL);
+extract_token(position, begin, source, current_token_val);
+hsh_insert(token_stack.memo, current_token_val, current_token_val);
+current_token.identifier = current_token_val;
+}
 push_token(&token_stack, make_token(current_token, IDENTIFIER));
 /* Matched an identifier */
@@ -385,9 +395,17 @@ tokenize(source_t source, uint32_t begin, const uint32_t length) {
 }
 begin = position;
 }
 return token_stack;
 }
+int free_token(const void *key, const void *val) {
+/* silence warnings about unused parameters, key and val point to the same data*/
+(void)key;
+free((char *)val);
+return true;
+}
 bool
 release_tokens(token_stream *tokens) {
 /* Iterate through the stack, release each token
@@ -396,10 +414,9 @@ release_tokens(token_stream *tokens) {
 assert(tokens != NULL);
 assert(tokens->tokens != NULL);
 assert(tokens->max_length > 0);
-while(tokens->length > 0) {
-pop_token(tokens);
-}
 free(tokens->tokens);
+hsh_iterate(tokens->memo, free_token);
+hsh_destroy(tokens->memo);
 return true;
 }
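
With the string table in place, individual tokens no longer own their text: pop_token stops freeing per-token strings, and release_tokens instead frees every interned string exactly once via hsh_iterate(tokens->memo, free_token) before destroying the table. A hypothetical caller could look like the sketch below; treating source_t as a mutable char buffer and passing strlen(buf) as the length are assumptions read off the diff (tokenize temporarily writes NUL terminators into the source while scanning, so the buffer must be writable).

#include <string.h>
#include "tokenize.h"

int main(void) {
  char buf[] = "(foo 1.5 foo 1.5)";  /* repeated lexemes end up sharing one allocation */
  token_stream ts = tokenize(buf, 0, strlen(buf));
  /* ... walk ts.tokens[0 .. ts.length - 1] here ... */
  release_tokens(&ts);               /* frees the token array, then each interned string once */
  return 0;
}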

tokenize.py (10)

@@ -29,7 +29,8 @@ class TokenT(Structure):
 class TokStream(Structure):
 _fields_ = [("length", c_size_t),
 ("max_length", c_size_t),
-("tokens", POINTER(TokenT))]
+("tokens", POINTER(TokenT)),
+("memo", c_void_p)]
 tokenizer.tokenize.restype = TokStream
 tokenizer.peek_token.restype = TokenT
@@ -46,8 +47,5 @@ def tokenize(source):
 tokenizer.pop_token(tp)
 tokenizer.release_tokens(tp)
-tokens = tokenize("(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"*1610)
-xs = list(tokens)
-#print list(tokens)
+line = "(+ a b) 'blah whatever (++ +.043 -4a +.0 +.3.2"
+xs = list(tokenize(line*141500))
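
The ctypes change mirrors the new C struct: TokStream gains a memo pointer field (exposed as c_void_p) so the structure returned by value from tokenizer.tokenize keeps the same size and field offsets as token_stream on the C side. The rewritten driver, which tokenizes one line repeated 141500 times, is presumably the workload this optimization targets: every lexeme recurs, so each distinct string is allocated only once instead of once per occurrence.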

wat.c (12)

@@ -0,0 +1,12 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(void) {
char *blah = malloc(2);
blah[0] = 'a';
blah[1] = '\0';
printf("%zd\n", strlen(blah));
free(blah);
return 0;
}
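(Another scratch test: strlen counts bytes up to, but not including, the NUL terminator, so this prints 1; %zu rather than %zd is the strictly correct conversion for a size_t.)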