commit
3dc20ed2dd
7 changed files with 1139 additions and 0 deletions
@ -0,0 +1,13 @@ |
|||||
|
default: markov.c markov.h |
||||
|
$(MAKE) lib; |
||||
|
$(CC) -g -DTOK_LIB -Wall -Wextra -std=gnu99 -Wpointer-arith -Wmissing-prototypes -Werror -lmaa -lm -L. -ltokenize -O3 ./markov.c -o markov_test -Wl,-rpath,/home/wes/markov; |
||||
|
|
||||
|
unsafe: markov.c markov.h |
||||
|
$(MAKE) lib; |
||||
|
$(CC) -DNDEBUG -DTOK_LIB -Wall -std=gnu99 -Wextra -Wpointer-arith -Wmissing-prototypes -lmaa -lm -L. -ltokenize -O3 ./markov.c -o markov -Wl,-rpath,/home/wes/markov; |
||||
|
|
||||
|
lib: markov.c markov.h tokenize.c tokenize.h |
||||
|
$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=c99 -O3 ./tokenize.c |
||||
|
$(CC) -shared -o libtokenize.so tokenize.o -lmaa; |
||||
|
$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=gnu99 -O3 ./markov.c -lmaa -lm |
||||
|
$(CC) -shared -o markov.so markov.o -lmaa; |
@ -0,0 +1 @@ |
|||||
|
/* Abort with a diagnostic when ptr is NULL.
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (the old bare-if form broke unbraced if/else at call sites). */
#define CHECK(ptr) do { if ((ptr) == NULL) { printf("Failed to allocate memory\n"); exit(EXIT_FAILURE); } } while (0)
@ -0,0 +1,400 @@ |
|||||
|
#include <time.h> |
||||
|
#include <math.h> |
||||
|
#include <stdint.h> |
||||
|
#include <stdio.h> |
||||
|
#include <unistd.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <ctype.h> |
||||
|
#include <stdbool.h> |
||||
|
#include <string.h> |
||||
|
#include <assert.h> |
||||
|
#include "error.h" |
||||
|
#include "maa.h" |
||||
|
#include "tokenize.h" |
||||
|
#include "markov.h" |
||||
|
|
||||
|
#define LEN 50 |
||||
|
|
||||
|
static char quote = '\''; |
||||
|
|
||||
|
static inline void |
||||
|
initialize_neighbours(const char *str, |
||||
|
hsh_HashTable graph_table) { |
||||
|
/* Initialize the table of neighbours corresponding to some string in the markov graph */ |
||||
|
assert(!hsh_retrieve(graph_table, str)); |
||||
|
neighbours_t *new_neighbours = xmalloc(sizeof (neighbours_t)); |
||||
|
CHECK(new_neighbours); |
||||
|
new_neighbours->neighbours = hsh_create(NULL, NULL); |
||||
|
new_neighbours->number = 0; |
||||
|
new_neighbours->unique_num = 0; |
||||
|
hsh_insert(graph_table, str, new_neighbours); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
static inline unsigned long |
||||
|
numberof_keys(graph_t graph) { |
||||
|
/* Get the number of unique keys in the graph */ |
||||
|
hsh_Stats stats = hsh_get_stats(graph.cache); |
||||
|
unsigned long num = stats->entries; |
||||
|
xfree(stats); |
||||
|
return num; |
||||
|
} |
||||
|
|
||||
|
static inline unsigned long |
||||
|
numberof_transitionable(graph_t graph) { |
||||
|
/* Get the number of keys with >0 neighbours */ |
||||
|
/* Only call after graph has been converted */ |
||||
|
unsigned long num = 0; |
||||
|
void *p, *key; |
||||
|
markov_trans_t *val; |
||||
|
HSH_ITERATE(graph.graph, p, key, val) { |
||||
|
if (val->number > 0) { |
||||
|
num++; |
||||
|
} |
||||
|
} |
||||
|
return num; |
||||
|
} |
||||
|
|
||||
|
static const char* |
||||
|
get_ngram(const char* str, |
||||
|
graph_t graph) { |
||||
|
/* Try to get a string from the cache.
|
||||
|
* If it's not already cached, allocate the memory for it |
||||
|
* then return the freshly cached string |
||||
|
*/ |
||||
|
hsh_HashTable cache = graph.cache; |
||||
|
hsh_HashTable graph_table = graph.graph; |
||||
|
const char *exists = hsh_retrieve(cache, str); |
||||
|
if (exists) { |
||||
|
return exists; |
||||
|
} |
||||
|
else { |
||||
|
/* Add it to the cache and return it */ |
||||
|
size_t gram_size = strlen(str) + 1; |
||||
|
char *new_str = xmalloc(gram_size); |
||||
|
CHECK(str); |
||||
|
snprintf(new_str, gram_size, "%s", str); |
||||
|
hsh_insert(cache, new_str, new_str); |
||||
|
initialize_neighbours(new_str, graph_table); |
||||
|
return new_str; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
insert_neighbour(const char *left, |
||||
|
const char *neighbour, |
||||
|
graph_t graph) { |
||||
|
/* Insert a neighbour into the table of neighbours for a given key */ |
||||
|
neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left); |
||||
|
neighbours->number++; |
||||
|
hsh_HashTable neighbours_table = neighbours->neighbours; |
||||
|
CHECK(neighbours->neighbours); |
||||
|
if (hsh_retrieve(neighbours_table, neighbour)) { |
||||
|
return; |
||||
|
} |
||||
|
neighbours->unique_num++; |
||||
|
const char *new_neighbour = get_ngram(neighbour, graph); |
||||
|
CHECK(new_neighbour); |
||||
|
uint32_t *count = xmalloc(sizeof (uint32_t)); |
||||
|
CHECK(count); |
||||
|
*count = 0; |
||||
|
hsh_insert(neighbours_table, new_neighbour, count); |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
increment_neighbour(const char *left, |
||||
|
const char *neighbour, |
||||
|
graph_t graph) { |
||||
|
/* Increment the frequency of a given bi-gram.
|
||||
|
* bi-gram does not necessarily mean a specific thing |
||||
|
* it could be pairs of words, pairs of letters, sequences of n letters, and so on |
||||
|
*/ |
||||
|
neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left); |
||||
|
hsh_HashTable neighbours_hash = neighbours->neighbours; |
||||
|
CHECK(neighbours_hash); |
||||
|
uint32_t *count = (uint32_t *)hsh_retrieve(neighbours_hash, neighbour); |
||||
|
CHECK(count); |
||||
|
(*count)++; |
||||
|
} |
||||
|
|
||||
|
static inline neighbours_t* |
||||
|
get_neighbours(graph_t graph, |
||||
|
char *gram) { |
||||
|
/* Simply return the table of neighbours corresponding to a given string */ |
||||
|
neighbours_t *neighbours; |
||||
|
neighbours = (neighbours_t *)hsh_retrieve(graph.graph, gram); |
||||
|
assert(neighbours); |
||||
|
return neighbours; |
||||
|
} |
||||
|
|
||||
|
static inline markov_trans_t* |
||||
|
get_prob_neighbours(graph_t graph, |
||||
|
char *gram) { |
||||
|
/* Return the converted probability transitions */ |
||||
|
markov_trans_t *neighbours; |
||||
|
neighbours = (markov_trans_t *)hsh_retrieve(graph.graph, gram); |
||||
|
assert(neighbours); |
||||
|
return neighbours; |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
convert_neighbours(graph_t graph, |
||||
|
char *gram) { |
||||
|
neighbours_t *neighbours = get_neighbours(graph, gram); |
||||
|
|
||||
|
markov_trans_t *result = xmalloc(sizeof (markov_trans_t)); |
||||
|
CHECK(result); |
||||
|
size_t nb_size = neighbours->number; |
||||
|
hsh_HashTable neighbours_hash = neighbours->neighbours; |
||||
|
|
||||
|
void *key; |
||||
|
uint32_t *frequency; |
||||
|
void *p; |
||||
|
uint32_t index = 0; |
||||
|
probability_t transition; |
||||
|
probability_t *neighbour_array = xcalloc(sizeof (probability_t), nb_size); |
||||
|
CHECK(neighbour_array); |
||||
|
HSH_ITERATE(neighbours_hash, p, key, frequency) { |
||||
|
transition.frequent.frequency = *frequency; |
||||
|
xfree(frequency); |
||||
|
transition.frequent.token = key; |
||||
|
neighbour_array[index] = transition; |
||||
|
index++; |
||||
|
} |
||||
|
float lower = 0.0; |
||||
|
probability_t current; |
||||
|
for (uint32_t i = 0; i < neighbours->unique_num; i++) { |
||||
|
current.frequent = neighbour_array[i].frequent; |
||||
|
neighbour_array[i].bucket.token = current.frequent.token; |
||||
|
neighbour_array[i].bucket.lower = lower; |
||||
|
neighbour_array[i].bucket.upper = lower + ((float)neighbour_array[i].frequent.frequency) / |
||||
|
(neighbours->number); |
||||
|
lower = neighbour_array[i].bucket.upper; |
||||
|
} |
||||
|
result->transitions = neighbour_array; |
||||
|
result->number = neighbours->unique_num; |
||||
|
hsh_delete(graph.graph, gram); |
||||
|
hsh_insert(graph.graph, gram, result); |
||||
|
hsh_destroy(neighbours->neighbours); |
||||
|
xfree(neighbours); |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
convert_all_neighbours(graph_t graph) { |
||||
|
void *p, *key; |
||||
|
char *current_key; |
||||
|
unsigned long num_keys = numberof_keys(graph); |
||||
|
if (num_keys == 0) { |
||||
|
return; |
||||
|
} |
||||
|
stk_Stack keys = stk_create(); |
||||
|
|
||||
|
/* iterate over all keys K, in hash table T */ |
||||
|
HSH_ITERATE_KEYS(graph.graph, p, key) { |
||||
|
stk_push(keys, key); |
||||
|
} |
||||
|
|
||||
|
for (uint32_t i = 0; i < num_keys; i++) { |
||||
|
current_key = (char *)stk_pop(keys); |
||||
|
convert_neighbours(graph, current_key); |
||||
|
} |
||||
|
stk_destroy(keys); |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
relate_bigram(const char *a, |
||||
|
const char *b, |
||||
|
graph_t graph) { |
||||
|
/* Update the graph with the information that b follows a */ |
||||
|
const char* str = get_ngram(a, graph); |
||||
|
insert_neighbour(str, b, graph); |
||||
|
increment_neighbour(str, b, graph); |
||||
|
} |
||||
|
|
||||
|
static int |
||||
|
transition_cmp(const void *keyval, |
||||
|
const void *datum) { |
||||
|
float chosen_number = *((float *)keyval); |
||||
|
probability_t *transition = (probability_t *)datum; |
||||
|
float lower = transition->bucket.lower; |
||||
|
float upper = transition->bucket.upper; |
||||
|
if ((chosen_number >= lower) && |
||||
|
(chosen_number <= upper)) { |
||||
|
return 0; |
||||
|
} |
||||
|
else if (chosen_number < lower) { |
||||
|
return -1; |
||||
|
} |
||||
|
else { |
||||
|
return 1; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
static inline char* |
||||
|
pick_random_transition(unique_keys_t unique_neighbours) { |
||||
|
size_t num = unique_neighbours.number; |
||||
|
char **keys = unique_neighbours.keys; |
||||
|
size_t selection = (size_t)floor(drand48() * (num - 1)); |
||||
|
return keys[selection]; |
||||
|
} |
||||
|
|
||||
|
|
||||
|
static inline char* |
||||
|
next_ngram(graph_t graph, |
||||
|
char *start, |
||||
|
unique_keys_t unique_neighbours) { |
||||
|
markov_trans_t *transitions = get_prob_neighbours(graph, start); |
||||
|
if (transitions->number == 0) { |
||||
|
return pick_random_transition(unique_neighbours); |
||||
|
} |
||||
|
probability_t *buckets = transitions->transitions; |
||||
|
size_t bucket_size = transitions->number; |
||||
|
float chosen = (float)drand48(); |
||||
|
probability_t *result = bsearch(&chosen, |
||||
|
buckets, |
||||
|
bucket_size, |
||||
|
sizeof (probability_t), |
||||
|
transition_cmp); |
||||
|
return ((char *)result->bucket.token); |
||||
|
} |
||||
|
|
||||
|
lst_List |
||||
|
generate_strings(markov_chain_t markov_chain, |
||||
|
char *start, |
||||
|
uint32_t n) { |
||||
|
unique_keys_t unique_neighbours = markov_chain.unique; |
||||
|
graph_t graph = markov_chain.graph; |
||||
|
lst_List result = lst_create(); |
||||
|
char *current = start; |
||||
|
for (uint32_t i = 0; i < n; i++) { |
||||
|
lst_append(result, current); |
||||
|
current = next_ngram(graph, current, unique_neighbours); |
||||
|
} |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
static inline unique_keys_t |
||||
|
get_all_keys(graph_t graph) { |
||||
|
/* Gets all unique keys with neighbours */ |
||||
|
/* Should only be called after graph generation */ |
||||
|
unsigned long number = numberof_transitionable(graph); |
||||
|
char **keys = xcalloc(sizeof (char *), number); |
||||
|
CHECK(keys); |
||||
|
void *p, *key; |
||||
|
unique_keys_t result; |
||||
|
markov_trans_t *val; |
||||
|
uint32_t i = 0; |
||||
|
HSH_ITERATE(graph.graph, p, key, val) { |
||||
|
if (val->number > 0) { |
||||
|
keys[i] = key; |
||||
|
i++; |
||||
|
} |
||||
|
} |
||||
|
result.keys = keys; |
||||
|
result.number = i; |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
static inline graph_t |
||||
|
make_graph(void) { |
||||
|
/* Make an initial empty graph */ |
||||
|
graph_t result; |
||||
|
result.cache = hsh_create(NULL, NULL); |
||||
|
result.graph = hsh_create(NULL, NULL); |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
release_converted_graph(graph_t graph) { |
||||
|
void *p, *key; |
||||
|
markov_trans_t *datum; |
||||
|
/* iterate over all keys K, in hash table and xfree them*/ |
||||
|
HSH_ITERATE(graph.graph, p, key, datum) { |
||||
|
xfree(datum->transitions); |
||||
|
xfree(datum); |
||||
|
xfree(key); |
||||
|
} |
||||
|
hsh_destroy(graph.cache); |
||||
|
hsh_destroy(graph.graph); |
||||
|
} |
||||
|
|
||||
|
markov_chain_t |
||||
|
build_markov_chain(token_stream tokens) { |
||||
|
markov_chain_t result; |
||||
|
graph_t graph = make_graph(); |
||||
|
token_t current; |
||||
|
token_t next; |
||||
|
while (tokens.length > 1) { |
||||
|
current = peek_token(&tokens); |
||||
|
pop_token(&tokens); |
||||
|
next = peek_token(&tokens); |
||||
|
relate_bigram(token_to_string(next), token_to_string(current), graph); |
||||
|
} |
||||
|
convert_all_neighbours(graph); |
||||
|
result.graph = graph; |
||||
|
result.unique = get_all_keys(graph); |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
char * |
||||
|
token_to_string(token_t token) { |
||||
|
switch (token.token_type) { |
||||
|
case WORD: |
||||
|
return (char*)token.token.word; |
||||
|
break; |
||||
|
case INTEGER: |
||||
|
return (char*)token.token.integer; |
||||
|
break; |
||||
|
case FLOATING: |
||||
|
return (char*)token.token.floating; |
||||
|
break; |
||||
|
case QUOTE: |
||||
|
return "e; |
||||
|
break; |
||||
|
case PAREN: |
||||
|
return (char*)token.token.parenthesis; |
||||
|
break; |
||||
|
case EMPTY: |
||||
|
printf("should not be here\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
break; |
||||
|
case STRING: |
||||
|
return (char*)token.token.string; |
||||
|
break; |
||||
|
default: |
||||
|
printf("oops, there was an unknown token, check valgrind or gdb\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
void |
||||
|
release_markov_chain(markov_chain_t chain) { |
||||
|
release_converted_graph(chain.graph); |
||||
|
xfree(chain.unique.keys); |
||||
|
return; |
||||
|
} |
||||
|
|
||||
|
int |
||||
|
main (void) { |
||||
|
void *test_input = xmalloc(555000); |
||||
|
size_t nbytes = read(STDIN_FILENO, test_input, 555000); |
||||
|
|
||||
|
if (nbytes == 0) { |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
token_stream test_bigrams_stack = tokenize(test_input, 0, nbytes); |
||||
|
markov_chain_t chain = build_markov_chain(test_bigrams_stack); |
||||
|
srand48(time(NULL)); |
||||
|
lst_List test = generate_strings(chain, token_to_string(peek_token(&test_bigrams_stack)), LEN); |
||||
|
lst_pop(test); |
||||
|
for (uint32_t i = 0; i < LEN-1; i++) { |
||||
|
printf("%s ", (char *)lst_pop(test)); |
||||
|
} |
||||
|
printf("\n"); |
||||
|
lst_destroy(test); |
||||
|
_lst_shutdown(); |
||||
|
release_markov_chain(chain); |
||||
|
xfree(test_input); |
||||
|
release_tokens(&test_bigrams_stack); |
||||
|
return EXIT_SUCCESS; |
||||
|
} |
@ -0,0 +1,78 @@ |
|||||
|
typedef |
||||
|
struct { |
||||
|
hsh_HashTable cache; |
||||
|
hsh_HashTable graph; |
||||
|
} |
||||
|
graph_t; |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
size_t number; |
||||
|
char **keys; |
||||
|
} |
||||
|
unique_keys_t; |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
graph_t graph; |
||||
|
unique_keys_t unique; |
||||
|
} |
||||
|
markov_chain_t; |
||||
|
|
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
hsh_HashTable neighbours; |
||||
|
size_t number; |
||||
|
size_t unique_num; |
||||
|
} |
||||
|
neighbours_t; |
||||
|
|
||||
|
/*
|
||||
|
* Transition types for various reasons |
||||
|
*/ |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
float upper; |
||||
|
float lower; |
||||
|
const char *token; |
||||
|
} |
||||
|
bucket_t; |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
uint32_t frequency; |
||||
|
const char *token; |
||||
|
} |
||||
|
transition_t; |
||||
|
|
||||
|
|
||||
|
typedef |
||||
|
union { |
||||
|
transition_t frequent; |
||||
|
bucket_t bucket; |
||||
|
} |
||||
|
probability_t; |
||||
|
|
||||
|
typedef |
||||
|
struct { |
||||
|
size_t number; |
||||
|
probability_t *transitions; |
||||
|
} |
||||
|
markov_trans_t; |
||||
|
|
||||
|
|
||||
|
markov_chain_t |
||||
|
build_markov_chain(token_stream); |
||||
|
|
||||
|
char * |
||||
|
token_to_string(token_t); |
||||
|
|
||||
|
void |
||||
|
release_markov_chain(markov_chain_t); |
||||
|
|
||||
|
lst_List |
||||
|
generate_strings(markov_chain_t, |
||||
|
char *, |
||||
|
uint32_t); |
@ -0,0 +1,23 @@ |
|||||
|
Two roads diverged in a yellow wood, |
||||
|
And sorry I could not travel both |
||||
|
And be one traveler, long I stood |
||||
|
And looked down one as far as I could |
||||
|
To where it bent in the undergrowth; |
||||
|
|
||||
|
Then took the other, as just as fair, |
||||
|
And having perhaps the better claim, |
||||
|
Because it was grassy and wanted wear; |
||||
|
Though as for that the passing there |
||||
|
Had worn them really about the same, |
||||
|
|
||||
|
And both that morning equally lay |
||||
|
In leaves no step had trodden black. |
||||
|
Oh, I kept the first for another day! |
||||
|
Yet knowing how way leads on to way, |
||||
|
I doubted if I should ever come back. |
||||
|
|
||||
|
I shall be telling this with a sigh |
||||
|
Somewhere ages and ages hence: |
||||
|
Two roads diverged in a wood, and I— |
||||
|
I took the one less traveled by, |
||||
|
And that has made all the difference. |
@ -0,0 +1,551 @@ |
|||||
|
#include <stdint.h> |
||||
|
#include <stdio.h> |
||||
|
#include <unistd.h> |
||||
|
#include <stdlib.h> |
||||
|
#include <ctype.h> |
||||
|
#include <stdbool.h> |
||||
|
#include <string.h> |
||||
|
#include <assert.h> |
||||
|
#include "error.h" |
||||
|
#include "maa.h" |
||||
|
#include "tokenize.h" |
||||
|
|
||||
|
/*
|
||||
|
* This is a basic s-expression tokenizer |
||||
|
* it also tokenizes things like number, string, and symbol literals |
||||
|
*/ |
||||
|
|
||||
|
const token_t nulltok = { |
||||
|
.token_type = EMPTY, |
||||
|
{ |
||||
|
.null_token=false |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
static const token_t quote_tok = { |
||||
|
.token_type = QUOTE, |
||||
|
.token= { |
||||
|
.quote=true |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
static const token_t left_paren = { |
||||
|
.token_type = PAREN, |
||||
|
.token = { |
||||
|
.parenthesis="(" |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
static const token_t right_paren = { |
||||
|
.token_type = PAREN, |
||||
|
.token = { |
||||
|
.parenthesis=")" |
||||
|
} |
||||
|
}; |
||||
|
|
||||
|
static inline const char *
string_head(uint32_t n,
            const char *in,
            char *out) {
  /* Copy the first n characters of `in` into `out` and NUL-terminate.
   * `out` must have room for n + 1 bytes. Returns `out`. */
#ifndef NDEBUG
  size_t in_len = strlen(in);
#endif
  assert((n > 0 && n <= in_len));
  /* snprintf reports the length it *would* have written, i.e. strlen(in),
   * even when truncating to n characters. */
  int written = snprintf(out, (size_t)n + 1, "%s", in);

  assert((written != -1) && ((size_t)written == in_len));

  if (written == -1) {
    printf("Out of memory");
    exit(EXIT_FAILURE);
  }
  return (const char *)out;
}
||||
|
|
||||
|
static inline token_t |
||||
|
make_token(token_val_t val, |
||||
|
tok_t toktype) { |
||||
|
token_t result; |
||||
|
result.token_type = toktype; |
||||
|
result.token = val; |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
bool |
||||
|
push_token(token_stream *tokens, |
||||
|
token_t token) { |
||||
|
/*
|
||||
|
* Check if tokens points to NULL |
||||
|
*/ |
||||
|
|
||||
|
size_t len; |
||||
|
size_t max; |
||||
|
|
||||
|
CHECK(tokens); |
||||
|
|
||||
|
len = tokens->length; |
||||
|
max = tokens->max_length; |
||||
|
|
||||
|
assert(len <= max); |
||||
|
assert(max > 0); |
||||
|
|
||||
|
if (len == max) { |
||||
|
/* We've reached the maximum stack size
|
||||
|
* So we must try to increase that by GROWTH_SIZE |
||||
|
*/ |
||||
|
token_t *new_tokens = xrealloc(tokens->tokens, sizeof (token_t) * (max * GROWTH_FACTOR)); |
||||
|
if (!new_tokens) { |
||||
|
printf("Could not allocate enough memory for the token stack\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
tokens->tokens = new_tokens; |
||||
|
tokens->max_length = max * GROWTH_FACTOR; |
||||
|
tokens->tokens[len] = token; |
||||
|
tokens->length++; |
||||
|
return true; |
||||
|
} |
||||
|
tokens->tokens[len] = token; |
||||
|
tokens->length++; |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
bool |
||||
|
pop_token(token_stream *tokens) { |
||||
|
size_t len; |
||||
|
CHECK(tokens); |
||||
|
|
||||
|
len = tokens->length; |
||||
|
|
||||
|
assert(len != 0); |
||||
|
len--; |
||||
|
CHECK(tokens->tokens); |
||||
|
|
||||
|
tokens->length--; |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
inline token_t |
||||
|
peek_token(token_stream *tokens) { |
||||
|
/*
|
||||
|
* Check if tokens points to NULL |
||||
|
*/ |
||||
|
size_t len = tokens->length; |
||||
|
size_t max = tokens->max_length; |
||||
|
CHECK(tokens); |
||||
|
assert(len != 0); |
||||
|
|
||||
|
if (len == 0 || len > max) { |
||||
|
return nulltok; |
||||
|
} |
||||
|
return tokens->tokens[len-1]; |
||||
|
} |
||||
|
|
||||
|
static inline uint32_t |
||||
|
match_int(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
/* Return false if there is no match
|
||||
|
* otherwise return the position of the end of the match + 1 |
||||
|
*/ |
||||
|
uint32_t i = begin; |
||||
|
uint32_t test; |
||||
|
CHECK(source); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
if (source[i] == '+' || |
||||
|
source[i] == '-') { |
||||
|
i++; |
||||
|
} |
||||
|
test = i; |
||||
|
while (i < length && |
||||
|
isdigit(source[i])) { |
||||
|
i++; |
||||
|
} |
||||
|
if (i == test) |
||||
|
return false; |
||||
|
return i; |
||||
|
} |
||||
|
|
||||
|
static inline uint32_t |
||||
|
match_float(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
/* Return false if there is no match
|
||||
|
* otherwise: |
||||
|
* if there is a leading decimal point and then a valid int match: |
||||
|
* return the position of the end of the match |
||||
|
* if there is a leading valid int match: |
||||
|
* but no decimal point match after that: |
||||
|
* return false |
||||
|
* if there is a decimal point match and then a valid int match: |
||||
|
* return the position of the match |
||||
|
* if there is no valid int match: |
||||
|
* return false |
||||
|
* ALWAYS returns the position + 1 to avoid confusion with false (which is a valid index) |
||||
|
*/ |
||||
|
uint32_t i, leading_int_match, trailing_int_match; |
||||
|
CHECK(source); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
i = begin; |
||||
|
leading_int_match = match_int(source, i, length); |
||||
|
|
||||
|
if (leading_int_match) { |
||||
|
i = leading_int_match; |
||||
|
} |
||||
|
|
||||
|
assert(i <= length); |
||||
|
|
||||
|
if (source[i] != '.' || |
||||
|
source[i] == '+' || |
||||
|
source[i] == '-') { |
||||
|
if (((i+1) <= length) && /* Make sure there is at least two characters to look at */ |
||||
|
((source[i] == '+') || |
||||
|
(source[i] == '-')) |
||||
|
&& (source[i+1] == '.')) { |
||||
|
i++; |
||||
|
} |
||||
|
else { |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
|
i++; |
||||
|
|
||||
|
trailing_int_match = match_int(source, i, length); |
||||
|
if (trailing_int_match) { |
||||
|
return trailing_int_match; |
||||
|
} |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
static inline uint32_t |
||||
|
match_word(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
|
||||
|
/* Return false if there is no match
|
||||
|
* if there is a match for any characters that are not: |
||||
|
* whitespace |
||||
|
* a parenthesis ( ) |
||||
|
* a brace { } |
||||
|
* a square bracket [ ] |
||||
|
* then return the position of the match + 1 |
||||
|
* if there is nothing else to match: |
||||
|
* return false |
||||
|
*/ |
||||
|
uint32_t i = begin; |
||||
|
CHECK(source); |
||||
|
assert(length > 0); |
||||
|
|
||||
|
while (i < length && |
||||
|
!(source[i] == '(' || |
||||
|
source[i] == ')' || |
||||
|
isspace(source[i]))) { |
||||
|
i++; |
||||
|
} |
||||
|
|
||||
|
if (i == begin) { |
||||
|
return false; |
||||
|
} |
||||
|
assert(i <= length); |
||||
|
return i; |
||||
|
} |
||||
|
|
||||
|
static inline uint32_t
is_empty_string(const char *source,
                uint32_t length) {
  /* NOTE(review): despite the name, this returns true when `source` starts
   * with a double quote AND any of its first `length` bytes is
   * non-whitespace; callers in tokenize() rely on this inverted meaning to
   * decide whether a string token gets pushed — confirm before renaming. */
  uint32_t i = 0;
  int has_nonspace = false;
  if (source[i] != '\"') {
    return false;
  }
  for (; i < length; i++) {
    /* BUGFIX: ctype functions require an unsigned-char-range argument;
     * passing a plain (possibly negative) char is undefined behavior. */
    if (!isspace((unsigned char)source[i])) {
      has_nonspace = true;
    }
  }
  return has_nonspace;
}
||||
|
|
||||
|
|
||||
|
static inline uint32_t |
||||
|
match_string(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
CHECK(source); |
||||
|
(void)length; |
||||
|
assert(length > 0); |
||||
|
uint32_t i = begin; |
||||
|
if (source[i] != '\"') { |
||||
|
return false; |
||||
|
} |
||||
|
i++; |
||||
|
while (source[i] != '\"' && |
||||
|
(i < length) && |
||||
|
(i < (begin + MAX_STRING_SIZE))) { |
||||
|
i++; |
||||
|
} |
||||
|
if ((i != (begin+1)) && |
||||
|
(i <= length) && |
||||
|
(source[i] == '\"')) { |
||||
|
return i+1; |
||||
|
} |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
static inline void |
||||
|
extract_token(uint32_t position, |
||||
|
uint32_t begin, |
||||
|
const source_t source, |
||||
|
const char *token_val) { |
||||
|
assert(position > begin); |
||||
|
string_head(position - begin, |
||||
|
&source[begin], |
||||
|
(char *)token_val); |
||||
|
} |
||||
|
|
||||
|
token_stream |
||||
|
tokenize(source_t source, |
||||
|
uint32_t begin, |
||||
|
const uint32_t length) { |
||||
|
/*
|
||||
|
* Remember to free everything from this struct |
||||
|
* for example, token_stack.tokens will not necessarily be |
||||
|
* equal to tokens after this function has run |
||||
|
* |
||||
|
*/ |
||||
|
uint32_t position = begin; |
||||
|
uint32_t allspace = false; |
||||
|
const char *current_token_val; |
||||
|
token_stream token_stack; |
||||
|
token_val_t current_token; |
||||
|
token_t *tokens = xcalloc(STACK_SIZE, sizeof (token_t)); |
||||
|
|
||||
|
hsh_HashTable token_memo = hsh_create(NULL, NULL); |
||||
|
|
||||
|
assert(begin == 0); |
||||
|
assert(length > 0); |
||||
|
CHECK(source); |
||||
|
|
||||
|
token_stack.length = 0; |
||||
|
token_stack.max_length = STACK_SIZE; |
||||
|
token_stack.tokens = tokens; |
||||
|
token_stack.memo = token_memo; |
||||
|
char lookahead = '\0'; |
||||
|
assert(STACK_SIZE > 0); |
||||
|
|
||||
|
while (begin <= length && source[begin]) { |
||||
|
/* Possibly matched a string
|
||||
|
* First look for closing " |
||||
|
* Then look for a newline to close it if no " |
||||
|
* Then stop after some large constant of characters maybe? |
||||
|
* We're dealing with real text so people might forget to close |
||||
|
* quotations, so we have to be clever about it and use heuristics (for performance) |
||||
|
*/ |
||||
|
if (source[begin] == '(') { |
||||
|
/*Matched a left paren */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, left_paren); |
||||
|
} |
||||
|
else if (source[begin] == ')') { |
||||
|
/*Matched a left paren */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, right_paren); |
||||
|
} |
||||
|
else if (isspace(source[begin])) { |
||||
|
position = begin + 1; |
||||
|
/* Matched a whitespace character */ |
||||
|
} |
||||
|
else if ((position = match_string(source, begin, length))) { |
||||
|
/* Possibly matched a string
|
||||
|
* First look for closing " |
||||
|
* Then look for a newline to close it if no " |
||||
|
* Then stop after some large constant of characters maybe? |
||||
|
* We're dealing with real text so people might forget to close |
||||
|
* quotations, so we have to be clever about it and use heuristics (for performance) |
||||
|
*/ |
||||
|
lookahead = source[position]; |
||||
|
source[position] = '\0'; |
||||
|
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { |
||||
|
current_token.string = current_token_val; |
||||
|
source[position] = lookahead; |
||||
|
allspace = false; |
||||
|
} |
||||
|
else { |
||||
|
assert(position > begin); |
||||
|
current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); |
||||
|
CHECK(current_token_val); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
if ((allspace = is_empty_string(current_token_val, (position - begin) + 1))) { |
||||
|
source[position] = lookahead; |
||||
|
current_token.string = current_token_val; |
||||
|
} |
||||
|
else { |
||||
|
source[position] = lookahead; |
||||
|
hsh_insert(token_stack.memo, current_token_val, current_token_val); |
||||
|
current_token.string = current_token_val; |
||||
|
} |
||||
|
} |
||||
|
if (allspace) { |
||||
|
push_token(&token_stack, make_token(current_token, STRING)); |
||||
|
} |
||||
|
} |
||||
|
else if ((position = match_float(source, begin, length))) { |
||||
|
/* Matched a float */ |
||||
|
lookahead = source[position]; |
||||
|
source[position] = '\0'; |
||||
|
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { |
||||
|
current_token.floating = current_token_val; |
||||
|
source[position] = lookahead; |
||||
|
} |
||||
|
else { |
||||
|
assert(position > begin); |
||||
|
current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); |
||||
|
CHECK(current_token_val); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
source[position] = lookahead; |
||||
|
hsh_insert(token_stack.memo, current_token_val, current_token_val); |
||||
|
current_token.floating = current_token_val; |
||||
|
} |
||||
|
push_token(&token_stack, make_token(current_token, FLOATING)); |
||||
|
} |
||||
|
else if ((position = match_int(source, begin, length))) { |
||||
|
/* Matched an int */ |
||||
|
lookahead = source[position]; |
||||
|
source[position] = '\0'; |
||||
|
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { |
||||
|
current_token.integer = current_token_val; |
||||
|
source[position] = lookahead; |
||||
|
} |
||||
|
else { |
||||
|
assert(position > begin); |
||||
|
assert(position <= length); |
||||
|
|
||||
|
current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); |
||||
|
CHECK(current_token_val); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
source[position] = lookahead; |
||||
|
hsh_insert(token_stack.memo, current_token_val, current_token_val); |
||||
|
current_token.integer = current_token_val; |
||||
|
} |
||||
|
push_token(&token_stack, make_token(current_token, INTEGER)); |
||||
|
} |
||||
|
else if (source[begin] == '\'') { |
||||
|
/* Matched a quote (apostrophe) */ |
||||
|
position = begin + 1; |
||||
|
push_token(&token_stack, quote_tok); |
||||
|
} |
||||
|
else if ((position = match_word(source, begin, length))) { |
||||
|
/* Matched a word */ |
||||
|
lookahead = source[position]; |
||||
|
source[position] = '\0'; |
||||
|
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) { |
||||
|
current_token.word = current_token_val; |
||||
|
source[position] = lookahead; |
||||
|
} |
||||
|
else { |
||||
|
assert(position > begin); |
||||
|
assert(position <= length); |
||||
|
|
||||
|
current_token_val = xcalloc(((position - begin) + 1), sizeof (char)); |
||||
|
CHECK(current_token_val); |
||||
|
extract_token(position, begin, source, current_token_val); |
||||
|
source[position] = lookahead; |
||||
|
hsh_insert(token_stack.memo, current_token_val, current_token_val); |
||||
|
current_token.word = current_token_val; |
||||
|
} |
||||
|
push_token(&token_stack, make_token(current_token, WORD)); |
||||
|
/* Matched a word */ |
||||
|
} |
||||
|
else if (position <= begin) { |
||||
|
printf("Source is too large to read\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
else { |
||||
|
printf("Unmatched token\n"); |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
begin = position; |
||||
|
} |
||||
|
|
||||
|
return token_stack; |
||||
|
} |
||||
|
|
||||
|
/* Hash-table iteration callback used by release_tokens: frees one
 * memoized token string.  The memo table stores each string as both
 * key and value, so the two parameters alias the same allocation and
 * only one of them may be freed.
 */
int
free_token(const void *key, const void *val) {
  char *owned = (char *)val; /* cast drops const for the deallocator */
  (void)key;                 /* key aliases val; freeing it too would double free */
  xfree(owned);
  return true;
}
||||
|
|
||||
|
bool |
||||
|
release_tokens(token_stream *tokens) { |
||||
|
/* Iterate through the stack, release each token
|
||||
|
* Then release the entire stack |
||||
|
*/ |
||||
|
CHECK(tokens); |
||||
|
CHECK(tokens->tokens); |
||||
|
assert(tokens->max_length > 0); |
||||
|
xfree(tokens->tokens); |
||||
|
hsh_iterate(tokens->memo, free_token); |
||||
|
|
||||
|
hsh_destroy(tokens->memo); |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
#ifndef TOK_LIB |
||||
|
int main(void) { |
||||
|
void *source_code = malloc(111000); |
||||
|
size_t nbytes = read(STDIN_FILENO, source_code, 111000); |
||||
|
if (nbytes == 0) { |
||||
|
exit(EXIT_FAILURE); |
||||
|
} |
||||
|
token_stream toks = tokenize(source_code, 0, nbytes); |
||||
|
token_t current_tok; |
||||
|
while (toks.length > 0) { |
||||
|
current_tok = peek_token(&toks); |
||||
|
switch (current_tok.token_type) { |
||||
|
case SYMBOL: |
||||
|
printf("symbol: %s\n", current_tok.token.symbol); |
||||
|
break; |
||||
|
case WORD: |
||||
|
printf("identifer: %s\n", current_tok.token.word); |
||||
|
break; |
||||
|
case INTEGER: |
||||
|
printf("integer: %s\n", current_tok.token.integer); |
||||
|
break; |
||||
|
case FLOATING: |
||||
|
printf("floating: %s\n", current_tok.token.floating); |
||||
|
break; |
||||
|
case QUOTE: |
||||
|
printf("quote: '\n"); |
||||
|
break; |
||||
|
case WSPACE: |
||||
|
printf("whitespace\n"); |
||||
|
break; |
||||
|
case PAREN: |
||||
|
printf("paren: %s\n", current_tok.token.parenthesis); |
||||
|
break; |
||||
|
case EMPTY: |
||||
|
printf("this should not be empty\n"); |
||||
|
break; |
||||
|
case STRING: |
||||
|
printf("string: %s\n", current_tok.token.string); |
||||
|
break; |
||||
|
default: |
||||
|
printf("oops, there was an unknown token, check valgrind or gdb\n"); |
||||
|
} |
||||
|
pop_token(&toks); |
||||
|
} |
||||
|
release_tokens(&toks); |
||||
|
return 0; |
||||
|
} |
||||
|
#endif |
@ -0,0 +1,73 @@ |
|||||
|
/* Initial capacity (in tokens) of a token stream's stack. */
#define STACK_SIZE 4096
/* Multiplier applied to the capacity when the stack must grow —
 * presumably used by push_token; TODO confirm against its definition. */
#define GROWTH_FACTOR 2
/* NOTE(review): not referenced in the visible code — presumably a cap
 * related to string lexemes; confirm before relying on it. */
#define MAX_STRING_SIZE 30

/* Mutable buffer of source text; the tokenizer temporarily writes NUL
 * bytes into it while extracting lexemes, then restores the byte. */
typedef char* source_t;

/* Tag telling which member of token_val_t is meaningful. */
typedef enum {
  WORD = 1,     /* identifier-like lexeme */
  INTEGER = 2,  /* integer literal, kept as text */
  FLOATING = 3, /* floating-point literal, kept as text */
  QUOTE = 4,    /* a single apostrophe */
  WSPACE = 5,   /* whitespace */
  PAREN = 6,    /* parenthesis */
  EMPTY = 7,    /* no token */
  STRING = 8    /* string literal */
} tok_t;

/* Token payload.  The pointer members all alias the same storage and
 * point at NUL-terminated text owned by the stream's memo table; the
 * valid member is selected by the tok_t tag. */
typedef union {
  const char *word;
  const char *integer;     /* digits stored as text, not converted */
  const char *floating;
  const char *parenthesis;
  const char *string;
  bool quote;              /* presumably set for QUOTE tokens — TODO confirm */
  bool null_token;         /* presumably set for EMPTY tokens — TODO confirm */
} token_val_t;

/* One lexeme: a type tag plus its payload. */
typedef struct {
  tok_t token_type;
  token_val_t token;
} token_t;

/* A growable stack of tokens plus a memo table interning the token
 * strings (each string is stored as both key and value; see free_token). */
typedef struct {
  size_t length;      /* Number of current elements */
  size_t max_length;  /* Maximum length of the stack */
  token_t *tokens;    /* heap-allocated token array */
  hsh_HashTable memo; /* owns the memoized token strings */
} token_stream;
||||
|
|
||||
|
/* Push a token onto the stream's stack. */
bool
push_token(token_stream*, token_t);

/* Remove the token on top of the stream's stack. */
bool
pop_token(token_stream*);

/* Return the token on top of the stack without removing it. */
token_t
peek_token(token_stream*);

/* Tokenize source from the given start offset up to length bytes and
 * return the resulting stream; exits the process on unmatched input. */
token_stream
tokenize(source_t, uint32_t, const uint32_t);

/* Free the stream's token array, its memoized strings, and the memo
 * table itself. */
bool
release_tokens(token_stream*);

/* NOTE(review): `static` prototypes in a header only make sense if the
 * header is included solely by the translation unit defining them. */
#ifndef TOK_LIB
/* Each matcher scans the source starting at the second argument
 * (bounded by the third) and returns the index one past the matched
 * lexeme, or 0 when nothing matched — confirmed for match_word by its
 * use in tokenize; presumably the same for the others. */
static uint32_t
match_int(source_t, uint32_t, const uint32_t);

static uint32_t
match_float(source_t, uint32_t, const uint32_t);

static uint32_t
match_word(source_t, uint32_t, const uint32_t);

static uint32_t
match_string(source_t, uint32_t, const uint32_t);
#endif

/* hsh_iterate callback that frees one memoized token string; key and
 * value alias the same allocation. */
int
free_token(const void *,
           const void *);

token_t
testfunc(void);
Reference in new issue