Browse Source

first commit

master
nisstyre56 8 years ago
commit
3dc20ed2dd
  1. 13
      Makefile
  2. 1
      error.h
  3. 400
      markov.c
  4. 78
      markov.h
  5. 23
      roadnottaken
  6. 551
      tokenize.c
  7. 73
      tokenize.h

13
Makefile

@ -0,0 +1,13 @@
# default: build the shared libraries via the `lib` target, then link the
# debug executable `markov_test` (-g, asserts enabled, warnings are errors).
# NOTE(review): the -rpath below is a hard-coded absolute path
# (/home/wes/markov); confirm it before building on another machine.
default: markov.c markov.h
$(MAKE) lib;
$(CC) -g -DTOK_LIB -Wall -Wextra -std=gnu99 -Wpointer-arith -Wmissing-prototypes -Werror -lmaa -lm -L. -ltokenize -O3 ./markov.c -o markov_test -Wl,-rpath,/home/wes/markov;
# unsafe: same as `default` but defines NDEBUG (assert() compiled out),
# drops -g and -Werror, and names the executable `markov`.
unsafe: markov.c markov.h
$(MAKE) lib;
$(CC) -DNDEBUG -DTOK_LIB -Wall -std=gnu99 -Wextra -Wpointer-arith -Wmissing-prototypes -lmaa -lm -L. -ltokenize -O3 ./markov.c -o markov -Wl,-rpath,/home/wes/markov;
# lib: compile tokenize.c and markov.c as position-independent objects and
# wrap them into libtokenize.so and markov.so. TOK_LIB is defined so that
# tokenize.c does not compile its own main().
lib: markov.c markov.h tokenize.c tokenize.h
$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=c99 -O3 ./tokenize.c
$(CC) -shared -o libtokenize.so tokenize.o -lmaa;
$(CC) -g -DTOK_LIB -c -fpic -Wall -Wextra -pedantic -Wpointer-arith -Werror -std=gnu99 -O3 ./markov.c -lmaa -lm
$(CC) -shared -o markov.so markov.o -lmaa;

1
error.h

@ -0,0 +1 @@
/*
 * CHECK(ptr): terminate the program with EXIT_FAILURE if ptr is NULL.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(p);` is a
 * single statement and can be used safely as the body of an if/else
 * without braces.
 *
 * The includer must have <stdio.h> and <stdlib.h> in scope.
 */
#define CHECK(ptr) \
    do { \
        if ((ptr) == NULL) { \
            printf("Failed to allocate memory\n"); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

400
markov.c

@ -0,0 +1,400 @@
#include <time.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
#include "error.h"
#include "maa.h"
#include "tokenize.h"
#include "markov.h"
/* Number of n-grams main() asks generate_strings() to produce. */
#define LEN 50
/* Character that token_to_string() uses to represent QUOTE tokens. */
static char quote = '\'';
static inline void
initialize_neighbours(const char *str,
                      hsh_HashTable graph_table) {
    /* Register an empty neighbour record for str in the markov graph.
     * str must not already have an entry in graph_table (asserted).
     */
    neighbours_t *record;

    assert(!hsh_retrieve(graph_table, str));

    record = xmalloc(sizeof (neighbours_t));
    CHECK(record);

    /* No transitions have been observed yet. */
    record->neighbours = hsh_create(NULL, NULL);
    record->unique_num = 0;
    record->number = 0;

    hsh_insert(graph_table, str, record);
}
static inline unsigned long
numberof_keys(graph_t graph) {
    /* Report how many entries the string cache holds, i.e. the number
     * of unique keys in the graph. The statistics object is released
     * before returning.
     */
    hsh_Stats table_stats = hsh_get_stats(graph.cache);
    const unsigned long entry_count = table_stats->entries;

    xfree(table_stats);
    return entry_count;
}
static inline unsigned long
numberof_transitionable(graph_t graph) {
    /* Count the keys that have at least one outgoing transition.
     * The entries are read as markov_trans_t, so this may only be
     * called once the graph has been converted.
     */
    void *pos, *ngram;
    markov_trans_t *entry;
    unsigned long count = 0;

    HSH_ITERATE(graph.graph, pos, ngram, entry) {
        count += (entry->number > 0) ? 1UL : 0UL;
    }
    return count;
}
static const char*
get_ngram(const char* str,
          graph_t graph) {
    /* Return the canonical (cached) copy of str.
     *
     * If str is already in graph.cache the cached pointer is returned.
     * Otherwise a heap copy is made, stored in the cache (as both key
     * and value), given an empty neighbour record in graph.graph, and
     * returned. The copy is owned by the graph.
     */
    const char *cached = hsh_retrieve(graph.cache, str);
    if (cached) {
        return cached;
    }

    /* Not seen before: copy it, cache it, and give it a neighbour table */
    const size_t gram_size = strlen(str) + 1;
    char *new_str = xmalloc(gram_size);
    /* Check the fresh allocation (the original checked the input by mistake) */
    CHECK(new_str);
    memcpy(new_str, str, gram_size);
    hsh_insert(graph.cache, new_str, new_str);
    initialize_neighbours(new_str, graph.graph);
    return new_str;
}
static inline void
insert_neighbour(const char *left,
                 const char *neighbour,
                 graph_t graph) {
    /* Record one observation of `neighbour` following `left`.
     *
     * The total observation count of `left` is always incremented. If
     * `neighbour` has not been seen after `left` before, it is added to
     * the neighbour table of `left` with a frequency of 0 (the caller is
     * expected to bump it with increment_neighbour) and the count of
     * distinct neighbours is incremented.
     *
     * `left` must already have a neighbour record in graph.graph.
     */
    neighbours_t *neighbours = (neighbours_t *)hsh_retrieve(graph.graph, left);
    /* Validate the record before touching any of its fields */
    CHECK(neighbours);
    hsh_HashTable neighbours_table = neighbours->neighbours;
    CHECK(neighbours_table);

    neighbours->number++;
    if (hsh_retrieve(neighbours_table, neighbour)) {
        /* Already a known neighbour, nothing to add */
        return;
    }

    /* Key the table by the cached copy so its lifetime matches the graph */
    const char *new_neighbour = get_ngram(neighbour, graph);
    CHECK(new_neighbour);
    uint32_t *count = xmalloc(sizeof (uint32_t));
    CHECK(count);
    *count = 0;
    hsh_insert(neighbours_table, new_neighbour, count);
    neighbours->unique_num++;
}
static inline void
increment_neighbour(const char *left,
                    const char *neighbour,
                    graph_t graph) {
    /* Bump the frequency of the bi-gram (left, neighbour) by one.
     * "bi-gram" is used loosely here: the two strings may be words,
     * letters, sequences of n letters, and so on.
     * Exits through CHECK if the neighbour table or the counter is missing.
     */
    neighbours_t *record = (neighbours_t *)hsh_retrieve(graph.graph, left);
    hsh_HashTable followers = record->neighbours;
    uint32_t *tally;

    CHECK(followers);
    tally = (uint32_t *)hsh_retrieve(followers, neighbour);
    CHECK(tally);
    *tally += 1;
}
static inline neighbours_t*
get_neighbours(graph_t graph,
               char *gram) {
    /* Look up the (unconverted) neighbour record stored for gram.
     * The record is asserted to exist.
     */
    neighbours_t *const found = (neighbours_t *)hsh_retrieve(graph.graph, gram);

    assert(found);
    return found;
}
static inline markov_trans_t*
get_prob_neighbours(graph_t graph,
                    char *gram) {
    /* Look up the converted probability transitions stored for gram.
     * The entry is asserted to exist.
     */
    markov_trans_t *const found = (markov_trans_t *)hsh_retrieve(graph.graph, gram);

    assert(found);
    return found;
}
static inline void
convert_neighbours(graph_t graph,
                   char *gram) {
    /* Replace the frequency record of gram with probability buckets.
     *
     * Every distinct neighbour becomes one bucket [lower, upper] whose
     * width is frequency / total observations; buckets are laid out
     * back to back starting at 0. The old neighbours_t record, its
     * hash table and the per-neighbour counters are released, and the
     * graph entry for gram is replaced by the new markov_trans_t.
     */
    neighbours_t *neighbours = get_neighbours(graph, gram);
    markov_trans_t *result = xmalloc(sizeof (markov_trans_t));
    CHECK(result);
    /* One slot per distinct neighbour is all that is ever filled */
    const size_t unique = neighbours->unique_num;
    hsh_HashTable neighbours_hash = neighbours->neighbours;
    void *key;
    uint32_t *frequency;
    void *p;
    size_t index = 0;
    /* Never request a zero-sized block: its result is implementation-defined */
    probability_t *neighbour_array = xcalloc(unique > 0 ? unique : 1,
                                             sizeof (probability_t));
    CHECK(neighbour_array);

    /* Pass 1: copy (token, frequency) pairs out of the hash table */
    HSH_ITERATE(neighbours_hash, p, key, frequency) {
        assert(index < unique);
        neighbour_array[index].frequent.frequency = *frequency;
        neighbour_array[index].frequent.token = key;
        xfree(frequency);
        index++;
    }

    /* Pass 2: turn frequencies into cumulative probability buckets.
     * The union is overwritten in place, so copy the pair out first.
     */
    float lower = 0.0f;
    for (size_t i = 0; i < unique; i++) {
        const transition_t observed = neighbour_array[i].frequent;
        const float share = ((float)observed.frequency) / (neighbours->number);
        neighbour_array[i].bucket.token = observed.token;
        neighbour_array[i].bucket.lower = lower;
        neighbour_array[i].bucket.upper = lower + share;
        lower = neighbour_array[i].bucket.upper;
    }
    if (unique > 0) {
        /* Float rounding can leave the running sum just short of 1.0;
         * pin the last bound so every draw in [0, 1] lands in a bucket.
         */
        neighbour_array[unique - 1].bucket.upper = 1.0f;
    }

    result->transitions = neighbour_array;
    result->number = unique;
    hsh_delete(graph.graph, gram);
    hsh_insert(graph.graph, gram, result);
    hsh_destroy(neighbours->neighbours);
    xfree(neighbours);
}
static inline void
convert_all_neighbours(graph_t graph) {
    /* Convert every key of the graph to its probability form.
     * The keys are first collected on a stack because
     * convert_neighbours() deletes and re-inserts entries of
     * graph.graph, which must not happen while iterating over it.
     */
    const unsigned long total = numberof_keys(graph);
    void *pos, *ngram;

    if (total == 0) {
        return;
    }

    stk_Stack pending = stk_create();
    HSH_ITERATE_KEYS(graph.graph, pos, ngram) {
        stk_push(pending, ngram);
    }

    for (uint32_t done = 0; done < total; done++) {
        char *next_key = (char *)stk_pop(pending);
        convert_neighbours(graph, next_key);
    }
    stk_destroy(pending);
}
static inline void
relate_bigram(const char *a,
              const char *b,
              graph_t graph) {
    /* Teach the graph that b follows a: make sure a is cached, register
     * b as one of its neighbours, then count the observation.
     */
    const char *const left = get_ngram(a, graph);

    insert_neighbour(left, b, graph);
    increment_neighbour(left, b, graph);
}
static int
transition_cmp(const void *keyval,
               const void *datum) {
    /* bsearch() comparator: keyval points to the drawn float, datum to a
     * probability_t bucket. Returns 0 when the number lies inside
     * [lower, upper], -1 when it is below the bucket and 1 otherwise.
     */
    const float needle = *((float *)keyval);
    const probability_t *candidate = (probability_t *)datum;
    const float low = candidate->bucket.lower;
    const float high = candidate->bucket.upper;

    if (needle < low) {
        return -1;
    }
    return ((needle >= low) && (needle <= high)) ? 0 : 1;
}
static inline char*
pick_random_transition(unique_keys_t unique_neighbours) {
    /* Pick one of the keys uniformly at random.
     *
     * Exits with EXIT_FAILURE when there are no keys: there is nothing
     * valid to return, and indexing the empty array would read out of
     * bounds.
     */
    const size_t num = unique_neighbours.number;
    if (num == 0) {
        printf("No keys with transitions to choose from\n");
        exit(EXIT_FAILURE);
    }
    /* drand48() is in [0, 1), so scaling by num covers every index
     * 0 .. num-1 (scaling by num-1 could never reach the last key).
     */
    size_t selection = (size_t)(drand48() * (double)num);
    if (selection >= num) {
        /* Guard against rounding in the multiplication */
        selection = num - 1;
    }
    return unique_neighbours.keys[selection];
}
static inline char*
next_ngram(graph_t graph,
           char *start,
           unique_keys_t unique_neighbours) {
    /* Choose the n-gram that follows start.
     *
     * If start has no outgoing transitions a random key is picked
     * instead. Otherwise a number is drawn and the bucket containing it
     * is located with a binary search over the cumulative buckets.
     */
    markov_trans_t *transitions = get_prob_neighbours(graph, start);
    if (transitions->number == 0) {
        return pick_random_transition(unique_neighbours);
    }
    probability_t *buckets = transitions->transitions;
    const size_t bucket_size = transitions->number;
    float chosen = (float)drand48();
    probability_t *result = bsearch(&chosen,
                                    buckets,
                                    bucket_size,
                                    sizeof (probability_t),
                                    transition_cmp);
    if (result == NULL) {
        /* The buckets start at 0 and are contiguous, so a miss means the
         * draw lies above the last upper bound (float rounding of the
         * cumulative sum). The last bucket is the intended target.
         */
        result = &buckets[bucket_size - 1];
    }
    return ((char *)result->bucket.token);
}
lst_List
generate_strings(markov_chain_t markov_chain,
                 char *start,
                 uint32_t n) {
    /* Walk the chain for n steps beginning at start and return the
     * visited n-grams, in order, as a newly created list. The list
     * stores the graph's string pointers; it does not copy them.
     */
    lst_List sequence = lst_create();
    char *ngram = start;
    uint32_t remaining = n;

    while (remaining > 0) {
        lst_append(sequence, ngram);
        ngram = next_ngram(markov_chain.graph, ngram, markov_chain.unique);
        remaining--;
    }
    return sequence;
}
static inline unique_keys_t
get_all_keys(graph_t graph) {
    /* Collect every key that has at least one outgoing transition.
     * The entries are read as markov_trans_t, so this should only be
     * called after the graph has been generated and converted.
     * The returned array is heap-allocated; the key strings themselves
     * are the graph's pointers, not copies.
     */
    unique_keys_t result;
    void *pos, *ngram;
    markov_trans_t *entry;
    uint32_t filled = 0;
    const unsigned long capacity = numberof_transitionable(graph);
    char **collected = xcalloc(sizeof (char *), capacity);

    CHECK(collected);
    HSH_ITERATE(graph.graph, pos, ngram, entry) {
        if (entry->number > 0) {
            collected[filled] = ngram;
            filled++;
        }
    }

    result.number = filled;
    result.keys = collected;
    return result;
}
static inline graph_t
make_graph(void) {
    /* Build a graph whose string cache and adjacency table are both
     * freshly created, empty hash tables.
     */
    graph_t fresh;

    fresh.cache = hsh_create(NULL, NULL);
    fresh.graph = hsh_create(NULL, NULL);
    return fresh;
}
static inline void
release_converted_graph(graph_t graph) {
    /* Free a converted graph: for every entry release the bucket array,
     * the markov_trans_t record and the key string, then destroy both
     * hash tables.
     */
    void *pos, *ngram;
    markov_trans_t *entry;

    HSH_ITERATE(graph.graph, pos, ngram, entry) {
        xfree(entry->transitions);
        xfree(entry);
        xfree(ngram);
    }

    hsh_destroy(graph.cache);
    hsh_destroy(graph.graph);
}
markov_chain_t
build_markov_chain(token_stream tokens) {
    /* Build a markov chain from a token stack.
     *
     * Tokens are consumed from the top of the stack, so each step sees
     * the most recently pushed token and the one pushed just before it;
     * the pair is recorded as "earlier is followed by later". The stream
     * struct is received by value, so the caller's length is untouched.
     */
    markov_chain_t chain;

    chain.graph = make_graph();
    while (tokens.length > 1) {
        const token_t later = peek_token(&tokens);
        pop_token(&tokens);
        const token_t earlier = peek_token(&tokens);
        relate_bigram(token_to_string(earlier), token_to_string(later), chain.graph);
    }

    convert_all_neighbours(chain.graph);
    chain.unique = get_all_keys(chain.graph);
    return chain;
}
char *
token_to_string(token_t token) {
    /* Return the textual form of token as a NUL-terminated string.
     *
     * For WORD, INTEGER, FLOATING, PAREN and STRING tokens this is the
     * string stored in the token. For QUOTE tokens it is a pointer to a
     * static buffer holding the quote character. EMPTY and unknown
     * token types terminate the program.
     */
    /* Callers treat the result as a C string (strlen, "%s"), so the
     * quote character needs a terminator; the address of the lone
     * `quote` char is not a valid string.
     */
    static char quote_string[2];

    switch (token.token_type) {
    case WORD:
        return (char*)token.token.word;
    case INTEGER:
        return (char*)token.token.integer;
    case FLOATING:
        return (char*)token.token.floating;
    case QUOTE:
        quote_string[0] = quote;
        quote_string[1] = '\0';
        return quote_string;
    case PAREN:
        return (char*)token.token.parenthesis;
    case EMPTY:
        printf("should not be here\n");
        exit(EXIT_FAILURE);
        break;
    case STRING:
        return (char*)token.token.string;
    default:
        printf("oops, there was an unknown token, check valgrind or gdb\n");
        exit(EXIT_FAILURE);
    }
}
void
release_markov_chain(markov_chain_t chain) {
    /* Free everything owned by chain: the converted graph first, then
     * the array of transitionable keys.
     */
    char **key_array = chain.unique.keys;

    release_converted_graph(chain.graph);
    xfree(key_array);
}
int
main (void) {
void *test_input = xmalloc(555000);
size_t nbytes = read(STDIN_FILENO, test_input, 555000);
if (nbytes == 0) {
exit(EXIT_FAILURE);
}
token_stream test_bigrams_stack = tokenize(test_input, 0, nbytes);
markov_chain_t chain = build_markov_chain(test_bigrams_stack);
srand48(time(NULL));
lst_List test = generate_strings(chain, token_to_string(peek_token(&test_bigrams_stack)), LEN);
lst_pop(test);
for (uint32_t i = 0; i < LEN-1; i++) {
printf("%s ", (char *)lst_pop(test));
}
printf("\n");
lst_destroy(test);
_lst_shutdown();
release_markov_chain(chain);
xfree(test_input);
release_tokens(&test_bigrams_stack);
return EXIT_SUCCESS;
}

78
markov.h

@ -0,0 +1,78 @@
/*
 * The markov graph: two hash tables keyed by n-gram strings.
 */
typedef
struct {
/* Maps an n-gram string to its single heap-allocated copy (see get_ngram) */
hsh_HashTable cache;
/* Maps an n-gram to a neighbours_t while the graph is being built, and to
 * a markov_trans_t once convert_neighbours has run for that key
 */
hsh_HashTable graph;
}
graph_t;
/*
 * Array of the keys that have at least one outgoing transition
 * (filled by get_all_keys).
 */
typedef
struct {
/* Number of entries in keys */
size_t number;
char **keys;
}
unique_keys_t;
/*
 * A converted graph together with its list of transitionable keys,
 * as returned by build_markov_chain.
 */
typedef
struct {
graph_t graph;
unique_keys_t unique;
}
markov_chain_t;
/*
 * Per-key record used while the graph is being built.
 */
typedef
struct {
/* Maps a neighbour string to a heap-allocated uint32_t frequency */
hsh_HashTable neighbours;
/* Total observations: incremented on every insert_neighbour call */
size_t number;
/* Number of distinct neighbours */
size_t unique_num;
}
neighbours_t;
/*
 * Transition types.
 * A transition is first stored as a raw frequency (transition_t) and is
 * later rewritten in place as a probability interval (bucket_t); the
 * probability_t union lets one array hold either form.
 */
/* Probability interval [lower, upper] that selects token */
typedef
struct {
float upper;
float lower;
const char *token;
}
bucket_t;
/* How many times token was observed as a neighbour */
typedef
struct {
uint32_t frequency;
const char *token;
}
transition_t;
/* Either form of a transition; convert_neighbours fills `frequent`
 * first and then overwrites each element as `bucket`
 */
typedef
union {
transition_t frequent;
bucket_t bucket;
}
probability_t;
/* The converted transitions of one key */
typedef
struct {
/* Number of buckets in transitions */
size_t number;
probability_t *transitions;
}
markov_trans_t;
/* Build a markov chain from a token stream. The stream is taken by value. */
markov_chain_t
build_markov_chain(token_stream);
/* Return the string form of a token; exits on EMPTY or unknown tokens. */
char *
token_to_string(token_t);
/* Free the graph and the key array of a chain made by build_markov_chain. */
void
release_markov_chain(markov_chain_t);
/* Walk the chain from the given start string for the given number of
 * steps and return the visited strings as a list.
 */
lst_List
generate_strings(markov_chain_t,
char *,
uint32_t);

23
roadnottaken

@ -0,0 +1,23 @@
Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth;
Then took the other, as just as fair,
And having perhaps the better claim,
Because it was grassy and wanted wear;
Though as for that the passing there
Had worn them really about the same,
And both that morning equally lay
In leaves no step had trodden black.
Oh, I kept the first for another day!
Yet knowing how way leads on to way,
I doubted if I should ever come back.
I shall be telling this with a sigh
Somewhere ages and ages hence:
Two roads diverged in a wood, and I—
I took the one less traveled by,
And that has made all the difference.

551
tokenize.c

@ -0,0 +1,551 @@
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
#include "error.h"
#include "maa.h"
#include "tokenize.h"
/*
* This is a basic s-expression tokenizer
* it also tokenizes things like number, string, and symbol literals
*/
/* Token of type EMPTY; peek_token returns it when the stack length is
 * 0 or exceeds max_length
 */
const token_t nulltok = {
.token_type = EMPTY,
{
.null_token=false
}
};
/* Token pushed for an apostrophe in the source */
static const token_t quote_tok = {
.token_type = QUOTE,
.token= {
.quote=true
}
};
/* Token pushed for '(' */
static const token_t left_paren = {
.token_type = PAREN,
.token = {
.parenthesis="("
}
};
/* Token pushed for ')' */
static const token_t right_paren = {
.token_type = PAREN,
.token = {
.parenthesis=")"
}
};
static inline const char *
string_head(uint32_t n,
            const char *in,
            char *out) {
    /* Copy the first n characters of in to out and NUL-terminate it.
     * out needs room for n characters plus the terminator.
     * In debug builds n must be in 1 .. strlen(in).
     * Returns out.
     */
#ifndef NDEBUG
    const size_t source_len = strlen(in);
#endif
    int written;

    assert((n > 0 && n <= source_len));
    written = snprintf(out, (size_t)n+1 , "%s", in);
    /* snprintf reports the untruncated length, i.e. strlen(in) */
    assert((written != -1) && ((size_t)written == source_len));
    if (written == -1) {
        printf("Out of memory");
        exit(EXIT_FAILURE);
    }
    return (const char*)out;
}
static inline token_t
make_token(token_val_t val,
           tok_t toktype) {
    /* Bundle a token value with its type tag. */
    token_t made;

    made.token = val;
    made.token_type = toktype;
    return made;
}
bool
push_token(token_stream *tokens,
           token_t token) {
    /*
     * Push token on top of the stack, growing the backing array by
     * GROWTH_FACTOR when it is full. Exits if tokens is NULL or the
     * array cannot be grown. Always returns true.
     */
    size_t used;
    size_t capacity;

    CHECK(tokens);
    used = tokens->length;
    capacity = tokens->max_length;
    assert(used <= capacity);
    assert(capacity > 0);

    if (used == capacity) {
        /* The stack is full: enlarge it before storing the token */
        token_t *bigger = xrealloc(tokens->tokens, sizeof (token_t) * (capacity * GROWTH_FACTOR));
        if (!bigger) {
            printf("Could not allocate enough memory for the token stack\n");
            exit(EXIT_FAILURE);
        }
        tokens->tokens = bigger;
        tokens->max_length = capacity * GROWTH_FACTOR;
    }

    tokens->tokens[used] = token;
    tokens->length++;
    return true;
}
bool
pop_token(token_stream *tokens) {
    /* Drop the top token by shrinking the stack length by one.
     * The stack must not be empty (asserted). Always returns true.
     */
    CHECK(tokens);
    assert(tokens->length != 0);
    CHECK(tokens->tokens);

    tokens->length -= 1;
    return true;
}
inline token_t
peek_token(token_stream *tokens) {
    /*
     * Return the token on top of the stack without removing it.
     * Exits if tokens is NULL. Returns nulltok when the stack is empty
     * or its length exceeds max_length (an empty stack also trips an
     * assert in debug builds).
     */
    /* Validate the pointer before reading through it */
    CHECK(tokens);
    const size_t len = tokens->length;
    const size_t max = tokens->max_length;
    assert(len != 0);
    if (len == 0 || len > max) {
        return nulltok;
    }
    return tokens->tokens[len-1];
}
static inline uint32_t
match_int(source_t source,
          uint32_t begin,
          const uint32_t length) {
    /* Match an optionally signed run of decimal digits starting at begin.
     * Returns false (0) if there is no digit to match,
     * otherwise the position one past the last digit.
     */
    uint32_t i = begin;
    uint32_t first_digit;
    CHECK(source);
    assert(length > 0);
    /* Only look at the sign position if it is inside the input */
    if (i < length &&
        (source[i] == '+' ||
         source[i] == '-')) {
        i++;
    }
    first_digit = i;
    /* <ctype.h> functions require a value representable as unsigned char;
     * a plain char may be negative for non-ASCII input
     */
    while (i < length &&
           isdigit((unsigned char)source[i])) {
        i++;
    }
    if (i == first_digit) {
        return false;
    }
    return i;
}
static inline uint32_t
match_float(source_t source,
uint32_t begin,
const uint32_t length) {
/* Match a floating point literal starting at begin.
 * Returns false (0) if there is no match, otherwise the position one
 * past the end of the match (never 0, so it cannot be confused with false).
 *
 * Steps:
 * 1. An optional leading integer is consumed with match_int.
 * 2. The next character must be '.', or a '+'/'-' immediately
 *    followed by '.' (the sign is then skipped); otherwise there is
 *    no match.
 * 3. After the '.', match_int must succeed (it accepts an optional
 *    sign itself); its result is returned.
 *
 * NOTE(review): source[i] and source[i+1] are read with i up to
 * length, so this relies on the buffer being readable one byte past
 * the input - confirm against callers.
 */
uint32_t i, leading_int_match, trailing_int_match;
CHECK(source);
assert(length > 0);
i = begin;
leading_int_match = match_int(source, i, length);
if (leading_int_match) {
i = leading_int_match;
}
assert(i <= length);
/* The two sign comparisons are already implied by source[i] != '.' */
if (source[i] != '.' ||
source[i] == '+' ||
source[i] == '-') {
if (((i+1) <= length) && /* Make sure there is at least two characters to look at */
((source[i] == '+') ||
(source[i] == '-'))
&& (source[i+1] == '.')) {
/* Skip the sign so that i is on the decimal point */
i++;
}
else {
return false;
}
}
/* Step over the decimal point */
i++;
trailing_int_match = match_int(source, i, length);
if (trailing_int_match) {
return trailing_int_match;
}
return false;
}
static inline uint32_t
match_word(source_t source,
           uint32_t begin,
           const uint32_t length) {
    /* Match a run of characters that are neither whitespace nor a
     * parenthesis, starting at begin.
     * Returns false (0) if no character matches,
     * otherwise the position one past the end of the run.
     */
    uint32_t i = begin;
    CHECK(source);
    assert(length > 0);
    /* Cast for <ctype.h>: a plain char may be negative for non-ASCII
     * bytes, which isspace() must not be given
     */
    while (i < length &&
           !(source[i] == '(' ||
             source[i] == ')' ||
             isspace((unsigned char)source[i]))) {
        i++;
    }
    if (i == begin) {
        return false;
    }
    assert(i <= length);
    return i;
}
static inline uint32_t
is_empty_string(const char *source,
                uint32_t length) {
    /* Returns true when source starts with a double quote and at least
     * one of its first `length` characters is not whitespace; returns
     * false otherwise.
     *
     * NOTE(review): despite the name, the result is true for strings
     * that contain non-space characters, and the opening quote itself
     * counts as one, so any quoted input with length > 0 yields true.
     * Confirm the intended meaning before relying on it.
     */
    int found_non_space = false;
    if (source[0] != '\"') {
        return false;
    }
    for (uint32_t i = 0; i < length; i++) {
        /* Cast for <ctype.h>: plain char may be negative */
        if (!isspace((unsigned char)source[i])) {
            found_non_space = true;
        }
    }
    return found_non_space;
}
static inline uint32_t
match_string(source_t source,
             uint32_t begin,
             const uint32_t length) {
    /* Match a double-quoted string starting at begin.
     * The closing quote must appear inside the input and within
     * MAX_STRING_SIZE characters of begin, and the string must not be
     * empty ("").
     * Returns false (0) if there is no match, otherwise the position
     * one past the closing quote.
     */
    CHECK(source);
    assert(length > 0);
    uint32_t i = begin;
    if (source[i] != '\"') {
        return false;
    }
    i++;
    /* Bounds are tested before the character is read */
    while ((i < length) &&
           (i < (begin + MAX_STRING_SIZE)) &&
           (source[i] != '\"')) {
        i++;
    }
    /* A closing quote only counts if it lies inside the input */
    if ((i != (begin+1)) &&
        (i < length) &&
        (source[i] == '\"')) {
        return i+1;
    }
    return false;
}
static inline void
extract_token(uint32_t position,
              uint32_t begin,
              const source_t source,
              const char *token_val) {
    /* Copy the characters source[begin .. position) into token_val.
     * token_val is written through, so it must point to writable
     * storage of at least (position - begin) + 1 bytes.
     */
    char *destination = (char *)token_val;

    assert(position > begin);
    string_head(position - begin, source + begin, destination);
}
token_stream
tokenize(source_t source,
uint32_t begin,
const uint32_t length) {
/*
* Remember to free everything from this struct
* for example, token_stack.tokens will not necessarily be
* equal to tokens after this function has run
*
*/
uint32_t position = begin;
uint32_t allspace = false;
const char *current_token_val;
token_stream token_stack;
token_val_t current_token;
token_t *tokens = xcalloc(STACK_SIZE, sizeof (token_t));
hsh_HashTable token_memo = hsh_create(NULL, NULL);
assert(begin == 0);
assert(length > 0);
CHECK(source);
token_stack.length = 0;
token_stack.max_length = STACK_SIZE;
token_stack.tokens = tokens;
token_stack.memo = token_memo;
char lookahead = '\0';
assert(STACK_SIZE > 0);
while (begin <= length && source[begin]) {
/* Possibly matched a string
* First look for closing "
* Then look for a newline to close it if no "
* Then stop after some large constant of characters maybe?
* We're dealing with real text so people might forget to close
* quotations, so we have to be clever about it and use heuristics (for performance)
*/
if (source[begin] == '(') {
/*Matched a left paren */
position = begin + 1;
push_token(&token_stack, left_paren);
}
else if (source[begin] == ')') {
/*Matched a left paren */
position = begin + 1;
push_token(&token_stack, right_paren);
}
else if (isspace(source[begin])) {
position = begin + 1;
/* Matched a whitespace character */
}
else if ((position = match_string(source, begin, length))) {
/* Possibly matched a string
* First look for closing "
* Then look for a newline to close it if no "
* Then stop after some large constant of characters maybe?
* We're dealing with real text so people might forget to close
* quotations, so we have to be clever about it and use heuristics (for performance)
*/
lookahead = source[position];
source[position] = '\0';
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
current_token.string = current_token_val;
source[position] = lookahead;
allspace = false;
}
else {
assert(position > begin);
current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
CHECK(current_token_val);
extract_token(position, begin, source, current_token_val);
if ((allspace = is_empty_string(current_token_val, (position - begin) + 1))) {
source[position] = lookahead;
current_token.string = current_token_val;
}
else {
source[position] = lookahead;
hsh_insert(token_stack.memo, current_token_val, current_token_val);
current_token.string = current_token_val;
}
}
if (allspace) {
push_token(&token_stack, make_token(current_token, STRING));
}
}
else if ((position = match_float(source, begin, length))) {
/* Matched a float */
lookahead = source[position];
source[position] = '\0';
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
current_token.floating = current_token_val;
source[position] = lookahead;
}
else {
assert(position > begin);
current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
CHECK(current_token_val);
extract_token(position, begin, source, current_token_val);
source[position] = lookahead;
hsh_insert(token_stack.memo, current_token_val, current_token_val);
current_token.floating = current_token_val;
}
push_token(&token_stack, make_token(current_token, FLOATING));
}
else if ((position = match_int(source, begin, length))) {
/* Matched an int */
lookahead = source[position];
source[position] = '\0';
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
current_token.integer = current_token_val;
source[position] = lookahead;
}
else {
assert(position > begin);
assert(position <= length);
current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
CHECK(current_token_val);
extract_token(position, begin, source, current_token_val);
source[position] = lookahead;
hsh_insert(token_stack.memo, current_token_val, current_token_val);
current_token.integer = current_token_val;
}
push_token(&token_stack, make_token(current_token, INTEGER));
}
else if (source[begin] == '\'') {
/* Matched a quote (apostrophe) */
position = begin + 1;
push_token(&token_stack, quote_tok);
}
else if ((position = match_word(source, begin, length))) {
/* Matched a word */
lookahead = source[position];
source[position] = '\0';
if ((current_token_val = hsh_retrieve(token_stack.memo, source+begin))) {
current_token.word = current_token_val;
source[position] = lookahead;
}
else {
assert(position > begin);
assert(position <= length);
current_token_val = xcalloc(((position - begin) + 1), sizeof (char));
CHECK(current_token_val);
extract_token(position, begin, source, current_token_val);
source[position] = lookahead;
hsh_insert(token_stack.memo, current_token_val, current_token_val);
current_token.word = current_token_val;
}
push_token(&token_stack, make_token(current_token, WORD));
/* Matched a word */
}
else if (position <= begin) {
printf("Source is too large to read\n");
exit(EXIT_FAILURE);
}
else {
printf("Unmatched token\n");
exit(EXIT_FAILURE);
}
begin = position;
}
return token_stack;
}
int
free_token(const void *key,
           const void *val) {
    /* hsh_iterate() callback that frees one memoized token string.
     * key and val point to the same data, so only val is freed.
     *
     * Returns 0: hsh_iterate() stops at the first callback that returns
     * non-zero, so returning true here would free only one entry and
     * leak all the others.
     */
    (void)key;
    xfree((char *)val);
    return 0;
}
bool
release_tokens(token_stream *tokens) {
    /* Release a token stream: the token array itself, then every string
     * held in the memo table (via free_token), then the table.
     * Exits if tokens or its array is NULL. Always returns true.
     */
    hsh_HashTable memo;

    CHECK(tokens);
    CHECK(tokens->tokens);
    assert(tokens->max_length > 0);

    memo = tokens->memo;
    xfree(tokens->tokens);
    hsh_iterate(memo, free_token);
    hsh_destroy(memo);
    return true;
}
#ifndef TOK_LIB
/* Stand-alone driver: tokenize standard input and print every token,
 * starting from the top of the token stack (i.e. last token first).
 */
int main(void) {
    enum { SOURCE_CAPACITY = 111000 };
    /* One spare byte: tokenize() reads and temporarily overwrites
     * source[length], so the buffer must be NUL-terminated past the input
     */
    char *source_code = malloc(SOURCE_CAPACITY + 1);
    size_t nbytes = 0;
    CHECK(source_code);
    /* read() may return less than requested; keep going until EOF */
    while (nbytes < SOURCE_CAPACITY) {
        ssize_t got = read(STDIN_FILENO, source_code + nbytes, SOURCE_CAPACITY - nbytes);
        if (got < 0) {
            printf("Failed to read input\n");
            exit(EXIT_FAILURE);
        }
        if (got == 0) {
            break;
        }
        nbytes += (size_t)got;
    }
    if (nbytes == 0) {
        exit(EXIT_FAILURE);
    }
    source_code[nbytes] = '\0';
    token_stream toks = tokenize(source_code, 0, (uint32_t)nbytes);
    token_t current_tok;
    while (toks.length > 0) {
        current_tok = peek_token(&toks);
        /* tok_t has no SYMBOL member, so there is no symbol case */
        switch (current_tok.token_type) {
        case WORD:
            printf("identifer: %s\n", current_tok.token.word);
            break;
        case INTEGER:
            printf("integer: %s\n", current_tok.token.integer);
            break;
        case FLOATING:
            printf("floating: %s\n", current_tok.token.floating);
            break;
        case QUOTE:
            printf("quote: '\n");
            break;
        case WSPACE:
            printf("whitespace\n");
            break;
        case PAREN:
            printf("paren: %s\n", current_tok.token.parenthesis);
            break;
        case EMPTY:
            printf("this should not be empty\n");
            break;
        case STRING:
            printf("string: %s\n", current_tok.token.string);
            break;
        default:
            printf("oops, there was an unknown token, check valgrind or gdb\n");
        }
        pop_token(&toks);
    }
    release_tokens(&toks);
    free(source_code);
    return 0;
}
#endif

73
tokenize.h

@ -0,0 +1,73 @@
/* Initial capacity (in tokens) of the token stack allocated by tokenize */
#define STACK_SIZE 4096
/* Factor by which push_token multiplies the capacity of a full stack */
#define GROWTH_FACTOR 2
/* match_string stops scanning for a closing quote this many characters
 * after the opening one
 */
#define MAX_STRING_SIZE 30
/* Source text handed to the tokenizer; it is not const because tokenize
 * writes to it temporarily
 */
typedef char* source_t;
/* Kinds of token. */
typedef enum {
WORD = 1,
INTEGER = 2,
FLOATING = 3,
QUOTE = 4,
WSPACE = 5,
PAREN = 6 ,
EMPTY = 7,
STRING = 8
} tok_t;
/* Payload of a token; which member is valid depends on the tok_t tag.
 * The string members point to text owned elsewhere (the memo table or
 * string literals), not to storage owned by the token.
 */
typedef union {
const char *word;
const char *integer;
const char *floating;
const char *parenthesis;
const char *string;
bool quote;
bool null_token;
} token_val_t;
/* A tagged token: type plus payload. */
typedef struct {
tok_t token_type;
token_val_t token;
} token_t;
/* A growable stack of tokens plus the table of memoized token strings. */
typedef struct {
size_t length; /* Number of current elements */
size_t max_length; /* Maximum length of the stack */
token_t *tokens;
hsh_HashTable memo; /* Maps token text to its allocated copy */
} token_stream;
/* Push a token on the stack, growing it if necessary. */
bool
push_token(token_stream*, token_t);
/* Remove the top token from the stack. */
bool
pop_token(token_stream*);
/* Return the top token without removing it. */
token_t
peek_token(token_stream*);
/* Tokenize the given source from the given start index up to the given
 * length and return the resulting token stack.
 */
token_stream
tokenize(source_t, uint32_t, const uint32_t);
/* Free the token array and all memoized token strings of a stream. */
bool
release_tokens(token_stream*);
/* The matcher prototypes are only declared for the stand-alone build */
#ifndef TOK_LIB
static uint32_t
match_int(source_t, uint32_t, const uint32_t);
static uint32_t
match_float(source_t, uint32_t, const uint32_t);
static uint32_t
match_word(source_t, uint32_t, const uint32_t);
static uint32_t
match_string(source_t, uint32_t, const uint32_t);
#endif
/* hsh_iterate callback used by release_tokens to free one token string. */
int
free_token(const void *,
const void *);
/* NOTE(review): no definition of testfunc is visible in this chunk;
 * confirm whether the declaration is still needed.
 */
token_t
testfunc(void);