[med-svn] [hat-trie] 01/07: Imported Upstream version 0.0~git25f9e946
Sascha Steinbiss
satta at debian.org
Wed Jun 22 17:12:31 UTC 2016
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository hat-trie.
commit 81e9ce8ed8a695d64de04d5c9a20f1971f0d3c43
Author: Sascha Steinbiss <satta at debian.org>
Date: Wed Jun 22 16:45:55 2016 +0000
Imported Upstream version 0.0~git25f9e946
---
.gitignore | 21 ++
.travis.yml | 6 +
COPYING | 19 ++
Makefile.am | 10 +
README.md | 47 +++
TODO | 6 +
configure.ac | 34 ++
hat-trie-0.1.pc.in | 12 +
m4/.gitignore | 0
src/Makefile.am | 11 +
src/ahtable.c | 564 ++++++++++++++++++++++++++++++++
src/ahtable.h | 115 +++++++
src/common.h | 22 ++
src/hat-trie.c | 711 +++++++++++++++++++++++++++++++++++++++++
src/hat-trie.h | 74 +++++
src/misc.c | 46 +++
src/misc.h | 22 ++
src/murmurhash3.c | 77 +++++
src/murmurhash3.h | 12 +
src/pstdint.h | 813 +++++++++++++++++++++++++++++++++++++++++++++++
test/Makefile.am | 15 +
test/bench_sorted_iter.c | 69 ++++
test/check_ahtable.c | 222 +++++++++++++
test/check_hattrie.c | 270 ++++++++++++++++
test/str_map.c | 241 ++++++++++++++
test/str_map.h | 54 ++++
26 files changed, 3493 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6a9dcba
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,21 @@
+*.la
+*.lo
+*.o
+*~
+.DS_Store
+.deps
+.libs
+Makefile
+Makefile.in
+aclocal.m4
+autom4te.cache
+config.*
+configure
+depcomp
+hat-trie-*.pc
+hat-trie-*.tar.gz
+install-sh
+libtool
+ltmain.sh
+m4
+missing
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..4c10ab2
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,6 @@
+language: c
+compiler:
+ - clang
+ - gcc
+before_script: autoreconf -i
+script: ./configure && make && make check
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..bbc6dc3
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,19 @@
+Copyright (C) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..9df925f
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,10 @@
+
+SUBDIRS = src test
+
+EXTRA_DIST = README.md COPYING
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = hat-trie-0.1.pc
+
+ACLOCAL_AMFLAGS=-I m4
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f0bee4f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,47 @@
+
+Hat-Trie
+========
+
+[![Build Status](https://travis-ci.org/dcjones/hat-trie.svg)](https://travis-ci.org/dcjones/hat-trie)
+
+This is an ANSI C99 implementation of the HAT-trie data structure of Askitis and
+Sinha, an extremely efficient (space and time) modern variant of tries.
+
+The version implemented here maps arrays of bytes to words (i.e., unsigned
+longs), which can be used to store counts, pointers, etc, or not used at all if
+you simply want to maintain a set of unique strings.
+
+For details see,
+
+ 1. Askitis, N., & Sinha, R. (2007). HAT-trie: a cache-conscious trie-based data
+ structure for strings. Proceedings of the thirtieth Australasian conference on
+ Computer science-Volume 62 (pp. 97–105). Australian Computer Society, Inc.
+
+ 2. Askitis, N., & Zobel, J. (2005). Cache-conscious collision resolution in
+ string hash tables. String Processing and Information Retrieval (pp.
+ 91–102). Springer.
+
+
+Installation
+------------
+
+ git clone git@github.com:dcjones/hat-trie.git
+ cd hat-trie
+ autoreconf -i
+ ./configure
+ make install
+
+To use the library, include `hat-trie.h` and link using `-lhat-trie`.
+
+
+Tests
+-----
+
+Build and run the tests:
+
+ make check
+
+Other Language Bindings
+-----------------------
+ * Ruby - https://github.com/luikore/triez
+ * Python - https://github.com/kmike/hat-trie
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..be8bd3a
--- /dev/null
+++ b/TODO
@@ -0,0 +1,6 @@
+
+todo:
+ * Deletion in ahtable.
+ * Deletion in hattrie.
+
+
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..870b786
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,34 @@
+
+AC_INIT([hat-trie], [0.1.0], [dcjones at cs.washington.edu])
+AM_INIT_AUTOMAKE([foreign])
+m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])
+AC_CONFIG_MACRO_DIR([m4])
+
+base_CFLAGS="-std=c99 -Wall -Wextra -pedantic"
+opt_CFLAGS="${base_CFLAGS} -O3"
+dbg_CFLAGS="${base_CFLAGS} -g -O0"
+
+AC_ARG_ENABLE([debugging],
+ [AS_HELP_STRING([--enable-debugging],
+ [enable debugging info (default is no)])],
+ [], [enable_debugging=no])
+
+AS_IF([test "x$enable_debugging" = xyes],
+ [CFLAGS="$dbg_CFLAGS"],
+ [CFLAGS="$opt_CFLAGS"])
+
+
+AC_PROG_CC
+AC_PROG_CPP
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AC_PROG_MAKE_SET
+AC_DISABLE_SHARED
+AC_PROG_LIBTOOL
+
+AC_C_BIGENDIAN([AC_MSG_ERROR([Big-endian systems are not currently supported.])])
+AC_HEADER_STDBOOL
+
+AC_CONFIG_FILES([hat-trie-0.1.pc Makefile src/Makefile test/Makefile])
+AC_OUTPUT
+
diff --git a/hat-trie-0.1.pc.in b/hat-trie-0.1.pc.in
new file mode 100644
index 0000000..b694008
--- /dev/null
+++ b/hat-trie-0.1.pc.in
@@ -0,0 +1,12 @@
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: @PACKAGE_NAME@
+Description: An efficient trie implementation.
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lhat-trie
+
diff --git a/m4/.gitignore b/m4/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..942bc65
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,11 @@
+
+lib_LTLIBRARIES = libhat-trie.la
+
+libhat_trie_la_SOURCES = common.h \
+ ahtable.h ahtable.c \
+ hat-trie.h hat-trie.c \
+ misc.h misc.c \
+ murmurhash3.h murmurhash3.c
+
+pkginclude_HEADERS = hat-trie.h ahtable.h common.h pstdint.h
+
diff --git a/src/ahtable.c b/src/ahtable.c
new file mode 100644
index 0000000..c0f6fb3
--- /dev/null
+++ b/src/ahtable.c
@@ -0,0 +1,564 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ * See ahtable.h for description of the Array Hash Table.
+ *
+ */
+
+#include "ahtable.h"
+#include "misc.h"
+#include "murmurhash3.h"
+#include <assert.h>
+#include <string.h>
+
+const double ahtable_max_load_factor = 100000.0; /* arbitrary large number => don't resize */
+const size_t ahtable_initial_size = 4096;
+
+/* Decode the length prefix stored at the start of the entry `s`.
+ *
+ * The low bit of the first byte is a flag: when set, the prefix occupies two
+ * bytes (a native-order uint16_t); when clear, it occupies one byte. In both
+ * cases the key length is the stored value shifted right by one.
+ *
+ * Entries are packed back to back inside a slot, so `s` has no particular
+ * alignment; the two-byte prefix is therefore copied out with memcpy rather
+ * than read through a cast to uint16_t*.
+ */
+static size_t keylen(slot_t s)
+{
+    if (0x1 & *s) {
+        uint16_t prefix;
+        memcpy(&prefix, s, sizeof prefix);
+        return (size_t) (prefix >> 1);
+    }
+
+    return (size_t) (*s >> 1);
+}
+
+
+/* Create an empty table with the default number of slots
+ * (ahtable_initial_size).
+ *
+ * Declared with (void) so the definition matches the prototype in ahtable.h
+ * instead of being an unprototyped, old-style definition. */
+ahtable_t* ahtable_create(void)
+{
+    return ahtable_create_n(ahtable_initial_size);
+}
+
+
+/* Create an empty table with `n` slots.
+ *
+ * Both per-slot arrays (buffers and sizes) are zero-filled, and the
+ * flag/c0/c1 fields that ahtable.h reserves for hat-trie are cleared.
+ * `n` should be non-zero: lookups reduce the hash modulo the slot count.
+ */
+ahtable_t* ahtable_create_n(size_t n)
+{
+    const size_t slots_bytes = n * sizeof(slot_t);
+    const size_t sizes_bytes = n * sizeof(size_t);
+
+    ahtable_t* t = malloc_or_die(sizeof(ahtable_t));
+
+    t->flag = 0;
+    t->c0 = '\0';
+    t->c1 = '\0';
+
+    t->n = n;
+    t->m = 0;
+    t->max_m = (size_t) (ahtable_max_load_factor * (double) n);
+
+    t->slots = malloc_or_die(slots_bytes);
+    memset(t->slots, 0, slots_bytes);
+
+    t->slot_sizes = malloc_or_die(sizes_bytes);
+    memset(t->slot_sizes, 0, sizes_bytes);
+
+    return t;
+}
+
+
+/* Release a table: every slot buffer, both per-slot arrays, and the table
+ * struct itself. Passing NULL is a no-op. */
+void ahtable_free(ahtable_t* table)
+{
+    if (table != NULL) {
+        size_t slot = 0;
+        while (slot < table->n) {
+            free(table->slots[slot]);
+            ++slot;
+        }
+        free(table->slots);
+        free(table->slot_sizes);
+        free(table);
+    }
+}
+
+
+/* Number of key/value pairs currently recorded in the table. */
+size_t ahtable_size(const ahtable_t* table)
+{
+    const size_t stored = table->m;
+    return stored;
+}
+
+
+/* Memory accounted to the table, in bytes: the table struct, one slot
+ * pointer and one size counter per slot, plus the recorded size of every
+ * slot. */
+size_t ahtable_sizeof(const ahtable_t* table)
+{
+    size_t total = sizeof(ahtable_t);
+    total += table->n * (sizeof(size_t) + sizeof(slot_t));
+
+    const size_t* sz = table->slot_sizes;
+    const size_t* const end = sz + table->n;
+    for (; sz != end; ++sz) {
+        total += *sz;
+    }
+    return total;
+}
+
+
+/* Remove all entries and shrink the table back to ahtable_initial_size slots.
+ *
+ * The key count and the resize threshold are reset together with the slot
+ * arrays; otherwise ahtable_size() and the sorted iterator, which both trust
+ * table->m, would keep seeing the pre-clear count.
+ */
+void ahtable_clear(ahtable_t* table)
+{
+    size_t i;
+    for (i = 0; i < table->n; ++i) free(table->slots[i]);
+
+    table->n = ahtable_initial_size;
+    table->m = 0;
+    table->max_m = (size_t) (ahtable_max_load_factor * (double) table->n);
+
+    table->slots = realloc_or_die(table->slots, table->n * sizeof(slot_t));
+    memset(table->slots, 0, table->n * sizeof(slot_t));
+
+    table->slot_sizes = realloc_or_die(table->slot_sizes, table->n * sizeof(size_t));
+    memset(table->slot_sizes, 0, table->n * sizeof(size_t));
+}
+
+/** Writes a new entry for `key` at `s` and returns a pointer to the space
+ * immediately after it.
+ *
+ * Layout written: length prefix (one byte for len < 128, otherwise two),
+ * the `len` key bytes, then a value_t initialised to zero. `*val` is set to
+ * point at that value. The caller must have reserved enough room at `s`.
+ *
+ * The two-byte prefix stores (len << 1) | 1 in 16 bits, so only keys shorter
+ * than 32768 bytes can be represented.
+ */
+static slot_t ins_key(slot_t s, const char* key, size_t len, value_t** val)
+{
+    /* a longer key would be silently truncated by the 16-bit prefix */
+    assert(len < 32768);
+
+    // key length
+    if (len < 128) {
+        s[0] = (unsigned char) (len << 1);
+        s += 1;
+    }
+    else {
+        /* The least significant bit is set to indicate that two bytes are
+         * being used to store the key length. `s` may be unaligned, so the
+         * prefix is stored with memcpy instead of through a uint16_t*. */
+        uint16_t prefix = (uint16_t) (((uint16_t) len << 1) | 0x1);
+        memcpy(s, &prefix, sizeof prefix);
+        s += 2;
+    }
+
+    // key
+    memcpy(s, key, len * sizeof(unsigned char));
+    s += len;
+
+    // value
+    *val = (value_t*) s;
+    **val = 0;
+    s += sizeof(value_t);
+
+    return s;
+}
+
+
+/* Double the number of slots and rehash every entry into the new layout.
+ *
+ * Works in two passes over the old table: the first computes how many bytes
+ * each new slot will need, the second copies the entries into buffers
+ * allocated at exactly that size. The old slot buffers and arrays are freed
+ * at the end, so value pointers handed out earlier become invalid.
+ */
+static void ahtable_expand(ahtable_t* table)
+{
+ /* Resizing a table is essentially building a brand new one.
+ * One little shortcut we can take on the memory allocation front is to
+ * figure out how much memory each slot needs in advance.
+ */
+ assert(table->n > 0);
+ size_t new_n = 2 * table->n;
+ size_t* slot_sizes = malloc_or_die(new_n * sizeof(size_t));
+ memset(slot_sizes, 0, new_n * sizeof(size_t));
+
+ /* pass 1: accumulate the byte size of every new slot */
+ const char* key;
+ size_t len = 0;
+ size_t m = 0;
+ ahtable_iter_t* i = ahtable_iter_begin(table, false);
+ while (!ahtable_iter_finished(i)) {
+ key = ahtable_iter_key(i, &len);
+ /* entry size = key bytes + value + one- or two-byte length prefix */
+ slot_sizes[hash(key, len) % new_n] +=
+ len + sizeof(value_t) + (len >= 128 ? 2 : 1);
+
+ ++m;
+ ahtable_iter_next(i);
+ }
+ assert(m == table->m);
+ ahtable_iter_free(i);
+
+
+ /* allocate slots */
+ slot_t* slots = malloc_or_die(new_n * sizeof(slot_t));
+ size_t j;
+ for (j = 0; j < new_n; ++j) {
+ if (slot_sizes[j] > 0) {
+ slots[j] = malloc_or_die(slot_sizes[j]);
+ }
+ else slots[j] = NULL;
+ }
+
+ /* rehash values. A few shortcuts can be taken here as well, as we know
+ * there will be no collisions. Instead of the regular insertion routine,
+ * we keep track of the ends of every slot and simply insert keys.
+ * */
+ /* slots_next[h] is the write cursor for new slot h; it starts at the
+ * beginning of the slot's buffer and advances past each inserted entry */
+ slot_t* slots_next = malloc_or_die(new_n * sizeof(slot_t));
+ memcpy(slots_next, slots, new_n * sizeof(slot_t));
+ size_t h;
+ m = 0;
+ value_t* u;
+ value_t* v;
+ i = ahtable_iter_begin(table, false);
+ while (!ahtable_iter_finished(i)) {
+
+ key = ahtable_iter_key(i, &len);
+ h = hash(key, len) % new_n;
+
+ /* ins_key zeroes the new value; overwrite it with the old one */
+ slots_next[h] = ins_key(slots_next[h], key, len, &u);
+ v = ahtable_iter_val(i);
+ *u = *v;
+
+ ++m;
+ ahtable_iter_next(i);
+ }
+ assert(m == table->m);
+ ahtable_iter_free(i);
+
+
+ /* drop the old storage and install the new arrays */
+ free(slots_next);
+ for (j = 0; j < table->n; ++j) free(table->slots[j]);
+
+ free(table->slots);
+ table->slots = slots;
+
+ free(table->slot_sizes);
+ table->slot_sizes = slot_sizes;
+
+ table->n = new_n;
+ table->max_m = (size_t) (ahtable_max_load_factor * (double) table->n);
+}
+
+
+/* Find `key` in the table and return a pointer to its value.
+ *
+ * When the key is absent: with insert_missing set, a new entry with a zero
+ * value is appended to the key's slot (growing it with realloc_or_die) and a
+ * pointer to that value is returned; otherwise NULL is returned.
+ *
+ * Inserting may expand the table or reallocate the slot, so value pointers
+ * obtained earlier can be invalidated.
+ */
+static value_t* get_key(ahtable_t* table, const char* key, size_t len, bool insert_missing)
+{
+ /* if we are at capacity, preemptively resize */
+ if (insert_missing && table->m >= table->max_m) {
+ ahtable_expand(table);
+ }
+
+
+ /* slot index: hash of the key modulo the number of slots */
+ uint32_t i = hash(key, len) % table->n;
+ size_t k;
+ slot_t s;
+ value_t* val;
+
+ /* search the array for our key */
+ s = table->slots[i];
+ while ((size_t) (s - table->slots[i]) < table->slot_sizes[i]) {
+ /* get the key length */
+ k = keylen(s);
+ s += k < 128 ? 1 : 2;
+
+ /* skip keys whose length differs from ours */
+ if (k != len) {
+ s += k + sizeof(value_t);
+ continue;
+ }
+
+ /* key found. */
+ if (memcmp(s, key, len) == 0) {
+ return (value_t*) (s + len);
+ }
+ /* same length but different bytes: move on to the next entry. */
+ else {
+ s += k + sizeof(value_t);
+ continue;
+ }
+ }
+
+
+ if (insert_missing) {
+ /* the key was not found, so we must insert it. */
+ size_t new_size = table->slot_sizes[i];
+ new_size += 1 + (len >= 128 ? 1 : 0); // key length
+ new_size += len * sizeof(unsigned char); // key
+ new_size += sizeof(value_t); // value
+
+ table->slots[i] = realloc_or_die(table->slots[i], new_size);
+
+ ++table->m;
+ /* append the entry at the old end of the slot */
+ ins_key(table->slots[i] + table->slot_sizes[i], key, len, &val);
+ table->slot_sizes[i] = new_size;
+
+ return val;
+ }
+ else return NULL;
+}
+
+
+/* Look up `key`, inserting it (with a zero value) when it is absent, and
+ * return a pointer to the stored value. */
+value_t* ahtable_get(ahtable_t* table, const char* key, size_t len)
+{
+    const bool insert_missing = true;
+    return get_key(table, key, len, insert_missing);
+}
+
+
+/* Look up `key` without modifying the table; returns a pointer to the stored
+ * value, or NULL when the key is absent. */
+value_t* ahtable_tryget(ahtable_t* table, const char* key, size_t len )
+{
+    const bool insert_missing = false;
+    return get_key(table, key, len, insert_missing);
+}
+
+
+/* Remove `key` from the table.
+ *
+ * Returns 0 if the key was found and removed, -1 if it was not present.
+ * The slot's buffer is not reallocated: the entries that follow are moved
+ * down over the removed one and only the recorded slot size shrinks.
+ */
+int ahtable_del(ahtable_t* table, const char* key, size_t len)
+{
+ uint32_t i = hash(key, len) % table->n;
+ size_t k;
+ slot_t s;
+
+ /* search the array for our key */
+ s = table->slots[i];
+ while ((size_t) (s - table->slots[i]) < table->slot_sizes[i]) {
+ /* get the key length */
+ k = keylen(s);
+ s += k < 128 ? 1 : 2;
+
+ /* skip keys whose length differs from ours */
+ if (k != len) {
+ s += k + sizeof(value_t);
+ continue;
+ }
+
+ /* key found. */
+ if (memcmp(s, key, len) == 0) {
+ /* close the gap: t marks the end of this entry, s is rewound to its
+ * start, and the rest of the slot is moved down over it */
+ unsigned char* t = s + len + sizeof(value_t);
+ s -= k < 128 ? 1 : 2;
+ memmove(s, t, table->slot_sizes[i] - (size_t) (t - table->slots[i]));
+ table->slot_sizes[i] -= (size_t) (t - s);
+ --table->m;
+ return 0;
+ }
+ /* same length but different bytes: move on to the next entry. */
+ else {
+ s += k + sizeof(value_t);
+ continue;
+ }
+ }
+
+ // Key was not found. Do nothing.
+ return -1;
+}
+
+
+
+/* qsort comparator over an array of entry pointers (slot_t): orders entries
+ * by key bytes, and by key length when one key is a prefix of the other. */
+static int cmpkey(const void* a_, const void* b_)
+{
+    slot_t lhs = *(slot_t*) a_;
+    slot_t rhs = *(slot_t*) b_;
+
+    const size_t lhs_len = keylen(lhs);
+    const size_t rhs_len = keylen(rhs);
+
+    /* step over the length prefixes */
+    lhs += lhs_len < 128 ? 1 : 2;
+    rhs += rhs_len < 128 ? 1 : 2;
+
+    size_t common = lhs_len;
+    if (rhs_len < common) common = rhs_len;
+
+    int order = memcmp(lhs, rhs, common);
+    if (order != 0) return order;
+    return (int) lhs_len - (int) rhs_len;
+}
+
+
+/* Sorted/unsorted iterators are kept private and exposed by passing the
+ * sorted flag to ahtable_iter_begin. */
+
+/* State of a sorted iteration: an array of pointers to the table's entries,
+ * ordered with cmpkey, and a cursor into that array. */
+typedef struct ahtable_sorted_iter_t_
+{
+ const ahtable_t* table; // parent
+ slot_t* xs; // pointers to the start of each entry (its length prefix)
+ size_t i; // index of the current entry in xs
+} ahtable_sorted_iter_t;
+
+
+/* Build a sorted iterator: collect a pointer to every entry in the table,
+ * then qsort the pointers with cmpkey. The pointer array is sized from
+ * table->m, so that count must match the number of entries actually stored. */
+static ahtable_sorted_iter_t* ahtable_sorted_iter_begin(const ahtable_t* table)
+{
+    ahtable_sorted_iter_t* it = malloc_or_die(sizeof(ahtable_sorted_iter_t));
+    it->table = table;
+    it->xs = malloc_or_die(table->m * sizeof(slot_t));
+    it->i = 0;
+
+    size_t filled = 0;
+    size_t slot;
+    for (slot = 0; slot < table->n; ++slot) {
+        slot_t cur = table->slots[slot];
+        const slot_t end = table->slots[slot] + table->slot_sizes[slot];
+
+        while (cur < end) {
+            it->xs[filled] = cur;
+            ++filled;
+
+            /* advance past prefix, key bytes and value */
+            size_t klen = keylen(cur);
+            cur += (klen < 128 ? 1 : 2) + klen + sizeof(value_t);
+        }
+    }
+
+    qsort(it->xs, table->m, sizeof(slot_t), cmpkey);
+
+    return it;
+}
+
+
+/* True once the cursor has moved past the last of the table's m entries. */
+static bool ahtable_sorted_iter_finished(ahtable_sorted_iter_t* i)
+{
+    const size_t total = i->table->m;
+    return !(i->i < total);
+}
+
+
+/* Advance to the next entry; does nothing once the iterator is finished. */
+static void ahtable_sorted_iter_next(ahtable_sorted_iter_t* i)
+{
+    if (!ahtable_sorted_iter_finished(i)) {
+        i->i += 1;
+    }
+}
+
+
+/* Release the pointer array and the iterator itself. NULL is accepted. */
+static void ahtable_sorted_iter_free(ahtable_sorted_iter_t* i)
+{
+    if (i != NULL) {
+        free(i->xs);
+        free(i);
+    }
+}
+
+
+/* Return a pointer to the current key (stored without a terminating NUL), or
+ * NULL when the iterator is finished. If `len` is non-NULL it receives the
+ * key length.
+ */
+static const char* ahtable_sorted_iter_key(ahtable_sorted_iter_t* i, size_t* len)
+{
+    if (ahtable_sorted_iter_finished(i)) return NULL;
+
+    slot_t s = i->xs[i->i];
+
+    /* keep the length in a local: reading it back through `len` would
+     * dereference NULL when the caller did not ask for the length */
+    size_t k = keylen(s);
+    if (len) *len = k;
+
+    return (const char*) (s + (k < 128 ? 1 : 2));
+}
+
+
+/* Return a pointer to the current entry's value, or NULL when finished. */
+static value_t* ahtable_sorted_iter_val(ahtable_sorted_iter_t* i)
+{
+    if (ahtable_sorted_iter_finished(i)) return NULL;
+
+    slot_t entry = i->xs[i->i];
+    const size_t klen = keylen(entry);
+    const size_t prefix = klen < 128 ? 1 : 2;
+
+    /* the value sits right after the length prefix and the key bytes */
+    return (value_t*) (entry + prefix + klen);
+}
+
+
+/* State of an unsorted iteration: walks the slots in index order and, within
+ * each slot, the entries in storage order. */
+typedef struct ahtable_unsorted_iter_t_
+{
+ const ahtable_t* table; // parent
+ size_t i; // slot index; equal to table->n once iteration is finished
+ slot_t s; // start of the current entry inside slot i
+} ahtable_unsorted_iter_t;
+
+
+/* Build an unsorted iterator positioned on the first entry of the first
+ * non-empty slot. If every slot is empty the slot index ends up equal to
+ * table->n, which is the "finished" state. */
+static ahtable_unsorted_iter_t* ahtable_unsorted_iter_begin(const ahtable_t* table)
+{
+    ahtable_unsorted_iter_t* it = malloc_or_die(sizeof(ahtable_unsorted_iter_t));
+    it->table = table;
+    it->i = 0;
+
+    while (it->i < it->table->n) {
+        it->s = table->slots[it->i];
+        if (table->slot_sizes[it->i] > 0) break;
+        ++it->i;
+    }
+
+    return it;
+}
+
+
+/* True once the slot index has run past the last slot. */
+static bool ahtable_unsorted_iter_finished(ahtable_unsorted_iter_t* i)
+{
+    const size_t slot_count = i->table->n;
+    return !(i->i < slot_count);
+}
+
+
+/* Advance to the next entry: the following one in the current slot, or the
+ * first entry of the next non-empty slot. When no slot is left the slot
+ * index becomes table->n and the position pointer is set to NULL. */
+static void ahtable_unsorted_iter_next(ahtable_unsorted_iter_t* i)
+{
+    if (ahtable_unsorted_iter_finished(i)) return;
+
+    const ahtable_t* t = i->table;
+
+    /* step over the current entry: length prefix, key bytes, value */
+    const size_t klen = keylen(i->s);
+    i->s += (klen < 128 ? 1 : 2) + klen + sizeof(value_t);
+
+    /* still inside the current slot? then we are done */
+    if ((size_t) (i->s - t->slots[i->i]) < t->slot_sizes[i->i]) return;
+
+    /* otherwise move on to the next non-empty slot, if any */
+    for (++i->i; i->i < t->n; ++i->i) {
+        if (t->slot_sizes[i->i] != 0) break;
+    }
+
+    i->s = i->i < t->n ? t->slots[i->i] : NULL;
+}
+
+
+/* Release an unsorted iterator. Only the iterator struct is freed; the table
+ * it points at is left untouched. */
+static void ahtable_unsorted_iter_free(ahtable_unsorted_iter_t* i)
+{
+ free(i);
+}
+
+
+/* Return a pointer to the current key, or NULL when the iterator is
+ * finished. If `len` is non-NULL it receives the key length. */
+static const char* ahtable_unsorted_iter_key(ahtable_unsorted_iter_t* i, size_t* len)
+{
+    if (ahtable_unsorted_iter_finished(i)) return NULL;
+
+    slot_t entry = i->s;
+
+    /* the flag bit of the first byte tells how wide the length prefix is */
+    const size_t prefix = (0x1 & *entry) ? 2 : 1;
+    const size_t klen = keylen(entry);
+
+    if (len) *len = klen;
+    return (const char*) (entry + prefix);
+}
+
+
+/* Return a pointer to the current entry's value, or NULL when the iterator
+ * is finished. */
+static value_t* ahtable_unsorted_iter_val(ahtable_unsorted_iter_t* i)
+{
+    if (ahtable_unsorted_iter_finished(i)) return NULL;
+
+    slot_t entry = i->s;
+
+    /* the flag bit of the first byte tells how wide the length prefix is */
+    const size_t prefix = (0x1 & *entry) ? 2 : 1;
+    const size_t klen = keylen(entry);
+
+    /* the value follows the prefix and the key bytes */
+    return (value_t*) (entry + prefix + klen);
+}
+
+
+/* Public iterator handle: a tag plus a pointer to whichever private iterator
+ * (sorted or unsorted) ahtable_iter_begin created. */
+struct ahtable_iter_t_
+{
+ bool sorted; // selects which member of `i` is in use
+ union {
+ ahtable_unsorted_iter_t* unsorted;
+ ahtable_sorted_iter_t* sorted;
+ } i;
+};
+
+
+/* Create an iterator over `table`. With `sorted` set the entries are visited
+ * in cmpkey order, otherwise in storage order. Release the iterator with
+ * ahtable_iter_free. */
+ahtable_iter_t* ahtable_iter_begin(const ahtable_t* table, bool sorted)
+{
+    ahtable_iter_t* it = malloc_or_die(sizeof(ahtable_iter_t));
+    it->sorted = sorted;
+
+    if (!sorted) {
+        it->i.unsorted = ahtable_unsorted_iter_begin(table);
+        return it;
+    }
+
+    it->i.sorted = ahtable_sorted_iter_begin(table);
+    return it;
+}
+
+
+/* Advance the iterator, dispatching on how it was created. */
+void ahtable_iter_next(ahtable_iter_t* i)
+{
+    if (!i->sorted) {
+        ahtable_unsorted_iter_next(i->i.unsorted);
+        return;
+    }
+    ahtable_sorted_iter_next(i->i.sorted);
+}
+
+
+/* True when the iterator has no current entry left. */
+bool ahtable_iter_finished(ahtable_iter_t* i)
+{
+    return i->sorted ? ahtable_sorted_iter_finished(i->i.sorted)
+                     : ahtable_unsorted_iter_finished(i->i.unsorted);
+}
+
+
+/* Release the iterator and its private state. NULL is accepted. */
+void ahtable_iter_free(ahtable_iter_t* i)
+{
+    if (i != NULL) {
+        if (!i->sorted) {
+            ahtable_unsorted_iter_free(i->i.unsorted);
+        }
+        else {
+            ahtable_sorted_iter_free(i->i.sorted);
+        }
+        free(i);
+    }
+}
+
+
+/* Return the current key (NULL when finished), passing `len` through to the
+ * underlying sorted or unsorted iterator. */
+const char* ahtable_iter_key(ahtable_iter_t* i, size_t* len)
+{
+    return i->sorted ? ahtable_sorted_iter_key(i->i.sorted, len)
+                     : ahtable_unsorted_iter_key(i->i.unsorted, len);
+}
+
+
+/* Return a pointer to the current entry's value (NULL when finished). */
+value_t* ahtable_iter_val(ahtable_iter_t* i)
+{
+    return i->sorted ? ahtable_sorted_iter_val(i->i.sorted)
+                     : ahtable_unsorted_iter_val(i->i.unsorted);
+}
+
diff --git a/src/ahtable.h b/src/ahtable.h
new file mode 100644
index 0000000..15e8e21
--- /dev/null
+++ b/src/ahtable.h
@@ -0,0 +1,115 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ *
+ * This is an implementation of the 'cache-conscious' hash tables described in,
+ *
+ * Askitis, N., & Zobel, J. (2005). Cache-conscious collision resolution in
+ * string hash tables. String Processing and Information Retrieval (pp.
+ * 91–102). Springer.
+ *
+ * http://naskitis.com/naskitis-spire05.pdf
+ *
+ * Briefly, the idea behind an Array Hash Table is, as opposed to separate
+ * chaining with linked lists, to store keys contiguously in one big array,
+ * thereby improving the caching behavior, and reducing space requirements.
+ *
+ * ahtable keeps a fixed number (array) of slots, each of which contains a
+ * variable number of key/value pairs. Each key is preceded by its length--
+ * one byte for lengths < 128 bytes, and TWO bytes for longer keys. The least
+ * significant bit of the first byte indicates, if set, that the size is two
+ * bytes. The slot number where a key/value pair goes is determined by finding
+ * the murmurhashed integer value of its key, modulus the number of slots.
+ * The number of slots expands in a stepwise fashion when the number of
+ * key/value pairs reaches an arbitrarily large number.
+ *
+ * +-------+-------+-------+-------+-------+-------+
+ * | 0 | 1 | 2 | 3 | ... | N |
+ * +-------+-------+-------+-------+-------+-------+
+ * | | | | |
+ * v | | v v
+ * NULL | | 4html[VALUE] etc.
+ * | v
+ * | 5space[VALUE]4jury[VALUE]
+ * v
+ * 6justice[VALUE]3car[VALUE]4star[VALUE]
+ *
+ */
+
+#ifndef HATTRIE_AHTABLE_H
+#define HATTRIE_AHTABLE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include "pstdint.h"
+#include "common.h"
+
+typedef unsigned char* slot_t;
+
+/* An array hash table: `n` slots, each a byte buffer of packed
+ * (length prefix, key, value) entries, with the used size of every buffer
+ * tracked separately in slot_sizes. */
+typedef struct ahtable_t_
+{
+ /* these fields are reserved for hattrie to fiddle with */
+ uint8_t flag;
+ unsigned char c0;
+ unsigned char c1;
+
+ size_t n; // number of slots
+ size_t m; // number of key/value pairs stored
+ size_t max_m; // number of stored keys before we resize
+
+ size_t* slot_sizes; // bytes in use in each slot (n elements)
+ slot_t* slots; // one buffer of packed entries per slot (n elements)
+} ahtable_t;
+
+extern const double ahtable_max_load_factor;
+extern const size_t ahtable_initial_size;
+
+ahtable_t* ahtable_create (void); // Create an empty hash table.
+ahtable_t* ahtable_create_n (size_t n); // Create an empty hash table, with
+ // n slots reserved.
+
+void ahtable_free (ahtable_t*); // Free all memory used by a table.
+void ahtable_clear (ahtable_t*); // Remove all entries.
+size_t ahtable_size (const ahtable_t*); // Number of stored keys.
+size_t ahtable_sizeof (const ahtable_t*); // Memory used by the table in bytes.
+
+
+/** Find the given key in the table, inserting it if it does not exist, and
+ * returning a pointer to its value.
+ *
+ * This pointer is not guaranteed to be valid after additional calls to
+ * ahtable_get, ahtable_del, ahtable_clear, or other functions that modify the
+ * table.
+ */
+value_t* ahtable_get (ahtable_t*, const char* key, size_t len);
+
+
+/* Find a given key in the table, return a NULL pointer if it does not exist. */
+value_t* ahtable_tryget (ahtable_t*, const char* key, size_t len);
+
+
+int ahtable_del(ahtable_t*, const char* key, size_t len);
+
+
+typedef struct ahtable_iter_t_ ahtable_iter_t;
+
+ahtable_iter_t* ahtable_iter_begin (const ahtable_t*, bool sorted);
+void ahtable_iter_next (ahtable_iter_t*);
+bool ahtable_iter_finished (ahtable_iter_t*);
+void ahtable_iter_free (ahtable_iter_t*);
+const char* ahtable_iter_key (ahtable_iter_t*, size_t* len);
+value_t* ahtable_iter_val (ahtable_iter_t*);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..7a3116a
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,22 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ *
+ * Common typedefs, etc.
+ *
+ */
+
+
+#ifndef HATTRIE_COMMON_H
+#define HATTRIE_COMMON_H
+
+#include "pstdint.h"
+
+// an unsigned int that is guaranteed to be the same size as a pointer
+typedef uintptr_t value_t;
+
+#endif
+
+
diff --git a/src/hat-trie.c b/src/hat-trie.c
new file mode 100644
index 0000000..6121bb7
--- /dev/null
+++ b/src/hat-trie.c
@@ -0,0 +1,711 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ */
+
+#include "hat-trie.h"
+#include "ahtable.h"
+#include "misc.h"
+#include "pstdint.h"
+#include <assert.h>
+#include <string.h>
+
+#define HT_UNUSED(x) x=x
+
+/* maximum number of keys that may be stored in a bucket before it is burst */
+static const size_t MAX_BUCKET_SIZE = 16384;
+#define NODE_MAXCHAR 0xff // 0x7f for 7-bit ASCII
+#define NODE_CHILDS (NODE_MAXCHAR+1)
+
+static const uint8_t NODE_TYPE_TRIE = 0x1;
+static const uint8_t NODE_TYPE_PURE_BUCKET = 0x2;
+static const uint8_t NODE_TYPE_HYBRID_BUCKET = 0x4;
+static const uint8_t NODE_HAS_VAL = 0x8;
+
+
+struct trie_node_t_;
+
+/* Nodes may be trie nodes or buckets. This union allows us to keep a
+ * non-specific pointer to either kind. Both trie_node_t and ahtable_t begin
+ * with a uint8_t flag, so the `flag` member can be read to tell which kind
+ * of node is being pointed at. */
+typedef union node_ptr_
+{
+ ahtable_t* b; // the node viewed as a bucket
+ struct trie_node_t_* t; // the node viewed as a trie node
+ uint8_t* flag; // the node's leading flag byte
+} node_ptr;
+
+
+/* An internal trie node: one child pointer per possible byte value, plus an
+ * optional value for the key that ends exactly at this node. */
+typedef struct trie_node_t_
+{
+ /* NODE_TYPE_TRIE, optionally combined with NODE_HAS_VAL */
+ uint8_t flag;
+
+ /* the value for the key that is consumed on a trie node */
+ value_t val;
+
+ /* Map a character to either a trie_node_t or a ahtable_t. The first byte
+ * must be examined to determine which. */
+ node_ptr xs[NODE_CHILDS];
+
+} trie_node_t;
+
+/* The trie itself: a root node (always a trie node, see hattrie_create) and
+ * a running count of stored keys. */
+struct hattrie_t_
+{
+ node_ptr root; // root node
+ size_t m; // number of stored keys
+};
+
+
+
+/* Number of keys currently recorded in the trie. */
+size_t hattrie_size(const hattrie_t* T)
+{
+    const size_t stored = T->m;
+    return stored;
+}
+
+
+/* Recursively total the memory used by `node` and everything below it.
+ *
+ * A trie node counts its own struct plus each distinct child; consecutive
+ * entries of xs that point at the same child are counted only once. Any
+ * other node is a bucket and is measured with ahtable_sizeof().
+ */
+static size_t node_sizeof(node_ptr node)
+{
+    if (!(*node.flag & NODE_TYPE_TRIE)) {
+        return ahtable_sizeof(node.b);
+    }
+
+    const node_ptr* kids = node.t->xs;
+    size_t total = sizeof(trie_node_t);
+    size_t c;
+    for (c = 0; c < NODE_CHILDS; ++c) {
+        /* skip a child shared with the previous character */
+        if (c > 0 && kids[c].t == kids[c - 1].t) continue;
+        total += node_sizeof(kids[c]);
+    }
+    return total;
+}
+
+
+/* Memory accounted to the whole trie, in bytes: the hattrie_t struct plus
+ * everything reachable from the root. */
+size_t hattrie_sizeof(const hattrie_t* T)
+{
+    size_t total = sizeof(hattrie_t);
+    total += node_sizeof(T->root);
+    return total;
+}
+
+
+/* Allocate a trie node whose NODE_CHILDS child pointers all refer to `child`
+ * (which may be NULL). The node starts out without a value of its own. */
+static trie_node_t* alloc_trie_node(hattrie_t* T, node_ptr child)
+{
+    /* T is accepted so that a custom allocator for the trie can be used. */
+    HT_UNUSED(T); /* unused now */
+
+    trie_node_t* fresh = malloc_or_die(sizeof(trie_node_t));
+    fresh->flag = NODE_TYPE_TRIE;
+    fresh->val = 0;
+
+    node_ptr* slot = fresh->xs;
+    node_ptr* const end = fresh->xs + NODE_CHILDS;
+    while (slot != end) *slot++ = child;
+
+    return fresh;
+}
+
+/* iterate trie nodes until string is consumed or bucket is found
+ *
+ * On entry *p is a trie node and *k / *l describe the remaining key. The walk
+ * follows the child selected by the current key byte for as long as that
+ * child is itself a trie node and more than `brk` bytes remain. On return:
+ *   - *p is the last trie node stepped into (the parent of the result),
+ *   - *k / *l have been advanced past the bytes consumed on the way,
+ *   - the returned node is the child of *p selected by the byte at **k.
+ *
+ * NOTE(review): after the final step with brk == 0, *l is 0 and **k is the
+ * byte just past the end of the key, which is still read to pick a child.
+ * Presumably callers pass keys with a readable byte after them (e.g. a NUL
+ * terminator) -- confirm.
+ */
+static node_ptr hattrie_consume(node_ptr *p, const char **k, size_t *l, unsigned brk)
+{
+ node_ptr node = p->t->xs[(unsigned char) **k];
+ while (*node.flag & NODE_TYPE_TRIE && *l > brk) {
+ ++*k;
+ --*l;
+ *p = node;
+ node = node.t->xs[(unsigned char) **k];
+ }
+
+ /* copy and writeback variables if it's faster */
+
+ assert(*p->flag & NODE_TYPE_TRIE);
+ return node;
+}
+
+/* Return a pointer to the value slot of trie node `n`. If the node did not
+ * hold a value yet, it is marked as holding one and the trie's key count is
+ * incremented. */
+static inline value_t* hattrie_useval(hattrie_t *T, node_ptr n)
+{
+    trie_node_t* t = n.t;
+
+    if ((t->flag & NODE_HAS_VAL) == 0) {
+        t->flag |= NODE_HAS_VAL;
+        ++T->m;
+    }
+
+    return &t->val;
+}
+
+/* Clear the value held by trie node `n`, if any. Returns 0 when a value was
+ * cleared (the key count is decremented) and -1 when the node had none. */
+static inline int hattrie_clrval(hattrie_t *T, node_ptr n)
+{
+    trie_node_t* t = n.t;
+
+    if (!(t->flag & NODE_HAS_VAL)) return -1;
+
+    t->flag &= ~NODE_HAS_VAL;
+    t->val = 0;
+    --T->m;
+    return 0;
+}
+
+/* find node in trie
+ *
+ * Locates the node responsible for `*key` without modifying the trie and
+ * adjusts *key / *len to what is left for that node:
+ *   - empty key: the root trie node is returned as is;
+ *   - the walk ends on a trie node: that node is returned if it holds a
+ *     value, otherwise a node_ptr whose pointer is NULL is returned;
+ *   - the walk ends on a bucket: the bucket is returned; for a pure bucket
+ *     the current character is skipped first, since pure buckets store only
+ *     key suffixes.
+ * The bucket itself is not searched here.
+ */
+static node_ptr hattrie_find(hattrie_t* T, const char **key, size_t *len)
+{
+ node_ptr parent = T->root;
+ assert(*parent.flag & NODE_TYPE_TRIE);
+
+ if (*len == 0) return parent;
+
+ /* brk = 1: stop descending while at least one key byte is still left */
+ node_ptr node = hattrie_consume(&parent, key, len, 1);
+
+ /* if the trie node consumes value, use it */
+ if (*node.flag & NODE_TYPE_TRIE) {
+ if (!(node.t->flag & NODE_HAS_VAL)) {
+ /* no value stored here: signal "not found" with a NULL pointer */
+ node.flag = NULL;
+ }
+ return node;
+ }
+
+ /* pure bucket holds only key suffixes, skip current char */
+ if (*node.flag & NODE_TYPE_PURE_BUCKET) {
+ *key += 1;
+ *len -= 1;
+ }
+
+ /* do not scan bucket, it's not needed for this operation */
+ return node;
+}
+
+/* Create an empty trie: a root trie node whose children all point at one
+ * empty hybrid bucket covering every leading byte 0x00..NODE_MAXCHAR. */
+hattrie_t* hattrie_create()
+{
+    node_ptr bucket;
+    bucket.b = ahtable_create();
+    bucket.b->flag = NODE_TYPE_HYBRID_BUCKET;
+    bucket.b->c0 = 0x00;
+    bucket.b->c1 = NODE_MAXCHAR;
+
+    hattrie_t* trie = malloc_or_die(sizeof(hattrie_t));
+    trie->m = 0;
+    trie->root.t = alloc_trie_node(trie, bucket);
+
+    return trie;
+}
+
+
+/* Free `node` and everything below it. For a trie node, each distinct
+ * non-NULL child is freed first; consecutive xs entries that point at the
+ * same child are visited only once. A bucket is freed with ahtable_free. */
+static void hattrie_free_node(node_ptr node)
+{
+    if (!(*node.flag & NODE_TYPE_TRIE)) {
+        ahtable_free(node.b);
+        return;
+    }
+
+    node_ptr* kids = node.t->xs;
+    size_t c;
+    for (c = 0; c < NODE_CHILDS; ++c) {
+        const int shared_with_prev = c > 0 && kids[c].t == kids[c - 1].t;
+
+        /* XXX: recursion might not be the best choice here. It is possible
+         * to build a very deep trie. */
+        if (!shared_with_prev && kids[c].t != NULL) {
+            hattrie_free_node(kids[c]);
+        }
+    }
+    free(node.t);
+}
+
+
+/* Free the whole trie: every node reachable from the root, then the
+ * hattrie_t struct itself. */
+void hattrie_free(hattrie_t* T)
+{
+    node_ptr root = T->root;
+    hattrie_free_node(root);
+    free(T);
+}
+
+
+/* Remove every key from the trie, leaving it in the same state as a freshly
+ * created one: a root trie node over a single empty hybrid bucket, and a
+ * stored-key count of zero. */
+void hattrie_clear(hattrie_t* T)
+{
+    hattrie_free_node(T->root);
+
+    node_ptr node;
+    node.b = ahtable_create();
+    node.b->flag = NODE_TYPE_HYBRID_BUCKET;
+    node.b->c0 = 0x00;
+    node.b->c1 = NODE_MAXCHAR; /* same upper bound hattrie_create uses */
+    T->root.t = alloc_trie_node(T, node);
+
+    /* every key is gone, so the stored-key counter must restart at zero;
+     * otherwise hattrie_size() keeps reporting the pre-clear count */
+    T->m = 0;
+}
+
+
+/* Perform one split operation on the given node with the given parent.
+ *
+ * `node` must be a bucket and `parent` the trie node that points at it.
+ *
+ * Pure bucket: a new trie node is inserted between parent and bucket, with
+ * all of its children pointing at the bucket; a value stored under the empty
+ * key moves into the new trie node; the bucket is then relabelled as a
+ * hybrid bucket covering 0x00..NODE_MAXCHAR.
+ *
+ * Hybrid bucket: a split character j is chosen, two new buckets are created
+ * for the leading-character ranges [c0, j] and [j + 1, c1], the parent's
+ * child pointers are redirected, every entry is re-inserted into the proper
+ * new bucket, and the old bucket is freed.
+ */
+static void hattrie_split(hattrie_t* T, node_ptr parent, node_ptr node)
+{
+ /* only buckets may be split */
+ assert(*node.flag & NODE_TYPE_PURE_BUCKET ||
+ *node.flag & NODE_TYPE_HYBRID_BUCKET);
+
+ assert(*parent.flag & NODE_TYPE_TRIE);
+
+ if (*node.flag & NODE_TYPE_PURE_BUCKET) {
+ /* turn the pure bucket into a hybrid bucket */
+ parent.t->xs[node.b->c0].t = alloc_trie_node(T, node);
+
+ /* if the bucket had an empty key, move it to the new trie node */
+ value_t* val = ahtable_tryget(node.b, NULL, 0);
+ if (val) {
+ parent.t->xs[node.b->c0].t->val = *val;
+ parent.t->xs[node.b->c0].t->flag |= NODE_HAS_VAL;
+ *val = 0;
+ ahtable_del(node.b, NULL, 0);
+ }
+
+ node.b->c0 = 0x00;
+ node.b->c1 = NODE_MAXCHAR;
+ node.b->flag = NODE_TYPE_HYBRID_BUCKET;
+
+ return;
+ }
+
+ /* This is a hybrid bucket. Perform a proper split. */
+
+ /* count the number of occurrences of every leading character */
+ unsigned int cs[NODE_CHILDS]; // occurrence count for leading chars
+ memset(cs, 0, NODE_CHILDS * sizeof(unsigned int));
+ size_t len;
+ const char* key;
+
+ ahtable_iter_t* i = ahtable_iter_begin(node.b, false);
+ while (!ahtable_iter_finished(i)) {
+ key = ahtable_iter_key(i, &len);
+ assert(len > 0);
+ cs[(unsigned char) key[0]] += 1;
+ ahtable_iter_next(i);
+ }
+ ahtable_iter_free(i);
+
+ /* choose a split point: starting with only c0 on the left, keep moving the
+ * next character to the left side while that does not make the two sides
+ * more unbalanced and does not put every key on the left */
+ unsigned int left_m, right_m, all_m;
+ unsigned char j = node.b->c0;
+ all_m = ahtable_size(node.b);
+ left_m = cs[j];
+ right_m = all_m - left_m;
+ int d;
+
+ while (j + 1 < node.b->c1) {
+ d = abs((int) (left_m + cs[j + 1]) - (int) (right_m - cs[j + 1]));
+ /* NOTE(review): left_m and right_m are unsigned, so the difference
+ * below is computed in unsigned arithmetic before being passed to abs(),
+ * unlike the line above which casts each operand to int first. */
+ if (d <= abs(left_m - right_m) && left_m + cs[j + 1] < all_m) {
+ j += 1;
+ left_m += cs[j];
+ right_m -= cs[j];
+ }
+ else break;
+ }
+
+ /* now split into two nodes corresponding to the ranges [c0, j] and
+ * [j + 1, c1] of the old bucket, respectively. */
+
+
+ /* create new left and right nodes */
+
+ /* TODO: Add a special case if either node is a hybrid bucket containing all
+ * the keys. In such a case, do not build a new table, just use the old one.
+ * */
+ size_t num_slots;
+
+
+ /* pick the smallest slot count (doubling from the initial size) whose
+ * load-factor capacity can hold the left side's keys */
+ for (num_slots = ahtable_initial_size;
+ (double) left_m > ahtable_max_load_factor * (double) num_slots;
+ num_slots *= 2);
+
+ node_ptr left, right;
+ left.b = ahtable_create_n(num_slots);
+ left.b->c0 = node.b->c0;
+ left.b->c1 = j;
+ /* a bucket covering a single character is pure, otherwise hybrid */
+ left.b->flag = left.b->c0 == left.b->c1 ?
+ NODE_TYPE_PURE_BUCKET : NODE_TYPE_HYBRID_BUCKET;
+
+
+ /* same sizing rule for the right side */
+ for (num_slots = ahtable_initial_size;
+ (double) right_m > ahtable_max_load_factor * (double) num_slots;
+ num_slots *= 2);
+
+ right.b = ahtable_create_n(num_slots);
+ right.b->c0 = j + 1;
+ right.b->c1 = node.b->c1;
+ right.b->flag = right.b->c0 == right.b->c1 ?
+ NODE_TYPE_PURE_BUCKET : NODE_TYPE_HYBRID_BUCKET;
+
+
+ /* update the parent's pointer */
+
+ unsigned int c;
+ for (c = node.b->c0; c <= j; ++c) parent.t->xs[c] = left;
+ for (; c <= node.b->c1; ++c) parent.t->xs[c] = right;
+
+
+
+ /* distribute keys to the new left or right node */
+ value_t* u;
+ value_t* v;
+ i = ahtable_iter_begin(node.b, false);
+ while (!ahtable_iter_finished(i)) {
+ key = ahtable_iter_key(i, &len);
+ u = ahtable_iter_val(i);
+ assert(len > 0);
+
+ /* left */
+ if ((unsigned char) key[0] <= j) {
+ /* pure buckets store keys without their leading character */
+ if (*left.flag & NODE_TYPE_PURE_BUCKET) {
+ v = ahtable_get(left.b, key + 1, len - 1);
+ }
+ else {
+ v = ahtable_get(left.b, key, len);
+ }
+ *v = *u;
+ }
+
+ /* right */
+ else {
+ if (*right.flag & NODE_TYPE_PURE_BUCKET) {
+ v = ahtable_get(right.b, key + 1, len - 1);
+ }
+ else {
+ v = ahtable_get(right.b, key, len);
+ }
+ *v = *u;
+ }
+
+ ahtable_iter_next(i);
+ }
+
+ ahtable_iter_free(i);
+ ahtable_free(node.b);
+}
+
+/** Find `key` in the trie, inserting it when absent, and return a pointer to
+ * its value slot.
+ *
+ * NOTE(review): the empty key returns the root's value slot directly instead
+ * of going through hattrie_useval like every other trie-node hit -- confirm
+ * that this is intended.
+ */
+value_t* hattrie_get(hattrie_t* T, const char* key, size_t len)
+{
+    node_ptr parent = T->root;
+    assert(*parent.flag & NODE_TYPE_TRIE);
+
+    /* the empty key is served by the root node itself */
+    if (len == 0) return &parent.t->val;
+
+    /* consume all trie nodes; afterwards `parent` must still be a trie node
+     * while `node` may be anything */
+    node_ptr node = hattrie_consume(&parent, &key, &len, 0);
+    assert(*parent.flag & NODE_TYPE_TRIE);
+
+    for (;;) {
+        /* the key has been consumed entirely: its value sits in a trie node */
+        if (len == 0) {
+            if (*node.flag & NODE_TYPE_TRIE) {
+                return hattrie_useval(T, node);
+            }
+            if (*node.flag & NODE_TYPE_HYBRID_BUCKET) {
+                return hattrie_useval(T, parent);
+            }
+        }
+
+        /* stop as soon as the bucket is below the split threshold */
+        if (ahtable_size(node.b) < MAX_BUCKET_SIZE) break;
+
+        /* preemptively split the full bucket; the split invalidates `node`,
+         * so it is looked up again starting from the parent */
+        hattrie_split(T, parent, node);
+        node = hattrie_consume(&parent, &key, &len, 0);
+    }
+
+    assert(*node.flag & NODE_TYPE_PURE_BUCKET || *node.flag & NODE_TYPE_HYBRID_BUCKET);
+
+    assert(len > 0);
+
+    /* a pure bucket is queried with the key minus its first byte, a hybrid
+     * bucket with the whole remaining key */
+    const bool is_pure = (*node.flag & NODE_TYPE_PURE_BUCKET) != 0;
+    const size_t size_before = node.b->m;
+    value_t* slot = is_pure ? ahtable_get(node.b, key + 1, len - 1)
+                            : ahtable_get(node.b, key, len);
+
+    /* carry any change of the bucket's size over to the trie's key count */
+    T->m += (node.b->m - size_before);
+
+    return slot;
+}
+
+
+/** Look up `key` without inserting it.
+ *
+ * Returns NULL when hattrie_find reports no node for the key; otherwise the
+ * value kept in the trie node, or the result of querying the bucket's table.
+ */
+value_t* hattrie_tryget(hattrie_t* T, const char* key, size_t len)
+{
+    /* key and len are passed by address so hattrie_find can adjust them */
+    node_ptr hit = hattrie_find(T, &key, &len);
+
+    if (hit.flag == NULL) return NULL;
+    if (*hit.flag & NODE_TYPE_TRIE) return &hit.t->val;
+
+    return ahtable_tryget(hit.b, key, len);
+}
+
+
+/** Remove `key` from the trie.
+ *
+ * Returns -1 when hattrie_find locates no node for the key; otherwise the
+ * result of hattrie_clrval (value stored in a trie node) or of ahtable_del
+ * (value stored in a bucket).
+ */
+int hattrie_del(hattrie_t* T, const char* key, size_t len)
+{
+    node_ptr parent = T->root;
+    HT_UNUSED(parent); /* only read by the assert below */
+    assert(*parent.flag & NODE_TYPE_TRIE);
+
+    node_ptr target = hattrie_find(T, &key, &len);
+    if (target.flag == NULL) return -1;
+
+    /* the key ends on a trie node: clear that node's value */
+    if (*target.flag & NODE_TYPE_TRIE) return hattrie_clrval(T, target);
+
+    /* otherwise delete it from the bucket and carry any change of the
+     * bucket's size over to the trie's key count */
+    const size_t size_before = ahtable_size(target.b);
+    const int status = ahtable_del(target.b, key, len);
+    T->m -= (size_before - ahtable_size(target.b));
+
+    /* TODO: merge empty buckets */
+    return status;
+}
+
+
+/* plan for iteration:
+ * This is tricky, as we have no parent pointers currently, and I would like to
+ * avoid adding them. That means maintaining a stack
+ *
+ */
+
+typedef struct hattrie_node_stack_t_
+{ /* one entry of the iterator's stack of nodes still to be visited */
+ unsigned char c; /* index in the parent's xs[] this node was pushed from */
+ size_t level; /* handed to hattrie_iter_pushchar together with c */
+
+ node_ptr node; /* the node to visit */
+ struct hattrie_node_stack_t_* next; /* entry below this one, or NULL */
+
+} hattrie_node_stack_t;
+
+
+struct hattrie_iter_t_
+{ /* traversal state of a hat-trie iterator */
+ char* key; /* buffer in which the current key is assembled */
+ size_t keysize; // space reserved for the key
+ size_t level; /* offset in key where the bucket's sub-key is appended */
+
+ /* keep track of keys stored in trie nodes */
+ bool has_nil_key; /* a trie node's own value is the current item */
+ value_t nil_val; /* copy of that value */
+
+ const hattrie_t* T; /* trie being iterated */
+ bool sorted; /* forwarded to ahtable_iter_begin */
+ ahtable_iter_t* i; /* iterator over the current bucket, or NULL */
+ hattrie_node_stack_t* stack; /* nodes not visited yet */
+};
+
+
+/* Store byte `c` as the character at depth `level` of the key being built
+ * (depth 0 carries no character) and make `level` the current depth. */
+static void hattrie_iter_pushchar(hattrie_iter_t* i, size_t level, char c)
+{
+    if (level > i->keysize) {
+        i->keysize *= 2;
+        i->key = realloc_or_die(i->key, i->keysize * sizeof(char));
+    }
+
+    if (level != 0) i->key[level - 1] = c;
+
+    i->level = level;
+}
+
+
+/* Pop the top entry of the iterator's stack and visit it: a trie node exposes
+ * its own value (if any) and pushes its children; a bucket becomes the table
+ * currently being iterated. Does nothing when the stack is empty. */
+static void hattrie_iter_nextnode(hattrie_iter_t* i)
+{
+    hattrie_node_stack_t* top = i->stack;
+    if (top == NULL) return;
+
+    /* pop the stack */
+    node_ptr node = top->node;
+    const unsigned char c = top->c;
+    const size_t level = top->level;
+    i->stack = top->next;
+    free(top);
+
+    if (!(*node.flag & NODE_TYPE_TRIE)) {
+        /* bucket: a pure bucket contributes its character to the key prefix,
+         * any other bucket leaves the prefix one byte shorter */
+        if (*node.flag & NODE_TYPE_PURE_BUCKET) {
+            hattrie_iter_pushchar(i, level, c);
+        }
+        else {
+            i->level = level - 1;
+        }
+
+        i->i = ahtable_iter_begin(node.b, i->sorted);
+        return;
+    }
+
+    hattrie_iter_pushchar(i, level, c);
+
+    if (node.t->flag & NODE_HAS_VAL) {
+        i->has_nil_key = true;
+        i->nil_val = node.t->val;
+    }
+
+    /* push all child nodes from right to left */
+    int j;
+    for (j = NODE_MAXCHAR; j >= 0; --j) {
+        /* skip repeated pointers to a hybrid bucket: only the last slot of a
+         * run of identical pointers is pushed */
+        const bool repeated = j < NODE_MAXCHAR &&
+                              node.t->xs[j].t == node.t->xs[j + 1].t;
+        if (repeated) continue;
+
+        hattrie_node_stack_t* entry = malloc_or_die(sizeof(hattrie_node_stack_t));
+        entry->node = node.t->xs[j];
+        entry->next = i->stack;
+        entry->level = level + 1;
+        entry->c = (unsigned char) j;
+        i->stack = entry;
+    }
+}
+
+
+/* Create an iterator over T, already advanced to the first item if any. */
+hattrie_iter_t* hattrie_iter_begin(const hattrie_t* T, bool sorted)
+{
+    hattrie_iter_t* it = malloc_or_die(sizeof(hattrie_iter_t));
+    it->T = T;
+    it->sorted = sorted;
+    it->i = NULL;
+    it->level = 0;
+    it->has_nil_key = false;
+    it->nil_val = 0;
+    it->keysize = 16;
+    it->key = malloc_or_die(it->keysize * sizeof(char));
+
+    /* the traversal starts with only the root node on the stack */
+    hattrie_node_stack_t* root = malloc_or_die(sizeof(hattrie_node_stack_t));
+    root->next = NULL;
+    root->node = T->root;
+    root->c = '\0';
+    root->level = 0;
+    it->stack = root;
+    /* visit nodes until an item is available or no node is left */
+    while ((it->i == NULL || ahtable_iter_finished(it->i)) &&
+           !(it->has_nil_key || it->stack == NULL)) {
+        ahtable_iter_free(it->i);
+        it->i = NULL;
+        hattrie_iter_nextnode(it);
+    }
+
+    if (it->i != NULL && ahtable_iter_finished(it->i)) {
+        ahtable_iter_free(it->i);
+        it->i = NULL;
+    }
+    return it;
+}
+
+
+/* Advance to the next item; does nothing once the iterator is finished. */
+void hattrie_iter_next(hattrie_iter_t* i)
+{
+    if (hattrie_iter_finished(i)) return;
+
+    const bool in_bucket = i->i != NULL && !ahtable_iter_finished(i->i);
+    if (in_bucket) ahtable_iter_next(i->i);
+    else if (i->has_nil_key) {
+        /* the current item was a trie node's own value: move past it */
+        i->nil_val = 0;
+        i->has_nil_key = false;
+        hattrie_iter_nextnode(i);
+    }
+
+    while ((i->i == NULL || ahtable_iter_finished(i->i)) &&
+           !(i->has_nil_key || i->stack == NULL)) {
+        ahtable_iter_free(i->i);
+        i->i = NULL;
+        hattrie_iter_nextnode(i);
+    }
+
+    if (i->i != NULL && ahtable_iter_finished(i->i)) {
+        ahtable_iter_free(i->i);
+        i->i = NULL;
+    }
+}
+
+/* Finished means: no stacked node, no open bucket, no pending trie value. */
+bool hattrie_iter_finished(hattrie_iter_t* i)
+{
+    return !(i->stack != NULL || i->i != NULL || i->has_nil_key);
+}
+
+
+/* Free an iterator and everything it owns; a NULL iterator is ignored. */
+void hattrie_iter_free(hattrie_iter_t* i)
+{
+    if (i == NULL) return;
+    if (i->i != NULL) ahtable_iter_free(i->i);
+
+    hattrie_node_stack_t* entry = i->stack;
+    while (entry != NULL) {
+        hattrie_node_stack_t* below = entry->next;
+        free(entry);
+        entry = below;
+    }
+    free(i->key);
+    free(i);
+}
+
+
+/* Return the key of the current item, or NULL once the iterator is finished.
+ * The key is assembled in a buffer owned by the iterator and NUL-terminated;
+ * its length (terminator excluded) is stored in *len when len is non-NULL. */
+const char* hattrie_iter_key(hattrie_iter_t* i, size_t* len)
+{
+    if (hattrie_iter_finished(i)) return NULL;
+
+    /* a value held by a trie node has no bucket part: the key is the prefix */
+    size_t sublen = 0;
+    const char* subkey = NULL;
+    if (!i->has_nil_key) subkey = ahtable_iter_key(i->i, &sublen);
+
+    if (i->keysize < i->level + sublen + 1) {
+        while (i->keysize < i->level + sublen + 1) i->keysize *= 2;
+        i->key = realloc_or_die(i->key, i->keysize * sizeof(char));
+    }
+
+    /* passing a null source to memcpy is undefined even for a zero length */
+    if (sublen > 0) memcpy(i->key + i->level, subkey, sublen);
+    i->key[i->level + sublen] = '\0';
+
+    if (len) *len = i->level + sublen;
+    return i->key;
+}
+
+
+value_t* hattrie_iter_val(hattrie_iter_t* i)
+{
+    /* a pending trie-node value is served from the iterator's own copy */
+    if (i->has_nil_key) return &i->nil_val;
+
+    /* otherwise the value comes from the bucket iterator, if one is left */
+    return hattrie_iter_finished(i) ? NULL : ahtable_iter_val(i->i);
+}
+
+
+
+/* Two iterators compare equal when they refer to the same trie, use the same
+ * `sorted` setting and hold the same bucket iterator pointer. */
+bool hattrie_iter_equal(const hattrie_iter_t* a, const hattrie_iter_t* b)
+{
+    if (a->T != b->T || a->sorted != b->sorted) return false;
+    return a->i == b->i;
+}
diff --git a/src/hat-trie.h b/src/hat-trie.h
new file mode 100644
index 0000000..b6b0653
--- /dev/null
+++ b/src/hat-trie.h
@@ -0,0 +1,74 @@
+/*
+ * This file is part of hat-trie
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ *
+ * This is an implementation of the HAT-trie data structure described in,
+ *
+ * Askitis, N., & Sinha, R. (2007). HAT-trie: a cache-conscious trie-based data
+ * structure for strings. Proceedings of the thirtieth Australasian conference on
+ * Computer science-Volume 62 (pp. 97–105). Australian Computer Society, Inc.
+ *
+ * The HAT-trie is in essence a hybrid data structure, combining tries and hash
+ * tables in a clever way to try to get the best of both worlds.
+ *
+ */
+
+#ifndef HATTRIE_HATTRIE_H
+#define HATTRIE_HATTRIE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "common.h"
+#include <stdlib.h>
+#include <stdbool.h>
+
+typedef struct hattrie_t_ hattrie_t;
+
+hattrie_t* hattrie_create (void); // Create an empty hat-trie.
+void hattrie_free (hattrie_t*); // Free all memory used by a trie.
+hattrie_t* hattrie_dup (const hattrie_t*); // Duplicate an existing trie.
+void hattrie_clear (hattrie_t*); // Remove all entries.
+size_t hattrie_size (const hattrie_t*); // Number of stored keys.
+size_t hattrie_sizeof (const hattrie_t*); // Memory used in structure in bytes.
+
+
+/** Find the given key in the trie, inserting it if it does not exist, and
+ * returning a pointer to its value.
+ *
+ * This pointer is not guaranteed to be valid after additional calls to
+ * hattrie_get, hattrie_del, hattrie_clear, or other functions that modify the
+ * trie.
+ */
+value_t* hattrie_get (hattrie_t*, const char* key, size_t len);
+
+
+/** Find a given key in the table, returning a NULL pointer if it does not
+ * exist. */
+value_t* hattrie_tryget (hattrie_t*, const char* key, size_t len);
+
+/** Delete a given key from trie. Returns 0 if successful or -1 if not found.
+ */
+int hattrie_del(hattrie_t* T, const char* key, size_t len);
+
+typedef struct hattrie_iter_t_ hattrie_iter_t;
+
+hattrie_iter_t* hattrie_iter_begin (const hattrie_t*, bool sorted);
+void hattrie_iter_next (hattrie_iter_t*);
+bool hattrie_iter_finished (hattrie_iter_t*);
+void hattrie_iter_free (hattrie_iter_t*);
+const char* hattrie_iter_key (hattrie_iter_t*, size_t* len);
+value_t* hattrie_iter_val (hattrie_iter_t*);
+
+/* Return true if two iterators are equal. */
+bool hattrie_iter_equal (const hattrie_iter_t* a,
+ const hattrie_iter_t* b);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/misc.c b/src/misc.c
new file mode 100644
index 0000000..0530c34
--- /dev/null
+++ b/src/misc.c
@@ -0,0 +1,46 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ */
+
+#include "misc.h"
+#include <stdlib.h>
+
+
+/* malloc that prints a message and exits instead of reporting a failure */
+void* malloc_or_die(size_t n)
+{
+    void* block = malloc(n);
+    if (block != NULL || n == 0) return block;
+
+    fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
+    exit(EXIT_FAILURE);
+}
+
+
+/* realloc that prints a message and exits instead of reporting a failure */
+void* realloc_or_die(void* ptr, size_t n)
+{
+    void* block = realloc(ptr, n);
+    if (block != NULL || n == 0) return block;
+
+    fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
+    exit(EXIT_FAILURE);
+}
+
+
+/* fopen that prints a message and exits when the file cannot be opened */
+FILE* fopen_or_die(const char* path, const char* mode)
+{
+    FILE* stream = fopen(path, mode);
+    if (stream != NULL) return stream;
+
+    fprintf(stderr, "Cannot open file %s with mode %s.\n", path, mode);
+    exit(EXIT_FAILURE);
+}
+
+
+
+
diff --git a/src/misc.h b/src/misc.h
new file mode 100644
index 0000000..7223b8b
--- /dev/null
+++ b/src/misc.h
@@ -0,0 +1,22 @@
+/*
+ * This file is part of hat-trie.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ * misc :
+ * miscellaneous functions.
+ *
+ */
+
+#ifndef LINESET_MISC_H
+#define LINESET_MISC_H
+
+#include <stdio.h>
+
+void* malloc_or_die(size_t);
+void* realloc_or_die(void*, size_t);
+FILE* fopen_or_die(const char*, const char*);
+
+#endif
+
+
diff --git a/src/murmurhash3.c b/src/murmurhash3.c
new file mode 100644
index 0000000..cb24c8f
--- /dev/null
+++ b/src/murmurhash3.c
@@ -0,0 +1,77 @@
+/* This is MurmurHash3. The original C++ code was placed in the public domain
+ * by its author, Austin Appleby. */
+#include "murmurhash3.h"
+#include <string.h>
+
+/* Finalization mix of MurmurHash3: xor-shifts interleaved with two
+ * multiplications by fixed constants. */
+static inline uint32_t fmix(uint32_t h)
+{
+    h = (h ^ (h >> 16)) * 0x85ebca6b;
+
+    h = (h ^ (h >> 13)) * 0xc2b2ae35;
+
+    return h ^ (h >> 16);
+}
+
+/* rotate x left by r bits (used in this file with r = 13 and r = 15) */
+static inline uint32_t rotl32(uint32_t x, int8_t r)
+{
+    return (x >> (32 - r)) | (x << r);
+}
+
+
+/* Hash `len_` bytes starting at `data` with 32-bit MurmurHash3, using the fixed
+ * seed below. Whole 4-byte blocks are read in native byte order. */
+uint32_t hash(const char* data, size_t len_)
+{
+    const size_t nblocks = len_ / 4;
+
+    uint32_t h1 = 0xc062fb4a;
+
+    const uint32_t c1 = 0xcc9e2d51;
+    const uint32_t c2 = 0x1b873593;
+
+    //----------
+    // body
+
+    size_t i;
+    for (i = 0; i < nblocks; i++)
+    {
+        /* copy each block out with memcpy: `data` carries no alignment
+         * guarantee, and reading it through a uint32_t* is undefined */
+        uint32_t k1;
+        memcpy(&k1, data + i * 4, sizeof k1);
+
+        k1 *= c1;
+        k1 = rotl32(k1, 15);
+        k1 *= c2;
+
+        h1 ^= k1;
+        h1 = rotl32(h1, 13);
+        h1 = h1*5+0xe6546b64;
+    }
+
+    //----------
+    // tail
+
+    const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+    uint32_t k1 = 0;
+
+    switch(len_ & 3)
+    {
+    case 3: k1 ^= tail[2] << 16; /* fallthrough */
+    case 2: k1 ^= tail[1] << 8; /* fallthrough */
+    case 1: k1 ^= tail[0];
+            k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
+    }
+
+    //----------
+    // finalization
+
+    h1 ^= (uint32_t) len_;
+
+    return fmix(h1);
+}
+
diff --git a/src/murmurhash3.h b/src/murmurhash3.h
new file mode 100644
index 0000000..37fbf41
--- /dev/null
+++ b/src/murmurhash3.h
@@ -0,0 +1,12 @@
+
+#ifndef MURMURHASH3_H
+#define MURMURHASH3_H
+
+#include <stdlib.h>
+
+#include "pstdint.h"
+
+uint32_t hash(const char* data, size_t len);
+
+#endif
+
diff --git a/src/pstdint.h b/src/pstdint.h
new file mode 100644
index 0000000..18a26b5
--- /dev/null
+++ b/src/pstdint.h
@@ -0,0 +1,813 @@
+/* A portable stdint.h
+ ****************************************************************************
+ * BSD License:
+ ****************************************************************************
+ *
+ * Copyright (c) 2005-2014 Paul Hsieh
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ****************************************************************************
+ *
+ * Version 0.1.14
+ *
+ * The ANSI C standard committee, for the C99 standard, specified the
+ * inclusion of a new standard include file called stdint.h. This is
+ * a very useful and long desired include file which contains several
+ * very precise definitions for integer scalar types that is
+ * critically important for making portable several classes of
+ * applications including cryptography, hashing, variable length
+ * integer libraries and so on. But for most developers its likely
+ * useful just for programming sanity.
+ *
+ * The problem is that most compiler vendors have decided not to
+ * implement the C99 standard, and the next C++ language standard
+ * (which has a lot more mindshare these days) will be a long time in
+ * coming and its unknown whether or not it will include stdint.h or
+ * how much adoption it will have. Either way, it will be a long time
+ * before all compilers come with a stdint.h and it also does nothing
+ * for the extremely large number of compilers available today which
+ * do not include this file, or anything comparable to it.
+ *
+ * So that's what this file is all about. Its an attempt to build a
+ * single universal include file that works on as many platforms as
+ * possible to deliver what stdint.h is supposed to. A few things
+ * that should be noted about this file:
+ *
+ * 1) It is not guaranteed to be portable and/or present an identical
+ * interface on all platforms. The extreme variability of the
+ * ANSI C standard makes this an impossibility right from the
+ * very get go. Its really only meant to be useful for the vast
+ * majority of platforms that possess the capability of
+ * implementing usefully and precisely defined, standard sized
+ * integer scalars. Systems which are not intrinsically 2s
+ * complement may produce invalid constants.
+ *
+ * 2) There is an unavoidable use of non-reserved symbols.
+ *
+ * 3) Other standard include files are invoked.
+ *
+ * 4) This file may come in conflict with future platforms that do
+ * include stdint.h. The hope is that one or the other can be
+ * used with no real difference.
+ *
+ * 5) In the current version, if your platform can't represent
+ * int32_t, int16_t and int8_t, it just dumps out with a compiler
+ * error.
+ *
+ * 6) 64 bit integers may or may not be defined. Test for their
+ * presence with the test: #ifdef INT64_MAX or #ifdef UINT64_MAX.
+ * Note that this is different from the C99 specification which
+ * requires the existence of 64 bit support in the compiler. If
+ * this is not defined for your platform, yet it is capable of
+ * dealing with 64 bits then it is because this file has not yet
+ * been extended to cover all of your system's capabilities.
+ *
+ * 7) (u)intptr_t may or may not be defined. Test for its presence
+ * with the test: #ifdef PTRDIFF_MAX. If this is not defined
+ * for your platform, then it is because this file has not yet
+ * been extended to cover all of your system's capabilities, not
+ * because its optional.
+ *
+ * 8) The following might not be defined even if your platform is
+ * capable of defining it:
+ *
+ * WCHAR_MIN
+ * WCHAR_MAX
+ * (u)int64_t
+ * PTRDIFF_MIN
+ * PTRDIFF_MAX
+ * (u)intptr_t
+ *
+ * 9) The following have not been defined:
+ *
+ * WINT_MIN
+ * WINT_MAX
+ *
+ * 10) The criteria for defining (u)int_least(*)_t isn't clear,
+ * except for systems which don't have a type that precisely
+ * defined 8, 16, or 32 bit types (which this include file does
+ * not support anyways). Default definitions have been given.
+ *
+ * 11) The criteria for defining (u)int_fast(*)_t isn't something I
+ * would trust to any particular compiler vendor or the ANSI C
+ * committee. It is well known that "compatible systems" are
+ * commonly created that have very different performance
+ * characteristics from the systems they are compatible with,
+ * especially those whose vendors make both the compiler and the
+ * system. Default definitions have been given, but its strongly
+ * recommended that users never use these definitions for any
+ * reason (they do *NOT* deliver any serious guarantee of
+ * improved performance -- not in this file, nor any vendor's
+ * stdint.h).
+ *
+ * 12) The following macros:
+ *
+ * PRINTF_INTMAX_MODIFIER
+ * PRINTF_INT64_MODIFIER
+ * PRINTF_INT32_MODIFIER
+ * PRINTF_INT16_MODIFIER
+ * PRINTF_LEAST64_MODIFIER
+ * PRINTF_LEAST32_MODIFIER
+ * PRINTF_LEAST16_MODIFIER
+ * PRINTF_INTPTR_MODIFIER
+ *
+ * are strings which have been defined as the modifiers required
+ * for the "d", "u" and "x" printf formats to correctly output
+ * (u)intmax_t, (u)int64_t, (u)int32_t, (u)int16_t, (u)least64_t,
+ * (u)least32_t, (u)least16_t and (u)intptr_t types respectively.
+ * PRINTF_INTPTR_MODIFIER is not defined for some systems which
+ * provide their own stdint.h. PRINTF_INT64_MODIFIER is not
+ * defined if INT64_MAX is not defined. These are an extension
+ * beyond what C99 specifies must be in stdint.h.
+ *
+ * In addition, the following macros are defined:
+ *
+ * PRINTF_INTMAX_HEX_WIDTH
+ * PRINTF_INT64_HEX_WIDTH
+ * PRINTF_INT32_HEX_WIDTH
+ * PRINTF_INT16_HEX_WIDTH
+ * PRINTF_INT8_HEX_WIDTH
+ * PRINTF_INTMAX_DEC_WIDTH
+ * PRINTF_INT64_DEC_WIDTH
+ * PRINTF_INT32_DEC_WIDTH
+ * PRINTF_INT16_DEC_WIDTH
+ * PRINTF_INT8_DEC_WIDTH
+ *
+ * Which specifies the maximum number of characters required to
+ * print the number of that type in either hexadecimal or decimal.
+ * These are an extension beyond what C99 specifies must be in
+ * stdint.h.
+ *
+ * Compilers tested (all with 0 warnings at their highest respective
+ * settings): Borland Turbo C 2.0, WATCOM C/C++ 11.0 (16 bits and 32
+ * bits), Microsoft Visual C++ 6.0 (32 bit), Microsoft Visual Studio
+ * .net (VC7), Intel C++ 4.0, GNU gcc v3.3.3
+ *
+ * This file should be considered a work in progress. Suggestions for
+ * improvements, especially those which increase coverage are strongly
+ * encouraged.
+ *
+ * Acknowledgements
+ *
+ * The following people have made significant contributions to the
+ * development and testing of this file:
+ *
+ * Chris Howie
+ * John Steele Scott
+ * Dave Thorup
+ * John Dill
+ * Florian Wobbe
+ * Christopher Sean Morrison
+ *
+ */
+
+#include <stddef.h>
+#include <limits.h>
+#include <signal.h>
+
+/*
+ * For gcc with _STDINT_H, fill in the PRINTF_INT*_MODIFIER macros, and
+ * do nothing else. On the Mac OS X version of gcc this is _STDINT_H_.
+ */
+
+#if ((defined(__STDC__) && __STDC__ && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined (__WATCOMC__) && (defined (_STDINT_H_INCLUDED) || __WATCOMC__ >= 1250)) || (defined(__GNUC__) && (__GNUC__ > 3 || defined(_STDINT_H) || defined(_STDINT_H_) || defined (__UINT_FAST64_TYPE__)) )) && !defined (_PSTDINT_H_INCLUDED)
+#include <stdint.h>
+#define _PSTDINT_H_INCLUDED
+# if defined(__GNUC__) && (defined(__x86_64__) || defined(__ppc64__))
+# ifndef PRINTF_INT64_MODIFIER
+# define PRINTF_INT64_MODIFIER "l"
+# endif
+# ifndef PRINTF_INT32_MODIFIER
+# define PRINTF_INT32_MODIFIER ""
+# endif
+# else
+# ifndef PRINTF_INT64_MODIFIER
+# define PRINTF_INT64_MODIFIER "ll"
+# endif
+# ifndef PRINTF_INT32_MODIFIER
+# define PRINTF_INT32_MODIFIER "l"
+# endif
+# endif
+# ifndef PRINTF_INT16_MODIFIER
+# define PRINTF_INT16_MODIFIER "h"
+# endif
+# ifndef PRINTF_INTMAX_MODIFIER
+# define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER
+# endif
+# ifndef PRINTF_INT64_HEX_WIDTH
+# define PRINTF_INT64_HEX_WIDTH "16"
+# endif
+# ifndef PRINTF_INT32_HEX_WIDTH
+# define PRINTF_INT32_HEX_WIDTH "8"
+# endif
+# ifndef PRINTF_INT16_HEX_WIDTH
+# define PRINTF_INT16_HEX_WIDTH "4"
+# endif
+# ifndef PRINTF_INT8_HEX_WIDTH
+# define PRINTF_INT8_HEX_WIDTH "2"
+# endif
+# ifndef PRINTF_INT64_DEC_WIDTH
+# define PRINTF_INT64_DEC_WIDTH "20"
+# endif
+# ifndef PRINTF_INT32_DEC_WIDTH
+# define PRINTF_INT32_DEC_WIDTH "10"
+# endif
+# ifndef PRINTF_INT16_DEC_WIDTH
+# define PRINTF_INT16_DEC_WIDTH "5"
+# endif
+# ifndef PRINTF_INT8_DEC_WIDTH
+# define PRINTF_INT8_DEC_WIDTH "3"
+# endif
+# ifndef PRINTF_INTMAX_HEX_WIDTH
+# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH
+# endif
+# ifndef PRINTF_INTMAX_DEC_WIDTH
+# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH
+# endif
+
+/*
+ * Something really weird is going on with Open Watcom. Just pull some of
+ * these duplicated definitions from Open Watcom's stdint.h file for now.
+ */
+
+# if defined (__WATCOMC__) && __WATCOMC__ >= 1250
+# if !defined (INT64_C)
+# define INT64_C(x) (x + (INT64_MAX - INT64_MAX))
+# endif
+# if !defined (UINT64_C)
+# define UINT64_C(x) (x + (UINT64_MAX - UINT64_MAX))
+# endif
+# if !defined (INT32_C)
+# define INT32_C(x) (x + (INT32_MAX - INT32_MAX))
+# endif
+# if !defined (UINT32_C)
+# define UINT32_C(x) (x + (UINT32_MAX - UINT32_MAX))
+# endif
+# if !defined (INT16_C)
+# define INT16_C(x) (x)
+# endif
+# if !defined (UINT16_C)
+# define UINT16_C(x) (x)
+# endif
+# if !defined (INT8_C)
+# define INT8_C(x) (x)
+# endif
+# if !defined (UINT8_C)
+# define UINT8_C(x) (x)
+# endif
+# if !defined (UINT64_MAX)
+# define UINT64_MAX 18446744073709551615ULL
+# endif
+# if !defined (INT64_MAX)
+# define INT64_MAX 9223372036854775807LL
+# endif
+# if !defined (UINT32_MAX)
+# define UINT32_MAX 4294967295UL
+# endif
+# if !defined (INT32_MAX)
+# define INT32_MAX 2147483647L
+# endif
+# if !defined (INTMAX_MAX)
+# define INTMAX_MAX INT64_MAX
+# endif
+# if !defined (INTMAX_MIN)
+# define INTMAX_MIN INT64_MIN
+# endif
+# endif
+#endif
+
+#ifndef _PSTDINT_H_INCLUDED
+#define _PSTDINT_H_INCLUDED
+
+#ifndef SIZE_MAX
+# define SIZE_MAX (~(size_t)0)
+#endif
+
+/*
+ * Deduce the type assignments from limits.h under the assumption that
+ * integer sizes in bits are powers of 2, and follow the ANSI
+ * definitions.
+ */
+
+#ifndef UINT8_MAX
+# define UINT8_MAX 0xff
+#endif
+#if !defined(uint8_t) && !defined(_UINT8_T)
+# if (UCHAR_MAX == UINT8_MAX) || defined (S_SPLINT_S)
+ typedef unsigned char uint8_t;
+# define UINT8_C(v) ((uint8_t) v)
+# else
+# error "Platform not supported"
+# endif
+#endif
+
+#ifndef INT8_MAX
+# define INT8_MAX 0x7f
+#endif
+#ifndef INT8_MIN
+# define INT8_MIN INT8_C(0x80)
+#endif
+#if !defined(int8_t) && !defined(_INT8_T)
+# if (SCHAR_MAX == INT8_MAX) || defined (S_SPLINT_S)
+ typedef signed char int8_t;
+# define INT8_C(v) ((int8_t) v)
+# else
+# error "Platform not supported"
+# endif
+#endif
+
+#ifndef UINT16_MAX
+# define UINT16_MAX 0xffff
+#endif
+#if !defined(uint16_t) && !defined(_UINT16_T)
+#if (UINT_MAX == UINT16_MAX) || defined (S_SPLINT_S)
+ typedef unsigned int uint16_t;
+# ifndef PRINTF_INT16_MODIFIER
+# define PRINTF_INT16_MODIFIER ""
+# endif
+# define UINT16_C(v) ((uint16_t) (v))
+#elif (USHRT_MAX == UINT16_MAX)
+ typedef unsigned short uint16_t;
+# define UINT16_C(v) ((uint16_t) (v))
+# ifndef PRINTF_INT16_MODIFIER
+# define PRINTF_INT16_MODIFIER "h"
+# endif
+#else
+#error "Platform not supported"
+#endif
+#endif
+
+#ifndef INT16_MAX
+# define INT16_MAX 0x7fff
+#endif
+#ifndef INT16_MIN
+# define INT16_MIN INT16_C(0x8000)
+#endif
+#if !defined(int16_t) && !defined(_INT16_T)
+#if (INT_MAX == INT16_MAX) || defined (S_SPLINT_S)
+ typedef signed int int16_t;
+# define INT16_C(v) ((int16_t) (v))
+# ifndef PRINTF_INT16_MODIFIER
+# define PRINTF_INT16_MODIFIER ""
+# endif
+#elif (SHRT_MAX == INT16_MAX)
+ typedef signed short int16_t;
+# define INT16_C(v) ((int16_t) (v))
+# ifndef PRINTF_INT16_MODIFIER
+# define PRINTF_INT16_MODIFIER "h"
+# endif
+#else
+#error "Platform not supported"
+#endif
+#endif
+
+#ifndef UINT32_MAX
+# define UINT32_MAX (0xffffffffUL)
+#endif
+#if !defined(uint32_t) && !defined(_UINT32_T)
+#if (ULONG_MAX == UINT32_MAX) || defined (S_SPLINT_S)
+ typedef unsigned long uint32_t;
+# define UINT32_C(v) v ## UL
+# ifndef PRINTF_INT32_MODIFIER
+# define PRINTF_INT32_MODIFIER "l"
+# endif
+#elif (UINT_MAX == UINT32_MAX)
+ typedef unsigned int uint32_t;
+# ifndef PRINTF_INT32_MODIFIER
+# define PRINTF_INT32_MODIFIER ""
+# endif
+# define UINT32_C(v) v ## U
+#elif (USHRT_MAX == UINT32_MAX)
+ typedef unsigned short uint32_t;
+# define UINT32_C(v) ((unsigned short) (v))
+# ifndef PRINTF_INT32_MODIFIER
+# define PRINTF_INT32_MODIFIER ""
+# endif
+#else
+#error "Platform not supported"
+#endif
+#endif
+
+#ifndef INT32_MAX
+# define INT32_MAX (0x7fffffffL)
+#endif
+#ifndef INT32_MIN
+# define INT32_MIN INT32_C(0x80000000)
+#endif
+#if !defined(int32_t) && !defined(_INT32_T)
+#if (LONG_MAX == INT32_MAX) || defined (S_SPLINT_S)
+ typedef signed long int32_t;
+# define INT32_C(v) v ## L
+# ifndef PRINTF_INT32_MODIFIER
+# define PRINTF_INT32_MODIFIER "l"
+# endif
+#elif (INT_MAX == INT32_MAX)
+ typedef signed int int32_t;
+# define INT32_C(v) v
+# ifndef PRINTF_INT32_MODIFIER
+# define PRINTF_INT32_MODIFIER ""
+# endif
+#elif (SHRT_MAX == INT32_MAX)
+ typedef signed short int32_t;
+# define INT32_C(v) ((short) (v))
+# ifndef PRINTF_INT32_MODIFIER
+# define PRINTF_INT32_MODIFIER ""
+# endif
+#else
+#error "Platform not supported"
+#endif
+#endif
+
+/*
+ * The macro stdint_int64_defined is temporarily used to record
+ * whether or not 64 integer support is available. It must be
+ * defined for any 64 integer extensions for new platforms that are
+ * added.
+ */
+
+#undef stdint_int64_defined
+#if (defined(__STDC__) && defined(__STDC_VERSION__)) || defined (S_SPLINT_S)
+# if (__STDC__ && __STDC_VERSION__ >= 199901L) || defined (S_SPLINT_S)
+# define stdint_int64_defined
+ typedef long long int64_t;
+ typedef unsigned long long uint64_t;
+# define UINT64_C(v) v ## ULL
+# define INT64_C(v) v ## LL
+# ifndef PRINTF_INT64_MODIFIER
+# define PRINTF_INT64_MODIFIER "ll"
+# endif
+# endif
+#endif
+
+#if !defined (stdint_int64_defined)
+# if defined(__GNUC__)
+# define stdint_int64_defined
+ __extension__ typedef long long int64_t;
+ __extension__ typedef unsigned long long uint64_t;
+# define UINT64_C(v) v ## ULL
+# define INT64_C(v) v ## LL
+# ifndef PRINTF_INT64_MODIFIER
+# define PRINTF_INT64_MODIFIER "ll"
+# endif
+# elif defined(__MWERKS__) || defined (__SUNPRO_C) || defined (__SUNPRO_CC) || defined (__APPLE_CC__) || defined (_LONG_LONG) || defined (_CRAYC) || defined (S_SPLINT_S)
+# define stdint_int64_defined
+ typedef long long int64_t;
+ typedef unsigned long long uint64_t;
+# define UINT64_C(v) v ## ULL
+# define INT64_C(v) v ## LL
+# ifndef PRINTF_INT64_MODIFIER
+# define PRINTF_INT64_MODIFIER "ll"
+# endif
+# elif (defined(__WATCOMC__) && defined(__WATCOM_INT64__)) || (defined(_MSC_VER) && _INTEGRAL_MAX_BITS >= 64) || (defined (__BORLANDC__) && __BORLANDC__ > 0x460) || defined (__alpha) || defined (__DECC)
+# define stdint_int64_defined
+ typedef __int64 int64_t;
+ typedef unsigned __int64 uint64_t;
+# define UINT64_C(v) v ## UI64
+# define INT64_C(v) v ## I64
+# ifndef PRINTF_INT64_MODIFIER
+# define PRINTF_INT64_MODIFIER "I64"
+# endif
+# endif
+#endif
+
+#if !defined (LONG_LONG_MAX) && defined (INT64_C)
+# define LONG_LONG_MAX INT64_C (9223372036854775807)
+#endif
+#ifndef ULONG_LONG_MAX
+# define ULONG_LONG_MAX UINT64_C (18446744073709551615)
+#endif
+
+#if !defined (INT64_MAX) && defined (INT64_C)
+# define INT64_MAX INT64_C (9223372036854775807)
+#endif
+#if !defined (INT64_MIN) && defined (INT64_C)
+# define INT64_MIN INT64_C (-9223372036854775808)
+#endif
+#if !defined (UINT64_MAX) && defined (INT64_C)
+# define UINT64_MAX UINT64_C (18446744073709551615)
+#endif
+
+/*
+ * Width of hexadecimal for number field.
+ */
+
+#ifndef PRINTF_INT64_HEX_WIDTH
+# define PRINTF_INT64_HEX_WIDTH "16"
+#endif
+#ifndef PRINTF_INT32_HEX_WIDTH
+# define PRINTF_INT32_HEX_WIDTH "8"
+#endif
+#ifndef PRINTF_INT16_HEX_WIDTH
+# define PRINTF_INT16_HEX_WIDTH "4"
+#endif
+#ifndef PRINTF_INT8_HEX_WIDTH
+# define PRINTF_INT8_HEX_WIDTH "2"
+#endif
+
+#ifndef PRINTF_INT64_DEC_WIDTH
+# define PRINTF_INT64_DEC_WIDTH "20"
+#endif
+#ifndef PRINTF_INT32_DEC_WIDTH
+# define PRINTF_INT32_DEC_WIDTH "10"
+#endif
+#ifndef PRINTF_INT16_DEC_WIDTH
+# define PRINTF_INT16_DEC_WIDTH "5"
+#endif
+#ifndef PRINTF_INT8_DEC_WIDTH
+# define PRINTF_INT8_DEC_WIDTH "3"
+#endif
+
+/*
+ * Ok, let's not worry about 128 bit integers for now. Moore's law says
+ * we don't need to worry about that until about 2040 at which point
+ * we'll have bigger things to worry about.
+ */
+#ifdef stdint_int64_defined
+ typedef int64_t intmax_t;
+ typedef uint64_t uintmax_t;
+# define INTMAX_MAX INT64_MAX
+# define INTMAX_MIN INT64_MIN
+# define UINTMAX_MAX UINT64_MAX
+# define UINTMAX_C(v) UINT64_C(v)
+# define INTMAX_C(v) INT64_C(v)
+# ifndef PRINTF_INTMAX_MODIFIER
+# define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER
+# endif
+# ifndef PRINTF_INTMAX_HEX_WIDTH
+# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH
+# endif
+# ifndef PRINTF_INTMAX_DEC_WIDTH
+# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH
+# endif
+#else /* no 64-bit type: the widest type is 32 bits, with the same macro set as above */
+ typedef int32_t intmax_t;
+ typedef uint32_t uintmax_t;
+# define INTMAX_MAX INT32_MAX
+# define INTMAX_MIN INT32_MIN
+# define UINTMAX_MAX UINT32_MAX
+# define UINTMAX_C(v) UINT32_C(v)
+# define INTMAX_C(v) INT32_C(v)
+# ifndef PRINTF_INTMAX_MODIFIER
+# define PRINTF_INTMAX_MODIFIER PRINTF_INT32_MODIFIER
+# endif
+# ifndef PRINTF_INTMAX_HEX_WIDTH
+# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT32_HEX_WIDTH
+# endif
+# ifndef PRINTF_INTMAX_DEC_WIDTH
+# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT32_DEC_WIDTH
+# endif
+#endif
+
+/*
+ * Because this file currently only supports platforms which have
+ * precise powers of 2 as bit sizes for the default integers, the
+ * least definitions are all trivial. It's possible that a future
+ * version of this file could have different definitions.
+ */
+
+#ifndef stdint_least_defined
+ typedef int8_t int_least8_t;
+ typedef uint8_t uint_least8_t;
+ typedef int16_t int_least16_t;
+ typedef uint16_t uint_least16_t;
+ typedef int32_t int_least32_t;
+ typedef uint32_t uint_least32_t;
+# define PRINTF_LEAST32_MODIFIER PRINTF_INT32_MODIFIER
+# define PRINTF_LEAST16_MODIFIER PRINTF_INT16_MODIFIER
+# define UINT_LEAST8_MAX UINT8_MAX
+# define INT_LEAST8_MAX INT8_MAX
+# define UINT_LEAST16_MAX UINT16_MAX
+# define INT_LEAST16_MAX INT16_MAX
+# define UINT_LEAST32_MAX UINT32_MAX
+# define INT_LEAST32_MAX INT32_MAX
+# define INT_LEAST8_MIN INT8_MIN
+# define INT_LEAST16_MIN INT16_MIN
+# define INT_LEAST32_MIN INT32_MIN
+# ifdef stdint_int64_defined /* 64-bit least types exist only when a 64-bit type does */
+ typedef int64_t int_least64_t;
+ typedef uint64_t uint_least64_t;
+# define PRINTF_LEAST64_MODIFIER PRINTF_INT64_MODIFIER
+# define UINT_LEAST64_MAX UINT64_MAX
+# define INT_LEAST64_MAX INT64_MAX
+# define INT_LEAST64_MIN INT64_MIN
+# endif
+#endif
+#undef stdint_least_defined /* internal marker: removed so it does not leak out of this header */
+
+/*
+ * The ANSI C committee pretending to know or specify anything about
+ * performance is the epitome of misguided arrogance. The mandate of
+ * this file is to *ONLY* ever support that absolute minimum
+ * definition of the fast integer types, for compatibility purposes.
+ * No extensions, and no attempt to suggest what may or may not be a
+ * faster integer type will ever be made in this file. Developers are
+ * warned to stay away from these types when using this or any other
+ * stdint.h.
+ */
+/* Hence every fast type and limit below is simply an alias of the matching least one. */
+typedef int_least8_t int_fast8_t;
+typedef uint_least8_t uint_fast8_t;
+typedef int_least16_t int_fast16_t;
+typedef uint_least16_t uint_fast16_t;
+typedef int_least32_t int_fast32_t;
+typedef uint_least32_t uint_fast32_t;
+#define UINT_FAST8_MAX UINT_LEAST8_MAX
+#define INT_FAST8_MAX INT_LEAST8_MAX
+#define UINT_FAST16_MAX UINT_LEAST16_MAX
+#define INT_FAST16_MAX INT_LEAST16_MAX
+#define UINT_FAST32_MAX UINT_LEAST32_MAX
+#define INT_FAST32_MAX INT_LEAST32_MAX
+#define INT_FAST8_MIN INT_LEAST8_MIN
+#define INT_FAST16_MIN INT_LEAST16_MIN
+#define INT_FAST32_MIN INT_LEAST32_MIN
+#ifdef stdint_int64_defined
+ typedef int_least64_t int_fast64_t;
+ typedef uint_least64_t uint_fast64_t;
+# define UINT_FAST64_MAX UINT_LEAST64_MAX
+# define INT_FAST64_MAX INT_LEAST64_MAX
+# define INT_FAST64_MIN INT_LEAST64_MIN
+#endif
+/* stdint_int64_defined is an internal marker; nothing after this point tests it. */
+#undef stdint_int64_defined
+
+/*
+ * Whatever piecemeal, per compiler thing we can do about the wchar_t
+ * type limits.
+ */
+/* NOTE(review): the fallbacks below are only right for an unsigned wchar_t ((wchar_t)-1 is -1 otherwise). */
+#if defined(__WATCOMC__) || defined(_MSC_VER) || defined (__GNUC__)
+# include <wchar.h>
+# ifndef WCHAR_MIN
+# define WCHAR_MIN 0
+# endif
+# ifndef WCHAR_MAX
+# define WCHAR_MAX ((wchar_t)-1)
+# endif
+#endif
+
+/*
+ * Whatever piecemeal, per compiler/platform thing we can do about the
+ * (u)intptr_t types and limits.
+ */
+/* A uintptr_t already supplied by the compiler is detected first, so it is not defined again. */
+#if (defined (_MSC_VER) && defined (_UINTPTR_T_DEFINED)) || defined (_UINTPTR_T)
+# define STDINT_H_UINTPTR_T_DEFINED
+#endif
+/* Otherwise the pointer width is guessed from target macros and every name is built by token pasting. */
+#ifndef STDINT_H_UINTPTR_T_DEFINED
+# if defined (__alpha__) || defined (__ia64__) || defined (__x86_64__) || defined (_WIN64) || defined (__ppc64__)
+# define stdint_intptr_bits 64
+# elif defined (__WATCOMC__) || defined (__TURBOC__)
+# if defined(__TINY__) || defined(__SMALL__) || defined(__MEDIUM__)
+# define stdint_intptr_bits 16
+# else
+# define stdint_intptr_bits 32
+# endif
+# elif defined (__i386__) || defined (_WIN32) || defined (WIN32) || defined (__ppc64__) /* NOTE(review): __ppc64__ cannot match here (taken by the 64-bit test above); 32-bit __ppc__ may have been meant */
+# define stdint_intptr_bits 32
+# elif defined (__INTEL_COMPILER)
+/* TODO -- what did Intel do about x86-64? */
+# else
+/* #error "This platform might not be supported yet" */
+# endif
+/* The glue macro is two-level so that stdint_intptr_bits is expanded before ## pastes it. */
+# ifdef stdint_intptr_bits
+# define stdint_intptr_glue3_i(a,b,c) a##b##c
+# define stdint_intptr_glue3(a,b,c) stdint_intptr_glue3_i(a,b,c)
+# ifndef PRINTF_INTPTR_MODIFIER
+# define PRINTF_INTPTR_MODIFIER stdint_intptr_glue3(PRINTF_INT,stdint_intptr_bits,_MODIFIER)
+# endif
+# ifndef PTRDIFF_MAX
+# define PTRDIFF_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX)
+# endif
+# ifndef PTRDIFF_MIN
+# define PTRDIFF_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN)
+# endif
+# ifndef UINTPTR_MAX
+# define UINTPTR_MAX stdint_intptr_glue3(UINT,stdint_intptr_bits,_MAX)
+# endif
+# ifndef INTPTR_MAX
+# define INTPTR_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX)
+# endif
+# ifndef INTPTR_MIN
+# define INTPTR_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN)
+# endif
+# ifndef INTPTR_C
+# define INTPTR_C(x) stdint_intptr_glue3(INT,stdint_intptr_bits,_C)(x)
+# endif
+# ifndef UINTPTR_C
+# define UINTPTR_C(x) stdint_intptr_glue3(UINT,stdint_intptr_bits,_C)(x)
+# endif
+ typedef stdint_intptr_glue3(uint,stdint_intptr_bits,_t) uintptr_t;
+ typedef stdint_intptr_glue3( int,stdint_intptr_bits,_t) intptr_t;
+# else
+/* TODO -- This following is likely wrong for some platforms, and does
+ nothing for the definition of uintptr_t. */
+ typedef ptrdiff_t intptr_t;
+# endif
+# define STDINT_H_UINTPTR_T_DEFINED
+#endif
+
+/*
+ * Assumes sig_atomic_t is signed and we have a 2s complement machine.
+ * Built as 2*(2^(bits-2) - 1) + 1 so that the shift never reaches the sign bit.
+ */
+#ifndef SIG_ATOMIC_MAX
+# define SIG_ATOMIC_MAX ((((((sig_atomic_t) 1) << (sizeof (sig_atomic_t)*CHAR_BIT-2)) - 1) * 2) + 1)
+#endif
+
+#endif
+
+#if defined (__TEST_PSTDINT_FOR_CORRECTNESS)
+
+/*
+ * Please compile with the maximum warning settings to make sure macros are not
+ * defined more than once.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+/* Two-level paste so that macro arguments are expanded before ## joins them. */
+#define glue3_aux(x,y,z) x ## y ## z
+#define glue3(x,y,z) glue3_aux(x,y,z)
+/* DECLU(n) / DECLI(n) declare a zero-initialised variable un / in of type uintn_t / intn_t. */
+#define DECLU(bits) glue3(uint,bits,_t) glue3(u,bits,) = glue3(UINT,bits,_C) (0);
+#define DECLI(bits) glue3(int,bits,_t) glue3(i,bits,) = glue3(INT,bits,_C) (0);
+/* DECL(U,n) or DECL(I,n) dispatches to DECLU(n) or DECLI(n). */
+#define DECL(us,bits) glue3(DECL,us,) (bits)
+/* TESTUMAX(n) flips every bit of un and reports if the result differs from UINTn_MAX. */
+#define TESTUMAX(bits) glue3(u,bits,) = ~glue3(u,bits,); if (glue3(UINT,bits,_MAX) != glue3(u,bits,)) printf ("Something wrong with UINT%d_MAX\n", bits)
+
+int main () {
+ DECL(I,8)
+ DECL(U,8)
+ DECL(I,16)
+ DECL(U,16)
+ DECL(I,32)
+ DECL(U,32)
+#ifdef INT64_MAX
+ DECL(I,64)
+ DECL(U,64)
+#endif
+ intmax_t imax = INTMAX_C(0);
+ uintmax_t umax = UINTMAX_C(0);
+ char str0[256], str1[256];
+ /* Reference rendering: each zero-valued variable below must print exactly like this. */
+ sprintf (str0, "%d %x\n", 0, ~0);
+
+ sprintf (str1, "%d %x\n", i8, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with i8 : %s\n", str1);
+ sprintf (str1, "%u %x\n", u8, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with u8 : %s\n", str1);
+ sprintf (str1, "%d %x\n", i16, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with i16 : %s\n", str1);
+ sprintf (str1, "%u %x\n", u16, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with u16 : %s\n", str1);
+ sprintf (str1, "%" PRINTF_INT32_MODIFIER "d %x\n", i32, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with i32 : %s\n", str1);
+ sprintf (str1, "%" PRINTF_INT32_MODIFIER "u %x\n", u32, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with u32 : %s\n", str1);
+#ifdef INT64_MAX
+ sprintf (str1, "%" PRINTF_INT64_MODIFIER "d %x\n", i64, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with i64 : %s\n", str1);
+#endif
+ sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "d %x\n", imax, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with imax : %s\n", str1);
+ sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "u %x\n", umax, ~0);
+ if (0 != strcmp (str0, str1)) printf ("Something wrong with umax : %s\n", str1);
+ /* Limit checks: all-ones in each unsigned width must equal its _MAX macro. */
+ TESTUMAX(8);
+ TESTUMAX(16);
+ TESTUMAX(32);
+#ifdef INT64_MAX
+ TESTUMAX(64);
+#endif
+
+ return EXIT_SUCCESS;
+}
+
+#endif
diff --git a/test/Makefile.am b/test/Makefile.am
new file mode 100644
index 0000000..30a5e31
--- /dev/null
+++ b/test/Makefile.am
@@ -0,0 +1,15 @@
+# The headers live in the source tree: -I uses top_srcdir so out-of-tree (VPATH) builds find them.
+TESTS = check_ahtable check_hattrie
+check_PROGRAMS = check_ahtable check_hattrie bench_sorted_iter
+
+check_ahtable_SOURCES = check_ahtable.c str_map.c
+check_ahtable_LDADD = $(top_builddir)/src/libhat-trie.la
+check_ahtable_CPPFLAGS = -I$(top_srcdir)/src
+
+check_hattrie_SOURCES = check_hattrie.c str_map.c
+check_hattrie_LDADD = $(top_builddir)/src/libhat-trie.la
+check_hattrie_CPPFLAGS = -I$(top_srcdir)/src
+
+bench_sorted_iter_SOURCES = bench_sorted_iter.c
+bench_sorted_iter_LDADD = $(top_builddir)/src/libhat-trie.la
+bench_sorted_iter_CPPFLAGS = -I$(top_srcdir)/src
diff --git a/test/bench_sorted_iter.c b/test/bench_sorted_iter.c
new file mode 100644
index 0000000..0271bcb
--- /dev/null
+++ b/test/bench_sorted_iter.c
@@ -0,0 +1,69 @@
+
+/* A quick test of the degree to which ordered iteration is slower than unordered. */
+
+#include "../src/hat-trie.h"
+#include <stdio.h>
+#include <time.h>
+
+
+/* NUL-terminate x at x[len], then fill x[len-1] down to x[0] with random printable ASCII (0x20..0x7e). */
+void randstr(char* x, size_t len)
+{
+    char* cursor = x + len;
+    for (*cursor = '\0'; cursor != x; ) {
+        *--cursor = (char) ('\x20' + rand() % ('\x7e' - '\x20' + 1));
+    }
+}
+
+int main()
+{
+    /* Workload: one million random keys, each 50..499 printable characters long. */
+    const size_t n = 1000000;
+    const size_t m_low = 50;
+    const size_t m_high = 500;
+    char x[501];
+    hattrie_t* T = hattrie_create();
+    hattrie_iter_t* it;
+    clock_t t0, t;
+    const size_t repetitions = 100;
+    size_t i, m, r;
+
+    /* populate the trie; every key simply gets the value 1 */
+    for (i = 0; i < n; ++i) {
+        m = m_low + rand() % (m_high - m_low);
+        randstr(x, m);
+        *hattrie_get(T, x, m) = 1;
+    }
+
+    /* iterate in unsorted order */
+    fprintf(stderr, "iterating out of order ... ");
+    t0 = clock();
+    for (r = repetitions; r > 0; --r) {
+        it = hattrie_iter_begin(T, false);
+        for (; !hattrie_iter_finished(it); hattrie_iter_next(it)) {
+            /* nothing per key: only the traversal itself is timed */
+        }
+        hattrie_iter_free(it);
+    }
+    t = clock();
+    fprintf(stderr, "finished. (%0.2f seconds)\n", (double) (t - t0) / (double) CLOCKS_PER_SEC);
+
+
+    /* iterate in sorted order */
+    fprintf(stderr, "iterating in order ... ");
+    t0 = clock();
+    for (r = repetitions; r > 0; --r) {
+        it = hattrie_iter_begin(T, true);
+        for (; !hattrie_iter_finished(it); hattrie_iter_next(it)) {
+            /* nothing per key: only the traversal itself is timed */
+        }
+        hattrie_iter_free(it);
+    }
+    t = clock();
+    fprintf(stderr, "finished. (%0.2f seconds)\n", (double) (t - t0) / (double) CLOCKS_PER_SEC);
+
+
+    hattrie_free(T);
+
+    return 0;
+}
diff --git a/test/check_ahtable.c b/test/check_ahtable.c
new file mode 100644
index 0000000..f61132b
--- /dev/null
+++ b/test/check_ahtable.c
@@ -0,0 +1,222 @@
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "str_map.h"
+#include "../src/ahtable.h"
+
+/* NUL-terminate x at x[len], then fill x[len-1] down to x[0] with random printable ASCII (0x20..0x7e). */
+void randstr(char* x, size_t len)
+{
+    char* cursor = x + len;
+    for (*cursor = '\0'; cursor != x; ) {
+        *--cursor = (char) ('\x20' + rand() % ('\x7e' - '\x20' + 1));
+    }
+}
+
+
+const size_t n = 100000; // how many unique strings
+const size_t m_low = 50; // minimum length of each string
+const size_t m_high = 500; // exclusive upper bound on string length (setup() draws m_low .. m_high-1)
+const size_t k = 200000; // number of insertions
+char** xs; // the n generated keys; allocated in setup(), freed in teardown()
+
+ahtable_t* T; // table under test
+str_map* M; // reference map holding the expected tally for each key
+
+
+/* Build the fixtures: n random keys in xs, plus an empty ahtable and an empty reference map. */
+void setup()
+{
+    size_t idx, keylen;
+
+    fprintf(stderr, "generating %zu keys ... ", n);
+    xs = malloc(n * sizeof *xs);
+    for (idx = 0; idx < n; ++idx) {
+        keylen = m_low + rand() % (m_high - m_low);
+        xs[idx] = malloc(keylen + 1);
+        randstr(xs[idx], keylen);
+    }
+    T = ahtable_create();
+    M = str_map_create();
+    fprintf(stderr, "done.\n");
+}
+
+
+/* Release what setup() created: both tables, then each key, then the key array. */
+void teardown()
+{
+    char** cur;
+    ahtable_free(T);
+    str_map_destroy(M);
+    for (cur = xs; cur != xs + n; ++cur) {
+        free(*cur);
+    }
+    free(xs);
+}
+
+
+/* Tally k random insertions in both the ahtable and the reference map, comparing after each;
+ * then delete k/100 random keys from both and check the ahtable no longer finds them. */
+void test_ahtable_insert()
+{
+    size_t i, j, keylen;
+    value_t* u;
+    value_t v;
+
+    fprintf(stderr, "inserting %zu keys ... \n", k);
+    for (j = 0; j < k; ++j) {
+        i = rand() % n;
+        keylen = strlen(xs[i]);
+
+        v = 1 + str_map_get(M, xs[i], keylen);
+        str_map_set(M, xs[i], keylen, v);
+
+        u = ahtable_get(T, xs[i], keylen);
+        *u += 1;
+
+        if (*u != v) {
+            fprintf(stderr, "[error] tally mismatch (reported: %lu, correct: %lu)\n",
+                    *u, v);
+        }
+    }
+
+    fprintf(stderr, "sizeof: %zu\n", ahtable_sizeof(T));
+
+    /* delete k/100 randomly chosen keys (repeats possible) from both containers */
+    for (j = 0; j < k/100; ++j) {
+        i = rand() % n;
+        keylen = strlen(xs[i]);
+        ahtable_del(T, xs[i], keylen);
+        str_map_del(M, xs[i], keylen);
+        u = ahtable_tryget(T, xs[i], keylen);
+        if (u) {
+            fprintf(stderr, "[error] deleted node found in ahtable\n");
+        }
+    }
+
+    fprintf(stderr, "done.\n");
+}
+
+
+/*
+ * Walk the ahtable without requesting sorted order and compare every entry
+ * with the reference map.
+ */
+void test_ahtable_iteration()
+{
+    ahtable_iter_t* it;
+    size_t seen = 0;
+    size_t keylen;
+    const char* keyptr;
+
+    value_t* got;
+    value_t want;
+
+    fprintf(stderr, "iterating through %zu keys ... \n", k);
+
+    for (it = ahtable_iter_begin(T, false); !ahtable_iter_finished(it); ahtable_iter_next(it)) {
+        ++seen;
+
+        keyptr = ahtable_iter_key(it, &keylen);
+        got = ahtable_iter_val(it);
+        want = str_map_get(M, keyptr, keylen);
+
+        if (*got != want && want == 0) {
+            fprintf(stderr, "[error] incorrect iteration (%lu, %lu)\n", *got, want);
+        }
+        else if (*got != want) {
+            fprintf(stderr, "[error] incorrect iteration tally (%lu, %lu)\n", *got, want);
+        }
+
+        /* Zero the reference tally so that a key yielded twice shows up as a
+         * mismatch on its second visit. */
+        str_map_set(M, keyptr, keylen, 0);
+    }
+
+    /* the visit count must equal the number of entries in the reference map */
+    if (seen != M->m) {
+        fprintf(stderr, "[error] iterated through %zu element, expected %zu\n",
+                seen, M->m);
+    }
+
+    ahtable_iter_free(it);
+
+    fprintf(stderr, "done.\n");
+}
+
+/* Order two length-delimited keys: bytewise over the common prefix, shorter key first on a tie. */
+int cmpkey(const char* a, size_t ka, const char* b, size_t kb)
+{
+    const int order = memcmp(a, b, kb > ka ? ka : kb);
+    return order != 0 ? order : (int) ka - (int) kb;
+}
+
+
+/* Walk the ahtable with sorted order requested: keys must never decrease (per cmpkey) and
+ * every value must match the reference map. */
+void test_ahtable_sorted_iteration()
+{
+    fprintf(stderr, "iterating in order through %zu keys ... \n", k);
+
+    ahtable_iter_t* i = ahtable_iter_begin(T, true);
+    size_t count = 0;
+    value_t* u;
+    value_t v;
+
+    char* prev_key = malloc(m_high + 1);
+    size_t prev_len = 0;
+    const char *key = NULL;
+    size_t len = 0;
+
+    while (!ahtable_iter_finished(i)) {
+        /* key stays NULL until the first fetch, and memcpy must never be given a null pointer */
+        if (key != NULL) { memcpy(prev_key, key, len); }
+        prev_len = len;
+        ++count;
+
+        key = ahtable_iter_key(i, &len);
+        if (cmpkey(prev_key, prev_len, key, len) > 0) {
+            fprintf(stderr, "[error] iteration is not correctly ordered.\n");
+        }
+
+        u = ahtable_iter_val(i);
+        v = str_map_get(M, key, len);
+
+        if (*u != v) {
+            if (v == 0) {
+                fprintf(stderr, "[error] incorrect iteration (%lu, %lu)\n", *u, v);
+            }
+            else {
+                fprintf(stderr, "[error] incorrect iteration tally (%lu, %lu)\n", *u, v);
+            }
+        }
+
+        // this way we will see an error if the same key is iterated through
+        // twice
+        str_map_set(M, key, len, 0);
+        ahtable_iter_next(i);
+    }
+
+    ahtable_iter_free(i);
+    free(prev_key);
+
+    fprintf(stderr, "done.\n");
+}
+
+
+int main()
+{
+    /* two fixture passes: the first walks the table unordered, the second in sorted order */
+    void (*const walk[2])() = { test_ahtable_iteration, test_ahtable_sorted_iteration };
+    size_t pass;
+
+    for (pass = 0; pass < 2; ++pass) {
+        setup();
+        test_ahtable_insert();
+        walk[pass]();
+        teardown();
+    }
+    return 0;
+}
diff --git a/test/check_hattrie.c b/test/check_hattrie.c
new file mode 100644
index 0000000..5bb6b38
--- /dev/null
+++ b/test/check_hattrie.c
@@ -0,0 +1,270 @@
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "str_map.h"
+#include "../src/hat-trie.h"
+
+/* NUL-terminate x at x[len], then fill x[len-1] down to x[0] with random printable ASCII (0x20..0x7e). */
+void randstr(char* x, size_t len)
+{
+    char* cursor = x + len;
+    for (*cursor = '\0'; cursor != x; ) {
+        *--cursor = (char) ('\x20' + rand() % ('\x7e' - '\x20' + 1));
+    }
+}
+
+const size_t n = 100000; // how many unique strings
+const size_t m_low = 50; // minimum length of each string
+const size_t m_high = 500; // exclusive upper bound on string length (setup() draws m_low .. m_high-1)
+const size_t k = 200000; // number of insertions
+const size_t d = 50000; // number of entries in the deletion list ds
+
+char** xs; // the n generated keys; allocated in setup(), freed in teardown()
+char** ds; // d pointers picked at random from xs (aliases, not copies)
+
+hattrie_t* T; // trie under test
+str_map* M; // reference map holding the expected tally for each key
+
+
+/* Build the fixtures: n random keys in xs, d of them (picked at random, repeats possible)
+ * aliased in ds as the deletion list, plus an empty trie and an empty reference map. */
+void setup()
+{
+    size_t idx, keylen;
+
+    fprintf(stderr, "generating %zu keys ... ", n);
+    xs = malloc(n * sizeof *xs);
+    ds = malloc(d * sizeof *ds);
+    for (idx = 0; idx < n; ++idx) {
+        keylen = m_low + rand() % (m_high - m_low);
+        xs[idx] = malloc(keylen + 1);
+        randstr(xs[idx], keylen);
+    }
+    for (idx = 0; idx < d; ++idx) {
+        ds[idx] = xs[rand() % n];
+    }
+    T = hattrie_create();
+    M = str_map_create();
+    fprintf(stderr, "done.\n");
+}
+
+
+/* Release the fixtures. ds only aliases strings owned by xs, so just its array is freed. */
+void teardown()
+{
+    char** cur;
+    hattrie_free(T);
+    str_map_destroy(M);
+    for (cur = xs; cur != xs + n; ++cur) {
+        free(*cur);
+    }
+    free(xs);
+    free(ds);
+}
+
+
+/* Tally k random insertions in both the trie and the reference map, comparing after each;
+ * then remove every key listed in ds from both and check the trie no longer finds it. */
+void test_hattrie_insert()
+{
+    size_t pick, step, keylen;
+    value_t* slot;
+    value_t expect;
+
+    fprintf(stderr, "inserting %zu keys ... \n", k);
+    for (step = 0; step < k; ++step) {
+        pick = rand() % n;
+        keylen = strlen(xs[pick]);
+
+        expect = 1 + str_map_get(M, xs[pick], keylen);
+        str_map_set(M, xs[pick], keylen, expect);
+
+        slot = hattrie_get(T, xs[pick], keylen);
+        *slot += 1;
+
+        if (*slot != expect) {
+            fprintf(stderr, "[error] tally mismatch (reported: %lu, correct: %lu)\n",
+                    *slot, expect);
+        }
+    }
+
+    fprintf(stderr, "sizeof: %zu\n", hattrie_sizeof(T));
+
+    fprintf(stderr, "deleting %zu keys ... \n", d);
+    for (step = 0; step < d; ++step) {
+        keylen = strlen(ds[step]);
+        str_map_del(M, ds[step], keylen);
+        hattrie_del(T, ds[step], keylen);
+        slot = hattrie_tryget(T, ds[step], keylen);
+        if (slot) {
+            fprintf(stderr, "[error] item %zu still found in trie after delete\n",
+                    step);
+        }
+    }
+
+    fprintf(stderr, "done.\n");
+}
+
+
+
+/*
+ * Walk the trie without requesting sorted order and compare every entry
+ * with the reference map.
+ */
+void test_hattrie_iteration()
+{
+    hattrie_iter_t* it;
+    size_t seen = 0;
+    size_t keylen;
+    const char* keyptr;
+
+    value_t* got;
+    value_t want;
+
+    fprintf(stderr, "iterating through %zu keys ... \n", k);
+
+    for (it = hattrie_iter_begin(T, false); !hattrie_iter_finished(it); hattrie_iter_next(it)) {
+        ++seen;
+
+        keyptr = hattrie_iter_key(it, &keylen);
+        got = hattrie_iter_val(it);
+
+        want = str_map_get(M, keyptr, keylen);
+
+        if (*got != want && want == 0) {
+            fprintf(stderr, "[error] incorrect iteration (%lu, %lu)\n", *got, want);
+        }
+        else if (*got != want) {
+            fprintf(stderr, "[error] incorrect iteration tally (%lu, %lu)\n", *got, want);
+        }
+
+        /* Zero the reference tally so that a key yielded twice shows up as a
+         * mismatch on its second visit. */
+        str_map_set(M, keyptr, keylen, 0);
+    }
+
+    /* the visit count must equal the number of entries in the reference map */
+    if (seen != M->m) {
+        fprintf(stderr, "[error] iterated through %zu element, expected %zu\n",
+                seen, M->m);
+    }
+
+    hattrie_iter_free(it);
+
+    fprintf(stderr, "done.\n");
+}
+
+/* Order two length-delimited keys: bytewise over the common prefix, shorter key first on a tie. */
+int cmpkey(const char* a, size_t ka, const char* b, size_t kb)
+{
+    const int order = memcmp(a, b, kb > ka ? ka : kb);
+    return order != 0 ? order : (int) ka - (int) kb;
+}
+
+
+/* Walk the trie with sorted order requested: keys must never decrease (per cmpkey), values must
+ * match the reference map, and the visit count must equal the reference map's entry count. */
+void test_hattrie_sorted_iteration()
+{
+    fprintf(stderr, "iterating in order through %zu keys ... \n", k);
+    hattrie_iter_t* i = hattrie_iter_begin(T, true);
+
+    size_t count = 0;
+    value_t* u;
+    value_t v;
+
+    char* key_copy = malloc(m_high + 1);
+    char* prev_key = malloc(m_high + 1);
+    memset(prev_key, 0, m_high + 1);
+    size_t prev_len = 0;
+
+    const char *key = NULL;
+    size_t len = 0;
+
+    while (!hattrie_iter_finished(i)) {
+        memcpy(prev_key, key_copy, len);
+        prev_key[len] = '\0';
+        prev_len = len;
+        ++count;
+
+        key = hattrie_iter_key(i, &len);
+        /* memory for key may be changed on iter, copy it (memcpy: keys are length-delimited) */
+        memcpy(key_copy, key, len);
+
+        if (cmpkey(prev_key, prev_len, key, len) > 0) {
+            fprintf(stderr, "[error] iteration is not correctly ordered.\n");
+        }
+
+        u = hattrie_iter_val(i);
+        v = str_map_get(M, key, len);
+
+        if (*u != v) {
+            if (v == 0) {
+                fprintf(stderr, "[error] incorrect iteration (%lu, %lu)\n", *u, v);
+            }
+            else {
+                fprintf(stderr, "[error] incorrect iteration tally (%lu, %lu)\n", *u, v);
+            }
+        }
+
+        // this way we will see an error if the same key is iterated through
+        // twice
+        str_map_set(M, key, len, 0);
+
+        hattrie_iter_next(i);
+    }
+
+    if (count != M->m) {
+        fprintf(stderr, "[error] iterated through %zu element, expected %zu\n",
+                count, M->m);
+    }
+
+    hattrie_iter_free(i);
+    free(prev_key);
+    free(key_copy);
+
+    fprintf(stderr, "done.\n");
+}
+
+
+/* Store a value under a key containing a byte >= 0x80 in a private trie and check it reads back. */
+void test_trie_non_ascii()
+{
+    fprintf(stderr, "checking non-ascii... \n");
+
+    value_t* u;
+    hattrie_t* T = hattrie_create();
+    char* txt = "\x81\x70";
+
+    u = hattrie_get(T, txt, strlen(txt));
+    *u = 10;
+    /* tryget reports a missing key as NULL, which must not be dereferenced */
+    u = hattrie_tryget(T, txt, strlen(txt));
+    if (u == NULL || *u != 10) {
+        fprintf(stderr, "can't store non-ascii strings\n");
+    }
+    hattrie_free(T);
+    fprintf(stderr, "done.\n");
+}
+
+
+
+
+int main()
+{
+    /* standalone check first, then two fixture passes: unordered walk, then sorted walk */
+    void (*const walk[2])() = { test_hattrie_iteration, test_hattrie_sorted_iteration };
+    size_t pass;
+
+    test_trie_non_ascii();
+
+    for (pass = 0; pass < 2; ++pass) {
+        setup();
+        test_hattrie_insert();
+        walk[pass]();
+        teardown();
+    }
+    return 0;
+}
diff --git a/test/str_map.c b/test/str_map.c
new file mode 100644
index 0000000..68303a3
--- /dev/null
+++ b/test/str_map.c
@@ -0,0 +1,241 @@
+
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ */
+
+
+#include "str_map.h"
+#include "misc.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+
+static const size_t INITIAL_TABLE_SIZE = 16; /* bucket count of a freshly created map */
+static const double MAX_LOAD = 0.77; /* max_m is set to n * MAX_LOAD; reaching it doubles the table */
+
+
+/*
+ * Paul Hsieh's SuperFastHash
+ * http://www.azillionmonkeys.com/qed/hash.html
+ */
+/* get16bits(d) reads the two bytes at d as one 16-bit value: a direct load on the
+ * compilers listed below, otherwise assembled from d[0] (low byte) and d[1]. */
+#undef get16bits
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
+ || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
+#define get16bits(d) (*((const uint16_t *) (d)))
+#endif
+
+#if !defined (get16bits)
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+ +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+/* Hash the len bytes at data to a 32-bit value; NULL data or a zero len hashes to 0. */
+static uint32_t hash(const char * data, size_t len) {
+ uint32_t hash = len, tmp;
+ int rem;
+
+ if (len <= 0 || data == NULL) return 0;
+ /* the main loop consumes 4 bytes per round; rem is the 0-3 byte tail handled by the switch */
+ rem = len & 3;
+ len >>= 2;
+
+ /* Main loop */
+ for (;len > 0; len--) {
+ hash += get16bits (data);
+ tmp = (get16bits (data+2) << 11) ^ hash;
+ hash = (hash << 16) ^ tmp;
+ data += 2*sizeof (uint16_t);
+ hash += hash >> 11;
+ }
+
+ /* Handle end cases */
+ switch (rem) {
+ case 3: hash += get16bits (data);
+ hash ^= hash << 16;
+ hash ^= data[sizeof (uint16_t)] << 18;
+ hash += hash >> 11;
+ break;
+ case 2: hash += get16bits (data);
+ hash ^= hash << 11;
+ hash += hash >> 17;
+ break;
+ case 1: hash += *data;
+ hash ^= hash << 10;
+ hash += hash >> 1;
+ }
+
+ /* Force "avalanching" of final 127 bits */
+ hash ^= hash << 3;
+ hash += hash >> 5;
+ hash ^= hash << 4;
+ hash += hash >> 17;
+ hash ^= hash << 25;
+ hash += hash >> 6;
+
+ return hash;
+}
+
+
+
+static void rehash(str_map* T, size_t new_n); /* move every entry into a new table of new_n buckets */
+static void clear(str_map*); /* free every entry; the bucket array itself is kept */
+
+
+
+/* Allocate an empty map with INITIAL_TABLE_SIZE buckets; release it with str_map_destroy(). */
+str_map* str_map_create()
+{
+    const size_t bytes = INITIAL_TABLE_SIZE * sizeof(str_map_pair*);
+    str_map* map = malloc_or_die(sizeof *map);
+    map->A = memset(malloc_or_die(bytes), 0, bytes);
+    map->n = INITIAL_TABLE_SIZE;
+    map->m = 0;
+    map->max_m = map->n * MAX_LOAD;
+    return map;
+}
+
+
+/* Free every entry, the bucket array and the map itself; a NULL map is ignored. */
+void str_map_destroy(str_map* T)
+{
+    if (T == NULL) return;
+    clear(T);
+    free(T->A);
+    free(T);
+}
+
+
+
+/* Free all entries and their keys, leaving every bucket empty and m at 0; the bucket array is kept. */
+void clear(str_map* T)
+{
+    str_map_pair** bucket;
+    str_map_pair* doomed;
+
+    for (bucket = T->A; bucket != T->A + T->n; ++bucket) {
+        while ((doomed = *bucket) != NULL) {
+            *bucket = doomed->next;
+            free(doomed->key);
+            free(doomed);
+        }
+    }
+    T->m = 0;
+}
+
+/* Link the existing pair V at the head of its bucket in T; V is adopted as is, not copied. */
+static void insert_without_copy(str_map* T, str_map_pair* V)
+{
+    str_map_pair** head = &T->A[hash(V->key, V->keylen) % T->n];
+    V->next = *head;
+    *head = V;
+    ++T->m;
+}
+
+
+
+/* Move every pair of T into a fresh table of new_n buckets. Pairs are relinked, not
+ * reallocated, and T->m is left as it was since the same pairs are kept. */
+static void rehash(str_map* T, size_t new_n)
+{
+    const size_t bytes = new_n * sizeof(str_map_pair*);
+    str_map grown;
+    str_map_pair *pair, *rest;
+    size_t slot;
+
+    grown.n = new_n;
+    grown.m = 0;
+    grown.max_m = grown.n * MAX_LOAD;
+    grown.A = memset(malloc_or_die(bytes), 0, bytes);
+
+    for (slot = 0; slot < T->n; slot++) {
+        for (pair = T->A[slot]; pair != NULL; pair = rest) {
+            rest = pair->next; /* saved first: insert_without_copy overwrites pair->next */
+            insert_without_copy(&grown, pair);
+        }
+        T->A[slot] = NULL;
+    }
+    free(T->A);
+    T->A = grown.A;
+    T->n = grown.n;
+    T->max_m = grown.max_m;
+}
+
+
+/* Bind value to the keylen bytes at key: an existing entry is overwritten in place, otherwise a
+ * new entry owning a copy of the key is pushed on the front of its bucket. The table is doubled
+ * first once m has reached max_m. */
+void str_map_set(str_map* T, const char* key, size_t keylen, value_t value)
+{
+    str_map_pair** head;
+    str_map_pair* cur;
+    if (T->m >= T->max_m) {
+        rehash(T, T->n * 2);
+    }
+    head = &T->A[hash(key, keylen) % T->n];
+
+    for (cur = *head; cur != NULL; cur = cur->next) {
+        if (cur->keylen == keylen && memcmp(cur->key, key, keylen) == 0) {
+            cur->value = value;
+            return;
+        }
+    }
+
+    cur = malloc_or_die(sizeof *cur);
+    cur->key = malloc_or_die(keylen);
+    memcpy(cur->key, key, keylen);
+    cur->keylen = keylen;
+    cur->value = value;
+    cur->next = *head;
+    *head = cur;
+    T->m++;
+}
+
+
+/* Return the value stored under the keylen bytes at key, or 0 when there is no such entry
+ * (so a stored 0 cannot be told apart from a missing key). */
+value_t str_map_get(const str_map* T, const char* key, size_t keylen)
+{
+    const str_map_pair* cur = T->A[hash(key, keylen) % T->n];
+
+    for (; cur != NULL; cur = cur->next) {
+        if (cur->keylen != keylen) {
+            continue;
+        }
+        if (memcmp(cur->key, key, keylen) == 0) {
+            return cur->value;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Remove the entry stored under the keylen bytes at key, if any, freeing the
+ * entry and its key copy and decrementing m. Unknown keys are ignored.
+ */
+void str_map_del(str_map* T, const char* key, size_t keylen)
+{
+    /* link is the pointer that leads to the current entry (the bucket head or a
+     * predecessor's next), so unlinking is the same store in both cases */
+    str_map_pair** link = &T->A[hash(key, keylen) % T->n];
+    str_map_pair* cur;
+
+    for (; (cur = *link) != NULL; link = &cur->next) {
+        if (cur->keylen != keylen) {
+            continue;
+        }
+        if (memcmp(cur->key, key, keylen) != 0) {
+            continue;
+        }
+        *link = cur->next;
+        free(cur->key);
+        free(cur);
+        --T->m;
+        return;
+    }
+}
+
diff --git a/test/str_map.h b/test/str_map.h
new file mode 100644
index 0000000..7d000d5
--- /dev/null
+++ b/test/str_map.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones at cs.washington.edu>
+ *
+ * str_map :
+ * A quick and simple hash table mapping strings to things.
+ *
+ */
+
+
+#ifndef ISOLATOR_STR_MAP_H
+#define ISOLATOR_STR_MAP_H
+
+/* Includes stay outside the extern "C" block so each header keeps its own linkage. */
+#include <stdlib.h>
+#include <stdint.h>
+#include "common.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* One chained entry. key holds keylen bytes copied from the caller; no NUL is stored. */
+typedef struct str_map_pair_
+{
+ char* key;
+ size_t keylen;
+ value_t value;
+
+ struct str_map_pair_* next;
+} str_map_pair;
+
+/* The table itself: n bucket heads in A, m stored entries. */
+typedef struct
+{
+ str_map_pair** A; /* table proper */
+ size_t n; /* table size */
+ size_t m; /* hashed items */
+ size_t max_m; /* max hashed items before rehash */
+} str_map;
+
+
+/* Keys are (pointer, length) pairs; str_map_get returns 0 for a key that is absent. */
+str_map* str_map_create(void);
+void str_map_destroy(str_map*);
+void str_map_set(str_map*, const char* key, size_t keylen, value_t value);
+value_t str_map_get(const str_map*, const char* key, size_t keylen);
+void str_map_del(str_map* T, const char* key, size_t keylen);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/hat-trie.git
More information about the debian-med-commit
mailing list