summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorXavier Roche <xroche@users.noreply.github.com>2013-06-25 19:10:40 +0000
committerXavier Roche <xroche@users.noreply.github.com>2013-06-25 19:10:40 +0000
commit9c3e5b43bdf982348a5d5ec84dd2c6fd9b990331 (patch)
tree94ebe3cdc774db3537842446fc77515a20bf4d45
parent9090ee17b364f44a8df75f927b26c6e352ef9fff (diff)
hashtables: use fnv-1 for hashing
-rw-r--r--src/htshash.c1
-rw-r--r--src/htsinthash.c443
-rw-r--r--src/htsinthash.h9
3 files changed, 277 insertions, 176 deletions
diff --git a/src/htshash.c b/src/htshash.c
index 2d12941..1ac20ee 100644
--- a/src/htshash.c
+++ b/src/htshash.c
@@ -143,7 +143,6 @@ int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
void hash_write(hash_struct * hash, int lpos, int normalized) {
char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
char catbuff[CATBUFF_SIZE];
- const char *normfil;
const char *name;
/* first entry: destination filename (lowercased) */
diff --git a/src/htsinthash.c b/src/htsinthash.c
index d82a6d7..c8bf836 100644
--- a/src/htsinthash.c
+++ b/src/htsinthash.c
@@ -43,27 +43,80 @@ Please visit our Website: http://www.httrack.com
#include "htsinthash.h"
+/* We may use md5 as hashing function for its quality regarding diffusion and
+ collisions. MD5 is slower than other hashing functions, but is known to be
+ an excellent hashing function. FNV-1 is however generally good enough for
+ this purpose, too.
+*/
+#if 0
#define USES_MD5
-
+#endif
#ifdef USES_MD5
#include "htsmd5.h"
#endif
+#define STASH_SIZE 16
+
/** Hashtable. **/
struct struct_inthash {
+ /** Hashtable items. **/
inthash_item *items;
- size_t nitems;
- t_inthash_freehandler free_handler;
- size_t hash_size_power;
- inthash_item stash[STASH_SIZE];
- size_t stash_size;
- char *string_pool;
- size_t string_pool_size;
- size_t string_pool_capacity;
- size_t string_pool_chars;
- unsigned char flag_valueismalloc;
+
+ /** Log-2 of the hashtable size. **/
+ size_t lg_size;
+
+ /** Number of used items (<= POW2(lg_size)). **/
+ size_t used;
+
+ /** Stash area for collisions. **/
+ struct {
+ /** Stash items. **/
+ inthash_item items[STASH_SIZE];
+
+ /** Stash size (<= stash.size). **/
+ size_t size;
+ } stash;
+
+ /** String pool. **/
+ struct {
+ /** String buffer. **/
+ char *buffer;
+ /** Buffer used size (high watermark). **/
+ size_t size;
+ /** Buffer capacity. **/
+ size_t capacity;
+ /** Used chars (== size if compacted). **/
+ size_t used;
+ } pool;
+
+ /** Statistics **/
+ struct {
+ /** Highest stash.size seen. **/
+ size_t max_stash_size;
+ /** Number of writes. **/
+ size_t write_count;
+ /** Number of writes causing an add. **/
+ size_t add_count;
+ /** Number of cuckoo moved during adds. **/
+ size_t cuckoo_moved;
+ /** Number of items added to stash. **/
+ size_t stash_added;
+ /** Number of hashtable rehash/expand operations. **/
+ size_t rehash_count;
+ /** Number of pool compact operations. **/
+ size_t pool_compact_count;
+ /** Number of pool compact operations. **/
+ size_t pool_realloc_count;
+ } stats;
+
+ /** Settings. **/
+ struct {
+ /** Items value de-allocator. (may be NULL) **/
+ t_inthash_freehandler free_handler;
+ } custom;
};
+/* Using httrack code */
#ifdef LIBHTTRACK_EXPORTS
#include "htsbase.h"
#define inthash_assert assertf
@@ -77,76 +130,76 @@ static void inthash_fail(const char* exp, const char* file, int line) {
#define HTS_INLINE
#endif
-/* Numerical Recipes function. */
-#define HASH_PRIME ( 1664525 )
-#define HASH_CONST ( 1013904223 )
-#define HASH_ADD(HASH, C) do { \
- (HASH) *= HASH_PRIME; \
- (HASH) += HASH_CONST; \
- (HASH) ^= ( (C) << 24); \
- } while(0)
-
/* 2**X */
#define POW2(X) ( (size_t) 1 << (X) )
-static void inthash_log(inthash hashtable,
- const char *format, ...)
+/* Logging */
+static void inthash_log(const inthash hashtable, const char *format, ...)
HTS_PRINTF_FUN(2, 3) {
va_list args;
inthash_assert(format != NULL);
- va_start(args, format);
fprintf(stderr, "[%p] ", (void*) hashtable);
+ va_start(args, format);
(void) vfprintf(stderr, format, args);
- putc('\n', stderr);
va_end(args);
+ putc('\n', stderr);
}
-#ifndef USES_MD5
-static unsigned int rev32(unsigned int v) {
- /* swap odd and even bits */
- v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
- /* swap consecutive pairs */
- v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
- /* swap nibbles ... */
- v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
- /* swap bytes */
- v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
- /* swap 2-byte long pairs */
- v = ( v >> 16 ) | ( v << 16);
- return v;
-}
-#endif
-
static inthash_keys inthash_calc_hashes(const char *value) {
#ifdef USES_MD5
+ /* compute a regular MD5 and extract two 32-bit integers */
union {
char md5digest[16];
inthash_keys hashes;
} u;
+
+ /* compute MD5 */
domd5mem(value, strlen(value), u.md5digest, 0);
+
+ /* do not keep identical hashes */
+ if (u.hashes.hash1 == u.hashes.hash2) {
+ hashes.hash2 = ~hashes.hash2;
+ }
+
return u.hashes;
#else
- /* compute two lcg hashes using different seeds */
- inthash_keys hash = { 0, ~0 };
+ /* compute two Fowler–Noll–Vo hashes (64-bit FNV-1 variant) ;
+ each 64-bit hash being XOR-folded into a single 32-bit hash. */
size_t i;
+ inthash_keys hashes;
+ uint64_t h1, h2;
+
+ /* FNV-1, 64-bit. */
+#define FNV1_PRIME 1099511628211
+#define FNV1_OFFSET_BASIS 14695981039346656037
+
+ /* compute the hashes ; second variant is using xored data */
+ h1 = (uint64_t) FNV1_OFFSET_BASIS;
+ h2 = (uint64_t) ~FNV1_OFFSET_BASIS;
for(i = 0 ; value[i] != '\0' ; i++) {
const unsigned char c1 = value[i];
- const unsigned char c2 = 0xff - c1;
- HASH_ADD(hash.hash1, c1);
- HASH_ADD(hash.hash2, c2);
- }
- /* yes, it may happen */
- if (hash.hash1 == hash.hash2) {
- hash.hash2 = ~hash.hash2;
- }
- /* reverse bits: "higher-order bits have longer periods than the lower
- order bits" (Wikipedia) */
- hash.hash1 ^= rev32(hash.hash1);
- hash.hash2 ^= rev32(hash.hash2);
- return hash;
+ const unsigned char c2 = ~c1;
+ h1 = ( h1 * FNV1_PRIME ) ^ c1;
+ h2 = ( h2 * FNV1_PRIME ) ^ c2;
+ }
+
+ /* XOR-folding to improve diffusion (Wikipedia) */
+ hashes.hash1 = ( (uint32_t) h1 ^ (uint32_t) ( h1 >> 32 ) );
+ hashes.hash2 = ( (uint32_t) h2 ^ (uint32_t) ( h2 >> 32 ) );
+
+#undef FNV1_PRIME
+#undef FNV1_OFFSET_BASIS
+
+ /* do not keep identical hashes */
+ if (hashes.hash1 == hashes.hash2) {
+ hashes.hash2 = ~hashes.hash2;
+ }
+
+ return hashes;
#endif
}
+/* position 'pos' is free ? */
static HTS_INLINE int inthash_is_free(const inthash hashtable, size_t pos) {
return hashtable->items[pos].name == NULL;
}
@@ -169,17 +222,21 @@ static HTS_INLINE int inthash_matches(const inthash hashtable, size_t pos,
/* realloc (expand) string pool ; does not change the compacity */
static void inthash_realloc_pool(inthash hashtable) {
- const size_t hash_size = POW2(hashtable->hash_size_power);
- char *const oldbase = hashtable->string_pool;
+ const size_t hash_size = POW2(hashtable->lg_size);
+ char *const oldbase = hashtable->pool.buffer;
size_t i;
int count = 0;
- hashtable->string_pool = realloc(hashtable->string_pool,
- hashtable->string_pool_capacity);
- if (hashtable->string_pool == NULL) {
+ /* statistics */
+ hashtable->stats.pool_realloc_count++;
+
+ /* realloc */
+ hashtable->pool.buffer = realloc(hashtable->pool.buffer,
+ hashtable->pool.capacity);
+ if (hashtable->pool.buffer == NULL) {
fprintf(stderr,
"** hashtable string pool allocation error: could not allocate %u bytes",
- (int) hashtable->string_pool_capacity);
+ (int) hashtable->pool.capacity);
inthash_assert(! "hashtable string pool allocation error");
}
@@ -187,7 +244,7 @@ static void inthash_realloc_pool(inthash hashtable) {
#define RECOMPUTE_STRING(S) do { \
if (S != NULL) { \
const size_t offset = (S) - oldbase; \
- hashtable->items[i].name = &hashtable->string_pool[offset]; \
+ hashtable->items[i].name = &hashtable->pool.buffer[offset]; \
count++; \
} \
} while(0)
@@ -196,31 +253,35 @@ static void inthash_realloc_pool(inthash hashtable) {
for(i = 0 ; i < hash_size ; i++) {
RECOMPUTE_STRING(hashtable->items[i].name);
}
- for(i = 0 ; i < hashtable->stash_size ; i++) {
- RECOMPUTE_STRING(hashtable->stash[i].name);
+ for(i = 0 ; i < hashtable->stash.size ; i++) {
+ RECOMPUTE_STRING(hashtable->stash.items[i].name);
}
#undef RECOMPUTE_STRING
inthash_log(hashtable, "reallocated string pool for %d strings: %d bytes",
- count, (int) hashtable->string_pool_capacity);
+ count, (int) hashtable->pool.capacity);
}
/* compact string pool ; does not change the capacity */
static void inthash_compact_pool(inthash hashtable) {
- const size_t hash_size = POW2(hashtable->hash_size_power);
+ const size_t hash_size = POW2(hashtable->lg_size);
size_t i;
- char* old_pool = hashtable->string_pool;
- const size_t old_size = hashtable->string_pool_size;
+ char* old_pool = hashtable->pool.buffer;
+ const size_t old_size = hashtable->pool.size;
int count = 0;
- hashtable->string_pool = malloc(hashtable->string_pool_capacity);
- hashtable->string_pool_size = 0;
- hashtable->string_pool_chars = 0;
- if (hashtable->string_pool == NULL) {
+ /* statistics */
+ hashtable->stats.pool_compact_count++;
+
+ /* realloc */
+ hashtable->pool.buffer = malloc(hashtable->pool.capacity);
+ hashtable->pool.size = 0;
+ hashtable->pool.used = 0;
+ if (hashtable->pool.buffer == NULL) {
fprintf(stderr,
"** hashtable string pool compaction error: could not allocate %u bytes",
- (int) hashtable->string_pool_capacity);
+ (int) hashtable->pool.capacity);
inthash_assert(! "hashtable string pool compaction error");
}
@@ -228,12 +289,12 @@ static void inthash_compact_pool(inthash hashtable) {
#define RELOCATE_STRING(S) do { \
if (S != NULL) { \
const size_t len = strlen(S) + 1; \
- inthash_assert(hashtable->string_pool_size + len \
- <= hashtable->string_pool_capacity); \
- memcpy(&hashtable->string_pool[hashtable->string_pool_size], \
+ inthash_assert(hashtable->pool.size + len \
+ <= hashtable->pool.capacity); \
+ memcpy(&hashtable->pool.buffer[hashtable->pool.size], \
S, len); \
- S = &hashtable->string_pool[hashtable->string_pool_size]; \
- hashtable->string_pool_size += len; \
+ S = &hashtable->pool.buffer[hashtable->pool.size]; \
+ hashtable->pool.size += len; \
count++; \
} \
} while(0)
@@ -242,40 +303,40 @@ static void inthash_compact_pool(inthash hashtable) {
for(i = 0 ; i < hash_size ; i++) {
RELOCATE_STRING(hashtable->items[i].name);
}
- for(i = 0 ; i < hashtable->stash_size ; i++) {
- RELOCATE_STRING(hashtable->stash[i].name);
+ for(i = 0 ; i < hashtable->stash.size ; i++) {
+ RELOCATE_STRING(hashtable->stash.items[i].name);
}
#undef RELOCATE_STRING
/* compacted (used chars == current size) */
- hashtable->string_pool_chars = hashtable->string_pool_size;
+ hashtable->pool.used = hashtable->pool.size;
/* wipe previous pool */
free(old_pool);
inthash_log(hashtable,
"compacted string pool for %d strings: %d bytes => %d bytes",
- count, (int) old_size, (int) hashtable->string_pool_size);
+ count, (int) old_size, (int) hashtable->pool.size);
}
static char* inthash_dup_name(inthash hashtable, const char *name) {
const size_t len = strlen(name) + 1;
char *s;
- if (hashtable->string_pool_capacity - hashtable->string_pool_size < len) {
- if (hashtable->string_pool_capacity == 0) {
- hashtable->string_pool_capacity = 16;
+ if (hashtable->pool.capacity - hashtable->pool.size < len) {
+ if (hashtable->pool.capacity == 0) {
+ hashtable->pool.capacity = 16;
}
- for( ; hashtable->string_pool_capacity - hashtable->string_pool_size < len
- ; hashtable->string_pool_capacity <<= 1) ;
+ for( ; hashtable->pool.capacity - hashtable->pool.size < len
+ ; hashtable->pool.capacity <<= 1) ;
inthash_realloc_pool(hashtable);
}
- s = &hashtable->string_pool[hashtable->string_pool_size];
+ s = &hashtable->pool.buffer[hashtable->pool.size];
memcpy(s, name, len);
- hashtable->string_pool_size += len;
- hashtable->string_pool_chars += len;
+ hashtable->pool.size += len;
+ hashtable->pool.used += len;
return s;
}
@@ -283,21 +344,21 @@ static char* inthash_dup_name(inthash hashtable, const char *name) {
/* note: pointer must have been kicked from the pool first */
static void inthash_free_name(inthash hashtable, char *name) {
const size_t len = strlen(name) + 1;
- hashtable->string_pool_chars -= len;
- if (hashtable->string_pool_chars < hashtable->string_pool_size / 2) {
+ hashtable->pool.used -= len;
+ if (hashtable->pool.used < hashtable->pool.size / 2) {
inthash_compact_pool(hashtable);
}
}
-static HTS_INLINE size_t inthash_hash_to_pos_(size_t hash_size_power,
+static HTS_INLINE size_t inthash_hash_to_pos_(size_t lg_size,
inthash_key hash) {
- const inthash_key mask = POW2(hash_size_power) - 1;
+ const inthash_key mask = POW2(lg_size) - 1;
return hash & mask;
}
static HTS_INLINE size_t inthash_hash_to_pos(const inthash hashtable,
inthash_key hash) {
- return inthash_hash_to_pos_(hashtable->hash_size_power, hash);
+ return inthash_hash_to_pos_(hashtable->lg_size, hash);
}
int inthash_read_pvoid(inthash hashtable, const char *name, void **pvalue) {
@@ -336,12 +397,8 @@ static void inthash_default_free_handler(void *value) {
}
static void inthash_del_value_(inthash hashtable, inthash_value *pvalue) {
- if (hashtable->flag_valueismalloc) {
- if (hashtable->free_handler)
- hashtable->free_handler(pvalue->ptr);
- else
- inthash_default_free_handler(pvalue->ptr);
- }
+ if (hashtable->custom.free_handler != NULL)
+ hashtable->custom.free_handler(pvalue->ptr);
pvalue->ptr = NULL;
}
@@ -371,6 +428,9 @@ static int inthash_write_value_(inthash hashtable, const char *name,
inthash_key cuckoo_hash, initial_cuckoo_hash;
size_t loops;
+ /* Statistics */
+ hashtable->stats.write_count++;
+
/* replace at position 1 ? */
pos = inthash_hash_to_pos(hashtable, hashes.hash1);
if (inthash_matches(hashtable, pos, name, &hashes)) {
@@ -388,17 +448,20 @@ static int inthash_write_value_(inthash hashtable, const char *name,
}
/* replace in the stash ? */
- if (hashtable->stash_size != 0) {
+ if (hashtable->stash.size != 0) {
size_t i;
- for(i = 0 ; i < hashtable->stash_size ; i++) {
- if (inthash_matches_(&hashtable->stash[i], name, &hashes)) {
- inthash_del_value_(hashtable, &hashtable->stash[i].value);
- hashtable->stash[i].value = value;
+ for(i = 0 ; i < hashtable->stash.size ; i++) {
+ if (inthash_matches_(&hashtable->stash.items[i], name, &hashes)) {
+ inthash_del_value_(hashtable, &hashtable->stash.items[i].value);
+ hashtable->stash.items[i].value = value;
return 0; /* replaced */
}
}
}
+ /* Statistics */
+ hashtable->stats.add_count++;
+
/* otherwise we need to create a new item */
item.name = inthash_dup_name(hashtable, name);
item.value = value;
@@ -425,7 +488,7 @@ static int inthash_write_value_(inthash hashtable, const char *name,
}
/* put 'item' in place with hash 'cuckoo_hash' */
- for(loops = POW2(hashtable->hash_size_power) ; loops != 0 ; --loops) {
+ for(loops = POW2(hashtable->lg_size) ; loops != 0 ; --loops) {
const size_t pos = inthash_hash_to_pos(hashtable, cuckoo_hash);
inthash_log(hashtable, "\tdebug:placing cuckoo '%s' at %d (%x)",
@@ -439,9 +502,13 @@ static int inthash_write_value_(inthash hashtable, const char *name,
}
/* then cuckoo's place it is */
else {
+ /* replace */
const inthash_item backup_item = hashtable->items[pos];
hashtable->items[pos] = item;
+ /* statistics */
+ hashtable->stats.cuckoo_moved++;
+
/* take care of new lost item */
item = backup_item;
@@ -471,11 +538,16 @@ static int inthash_write_value_(inthash hashtable, const char *name,
}
/* emergency stashing */
- if (hashtable->stash_size < STASH_SIZE) {
- hashtable->stash[hashtable->stash_size] = item;
- hashtable->stash_size++;
+ if (hashtable->stash.size < STASH_SIZE) {
+ hashtable->stash.items[hashtable->stash.size] = item;
+ hashtable->stash.size++;
+ /* for statistics */
+ hashtable->stats.stash_added++;
+ if (hashtable->stash.size > hashtable->stats.max_stash_size) {
+ hashtable->stats.max_stash_size = hashtable->stash.size;
+ }
inthash_log(hashtable, "used stash because of collision (%d entries)",
- (int) hashtable->stash_size);
+ (int) hashtable->stash.size);
inthash_log(hashtable, "debug:stash position");
return 1; /* added */
} else {
@@ -495,26 +567,29 @@ int inthash_write_value(inthash hashtable, const char *name,
/* added ? */
if (ret) {
/* size of half of the table */
- const size_t half_size = POW2(hashtable->hash_size_power - 1);
+ const size_t half_size = POW2(hashtable->lg_size - 1);
/* item was added: increase count */
- hashtable->nitems++;
+ hashtable->used++;
/* table is more than half-full */
- if (hashtable->nitems >= half_size) {
+ if (hashtable->used >= half_size) {
size_t i;
char *data;
/* size before */
- const size_t prev_power = hashtable->hash_size_power;
+ const size_t prev_power = hashtable->lg_size;
const size_t prev_size = half_size * 2;
const size_t prev_alloc_size = prev_size*sizeof(inthash_item);
/* size after doubling it */
const size_t alloc_size = prev_alloc_size * 2;
+ /* statistics */
+ hashtable->stats.rehash_count++;
+
/* realloc */
- hashtable->hash_size_power++;
+ hashtable->lg_size++;
hashtable->items =
(inthash_item *) realloc(hashtable->items, alloc_size);
if (hashtable->items == NULL) {
@@ -559,17 +634,17 @@ int inthash_write_value(inthash hashtable, const char *name,
}
inthash_log(hashtable, "expanded hashtable to %d elements",
- (int) POW2(hashtable->hash_size_power));
+ (int) POW2(hashtable->lg_size));
/* attempt to merge the stash if present */
- if (hashtable->stash_size != 0) {
- const size_t old_size = hashtable->stash_size;
+ if (hashtable->stash.size != 0) {
+ const size_t old_size = hashtable->stash.size;
size_t i;
/* backup stash and reset it */
inthash_item stash[STASH_SIZE];
- memcpy(&stash, hashtable->stash, sizeof(hashtable->stash));
- hashtable->stash_size = 0;
+ memcpy(&stash, hashtable->stash.items, sizeof(hashtable->stash.items));
+ hashtable->stash.size = 0;
/* insert all items */
for(i = 0 ; i < old_size ; i++) {
@@ -581,18 +656,18 @@ int inthash_write_value(inthash hashtable, const char *name,
}
/* delete old names (not values) */
- for(i = 0 ; i < hashtable->stash_size ; i++) {
+ for(i = 0 ; i < hashtable->stash.size ; i++) {
inthash_del_name(hashtable, &stash[i]);
}
/* logging */
- inthash_assert(hashtable->stash_size <= old_size);
- if (hashtable->stash_size < old_size) {
+ inthash_assert(hashtable->stash.size <= old_size);
+ if (hashtable->stash.size < old_size) {
inthash_log(hashtable, "reduced stash size from %d to %d",
- (int) old_size, (int) hashtable->stash_size);
+ (int) old_size, (int) hashtable->stash.size);
} else {
inthash_log(hashtable, "stash has still %d elements",
- (int) hashtable->stash_size);
+ (int) hashtable->stash.size);
}
}
@@ -637,11 +712,11 @@ static inthash_value* inthash_read_value_(inthash hashtable,
}
/* find in stash ? */
- if (hashtable->stash_size != 0) {
+ if (hashtable->stash.size != 0) {
size_t i;
- for(i = 0 ; i < hashtable->stash_size ; i++) {
- if (inthash_matches_(&hashtable->stash[i], name, &hashes)) {
- return &hashtable->stash[i].value;
+ for(i = 0 ; i < hashtable->stash.size ; i++) {
+ if (inthash_matches_(&hashtable->stash.items[i], name, &hashes)) {
+ return &hashtable->stash.items[i].value;
}
}
}
@@ -709,18 +784,18 @@ static int inthash_remove_(inthash hashtable, const char *name,
}
/* find in stash ? */
- if (hashtable->stash_size != 0) {
+ if (hashtable->stash.size != 0) {
size_t i;
- for(i = 0 ; i < hashtable->stash_size ; i++) {
- if (inthash_matches_(&hashtable->stash[i], name, hashes)) {
- inthash_del_item(hashtable, &hashtable->stash[i]);
- for( ; i + 1 < hashtable->stash_size ; i++) {
- hashtable->stash[i] = hashtable->stash[i + 1];
+ for(i = 0 ; i < hashtable->stash.size ; i++) {
+ if (inthash_matches_(&hashtable->stash.items[i], name, hashes)) {
+ inthash_del_item(hashtable, &hashtable->stash.items[i]);
+ for( ; i + 1 < hashtable->stash.size ; i++) {
+ hashtable->stash.items[i] = hashtable->stash.items[i + 1];
}
- hashtable->stash_size--;
+ hashtable->stash.size--;
*removed = (size_t) -1;
inthash_log(hashtable, "debug:deleted item in stash (%d entries)",
- (int) hashtable->stash_size);
+ (int) hashtable->stash.size);
return 1;
}
}
@@ -737,29 +812,29 @@ int inthash_remove(inthash hashtable, const char *name) {
if (ret) {
/* item was removed: decrease count */
- inthash_assert(hashtable->nitems != 0);
- hashtable->nitems--;
+ inthash_assert(hashtable->used != 0);
+ hashtable->used--;
/* can we place stash entry back to the table ? */
- if (hashtable->stash_size != 0 && removed != (size_t) -1) {
+ if (hashtable->stash.size != 0 && removed != (size_t) -1) {
size_t i;
- for(i = 0 ; i < hashtable->stash_size ; i++) {
+ for(i = 0 ; i < hashtable->stash.size ; i++) {
const size_t pos1 =
- inthash_hash_to_pos(hashtable, hashtable->stash[i].hashes.hash1);
+ inthash_hash_to_pos(hashtable, hashtable->stash.items[i].hashes.hash1);
const size_t pos2 =
- inthash_hash_to_pos(hashtable, hashtable->stash[i].hashes.hash2);
+ inthash_hash_to_pos(hashtable, hashtable->stash.items[i].hashes.hash2);
if (pos1 == removed || pos2 == removed) {
if (pos1 == removed) {
- hashtable->items[pos1] = hashtable->stash[i];
+ hashtable->items[pos1] = hashtable->stash.items[i];
} else if (pos2 == removed) {
- hashtable->items[pos2] = hashtable->stash[i];
+ hashtable->items[pos2] = hashtable->stash.items[i];
}
- for( ; i + 1 < hashtable->stash_size ; i++) {
- hashtable->stash[i] = hashtable->stash[i + 1];
+ for( ; i + 1 < hashtable->stash.size ; i++) {
+ hashtable->stash.items[i] = hashtable->stash.items[i + 1];
}
- hashtable->stash_size--;
+ hashtable->stash.size--;
inthash_log(hashtable, "debug:moved item from stash (%d entries)",
- (int) hashtable->stash_size);
+ (int) hashtable->stash.size);
break;
}
}
@@ -785,17 +860,24 @@ inthash inthash_new(size_t initial_size) {
if (hashtable) {
size_t size;
for(size = 8 ; POW2(size) < initial_size ; size++) ;
- hashtable->flag_valueismalloc = 0;
if ((hashtable->items =
(inthash_item *) calloc(POW2(size), sizeof(inthash_item)))) {
- hashtable->hash_size_power = size;
+ hashtable->lg_size = size;
}
- hashtable->nitems = 0;
- hashtable->stash_size = 0;
- hashtable->string_pool = NULL;
- hashtable->string_pool_size = 0;
- hashtable->string_pool_capacity = 0;
- hashtable->string_pool_chars = 0;
+ hashtable->used = 0;
+ hashtable->stash.size = 0;
+ hashtable->pool.buffer = NULL;
+ hashtable->pool.size = 0;
+ hashtable->pool.capacity = 0;
+ hashtable->pool.used = 0;
+ hashtable->stats.max_stash_size = 0;
+ hashtable->stats.write_count = 0;
+ hashtable->stats.add_count = 0;
+ hashtable->stats.cuckoo_moved = 0;
+ hashtable->stats.stash_added= 0;
+ hashtable->stats.pool_compact_count = 0;
+ hashtable->stats.pool_realloc_count = 0;
+ hashtable->stats.rehash_count = 0;
}
return hashtable;
}
@@ -805,17 +887,23 @@ int inthash_created(inthash hashtable) {
}
void inthash_value_is_malloc(inthash hashtable, int flag) {
- hashtable->flag_valueismalloc = flag;
+ if (flag) {
+ if (hashtable->custom.free_handler == NULL) {
+ hashtable->custom.free_handler = inthash_default_free_handler;
+ }
+ } else {
+ hashtable->custom.free_handler = NULL;
+ }
}
void inthash_value_set_free_handler(inthash hashtable,
t_inthash_freehandler free_handler) {
- hashtable->free_handler = free_handler;
+ hashtable->custom.free_handler = free_handler;
}
size_t inthash_nitems(inthash hashtable) {
if (hashtable != NULL)
- return hashtable->nitems;
+ return hashtable->used;
return 0;
}
@@ -823,9 +911,22 @@ void inthash_delete(inthash *phashtable) {
if (phashtable != NULL) {
inthash hashtable = *phashtable;
if (hashtable != NULL) {
+ inthash_log(hashtable, "freeing table ; "
+ "writes=%d (new=%d) moved=%d stashed=%d collision-room=%d "
+ "avg-moved=%g rehash=%d pool-compact=%d pool-realloc=%d",
+ (int) hashtable->stats.write_count,
+ (int) hashtable->stats.add_count,
+ (int) hashtable->stats.cuckoo_moved,
+ (int) hashtable->stats.stash_added,
+ (int) hashtable->stats.max_stash_size,
+ (double) hashtable->stats.cuckoo_moved / (double) hashtable->stats.add_count,
+ (int) hashtable->stats.rehash_count,
+ (int) hashtable->stats.pool_compact_count,
+ (int) hashtable->stats.pool_realloc_count
+ );
if (hashtable->items != NULL) {
/* we need to delete values */
- const size_t hash_size = POW2(hashtable->hash_size_power);
+ const size_t hash_size = POW2(hashtable->lg_size);
size_t i;
/* wipe hashtable values (not names) */
@@ -836,15 +937,15 @@ void inthash_delete(inthash *phashtable) {
}
/* wipe auxiliary stash values (not names) if any */
- for(i = 0 ; i < hashtable->stash_size ; i++) {
- inthash_del_value_(hashtable, &hashtable->stash[i].value);
+ for(i = 0 ; i < hashtable->stash.size ; i++) {
+ inthash_del_value_(hashtable, &hashtable->stash.items[i].value);
}
}
/* wipe top-level */
- hashtable->hash_size_power = 0;
- hashtable->nitems = 0;
- free(hashtable->string_pool);
- hashtable->string_pool = NULL;
+ hashtable->lg_size = 0;
+ hashtable->used = 0;
+ free(hashtable->pool.buffer);
+ hashtable->pool.buffer = NULL;
free(hashtable->items);
hashtable->items = NULL;
free(hashtable);
@@ -864,7 +965,7 @@ struct_inthash_enum inthash_enum_new(inthash hashtable) {
}
inthash_item *inthash_enum_next(struct_inthash_enum * e) {
- const size_t hash_size = POW2(e->table->hash_size_power);
+ const size_t hash_size = POW2(e->table->lg_size);
for( ; e->index < hash_size
&& inthash_is_free(e->table, e->index) ; e->index++) ;
/* enumerate all table */
@@ -874,9 +975,9 @@ inthash_item *inthash_enum_next(struct_inthash_enum * e) {
return next;
}
/* enumerate stash if present */
- else if (e->index < hash_size + e->table->stash_size) {
+ else if (e->index < hash_size + e->table->stash.size) {
const size_t index = e->index - hash_size;
- inthash_item *const next = &e->table->stash[index];
+ inthash_item *const next = &e->table->stash.items[index];
e->index++;
return next;
}
diff --git a/src/htsinthash.h b/src/htsinthash.h
index 1178e46..b448840 100644
--- a/src/htsinthash.h
+++ b/src/htsinthash.h
@@ -40,7 +40,7 @@ Please visit our Website: http://www.httrack.com
*
* Implementation notes:
* Implementation is auto-rehashable, and uses cuckoo hashing of size 2**n
- * with a LCG hash function, with one additional auxiliary hash function.
+ * with a FNV hash function, with one additional auxiliary hash function.
* It also uses a small stash area to handle rare cases of collisions.
* Enumeration of all key/values is possible, deletion is also possible, but
* currently without any auto-shrinking (ie. table will never shrink).
@@ -49,8 +49,8 @@ Please visit our Website: http://www.httrack.com
*
* References:
* Cuckoo Hashing http://en.wikipedia.org/wiki/Cuckoo_hashing
- * LCG http://en.wikipedia.org/wiki/Linear_congruential_generator
* Cuckoo Stash http://research.microsoft.com/pubs/73856/stash-full.9-30.pdf
+ * FNV http://www.isthe.com/chongo/tech/comp/fnv/
**/
#ifndef HTSINTHASH_DEFH
@@ -59,6 +59,8 @@ Please visit our Website: http://www.httrack.com
/* Includes */
#ifdef _WIN32
#include <stddef.h>
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
#elif (defined(SOLARIS) || defined(sun) || defined(HAVE_INTTYPES_H) \
|| defined(BSD) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD_kernel__))
#include <inttypes.h>
@@ -84,7 +86,7 @@ typedef struct inthash_item inthash_item;
#endif
/* Hash key (32-bit) */
-typedef unsigned int inthash_key;
+typedef uint32_t inthash_key;
/* Pair of hashes */
typedef struct inthash_keys {
@@ -118,7 +120,6 @@ typedef void (*t_inthash_freehandler) (void *value);
#define HTS_DEF_FWSTRUCT_struct_inthash
typedef struct struct_inthash struct_inthash, *inthash;
#endif
-#define STASH_SIZE 16
/** Hashtable enumeration (opaque structure). **/
#ifndef HTS_DEF_FWSTRUCT_struct_inthash_enum