hashtables: use fnv-1 for hashing

author: Xavier Roche <xroche@users.noreply.github.com> 2013-06-25 19:10:40 +0000
committer: Xavier Roche <xroche@users.noreply.github.com> 2013-06-25 19:10:40 +0000
commit: 9c3e5b43bdf982348a5d5ec84dd2c6fd9b990331 (patch)
tree: 94ebe3cdc774db3537842446fc77515a20bf4d45
parent: 9090ee17b364f44a8df75f927b26c6e352ef9fff (diff)
3 files changed, 277 insertions, 176 deletions
diff --git a/src/htshash.c b/src/htshash.c
index 2d12941..1ac20ee 100644
--- a/src/htshash.c
+++ b/src/htshash.c
@@ -143,7 +143,6 @@ int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
 void hash_write(hash_struct * hash, int lpos, int normalized) {
   char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
   char catbuff[CATBUFF_SIZE];
-  const char *normfil;
   const char *name;
 
   /* first entry: destination filename (lowercased) */
diff --git a/src/htsinthash.c b/src/htsinthash.c
index d82a6d7..c8bf836 100644
--- a/src/htsinthash.c
+++ b/src/htsinthash.c
@@ -43,27 +43,80 @@ Please visit our Website: http://www.httrack.com
 
 #include "htsinthash.h"
 
+/* We may use md5 as hashing function for its quality regarding diffusion and
+   collisions. MD5 is slower than other hashing functions, but is known to be
+   an excellent hashing function. FNV-1 is however generally good enough for
+   this purpose, too.
+*/
+#if 0
 #define USES_MD5
-
+#endif
 #ifdef USES_MD5
 #include "htsmd5.h"
 #endif
 
+#define STASH_SIZE 16
+
 /** Hashtable. **/
 struct struct_inthash {
+  /** Hashtable items. **/
   inthash_item *items;
-  size_t nitems;
-  t_inthash_freehandler free_handler;
-  size_t hash_size_power;
-  inthash_item stash[STASH_SIZE];
-  size_t stash_size;
-  char *string_pool;
-  size_t string_pool_size;
-  size_t string_pool_capacity;
-  size_t string_pool_chars;
-  unsigned char flag_valueismalloc;
+
+  /** Log-2 of the hashtable size. **/
+  size_t lg_size;
+
+  /** Number of used items (<= POW2(lg_size)). **/
+  size_t used;
+
+  /** Stash area for collisions. **/
+  struct {
+    /** Stash items. **/
+    inthash_item items[STASH_SIZE];
+
+    /** Stash size (<= stash.size). **/
+    size_t size;
+  } stash;
+
+  /** String pool. **/
+  struct {
+    /** String buffer. **/
+    char *buffer;
+    /** Buffer used size (high watermark). **/
+    size_t size;
+    /** Buffer capacity. **/
+    size_t capacity;
+    /** Used chars (== size if compacted). **/
+    size_t used;
+  } pool;
+
+  /** Statistics **/
+  struct {
+    /** Highest stash.size seen. **/
+    size_t max_stash_size;
+    /** Number of writes. **/
+    size_t write_count;
+    /** Number of writes causing an add. **/
+    size_t add_count;
+    /** Number of cuckoo moved during adds. **/
+    size_t cuckoo_moved;
+    /** Number of items added to stash. **/
+    size_t stash_added;
+    /** Number of hashtable rehash/expand operations. **/
+    size_t rehash_count;
+    /** Number of pool compact operations. **/
+    size_t pool_compact_count;
+    /** Number of pool compact operations. **/
+    size_t pool_realloc_count;
+  } stats;
+
+  /** Settings. **/
+  struct {
+    /** Items value de-allocator. (may be NULL) **/
+    t_inthash_freehandler free_handler;
+  } custom;
 };
 
+/* Using httrack code */
 #ifdef LIBHTTRACK_EXPORTS
 #include "htsbase.h"
 #define inthash_assert assertf
@@ -77,76 +130,76 @@ static void inthash_fail(const char* exp, const char* file, int line) {
 #define HTS_INLINE
 #endif
 
-/* Numerical Recipes function. */
-#define HASH_PRIME ( 1664525 )
-#define HASH_CONST ( 1013904223 )
-#define HASH_ADD(HASH, C) do {                  \
-    (HASH) *= HASH_PRIME;                       \
-    (HASH) += HASH_CONST;                       \
-    (HASH) ^= ( (C) << 24);                     \
-  } while(0)
-
 /* 2**X */
 #define POW2(X) ( (size_t) 1 << (X) )
 
-static void inthash_log(inthash hashtable,
-                        const char *format, ...)
+/* Logging */
+static void inthash_log(const inthash hashtable, const char *format, ...)
                         HTS_PRINTF_FUN(2, 3) {
   va_list args;
   inthash_assert(format != NULL);
-  va_start(args, format);
   fprintf(stderr, "[%p] ", (void*) hashtable);
+  va_start(args, format);
   (void) vfprintf(stderr, format, args);
-  putc('\n', stderr);
   va_end(args);
+  putc('\n', stderr);
 }
 
-#ifndef USES_MD5
-static unsigned int rev32(unsigned int v) {
-  /* swap odd and even bits */
-  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-  /* swap consecutive pairs */
-  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-  /* swap nibbles ...  */
-  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-  /* swap bytes */
-  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-  /* swap 2-byte long pairs */
-  v = ( v >> 16             ) | ( v               << 16);
-  return v;
-}
-#endif
-
 static inthash_keys inthash_calc_hashes(const char *value) {
 #ifdef USES_MD5
+  /* compute a regular MD5 and extract two 32-bit integers */
   union {
     char md5digest[16];
     inthash_keys hashes;
   } u;
+
+  /* compute MD5 */
   domd5mem(value, strlen(value), u.md5digest, 0);
+
+  /* do not keep identical hashes */
+  if (u.hashes.hash1 == u.hashes.hash2) {
+    hashes.hash2 = ~hashes.hash2;
+  }
+
   return u.hashes;
 #else
-  /* compute two lcg hashes using different seeds */
-  inthash_keys hash = { 0, ~0 };
+  /* compute two Fowler–Noll–Vo hashes (64-bit FNV-1 variant) ;
+     each 64-bit hash being XOR-folded into a single 32-bit hash. */
   size_t i;
+  inthash_keys hashes;
+  uint64_t h1, h2;
+
+  /* FNV-1, 64-bit. */
+#define FNV1_PRIME 1099511628211
+#define FNV1_OFFSET_BASIS 14695981039346656037
+
+  /* compute the hashes ; second variant is using xored data */
+  h1 = (uint64_t) FNV1_OFFSET_BASIS;
+  h2 = (uint64_t) ~FNV1_OFFSET_BASIS;
   for(i = 0 ; value[i] != '\0' ; i++) {
     const unsigned char c1 = value[i];
-    const unsigned char c2 = 0xff - c1;
-    HASH_ADD(hash.hash1, c1);
-    HASH_ADD(hash.hash2, c2);
-  }
-  /* yes, it may happen */
-  if (hash.hash1 == hash.hash2) {
-    hash.hash2 = ~hash.hash2;
-  }
-  /* reverse bits: "higher-order bits have longer periods than the lower 
-     order bits" (Wikipedia) */
-  hash.hash1 ^= rev32(hash.hash1);
-  hash.hash2 ^= rev32(hash.hash2);
-  return hash;
+    const unsigned char c2 = ~c1;
+    h1 = ( h1 * FNV1_PRIME ) ^ c1;
+    h2 = ( h2 * FNV1_PRIME ) ^ c2;
+  }
+
+  /* XOR-folding to improve diffusion (Wikipedia) */
+  hashes.hash1 = ( (uint32_t) h1 ^ (uint32_t) ( h1 >> 32 ) );
+  hashes.hash2 = ( (uint32_t) h2 ^ (uint32_t) ( h2 >> 32 ) );
+
+#undef FNV1_PRIME
+#undef FNV1_OFFSET_BASIS
+
+  /* do not keep identical hashes */
+  if (hashes.hash1 == hashes.hash2) {
+    hashes.hash2 = ~hashes.hash2;
+  }
+
+  return hashes;
 #endif
 }
 
+/* position 'pos' is free ? */
 static HTS_INLINE int inthash_is_free(const inthash hashtable, size_t pos) {
   return hashtable->items[pos].name == NULL;
 }
@@ -169,17 +222,21 @@ static HTS_INLINE int inthash_matches(const inthash hashtable, size_t pos,
 
 /* realloc (expand) string pool ; does not change the compacity */
 static void inthash_realloc_pool(inthash hashtable) {
-  const size_t hash_size = POW2(hashtable->hash_size_power);
-  char *const oldbase = hashtable->string_pool;
+  const size_t hash_size = POW2(hashtable->lg_size);
+  char *const oldbase = hashtable->pool.buffer;
   size_t i;
   int count = 0;
 
-  hashtable->string_pool = realloc(hashtable->string_pool,
-    hashtable->string_pool_capacity);
-  if (hashtable->string_pool == NULL) {
+  /* statistics */
+  hashtable->stats.pool_realloc_count++;
+
+  /* realloc */
+  hashtable->pool.buffer = realloc(hashtable->pool.buffer,
+    hashtable->pool.capacity);
+  if (hashtable->pool.buffer == NULL) {
     fprintf(stderr, 
       "** hashtable string pool allocation error: could not allocate %u bytes", 
-      (int) hashtable->string_pool_capacity);
+      (int) hashtable->pool.capacity);
     inthash_assert(! "hashtable string pool allocation error");
   }
 
@@ -187,7 +244,7 @@ static void inthash_realloc_pool(inthash hashtable) {
 #define RECOMPUTE_STRING(S) do {                                   \
     if (S != NULL) {                                               \
       const size_t offset = (S) - oldbase;                         \
-      hashtable->items[i].name = &hashtable->string_pool[offset];  \
+      hashtable->items[i].name = &hashtable->pool.buffer[offset];  \
       count++;                                                     \
     }                                                              \
 } while(0)
@@ -196,31 +253,35 @@ static void inthash_realloc_pool(inthash hashtable) {
   for(i = 0 ; i < hash_size ; i++) {
     RECOMPUTE_STRING(hashtable->items[i].name);
   }
-  for(i = 0 ; i < hashtable->stash_size ; i++) {
-    RECOMPUTE_STRING(hashtable->stash[i].name);
+  for(i = 0 ; i < hashtable->stash.size ; i++) {
+    RECOMPUTE_STRING(hashtable->stash.items[i].name);
   }
 
 #undef RECOMPUTE_STRING
 
   inthash_log(hashtable, "reallocated string pool for %d strings: %d bytes",
-              count, (int) hashtable->string_pool_capacity);
+              count, (int) hashtable->pool.capacity);
 }
 
 /* compact string pool ; does not change the capacity */
 static void inthash_compact_pool(inthash hashtable) {
-  const size_t hash_size = POW2(hashtable->hash_size_power);
+  const size_t hash_size = POW2(hashtable->lg_size);
   size_t i;
-  char* old_pool = hashtable->string_pool;
-  const size_t old_size = hashtable->string_pool_size;
+  char* old_pool = hashtable->pool.buffer;
+  const size_t old_size = hashtable->pool.size;
   int count = 0;
 
-  hashtable->string_pool = malloc(hashtable->string_pool_capacity);
-  hashtable->string_pool_size = 0;
-  hashtable->string_pool_chars = 0;
-  if (hashtable->string_pool == NULL) {
+  /* statistics */
+  hashtable->stats.pool_compact_count++;
+
+  /* realloc */
+  hashtable->pool.buffer = malloc(hashtable->pool.capacity);
+  hashtable->pool.size = 0;
+  hashtable->pool.used = 0;
+  if (hashtable->pool.buffer == NULL) {
     fprintf(stderr, 
       "** hashtable string pool compaction error: could not allocate %u bytes", 
-      (int) hashtable->string_pool_capacity);
+      (int) hashtable->pool.capacity);
     inthash_assert(! "hashtable string pool compaction error");
   }
 
@@ -228,12 +289,12 @@ static void inthash_compact_pool(inthash hashtable) {
 #define RELOCATE_STRING(S) do {                                    \
     if (S != NULL) {                                               \
       const size_t len = strlen(S) + 1;                            \
-      inthash_assert(hashtable->string_pool_size + len             \
-        <= hashtable->string_pool_capacity);                       \
-      memcpy(&hashtable->string_pool[hashtable->string_pool_size], \
+      inthash_assert(hashtable->pool.size + len             \
+        <= hashtable->pool.capacity);                       \
+      memcpy(&hashtable->pool.buffer[hashtable->pool.size], \
         S, len);                                                   \
-      S = &hashtable->string_pool[hashtable->string_pool_size];    \
-      hashtable->string_pool_size += len;                          \
+      S = &hashtable->pool.buffer[hashtable->pool.size];    \
+      hashtable->pool.size += len;                          \
       count++;                                                     \
     }                                                              \
 } while(0)
@@ -242,40 +303,40 @@ static void inthash_compact_pool(inthash hashtable) {
   for(i = 0 ; i < hash_size ; i++) {
     RELOCATE_STRING(hashtable->items[i].name);
   }
-  for(i = 0 ; i < hashtable->stash_size ; i++) {
-    RELOCATE_STRING(hashtable->stash[i].name);
+  for(i = 0 ; i < hashtable->stash.size ; i++) {
+    RELOCATE_STRING(hashtable->stash.items[i].name);
   }
 
 #undef RELOCATE_STRING
 
   /* compacted (used chars == current size) */
-  hashtable->string_pool_chars = hashtable->string_pool_size;
+  hashtable->pool.used = hashtable->pool.size;
 
   /* wipe previous pool */
   free(old_pool);
 
   inthash_log(hashtable,
               "compacted string pool for %d strings: %d bytes => %d bytes",
-              count, (int) old_size, (int) hashtable->string_pool_size);
+              count, (int) old_size, (int) hashtable->pool.size);
 }
 
 static char* inthash_dup_name(inthash hashtable, const char *name) {
   const size_t len = strlen(name) + 1;
   char *s;
 
-  if (hashtable->string_pool_capacity - hashtable->string_pool_size < len) {
-    if (hashtable->string_pool_capacity == 0) {
-      hashtable->string_pool_capacity = 16;
+  if (hashtable->pool.capacity - hashtable->pool.size < len) {
+    if (hashtable->pool.capacity == 0) {
+      hashtable->pool.capacity = 16;
     }
-    for( ; hashtable->string_pool_capacity - hashtable->string_pool_size < len
-      ; hashtable->string_pool_capacity <<= 1) ;
+    for( ; hashtable->pool.capacity - hashtable->pool.size < len
+      ; hashtable->pool.capacity <<= 1) ;
     inthash_realloc_pool(hashtable);
   }
 
-  s = &hashtable->string_pool[hashtable->string_pool_size];
+  s = &hashtable->pool.buffer[hashtable->pool.size];
   memcpy(s, name, len);
-  hashtable->string_pool_size += len;
-  hashtable->string_pool_chars += len;
+  hashtable->pool.size += len;
+  hashtable->pool.used += len;
 
   return s;
 }
@@ -283,21 +344,21 @@ static char* inthash_dup_name(inthash hashtable, const char *name) {
 /* note: pointer must have been kicked from the pool first */
 static void inthash_free_name(inthash hashtable, char *name) {
   const size_t len = strlen(name) + 1;
-  hashtable->string_pool_chars -= len;
-  if (hashtable->string_pool_chars < hashtable->string_pool_size / 2) {
+  hashtable->pool.used -= len;
+  if (hashtable->pool.used < hashtable->pool.size / 2) {
     inthash_compact_pool(hashtable);
   }
 }
 
-static HTS_INLINE size_t inthash_hash_to_pos_(size_t hash_size_power,
+static HTS_INLINE size_t inthash_hash_to_pos_(size_t lg_size,
                                               inthash_key hash) {
-  const inthash_key mask = POW2(hash_size_power) - 1;
+  const inthash_key mask = POW2(lg_size) - 1;
   return hash & mask;
 }
 
 static HTS_INLINE size_t inthash_hash_to_pos(const inthash hashtable,
                                              inthash_key hash) {
-  return inthash_hash_to_pos_(hashtable->hash_size_power, hash);
+  return inthash_hash_to_pos_(hashtable->lg_size, hash);
 }
 
 int inthash_read_pvoid(inthash hashtable, const char *name, void **pvalue) {
@@ -336,12 +397,8 @@ static void inthash_default_free_handler(void *value) {
 }
 
 static void inthash_del_value_(inthash hashtable, inthash_value *pvalue) {
-  if (hashtable->flag_valueismalloc) {
-    if (hashtable->free_handler)
-      hashtable->free_handler(pvalue->ptr);
-    else
-      inthash_default_free_handler(pvalue->ptr);
-  }
+  if (hashtable->custom.free_handler != NULL)
+    hashtable->custom.free_handler(pvalue->ptr);
   pvalue->ptr = NULL;
 }
 
@@ -371,6 +428,9 @@ static int inthash_write_value_(inthash hashtable, const char *name,
   inthash_key cuckoo_hash, initial_cuckoo_hash;
   size_t loops;
 
+  /* Statistics */
+  hashtable->stats.write_count++;
+
   /* replace at position 1 ? */
   pos = inthash_hash_to_pos(hashtable, hashes.hash1);
   if (inthash_matches(hashtable, pos, name, &hashes)) {
@@ -388,17 +448,20 @@ static int inthash_write_value_(inthash hashtable, const char *name,
   }
 
   /* replace in the stash ? */
-  if (hashtable->stash_size != 0) {
+  if (hashtable->stash.size != 0) {
     size_t i;
-    for(i = 0 ; i < hashtable->stash_size ; i++) {
-      if (inthash_matches_(&hashtable->stash[i], name, &hashes)) {
-        inthash_del_value_(hashtable, &hashtable->stash[i].value);
-        hashtable->stash[i].value = value;
+    for(i = 0 ; i < hashtable->stash.size ; i++) {
+      if (inthash_matches_(&hashtable->stash.items[i], name, &hashes)) {
+        inthash_del_value_(hashtable, &hashtable->stash.items[i].value);
+        hashtable->stash.items[i].value = value;
         return 0;  /* replaced */
       }
     }
   }
 
+  /* Statistics */
+  hashtable->stats.add_count++;
+
   /* otherwise we need to create a new item */
   item.name = inthash_dup_name(hashtable, name);
   item.value = value;
@@ -425,7 +488,7 @@ static int inthash_write_value_(inthash hashtable, const char *name,
   }
 
   /* put 'item' in place with hash 'cuckoo_hash' */
-  for(loops = POW2(hashtable->hash_size_power) ; loops != 0 ; --loops) {
+  for(loops = POW2(hashtable->lg_size) ; loops != 0 ; --loops) {
     const size_t pos = inthash_hash_to_pos(hashtable, cuckoo_hash);
 
     inthash_log(hashtable, "\tdebug:placing cuckoo '%s' at %d (%x)", 
@@ -439,9 +502,13 @@ static int inthash_write_value_(inthash hashtable, const char *name,
     }
     /* then cuckoo's place it is */
     else {
+      /* replace */
       const inthash_item backup_item = hashtable->items[pos];
       hashtable->items[pos] = item;
 
+      /* statistics */
+      hashtable->stats.cuckoo_moved++;
+
       /* take care of new lost item */
       item = backup_item;
 
@@ -471,11 +538,16 @@ static int inthash_write_value_(inthash hashtable, const char *name,
   }
 
   /* emergency stashing */
-  if (hashtable->stash_size < STASH_SIZE) {
-    hashtable->stash[hashtable->stash_size] = item;
-    hashtable->stash_size++;
+  if (hashtable->stash.size < STASH_SIZE) {
+    hashtable->stash.items[hashtable->stash.size] = item;
+    hashtable->stash.size++;
+    /* for statistics */
+    hashtable->stats.stash_added++;
+    if (hashtable->stash.size > hashtable->stats.max_stash_size) {
+      hashtable->stats.max_stash_size = hashtable->stash.size;
+    }
     inthash_log(hashtable, "used stash because of collision (%d entries)",
-                (int) hashtable->stash_size);
+                (int) hashtable->stash.size);
     inthash_log(hashtable, "debug:stash position");
     return 1; /* added */
   } else {
@@ -495,26 +567,29 @@ int inthash_write_value(inthash hashtable, const char *name,
   /* added ? */
   if (ret) {
     /* size of half of the table */
-    const size_t half_size = POW2(hashtable->hash_size_power - 1);
+    const size_t half_size = POW2(hashtable->lg_size - 1);
 
     /* item was added: increase count */
-    hashtable->nitems++;
+    hashtable->used++;
 
     /* table is more than half-full */
-    if (hashtable->nitems >= half_size) {
+    if (hashtable->used >= half_size) {
       size_t i;
       char *data;
 
       /* size before  */
-      const size_t prev_power = hashtable->hash_size_power;
+      const size_t prev_power = hashtable->lg_size;
       const size_t prev_size = half_size * 2;
       const size_t prev_alloc_size = prev_size*sizeof(inthash_item);
 
       /* size after doubling it */
       const size_t alloc_size = prev_alloc_size * 2;
 
+      /* statistics */
+      hashtable->stats.rehash_count++;
+
       /* realloc */
-      hashtable->hash_size_power++;
+      hashtable->lg_size++;
       hashtable->items = 
         (inthash_item *) realloc(hashtable->items, alloc_size);
       if (hashtable->items == NULL) {
@@ -559,17 +634,17 @@ int inthash_write_value(inthash hashtable, const char *name,
       }
 
       inthash_log(hashtable, "expanded hashtable to %d elements",
-                  (int) POW2(hashtable->hash_size_power));
+                  (int) POW2(hashtable->lg_size));
 
       /* attempt to merge the stash if present */
-      if (hashtable->stash_size != 0) {
-        const size_t old_size = hashtable->stash_size;
+      if (hashtable->stash.size != 0) {
+        const size_t old_size = hashtable->stash.size;
         size_t i;
 
         /* backup stash and reset it */
         inthash_item stash[STASH_SIZE];
-        memcpy(&stash, hashtable->stash, sizeof(hashtable->stash));
-        hashtable->stash_size = 0;
+        memcpy(&stash, hashtable->stash.items, sizeof(hashtable->stash.items));
+        hashtable->stash.size = 0;
 
         /* insert all items */
         for(i = 0 ; i < old_size ; i++) {
@@ -581,18 +656,18 @@ int inthash_write_value(inthash hashtable, const char *name,
         }
 
         /* delete old names (not values) */
-        for(i = 0 ; i < hashtable->stash_size ; i++) {
+        for(i = 0 ; i < hashtable->stash.size ; i++) {
           inthash_del_name(hashtable, &stash[i]);
         }
 
         /* logging */
-        inthash_assert(hashtable->stash_size <= old_size);
-        if (hashtable->stash_size < old_size) {
+        inthash_assert(hashtable->stash.size <= old_size);
+        if (hashtable->stash.size < old_size) {
           inthash_log(hashtable, "reduced stash size from %d to %d",
-                      (int) old_size, (int) hashtable->stash_size);
+                      (int) old_size, (int) hashtable->stash.size);
         } else {
           inthash_log(hashtable, "stash has still %d elements",
-                      (int) hashtable->stash_size);
+                      (int) hashtable->stash.size);
         }
       }
 
@@ -637,11 +712,11 @@ static inthash_value* inthash_read_value_(inthash hashtable,
   }
 
   /* find in stash ? */
-  if (hashtable->stash_size != 0) {
+  if (hashtable->stash.size != 0) {
     size_t i;
-    for(i = 0 ; i < hashtable->stash_size ; i++) {
-      if (inthash_matches_(&hashtable->stash[i], name, &hashes)) {
-        return &hashtable->stash[i].value;
+    for(i = 0 ; i < hashtable->stash.size ; i++) {
+      if (inthash_matches_(&hashtable->stash.items[i], name, &hashes)) {
+        return &hashtable->stash.items[i].value;
       }
     }
   }
@@ -709,18 +784,18 @@ static int inthash_remove_(inthash hashtable, const char *name,
   }
 
   /* find in stash ? */
-  if (hashtable->stash_size != 0) {
+  if (hashtable->stash.size != 0) {
     size_t i;
-    for(i = 0 ; i < hashtable->stash_size ; i++) {
-      if (inthash_matches_(&hashtable->stash[i], name, hashes)) {
-        inthash_del_item(hashtable, &hashtable->stash[i]);
-        for( ; i + 1 < hashtable->stash_size ; i++) {
-          hashtable->stash[i] = hashtable->stash[i + 1];
+    for(i = 0 ; i < hashtable->stash.size ; i++) {
+      if (inthash_matches_(&hashtable->stash.items[i], name, hashes)) {
+        inthash_del_item(hashtable, &hashtable->stash.items[i]);
+        for( ; i + 1 < hashtable->stash.size ; i++) {
+          hashtable->stash.items[i] = hashtable->stash.items[i + 1];
         }
-        hashtable->stash_size--;
+        hashtable->stash.size--;
         *removed = (size_t) -1;
         inthash_log(hashtable, "debug:deleted item in stash (%d entries)",
-          (int) hashtable->stash_size);
+          (int) hashtable->stash.size);
         return 1;
       }
     }
@@ -737,29 +812,29 @@ int inthash_remove(inthash hashtable, const char *name) {
 
   if (ret) {
     /* item was removed: decrease count */
-    inthash_assert(hashtable->nitems != 0);
-    hashtable->nitems--;
+    inthash_assert(hashtable->used != 0);
+    hashtable->used--;
 
     /* can we place stash entry back to the table ? */
-    if (hashtable->stash_size != 0 && removed != (size_t) -1) {
+    if (hashtable->stash.size != 0 && removed != (size_t) -1) {
       size_t i;
-      for(i = 0 ; i < hashtable->stash_size ; i++) {
+      for(i = 0 ; i < hashtable->stash.size ; i++) {
         const size_t pos1 =
-          inthash_hash_to_pos(hashtable, hashtable->stash[i].hashes.hash1);
+          inthash_hash_to_pos(hashtable, hashtable->stash.items[i].hashes.hash1);
         const size_t pos2 =
-          inthash_hash_to_pos(hashtable, hashtable->stash[i].hashes.hash2);
+          inthash_hash_to_pos(hashtable, hashtable->stash.items[i].hashes.hash2);
         if (pos1 == removed || pos2 == removed) {
           if (pos1 == removed) {
-            hashtable->items[pos1] = hashtable->stash[i];
+            hashtable->items[pos1] = hashtable->stash.items[i];
           } else if (pos2 == removed) {
-            hashtable->items[pos2] = hashtable->stash[i];
+            hashtable->items[pos2] = hashtable->stash.items[i];
           }
-          for( ; i + 1 < hashtable->stash_size ; i++) {
-            hashtable->stash[i] = hashtable->stash[i + 1];
+          for( ; i + 1 < hashtable->stash.size ; i++) {
+            hashtable->stash.items[i] = hashtable->stash.items[i + 1];
           }
-          hashtable->stash_size--;
+          hashtable->stash.size--;
           inthash_log(hashtable, "debug:moved item from stash (%d entries)",
-            (int) hashtable->stash_size);
+            (int) hashtable->stash.size);
           break;
         }
       }
@@ -785,17 +860,24 @@ inthash inthash_new(size_t initial_size) {
   if (hashtable) {
     size_t size;
     for(size = 8 ; POW2(size) < initial_size ; size++) ;
-    hashtable->flag_valueismalloc = 0;
     if ((hashtable->items =
          (inthash_item *) calloc(POW2(size), sizeof(inthash_item)))) {
-      hashtable->hash_size_power = size;
+      hashtable->lg_size = size;
     }
-    hashtable->nitems = 0;
-    hashtable->stash_size = 0;
-    hashtable->string_pool = NULL;
-    hashtable->string_pool_size = 0;
-    hashtable->string_pool_capacity = 0;
-    hashtable->string_pool_chars = 0;
+    hashtable->used = 0;
+    hashtable->stash.size = 0;
+    hashtable->pool.buffer = NULL;
+    hashtable->pool.size = 0;
+    hashtable->pool.capacity = 0;
+    hashtable->pool.used = 0;
+    hashtable->stats.max_stash_size = 0;
+    hashtable->stats.write_count = 0;
+    hashtable->stats.add_count = 0;
+    hashtable->stats.cuckoo_moved = 0;
+    hashtable->stats.stash_added= 0;
+    hashtable->stats.pool_compact_count = 0;
+    hashtable->stats.pool_realloc_count = 0;
+    hashtable->stats.rehash_count = 0;
   }
   return hashtable;
 }
@@ -805,17 +887,23 @@ int inthash_created(inthash hashtable) {
 }
 
 void inthash_value_is_malloc(inthash hashtable, int flag) {
-  hashtable->flag_valueismalloc = flag;
+  if (flag) {
+    if (hashtable->custom.free_handler == NULL) {
+      hashtable->custom.free_handler = inthash_default_free_handler;
+    }
+  } else {
+    hashtable->custom.free_handler = NULL;
+  }
 }
 
 void inthash_value_set_free_handler(inthash hashtable,
                                     t_inthash_freehandler free_handler) {
-  hashtable->free_handler = free_handler;
+  hashtable->custom.free_handler = free_handler;
 }
 
 size_t inthash_nitems(inthash hashtable) {
   if (hashtable != NULL)
-    return hashtable->nitems;
+    return hashtable->used;
   return 0;
 }
 
@@ -823,9 +911,22 @@ void inthash_delete(inthash *phashtable) {
   if (phashtable != NULL) {
     inthash hashtable = *phashtable;
     if (hashtable != NULL) {
+      inthash_log(hashtable, "freeing table ; "
+                  "writes=%d (new=%d) moved=%d stashed=%d collision-room=%d "
+                  "avg-moved=%g rehash=%d pool-compact=%d pool-realloc=%d",
+                  (int) hashtable->stats.write_count,
+                  (int) hashtable->stats.add_count,
+                  (int) hashtable->stats.cuckoo_moved,
+                  (int) hashtable->stats.stash_added,
+                  (int) hashtable->stats.max_stash_size,
+                  (double) hashtable->stats.cuckoo_moved / (double) hashtable->stats.add_count,
+                  (int) hashtable->stats.rehash_count,
+                  (int) hashtable->stats.pool_compact_count,
+                  (int) hashtable->stats.pool_realloc_count
+                  );
       if (hashtable->items != NULL) {
         /* we need to delete values */
-        const size_t hash_size = POW2(hashtable->hash_size_power);
+        const size_t hash_size = POW2(hashtable->lg_size);
         size_t i;
 
         /* wipe hashtable values (not names) */
@@ -836,15 +937,15 @@ void inthash_delete(inthash *phashtable) {
         }
 
         /* wipe auxiliary stash values (not names) if any */
-        for(i = 0 ; i < hashtable->stash_size ; i++) {
-          inthash_del_value_(hashtable, &hashtable->stash[i].value);
+        for(i = 0 ; i < hashtable->stash.size ; i++) {
+          inthash_del_value_(hashtable, &hashtable->stash.items[i].value);
         }
       }
       /* wipe top-level */
-      hashtable->hash_size_power = 0;
-      hashtable->nitems = 0;
-      free(hashtable->string_pool);
-      hashtable->string_pool = NULL;
+      hashtable->lg_size = 0;
+      hashtable->used = 0;
+      free(hashtable->pool.buffer);
+      hashtable->pool.buffer = NULL;
       free(hashtable->items);
       hashtable->items = NULL;
       free(hashtable);
@@ -864,7 +965,7 @@ struct_inthash_enum inthash_enum_new(inthash hashtable) {
 }
 
 inthash_item *inthash_enum_next(struct_inthash_enum * e) {
-  const size_t hash_size = POW2(e->table->hash_size_power);
+  const size_t hash_size = POW2(e->table->lg_size);
   for( ; e->index < hash_size 
     && inthash_is_free(e->table, e->index) ; e->index++) ;
   /* enumerate all table */
@@ -874,9 +975,9 @@ inthash_item *inthash_enum_next(struct_inthash_enum * e) {
     return next;
   }
   /* enumerate stash if present */
-  else if (e->index < hash_size + e->table->stash_size) {
+  else if (e->index < hash_size + e->table->stash.size) {
     const size_t index = e->index - hash_size;
-    inthash_item *const next = &e->table->stash[index];
+    inthash_item *const next = &e->table->stash.items[index];
     e->index++;
     return next;
   }
diff --git a/src/htsinthash.h b/src/htsinthash.h
index 1178e46..b448840 100644
--- a/src/htsinthash.h
+++ b/src/htsinthash.h
@@ -40,7 +40,7 @@ Please visit our Website: http://www.httrack.com
  *
  * Implementation notes:
  * Implementation is auto-rehashable, and uses cuckoo hashing of size 2**n
- * with a LCG hash function, with one additional auxiliary hash function.
+ * with a FNV hash function, with one additional auxiliary hash function.
  * It also uses a small stash area to handle rare cases of collisions.
  * Enumeration of all key/values is possible, deletion is also possible, but
  * currently without any auto-shrinking (ie. table will never shrink).
@@ -49,8 +49,8 @@ Please visit our Website: http://www.httrack.com
  *
  * References: 
  * Cuckoo Hashing http://en.wikipedia.org/wiki/Cuckoo_hashing
- * LCG http://en.wikipedia.org/wiki/Linear_congruential_generator
  * Cuckoo Stash http://research.microsoft.com/pubs/73856/stash-full.9-30.pdf
+ * FNV http://www.isthe.com/chongo/tech/comp/fnv/
  **/
 
 #ifndef HTSINTHASH_DEFH
@@ -59,6 +59,8 @@ Please visit our Website: http://www.httrack.com
 /* Includes */
 #ifdef _WIN32
 #include <stddef.h>
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
 #elif (defined(SOLARIS) || defined(sun) || defined(HAVE_INTTYPES_H) \
   || defined(BSD) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__FreeBSD_kernel__))
 #include <inttypes.h>
@@ -84,7 +86,7 @@ typedef struct inthash_item inthash_item;
 #endif
 
 /* Hash key (32-bit) */
-typedef unsigned int inthash_key;
+typedef uint32_t inthash_key;
 
 /* Pair of hashes */
 typedef struct inthash_keys {
@@ -118,7 +120,6 @@ typedef void (*t_inthash_freehandler) (void *value);
 #define HTS_DEF_FWSTRUCT_struct_inthash
 typedef struct struct_inthash struct_inthash, *inthash;
 #endif
-#define STASH_SIZE 16
 
 /** Hashtable enumeration (opaque structure). **/
 #ifndef HTS_DEF_FWSTRUCT_struct_inthash_enum
author	Xavier Roche <xroche@users.noreply.github.com>	2013-06-25 19:10:40 +0000
committer	Xavier Roche <xroche@users.noreply.github.com>	2013-06-25 19:10:40 +0000
commit	9c3e5b43bdf982348a5d5ec84dd2c6fd9b990331 (patch)
tree	94ebe3cdc774db3537842446fc77515a20bf4d45
parent	9090ee17b364f44a8df75f927b26c6e352ef9fff (diff)