diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-06-28 17:35:18 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-06-28 17:35:18 +0000 |
commit | 7d27b60e55ac39ceed52530b39ae9b6e9a92f708 (patch) | |
tree | e342a052c783597cd42af00dc1f56cb5e8474bf4 | |
parent | 4e0f29cc071a23132700d53dc80ba4bc67d0a877 (diff) |
Fixed spurious stash handling wrt the strings pool leading to corrupt the hashtable
Moved to MD5 after all
Cleaned up logging
-rw-r--r-- | src/htsinthash.c | 184 | ||||
-rw-r--r-- | src/htsinthash.h | 4 |
2 files changed, 114 insertions, 74 deletions
diff --git a/src/htsinthash.c b/src/htsinthash.c index afad344..6faeb34 100644 --- a/src/htsinthash.c +++ b/src/htsinthash.c @@ -43,15 +43,30 @@ Please visit our Website: http://www.httrack.com #include "htsinthash.h" -/* We may use md5 as hashing function for its quality regarding diffusion and +/* We use md5 as hashing function for its quality regarding diffusion and collisions. MD5 is slower than other hashing functions, but is known to be - an excellent hashing function. FNV-1 is however generally good enough for - this purpose, too. + an excellent hashing function. FNV-1 is generally good enough for this + purpose, too, but the performance gain is not sufficient to use it by + default. + + On several benchmarks, both MD5 and FNV were quite good (0.45 cuckoo moved + on average for each new item inserted in the hashtable), but FNV-1 was more + prone to mutual collisions (creating cycles requiring stash handling), and + was causing the stash area to be more filled than the MD5 variant. + + Simpler hashing functions, such as rolling hashes (LCG) were also tested, + but bith collision rate and diffusion were terrible. + + [ On a 10M key tests, both variants acheived 0.45 cuckoo/add ration, + but the FNV-1 variant collided 11 times with a maximum stash area + filled with 4 entries ; whereas the MD5 variant did only collide + once ] */ -#if 0 -#define USES_MD5 +#ifndef HTS_INTHASH_USES_MD5 +#define HTS_INTHASH_USES_MD5 1 #endif -#ifdef USES_MD5 + +#if HTS_INTHASH_USES_MD5 == 1 #include "htsmd5.h" #endif @@ -150,6 +165,27 @@ static void inthash_fail(const char* exp, const char* file, int line) { #define HTS_INLINE #endif +/* Logging level. */ +#if 0 +/* Verbose. */ +#define inthash_crit inthash_log +#define inthash_info inthash_log +#define inthash_debug inthash_log +#define inthash_trace inthash_log +#elif 0 +/* Info. */ +#define inthash_crit inthash_log +#define inthash_info inthash_log +#define inthash_debug inthash_log +#define inthash_trace inthash_nolog +#else +/* No logging except stats and critical. */ +#define inthash_crit inthash_log +#define inthash_info inthash_log +#define inthash_debug inthash_nolog +#define inthash_trace inthash_nolog +#endif + /* 2**X */ #define POW2(X) ( (size_t) 1 << (X) ) @@ -165,8 +201,13 @@ static void inthash_log(const inthash hashtable, const char *format, ...) putc('\n', stderr); } +/* No logging (should be dropped by the compiler) */ +static void inthash_nolog(const inthash hashtable, const char *format, ...) + HTS_PRINTF_FUN(2, 3) { +} + static inthash_keys inthash_calc_hashes(const char *value) { -#ifdef USES_MD5 +#if HTS_INTHASH_USES_MD5 == 1 /* compute a regular MD5 and extract two 32-bit integers */ union { char md5digest[16]; @@ -178,7 +219,7 @@ static inthash_keys inthash_calc_hashes(const char *value) { /* do not keep identical hashes */ if (u.hashes.hash1 == u.hashes.hash2) { - hashes.hash2 = ~hashes.hash2; + u.hashes.hash2 = ~u.hashes.hash2; } return u.hashes; @@ -254,7 +295,7 @@ static void inthash_realloc_pool(inthash hashtable) { hashtable->pool.buffer = realloc(hashtable->pool.buffer, hashtable->pool.capacity); if (hashtable->pool.buffer == NULL) { - inthash_log(hashtable, + inthash_crit(hashtable, "** hashtable string pool allocation error: could not allocate " "%"UINT_64_FORMAT" bytes", (uint64_t) hashtable->pool.capacity); @@ -265,7 +306,7 @@ static void inthash_realloc_pool(inthash hashtable) { #define RECOMPUTE_STRING(S) do { \ if (S != NULL) { \ const size_t offset = (S) - oldbase; \ - hashtable->items[i].name = &hashtable->pool.buffer[offset]; \ + S = &hashtable->pool.buffer[offset]; \ count++; \ } \ } while(0) @@ -280,9 +321,9 @@ static void inthash_realloc_pool(inthash hashtable) { #undef RECOMPUTE_STRING - inthash_log(hashtable, "reallocated string pool for " - "%"UINT_64_FORMAT" strings: %"UINT_64_FORMAT" bytes", - (uint64_t) count, (uint64_t) hashtable->pool.capacity); + inthash_debug(hashtable, "reallocated string pool for " + "%"UINT_64_FORMAT" strings: %"UINT_64_FORMAT" bytes", + (uint64_t) count, (uint64_t) hashtable->pool.capacity); } /* compact string pool ; does not change the capacity */ @@ -301,7 +342,7 @@ static void inthash_compact_pool(inthash hashtable) { hashtable->pool.size = 0; hashtable->pool.used = 0; if (hashtable->pool.buffer == NULL) { - inthash_log(hashtable, + inthash_debug(hashtable, "** hashtable string pool compaction error: could not allocate " "%"UINT_64_FORMAT" bytes", (uint64_t) hashtable->pool.capacity); @@ -309,17 +350,17 @@ static void inthash_compact_pool(inthash hashtable) { } /* relocate a string on a different pool */ -#define RELOCATE_STRING(S) do { \ - if (S != NULL) { \ - const size_t len = strlen(S) + 1; \ +#define RELOCATE_STRING(S) do { \ + if (S != NULL) { \ + const size_t len = strlen(S) + 1; \ inthash_assert(hashtable->pool.size + len \ <= hashtable->pool.capacity); \ memcpy(&hashtable->pool.buffer[hashtable->pool.size], \ - S, len); \ + S, len); \ S = &hashtable->pool.buffer[hashtable->pool.size]; \ hashtable->pool.size += len; \ - count++; \ - } \ + count++; \ + } \ } while(0) /* relocate */ @@ -338,11 +379,11 @@ static void inthash_compact_pool(inthash hashtable) { /* wipe previous pool */ free(old_pool); - inthash_log(hashtable, - "compacted string pool for %"UINT_64_FORMAT" strings: " - "%"UINT_64_FORMAT" bytes => %"UINT_64_FORMAT" bytes", - (uint64_t) count, (uint64_t) old_size, - (uint64_t) hashtable->pool.size); + inthash_debug(hashtable, + "compacted string pool for %"UINT_64_FORMAT" strings: " + "%"UINT_64_FORMAT" bytes => %"UINT_64_FORMAT" bytes", + (uint64_t) count, (uint64_t) old_size, + (uint64_t) hashtable->pool.size); } static char* inthash_dup_name(inthash hashtable, const char *name) { @@ -507,9 +548,9 @@ static int inthash_write_value_(inthash hashtable, const char *name, /* prepare cuckoo ; let's take position 1 */ else { cuckoo_hash = initial_cuckoo_hash = hashes.hash1; - inthash_log(hashtable, - "debug:collision with '%s' at %"UINT_64_FORMAT" (%x)", - item.name, (uint64_t) pos, cuckoo_hash); + inthash_trace(hashtable, + "debug:collision with '%s' at %"UINT_64_FORMAT" (%x)", + item.name, (uint64_t) pos, cuckoo_hash); } } @@ -517,13 +558,13 @@ static int inthash_write_value_(inthash hashtable, const char *name, for(loops = POW2(hashtable->lg_size) ; loops != 0 ; --loops) { const size_t pos = inthash_hash_to_pos(hashtable, cuckoo_hash); - inthash_log(hashtable, - "\tdebug:placing cuckoo '%s' at %"UINT_64_FORMAT" (%x)", - item.name, (uint64_t) pos, cuckoo_hash); + inthash_trace(hashtable, + "\tdebug:placing cuckoo '%s' at %"UINT_64_FORMAT" (%x)", + item.name, (uint64_t) pos, cuckoo_hash); /* place at alternate free position ? */ if (inthash_is_free(hashtable, pos)) { - inthash_log(hashtable, "debug:free position"); + inthash_trace(hashtable, "debug:free position"); hashtable->items[pos] = item; return 1; /* added */ } @@ -542,13 +583,13 @@ static int inthash_write_value_(inthash hashtable, const char *name, /* we just kicked this item from its position 1 */ if (pos == inthash_hash_to_pos(hashtable, item.hashes.hash1)) { /* then place it on position 2 on next run */ - inthash_log(hashtable, "\tdebug:position 1"); + inthash_trace(hashtable, "\tdebug:position 1"); cuckoo_hash = item.hashes.hash2; } /* we just kicked this item from its position 2 */ else if (pos == inthash_hash_to_pos(hashtable, item.hashes.hash2)) { /* then place it on position 1 on next run */ - inthash_log(hashtable, "\tdebug:position 2"); + inthash_trace(hashtable, "\tdebug:position 2"); cuckoo_hash = item.hashes.hash1; } else { @@ -573,9 +614,8 @@ static int inthash_write_value_(inthash hashtable, const char *name, if (hashtable->stash.size > hashtable->stats.max_stash_size) { hashtable->stats.max_stash_size = hashtable->stash.size; } - inthash_log(hashtable, "used stash because of collision (%d entries)", - (int) hashtable->stash.size); - inthash_log(hashtable, "debug:stash position"); + inthash_debug(hashtable, "used stash because of collision (%d entries)", + (int) hashtable->stash.size); return 1; /* added */ } else { /* we are doomed. hopefully the probability is lower than being killed @@ -602,7 +642,6 @@ int inthash_write_value(inthash hashtable, const char *name, /* table is more than half-full */ if (hashtable->used >= half_size) { size_t i; - char *data; /* size before */ const size_t prev_power = hashtable->lg_size; @@ -620,7 +659,7 @@ int inthash_write_value(inthash hashtable, const char *name, hashtable->items = (inthash_item *) realloc(hashtable->items, alloc_size); if (hashtable->items == NULL) { - inthash_log(hashtable, + inthash_crit(hashtable, "** hashtable allocation error: " "could not allocate %"UINT_64_FORMAT" bytes", (uint64_t) alloc_size); @@ -628,8 +667,7 @@ int inthash_write_value(inthash hashtable, const char *name, } /* clear upper half */ - data = (char*) hashtable->items; - memset(data + prev_alloc_size, 0, prev_alloc_size); + memset(&hashtable->items[prev_size], 0, prev_alloc_size); /* relocate lower half items when needed */ for(i = 0 ; i < prev_size ; i++) { @@ -661,9 +699,9 @@ int inthash_write_value(inthash hashtable, const char *name, } } - inthash_log(hashtable, - "expanded hashtable to %"UINT_64_FORMAT" elements", - (uint64_t) POW2(hashtable->lg_size)); + inthash_debug(hashtable, + "expanded hashtable to %"UINT_64_FORMAT" elements", + (uint64_t) POW2(hashtable->lg_size)); /* attempt to merge the stash if present */ if (hashtable->stash.size != 0) { @@ -692,12 +730,12 @@ int inthash_write_value(inthash hashtable, const char *name, /* logging */ inthash_assert(hashtable->stash.size <= old_size); if (hashtable->stash.size < old_size) { - inthash_log(hashtable, "reduced stash size from %"UINT_64_FORMAT" " - "to %"UINT_64_FORMAT, - (uint64_t) old_size, (uint64_t) hashtable->stash.size); + inthash_debug(hashtable, "reduced stash size from %"UINT_64_FORMAT" " + "to %"UINT_64_FORMAT, + (uint64_t) old_size, (uint64_t) hashtable->stash.size); } else { - inthash_log(hashtable, "stash has still %"UINT_64_FORMAT" elements", - (uint64_t) hashtable->stash.size); + inthash_trace(hashtable, "stash has still %"UINT_64_FORMAT" elements", + (uint64_t) hashtable->stash.size); } } @@ -824,7 +862,7 @@ static int inthash_remove_(inthash hashtable, const char *name, } hashtable->stash.size--; *removed = (size_t) -1; - inthash_log(hashtable, "debug:deleted item in stash (%d entries)", + inthash_debug(hashtable, "debug:deleted item in stash (%d entries)", (int) hashtable->stash.size); return 1; } @@ -863,7 +901,7 @@ int inthash_remove(inthash hashtable, const char *name) { hashtable->stash.items[i] = hashtable->stash.items[i + 1]; } hashtable->stash.size--; - inthash_log(hashtable, "debug:moved item from stash (%d entries)", + inthash_debug(hashtable, "debug:moved item from stash (%d entries)", (int) hashtable->stash.size); break; } @@ -945,28 +983,28 @@ size_t inthash_memory_size(inthash hashtable) { } static void inthash_log_stats(inthash hashtable) { - inthash_log(hashtable, "freeing table ; " - "writes=%"UINT_64_FORMAT" " - "(new=%"UINT_64_FORMAT") " - "moved=%"UINT_64_FORMAT " " - "stashed=%"UINT_64_FORMAT" " - "max-stash-size=%"UINT_64_FORMAT" " - "avg-moved=%g " - "rehash=%"UINT_64_FORMAT" " - "pool-compact=%"UINT_64_FORMAT" " - "pool-realloc=%"UINT_64_FORMAT" " - "memory=%"UINT_64_FORMAT, - (uint64_t) hashtable->stats.write_count, - (uint64_t) hashtable->stats.add_count, - (uint64_t) hashtable->stats.cuckoo_moved, - (uint64_t) hashtable->stats.stash_added, - (uint64_t) hashtable->stats.max_stash_size, - (double) hashtable->stats.cuckoo_moved / (double) hashtable->stats.add_count, - (uint64_t) hashtable->stats.rehash_count, - (uint64_t) hashtable->stats.pool_compact_count, - (uint64_t) hashtable->stats.pool_realloc_count, - (uint64_t) inthash_memory_size(hashtable) - ); + inthash_info(hashtable, "freeing table ; " + "writes=%"UINT_64_FORMAT" " + "(new=%"UINT_64_FORMAT") " + "moved=%"UINT_64_FORMAT " " + "stashed=%"UINT_64_FORMAT" " + "max-stash-size=%"UINT_64_FORMAT" " + "avg-moved=%g " + "rehash=%"UINT_64_FORMAT" " + "pool-compact=%"UINT_64_FORMAT" " + "pool-realloc=%"UINT_64_FORMAT" " + "memory=%"UINT_64_FORMAT, + (uint64_t) hashtable->stats.write_count, + (uint64_t) hashtable->stats.add_count, + (uint64_t) hashtable->stats.cuckoo_moved, + (uint64_t) hashtable->stats.stash_added, + (uint64_t) hashtable->stats.max_stash_size, + (double) hashtable->stats.cuckoo_moved / (double) hashtable->stats.add_count, + (uint64_t) hashtable->stats.rehash_count, + (uint64_t) hashtable->stats.pool_compact_count, + (uint64_t) hashtable->stats.pool_realloc_count, + (uint64_t) inthash_memory_size(hashtable) + ); } void inthash_delete(inthash *phashtable) { diff --git a/src/htsinthash.h b/src/htsinthash.h index 636b4b3..732fbb4 100644 --- a/src/htsinthash.h +++ b/src/htsinthash.h @@ -40,7 +40,8 @@ Please visit our Website: http://www.httrack.com * * Implementation notes: * Implementation is auto-rehashable, and uses cuckoo hashing of size 2**n - * with a FNV hash function, with one additional auxiliary hash function. + * with a MD5 or FNV-1 hash function, with one additional auxiliary hash + * function. * It also uses a small stash area to handle rare cases of collisions. * Enumeration of all key/values is possible, deletion is also possible, but * currently without any auto-shrinking (ie. table will never shrink). @@ -51,6 +52,7 @@ Please visit our Website: http://www.httrack.com * Cuckoo Hashing http://en.wikipedia.org/wiki/Cuckoo_hashing * Cuckoo Stash http://research.microsoft.com/pubs/73856/stash-full.9-30.pdf * FNV http://www.isthe.com/chongo/tech/comp/fnv/ + * MD5 http://en.wikipedia.org/wiki/MD5 **/ #ifndef HTSINTHASH_DEFH |