diff options
-rw-r--r-- | src/htsinthash.c | 318 | ||||
-rw-r--r-- | src/htsinthash.h | 18 |
2 files changed, 226 insertions, 110 deletions
diff --git a/src/htsinthash.c b/src/htsinthash.c index f04c680..e8a72d3 100644 --- a/src/htsinthash.c +++ b/src/htsinthash.c @@ -39,6 +39,7 @@ Please visit our Website: http://www.httrack.com #include <stdlib.h> #include <string.h> #include <assert.h> +#include <stdarg.h> #include "htsinthash.h" @@ -59,13 +60,17 @@ struct struct_inthash { #ifdef LIBHTTRACK_EXPORTS #include "htsbase.h" -#undef assert -#define assert assertf +#define inthash_assert assertf +#else +static void inthash_fail(const char* exp, const char* file, int line) { + fprintf(stderr, "assertion '%s' failed at %s:%d\n", exp, file, line); + abort(); +} +#define inthash_assert(EXP) (void)( (EXP) || (inthash_fail(#EXP, __FILE__, __LINE__), 0) ) +#define HTS_PRINTF_FUN(FMT, ARGS) +#define HTS_INLINE #endif -/* Key (32-bit) */ -typedef unsigned int inthash_key; - /* Numerical Recipes function. */ #define HASH_PRIME ( 1664525 ) #define HASH_CONST ( 1013904223 ) @@ -78,11 +83,17 @@ typedef unsigned int inthash_key; /* 2**X */ #define POW2(X) ( (size_t) 1 << (X) ) -/* pair of hashes */ -typedef struct inthash_keys { - inthash_key hash1; - inthash_key hash2; -} inthash_keys; +static void inthash_log(inthash hashtable, + const char *format, ...) + HTS_PRINTF_FUN(2, 3) { + va_list args; + inthash_assert(format != NULL); + va_start(args, format); + fprintf(stderr, "[%p] ", (void*) hashtable); + (void) vfprintf(stderr, format, args); + putc('\n', stderr); + va_end(args); +} static inthash_keys inthash_calc_hashes(const char *value) { /* compute two lcg hashes using different seeds */ @@ -101,14 +112,24 @@ static inthash_keys inthash_calc_hashes(const char *value) { return hash; } -static int inthash_is_free(const inthash hashtable, size_t pos) { +static HTS_INLINE int inthash_is_free(const inthash hashtable, size_t pos) { return hashtable->items[pos].name == NULL; } -static int inthash_matches(const inthash hashtable, - size_t pos, const char *name) { - return hashtable->items[pos].name != NULL - && strcmp(hashtable->items[pos].name, name) == 0; +static HTS_INLINE int inthash_matches_(const inthash_item *const item, + const char *name, + const inthash_keys *hashes) { + return item->name != NULL + && item->hashes.hash1 == hashes->hash1 + && item->hashes.hash2 == hashes->hash2 + && strcmp(item->name, name) == 0; +} + +static HTS_INLINE int inthash_matches(const inthash hashtable, size_t pos, + const char *name, + const inthash_keys *hashes) { + const inthash_item *const item = &hashtable->items[pos]; + return inthash_matches_(item, name, hashes); } /* realloc (expand) string pool ; does not change the compacity */ @@ -116,20 +137,23 @@ static void inthash_realloc_pool(inthash hashtable) { const size_t hash_size = POW2(hashtable->hash_size_power); char *const oldbase = hashtable->string_pool; size_t i; + int count = 0; + hashtable->string_pool = realloc(hashtable->string_pool, hashtable->string_pool_capacity); if (hashtable->string_pool == NULL) { fprintf(stderr, "** hashtable string pool allocation error: could not allocate %u bytes", (int) hashtable->string_pool_capacity); - assert(! "hashtable string pool allocation error"); + inthash_assert(! "hashtable string pool allocation error"); } /* recompute string address */ #define RECOMPUTE_STRING(S) do { \ if (S != NULL) { \ const size_t offset = (S) - oldbase; \ - hashtable->items[i].name = &hashtable->string_pool[offset]; \ + hashtable->items[i].name = &hashtable->string_pool[offset]; \ + count++; \ } \ } while(0) @@ -142,6 +166,9 @@ static void inthash_realloc_pool(inthash hashtable) { } #undef RECOMPUTE_STRING + + inthash_log(hashtable, "reallocated string pool for %d strings: %d bytes\n", + count, (int) hashtable->string_pool_capacity); } /* compact string pool ; does not change the capacity */ @@ -149,6 +176,8 @@ static void inthash_compact_pool(inthash hashtable) { const size_t hash_size = POW2(hashtable->hash_size_power); size_t i; char* old_pool = hashtable->string_pool; + const size_t old_size = hashtable->string_pool_size; + int count = 0; hashtable->string_pool = malloc(hashtable->string_pool_capacity); hashtable->string_pool_size = 0; @@ -157,19 +186,20 @@ static void inthash_compact_pool(inthash hashtable) { fprintf(stderr, "** hashtable string pool compaction error: could not allocate %u bytes", (int) hashtable->string_pool_capacity); - assert(! "hashtable string pool compaction error"); + inthash_assert(! "hashtable string pool compaction error"); } /* relocate a string on a different pool */ #define RELOCATE_STRING(S) do { \ if (S != NULL) { \ const size_t len = strlen(S) + 1; \ - assert(hashtable->string_pool_size + len \ + inthash_assert(hashtable->string_pool_size + len \ <= hashtable->string_pool_capacity); \ memcpy(&hashtable->string_pool[hashtable->string_pool_size], \ S, len); \ S = &hashtable->string_pool[hashtable->string_pool_size]; \ hashtable->string_pool_size += len; \ + count++; \ } \ } while(0) @@ -188,6 +218,10 @@ static void inthash_compact_pool(inthash hashtable) { /* wipe previous pool */ free(old_pool); + + inthash_log(hashtable, + "compacted string pool for %d strings: %d bytes => %d bytes", + count, (int) old_size, (int) hashtable->string_pool_size); } static char* inthash_dup_name(inthash hashtable, const char *name) { @@ -211,6 +245,7 @@ static char* inthash_dup_name(inthash hashtable, const char *name) { return s; } +/* note: pointer must have been kicked from the pool first */ static void inthash_free_name(inthash hashtable, char *name) { const size_t len = strlen(name) + 1; hashtable->string_pool_chars -= len; @@ -219,13 +254,15 @@ static void inthash_free_name(inthash hashtable, char *name) { } } -static size_t inthash_hash_to_pos_(size_t hash_size_power, inthash_key hash) { +static HTS_INLINE size_t inthash_hash_to_pos_(size_t hash_size_power, + inthash_key hash) { const inthash_key mask = POW2(hash_size_power) - 1; /* TODO: investigate if lower bits of LCG are sane */ return hash & mask; } -static size_t inthash_hash_to_pos(const inthash hashtable, inthash_key hash) { +static HTS_INLINE size_t inthash_hash_to_pos(const inthash hashtable, + inthash_key hash) { return inthash_hash_to_pos_(hashtable->hash_size_power, hash); } @@ -278,33 +315,31 @@ static void inthash_del_value(inthash hashtable, size_t pos) { inthash_del_value_(hashtable, &hashtable->items[pos].value); } -static void inthash_del_name_(inthash hashtable, char **pname) { - char *const name = *pname; - *pname = NULL; +static void inthash_del_name(inthash hashtable, inthash_item *item) { + const inthash_keys nullHash = INTHASH_KEYS_NULL; + char *const name = item->name; + item->name = NULL; /* there must be no reference remaining */ + item->hashes = nullHash; /* free after detach (we may compact the pool) */ inthash_free_name(hashtable, name); } -static void inthash_del_name(inthash hashtable, size_t pos) { - inthash_del_name_(hashtable, &hashtable->items[pos].name); -} - static void inthash_del_item(inthash hashtable, inthash_item *pitem) { inthash_del_value_(hashtable, &pitem->value); - inthash_del_name_(hashtable, &pitem->name); + inthash_del_name(hashtable, pitem); } static int inthash_write_value_(inthash hashtable, const char *name, inthash_value value) { inthash_item item; size_t pos; - inthash_keys hashes = inthash_calc_hashes(name); + const inthash_keys hashes = inthash_calc_hashes(name); inthash_key cuckoo_hash, initial_cuckoo_hash; size_t loops; /* replace at position 1 ? */ pos = inthash_hash_to_pos(hashtable, hashes.hash1); - if (inthash_matches(hashtable, pos, name)) { + if (inthash_matches(hashtable, pos, name, &hashes)) { inthash_del_value(hashtable, pos); hashtable->items[pos].value = value; return 0; /* replaced */ @@ -312,7 +347,7 @@ static int inthash_write_value_(inthash hashtable, const char *name, /* replace at position 2 ? */ pos = inthash_hash_to_pos(hashtable, hashes.hash2); - if (inthash_matches(hashtable, pos, name)) { + if (inthash_matches(hashtable, pos, name, &hashes)) { inthash_del_value(hashtable, pos); hashtable->items[pos].value = value; return 0; /* replaced */ @@ -322,7 +357,7 @@ static int inthash_write_value_(inthash hashtable, const char *name, if (hashtable->stash_size != 0) { size_t i; for(i = 0 ; i < hashtable->stash_size ; i++) { - if (strcmp(hashtable->stash[i].name, name) == 0) { + if (inthash_matches_(&hashtable->stash[i], name, &hashes)) { inthash_del_value_(hashtable, &hashtable->stash[i].value); hashtable->stash[i].value = value; return 0; /* replaced */ @@ -333,6 +368,7 @@ static int inthash_write_value_(inthash hashtable, const char *name, /* otherwise we need to create a new item */ item.name = inthash_dup_name(hashtable, name); item.value = value; + item.hashes = hashes; /* place at free position 1 ? */ pos = inthash_hash_to_pos(hashtable, hashes.hash1); @@ -365,27 +401,29 @@ static int inthash_write_value_(inthash hashtable, const char *name, else { const inthash_item backup_item = hashtable->items[pos]; hashtable->items[pos] = item; + /* take care of new lost item */ item = backup_item; - hashes = inthash_calc_hashes(item.name); + /* we just kicked this item from its position 1 */ - if (pos == inthash_hash_to_pos(hashtable, hashes.hash1)) { + if (pos == inthash_hash_to_pos(hashtable, item.hashes.hash1)) { /* then place it on position 2 on next run */ - cuckoo_hash = hashes.hash2; + cuckoo_hash = item.hashes.hash2; } /* we just kicked this item from its position 2 */ - else if (pos == inthash_hash_to_pos(hashtable, hashes.hash2)) { + else if (pos == inthash_hash_to_pos(hashtable, item.hashes.hash2)) { /* then place it on position 1 on next run */ - cuckoo_hash = hashes.hash1; - - /* we are looping */ - if (cuckoo_hash == initial_cuckoo_hash) { - /* emergency stash */ - break; - } + cuckoo_hash = item.hashes.hash1; } else { - assert(! "hashtable internal error: unexpected position"); + inthash_assert(! "hashtable internal error: unexpected position"); + } + + /* we are looping (back to same hash) */ + /* TODO FIXME: we should actually check the positions */ + if (cuckoo_hash == initial_cuckoo_hash) { + /* emergency stash */ + break; } } } @@ -394,11 +432,13 @@ static int inthash_write_value_(inthash hashtable, const char *name, if (hashtable->stash_size < STASH_SIZE) { hashtable->stash[hashtable->stash_size] = item; hashtable->stash_size++; + inthash_log(hashtable, "used stash because of collision (%d entries)", + (int) hashtable->stash_size++); return 1; /* added */ } else { /* we are doomed. hopefully the probability is lower than being killed by a wandering radioactive monkey */ - assert(! "hashtable internal error: cuckoo/stash collision"); + inthash_assert(! "hashtable internal error: cuckoo/stash collision"); /* not reachable code */ return -1; } @@ -408,6 +448,8 @@ int inthash_write_value(inthash hashtable, const char *name, inthash_value value) { /* replace of add item */ const int ret = inthash_write_value_(hashtable, name, value); + + /* added ? */ if (ret) { /* size of half of the table */ const size_t half_size = POW2(hashtable->hash_size_power - 1); @@ -426,7 +468,6 @@ int inthash_write_value(inthash hashtable, const char *name, const size_t prev_alloc_size = prev_size*sizeof(inthash_item); /* size after doubling it */ - const size_t new_size = prev_size * 2; const size_t alloc_size = prev_alloc_size * 2; /* realloc */ @@ -437,7 +478,7 @@ int inthash_write_value(inthash hashtable, const char *name, fprintf(stderr, "** hashtable allocation error: could not allocate %u bytes", (int) alloc_size); - assert(! "hashtable allocation error"); + inthash_assert(! "hashtable allocation error"); } /* clear upper half */ @@ -447,14 +488,16 @@ int inthash_write_value(inthash hashtable, const char *name, /* relocate lower half items when needed */ for(i = 0 ; i < prev_size ; i++) { if (!inthash_is_free(hashtable, i)) { - const inthash_keys hashes = inthash_calc_hashes(hashtable->items[i].name); + const inthash_keys hashes = hashtable->items[i].hashes; size_t pos; + /* currently at old position 1 */ pos = inthash_hash_to_pos_(prev_power, hashes.hash1); if (pos == i) { const size_t pos = inthash_hash_to_pos(hashtable, hashes.hash1); /* no more the expected position */ if (pos != i) { + inthash_assert(pos >= prev_size); hashtable->items[pos] = hashtable->items[i]; memset(&hashtable->items[i], 0, sizeof(hashtable->items[i])); } @@ -463,30 +506,52 @@ int inthash_write_value(inthash hashtable, const char *name, const size_t pos = inthash_hash_to_pos(hashtable, hashes.hash2); /* no more the expected position */ if (pos != i) { + inthash_assert(pos >= prev_size); hashtable->items[pos] = hashtable->items[i]; memset(&hashtable->items[i], 0, sizeof(hashtable->items[i])); } } else { - assert(! "hashtable unexpected internal error"); + inthash_assert(! "hashtable unexpected internal error"); } } } + inthash_log(hashtable, "expanded hashtable to %d elements", + (int) POW2(hashtable->hash_size_power)); + /* attempt to merge the stash if present */ if (hashtable->stash_size != 0) { + const size_t old_size = hashtable->stash_size; size_t i; + /* backup stash and reset it */ inthash_item stash[STASH_SIZE]; memcpy(&stash, hashtable->stash, sizeof(hashtable->stash)); hashtable->stash_size = 0; + /* insert all items */ for(i = 0 ; i < hashtable->stash_size ; i++) { - const int ret = inthash_write_value_(hashtable, stash[i].name, stash[i].value); + const int ret = inthash_write_value_(hashtable, stash[i].name, + stash[i].value); if (ret == 0) { - assert(! "hashtable duplicate key when merging the stash"); + inthash_assert(! "hashtable duplicate key when merging the stash"); } - inthash_free_name(hashtable, stash[i].name); + } + + /* delete old names */ + for(i = 0 ; i < hashtable->stash_size ; i++) { + inthash_del_name(hashtable, &stash[i]); + } + + /* logging */ + inthash_assert(hashtable->stash_size <= old_size); + if (hashtable->stash_size < old_size) { + inthash_log(hashtable, "reduced stash size from %d to %d", + (int) old_size, (int) hashtable->stash_size); + } else { + inthash_log(hashtable, "stash has still %d elements", + (int) hashtable->stash_size); } } @@ -496,23 +561,6 @@ int inthash_write_value(inthash hashtable, const char *name, return ret; } -// Increment pos value, create one if necessary (=0) -// (==1 : created) -int inthash_inc(inthash hashtable, const char *name) { - intptr_t value = 0; - int r = 0; - - if (inthash_read(hashtable, name, &value)) { - value++; - } else { /* create new value */ - value = 0; - r = 1; - } - inthash_write(hashtable, name, value); - return (r); -} - -// Does not check for duplicate entry void inthash_add(inthash hashtable, const char *name, intptr_t intvalue) { inthash_value value = INTHASH_VALUE_NULL; @@ -530,62 +578,92 @@ int inthash_read(inthash hashtable, const char *name, intptr_t * intvalue) { return ret; } -int inthash_read_value(inthash hashtable, const char *name, - inthash_value * value) { +static inthash_value* inthash_read_value_(inthash hashtable, + const char *name) { const inthash_keys hashes = inthash_calc_hashes(name); size_t pos; /* found at position 1 ? */ pos = inthash_hash_to_pos(hashtable, hashes.hash1); - if (inthash_matches(hashtable, pos, name)) { - if (value != NULL) - *value = hashtable->items[pos].value; - return 1; + if (inthash_matches(hashtable, pos, name, &hashes)) { + return &hashtable->items[pos].value; } /* found at position 2 ? */ pos = inthash_hash_to_pos(hashtable, hashes.hash2); - if (inthash_matches(hashtable, pos, name)) { - if (value != NULL) - *value = hashtable->items[pos].value; - return 1; + if (inthash_matches(hashtable, pos, name, &hashes)) { + return &hashtable->items[pos].value; } /* find in stash ? */ if (hashtable->stash_size != 0) { size_t i; for(i = 0 ; i < hashtable->stash_size ; i++) { - if (strcmp(hashtable->stash[i].name, name) == 0) { - if (value != NULL) - *value = hashtable->stash[i].value; - return 1; + if (inthash_matches_(&hashtable->stash[i], name, &hashes)) { + return &hashtable->stash[i].value; } } } /* not found */ + return NULL; +} + +int inthash_read_value(inthash hashtable, const char *name, + inthash_value * pvalue) { + inthash_value* const value = inthash_read_value_(hashtable, name); + if (value != NULL) { + if (pvalue != NULL) { + *pvalue = *value; + } + return 1; + } return 0; } +static size_t inthash_inc_(inthash hashtable, const char *name, + size_t inc) { + inthash_value* const value = inthash_read_value_(hashtable, name); + if (value != NULL) { + value->intg += inc; + return value->intg; + } else { + /* create a new value */ + const int ret = inthash_write(hashtable, name, inc); + inthash_assert(ret); + return inc; + } +} + +int inthash_inc(inthash hashtable, const char *name) { + return (int) inthash_inc_(hashtable, name, 1); +} + +int inthash_dec(inthash hashtable, const char *name) { + return (int) inthash_inc_(hashtable, name, (size_t) -1); +} + int inthash_exists(inthash hashtable, const char *name) { return inthash_read_value(hashtable, name, NULL); } -static int inthash_remove_(inthash hashtable, const char *name) { - const inthash_keys hashes = inthash_calc_hashes(name); +static int inthash_remove_(inthash hashtable, const char *name, + const inthash_keys *hashes, size_t *removed) { size_t pos; /* found at position 1 ? */ - pos = inthash_hash_to_pos(hashtable, hashes.hash1); - if (inthash_matches(hashtable, pos, name)) { + pos = inthash_hash_to_pos(hashtable, hashes->hash1); + if (inthash_matches(hashtable, pos, name, hashes)) { inthash_del_item(hashtable, &hashtable->items[pos]); + *removed = pos; return 1; } /* found at position 2 ? */ - pos = inthash_hash_to_pos(hashtable, hashes.hash2); - if (inthash_matches(hashtable, pos, name)) { + pos = inthash_hash_to_pos(hashtable, hashes->hash2); + if (inthash_matches(hashtable, pos, name, hashes)) { inthash_del_item(hashtable, &hashtable->items[pos]); + *removed = pos; return 1; } @@ -593,12 +671,13 @@ static int inthash_remove_(inthash hashtable, const char *name) { if (hashtable->stash_size != 0) { size_t i; for(i = 0 ; i < hashtable->stash_size ; i++) { - if (strcmp(hashtable->stash[i].name, name) == 0) { + if (inthash_matches_(&hashtable->stash[i], name, hashes)) { inthash_del_item(hashtable, &hashtable->stash[i]); for( ; i + 1 < hashtable->stash_size ; i++) { hashtable->stash[i] = hashtable->stash[i + 1]; } hashtable->stash_size--; + *removed = (size_t) -1; return 1; } } @@ -609,7 +688,39 @@ static int inthash_remove_(inthash hashtable, const char *name) { } int inthash_remove(inthash hashtable, const char *name) { - return inthash_remove_(hashtable, name); + const inthash_keys hashes = inthash_calc_hashes(name); + size_t removed; + const int ret = inthash_remove_(hashtable, name, &hashes, &removed); + + if (ret) { + /* item was removed: decrease count */ + inthash_assert(hashtable->nitems != 0); + hashtable->nitems--; + + /* can we place stash entry back to the table ? */ + if (hashtable->stash_size != 0 && removed != (size_t) -1) { + size_t i; + for(i = 0 ; i < hashtable->stash_size ; i++) { + const size_t pos1 = + inthash_hash_to_pos(hashtable, hashtable->stash[i].hashes.hash1); + const size_t pos2 = + inthash_hash_to_pos(hashtable, hashtable->stash[i].hashes.hash2); + if (pos1 == removed || pos2 == removed) { + if (pos1 == removed) { + hashtable->items[pos1] = hashtable->stash[i]; + } else if (pos2 == removed) { + hashtable->items[pos2] = hashtable->stash[i]; + } + for( ; i + 1 < hashtable->stash_size ; i++) { + hashtable->stash[i] = hashtable->stash[i + 1]; + } + hashtable->stash_size--; + } + } + } + } + + return ret; } int inthash_readptr(inthash hashtable, const char *name, intptr_t * value) { @@ -669,31 +780,18 @@ void inthash_delete(inthash *phashtable) { if (hashtable->items != NULL) { /* we need to delete values */ const size_t hash_size = POW2(hashtable->hash_size_power); - const t_inthash_freehandler free_handler = - hashtable->flag_valueismalloc ? - ( hashtable->free_handler != NULL - ? hashtable->free_handler : inthash_default_free_handler ) - : NULL; size_t i; + /* wipe hashtable */ for(i = 0 ; i < hash_size ; i++) { if (!inthash_is_free(hashtable, i)) { - if (free_handler != NULL) { - free_handler(hashtable->items[i].value.ptr); - } - hashtable->items[i].value.ptr = NULL; - inthash_free_name(hashtable, hashtable->items[i].name); - hashtable->items[i].name = NULL; + inthash_del_item(hashtable, &hashtable->items[i]); } } + /* wipe auxiliary stash if any */ for(i = 0 ; i < hashtable->stash_size ; i++) { - if (free_handler != NULL) { - free_handler(hashtable->stash[i].value.ptr); - } - hashtable->stash[i].value.ptr = NULL; - inthash_free_name(hashtable, hashtable->stash[i].name); - hashtable->stash[i].name = NULL; + inthash_del_item(hashtable, &hashtable->stash[i]); } } /* wipe top-level */ @@ -706,7 +804,7 @@ void inthash_delete(inthash *phashtable) { } } -// Enumerators +/* Enumerator */ struct_inthash_enum inthash_enum_new(inthash hashtable) { struct_inthash_enum e; diff --git a/src/htsinthash.h b/src/htsinthash.h index 610ee34..1178e46 100644 --- a/src/htsinthash.h +++ b/src/htsinthash.h @@ -70,6 +70,7 @@ Please visit our Website: http://www.httrack.com typedef union inthash_value { /** Integer value. **/ uintptr_t intg; + /** Pointer value. **/ void *ptr; } inthash_value; @@ -82,12 +83,28 @@ typedef union inthash_value { typedef struct inthash_item inthash_item; #endif +/* Hash key (32-bit) */ +typedef unsigned int inthash_key; + +/* Pair of hashes */ +typedef struct inthash_keys { + inthash_key hash1; + inthash_key hash2; +} inthash_keys; + +/** NULL pair of hashes. **/ +#define INTHASH_KEYS_NULL { 0, 0 } + /** Item holding a value. **/ struct inthash_item { /** Key. **/ char *name; + /** Value. **/ inthash_value value; + + /** Hashes of the key. **/ + inthash_keys hashes; }; /* Alias for legacy code */ @@ -146,6 +163,7 @@ void inthash_add_pvoid(inthash hashtable, const char *name, void *value); void inthash_add(inthash hashtable, const char *name, intptr_t value); /* Add entry in the hash table */ int inthash_write(inthash hashtable, const char *name, intptr_t value); /* Overwrite/add entry in the hash table */ int inthash_inc(inthash hashtable, const char *name); /* Increment entry in the hash table */ +int inthash_dec(inthash hashtable, const char *name); /* Decrement entry in the hash table */ int inthash_remove(inthash hashtable, const char *name); /* Remove an entry from the hashtable */ /* */ |