summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorXavier Roche <xroche@users.noreply.github.com>2013-06-24 20:06:12 +0000
committerXavier Roche <xroche@users.noreply.github.com>2013-06-24 20:06:12 +0000
commit9090ee17b364f44a8df75f927b26c6e352ef9fff (patch)
tree1bcaabe45742f7e3492ed0c7f3e205cd3e7ed849 /src
parent9e61bc36f3a3d15375b11321893dd27cd7d4b88b (diff)
Hashtable fixes
MD5 is temporarily back until some better hash function is found (current LCG is too lame and prone to collisions)
Diffstat (limited to 'src')
-rw-r--r--src/htshash.c76
-rw-r--r--src/htshash.h11
-rw-r--r--src/htsinthash.c89
3 files changed, 112 insertions, 64 deletions
diff --git a/src/htshash.c b/src/htshash.c
index 475f3fc..2d12941 100644
--- a/src/htshash.c
+++ b/src/htshash.c
@@ -67,59 +67,65 @@ void hash_init(hash_struct * hash) {
hash->former_adrfil = inthash_new(0);
}
-// retour: position ou -1 si non trouvé
-int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
- int type, int normalized) {
- char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
- char catbuff[CATBUFF_SIZE];
- const char *normfil;
- intptr_t intvalue;
- char *name;
-
+static char * normalize_key(const char *nom1, const char *nom2,
+ hash_struct_type type, int normalized,
+ char *normfil_, char *catbuff) {
/* dispatch type */
+ const char *normfil;
switch(type) {
- case 0:
+ case HASH_STRUCT_FILENAME:
/* first entry: destination filename (lowercased) */
assertf(nom2 == NULL || *nom2 == '\0');
- name = convtolower(catbuff, nom1);
+ return convtolower(catbuff, nom1);
break;
- case 1:
- case 2:
+ case HASH_STRUCT_ADR_PATH:
+ case HASH_STRUCT_ORIGINAL_ADR_PATH:
/* second and third entries: URL address and path */
if (!normalized)
- normfil = nom1;
+ normfil = nom2;
else
- normfil = fil_normalized(nom1, normfil_);
+ normfil = fil_normalized(nom2, normfil_);
if (!normalized) {
- strcpybuff(catbuff, jump_identification(nom2));
+ strcpybuff(catbuff, jump_identification(nom1));
} else {
- strcpybuff(catbuff, jump_normalized(nom2));
+ strcpybuff(catbuff, jump_normalized(nom1));
}
strcatbuff(catbuff, normfil);
- name = catbuff;
+ return catbuff;
break;
default:
assertf(! "unexpected case");
+ return NULL;
break;
}
+}
+
+// retour: position ou -1 si non trouvé
+int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
+ hash_struct_type type, int normalized) {
+ char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
+ char catbuff[CATBUFF_SIZE];
+ intptr_t intvalue;
+ char *const name = normalize_key(nom1, nom2, type, normalized,
+ normfil_, catbuff);
/* read */
switch(type) {
- case 0:
+ case HASH_STRUCT_FILENAME:
if (inthash_read(hash->sav, name, &intvalue)) {
return (int) intvalue;
} else {
return -1;
}
break;
- case 1:
+ case HASH_STRUCT_ADR_PATH:
if (inthash_read(hash->adrfil, name, &intvalue)) {
return (int) intvalue;
} else {
return -1;
}
break;
- case 2:
+ case HASH_STRUCT_ORIGINAL_ADR_PATH:
if (inthash_read(hash->former_adrfil, name, &intvalue)) {
return (int) intvalue;
} else {
@@ -141,34 +147,20 @@ void hash_write(hash_struct * hash, int lpos, int normalized) {
const char *name;
/* first entry: destination filename (lowercased) */
- name = convtolower(catbuff, hash->liens[lpos]->sav);
+ name = normalize_key(hash->liens[lpos]->sav, NULL, HASH_STRUCT_FILENAME,
+ normalized, normfil_, catbuff);
inthash_write(hash->sav, name, lpos);
/* second entry: URL address and path */
- if (!normalized)
- normfil = hash->liens[lpos]->fil;
- else
- normfil = fil_normalized(hash->liens[lpos]->fil, normfil_);
- if (!normalized) {
- strcpybuff(catbuff, jump_identification(hash->liens[lpos]->adr));
- } else {
- strcpybuff(catbuff, jump_normalized(hash->liens[lpos]->adr));
- }
- strcatbuff(catbuff, normfil);
+ name = normalize_key(hash->liens[lpos]->adr, hash->liens[lpos]->fil,
+ HASH_STRUCT_ADR_PATH, normalized, normfil_, catbuff);
inthash_write(hash->adrfil, name, lpos);
/* third entry: URL address and path before redirect */
if (hash->liens[lpos]->former_adr) { // former_adr existe?
- if (!normalized) {
- normfil = hash->liens[lpos]->former_fil;
- } else {
- normfil = fil_normalized(hash->liens[lpos]->former_fil, normfil_);
- }
- if (!normalized)
- strcpybuff(catbuff, jump_identification(hash->liens[lpos]->former_adr));
- else
- strcpybuff(catbuff, jump_normalized(hash->liens[lpos]->former_adr));
- strcatbuff(catbuff, normfil);
+ name = normalize_key(hash->liens[lpos]->former_adr,
+ hash->liens[lpos]->former_fil, HASH_STRUCT_ORIGINAL_ADR_PATH,
+ normalized, normfil_, catbuff);
inthash_write(hash->former_adrfil, name, lpos);
}
}
diff --git a/src/htshash.h b/src/htshash.h
index 2b0d9bd..62ea118 100644
--- a/src/htshash.h
+++ b/src/htshash.h
@@ -44,12 +44,19 @@ Please visit our Website: http://www.httrack.com
typedef struct hash_struct hash_struct;
#endif
+/** Type of hash. **/
+typedef enum hash_struct_type {
+ HASH_STRUCT_FILENAME = 0,
+ HASH_STRUCT_ADR_PATH,
+ HASH_STRUCT_ORIGINAL_ADR_PATH
+} hash_struct_type;
+
// tables de hachage
void hash_init(hash_struct * hash);
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
- int type, int normalized);
+ hash_struct_type type, int normalized);
void hash_write(hash_struct * hash, int lpos, int normalized);
-int *hash_calc_chaine(hash_struct * hash, int type, int pos);
+int *hash_calc_chaine(hash_struct * hash, hash_struct_type type, int pos);
unsigned long int hash_cle(const char *nom1, const char *nom2);
#endif
diff --git a/src/htsinthash.c b/src/htsinthash.c
index e8a72d3..d82a6d7 100644
--- a/src/htsinthash.c
+++ b/src/htsinthash.c
@@ -43,6 +43,12 @@ Please visit our Website: http://www.httrack.com
#include "htsinthash.h"
+#define USES_MD5
+
+#ifdef USES_MD5
+#include "htsmd5.h"
+#endif
+
/** Hashtable. **/
struct struct_inthash {
inthash_item *items;
@@ -77,7 +83,7 @@ static void inthash_fail(const char* exp, const char* file, int line) {
#define HASH_ADD(HASH, C) do { \
(HASH) *= HASH_PRIME; \
(HASH) += HASH_CONST; \
- (HASH) += (C); \
+ (HASH) ^= ( (C) << 24); \
} while(0)
/* 2**X */
@@ -95,7 +101,31 @@ static void inthash_log(inthash hashtable,
va_end(args);
}
+#ifndef USES_MD5
+static unsigned int rev32(unsigned int v) {
+ /* swap odd and even bits */
+ v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+ /* swap consecutive pairs */
+ v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+ /* swap nibbles ... */
+ v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+ /* swap bytes */
+ v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+ /* swap 2-byte long pairs */
+ v = ( v >> 16 ) | ( v << 16);
+ return v;
+}
+#endif
+
static inthash_keys inthash_calc_hashes(const char *value) {
+#ifdef USES_MD5
+ union {
+ char md5digest[16];
+ inthash_keys hashes;
+ } u;
+ domd5mem(value, strlen(value), u.md5digest, 0);
+ return u.hashes;
+#else
/* compute two lcg hashes using different seeds */
inthash_keys hash = { 0, ~0 };
size_t i;
@@ -109,7 +139,12 @@ static inthash_keys inthash_calc_hashes(const char *value) {
if (hash.hash1 == hash.hash2) {
hash.hash2 = ~hash.hash2;
}
+ /* reverse bits: "higher-order bits have longer periods than the lower
+ order bits" (Wikipedia) */
+ hash.hash1 ^= rev32(hash.hash1);
+ hash.hash2 ^= rev32(hash.hash2);
return hash;
+#endif
}
static HTS_INLINE int inthash_is_free(const inthash hashtable, size_t pos) {
@@ -167,7 +202,7 @@ static void inthash_realloc_pool(inthash hashtable) {
#undef RECOMPUTE_STRING
- inthash_log(hashtable, "reallocated string pool for %d strings: %d bytes\n",
+ inthash_log(hashtable, "reallocated string pool for %d strings: %d bytes",
count, (int) hashtable->string_pool_capacity);
}
@@ -257,7 +292,6 @@ static void inthash_free_name(inthash hashtable, char *name) {
static HTS_INLINE size_t inthash_hash_to_pos_(size_t hash_size_power,
inthash_key hash) {
const inthash_key mask = POW2(hash_size_power) - 1;
- /* TODO: investigate if lower bits of LCG are sane */
return hash & mask;
}
@@ -385,6 +419,8 @@ static int inthash_write_value_(inthash hashtable, const char *name,
/* prepare cuckoo ; let's take position 1 */
else {
cuckoo_hash = initial_cuckoo_hash = hashes.hash1;
+ inthash_log(hashtable, "debug:collision with '%s' at %d (%x)",
+ item.name, pos, cuckoo_hash);
}
}
@@ -392,8 +428,12 @@ static int inthash_write_value_(inthash hashtable, const char *name,
for(loops = POW2(hashtable->hash_size_power) ; loops != 0 ; --loops) {
const size_t pos = inthash_hash_to_pos(hashtable, cuckoo_hash);
+ inthash_log(hashtable, "\tdebug:placing cuckoo '%s' at %d (%x)",
+ item.name, pos, cuckoo_hash);
+
/* place at alternate free position ? */
if (inthash_is_free(hashtable, pos)) {
+ inthash_log(hashtable, "debug:free position");
hashtable->items[pos] = item;
return 1; /* added */
}
@@ -408,11 +448,13 @@ static int inthash_write_value_(inthash hashtable, const char *name,
/* we just kicked this item from its position 1 */
if (pos == inthash_hash_to_pos(hashtable, item.hashes.hash1)) {
/* then place it on position 2 on next run */
+ inthash_log(hashtable, "\tdebug:position 1");
cuckoo_hash = item.hashes.hash2;
}
/* we just kicked this item from its position 2 */
else if (pos == inthash_hash_to_pos(hashtable, item.hashes.hash2)) {
/* then place it on position 1 on next run */
+ inthash_log(hashtable, "\tdebug:position 2");
cuckoo_hash = item.hashes.hash1;
}
else {
@@ -421,7 +463,7 @@ static int inthash_write_value_(inthash hashtable, const char *name,
/* we are looping (back to same hash) */
/* TODO FIXME: we should actually check the positions */
- if (cuckoo_hash == initial_cuckoo_hash) {
+ if (cuckoo_hash == initial_cuckoo_hash && 0) {
/* emergency stash */
break;
}
@@ -433,7 +475,8 @@ static int inthash_write_value_(inthash hashtable, const char *name,
hashtable->stash[hashtable->stash_size] = item;
hashtable->stash_size++;
inthash_log(hashtable, "used stash because of collision (%d entries)",
- (int) hashtable->stash_size++);
+ (int) hashtable->stash_size);
+ inthash_log(hashtable, "debug:stash position");
return 1; /* added */
} else {
/* we are doomed. hopefully the probability is lower than being killed
@@ -483,18 +526,16 @@ int inthash_write_value(inthash hashtable, const char *name,
/* clear upper half */
data = (char*) hashtable->items;
- memset(data + prev_size, 0, prev_size);
+ memset(data + prev_alloc_size, 0, prev_alloc_size);
/* relocate lower half items when needed */
for(i = 0 ; i < prev_size ; i++) {
if (!inthash_is_free(hashtable, i)) {
- const inthash_keys hashes = hashtable->items[i].hashes;
- size_t pos;
+ const inthash_keys *const hashes = &hashtable->items[i].hashes;
/* currently at old position 1 */
- pos = inthash_hash_to_pos_(prev_power, hashes.hash1);
- if (pos == i) {
- const size_t pos = inthash_hash_to_pos(hashtable, hashes.hash1);
+ if (inthash_hash_to_pos_(prev_power, hashes->hash1) == i) {
+ const size_t pos = inthash_hash_to_pos(hashtable, hashes->hash1);
/* no more the expected position */
if (pos != i) {
inthash_assert(pos >= prev_size);
@@ -502,8 +543,8 @@ int inthash_write_value(inthash hashtable, const char *name,
memset(&hashtable->items[i], 0, sizeof(hashtable->items[i]));
}
}
- else if (inthash_hash_to_pos_(prev_power, hashes.hash2) == i) {
- const size_t pos = inthash_hash_to_pos(hashtable, hashes.hash2);
+ else if (inthash_hash_to_pos_(prev_power, hashes->hash2) == i) {
+ const size_t pos = inthash_hash_to_pos(hashtable, hashes->hash2);
/* no more the expected position */
if (pos != i) {
inthash_assert(pos >= prev_size);
@@ -512,7 +553,7 @@ int inthash_write_value(inthash hashtable, const char *name,
}
}
else {
- inthash_assert(! "hashtable unexpected internal error");
+ inthash_assert(! "hashtable unexpected internal error (bad position)");
}
}
}
@@ -531,7 +572,7 @@ int inthash_write_value(inthash hashtable, const char *name,
hashtable->stash_size = 0;
/* insert all items */
- for(i = 0 ; i < hashtable->stash_size ; i++) {
+ for(i = 0 ; i < old_size ; i++) {
const int ret = inthash_write_value_(hashtable, stash[i].name,
stash[i].value);
if (ret == 0) {
@@ -539,7 +580,7 @@ int inthash_write_value(inthash hashtable, const char *name,
}
}
- /* delete old names */
+ /* delete old names (not values) */
for(i = 0 ; i < hashtable->stash_size ; i++) {
inthash_del_name(hashtable, &stash[i]);
}
@@ -678,6 +719,8 @@ static int inthash_remove_(inthash hashtable, const char *name,
}
hashtable->stash_size--;
*removed = (size_t) -1;
+ inthash_log(hashtable, "debug:deleted item in stash (%d entries)",
+ (int) hashtable->stash_size);
return 1;
}
}
@@ -715,6 +758,9 @@ int inthash_remove(inthash hashtable, const char *name) {
hashtable->stash[i] = hashtable->stash[i + 1];
}
hashtable->stash_size--;
+ inthash_log(hashtable, "debug:moved item from stash (%d entries)",
+ (int) hashtable->stash_size);
+ break;
}
}
}
@@ -782,22 +828,25 @@ void inthash_delete(inthash *phashtable) {
const size_t hash_size = POW2(hashtable->hash_size_power);
size_t i;
- /* wipe hashtable */
+ /* wipe hashtable values (not names) */
for(i = 0 ; i < hash_size ; i++) {
if (!inthash_is_free(hashtable, i)) {
- inthash_del_item(hashtable, &hashtable->items[i]);
+ inthash_del_value(hashtable, i);
}
}
- /* wipe auxiliary stash if any */
+ /* wipe auxiliary stash values (not names) if any */
for(i = 0 ; i < hashtable->stash_size ; i++) {
- inthash_del_item(hashtable, &hashtable->stash[i]);
+ inthash_del_value_(hashtable, &hashtable->stash[i].value);
}
}
/* wipe top-level */
hashtable->hash_size_power = 0;
hashtable->nitems = 0;
+ free(hashtable->string_pool);
+ hashtable->string_pool = NULL;
free(hashtable->items);
+ hashtable->items = NULL;
free(hashtable);
*phashtable = NULL;
}