Hashtable fixes

MD5 is temporarily back until some better hash function is found (current LCG is too lame and prone to collisions)
author: Xavier Roche <xroche@users.noreply.github.com> 2013-06-24 20:06:12 +0000
committer: Xavier Roche <xroche@users.noreply.github.com> 2013-06-24 20:06:12 +0000
commit: 9090ee17b364f44a8df75f927b26c6e352ef9fff (patch)
tree: 1bcaabe45742f7e3492ed0c7f3e205cd3e7ed849 /src
parent: 9e61bc36f3a3d15375b11321893dd27cd7d4b88b (diff)
3 files changed, 112 insertions, 64 deletions
diff --git a/src/htshash.c b/src/htshash.c
index 475f3fc..2d12941 100644
--- a/src/htshash.c
+++ b/src/htshash.c
@@ -67,59 +67,65 @@ void hash_init(hash_struct * hash) {
   hash->former_adrfil = inthash_new(0);
 }
 
-// retour: position ou -1 si non trouvé
-int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
-              int type, int normalized) {
-  char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
-  char catbuff[CATBUFF_SIZE];
-  const char *normfil;
-  intptr_t intvalue;
-  char *name;
-
+static char * normalize_key(const char *nom1, const char *nom2,
+                            hash_struct_type type, int normalized, 
+                            char *normfil_, char *catbuff) {
   /* dispatch type */
+  const char *normfil;
   switch(type) {
-  case 0:
+  case HASH_STRUCT_FILENAME:
     /* first entry: destination filename (lowercased) */
     assertf(nom2 == NULL || *nom2 == '\0');
-    name = convtolower(catbuff, nom1);
+    return convtolower(catbuff, nom1);
     break;
-  case 1:
-  case 2:
+  case HASH_STRUCT_ADR_PATH:
+  case HASH_STRUCT_ORIGINAL_ADR_PATH:
     /* second and third entries: URL address and path */
     if (!normalized)
-      normfil = nom1;
+      normfil = nom2;
     else
-      normfil = fil_normalized(nom1, normfil_);
+      normfil = fil_normalized(nom2, normfil_);
     if (!normalized) {
-      strcpybuff(catbuff, jump_identification(nom2));
+      strcpybuff(catbuff, jump_identification(nom1));
     } else {
-      strcpybuff(catbuff, jump_normalized(nom2));
+      strcpybuff(catbuff, jump_normalized(nom1));
     }
     strcatbuff(catbuff, normfil);
-    name = catbuff;
+    return catbuff;
     break;
   default:
     assertf(! "unexpected case");
+    return NULL;
     break;
   }
+}
+
+// retour: position ou -1 si non trouvé
+int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
+              hash_struct_type type, int normalized) {
+  char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
+  char catbuff[CATBUFF_SIZE];
+  intptr_t intvalue;
+  char *const name = normalize_key(nom1, nom2, type, normalized, 
+    normfil_, catbuff);
 
   /* read */
   switch(type) {
-  case 0:
+  case HASH_STRUCT_FILENAME:
     if (inthash_read(hash->sav, name, &intvalue)) {
       return (int) intvalue;
     } else {
       return -1;
     }
     break;
-  case 1:
+  case HASH_STRUCT_ADR_PATH:
     if (inthash_read(hash->adrfil, name, &intvalue)) {
       return (int) intvalue;
     } else {
       return -1;
     }
     break;
-  case 2:
+  case HASH_STRUCT_ORIGINAL_ADR_PATH:
     if (inthash_read(hash->former_adrfil, name, &intvalue)) {
       return (int) intvalue;
     } else {
@@ -141,34 +147,20 @@ void hash_write(hash_struct * hash, int lpos, int normalized) {
   const char *name;
 
   /* first entry: destination filename (lowercased) */
-  name = convtolower(catbuff, hash->liens[lpos]->sav);
+  name = normalize_key(hash->liens[lpos]->sav, NULL, HASH_STRUCT_FILENAME,
+    normalized, normfil_, catbuff);
   inthash_write(hash->sav, name, lpos);
 
   /* second entry: URL address and path */
-  if (!normalized)
-    normfil = hash->liens[lpos]->fil;
-  else
-    normfil = fil_normalized(hash->liens[lpos]->fil, normfil_);
-  if (!normalized) {
-    strcpybuff(catbuff, jump_identification(hash->liens[lpos]->adr));
-  } else {
-    strcpybuff(catbuff, jump_normalized(hash->liens[lpos]->adr));
-  }
-  strcatbuff(catbuff, normfil);
+  name = normalize_key(hash->liens[lpos]->adr, hash->liens[lpos]->fil, 
+    HASH_STRUCT_ADR_PATH, normalized, normfil_, catbuff);
   inthash_write(hash->adrfil, name, lpos);
 
   /* third entry: URL address and path before redirect */
   if (hash->liens[lpos]->former_adr) {        // former_adr existe?
-    if (!normalized) {
-      normfil = hash->liens[lpos]->former_fil;
-    } else {
-      normfil = fil_normalized(hash->liens[lpos]->former_fil, normfil_);
-    }
-    if (!normalized)
-      strcpybuff(catbuff, jump_identification(hash->liens[lpos]->former_adr));
-    else
-      strcpybuff(catbuff, jump_normalized(hash->liens[lpos]->former_adr));
-    strcatbuff(catbuff, normfil);
+    name = normalize_key(hash->liens[lpos]->former_adr, 
+      hash->liens[lpos]->former_fil, HASH_STRUCT_ORIGINAL_ADR_PATH,
+      normalized, normfil_, catbuff);
     inthash_write(hash->former_adrfil, name, lpos);
   }
 }
diff --git a/src/htshash.h b/src/htshash.h
index 2b0d9bd..62ea118 100644
--- a/src/htshash.h
+++ b/src/htshash.h
@@ -44,12 +44,19 @@ Please visit our Website: http://www.httrack.com
 typedef struct hash_struct hash_struct;
 #endif
 
+/** Type of hash. **/
+typedef enum hash_struct_type {
+  HASH_STRUCT_FILENAME = 0,
+  HASH_STRUCT_ADR_PATH,
+  HASH_STRUCT_ORIGINAL_ADR_PATH
+} hash_struct_type;
+
 // tables de hachage
 void hash_init(hash_struct * hash);
 int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
-              int type, int normalized);
+              hash_struct_type type, int normalized);
 void hash_write(hash_struct * hash, int lpos, int normalized);
-int *hash_calc_chaine(hash_struct * hash, int type, int pos);
+int *hash_calc_chaine(hash_struct * hash, hash_struct_type type, int pos);
 unsigned long int hash_cle(const char *nom1, const char *nom2);
 #endif
 
diff --git a/src/htsinthash.c b/src/htsinthash.c
index e8a72d3..d82a6d7 100644
--- a/src/htsinthash.c
+++ b/src/htsinthash.c
@@ -43,6 +43,12 @@ Please visit our Website: http://www.httrack.com
 
 #include "htsinthash.h"
 
+#define USES_MD5
+
+#ifdef USES_MD5
+#include "htsmd5.h"
+#endif
+
 /** Hashtable. **/
 struct struct_inthash {
   inthash_item *items;
@@ -77,7 +83,7 @@ static void inthash_fail(const char* exp, const char* file, int line) {
 #define HASH_ADD(HASH, C) do {                  \
     (HASH) *= HASH_PRIME;                       \
     (HASH) += HASH_CONST;                       \
-    (HASH) += (C);                              \
+    (HASH) ^= ( (C) << 24);                     \
   } while(0)
 
 /* 2**X */
@@ -95,7 +101,31 @@ static void inthash_log(inthash hashtable,
   va_end(args);
 }
 
+#ifndef USES_MD5
+static unsigned int rev32(unsigned int v) {
+  /* swap odd and even bits */
+  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+  /* swap consecutive pairs */
+  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+  /* swap nibbles ...  */
+  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+  /* swap bytes */
+  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+  /* swap 2-byte long pairs */
+  v = ( v >> 16             ) | ( v               << 16);
+  return v;
+}
+#endif
+
 static inthash_keys inthash_calc_hashes(const char *value) {
+#ifdef USES_MD5
+  union {
+    char md5digest[16];
+    inthash_keys hashes;
+  } u;
+  domd5mem(value, strlen(value), u.md5digest, 0);
+  return u.hashes;
+#else
   /* compute two lcg hashes using different seeds */
   inthash_keys hash = { 0, ~0 };
   size_t i;
@@ -109,7 +139,12 @@ static inthash_keys inthash_calc_hashes(const char *value) {
   if (hash.hash1 == hash.hash2) {
     hash.hash2 = ~hash.hash2;
   }
+  /* reverse bits: "higher-order bits have longer periods than the lower 
+     order bits" (Wikipedia) */
+  hash.hash1 ^= rev32(hash.hash1);
+  hash.hash2 ^= rev32(hash.hash2);
   return hash;
+#endif
 }
 
 static HTS_INLINE int inthash_is_free(const inthash hashtable, size_t pos) {
@@ -167,7 +202,7 @@ static void inthash_realloc_pool(inthash hashtable) {
 
 #undef RECOMPUTE_STRING
 
-  inthash_log(hashtable, "reallocated string pool for %d strings: %d bytes\n",
+  inthash_log(hashtable, "reallocated string pool for %d strings: %d bytes",
               count, (int) hashtable->string_pool_capacity);
 }
 
@@ -257,7 +292,6 @@ static void inthash_free_name(inthash hashtable, char *name) {
 static HTS_INLINE size_t inthash_hash_to_pos_(size_t hash_size_power,
                                               inthash_key hash) {
   const inthash_key mask = POW2(hash_size_power) - 1;
-  /* TODO: investigate if lower bits of LCG are sane */
   return hash & mask;
 }
 
@@ -385,6 +419,8 @@ static int inthash_write_value_(inthash hashtable, const char *name,
     /* prepare cuckoo ; let's take position 1 */
     else {
       cuckoo_hash = initial_cuckoo_hash = hashes.hash1;
+      inthash_log(hashtable, "debug:collision with '%s' at %d (%x)", 
+        item.name, pos, cuckoo_hash);
     }
   }
 
@@ -392,8 +428,12 @@ static int inthash_write_value_(inthash hashtable, const char *name,
   for(loops = POW2(hashtable->hash_size_power) ; loops != 0 ; --loops) {
     const size_t pos = inthash_hash_to_pos(hashtable, cuckoo_hash);
 
+    inthash_log(hashtable, "\tdebug:placing cuckoo '%s' at %d (%x)", 
+      item.name, pos, cuckoo_hash);
+
     /* place at alternate free position ? */
     if (inthash_is_free(hashtable, pos)) {
+      inthash_log(hashtable, "debug:free position");
       hashtable->items[pos] = item;
       return 1; /* added */
     }
@@ -408,11 +448,13 @@ static int inthash_write_value_(inthash hashtable, const char *name,
       /* we just kicked this item from its position 1 */
       if (pos == inthash_hash_to_pos(hashtable, item.hashes.hash1)) {
         /* then place it on position 2 on next run */
+        inthash_log(hashtable, "\tdebug:position 1");
         cuckoo_hash = item.hashes.hash2;
       }
       /* we just kicked this item from its position 2 */
       else if (pos == inthash_hash_to_pos(hashtable, item.hashes.hash2)) {
         /* then place it on position 1 on next run */
+        inthash_log(hashtable, "\tdebug:position 2");
         cuckoo_hash = item.hashes.hash1;
       }
       else {
@@ -421,7 +463,7 @@ static int inthash_write_value_(inthash hashtable, const char *name,
 
       /* we are looping (back to same hash) */
       /* TODO FIXME: we should actually check the positions */
-      if (cuckoo_hash == initial_cuckoo_hash) {
+      if (cuckoo_hash == initial_cuckoo_hash && 0) {
         /* emergency stash */
         break;
       }
@@ -433,7 +475,8 @@ static int inthash_write_value_(inthash hashtable, const char *name,
     hashtable->stash[hashtable->stash_size] = item;
     hashtable->stash_size++;
     inthash_log(hashtable, "used stash because of collision (%d entries)",
-                (int) hashtable->stash_size++);
+                (int) hashtable->stash_size);
+    inthash_log(hashtable, "debug:stash position");
     return 1; /* added */
   } else {
     /* we are doomed. hopefully the probability is lower than being killed
@@ -483,18 +526,16 @@ int inthash_write_value(inthash hashtable, const char *name,
 
       /* clear upper half */
       data = (char*) hashtable->items;
-      memset(data + prev_size, 0, prev_size);
+      memset(data + prev_alloc_size, 0, prev_alloc_size);
 
       /* relocate lower half items when needed */
       for(i = 0 ; i < prev_size ; i++) {
         if (!inthash_is_free(hashtable, i)) {
-          const inthash_keys hashes = hashtable->items[i].hashes;
-          size_t pos;
+          const inthash_keys *const hashes = &hashtable->items[i].hashes;
 
           /* currently at old position 1 */
-          pos = inthash_hash_to_pos_(prev_power, hashes.hash1);
-          if (pos == i) {
-            const size_t pos = inthash_hash_to_pos(hashtable, hashes.hash1);
+          if (inthash_hash_to_pos_(prev_power, hashes->hash1) == i) {
+            const size_t pos = inthash_hash_to_pos(hashtable, hashes->hash1);
             /* no more the expected position */
             if (pos != i) {
               inthash_assert(pos >= prev_size);
@@ -502,8 +543,8 @@ int inthash_write_value(inthash hashtable, const char *name,
               memset(&hashtable->items[i], 0, sizeof(hashtable->items[i]));
             }
           }
-          else if (inthash_hash_to_pos_(prev_power, hashes.hash2) == i) {
-            const size_t pos = inthash_hash_to_pos(hashtable, hashes.hash2);
+          else if (inthash_hash_to_pos_(prev_power, hashes->hash2) == i) {
+            const size_t pos = inthash_hash_to_pos(hashtable, hashes->hash2);
             /* no more the expected position */
             if (pos != i) {
               inthash_assert(pos >= prev_size);
@@ -512,7 +553,7 @@ int inthash_write_value(inthash hashtable, const char *name,
             }
           }
           else {
-            inthash_assert(! "hashtable unexpected internal error");
+            inthash_assert(! "hashtable unexpected internal error (bad position)");
           }
         }
       }
@@ -531,7 +572,7 @@ int inthash_write_value(inthash hashtable, const char *name,
         hashtable->stash_size = 0;
 
         /* insert all items */
-        for(i = 0 ; i < hashtable->stash_size ; i++) {
+        for(i = 0 ; i < old_size ; i++) {
           const int ret = inthash_write_value_(hashtable, stash[i].name,
                                                stash[i].value);
           if (ret == 0) {
@@ -539,7 +580,7 @@ int inthash_write_value(inthash hashtable, const char *name,
           }
         }
 
-        /* delete old names */
+        /* delete old names (not values) */
         for(i = 0 ; i < hashtable->stash_size ; i++) {
           inthash_del_name(hashtable, &stash[i]);
         }
@@ -678,6 +719,8 @@ static int inthash_remove_(inthash hashtable, const char *name,
         }
         hashtable->stash_size--;
         *removed = (size_t) -1;
+        inthash_log(hashtable, "debug:deleted item in stash (%d entries)",
+          (int) hashtable->stash_size);
         return 1;
       }
     }
@@ -715,6 +758,9 @@ int inthash_remove(inthash hashtable, const char *name) {
             hashtable->stash[i] = hashtable->stash[i + 1];
           }
           hashtable->stash_size--;
+          inthash_log(hashtable, "debug:moved item from stash (%d entries)",
+            (int) hashtable->stash_size);
+          break;
         }
       }
     }
@@ -782,22 +828,25 @@ void inthash_delete(inthash *phashtable) {
         const size_t hash_size = POW2(hashtable->hash_size_power);
         size_t i;
 
-        /* wipe hashtable */
+        /* wipe hashtable values (not names) */
         for(i = 0 ; i < hash_size ; i++) {
           if (!inthash_is_free(hashtable, i)) {
-            inthash_del_item(hashtable, &hashtable->items[i]);
+            inthash_del_value(hashtable, i);
           }
         }
 
-        /* wipe auxiliary stash if any */
+        /* wipe auxiliary stash values (not names) if any */
         for(i = 0 ; i < hashtable->stash_size ; i++) {
-          inthash_del_item(hashtable, &hashtable->stash[i]);
+          inthash_del_value_(hashtable, &hashtable->stash[i].value);
         }
       }
       /* wipe top-level */
       hashtable->hash_size_power = 0;
       hashtable->nitems = 0;
+      free(hashtable->string_pool);
+      hashtable->string_pool = NULL;
       free(hashtable->items);
+      hashtable->items = NULL;
       free(hashtable);
       *phashtable = NULL;
     }
author	Xavier Roche <xroche@users.noreply.github.com>	2013-06-24 20:06:12 +0000
committer	Xavier Roche <xroche@users.noreply.github.com>	2013-06-24 20:06:12 +0000
commit	9090ee17b364f44a8df75f927b26c6e352ef9fff (patch)
tree	1bcaabe45742f7e3492ed0c7f3e205cd3e7ed849 /src
parent	9e61bc36f3a3d15375b11321893dd27cd7d4b88b (diff)