summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Xavier Roche <xroche@users.noreply.github.com> 2013-07-12 18:03:14 +0000
committer Xavier Roche <xroche@users.noreply.github.com> 2013-07-12 18:03:14 +0000
commit 1ed1ddf658370994ece99f904b386256374ab2d1 (patch)
tree a7929cc3d4b139f83dfd3be0eb1bcbed1fcc3046 /src
parent 676776e99be7a347eab5d4bbf4196204467571f5 (diff)
Experiments with MurmurHash3 as hashing function for the hashtable
Diffstat (limited to 'src')
-rw-r--r-- src/htscoremain.c 56
-rw-r--r-- src/htsinthash.c 35
-rw-r--r-- src/murmurhash3.h 127
3 files changed, 210 insertions, 8 deletions
diff --git a/src/htscoremain.c b/src/htscoremain.c
index 3af785e..ea89810 100644
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -2374,7 +2374,7 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) {
case '7': // hashtable selftest: httrack -#7 nb_entries
if (++na < argc) {
char *const snum = strdup(argv[na]);
- unsigned long count;
+ unsigned long count = 0;
const char *const names[] = {
"", "add", "delete", "dry-add", "dry-del",
"test-exists", "test-not-exist"
@@ -2419,14 +2419,56 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) {
{ TEST_ADD, 42, 2 }, /* check 42/2 */
{ DO_END }
};
+ char *buff = NULL;
+ const char **strings = NULL;
+
+ /*
+  * FMT(): produce key #i.
+  * Declares, in the scope where it is expanded:
+  *   - buffer:   scratch storage for a generated key,
+  *   - name:     the key to use; points to buffer when the key is generated,
+  *               or to strings[i] when a list of strings is available,
+  *   - expected: a value computed from i (presumably the value associated
+  *               with key #i by the benchmark; confirm at the use sites).
+  * Relies on 'i' and 'strings' being visible at the expansion site.
+  */
#define FMT() \
- char name[256]; \
+ char buffer[256]; \
+ const char *name = buffer; \
const long expected = (long) i * 1664525 + 1013904223; \
- snprintf(name, sizeof(name), \
- "http://www.example.com/website/sample/for/hashtable/" \
- "%ld/index.html?foo=%ld&bar", \
- (long) i, (long) (expected))
- if (sscanf(snum, "%lu", &count) == 1) {
+ do { \
+   if (strings == NULL) { \
+     /* size must come from the array: sizeof(name) is a pointer size */ \
+     snprintf(buffer, sizeof(buffer), \
+              "http://www.example.com/website/sample/for/hashtable/" \
+              "%ld/index.html?foo=%ld&bar", \
+              (long) i, (long) (expected)); \
+   } else { \
+     name = strings[i]; \
+   } \
+ } while(0)
+
+ /* produce random patterns, or read from a file */
+ if (sscanf(snum, "%lu", &count) != 1) {
+ const off_t size = fsize(snum);
+ FILE *fp = fopen(snum, "rb");
+ if (fp != NULL) {
+ buff = malloc(size);
+ if (buff != NULL && fread(buff, 1, size, fp) == size) {
+ size_t capa = 0;
+ size_t i, last;
+ for(i = 0, last = 0, count = 0 ; i < size ; i++) {
+ if (buff[i] == 10 || buff[i] == 0) {
+ buff[i] = '\0';
+ if (capa == count) {
+ if (capa == 0) {
+ capa = 16;
+ } else {
+ capa <<= 1;
+ }
+ strings = realloc(strings, capa*sizeof(char*));
+ }
+ strings[count++] = &buff[last];
+ last = i + 1;
+ }
+ }
+ }
+ fclose(fp);
+ }
+ }
+
+ /* successfully read */
+ if (count > 0) {
inthash hashtable = inthash_new(0);
size_t loop;
for(loop = 0 ; bench[loop].type != DO_END ; loop++) {
diff --git a/src/htsinthash.c b/src/htsinthash.c
index b273a01..6cba621 100644
--- a/src/htsinthash.c
+++ b/src/htsinthash.c
@@ -62,12 +62,22 @@ Please visit our Website: http://www.httrack.com
filled with 4 entries ; whereas the MD5 variant did only collide
once ]
*/
+/* Hash backend selection.
+   - If neither HTS_INTHASH_USES_MD5 nor HTS_INTHASH_USES_MURMUR is defined,
+     HTS_INTHASH_USES_MD5 defaults to 1.
+   - HTS_INTHASH_USES_MD5 == 1 includes "md5.h".
+   - Otherwise, if HTS_INTHASH_USES_MURMUR is defined, "murmurhash3.h" is
+     included.
+   - Otherwise nothing is included: the OpenSSL mapping below is disabled
+     by '#if 0'. */
-#ifndef HTS_INTHASH_USES_MD5
+#if (!defined(HTS_INTHASH_USES_MD5) && !defined(HTS_INTHASH_USES_MURMUR))
#define HTS_INTHASH_USES_MD5 1
#endif
#if HTS_INTHASH_USES_MD5 == 1
#include "md5.h"
+#elif (defined(HTS_INTHASH_USES_MURMUR))
+#include "murmurhash3.h"
+#else
+/* use the Openssl implementation */
+#if 0
+#include <openssl/md5.h>
+#define MD5Init MD5_Init
+#define MD5Update MD5_Update
+#define MD5Final MD5_Final
+#endif
#endif
/** Size of auxiliary stash. **/
@@ -231,6 +241,7 @@ inthash_keys inthash_hash_value(const char *value) {
MD5_CTX ctx;
union {
unsigned char md5digest[16];
+ inthash_keys mhashes[2];
inthash_keys hashes;
} u;
@@ -240,6 +251,28 @@ inthash_keys inthash_hash_value(const char *value) {
(unsigned int) strlen(value));
MD5Final(u.md5digest, &ctx);
+ /* mix mix mix */
+ u.mhashes[0].hash1 ^= u.mhashes[1].hash1;
+ u.mhashes[0].hash2 ^= u.mhashes[1].hash2;
+
+ /* do not keep identical hashes */
+ if (u.hashes.hash1 == u.hashes.hash2) {
+ u.hashes.hash2 = ~u.hashes.hash2;
+ }
+
+ return u.hashes;
+#elif (defined(HTS_INTHASH_USES_MURMUR))
+ union {
+ uint32_t result[4];
+ inthash_keys hashes;
+ } u;
+ MurmurHash3_x86_128(value, (const int) strlen(value),
+ 42, &u.result) ;
+
+ /* mix mix mix */
+ u.result[0] ^= u.result[2];
+ u.result[1] ^= u.result[3];
+
/* do not keep identical hashes */
if (u.hashes.hash1 == u.hashes.hash2) {
u.hashes.hash2 = ~u.hashes.hash2;
diff --git a/src/murmurhash3.h b/src/murmurhash3.h
new file mode 100644
index 0000000..62cda97
--- /dev/null
+++ b/src/murmurhash3.h
@@ -0,0 +1,127 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include <stdint.h>
+#include <string.h>
+
+/**
+ * Rotate the 32-bit value x left by r bits.
+ * Both shift counts are reduced modulo 32, so r == 0 (or any multiple of
+ * 32) returns x unchanged instead of shifting by the full width of the
+ * type, which is undefined behavior in C.
+ */
+static uint32_t rotl32 ( uint32_t x, int8_t r )
+{
+  const unsigned int left = (unsigned int) r & 31u;
+  const unsigned int right = (32u - left) & 31u;
+
+  return (x << left) | (x >> right);
+}
+/* Spelling used by the hash body; forwards to rotl32(). */
+#define ROTL32(x,y) rotl32(x,y)
+/**
+ * Read the i-th 32-bit word relative to p (i may be negative), in native
+ * byte order.
+ * The bytes are copied out with memcpy() instead of dereferencing p, so
+ * the read stays valid when the address is not suitably aligned for
+ * uint32_t: the hash is fed arbitrary byte pointers.
+ */
+static uint32_t getblock32 ( const uint32_t * p, int i )
+{
+  const unsigned char *const bytes =
+    (const unsigned char *) p + i * (int) sizeof(uint32_t);
+  uint32_t word;
+
+  memcpy(&word, bytes, sizeof(word));
+  return word;
+}
+/**
+ * Finalization mix of one 32-bit hash word: xor-with-right-shift steps
+ * (16, 13, then 16 bits) interleaved with two multiplications by fixed
+ * constants. Arithmetic wraps modulo 2^32.
+ */
+static uint32_t fmix32 ( uint32_t h )
+{
+  uint32_t mixed = h;
+
+  mixed = (mixed ^ (mixed >> 16)) * 0x85ebca6b;
+  mixed = (mixed ^ (mixed >> 13)) * 0xc2b2ae35;
+
+  return mixed ^ (mixed >> 16);
+}
+#define BIG_CONSTANT(x) (x##LLU)
+/**
+ * MurmurHash3, x86 128-bit variant.
+ *
+ * @param key   bytes to hash
+ * @param len   number of bytes to read from key
+ * @param seed  initial value of the four 32-bit hash lanes
+ * @param out   receives the result as four consecutive uint32_t words
+ *              (written through a uint32_t pointer)
+ *
+ * Whole 16-byte blocks are read through getblock32(); the remaining
+ * (len & 15) bytes are folded in by the switch that follows the block loop.
+ */
+static void MurmurHash3_x86_128 ( const void * key, const int len,
+                                  uint32_t seed, void * out ) {
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 16;
+
+  uint32_t h1 = seed;
+  uint32_t h2 = seed;
+  uint32_t h3 = seed;
+  uint32_t h4 = seed;
+
+  const uint32_t c1 = 0x239b961b;
+  const uint32_t c2 = 0xab0e9789;
+  const uint32_t c3 = 0x38b34ae5;
+  const uint32_t c4 = 0xa1e38b93;
+
+  /* points just past the last whole block; blocks are indexed with
+     negative offsets counting up to zero */
+  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
+  int i;
+
+  for(i = -nblocks; i; i++)
+  {
+    uint32_t k1 = getblock32(blocks,i*4+0);
+    uint32_t k2 = getblock32(blocks,i*4+1);
+    uint32_t k3 = getblock32(blocks,i*4+2);
+    uint32_t k4 = getblock32(blocks,i*4+3);
+
+    k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+
+    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+
+    k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+
+    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+
+    k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+
+    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+
+    k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+
+    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+  }
+
+  {
+    const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+    uint32_t k1 = 0;
+    uint32_t k2 = 0;
+    uint32_t k3 = 0;
+    uint32_t k4 = 0;
+
+    /* Each tail byte is widened to uint32_t before shifting: a uint8_t
+       promotes to (signed) int, and shifting a byte >= 0x80 left by 24
+       would overflow into the sign bit. Every case deliberately falls
+       through to the ones below it. */
+    switch(len & 15)
+    {
+    case 15: k4 ^= (uint32_t) tail[14] << 16; /* fallthrough */
+    case 14: k4 ^= (uint32_t) tail[13] << 8;  /* fallthrough */
+    case 13: k4 ^= (uint32_t) tail[12] << 0;
+             k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+             /* fallthrough */
+
+    case 12: k3 ^= (uint32_t) tail[11] << 24; /* fallthrough */
+    case 11: k3 ^= (uint32_t) tail[10] << 16; /* fallthrough */
+    case 10: k3 ^= (uint32_t) tail[ 9] << 8;  /* fallthrough */
+    case  9: k3 ^= (uint32_t) tail[ 8] << 0;
+             k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+             /* fallthrough */
+
+    case  8: k2 ^= (uint32_t) tail[ 7] << 24; /* fallthrough */
+    case  7: k2 ^= (uint32_t) tail[ 6] << 16; /* fallthrough */
+    case  6: k2 ^= (uint32_t) tail[ 5] << 8;  /* fallthrough */
+    case  5: k2 ^= (uint32_t) tail[ 4] << 0;
+             k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+             /* fallthrough */
+
+    case  4: k1 ^= (uint32_t) tail[ 3] << 24; /* fallthrough */
+    case  3: k1 ^= (uint32_t) tail[ 2] << 16; /* fallthrough */
+    case  2: k1 ^= (uint32_t) tail[ 1] << 8;  /* fallthrough */
+    case  1: k1 ^= (uint32_t) tail[ 0] << 0;
+             k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+             break;
+
+    default: /* no tail bytes */
+             break;
+    }
+
+    /* fold the length in, then cross-mix and finalize the four lanes */
+    h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+
+    h1 += h2; h1 += h3; h1 += h4;
+    h2 += h1; h3 += h1; h4 += h1;
+
+    h1 = fmix32(h1);
+    h2 = fmix32(h2);
+    h3 = fmix32(h3);
+    h4 = fmix32(h4);
+
+    h1 += h2; h1 += h3; h1 += h4;
+    h2 += h1; h3 += h1; h4 += h1;
+
+    ((uint32_t*)out)[0] = h1;
+    ((uint32_t*)out)[1] = h2;
+    ((uint32_t*)out)[2] = h3;
+    ((uint32_t*)out)[3] = h4;
+  }
+}