diff options
-rw-r--r-- | src/htscoremain.c | 56 | ||||
-rw-r--r-- | src/htsinthash.c | 35 | ||||
-rw-r--r-- | src/murmurhash3.h | 127 |
3 files changed, 210 insertions, 8 deletions
diff --git a/src/htscoremain.c b/src/htscoremain.c index 3af785e..ea89810 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -2374,7 +2374,7 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) { case '7': // hashtable selftest: httrack -#7 nb_entries if (++na < argc) { char *const snum = strdup(argv[na]); - unsigned long count; + unsigned long count = 0; const char *const names[] = { "", "add", "delete", "dry-add", "dry-del", "test-exists", "test-not-exist" @@ -2419,14 +2419,56 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) { { TEST_ADD, 42, 2 }, /* check 42/2 */ { DO_END } }; + char *buff = NULL; + const char **strings = NULL; + + /* produce key #i */ #define FMT() \ - char name[256]; \ + char buffer[256]; \ + char *name = buffer; \ const long expected = (long) i * 1664525 + 1013904223; \ - snprintf(name, sizeof(name), \ - "http://www.example.com/website/sample/for/hashtable/" \ - "%ld/index.html?foo=%ld&bar", \ - (long) i, (long) (expected)) - if (sscanf(snum, "%lu", &count) == 1) { + do { \ + if (strings == NULL) { \ + snprintf(name, sizeof(name), \ + "http://www.example.com/website/sample/for/hashtable/" \ + "%ld/index.html?foo=%ld&bar", \ + (long) i, (long) (expected)); \ + } else { \ + name = strings[i]; \ + } \ + } while(0) + + /* produce random patterns, or read from a file */ + if (sscanf(snum, "%lu", &count) != 1) { + const off_t size = fsize(snum); + FILE *fp = fopen(snum, "rb"); + if (fp != NULL) { + buff = malloc(size); + if (buff != NULL && fread(buff, 1, size, fp) == size) { + size_t capa = 0; + size_t i, last; + for(i = 0, last = 0, count = 0 ; i < size ; i++) { + if (buff[i] == 10 || buff[i] == 0) { + buff[i] = '\0'; + if (capa == count) { + if (capa == 0) { + capa = 16; + } else { + capa <<= 1; + } + strings = realloc(strings, capa*sizeof(char*)); + } + strings[count++] = &buff[last]; + last = i + 1; + } + } + } + fclose(fp); + } + } + + /* successfully read */ + if (count > 0) { inthash hashtable = 
inthash_new(0); size_t loop; for(loop = 0 ; bench[loop].type != DO_END ; loop++) { diff --git a/src/htsinthash.c b/src/htsinthash.c index b273a01..6cba621 100644 --- a/src/htsinthash.c +++ b/src/htsinthash.c @@ -62,12 +62,22 @@ Please visit our Website: http://www.httrack.com filled with 4 entries ; whereas the MD5 variant did only collide once ] */ -#ifndef HTS_INTHASH_USES_MD5 +#if (!defined(HTS_INTHASH_USES_MD5) && !defined(HTS_INTHASH_USES_MURMUR)) #define HTS_INTHASH_USES_MD5 1 #endif #if HTS_INTHASH_USES_MD5 == 1 #include "md5.h" +#elif (defined(HTS_INTHASH_USES_MURMUR)) +#include "murmurhash3.h" +#else +/* use the Openssl implementation */ +#if 0 +#include <openssl/md5.h> +#define MD5Init MD5_Init +#define MD5Update MD5_Update +#define MD5Final MD5_Final +#endif #endif /** Size of auxiliary stash. **/ @@ -231,6 +241,7 @@ inthash_keys inthash_hash_value(const char *value) { MD5_CTX ctx; union { unsigned char md5digest[16]; + inthash_keys mhashes[2]; inthash_keys hashes; } u; @@ -240,6 +251,28 @@ inthash_keys inthash_hash_value(const char *value) { (unsigned int) strlen(value)); MD5Final(u.md5digest, &ctx); + /* mix mix mix */ + u.mhashes[0].hash1 ^= u.mhashes[1].hash1; + u.mhashes[0].hash2 ^= u.mhashes[1].hash2; + + /* do not keep identical hashes */ + if (u.hashes.hash1 == u.hashes.hash2) { + u.hashes.hash2 = ~u.hashes.hash2; + } + + return u.hashes; +#elif (defined(HTS_INTHASH_USES_MURMUR)) + union { + uint32_t result[4]; + inthash_keys hashes; + } u; + MurmurHash3_x86_128(value, (const int) strlen(value), + 42, &u.result) ; + + /* mix mix mix */ + u.result[0] ^= u.result[2]; + u.result[1] ^= u.result[3]; + /* do not keep identical hashes */ if (u.hashes.hash1 == u.hashes.hash2) { u.hashes.hash2 = ~u.hashes.hash2; diff --git a/src/murmurhash3.h b/src/murmurhash3.h new file mode 100644 index 0000000..62cda97 --- /dev/null +++ b/src/murmurhash3.h @@ -0,0 +1,127 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 
// was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.

#ifndef HTS_MURMURHASH3_H
#define HTS_MURMURHASH3_H

#include <stdint.h>
#include <string.h>

/* Rotate x left by r bits.
   r must stay within 1..31 (a shift by 32 would be undefined); every call
   in this file uses a rotation between 13 and 19. */
static uint32_t rotl32(uint32_t x, int8_t r)
{
  return (x << r) | (x >> (32 - r));
}

#define ROTL32(x,y) rotl32(x,y)

/* Fetch the 32-bit word number i (i may be negative) relative to p.
   The word is copied out with memcpy so that p does not have to be 4-byte
   aligned and no object is accessed through an incompatible pointer type.
   The bytes are interpreted in host byte order. */
static uint32_t getblock32(const void *p, int i)
{
  uint32_t word;

  memcpy(&word, (const uint8_t *) p + i * 4, sizeof(word));
  return word;
}

/* Final avalanche step: scramble all the bits of one 32-bit lane. */
static uint32_t fmix32(uint32_t h)
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;

  return h;
}

#define BIG_CONSTANT(x) (x##LLU)

/* MurmurHash3, x86 flavour, 128-bit result.

   key:  bytes to hash (only the first len bytes are read; no alignment
         requirement)
   len:  number of bytes in key; negative lengths are not handled
   seed: initial value of the four 32-bit lanes
   out:  receives 16 bytes (four uint32_t lanes h1..h4, stored in host
         representation; no alignment requirement)

   Full 16-byte blocks are read as host-order words whereas the trailing
   bytes are assembled least-significant-byte first, so digests are not
   expected to match between little-endian and big-endian hosts. */
static void MurmurHash3_x86_128(const void *key, const int len,
                                uint32_t seed, void *out)
{
  const uint8_t *data = (const uint8_t *) key;
  const int nblocks = len / 16;

  uint32_t h1 = seed;
  uint32_t h2 = seed;
  uint32_t h3 = seed;
  uint32_t h4 = seed;

  const uint32_t c1 = 0x239b961b;
  const uint32_t c2 = 0xab0e9789;
  const uint32_t c3 = 0x38b34ae5;
  const uint32_t c4 = 0xa1e38b93;

  /* Points just past the last full block; blocks are reached with
     negative word indexes counting up to zero. */
  const uint8_t *blocks = data + nblocks * 16;
  int i;

  /* body: consume every full 16-byte block */
  for (i = -nblocks; i; i++) {
    uint32_t k1 = getblock32(blocks, i * 4 + 0);
    uint32_t k2 = getblock32(blocks, i * 4 + 1);
    uint32_t k3 = getblock32(blocks, i * 4 + 2);
    uint32_t k4 = getblock32(blocks, i * 4 + 3);

    k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1;

    h1 = ROTL32(h1, 19); h1 += h2; h1 = h1 * 5 + 0x561ccd1b;

    k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2;

    h2 = ROTL32(h2, 17); h2 += h3; h2 = h2 * 5 + 0x0bcaa747;

    k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3;

    h3 = ROTL32(h3, 15); h3 += h4; h3 = h3 * 5 + 0x96cd1c35;

    k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4;

    h4 = ROTL32(h4, 13); h4 += h1; h4 = h4 * 5 + 0x32ac3b17;
  }

  {
    /* tail: the 0..15 bytes left after the last full block */
    const uint8_t *tail = data + nblocks * 16;
    uint32_t digest[4];

    uint32_t k1 = 0;
    uint32_t k2 = 0;
    uint32_t k3 = 0;
    uint32_t k4 = 0;

    /* Every byte is widened to uint32_t before shifting: a plain uint8_t
       is promoted to (signed) int, and shifting a value >= 0x80 left by
       24 would overflow it. All cases fall through on purpose. */
    switch (len & 15) {
    case 15: k4 ^= (uint32_t) tail[14] << 16;  /* fallthrough */
    case 14: k4 ^= (uint32_t) tail[13] << 8;   /* fallthrough */
    case 13: k4 ^= (uint32_t) tail[12];
      k4 *= c4; k4 = ROTL32(k4, 18); k4 *= c1; h4 ^= k4;
      /* fallthrough */
    case 12: k3 ^= (uint32_t) tail[11] << 24;  /* fallthrough */
    case 11: k3 ^= (uint32_t) tail[10] << 16;  /* fallthrough */
    case 10: k3 ^= (uint32_t) tail[9] << 8;    /* fallthrough */
    case 9: k3 ^= (uint32_t) tail[8];
      k3 *= c3; k3 = ROTL32(k3, 17); k3 *= c4; h3 ^= k3;
      /* fallthrough */
    case 8: k2 ^= (uint32_t) tail[7] << 24;    /* fallthrough */
    case 7: k2 ^= (uint32_t) tail[6] << 16;    /* fallthrough */
    case 6: k2 ^= (uint32_t) tail[5] << 8;     /* fallthrough */
    case 5: k2 ^= (uint32_t) tail[4];
      k2 *= c2; k2 = ROTL32(k2, 16); k2 *= c3; h2 ^= k2;
      /* fallthrough */
    case 4: k1 ^= (uint32_t) tail[3] << 24;    /* fallthrough */
    case 3: k1 ^= (uint32_t) tail[2] << 16;    /* fallthrough */
    case 2: k1 ^= (uint32_t) tail[1] << 8;     /* fallthrough */
    case 1: k1 ^= (uint32_t) tail[0];
      k1 *= c1; k1 = ROTL32(k1, 15); k1 *= c2; h1 ^= k1;
      break;
    default:
      /* length is a multiple of 16: no trailing bytes */
      break;
    }

    /* finalization: fold in the length, then cross-mix the four lanes */
    h1 ^= (uint32_t) len; h2 ^= (uint32_t) len;
    h3 ^= (uint32_t) len; h4 ^= (uint32_t) len;

    h1 += h2; h1 += h3; h1 += h4;
    h2 += h1; h3 += h1; h4 += h1;

    h1 = fmix32(h1);
    h2 = fmix32(h2);
    h3 = fmix32(h3);
    h4 = fmix32(h4);

    h1 += h2; h1 += h3; h1 += h4;
    h2 += h1; h3 += h1; h4 += h1;

    /* Copy the lanes out bytewise so that out may have any alignment. */
    digest[0] = h1;
    digest[1] = h2;
    digest[2] = h3;
    digest[3] = h4;
    memcpy(out, digest, sizeof(digest));
  }
}

#endif /* HTS_MURMURHASH3_H */