diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-06-23 19:40:47 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-06-23 19:40:47 +0000 |
commit | 0817df2fb67e3a2be95b311c4c2d42d2a1e1bc41 (patch) | |
tree | 25123b396b822a7c2219eb3456eef3dfd9a6462a | |
parent | f42a5fa5dd4b5b8b91b66e7b47602be12aa298b2 (diff) |
Trashed historical link heap hashtable, and replaced it by cleaner code using new cuckoo hashtables
I can not believe I kept such a terible and frightening code for such a long time, geez.
-rw-r--r-- | src/htsback.c | 4 | ||||
-rw-r--r-- | src/htscore.c | 12 | ||||
-rw-r--r-- | src/htscore.h | 11 | ||||
-rw-r--r-- | src/htshash.c | 332 | ||||
-rw-r--r-- | src/htshash.h | 1 | ||||
-rw-r--r-- | src/htsopt.h | 7 | ||||
-rw-r--r-- | src/htsserver.h | 5 |
7 files changed, 118 insertions, 254 deletions
diff --git a/src/htsback.c b/src/htsback.c index 0644de1..e270d81 100644 --- a/src/htsback.c +++ b/src/htsback.c @@ -408,7 +408,7 @@ int back_done_incache(struct_back * sback) { // stored (ready) slots if (sback->ready != NULL) { #ifndef HTS_NO_BACK_ON_DISK - n += inthash_nitems(sback->ready); + n += (int) inthash_nitems(sback->ready); #else struct_inthash_enum e = inthash_enum_new(sback->ready); inthash_chain *item; @@ -2308,7 +2308,7 @@ void back_clean(httrackp * opt, cache_back * cache, struct_back * sback) { int index = hash_read(opt->hash, back[i].url_sav, "", 0, opt->urlhack); // lecture type 0 (sav) if (index >= 0) { - opt->hash->liens[index]->pass2 = -1; /* DONE! */ + opt->liens[index]->pass2 = -1; /* DONE! */ } else { hts_log_print(opt, LOG_INFO, "engine: warning: entry cleaned up, but no trace on heap: %s%s (%s)", diff --git a/src/htscore.c b/src/htscore.c index 9aaf285..5170148 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -449,15 +449,13 @@ int httpmirror(char *url1, httrackp * opt) { // initialiser ptr et lien_tot ptr = 0; lien_tot = 0; + // initialiser hachage - { - int i; + hash_init(&hash); + hash.liens = liens; - for(i = 0; i < HTS_HASH_SIZE; i++) - hash.hash[0][i] = hash.hash[1][i] = hash.hash[2][i] = -1; // pas d'entrées - hash.liens = liens; - hash.max_lien = 0; - } + // we need it + opt->liens = liens; // copier adresse(s) dans liste des adresses { diff --git a/src/htscore.h b/src/htscore.h index d718dfe..2c7a30b 100644 --- a/src/htscore.h +++ b/src/htscore.h @@ -261,9 +261,14 @@ struct cache_back { typedef struct hash_struct hash_struct; #endif struct hash_struct { - lien_url **liens; // pointeur sur liens - int max_lien; // indice le plus grand rencontré - int hash[3][HTS_HASH_SIZE]; // tables pour sav/adr-fil/former_adr-former_fil + /* Links big array reference */ + const lien_url **liens; + /* Savename (case insensitive ; lowercased) */ + inthash sav; + /* Address and path */ + inthash adrfil; + /* Former address and path */ + inthash former_adrfil; }; #ifndef HTS_DEF_FWSTRUCT_filecreate_params diff --git a/src/htshash.c b/src/htshash.c index de673b5..475f3fc 100644 --- a/src/htshash.c +++ b/src/htshash.c @@ -43,6 +43,7 @@ Please visit our Website: http://www.httrack.com #include "htsglobal.h" #include "htsmd5.h" #include "htscore.h" +#include "htsinthash.h" /* END specific definitions */ /* Specific macros */ @@ -59,143 +60,76 @@ Please visit our Website: http://www.httrack.com // #define HTS_HASH_SIZE 8191 (premier si possible!) // type: numero enregistrement - 0 est case insensitive (sav) 1 (adr+fil) 2 (former_adr+former_fil) // recherche dans la table selon nom1,nom2 et le no d'enregistrement + +void hash_init(hash_struct * hash) { + hash->sav = inthash_new(0); + hash->adrfil = inthash_new(0); + hash->former_adrfil = inthash_new(0); +} + // retour: position ou -1 si non trouvé int hash_read(const hash_struct * hash, const char *nom1, const char *nom2, int type, int normalized) { char BIGSTK normfil_[HTS_URLMAXSIZE * 2]; char catbuff[CATBUFF_SIZE]; const char *normfil; - const char *normadr; - unsigned int cle; - int pos; - - // calculer la clé de recherche, non modulée - if (type) - cle = hash_cle(nom1, nom2); - else - cle = hash_cle(convtolower(catbuff, nom1), nom2); // case insensitive - // la position se calcule en modulant - pos = (int) (cle % HTS_HASH_SIZE); - // entrée trouvée? - if (hash->hash[type][pos] >= 0) { // un ou plusieurs enregistrement(s) avec une telle clé existe.. - // tester table de raccourcis (hash) - // pos est maintenant la position recherchée dans liens - pos = hash->hash[type][pos]; - while(pos >= 0) { // parcourir la chaine - switch (type) { - case 0: // sav - if (strfield2(nom1, hash->liens[pos]->sav)) { // case insensitive -#if DEBUG_HASH==2 - printf("hash: found shortcut at %d\n", pos); -#endif - return pos; - } - break; - case 1: // adr+fil - { - if (!normalized) - normfil = hash->liens[pos]->fil; - else - normfil = fil_normalized(hash->liens[pos]->fil, normfil_); - if (!normalized) - normadr = jump_identification(hash->liens[pos]->adr); - else - normadr = jump_normalized(hash->liens[pos]->adr); - if ((strfield2(nom1, normadr) != 0) && (strcmp(nom2, normfil) == 0)) { -#if DEBUG_HASH==2 - printf("hash: found shortcut at %d\n", pos); -#endif - return pos; - } - } - break; - case 2: // former_adr+former_fil - { - if (hash->liens[pos]->former_adr) { - if (!normalized) - normfil = hash->liens[pos]->former_fil; - else - normfil = fil_normalized(hash->liens[pos]->former_fil, normfil_); - if (!normalized) - normadr = jump_identification(hash->liens[pos]->former_adr); - else - normadr = jump_normalized(hash->liens[pos]->former_adr); - - if ((strfield2(nom1, normadr) != 0) && (strcmp(nom2, normfil) == 0)) { -#if DEBUG_HASH==2 - printf("hash: found shortcut at %d\n", pos); -#endif - return pos; - } - } - } - break; - } - // calculer prochaine position dans la chaine - { - int old = pos; - - pos = hash->liens[pos]->hash_next[type]; // sinon prochain dans la chaine - if (old == pos) - pos = -1; // erreur de bouclage (ne devrait pas arriver) - } + intptr_t intvalue; + char *name; + + /* dispatch type */ + switch(type) { + case 0: + /* first entry: destination filename (lowercased) */ + assertf(nom2 == NULL || *nom2 == '\0'); + name = convtolower(catbuff, nom1); + break; + case 1: + case 2: + /* second and third entries: URL address and path */ + if (!normalized) + normfil = nom1; + else + normfil = fil_normalized(nom1, normfil_); + if (!normalized) { + strcpybuff(catbuff, jump_identification(nom2)); + } else { + strcpybuff(catbuff, jump_normalized(nom2)); } + strcatbuff(catbuff, normfil); + name = catbuff; + break; + default: + assertf(! "unexpected case"); + break; + } - // Ok va falloir chercher alors.. - /*pos=hash->max_lien; // commencer à max_lien - switch (type) { - case 0: // sav - while(pos>=0) { - if (hash->liens[pos]->hash_sav == cle ) { - if (strcmp(nom1,hash->liens[pos]->sav)==0) { - hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table - #if DEBUG_HASH==2 - printf("hash: found long search at %d\n",pos); - #endif - return pos; - } - } - pos--; - } - break; - case 1: // adr+fil - while(pos>=0) { - if (hash->liens[pos]->hash_adrfil == cle ) { - if ((strcmp(nom1,hash->liens[pos]->adr)==0) && (strcmp(nom2,hash->liens[pos]->fil)==0)) { - hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table - #if DEBUG_HASH==2 - printf("hash: found long search at %d\n",pos); - #endif - return pos; - } - } - pos--; - } - break; - case 2: // former_adr+former_fil - while(pos>=0) { - if (hash->liens[pos]->hash_fadrfil == cle ) { - if (hash->liens[pos]->former_adr) - if ((strcmp(nom1,hash->liens[pos]->former_adr)==0) && (strcmp(nom2,hash->liens[pos]->former_fil)==0)) { - hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table - #if DEBUG_HASH==2 - printf("hash: found long search at %d\n",pos); - #endif - return pos; - } - } - pos--; - } - } */ -#if DEBUG_HASH==1 - printf("hash: not found after test %s%s\n", nom1, nom2); -#endif - return -1; // non trouvé - } else { -#if DEBUG_HASH==2 - printf("hash: not found %s%s\n", nom1, nom2); -#endif - return -1; // non trouvé : clé non entrée (même une fois) + /* read */ + switch(type) { + case 0: + if (inthash_read(hash->sav, name, &intvalue)) { + return (int) intvalue; + } else { + return -1; + } + break; + case 1: + if (inthash_read(hash->adrfil, name, &intvalue)) { + return (int) intvalue; + } else { + return -1; + } + break; + case 2: + if (inthash_read(hash->former_adrfil, name, &intvalue)) { + return (int) intvalue; + } else { + return -1; + } + break; + default: + assertf(! "unexpected case"); + return -1; + break; } } @@ -204,119 +138,37 @@ void hash_write(hash_struct * hash, int lpos, int normalized) { char BIGSTK normfil_[HTS_URLMAXSIZE * 2]; char catbuff[CATBUFF_SIZE]; const char *normfil; - unsigned int cle; - int pos; - int *ptr; + const char *name; - // - if (hash->liens[lpos]) { // on sait jamais.. - hash->max_lien = max(hash->max_lien, lpos); -#if DEBUG_HASH - hashnumber = hash->max_lien; -#endif - // élément actuel sur -1 (fin de chaine) - hash->liens[lpos]->hash_next[0] = hash->liens[lpos]->hash_next[1] = - hash->liens[lpos]->hash_next[2] = -1; - // - cle = hash_cle(convtolower(catbuff, hash->liens[lpos]->sav), ""); // CASE INSENSITIVE - pos = (int) (cle % HTS_HASH_SIZE); - ptr = hash_calc_chaine(hash, 0, pos); // calculer adresse chaine - *ptr = lpos; // noter dernier enregistré -#if DEBUG_HASH==3 - printf("[%d", pos); -#endif - // - if (!normalized) - normfil = hash->liens[lpos]->fil; - else - normfil = fil_normalized(hash->liens[lpos]->fil, normfil_); + /* first entry: destination filename (lowercased) */ + name = convtolower(catbuff, hash->liens[lpos]->sav); + inthash_write(hash->sav, name, lpos); + + /* second entry: URL address and path */ + if (!normalized) + normfil = hash->liens[lpos]->fil; + else + normfil = fil_normalized(hash->liens[lpos]->fil, normfil_); + if (!normalized) { + strcpybuff(catbuff, jump_identification(hash->liens[lpos]->adr)); + } else { + strcpybuff(catbuff, jump_normalized(hash->liens[lpos]->adr)); + } + strcatbuff(catbuff, normfil); + inthash_write(hash->adrfil, name, lpos); + + /* third entry: URL address and path before redirect */ + if (hash->liens[lpos]->former_adr) { // former_adr existe? + if (!normalized) { + normfil = hash->liens[lpos]->former_fil; + } else { + normfil = fil_normalized(hash->liens[lpos]->former_fil, normfil_); + } if (!normalized) - cle = hash_cle(jump_identification(hash->liens[lpos]->adr), normfil); + strcpybuff(catbuff, jump_identification(hash->liens[lpos]->former_adr)); else - cle = hash_cle(jump_normalized(hash->liens[lpos]->adr), normfil); - pos = (int) (cle % HTS_HASH_SIZE); - ptr = hash_calc_chaine(hash, 1, pos); // calculer adresse chaine - *ptr = lpos; // noter dernier enregistré -#if DEBUG_HASH==3 - printf(",%d", pos); -#endif - // - if (hash->liens[lpos]->former_adr) { // former_adr existe? - if (!normalized) - normfil = hash->liens[lpos]->former_fil; - else - normfil = fil_normalized(hash->liens[lpos]->former_fil, normfil_); - if (!normalized) - cle = - hash_cle(jump_identification(hash->liens[lpos]->former_adr), normfil); - else - cle = hash_cle(jump_normalized(hash->liens[lpos]->former_adr), normfil); - pos = (int) (cle % HTS_HASH_SIZE); - ptr = hash_calc_chaine(hash, 2, pos); // calculer adresse chaine - *ptr = lpos; // noter dernier enregistré -#if DEBUG_HASH==3 - printf(",%d", pos); -#endif - } -#if DEBUG_HASH==3 - printf("] "); - fflush(stdout); -#endif - } -#if DEBUT_HASH - else { - printf("* hash_write=0!!\n"); - abortLogFmt("unexpected error in hash_write (pos=%d)" _pos); - abort(); + strcpybuff(catbuff, jump_normalized(hash->liens[lpos]->former_adr)); + strcatbuff(catbuff, normfil); + inthash_write(hash->former_adrfil, name, lpos); } -#endif - // -} - -// calcul clé -// il n'y a pas de formule de hashage universelle, celle-ci semble acceptable.. -unsigned long int hash_cle(const char *nom1, const char *nom2) { - /* - unsigned int sum=0; - int i=0; - while(*nom1) { - sum += 1; - sum += (unsigned int) *(nom1); - sum *= (unsigned int) *(nom1++); - sum += (unsigned int) i; - i++; - } - while(*nom2) { - sum += 1; - sum += (unsigned int) *(nom2); - sum *= (unsigned int) *(nom2++); - sum += (unsigned int) i; - i++; - } - */ - return md5sum32(nom1) - + md5sum32(nom2); -} - -// calcul de la position finale dans la chaine des elements ayant la même clé -int *hash_calc_chaine(hash_struct * hash, int type, int pos) { -#if DEBUG_HASH - int count = 0; -#endif - if (hash->hash[type][pos] == -1) - return &(hash->hash[type][pos]); // premier élément dans la chaine - pos = hash->hash[type][pos]; - while(hash->liens[pos]->hash_next[type] != -1) { - pos = hash->liens[pos]->hash_next[type]; -#if DEBUG_HASH - count++; -#endif - } -#if DEBUG_HASH - count++; - longest_hash[type] = max(longest_hash[type], count); -#endif - return &(hash->liens[pos]->hash_next[type]); } - -// FIN GESTION DES TABLES DE HACHAGE diff --git a/src/htshash.h b/src/htshash.h index 36e0ae8..2b0d9bd 100644 --- a/src/htshash.h +++ b/src/htshash.h @@ -45,6 +45,7 @@ typedef struct hash_struct hash_struct; #endif // tables de hachage +void hash_init(hash_struct * hash); int hash_read(const hash_struct * hash, const char *nom1, const char *nom2, int type, int normalized); void hash_write(hash_struct * hash, int lpos, int normalized); diff --git a/src/htsopt.h b/src/htsopt.h index e76d8f2..a841b49 100644 --- a/src/htsopt.h +++ b/src/htsopt.h @@ -163,6 +163,12 @@ struct fspc_strc { int info; }; +/* lien_url */ +#ifndef HTS_DEF_FWSTRUCT_lien_url +#define HTS_DEF_FWSTRUCT_lien_url +typedef struct lien_url lien_url; +#endif + #ifndef HTS_DEF_DEFSTRUCT_hts_log_type #define HTS_DEF_DEFSTRUCT_hts_log_type typedef enum hts_log_type { @@ -355,6 +361,7 @@ struct httrackp { String urllist; // fichier liste de filtres à inclure htsfilters filters; // contient les pointeurs pour les filtres hash_struct *hash; // hash structure + lien_url **liens; // liens robots_wizard *robotsptr; // robots ptr String lang_iso; // en, fr .. String mimedefs; // ext1=mimetype1\next2=mimetype2.. diff --git a/src/htsserver.h b/src/htsserver.h index 85d0a9f..03eae71 100644 --- a/src/htsserver.h +++ b/src/htsserver.h @@ -36,6 +36,7 @@ Please visit our Website: http://www.httrack.com #ifndef HTS_SERVER_DEFH #define HTS_SERVER_DEFH +#include <sys/stat.h> #include "htsnet.h" /* String */ @@ -223,7 +224,7 @@ static int linput(FILE * fp, char *s, int max) { } static int linput_trim(FILE * fp, char *s, int max) { int rlen = 0; - char *ls = (char *) malloct(max + 2); + char *ls = (char *) malloc(max + 2); s[0] = '\0'; if (ls) { @@ -247,7 +248,7 @@ static int linput_trim(FILE * fp, char *s, int max) { } } // - freet(ls); + free(ls); } return rlen; } |