summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorXavier Roche <xroche@users.noreply.github.com>2013-06-23 19:40:47 +0000
committerXavier Roche <xroche@users.noreply.github.com>2013-06-23 19:40:47 +0000
commit0817df2fb67e3a2be95b311c4c2d42d2a1e1bc41 (patch)
tree25123b396b822a7c2219eb3456eef3dfd9a6462a
parentf42a5fa5dd4b5b8b91b66e7b47602be12aa298b2 (diff)
Trashed historical link heap hashtable, and replaced it by cleaner code using new cuckoo hashtables
I can not believe I kept such a terible and frightening code for such a long time, geez.
-rw-r--r--src/htsback.c4
-rw-r--r--src/htscore.c12
-rw-r--r--src/htscore.h11
-rw-r--r--src/htshash.c332
-rw-r--r--src/htshash.h1
-rw-r--r--src/htsopt.h7
-rw-r--r--src/htsserver.h5
7 files changed, 118 insertions, 254 deletions
diff --git a/src/htsback.c b/src/htsback.c
index 0644de1..e270d81 100644
--- a/src/htsback.c
+++ b/src/htsback.c
@@ -408,7 +408,7 @@ int back_done_incache(struct_back * sback) {
// stored (ready) slots
if (sback->ready != NULL) {
#ifndef HTS_NO_BACK_ON_DISK
- n += inthash_nitems(sback->ready);
+ n += (int) inthash_nitems(sback->ready);
#else
struct_inthash_enum e = inthash_enum_new(sback->ready);
inthash_chain *item;
@@ -2308,7 +2308,7 @@ void back_clean(httrackp * opt, cache_back * cache, struct_back * sback) {
int index = hash_read(opt->hash, back[i].url_sav, "", 0, opt->urlhack); // lecture type 0 (sav)
if (index >= 0) {
- opt->hash->liens[index]->pass2 = -1; /* DONE! */
+ opt->liens[index]->pass2 = -1; /* DONE! */
} else {
hts_log_print(opt, LOG_INFO,
"engine: warning: entry cleaned up, but no trace on heap: %s%s (%s)",
diff --git a/src/htscore.c b/src/htscore.c
index 9aaf285..5170148 100644
--- a/src/htscore.c
+++ b/src/htscore.c
@@ -449,15 +449,13 @@ int httpmirror(char *url1, httrackp * opt) {
// initialiser ptr et lien_tot
ptr = 0;
lien_tot = 0;
+
// initialiser hachage
- {
- int i;
+ hash_init(&hash);
+ hash.liens = liens;
- for(i = 0; i < HTS_HASH_SIZE; i++)
- hash.hash[0][i] = hash.hash[1][i] = hash.hash[2][i] = -1; // pas d'entrées
- hash.liens = liens;
- hash.max_lien = 0;
- }
+ // we need it
+ opt->liens = liens;
// copier adresse(s) dans liste des adresses
{
diff --git a/src/htscore.h b/src/htscore.h
index d718dfe..2c7a30b 100644
--- a/src/htscore.h
+++ b/src/htscore.h
@@ -261,9 +261,14 @@ struct cache_back {
typedef struct hash_struct hash_struct;
#endif
struct hash_struct {
- lien_url **liens; // pointeur sur liens
- int max_lien; // indice le plus grand rencontré
- int hash[3][HTS_HASH_SIZE]; // tables pour sav/adr-fil/former_adr-former_fil
+ /* Links big array reference */
+ const lien_url **liens;
+ /* Savename (case insensitive ; lowercased) */
+ inthash sav;
+ /* Address and path */
+ inthash adrfil;
+ /* Former address and path */
+ inthash former_adrfil;
};
#ifndef HTS_DEF_FWSTRUCT_filecreate_params
diff --git a/src/htshash.c b/src/htshash.c
index de673b5..475f3fc 100644
--- a/src/htshash.c
+++ b/src/htshash.c
@@ -43,6 +43,7 @@ Please visit our Website: http://www.httrack.com
#include "htsglobal.h"
#include "htsmd5.h"
#include "htscore.h"
+#include "htsinthash.h"
/* END specific definitions */
/* Specific macros */
@@ -59,143 +60,76 @@ Please visit our Website: http://www.httrack.com
// #define HTS_HASH_SIZE 8191 (premier si possible!)
// type: numero enregistrement - 0 est case insensitive (sav) 1 (adr+fil) 2 (former_adr+former_fil)
// recherche dans la table selon nom1,nom2 et le no d'enregistrement
+
+void hash_init(hash_struct * hash) {
+ hash->sav = inthash_new(0);
+ hash->adrfil = inthash_new(0);
+ hash->former_adrfil = inthash_new(0);
+}
+
// retour: position ou -1 si non trouvé
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
int type, int normalized) {
char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
char catbuff[CATBUFF_SIZE];
const char *normfil;
- const char *normadr;
- unsigned int cle;
- int pos;
-
- // calculer la clé de recherche, non modulée
- if (type)
- cle = hash_cle(nom1, nom2);
- else
- cle = hash_cle(convtolower(catbuff, nom1), nom2); // case insensitive
- // la position se calcule en modulant
- pos = (int) (cle % HTS_HASH_SIZE);
- // entrée trouvée?
- if (hash->hash[type][pos] >= 0) { // un ou plusieurs enregistrement(s) avec une telle clé existe..
- // tester table de raccourcis (hash)
- // pos est maintenant la position recherchée dans liens
- pos = hash->hash[type][pos];
- while(pos >= 0) { // parcourir la chaine
- switch (type) {
- case 0: // sav
- if (strfield2(nom1, hash->liens[pos]->sav)) { // case insensitive
-#if DEBUG_HASH==2
- printf("hash: found shortcut at %d\n", pos);
-#endif
- return pos;
- }
- break;
- case 1: // adr+fil
- {
- if (!normalized)
- normfil = hash->liens[pos]->fil;
- else
- normfil = fil_normalized(hash->liens[pos]->fil, normfil_);
- if (!normalized)
- normadr = jump_identification(hash->liens[pos]->adr);
- else
- normadr = jump_normalized(hash->liens[pos]->adr);
- if ((strfield2(nom1, normadr) != 0) && (strcmp(nom2, normfil) == 0)) {
-#if DEBUG_HASH==2
- printf("hash: found shortcut at %d\n", pos);
-#endif
- return pos;
- }
- }
- break;
- case 2: // former_adr+former_fil
- {
- if (hash->liens[pos]->former_adr) {
- if (!normalized)
- normfil = hash->liens[pos]->former_fil;
- else
- normfil = fil_normalized(hash->liens[pos]->former_fil, normfil_);
- if (!normalized)
- normadr = jump_identification(hash->liens[pos]->former_adr);
- else
- normadr = jump_normalized(hash->liens[pos]->former_adr);
-
- if ((strfield2(nom1, normadr) != 0) && (strcmp(nom2, normfil) == 0)) {
-#if DEBUG_HASH==2
- printf("hash: found shortcut at %d\n", pos);
-#endif
- return pos;
- }
- }
- }
- break;
- }
- // calculer prochaine position dans la chaine
- {
- int old = pos;
-
- pos = hash->liens[pos]->hash_next[type]; // sinon prochain dans la chaine
- if (old == pos)
- pos = -1; // erreur de bouclage (ne devrait pas arriver)
- }
+ intptr_t intvalue;
+ char *name;
+
+ /* dispatch type */
+ switch(type) {
+ case 0:
+ /* first entry: destination filename (lowercased) */
+ assertf(nom2 == NULL || *nom2 == '\0');
+ name = convtolower(catbuff, nom1);
+ break;
+ case 1:
+ case 2:
+ /* second and third entries: URL address and path */
+ if (!normalized)
+ normfil = nom1;
+ else
+ normfil = fil_normalized(nom1, normfil_);
+ if (!normalized) {
+ strcpybuff(catbuff, jump_identification(nom2));
+ } else {
+ strcpybuff(catbuff, jump_normalized(nom2));
}
+ strcatbuff(catbuff, normfil);
+ name = catbuff;
+ break;
+ default:
+ assertf(! "unexpected case");
+ break;
+ }
- // Ok va falloir chercher alors..
- /*pos=hash->max_lien; // commencer à max_lien
- switch (type) {
- case 0: // sav
- while(pos>=0) {
- if (hash->liens[pos]->hash_sav == cle ) {
- if (strcmp(nom1,hash->liens[pos]->sav)==0) {
- hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table
- #if DEBUG_HASH==2
- printf("hash: found long search at %d\n",pos);
- #endif
- return pos;
- }
- }
- pos--;
- }
- break;
- case 1: // adr+fil
- while(pos>=0) {
- if (hash->liens[pos]->hash_adrfil == cle ) {
- if ((strcmp(nom1,hash->liens[pos]->adr)==0) && (strcmp(nom2,hash->liens[pos]->fil)==0)) {
- hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table
- #if DEBUG_HASH==2
- printf("hash: found long search at %d\n",pos);
- #endif
- return pos;
- }
- }
- pos--;
- }
- break;
- case 2: // former_adr+former_fil
- while(pos>=0) {
- if (hash->liens[pos]->hash_fadrfil == cle ) {
- if (hash->liens[pos]->former_adr)
- if ((strcmp(nom1,hash->liens[pos]->former_adr)==0) && (strcmp(nom2,hash->liens[pos]->former_fil)==0)) {
- hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table
- #if DEBUG_HASH==2
- printf("hash: found long search at %d\n",pos);
- #endif
- return pos;
- }
- }
- pos--;
- }
- } */
-#if DEBUG_HASH==1
- printf("hash: not found after test %s%s\n", nom1, nom2);
-#endif
- return -1; // non trouvé
- } else {
-#if DEBUG_HASH==2
- printf("hash: not found %s%s\n", nom1, nom2);
-#endif
- return -1; // non trouvé : clé non entrée (même une fois)
+ /* read */
+ switch(type) {
+ case 0:
+ if (inthash_read(hash->sav, name, &intvalue)) {
+ return (int) intvalue;
+ } else {
+ return -1;
+ }
+ break;
+ case 1:
+ if (inthash_read(hash->adrfil, name, &intvalue)) {
+ return (int) intvalue;
+ } else {
+ return -1;
+ }
+ break;
+ case 2:
+ if (inthash_read(hash->former_adrfil, name, &intvalue)) {
+ return (int) intvalue;
+ } else {
+ return -1;
+ }
+ break;
+ default:
+ assertf(! "unexpected case");
+ return -1;
+ break;
}
}
@@ -204,119 +138,37 @@ void hash_write(hash_struct * hash, int lpos, int normalized) {
char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
char catbuff[CATBUFF_SIZE];
const char *normfil;
- unsigned int cle;
- int pos;
- int *ptr;
+ const char *name;
- //
- if (hash->liens[lpos]) { // on sait jamais..
- hash->max_lien = max(hash->max_lien, lpos);
-#if DEBUG_HASH
- hashnumber = hash->max_lien;
-#endif
- // élément actuel sur -1 (fin de chaine)
- hash->liens[lpos]->hash_next[0] = hash->liens[lpos]->hash_next[1] =
- hash->liens[lpos]->hash_next[2] = -1;
- //
- cle = hash_cle(convtolower(catbuff, hash->liens[lpos]->sav), ""); // CASE INSENSITIVE
- pos = (int) (cle % HTS_HASH_SIZE);
- ptr = hash_calc_chaine(hash, 0, pos); // calculer adresse chaine
- *ptr = lpos; // noter dernier enregistré
-#if DEBUG_HASH==3
- printf("[%d", pos);
-#endif
- //
- if (!normalized)
- normfil = hash->liens[lpos]->fil;
- else
- normfil = fil_normalized(hash->liens[lpos]->fil, normfil_);
+ /* first entry: destination filename (lowercased) */
+ name = convtolower(catbuff, hash->liens[lpos]->sav);
+ inthash_write(hash->sav, name, lpos);
+
+ /* second entry: URL address and path */
+ if (!normalized)
+ normfil = hash->liens[lpos]->fil;
+ else
+ normfil = fil_normalized(hash->liens[lpos]->fil, normfil_);
+ if (!normalized) {
+ strcpybuff(catbuff, jump_identification(hash->liens[lpos]->adr));
+ } else {
+ strcpybuff(catbuff, jump_normalized(hash->liens[lpos]->adr));
+ }
+ strcatbuff(catbuff, normfil);
+ inthash_write(hash->adrfil, name, lpos);
+
+ /* third entry: URL address and path before redirect */
+ if (hash->liens[lpos]->former_adr) { // former_adr existe?
+ if (!normalized) {
+ normfil = hash->liens[lpos]->former_fil;
+ } else {
+ normfil = fil_normalized(hash->liens[lpos]->former_fil, normfil_);
+ }
if (!normalized)
- cle = hash_cle(jump_identification(hash->liens[lpos]->adr), normfil);
+ strcpybuff(catbuff, jump_identification(hash->liens[lpos]->former_adr));
else
- cle = hash_cle(jump_normalized(hash->liens[lpos]->adr), normfil);
- pos = (int) (cle % HTS_HASH_SIZE);
- ptr = hash_calc_chaine(hash, 1, pos); // calculer adresse chaine
- *ptr = lpos; // noter dernier enregistré
-#if DEBUG_HASH==3
- printf(",%d", pos);
-#endif
- //
- if (hash->liens[lpos]->former_adr) { // former_adr existe?
- if (!normalized)
- normfil = hash->liens[lpos]->former_fil;
- else
- normfil = fil_normalized(hash->liens[lpos]->former_fil, normfil_);
- if (!normalized)
- cle =
- hash_cle(jump_identification(hash->liens[lpos]->former_adr), normfil);
- else
- cle = hash_cle(jump_normalized(hash->liens[lpos]->former_adr), normfil);
- pos = (int) (cle % HTS_HASH_SIZE);
- ptr = hash_calc_chaine(hash, 2, pos); // calculer adresse chaine
- *ptr = lpos; // noter dernier enregistré
-#if DEBUG_HASH==3
- printf(",%d", pos);
-#endif
- }
-#if DEBUG_HASH==3
- printf("] ");
- fflush(stdout);
-#endif
- }
-#if DEBUT_HASH
- else {
- printf("* hash_write=0!!\n");
- abortLogFmt("unexpected error in hash_write (pos=%d)" _pos);
- abort();
+ strcpybuff(catbuff, jump_normalized(hash->liens[lpos]->former_adr));
+ strcatbuff(catbuff, normfil);
+ inthash_write(hash->former_adrfil, name, lpos);
}
-#endif
- //
-}
-
-// calcul clé
-// il n'y a pas de formule de hashage universelle, celle-ci semble acceptable..
-unsigned long int hash_cle(const char *nom1, const char *nom2) {
- /*
- unsigned int sum=0;
- int i=0;
- while(*nom1) {
- sum += 1;
- sum += (unsigned int) *(nom1);
- sum *= (unsigned int) *(nom1++);
- sum += (unsigned int) i;
- i++;
- }
- while(*nom2) {
- sum += 1;
- sum += (unsigned int) *(nom2);
- sum *= (unsigned int) *(nom2++);
- sum += (unsigned int) i;
- i++;
- }
- */
- return md5sum32(nom1)
- + md5sum32(nom2);
-}
-
-// calcul de la position finale dans la chaine des elements ayant la même clé
-int *hash_calc_chaine(hash_struct * hash, int type, int pos) {
-#if DEBUG_HASH
- int count = 0;
-#endif
- if (hash->hash[type][pos] == -1)
- return &(hash->hash[type][pos]); // premier élément dans la chaine
- pos = hash->hash[type][pos];
- while(hash->liens[pos]->hash_next[type] != -1) {
- pos = hash->liens[pos]->hash_next[type];
-#if DEBUG_HASH
- count++;
-#endif
- }
-#if DEBUG_HASH
- count++;
- longest_hash[type] = max(longest_hash[type], count);
-#endif
- return &(hash->liens[pos]->hash_next[type]);
}
-
-// FIN GESTION DES TABLES DE HACHAGE
diff --git a/src/htshash.h b/src/htshash.h
index 36e0ae8..2b0d9bd 100644
--- a/src/htshash.h
+++ b/src/htshash.h
@@ -45,6 +45,7 @@ typedef struct hash_struct hash_struct;
#endif
// tables de hachage
+void hash_init(hash_struct * hash);
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
int type, int normalized);
void hash_write(hash_struct * hash, int lpos, int normalized);
diff --git a/src/htsopt.h b/src/htsopt.h
index e76d8f2..a841b49 100644
--- a/src/htsopt.h
+++ b/src/htsopt.h
@@ -163,6 +163,12 @@ struct fspc_strc {
int info;
};
+/* lien_url */
+#ifndef HTS_DEF_FWSTRUCT_lien_url
+#define HTS_DEF_FWSTRUCT_lien_url
+typedef struct lien_url lien_url;
+#endif
+
#ifndef HTS_DEF_DEFSTRUCT_hts_log_type
#define HTS_DEF_DEFSTRUCT_hts_log_type
typedef enum hts_log_type {
@@ -355,6 +361,7 @@ struct httrackp {
String urllist; // fichier liste de filtres à inclure
htsfilters filters; // contient les pointeurs pour les filtres
hash_struct *hash; // hash structure
+ lien_url **liens; // liens
robots_wizard *robotsptr; // robots ptr
String lang_iso; // en, fr ..
String mimedefs; // ext1=mimetype1\next2=mimetype2..
diff --git a/src/htsserver.h b/src/htsserver.h
index 85d0a9f..03eae71 100644
--- a/src/htsserver.h
+++ b/src/htsserver.h
@@ -36,6 +36,7 @@ Please visit our Website: http://www.httrack.com
#ifndef HTS_SERVER_DEFH
#define HTS_SERVER_DEFH
+#include <sys/stat.h>
#include "htsnet.h"
/* String */
@@ -223,7 +224,7 @@ static int linput(FILE * fp, char *s, int max) {
}
static int linput_trim(FILE * fp, char *s, int max) {
int rlen = 0;
- char *ls = (char *) malloct(max + 2);
+ char *ls = (char *) malloc(max + 2);
s[0] = '\0';
if (ls) {
@@ -247,7 +248,7 @@ static int linput_trim(FILE * fp, char *s, int max) {
}
}
//
- freet(ls);
+ free(ls);
}
return rlen;
}