summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorXavier Roche <xroche@users.noreply.github.com>2013-08-12 13:42:55 +0000
committerXavier Roche <xroche@users.noreply.github.com>2013-08-12 13:42:55 +0000
commit4e903579b22f3698934294370f076119576041ab (patch)
treeb2007c09c0994a8a3c2d1aba4c8c04ae540daf9a /src
parente77141ceaefb55f427edcfb777f5f1a96b885583 (diff)
Big cleanup in core heap hashtable code, rewritten using new fancy hashtables.
Diffstat (limited to 'src')
-rw-r--r--src/htsback.c4
-rw-r--r--src/htscore.c10
-rw-r--r--src/htscore.h5
-rw-r--r--src/htshash.c206
-rw-r--r--src/htshash.h6
-rw-r--r--src/htsname.c8
-rw-r--r--src/htsparse.c10
-rw-r--r--src/htstools.c6
-rw-r--r--src/htswizard.c2
9 files changed, 180 insertions, 77 deletions
diff --git a/src/htsback.c b/src/htsback.c
index 4de49dc..18ea8ac 100644
--- a/src/htsback.c
+++ b/src/htsback.c
@@ -2308,7 +2308,7 @@ static int slot_can_be_finalized(httrackp * opt, const lien_back * back) {
&& !may_be_hypertext_mime(opt, back->r.contenttype, back->url_fil) // may NOT be parseable mime type
/* Has not been added before the heap saw the link, or now exists on heap */
&& (!back->early_add
- || hash_read(opt->hash, back->url_sav, "", 0, opt->urlhack) >= 0);
+ || hash_read(opt->hash, back->url_sav, NULL, HASH_STRUCT_FILENAME) >= 0);
}
void back_clean(httrackp * opt, cache_back * cache, struct_back * sback) {
@@ -2330,7 +2330,7 @@ void back_clean(httrackp * opt, cache_back * cache, struct_back * sback) {
//}
/* MANDATORY if we don't want back_fill() to endlessly put the same file on download! */
{
- int index = hash_read(opt->hash, back[i].url_sav, "", 0, opt->urlhack); // lecture type 0 (sav)
+ int index = hash_read(opt->hash, back[i].url_sav, NULL, HASH_STRUCT_FILENAME ); // lecture type 0 (sav)
if (index >= 0) {
opt->liens[index]->pass2 = -1; /* DONE! */
diff --git a/src/htscore.c b/src/htscore.c
index fd4e9ff..8c86914 100644
--- a/src/htscore.c
+++ b/src/htscore.c
@@ -229,7 +229,7 @@ strcpybuff(liens[lien_tot]->adr,A); \
strcpybuff(liens[lien_tot]->fil,F); \
strcpybuff(liens[lien_tot]->sav,S); \
liens_record_sav_len(liens[lien_tot]); \
-hash_write(hashptr,lien_tot,NORM); \
+hash_write(hashptr,lien_tot); \
} \
}
@@ -449,7 +449,7 @@ int httpmirror(char *url1, httrackp * opt) {
lien_tot = 0;
// initialiser hachage
- hash_init(&hash);
+ hash_init(&hash, opt->urlhack);
// note: we need a cast because of the const
hash.liens = (const lien_url **) liens;
@@ -3224,12 +3224,12 @@ int back_fill(struct_back * sback, httrackp * opt, cache_back * cache,
// Why in hell did I do that ?
//if (ok && liens[p]->sav != NULL && liens[p]->sav[0] != '\0'
- // && hash_read(opt->hash,liens[p]->sav,"",0,opt->urlhack) >= 0) // lookup in liens_record
+ // && hash_read(opt->hash,liens[p]->sav,NULL,HASH_STRUCT_FILENAME ) >= 0) // lookup in liens_record
//{
// ok = 0;
//}
if (liens[p]->sav == NULL || liens[p]->sav[0] == '\0'
- || hash_read(opt->hash, liens[p]->sav, "", 0, opt->urlhack) < 0) {
+ || hash_read(opt->hash, liens[p]->sav, NULL, HASH_STRUCT_FILENAME ) < 0) {
ok = 0;
}
// note: si un backing est fini, il reste en mémoire jusqu'à ce que
@@ -3766,7 +3766,7 @@ int htsAddLink(htsmoduleStruct * str, char *link) {
//
// On part de la fin et on essaye de se presser (économise temps machine)
{
- int i = hash_read(hashptr, save, "", 0, opt->urlhack); // lecture type 0 (sav)
+ int i = hash_read(hashptr, save, NULL, HASH_STRUCT_FILENAME ); // lecture type 0 (sav)
if (i >= 0) {
liens[i]->depth = maximum(liens[i]->depth, prio_fix);
diff --git a/src/htscore.h b/src/htscore.h
index f884b7d..a8587a4 100644
--- a/src/htscore.h
+++ b/src/htscore.h
@@ -267,6 +267,11 @@ struct hash_struct {
inthash adrfil;
/* Former address and path */
inthash former_adrfil;
+ /** Buffers **/
+ int normalized;
+ char normfil[HTS_URLMAXSIZE * 2];
+ char normfil2[HTS_URLMAXSIZE * 2];
+ char catbuff[CATBUFF_SIZE];
};
#ifndef HTS_DEF_FWSTRUCT_filecreate_params
diff --git a/src/htshash.c b/src/htshash.c
index daf987c..d740de7 100644
--- a/src/htshash.c
+++ b/src/htshash.c
@@ -60,10 +60,143 @@ Please visit our Website: http://www.httrack.com
// type: numero enregistrement - 0 est case insensitive (sav) 1 (adr+fil) 2 (former_adr+former_fil)
// recherche dans la table selon nom1,nom2 et le no d'enregistrement
-void hash_init(hash_struct * hash) {
+/* Key free handler (NOOP) ; addresses are kept */
+/* NOTE(review): nothing is freed because keys point into externally
+   owned storage (presumably the liens[] heap) — confirm with callers. */
+static void key_freehandler(void *arg, void *value) {
+}
+
+/* Key strdup (pointer copy) */
+/* The union "launders" away the const qualifier without an explicit
+   cast ; no copy is made, so the key must outlive its hashtable entry. */
+static char* key_duphandler(void *arg, const char *name) {
+  union {
+    const char *roname;
+    char *name;
+  } u;
+  u.roname = name;
+  return u.name;
+}
+
+/* Key sav hashes are using case-insensitive version */
+/* Lowercases into the shared hash->catbuff scratch buffer before
+   hashing ; NOTE(review): shared buffer means this is not reentrant —
+   confirm single-threaded use. */
+static inthash_keys key_sav_hashes(void *arg, const char *value) {
+  hash_struct *const hash = (hash_struct*) arg;
+  convtolower(hash->catbuff, value);
+  return inthash_hash_value(hash->catbuff);
+}
+
+/* Key sav comparison is case-insensitive */
+static int key_sav_equals(void *arg, const char *a, const char *b) {
+  return strcasecmp(a, b) == 0;
+}
+
+/* Pseudo-key (lien_url structure) hash function.
+   The "key" is really a lien_url* casted to char* ; the hash is
+   computed over "(normalized) address" + "(normalized) path" built in
+   the shared hash->normfil scratch buffer.
+   FIX: the non-normalized branch read the uninitialized index 'i'
+   in strcpy(&hash->normfil[i], fil) — undefined behavior and a
+   potential out-of-bounds write ; compute the append offset
+   explicitly from the address length instead. */
+static inthash_keys key_adrfil_hashes_generic(void *arg, const char *value_,
+                                              const int former) {
+  hash_struct *const hash = (hash_struct*) arg;
+  const lien_url*const lien = (lien_url*) value_;
+  size_t len;
+  const char *const adr = !former ? lien->adr : lien->former_adr;
+  const char *const fil = !former ? lien->fil : lien->former_fil;
+  const char *const adr_norm = adr != NULL ?
+    ( hash->normalized ? jump_normalized(adr) : jump_identification(adr) )
+    : NULL;
+
+  // copy address
+  assertf(adr_norm != NULL);
+  strcpy(hash->normfil, adr_norm);
+  len = strlen(hash->normfil);
+
+  // append path, normalized if requested
+  assertf(fil != NULL);
+  if (hash->normalized) {
+    fil_normalized(fil, &hash->normfil[len]);
+  } else {
+    strcpy(&hash->normfil[len], fil);
+  }
+
+  // hash the combined "address + path" string
+  return inthash_hash_value(hash->normfil);
+}
+
+/* Pseudo-key (lien_url structure) comparison function.
+   Compares two lien_url pseudo-keys: first the host part
+   (case-insensitively, skipping scheme/credentials), then the path
+   (normalized or exact depending on hash->normalized).
+   NOTE(review): uses the shared hash->normfil/normfil2 scratch
+   buffers, so it is not reentrant — confirm single-threaded use. */
+static int key_adrfil_equals_generic(void *arg, const char *a_, const char *b_,
+                                     const int former) {
+  hash_struct *const hash = (hash_struct*) arg;
+  const int normalized = hash->normalized;
+  const lien_url*const a = (lien_url*) a_;
+  const lien_url*const b = (lien_url*) b_;
+  const char *const a_adr = !former ? a->adr : a->former_adr;
+  const char *const b_adr = !former ? b->adr : b->former_adr;
+  const char *const a_fil = !former ? a->fil : a->former_fil;
+  const char *const b_fil = !former ? b->fil : b->former_fil;
+  const char *ja;
+  const char *jb;
+
+  // safety
+  assertf(a_adr != NULL);
+  assertf(b_adr != NULL);
+
+  // skip scheme and authentication to the domain (possibly without www.)
+  ja = normalized ? jump_normalized(a_adr) : jump_identification(a_adr);
+  jb = normalized ? jump_normalized(b_adr) : jump_identification(b_adr);
+  if (strcasecmp(ja, jb) != 0) {
+    return 0;
+  }
+
+  // now compare pathes
+  if (normalized) {
+    fil_normalized(a_fil, hash->normfil);
+    fil_normalized(b_fil, hash->normfil2);
+    return strcmp(hash->normfil, hash->normfil2) == 0;
+  } else {
+    return strcmp(a_fil, b_fil) == 0;
+  }
+}
+
+/* "adr"/"fil" lien_url structure members hashing function */
+static inthash_keys key_adrfil_hashes(void *arg, const char *value_) {
+ return key_adrfil_hashes_generic(arg, value_, 0);
+}
+
+/* "adr"/"fil" lien_url structure members comparison function */
+static int key_adrfil_equals(void *arg, const char *a, const char *b) {
+ return key_adrfil_equals_generic(arg, a, b, 0);
+}
+
+/* "former_adr"/"former_fil" lien_url structure members hashing function */
+static inthash_keys key_former_adrfil_hashes(void *arg, const char *value_) {
+ return key_adrfil_hashes_generic(arg, value_, 1);
+}
+
+/* "former_adr"/"former_fil" lien_url structure members comparison function */
+static int key_former_adrfil_equals(void *arg, const char *a, const char *b) {
+ return key_adrfil_equals_generic(arg, a, b, 1);
+}
+
+/* Initialize the three hashtables (sav filenames, adr+fil,
+   former_adr+former_fil). 'normalized' selects URL-normalized
+   hashing/comparison (comes from opt->urlhack at the call site).
+   All key handlers store keys BY POINTER (key_duphandler copies no
+   bytes, key_freehandler frees nothing), so keys must outlive the
+   tables. */
+void hash_init(hash_struct * hash, int normalized) {
 hash->sav = inthash_new(0);
 hash->adrfil = inthash_new(0);
 hash->former_adrfil = inthash_new(0);
+  hash->normalized = normalized;
+
+  /* Case-insensitive comparison ; keys are direct char* filenames */
+  inthash_value_set_key_handler(hash->sav,
+                                key_duphandler,
+                                key_freehandler,
+                                key_sav_hashes,
+                                key_sav_equals,
+                                hash);
+
+  /* URL-style comparison ; keys are lien_url structure pointers casted
+     to char* */
+  inthash_value_set_key_handler(hash->adrfil,
+                                key_duphandler,
+                                key_freehandler,
+                                key_adrfil_hashes,
+                                key_adrfil_equals,
+                                hash);
+  inthash_value_set_key_handler(hash->former_adrfil,
+                                key_duphandler,
+                                key_freehandler,
+                                key_former_adrfil_hashes,
+                                key_former_adrfil_equals,
+                                hash);
 }
void hash_free(hash_struct *hash) {
@@ -74,66 +207,36 @@ void hash_free(hash_struct *hash) {
}
}
-static char * normalize_key(const char *nom1, const char *nom2,
- hash_struct_type type, int normalized,
- char *normfil_, char *catbuff) {
- /* dispatch type */
- const char *normfil;
- switch(type) {
- case HASH_STRUCT_FILENAME:
- /* first entry: destination filename (lowercased) */
- assertf(nom2 == NULL || *nom2 == '\0');
- return convtolower(catbuff, nom1);
- break;
- case HASH_STRUCT_ADR_PATH:
- case HASH_STRUCT_ORIGINAL_ADR_PATH:
- /* second and third entries: URL address and path */
- if (!normalized)
- normfil = nom2;
- else
- normfil = fil_normalized(nom2, normfil_);
- if (!normalized) {
- strcpybuff(catbuff, jump_identification(nom1));
- } else {
- strcpybuff(catbuff, jump_normalized(nom1));
- }
- strcatbuff(catbuff, normfil);
- return catbuff;
- break;
- default:
- assertf(! "unexpected case");
- return NULL;
- break;
- }
-}
-
// retour: position ou -1 si non trouvé
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
- hash_struct_type type, int normalized) {
- char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
- char catbuff[CATBUFF_SIZE];
+ hash_struct_type type) {
intptr_t intvalue;
- char *const name = normalize_key(nom1, nom2, type, normalized,
- normfil_, catbuff);
+ lien_url lien;
/* read */
switch(type) {
case HASH_STRUCT_FILENAME:
- if (inthash_read(hash->sav, name, &intvalue)) {
+ if (inthash_read(hash->sav, nom1, &intvalue)) {
return (int) intvalue;
} else {
return -1;
}
break;
case HASH_STRUCT_ADR_PATH:
- if (inthash_read(hash->adrfil, name, &intvalue)) {
+ memset(&lien, 0, sizeof(lien));
+ lien.adr = key_duphandler(NULL, nom1);
+ lien.fil = key_duphandler(NULL, nom2);
+ if (inthash_read(hash->adrfil, (char*) &lien, &intvalue)) {
return (int) intvalue;
} else {
return -1;
}
break;
case HASH_STRUCT_ORIGINAL_ADR_PATH:
- if (inthash_read(hash->former_adrfil, name, &intvalue)) {
+ memset(&lien, 0, sizeof(lien));
+ lien.former_adr = key_duphandler(NULL, nom1);
+ lien.former_fil = key_duphandler(NULL, nom2);
+ if (inthash_read(hash->former_adrfil, (char*) &lien, &intvalue)) {
return (int) intvalue;
} else {
return -1;
@@ -147,26 +250,15 @@ int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
}
// enregistrement lien lpos dans les 3 tables hash1..3
-void hash_write(hash_struct * hash, int lpos, int normalized) {
- char BIGSTK normfil_[HTS_URLMAXSIZE * 2];
- char catbuff[CATBUFF_SIZE];
- const char *name;
-
+void hash_write(hash_struct * hash, int lpos) {
/* first entry: destination filename (lowercased) */
- name = normalize_key(hash->liens[lpos]->sav, NULL, HASH_STRUCT_FILENAME,
- normalized, normfil_, catbuff);
- inthash_write(hash->sav, name, lpos);
+ inthash_write(hash->sav, hash->liens[lpos]->sav, lpos);
/* second entry: URL address and path */
- name = normalize_key(hash->liens[lpos]->adr, hash->liens[lpos]->fil,
- HASH_STRUCT_ADR_PATH, normalized, normfil_, catbuff);
- inthash_write(hash->adrfil, name, lpos);
+ inthash_write(hash->adrfil, (char*) hash->liens[lpos], lpos);
/* third entry: URL address and path before redirect */
if (hash->liens[lpos]->former_adr) { // former_adr existe?
- name = normalize_key(hash->liens[lpos]->former_adr,
- hash->liens[lpos]->former_fil, HASH_STRUCT_ORIGINAL_ADR_PATH,
- normalized, normfil_, catbuff);
- inthash_write(hash->former_adrfil, name, lpos);
+ inthash_write(hash->former_adrfil, (char*) hash->liens[lpos], lpos);
}
}
diff --git a/src/htshash.h b/src/htshash.h
index 97b35ab..454556f 100644
--- a/src/htshash.h
+++ b/src/htshash.h
@@ -52,11 +52,11 @@ typedef enum hash_struct_type {
} hash_struct_type;
// tables de hachage
-void hash_init(hash_struct *hash);
+void hash_init(hash_struct *hash, int normalized);
void hash_free(hash_struct *hash);
int hash_read(const hash_struct * hash, const char *nom1, const char *nom2,
- hash_struct_type type, int normalized);
-void hash_write(hash_struct * hash, int lpos, int normalized);
+ hash_struct_type type);
+void hash_write(hash_struct * hash, int lpos);
int *hash_calc_chaine(hash_struct * hash, hash_struct_type type, int pos);
unsigned long int hash_cle(const char *nom1, const char *nom2);
#endif
diff --git a/src/htsname.c b/src/htsname.c
index 4719ec5..4e3b763 100644
--- a/src/htsname.c
+++ b/src/htsname.c
@@ -267,12 +267,12 @@ int url_savename(char *adr_complete, char *fil_complete, char *save,
if (liens != NULL) {
int i;
- i = hash_read(hash, normadr, normfil, 1, opt->urlhack); // recherche table 1 (adr+fil)
+ i = hash_read(hash, normadr, normfil, HASH_STRUCT_ADR_PATH); // recherche table 1 (adr+fil)
if (i >= 0) { // ok, trouvé
strcpybuff(save, liens[i]->sav);
return 0;
}
- i = hash_read(hash, normadr, normfil, 2, opt->urlhack); // recherche table 2 (former_adr+former_fil)
+ i = hash_read(hash, normadr, normfil, HASH_STRUCT_ORIGINAL_ADR_PATH); // recherche table 2 (former_adr+former_fil)
if (i >= 0) { // ok, trouvé
// copier location moved!
strcpybuff(adr_complete, liens[i]->adr);
@@ -291,7 +291,7 @@ int url_savename(char *adr_complete, char *fil_complete, char *save,
fil_complete_patche[strlen(fil_complete_patche) - 1] = '\0';
else
strcatbuff(fil_complete_patche, "/");
- i = hash_read(hash, normadr, fil_complete_patche, 2, opt->urlhack); // recherche table 2 (former_adr+former_fil)
+ i = hash_read(hash, normadr, fil_complete_patche, HASH_STRUCT_ORIGINAL_ADR_PATH); // recherche table 2 (former_adr+former_fil)
if (i >= 0) {
// écraser fil et adr (pas former_fil?????)
strcpybuff(adr_complete, liens[i]->adr);
@@ -1555,7 +1555,7 @@ int url_savename(char *adr_complete, char *fil_complete, char *save,
printf("\nStart search\n");
#endif
- i = hash_read(hash, save, "", 0, 0); // lecture type 0 (sav)
+ i = hash_read(hash, save, NULL, HASH_STRUCT_FILENAME); // lecture type 0 (sav)
if (i >= 0) {
int sameAdr = (strfield2(liens[i]->adr, normadr) != 0);
int sameFil;
diff --git a/src/htsparse.c b/src/htsparse.c
index 02e4e3a..7648d07 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -295,7 +295,7 @@ Please visit our Website: http://www.httrack.com
strcpybuff(liens[lien_tot]->fil,F); \
strcpybuff(liens[lien_tot]->sav,S); \
liens_record_sav_len(liens[lien_tot]); \
- hash_write(hashptr,lien_tot,opt->urlhack); \
+ hash_write(hashptr,lien_tot); \
} \
}
@@ -3196,7 +3196,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
//
// On part de la fin et on essaye de se presser (économise temps machine)
{
- int i = hash_read(hash, save, "", 0, opt->urlhack); // lecture type 0 (sav)
+ int i = hash_read(hash, save, NULL, HASH_STRUCT_FILENAME); // lecture type 0 (sav)
if (i >= 0) {
if ((opt->debug > 1) && (opt->log != NULL)) {
@@ -3636,7 +3636,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str,
liens[liens[ptr]->precedent]->adr,
liens[liens[ptr]->precedent]->fil, opt, liens, lien_tot,
sback, cache, hash, ptr, numero_passe, NULL) != -1) {
- if (hash_read(hash, mov_sav, "", 0, 0) < 0) { // n'existe pas déja
+ if (hash_read(hash, mov_sav, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
// enregistrer lien (MACRO) avec SAV IDENTIQUE
liens_record(mov_adr, mov_fil, liens[ptr]->sav, "", "");
//liens_record(mov_adr,mov_fil,mov_sav,"","");
@@ -4074,7 +4074,7 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str,
if (url_savename
(add_adr, add_fil, add_sav, NULL, NULL, NULL, NULL, opt, liens,
lien_tot, sback, cache, hash, ptr, numero_passe, NULL) != -1) {
- if (hash_read(hash, add_sav, "", 0, 0) < 0) { // n'existe pas déja
+ if (hash_read(hash, add_sav, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja
// enregistrer lien (MACRO)
liens_record(add_adr, add_fil, add_sav, "", "");
if (liens[lien_tot] != NULL) { // OK, pas d'erreur
@@ -4624,7 +4624,7 @@ int hts_wait_delayed(htsmoduleStruct * str, char *adr, char *fil, char *save,
}
/* Check if the file was recorded already (necessary for redirects) */
- if (hash_read(hash, save, "", 0, opt->urlhack) >= 0) {
+ if (hash_read(hash, save, NULL, HASH_STRUCT_FILENAME) >= 0) {
if (loops == 0) { /* Should not happend */
hts_log_print(opt, LOG_ERROR,
"Duplicate entry in hts_wait_delayed() cancelled: %s%s -> %s",
diff --git a/src/htstools.c b/src/htstools.c
index 255a1a2..7c543f6 100644
--- a/src/htstools.c
+++ b/src/htstools.c
@@ -187,6 +187,12 @@ int ident_url_relatif(const char *lien, const char *origin_adr,
// On forme l'URL complète à partie de l'url actuelle
// et du chemin actuel si besoin est.
+ // sanity check
+ if (origin_adr == NULL || origin_fil == NULL
+ || *origin_adr == '\0' || *origin_fil == '\0') {
+ return -1;
+ }
+
// copier adresse
if (((int) strlen(origin_adr) < HTS_URLMAXSIZE)
&& ((int) strlen(origin_fil) < HTS_URLMAXSIZE)
diff --git a/src/htswizard.c b/src/htswizard.c
index b4ceaeb..03605a7 100644
--- a/src/htswizard.c
+++ b/src/htswizard.c
@@ -154,7 +154,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr, int lien_tot,
/* Already exists? Then, we know that we knew that this link had to be known */
if (adr[0] != '\0' && fil[0] != '\0' && opt->hash != NULL
- && hash_read(opt->hash, adr, fil, 1, opt->urlhack) >= 0) {
+ && hash_read(opt->hash, adr, fil, HASH_STRUCT_ADR_PATH) >= 0) {
return 0; /* Yokai */
}
// -------------------- PRELUDE OF PHASE 3-BIS --------------------