diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/htsback.c | 4 | ||||
-rw-r--r-- | src/htscore.c | 10 | ||||
-rw-r--r-- | src/htscore.h | 5 | ||||
-rw-r--r-- | src/htshash.c | 206 | ||||
-rw-r--r-- | src/htshash.h | 6 | ||||
-rw-r--r-- | src/htsname.c | 8 | ||||
-rw-r--r-- | src/htsparse.c | 10 | ||||
-rw-r--r-- | src/htstools.c | 6 | ||||
-rw-r--r-- | src/htswizard.c | 2 |
9 files changed, 180 insertions, 77 deletions
diff --git a/src/htsback.c b/src/htsback.c index 4de49dc..18ea8ac 100644 --- a/src/htsback.c +++ b/src/htsback.c @@ -2308,7 +2308,7 @@ static int slot_can_be_finalized(httrackp * opt, const lien_back * back) { && !may_be_hypertext_mime(opt, back->r.contenttype, back->url_fil) // may NOT be parseable mime type /* Has not been added before the heap saw the link, or now exists on heap */ && (!back->early_add - || hash_read(opt->hash, back->url_sav, "", 0, opt->urlhack) >= 0); + || hash_read(opt->hash, back->url_sav, NULL, HASH_STRUCT_FILENAME) >= 0); } void back_clean(httrackp * opt, cache_back * cache, struct_back * sback) { @@ -2330,7 +2330,7 @@ void back_clean(httrackp * opt, cache_back * cache, struct_back * sback) { //} /* MANDATORY if we don't want back_fill() to endlessly put the same file on download! */ { - int index = hash_read(opt->hash, back[i].url_sav, "", 0, opt->urlhack); // lecture type 0 (sav) + int index = hash_read(opt->hash, back[i].url_sav, NULL, HASH_STRUCT_FILENAME ); // lecture type 0 (sav) if (index >= 0) { opt->liens[index]->pass2 = -1; /* DONE! */ diff --git a/src/htscore.c b/src/htscore.c index fd4e9ff..8c86914 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -229,7 +229,7 @@ strcpybuff(liens[lien_tot]->adr,A); \ strcpybuff(liens[lien_tot]->fil,F); \ strcpybuff(liens[lien_tot]->sav,S); \ liens_record_sav_len(liens[lien_tot]); \ -hash_write(hashptr,lien_tot,NORM); \ +hash_write(hashptr,lien_tot); \ } \ } @@ -449,7 +449,7 @@ int httpmirror(char *url1, httrackp * opt) { lien_tot = 0; // initialiser hachage - hash_init(&hash); + hash_init(&hash, opt->urlhack); // note: we need a cast because of the const hash.liens = (const lien_url **) liens; @@ -3224,12 +3224,12 @@ int back_fill(struct_back * sback, httrackp * opt, cache_back * cache, // Why in hell did I do that ? 
//if (ok && liens[p]->sav != NULL && liens[p]->sav[0] != '\0' - // && hash_read(opt->hash,liens[p]->sav,"",0,opt->urlhack) >= 0) // lookup in liens_record + // && hash_read(opt->hash,liens[p]->sav,NULL,HASH_STRUCT_FILENAME ) >= 0) // lookup in liens_record //{ // ok = 0; //} if (liens[p]->sav == NULL || liens[p]->sav[0] == '\0' - || hash_read(opt->hash, liens[p]->sav, "", 0, opt->urlhack) < 0) { + || hash_read(opt->hash, liens[p]->sav, NULL, HASH_STRUCT_FILENAME ) < 0) { ok = 0; } // note: si un backing est fini, il reste en mémoire jusqu'à ce que @@ -3766,7 +3766,7 @@ int htsAddLink(htsmoduleStruct * str, char *link) { // // On part de la fin et on essaye de se presser (économise temps machine) { - int i = hash_read(hashptr, save, "", 0, opt->urlhack); // lecture type 0 (sav) + int i = hash_read(hashptr, save, NULL, HASH_STRUCT_FILENAME ); // lecture type 0 (sav) if (i >= 0) { liens[i]->depth = maximum(liens[i]->depth, prio_fix); diff --git a/src/htscore.h b/src/htscore.h index f884b7d..a8587a4 100644 --- a/src/htscore.h +++ b/src/htscore.h @@ -267,6 +267,11 @@ struct hash_struct { inthash adrfil; /* Former address and path */ inthash former_adrfil; + /** Buffers **/ + int normalized; + char normfil[HTS_URLMAXSIZE * 2]; + char normfil2[HTS_URLMAXSIZE * 2]; + char catbuff[CATBUFF_SIZE]; }; #ifndef HTS_DEF_FWSTRUCT_filecreate_params diff --git a/src/htshash.c b/src/htshash.c index daf987c..d740de7 100644 --- a/src/htshash.c +++ b/src/htshash.c @@ -60,10 +60,143 @@ Please visit our Website: http://www.httrack.com // type: numero enregistrement - 0 est case insensitive (sav) 1 (adr+fil) 2 (former_adr+former_fil) // recherche dans la table selon nom1,nom2 et le no d'enregistrement -void hash_init(hash_struct * hash) { +/* Key free handler (NOOP) ; addresses are kept */ +static void key_freehandler(void *arg, void *value) { +} + +/* Key strdup (pointer copy) */ +static char* key_duphandler(void *arg, const char *name) { + union { + const char *roname; + char *name; + } 
u; + u.roname = name; + return u.name; +} + +/* Key sav hashes are using case-insensitive version */ +static inthash_keys key_sav_hashes(void *arg, const char *value) { + hash_struct *const hash = (hash_struct*) arg; + convtolower(hash->catbuff, value); + return inthash_hash_value(hash->catbuff); +} + +/* Key sav comparison is case-insensitive */ +static int key_sav_equals(void *arg, const char *a, const char *b) { + return strcasecmp(a, b) == 0; +} + +/* Pseudo-key (lien_url structure) hash function */ +static inthash_keys key_adrfil_hashes_generic(void *arg, const char *value_, + const int former) { + hash_struct *const hash = (hash_struct*) arg; + const lien_url*const lien = (lien_url*) value_; + size_t adr_len; + const char *const adr = !former ? lien->adr : lien->former_adr; + const char *const fil = !former ? lien->fil : lien->former_fil; + const char *const adr_norm = adr != NULL ? + ( hash->normalized ? jump_normalized(adr) : jump_identification(adr) ) + : NULL; + + // copy address + assertf(adr_norm != NULL); + strcpy(hash->normfil, adr_norm); + adr_len = strlen(hash->normfil); + // copy link + assertf(fil != NULL); + if (hash->normalized) { + fil_normalized(fil, &hash->normfil[adr_len]); + } else { + strcpy(&hash->normfil[adr_len], fil); + } + + // hash + return inthash_hash_value(hash->normfil); +} + +/* Pseudo-key (lien_url structure) comparison function */ +static int key_adrfil_equals_generic(void *arg, const char *a_, const char *b_, + const int former) { + hash_struct *const hash = (hash_struct*) arg; + const int normalized = hash->normalized; + const lien_url*const a = (lien_url*) a_; + const lien_url*const b = (lien_url*) b_; + const char *const a_adr = !former ? a->adr : a->former_adr; + const char *const b_adr = !former ? b->adr : b->former_adr; + const char *const a_fil = !former ? a->fil : a->former_fil; + const char *const b_fil = !former ? 
b->fil : b->former_fil; + const char *ja; + const char *jb; + + // safety + assertf(a_adr != NULL); + assertf(b_adr != NULL); + + // skip scheme and authentication to the domain (possibly without www.) + ja = normalized ? jump_normalized(a_adr) : jump_identification(a_adr); + jb = normalized ? jump_normalized(b_adr) : jump_identification(b_adr); + if (strcasecmp(ja, jb) != 0) { + return 0; + } + + // now compare paths + if (normalized) { + fil_normalized(a_fil, hash->normfil); + fil_normalized(b_fil, hash->normfil2); + return strcmp(hash->normfil, hash->normfil2) == 0; + } else { + return strcmp(a_fil, b_fil) == 0; + } +} + +/* "adr"/"fil" lien_url structure members hashing function */ +static inthash_keys key_adrfil_hashes(void *arg, const char *value_) { + return key_adrfil_hashes_generic(arg, value_, 0); +} + +/* "adr"/"fil" lien_url structure members comparison function */ +static int key_adrfil_equals(void *arg, const char *a, const char *b) { + return key_adrfil_equals_generic(arg, a, b, 0); +} + +/* "former_adr"/"former_fil" lien_url structure members hashing function */ +static inthash_keys key_former_adrfil_hashes(void *arg, const char *value_) { + return key_adrfil_hashes_generic(arg, value_, 1); +} + +/* "former_adr"/"former_fil" lien_url structure members comparison function */ +static int key_former_adrfil_equals(void *arg, const char *a, const char *b) { + return key_adrfil_equals_generic(arg, a, b, 1); +} + +void hash_init(hash_struct * hash, int normalized) { hash->sav = inthash_new(0); hash->adrfil = inthash_new(0); hash->former_adrfil = inthash_new(0); + hash->normalized = normalized; + + /* Case-insensitive comparison ; keys are direct char* filenames */ + inthash_value_set_key_handler(hash->sav, + key_duphandler, + key_freehandler, + key_sav_hashes, + key_sav_equals, + hash); + + /* URL-style comparison ; keys are lien_url structure pointers cast + to char* */ + inthash_value_set_key_handler(hash->adrfil, + key_duphandler, + key_freehandler, 
+ key_adrfil_hashes, + key_adrfil_equals, + hash); + inthash_value_set_key_handler(hash->former_adrfil, + key_duphandler, + key_freehandler, + key_former_adrfil_hashes, + key_former_adrfil_equals, + hash); } void hash_free(hash_struct *hash) { @@ -74,66 +207,36 @@ void hash_free(hash_struct *hash) { } } -static char * normalize_key(const char *nom1, const char *nom2, - hash_struct_type type, int normalized, - char *normfil_, char *catbuff) { - /* dispatch type */ - const char *normfil; - switch(type) { - case HASH_STRUCT_FILENAME: - /* first entry: destination filename (lowercased) */ - assertf(nom2 == NULL || *nom2 == '\0'); - return convtolower(catbuff, nom1); - break; - case HASH_STRUCT_ADR_PATH: - case HASH_STRUCT_ORIGINAL_ADR_PATH: - /* second and third entries: URL address and path */ - if (!normalized) - normfil = nom2; - else - normfil = fil_normalized(nom2, normfil_); - if (!normalized) { - strcpybuff(catbuff, jump_identification(nom1)); - } else { - strcpybuff(catbuff, jump_normalized(nom1)); - } - strcatbuff(catbuff, normfil); - return catbuff; - break; - default: - assertf(! 
"unexpected case"); - return NULL; - break; - } -} - // retour: position ou -1 si non trouvé int hash_read(const hash_struct * hash, const char *nom1, const char *nom2, - hash_struct_type type, int normalized) { - char BIGSTK normfil_[HTS_URLMAXSIZE * 2]; - char catbuff[CATBUFF_SIZE]; + hash_struct_type type) { intptr_t intvalue; - char *const name = normalize_key(nom1, nom2, type, normalized, - normfil_, catbuff); + lien_url lien; /* read */ switch(type) { case HASH_STRUCT_FILENAME: - if (inthash_read(hash->sav, name, &intvalue)) { + if (inthash_read(hash->sav, nom1, &intvalue)) { return (int) intvalue; } else { return -1; } break; case HASH_STRUCT_ADR_PATH: - if (inthash_read(hash->adrfil, name, &intvalue)) { + memset(&lien, 0, sizeof(lien)); + lien.adr = key_duphandler(NULL, nom1); + lien.fil = key_duphandler(NULL, nom2); + if (inthash_read(hash->adrfil, (char*) &lien, &intvalue)) { return (int) intvalue; } else { return -1; } break; case HASH_STRUCT_ORIGINAL_ADR_PATH: - if (inthash_read(hash->former_adrfil, name, &intvalue)) { + memset(&lien, 0, sizeof(lien)); + lien.former_adr = key_duphandler(NULL, nom1); + lien.former_fil = key_duphandler(NULL, nom2); + if (inthash_read(hash->former_adrfil, (char*) &lien, &intvalue)) { return (int) intvalue; } else { return -1; @@ -147,26 +250,15 @@ int hash_read(const hash_struct * hash, const char *nom1, const char *nom2, } // enregistrement lien lpos dans les 3 tables hash1..3 -void hash_write(hash_struct * hash, int lpos, int normalized) { - char BIGSTK normfil_[HTS_URLMAXSIZE * 2]; - char catbuff[CATBUFF_SIZE]; - const char *name; - +void hash_write(hash_struct * hash, int lpos) { /* first entry: destination filename (lowercased) */ - name = normalize_key(hash->liens[lpos]->sav, NULL, HASH_STRUCT_FILENAME, - normalized, normfil_, catbuff); - inthash_write(hash->sav, name, lpos); + inthash_write(hash->sav, hash->liens[lpos]->sav, lpos); /* second entry: URL address and path */ - name = 
normalize_key(hash->liens[lpos]->adr, hash->liens[lpos]->fil, - HASH_STRUCT_ADR_PATH, normalized, normfil_, catbuff); - inthash_write(hash->adrfil, name, lpos); + inthash_write(hash->adrfil, (char*) hash->liens[lpos], lpos); /* third entry: URL address and path before redirect */ if (hash->liens[lpos]->former_adr) { // former_adr existe? - name = normalize_key(hash->liens[lpos]->former_adr, - hash->liens[lpos]->former_fil, HASH_STRUCT_ORIGINAL_ADR_PATH, - normalized, normfil_, catbuff); - inthash_write(hash->former_adrfil, name, lpos); + inthash_write(hash->former_adrfil, (char*) hash->liens[lpos], lpos); } } diff --git a/src/htshash.h b/src/htshash.h index 97b35ab..454556f 100644 --- a/src/htshash.h +++ b/src/htshash.h @@ -52,11 +52,11 @@ typedef enum hash_struct_type { } hash_struct_type; // tables de hachage -void hash_init(hash_struct *hash); +void hash_init(hash_struct *hash, int normalized); void hash_free(hash_struct *hash); int hash_read(const hash_struct * hash, const char *nom1, const char *nom2, - hash_struct_type type, int normalized); -void hash_write(hash_struct * hash, int lpos, int normalized); + hash_struct_type type); +void hash_write(hash_struct * hash, int lpos); int *hash_calc_chaine(hash_struct * hash, hash_struct_type type, int pos); unsigned long int hash_cle(const char *nom1, const char *nom2); #endif diff --git a/src/htsname.c b/src/htsname.c index 4719ec5..4e3b763 100644 --- a/src/htsname.c +++ b/src/htsname.c @@ -267,12 +267,12 @@ int url_savename(char *adr_complete, char *fil_complete, char *save, if (liens != NULL) { int i; - i = hash_read(hash, normadr, normfil, 1, opt->urlhack); // recherche table 1 (adr+fil) + i = hash_read(hash, normadr, normfil, HASH_STRUCT_ADR_PATH); // recherche table 1 (adr+fil) if (i >= 0) { // ok, trouvé strcpybuff(save, liens[i]->sav); return 0; } - i = hash_read(hash, normadr, normfil, 2, opt->urlhack); // recherche table 2 (former_adr+former_fil) + i = hash_read(hash, normadr, normfil, 
HASH_STRUCT_ORIGINAL_ADR_PATH); // recherche table 2 (former_adr+former_fil) if (i >= 0) { // ok, trouvé // copier location moved! strcpybuff(adr_complete, liens[i]->adr); @@ -291,7 +291,7 @@ int url_savename(char *adr_complete, char *fil_complete, char *save, fil_complete_patche[strlen(fil_complete_patche) - 1] = '\0'; else strcatbuff(fil_complete_patche, "/"); - i = hash_read(hash, normadr, fil_complete_patche, 2, opt->urlhack); // recherche table 2 (former_adr+former_fil) + i = hash_read(hash, normadr, fil_complete_patche, HASH_STRUCT_ORIGINAL_ADR_PATH); // recherche table 2 (former_adr+former_fil) if (i >= 0) { // écraser fil et adr (pas former_fil?????) strcpybuff(adr_complete, liens[i]->adr); @@ -1555,7 +1555,7 @@ int url_savename(char *adr_complete, char *fil_complete, char *save, printf("\nStart search\n"); #endif - i = hash_read(hash, save, "", 0, 0); // lecture type 0 (sav) + i = hash_read(hash, save, NULL, HASH_STRUCT_FILENAME); // lecture type 0 (sav) if (i >= 0) { int sameAdr = (strfield2(liens[i]->adr, normadr) != 0); int sameFil; diff --git a/src/htsparse.c b/src/htsparse.c index 02e4e3a..7648d07 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -295,7 +295,7 @@ Please visit our Website: http://www.httrack.com strcpybuff(liens[lien_tot]->fil,F); \ strcpybuff(liens[lien_tot]->sav,S); \ liens_record_sav_len(liens[lien_tot]); \ - hash_write(hashptr,lien_tot,opt->urlhack); \ + hash_write(hashptr,lien_tot); \ } \ } @@ -3196,7 +3196,7 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { // // On part de la fin et on essaye de se presser (économise temps machine) { - int i = hash_read(hash, save, "", 0, opt->urlhack); // lecture type 0 (sav) + int i = hash_read(hash, save, NULL, HASH_STRUCT_FILENAME); // lecture type 0 (sav) if (i >= 0) { if ((opt->debug > 1) && (opt->log != NULL)) { @@ -3636,7 +3636,7 @@ int hts_mirror_check_moved(htsmoduleStruct * str, liens[liens[ptr]->precedent]->adr, liens[liens[ptr]->precedent]->fil, opt, liens, lien_tot, sback, 
cache, hash, ptr, numero_passe, NULL) != -1) { - if (hash_read(hash, mov_sav, "", 0, 0) < 0) { // n'existe pas déja + if (hash_read(hash, mov_sav, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja // enregistrer lien (MACRO) avec SAV IDENTIQUE liens_record(mov_adr, mov_fil, liens[ptr]->sav, "", ""); //liens_record(mov_adr,mov_fil,mov_sav,"",""); @@ -4074,7 +4074,7 @@ void hts_mirror_process_user_interaction(htsmoduleStruct * str, if (url_savename (add_adr, add_fil, add_sav, NULL, NULL, NULL, NULL, opt, liens, lien_tot, sback, cache, hash, ptr, numero_passe, NULL) != -1) { - if (hash_read(hash, add_sav, "", 0, 0) < 0) { // n'existe pas déja + if (hash_read(hash, add_sav, NULL, HASH_STRUCT_FILENAME) < 0) { // n'existe pas déja // enregistrer lien (MACRO) liens_record(add_adr, add_fil, add_sav, "", ""); if (liens[lien_tot] != NULL) { // OK, pas d'erreur @@ -4624,7 +4624,7 @@ int hts_wait_delayed(htsmoduleStruct * str, char *adr, char *fil, char *save, } /* Check if the file was recorded already (necessary for redirects) */ - if (hash_read(hash, save, "", 0, opt->urlhack) >= 0) { + if (hash_read(hash, save, NULL, HASH_STRUCT_FILENAME) >= 0) { if (loops == 0) { /* Should not happend */ hts_log_print(opt, LOG_ERROR, "Duplicate entry in hts_wait_delayed() cancelled: %s%s -> %s", diff --git a/src/htstools.c b/src/htstools.c index 255a1a2..7c543f6 100644 --- a/src/htstools.c +++ b/src/htstools.c @@ -187,6 +187,12 @@ int ident_url_relatif(const char *lien, const char *origin_adr, // On forme l'URL complète à partie de l'url actuelle // et du chemin actuel si besoin est. 
+ // sanity check + if (origin_adr == NULL || origin_fil == NULL + || *origin_adr == '\0' || *origin_fil == '\0') { + return -1; + } + // copier adresse if (((int) strlen(origin_adr) < HTS_URLMAXSIZE) && ((int) strlen(origin_fil) < HTS_URLMAXSIZE) diff --git a/src/htswizard.c b/src/htswizard.c index b4ceaeb..03605a7 100644 --- a/src/htswizard.c +++ b/src/htswizard.c @@ -154,7 +154,7 @@ static int hts_acceptlink_(httrackp * opt, int ptr, int lien_tot, /* Already exists? Then, we know that we knew that this link had to be known */ if (adr[0] != '\0' && fil[0] != '\0' && opt->hash != NULL - && hash_read(opt->hash, adr, fil, 1, opt->urlhack) >= 0) { + && hash_read(opt->hash, adr, fil, HASH_STRUCT_ADR_PATH) >= 0) { return 0; /* Yokai */ } // -------------------- PRELUDE OF PHASE 3-BIS -------------------- |