diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2012-03-19 12:36:11 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2012-03-19 12:36:11 +0000 |
commit | ad5b7acc19290ff91e0f42a0de448a26760fcf99 (patch) | |
tree | 2d1867758835fd0c4e443ff3cc7e5c774af85874 /src/htshash.c |
Imported httrack 3.20.2
Diffstat (limited to 'src/htshash.c')
-rw-r--r-- | src/htshash.c | 453 |
1 files changed, 453 insertions, 0 deletions
diff --git a/src/htshash.c b/src/htshash.c new file mode 100644 index 0000000..b02f2ba --- /dev/null +++ b/src/htshash.c @@ -0,0 +1,453 @@ +/* ------------------------------------------------------------ */ +/* +HTTrack Website Copier, Offline Browser for Windows and Unix +Copyright (C) Xavier Roche and other contributors + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + +Important notes: + +- We hereby ask people using this source NOT to use it in purpose of grabbing +emails addresses, or collecting any other private information on persons. +This would disgrace our work, and spoil the many hours we spent on it. + + +Please visit our Website: http://www.httrack.com +*/ + + +/* ------------------------------------------------------------ */ +/* File: httrack.c subroutines: */ +/* hash table system (fast index) */ +/* Author: Xavier Roche */ +/* ------------------------------------------------------------ */ + +#include "htshash.h" + +/* specific definitions */ +#include "htsbase.h" +#include "htsmd5.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +/* END specific definitions */ + +// GESTION DES TABLES DE HACHAGE +// Méthode à 2 clés (adr+fil), 2e cle facultative +// hash[no_enregistrement][pos]->hash est un index dans le tableau général liens +// #define HTS_HASH_SIZE 8191 (premier si possible!) +// type: numero enregistrement - 0 est case insensitive (sav) 1 (adr+fil) 2 (former_adr+former_fil) +#if HTS_HASH +// recherche dans la table selon nom1,nom2 et le no d'enregistrement +// retour: position ou -1 si non trouvé +int hash_read(hash_struct* hash,char* nom1,char* nom2,int type) { + unsigned int cle; + int pos; + // calculer la clé de recherche, non modulée + if (type) + cle = hash_cle(nom1,nom2); + else + cle = hash_cle(convtolower(nom1),nom2); // case insensitive + // la position se calcule en modulant + pos = (int) (cle%HTS_HASH_SIZE); + // entrée trouvée? + if (hash->hash[type][pos] >= 0) { // un enregistrement avec une telle clé existe.. + // tester table de raccourcis (hash) + // pos est maintenant la position recherchée dans liens + pos = hash->hash[type][pos]; + while (pos>=0) { // parcourir la chaine + switch (type) { + case 0: // sav + if (strfield2(nom1,hash->liens[pos]->sav)) { // case insensitive +#if DEBUG_HASH==2 + printf("hash: found shortcut at %d\n",pos); +#endif + return pos; + } + break; + case 1: // adr+fil + if ((strcmp(nom1,jump_identification(hash->liens[pos]->adr))==0) && (strcmp(nom2,hash->liens[pos]->fil)==0)) { +#if DEBUG_HASH==2 + printf("hash: found shortcut at %d\n",pos); +#endif + return pos; + } + break; + case 2: // former_adr+former_fil + if (hash->liens[pos]->former_adr) + if ((strcmp(nom1,jump_identification(hash->liens[pos]->former_adr))==0) && (strcmp(nom2,hash->liens[pos]->former_fil)==0)) { +#if DEBUG_HASH==2 + printf("hash: found shortcut at %d\n",pos); +#endif + return pos; + } + break; + } + // calculer prochaine position dans la chaine + { + int old=pos; + pos=hash->liens[pos]->hash_next[type]; // sinon prochain dans la chaine + if (old==pos) + pos=-1; // erreur de bouclage (ne devrait pas arriver) + } + } + + // Ok va falloir chercher alors.. + /*pos=hash->max_lien; // commencer à max_lien + switch (type) { + case 0: // sav + while(pos>=0) { + if (hash->liens[pos]->hash_sav == cle ) { + if (strcmp(nom1,hash->liens[pos]->sav)==0) { + hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table +#if DEBUG_HASH==2 + printf("hash: found long search at %d\n",pos); +#endif + return pos; + } + } + pos--; + } + break; + case 1: // adr+fil + while(pos>=0) { + if (hash->liens[pos]->hash_adrfil == cle ) { + if ((strcmp(nom1,hash->liens[pos]->adr)==0) && (strcmp(nom2,hash->liens[pos]->fil)==0)) { + hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table +#if DEBUG_HASH==2 + printf("hash: found long search at %d\n",pos); +#endif + return pos; + } + } + pos--; + } + break; + case 2: // former_adr+former_fil + while(pos>=0) { + if (hash->liens[pos]->hash_fadrfil == cle ) { + if (hash->liens[pos]->former_adr) + if ((strcmp(nom1,hash->liens[pos]->former_adr)==0) && (strcmp(nom2,hash->liens[pos]->former_fil)==0)) { + hash->hash[type][(int) (cle%HTS_HASH_SIZE)] = pos; // noter plus récent dans shortcut table +#if DEBUG_HASH==2 + printf("hash: found long search at %d\n",pos); +#endif + return pos; + } + } + pos--; + } + }*/ +#if DEBUG_HASH==1 + printf("hash: not found after test %s%s\n",nom1,nom2); +#endif + return -1; // non trouvé + } else { +#if DEBUG_HASH==2 + printf("hash: not found %s%s\n",nom1,nom2); +#endif + return -1; // non trouvé : clé non entrée (même une fois) + } +} + +// enregistrement lien lpos dans les 3 tables hash1..3 +void hash_write(hash_struct* hash,int lpos) { + unsigned int cle; + int pos; + int* ptr; + // + if (hash->liens[lpos]) { // on sait jamais.. + hash->max_lien = max(hash->max_lien,lpos); +#if DEBUG_HASH + hashnumber=hash->max_lien; +#endif + // élément actuel sur -1 (fin de chaine) + hash->liens[lpos]->hash_next[0]=hash->liens[lpos]->hash_next[1]=hash->liens[lpos]->hash_next[2]=-1; + // + cle = hash_cle(convtolower(hash->liens[lpos]->sav),""); // CASE INSENSITIVE + pos = (int) (cle%HTS_HASH_SIZE); + ptr = hash_calc_chaine(hash,0,pos); // calculer adresse chaine + *ptr = lpos; // noter dernier enregistré +#if DEBUG_HASH==3 + printf("[%d",pos); +#endif + // + cle = hash_cle(jump_identification(hash->liens[lpos]->adr),hash->liens[lpos]->fil); + pos = (int) (cle%HTS_HASH_SIZE); + ptr = hash_calc_chaine(hash,1,pos); // calculer adresse chaine + *ptr = lpos; // noter dernier enregistré +#if DEBUG_HASH==3 + printf(",%d",pos); +#endif + // + if (hash->liens[lpos]->former_adr) { // former_adr existe? + cle = hash_cle(jump_identification(hash->liens[lpos]->former_adr),hash->liens[lpos]->former_fil); + pos = (int) (cle%HTS_HASH_SIZE); + ptr = hash_calc_chaine(hash,2,pos); // calculer adresse chaine + *ptr = lpos; // noter dernier enregistré +#if DEBUG_HASH==3 + printf(",%d",pos); +#endif + } +#if DEBUG_HASH==3 + printf("] "); fflush(stdout); +#endif + } +#if DEBUT_HASH + else { + printf("* hash_write=0!!\n"); + exit(1); + } +#endif + // +} + +// calcul clé +// il n'y a pas de formule de hashage universelle, celle-ci semble acceptable.. +unsigned long int hash_cle(char* nom1,char* nom2) { + /* + unsigned int sum=0; + int i=0; + while(*nom1) { + sum += 1; + sum += (unsigned int) *(nom1); + sum *= (unsigned int) *(nom1++); + sum += (unsigned int) i; + i++; + } + while(*nom2) { + sum += 1; + sum += (unsigned int) *(nom2); + sum *= (unsigned int) *(nom2++); + sum += (unsigned int) i; + i++; + } + */ + return md5sum32(nom1) + +md5sum32(nom2); +} + +// calcul de la position finale dans la chaine des elements ayant la même clé +int* hash_calc_chaine(hash_struct* hash,int type,int pos) { +#if DEBUG_HASH + int count=0; +#endif + if (hash->hash[type][pos] == -1) + return &(hash->hash[type][pos]); // premier élément dans la chaine + pos=hash->hash[type][pos]; + while(hash->liens[pos]->hash_next[type] != -1) { + pos = hash->liens[pos]->hash_next[type]; +#if DEBUG_HASH + count++; +#endif + } +#if DEBUG_HASH + count++; + longest_hash[type]=max(longest_hash[type],count); +#endif + return &(hash->liens[pos]->hash_next[type]); +} +#endif +// FIN GESTION DES TABLES DE HACHAGE + + + + + + + + + + + + +// inthash -- simple hash table, using a key (char[]) and a value (ulong int) + +unsigned long int inthash_key(char* value) { + return md5sum32(value); +} + +// Check for duplicate entry (==1 : added) +int inthash_write(inthash hashtable,char* name,long int value) { + int pos = (inthash_key(name) % hashtable->hash_size); + inthash_chain* h=hashtable->hash[pos]; + while (h) { + if (strcmp(h->name,name)==0) { + h->value.intg=value; + return 0; + } + h=h->next; + } + // Not found, add it! + inthash_add(hashtable,name,value); + return 1; +} + +// Increment pos value, create one if necessary (=0) +// (==1 : created) +int inthash_inc(inthash hashtable,char* name) { + long int value=0; + int r=0; + if (inthash_read(hashtable,name,&value)) { + value++; + } + else { /* create new value */ + value=0; + r=1; + } + inthash_write(hashtable,name,value); + return (r); +} + + +// Does not check for duplicate entry +void inthash_add(inthash hashtable,char* name,long int value) { + int pos = (inthash_key(name) % hashtable->hash_size); + inthash_chain** h=&hashtable->hash[pos]; + + while (*h) + h=&((*h)->next); + *h=(inthash_chain*)calloc(1, + sizeof(inthash_chain) + + + strlen(name)+2 + ); + if (*h) { + (*h)->name=((char*)(*h)) + sizeof(inthash_chain); + (*h)->next=NULL; + strcpy((*h)->name,name); + (*h)->value.intg=value; + } +} + +void* inthash_addblk(inthash hashtable,char* name,int blksize) { + int pos = (inthash_key(name) % hashtable->hash_size); + inthash_chain** h=&hashtable->hash[pos]; + + while (*h) + h=&((*h)->next); + *h=(inthash_chain*)calloc(1, + sizeof(inthash_chain) + + + strlen(name)+2 + + + blksize + ); + if (*h) { + (*h)->name = ((char*)(*h)) + sizeof(inthash_chain); + (*h)->next=NULL; + strcpy((*h)->name,name); + (*h)->value.intg = (unsigned long) (char*) ((char*)(*h)) + sizeof(inthash_chain) + strlen(name) + 2; + return (void*)(*h)->value.intg; + } + return NULL; +} + +int inthash_read(inthash hashtable,char* name,long int* value) { + int pos = (inthash_key(name) % hashtable->hash_size); + inthash_chain* h=hashtable->hash[pos]; + while (h) { + if (strcmp(h->name,name)==0) { + *value=h->value.intg; + return 1; + } + h=h->next; + } + return 0; +} + +void inthash_init(inthash hashtable) { + unsigned int i; + for(i=0;i<hashtable->hash_size;i++) { + hashtable->hash[i]=NULL; + } +} + +void inthash_delchain(inthash_chain* hash,t_inthash_freehandler free_handler) { + if (hash) { + inthash_delchain(hash->next,free_handler); + if (free_handler) { // pos is a malloc() block, delete it! + if (hash->value.intg) { + if (free_handler) + free_handler((void*)hash->value.intg); + else + free((void*)hash->value.intg); + } + hash->value.intg=0; + } + free(hash); + } +} + +void inthash_default_free_handler(void* value) { + if (value) + free(value); +} + +// -- + +inthash inthash_new(int size) { + inthash hashtable=(inthash)calloc(1,sizeof(struct_inthash)); + if (hashtable) { + hashtable->hash_size=0; + hashtable->flag_valueismalloc=0; + if ((hashtable->hash=(inthash_chain**)calloc(size,sizeof(inthash_chain*)))) { + hashtable->hash_size=size; + inthash_init(hashtable); + } + } + return hashtable; +} + +int inthash_created(inthash hashtable) { + if (hashtable) + if (hashtable->hash) + return 1; + return 0; +} + +void inthash_value_is_malloc(inthash hashtable,int flag) { + hashtable->flag_valueismalloc=flag; +} + +void inthash_value_set_free_handler(inthash hashtable, t_inthash_freehandler free_handler) { + hashtable->free_handler = free_handler; +} + +void inthash_delete(inthash* hashtable) { + if (hashtable) { + if (*hashtable) { + if ((*hashtable)->hash) { + unsigned int i; + t_inthash_freehandler free_handler=NULL; + if ( (*hashtable)->flag_valueismalloc ) { + if ( (*hashtable)->free_handler ) + free_handler=(*hashtable)->free_handler; + else + free_handler=inthash_default_free_handler; + } + for(i=0;i<(*hashtable)->hash_size;i++) { + inthash_delchain((*hashtable)->hash[i],(*hashtable)->free_handler); + (*hashtable)->hash[i]=NULL; + } + } + free(*hashtable); + *hashtable=NULL; + } + } +} + + |