diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2012-03-19 12:36:11 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2012-03-19 12:36:11 +0000 |
commit | ad5b7acc19290ff91e0f42a0de448a26760fcf99 (patch) | |
tree | 2d1867758835fd0c4e443ff3cc7e5c774af85874 /src/htsindex.c |
Imported httrack 3.20.2
Diffstat (limited to 'src/htsindex.c')
-rw-r--r-- | src/htsindex.c | 483 |
1 files changed, 483 insertions, 0 deletions
diff --git a/src/htsindex.c b/src/htsindex.c new file mode 100644 index 0000000..5a66724 --- /dev/null +++ b/src/htsindex.c @@ -0,0 +1,483 @@ +/* ------------------------------------------------------------ */ +/* +HTTrack Website Copier, Offline Browser for Windows and Unix +Copyright (C) Xavier Roche and other contributors + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + +Important notes: + +- We hereby ask people using this source NOT to use it in purpose of grabbing +emails addresses, or collecting any other private information on persons. +This would disgrace our work, and spoil the many hours we spent on it. + + +Please visit our Website: http://www.httrack.com +*/ + + +/* ------------------------------------------------------------ */ +/* File: htsindex.c */ +/* keyword indexing system (search index) */ +/* Author: Xavier Roche */ +/* ------------------------------------------------------------ */ + + +#include <stdio.h> +#include <stdlib.h> +#include "htsindex.h" +#include "htsglobal.h" +#include "htslib.h" + +#if HTS_MAKE_KEYWORD_INDEX +#include "htshash.h" + + +/* Keyword Indexer Parameters */ + +// Maximum length for a keyword +#define KEYW_LEN 50 +// Minimum length for a keyword - MUST NOT BE NULL!!! +#define KEYW_MIN_LEN 3 +// What characters to accept? - MUST NOT BE EMPTY AND MUST NOT CONTAIN THE SPACE (32) CHARACTER!!! +#define KEYW_ACCEPT "abcdefghijklmnopqrstuvwxyz0123456789-_." +// Convert A to a, and so on.. to avoid case problems in indexing +// This can be a generic table, containing characters that are in fact not accepted by KEYW_ACCEPT +// MUST HAVE SAME SIZES!! +#define KEYW_TRANSCODE_FROM (\ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \ + "àâä" \ + "ÀÂÄ" \ + "éèêë" \ + "ÈÈÊË" \ + "ìîï" \ + "ÌÎÏ" \ + "òôö" \ + "ÒÔÖ" \ + "ùûü" \ + "ÙÛÜ" \ + "ÿ" \ + ) +#define KEYW_TRANSCODE_TO ( \ + "abcdefghijklmnopqrstuvwxyz" \ + "aaa" \ + "aaa" \ + "eeee" \ + "eeee" \ + "iii" \ + "iii" \ + "ooo" \ + "ooo" \ + "uuu" \ + "uuu" \ + "y" \ + ) +// These (accepted) characters will be ignored at begining of a keyword +#define KEYW_IGNORE_BEG "-_." +// These (accepted) characters will be stripped if at the end of a keyword +#define KEYW_STRIP_END "-_." +// Words begining with these (accepted) characters will be ignored +#define KEYW_NOT_BEG "0123456789" +// Treat these characters as space characters - MUST NOT BE EMPTY!!! +#define KEYW_SPACE " ',;:!?\"\x0d\x0a\x09\x0c" +// Common words (the,for..) detector +// If a word represents more than KEYW_USELESS1K (%1000) of total words, then ignore it +// 5 (0.5%) +#define KEYW_USELESS1K 5 +// If a word is present in more than KEYW_USELESS1KPG (%1000) pages, then ignore it +// 800 (80%) +#define KEYW_USELESS1KPG 800 +// This number will be reduced by index hit for sorting purpose +// leave it as it is here if you don't REALLY know what you are doing +// Yes, I may be the only person, maybe +#define KEYW_SORT_MAXCOUNT 999999999 + +/* End of Keyword Indexer Parameters */ + +int strcpos(char* adr,char c); +int mystrcmp(const void* _e1,const void* _e2); + +// Global variables +int hts_index_init=1; +int hts_primindex_size=0; +FILE* fp_tmpproject=NULL; +int hts_primindex_words=0; + +#endif + +/* + Init index +*/ +void index_init(const char* indexpath) { +#if HTS_MAKE_KEYWORD_INDEX + /* remove(concat(indexpath,"index.txt")); */ + hts_index_init=1; + hts_primindex_size=0; + hts_primindex_words=0; + fp_tmpproject=tmpfile(); +#endif +} + + +/* + Indexing system + A little bit dirty, (quick'n dirty, in fact) + But should be okay on most cases + Tags and javascript handled (ignored) +*/ +int index_keyword(const char* html_data,LLint size,const char* mime,const char* filename,const char* indexpath) { +#if HTS_MAKE_KEYWORD_INDEX + int intag=0,inscript=0,incomment=0; + char keyword[KEYW_LEN+32]; + int i=0; + // + int WordIndexSize=1024; + inthash WordIndexHash=NULL; + FILE *tmpfp=NULL; + // + + // Check parameters + if (!html_data) + return 0; + if (!size) + return 0; + if (!mime) + return 0; + if (!filename) + return 0; + + // Init ? + if (hts_index_init) { + remove(concat(indexpath,"index.txt")); + remove(concat(indexpath,"sindex.html")); + hts_index_init=0; + } + + // Check MIME type + if (strfield2(mime,"text/html")) { + inscript=0; + } + // FIXME - temporary fix for image/svg+xml (svg) + // "IN XML" (html like, in fact :) ) + else if ( + (strfield2(mime,"image/svg+xml")) + || + (strfield2(mime,"image/svg-xml")) + ) { + inscript=0; + } + else if ( + (strfield2(mime,"application/x-javascript")) + || (strfield2(mime,"text/css")) + ) { + inscript=1; + } else + return 0; + + // Temporary file + tmpfp = tmpfile(); + if (!tmpfp) + return 0; + + // Create hash structure + // Hash tables rulez da world! + WordIndexHash=inthash_new(WordIndexSize); + if (!WordIndexHash) + return 0; + + // Start indexing this page + keyword[0]='\0'; + while(i<size) { + if (strfield(html_data + i , "<script")) { + inscript=1; + } + else if (strfield(html_data + i , "<!--")) { + incomment=1; + } + else if (strfield(html_data + i , "</script")) { + if (!incomment) + inscript=0; + } + else if (strfield(html_data + i , "-->")) { + incomment=0; + } + else if (html_data[i]=='<') { + if (!inscript) + intag=1; + } + else if (html_data[i]=='>') { + intag=0; + } + else { + // Okay, parse keywords + if ( (!inscript) && (!incomment) && (!intag) ) { + char cchar=html_data[i]; + int pos; + int len=strlen(keyword); + + // Replace (ignore case, and so on..) + if ((pos=strcpos(KEYW_TRANSCODE_FROM,cchar))>=0) + cchar=KEYW_TRANSCODE_TO[pos]; + + if (strchr(KEYW_ACCEPT,cchar)) { + /* Ignore some characters at begining */ + if ((len>0) || (!strchr(KEYW_IGNORE_BEG,cchar))) { + keyword[len++]=cchar; + keyword[len]='\0'; + } + } else if ( (strchr(KEYW_SPACE,cchar)) || (!cchar) ) { + + + /* Avoid these words */ + if (len>0) { + if (strchr(KEYW_NOT_BEG,keyword[0])) { + keyword[(len=0)]='\0'; + } + } + + /* Strip ending . and so */ + { + int ok=0; + while((len=strlen(keyword)) && (!ok)) { + if (strchr(KEYW_STRIP_END,keyword[len-1])) { /* strip it */ + keyword[len-1]='\0'; + } else + ok=1; + } + } + + /* Store it ? */ + if (len >= KEYW_MIN_LEN ) { + hts_primindex_words++; + if (inthash_inc(WordIndexHash,keyword)) { /* added new */ + fprintf(tmpfp,"%s\n",keyword); + } + } + keyword[(len=0)]='\0'; + } else /* Invalid */ + keyword[(len=0)]='\0'; + + if (len>KEYW_LEN) { + keyword[(len=0)]='\0'; + } + } + + } + + i++; + } + + // Reset temp file + fseek(tmpfp,0,SEEK_SET); + + // Process indexing for this page + { + //FILE* fp=NULL; + //fp=fopen(concat(indexpath,"index.txt"),"ab"); + if (fp_tmpproject) { + while(!feof(tmpfp)) { + char line[KEYW_LEN + 32]; + linput(tmpfp,line,KEYW_LEN + 2); + if (strnotempty(line)) { + unsigned long int e=0; + if (inthash_read(WordIndexHash,line,&e)) { + //if (e) { + char savelst[HTS_URLMAXSIZE*2]; + e++; /* 0 means "once" */ + + if (strncmp((const char*)fslash((char*)indexpath),filename,strlen(indexpath))==0) // couper + strcpy(savelst,filename+strlen(indexpath)); + else + strcpy(savelst,filename); + + // Add entry for this file and word + fprintf(fp_tmpproject,"%s %d %s\n",line,(int) (KEYW_SORT_MAXCOUNT - e),savelst); + hts_primindex_size++; + //} + } + } + } + //fclose(fp); + } + } + + // Delete temp file + fclose(tmpfp); + tmpfp=NULL; + + // Clear hash table + inthash_delete(&WordIndexHash); +#endif + return 1; +} + +/* + Sort index! +*/ +void index_finish(const char* indexpath,int mode) { +#if HTS_MAKE_KEYWORD_INDEX + char** tab; + char* blk; + int size; + + size=fpsize(fp_tmpproject); + if (size>0) { + //FILE* fp=fopen(concat(indexpath,"index.txt"),"rb"); + if (fp_tmpproject) { + tab=(char**)malloct(sizeof(char*) * (hts_primindex_size+2) ); + if (tab) { + blk = malloct(size+4); + if (blk) { + fseek(fp_tmpproject,0,SEEK_SET); + if ((int)fread(blk,1,size,fp_tmpproject) == size) { + char *a=blk,*b; + int index=0; + int i; + FILE* fp; + + while( (b=strchr(a,'\n')) && (index < hts_primindex_size) ) { + tab[index++]=a; + *b='\0'; + a=b+1; + } + + // Sort it! + qsort(tab,index,sizeof(char*),mystrcmp); + + // Delete fp_tmpproject + fclose(fp_tmpproject); + fp_tmpproject=NULL; + + // Write new file + if (mode == 1) // TEXT + fp=fopen(concat(indexpath,"index.txt"),"wb"); + else // HTML + fp=fopen(concat(indexpath,"sindex.html"),"wb"); + if (fp) { + char current_word[KEYW_LEN + 32]; + char word[KEYW_LEN + 32]; + int hit; + int total_hit=0; + int total_line=0; + int last_pos=0; + char word0='\0'; + current_word[0]='\0'; + + if (mode == 2) { // HTML + for(i=0;i<index;i++) { + if (word0 != tab[i][0]) { + word0 = tab[i][0]; + fprintf(fp," <a href=\"#%c\">%c</a>\r\n",word0,word0); + } + } + word0='\0'; + fprintf(fp,"<br><br>\r\n"); + fprintf(fp,"<table width=\"100%%\" border=\"0\">\r\n<tr>\r\n<td>word</td>\r\n<td>location\r\n"); + } + + for(i=0;i<index;i++) { + if (sscanf(tab[i],"%s %d",word,&hit) == 2) { + char* a=strchr(tab[i],' '); + if (a) a=strchr(a+1,' '); + if (a++) { /* Yes, a++, not ++a :) */ + hit=KEYW_SORT_MAXCOUNT-hit; + if (strcmp(word,current_word)) { /* New word */ + if (total_hit) { + if (mode == 1) // TEXT + fprintf(fp,"\t=%d\r\n",total_hit); + //else // HTML + // fprintf(fp,"<br>(%d total hits)\r\n",total_hit); + if ( + ( ((total_hit*1000 ) / hts_primindex_words) >= KEYW_USELESS1K ) + || + ( ((total_line*1000) / index ) >= KEYW_USELESS1KPG ) + ) { + fseek(fp,last_pos,SEEK_SET); + if (mode == 1) // TEXT + fprintf(fp,"\tignored (%d)\r\n",((total_hit*1000)/hts_primindex_words)); + else + fprintf(fp,"(ignored) [%d hits]<br>\r\n",total_hit); + } + else { + if (mode == 1) // TEXT + fprintf(fp,"\t(%d)\r\n",((total_hit*1000)/hts_primindex_words)); + //else // HTML + // fprintf(fp,"(%d)\r\n",((total_hit*1000)/hts_primindex_words)); + } + } + if (mode == 1) // TEXT + fprintf(fp,"%s\r\n",word); + else { // HTML + fprintf(fp,"</td></tr>\r\n"); + if (word0 != word[0]) { + word0 = word[0]; + fprintf(fp,"<th>%c</th>\r\n",word0); + fprintf(fp,"<a name=\"%c\"></a>\r\n",word0); + } + fprintf(fp,"<tr>\r\n<td>%s</td>\r\n<td>\r\n",word); + } + fflush(fp); last_pos=ftell(fp); + strcpy(current_word,word); + total_hit=total_line=0; + } + total_hit+=hit; + total_line++; + if (mode == 1) // TEXT + fprintf(fp,"\t%d %s\r\n",hit,a); + else // HTML + fprintf(fp,"<a href=\"%s\">%s</a> [%d hits]<br>\r\n",a,a,hit); + } + } + } + if (mode == 2) // HTML + fprintf(fp,"</td></tr>\r\n</table>\r\n"); + fclose(fp); + } + + } + freet(blk); + } + freet(tab); + } + + } + //qsort + } + if (fp_tmpproject) + fclose(fp_tmpproject); + fp_tmpproject=NULL; +#endif +} + + +/* Subroutines */ + +#if HTS_MAKE_KEYWORD_INDEX +int strcpos(char* adr,char c) { + char* apos=strchr(adr,c); + if (apos) + return (int)(apos-adr); + else + return -1; +} + +int mystrcmp(const void* _e1,const void* _e2) { + char** e1=(char**)_e1; + char** e2=(char**)_e2; + return strcmp(*e1,*e2); +} +#endif + |