Imported httrack 3.20.2

author: Xavier Roche <xroche@users.noreply.github.com> 2012-03-19 12:36:11 +0000
committer: Xavier Roche <xroche@users.noreply.github.com> 2012-03-19 12:36:11 +0000
commit: ad5b7acc19290ff91e0f42a0de448a26760fcf99 (patch)
tree: 2d1867758835fd0c4e443ff3cc7e5c774af85874 /src/htsindex.c
1 files changed, 483 insertions, 0 deletions
diff --git a/src/htsindex.c b/src/htsindex.c
new file mode 100644
index 0000000..5a66724
--- /dev/null
+++ b/src/htsindex.c
@@ -0,0 +1,483 @@
+/* ------------------------------------------------------------ */
+/*
+HTTrack Website Copier, Offline Browser for Windows and Unix
+Copyright (C) Xavier Roche and other contributors
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+
+Important notes:
+
+- We hereby ask people using this source NOT to use it in purpose of grabbing
+emails addresses, or collecting any other private information on persons.
+This would disgrace our work, and spoil the many hours we spent on it.
+
+
+Please visit our Website: http://www.httrack.com
+*/
+
+
+/* ------------------------------------------------------------ */
+/* File: htsindex.c                                             */
+/*       keyword indexing system (search index)                 */
+/* Author: Xavier Roche                                         */
+/* ------------------------------------------------------------ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "htsindex.h"
+#include "htsglobal.h"
+#include "htslib.h"
+
+#if HTS_MAKE_KEYWORD_INDEX
+#include "htshash.h"
+
+
+/* Keyword Indexer Parameters */
+
+// Maximum length for a keyword
+#define KEYW_LEN             50
+// Minimum length for a keyword - MUST NOT BE ZERO!!!
+#define KEYW_MIN_LEN         3
+// What characters to accept? - MUST NOT BE EMPTY AND MUST NOT CONTAIN THE SPACE (32) CHARACTER!!!
+#define KEYW_ACCEPT          "abcdefghijklmnopqrstuvwxyz0123456789-_."
+// Convert A to a, and so on.. to avoid case problems in indexing
+// A character found at position N of KEYW_TRANSCODE_FROM is replaced by the
+// character at position N of KEYW_TRANSCODE_TO
+// This can be a generic table, containing characters that are in fact not accepted by KEYW_ACCEPT
+// MUST HAVE SAME SIZES!!
+// (each upper-case accented row mirrors the lower-case row just above it)
+#define KEYW_TRANSCODE_FROM  (\
+                               "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
+                               "àâä" \
+                               "ÀÂÄ" \
+                               "éèêë" \
+                               "ÉÈÊË" \
+                               "ìîï" \
+                               "ÌÎÏ" \
+                               "òôö" \
+                               "ÒÔÖ" \
+                               "ùûü" \
+                               "ÙÛÜ" \
+                               "ÿ" \
+                             )
+#define KEYW_TRANSCODE_TO    ( \
+                               "abcdefghijklmnopqrstuvwxyz" \
+                               "aaa" \
+                               "aaa" \
+                               "eeee" \
+                               "eeee" \
+                               "iii" \
+                               "iii" \
+                               "ooo" \
+                               "ooo" \
+                               "uuu" \
+                               "uuu" \
+                               "y" \
+                             )
+// These (accepted) characters will be ignored at the beginning of a keyword
+#define KEYW_IGNORE_BEG       "-_."
+// These (accepted) characters will be stripped if at the end of a keyword
+#define KEYW_STRIP_END       "-_."
+// Words beginning with these (accepted) characters will be ignored
+#define KEYW_NOT_BEG         "0123456789"
+// Treat these characters as space characters - MUST NOT BE EMPTY!!!
+#define KEYW_SPACE           " ',;:!?\"\x0d\x0a\x09\x0c"
+// Common words (the,for..) detector
+// If a word represents more than KEYW_USELESS1K (per thousand) of total words, then ignore it
+// 5 (0.5%)
+#define KEYW_USELESS1K       5
+// If a word is present in more than KEYW_USELESS1KPG (per thousand) pages, then ignore it
+// 800 (80%)
+#define KEYW_USELESS1KPG     800
+// The hit count of a word is subtracted from this number before being written
+// to the temporary index, for sorting purpose
+// leave it as it is here if you don't REALLY know what you are doing
+// Yes, I may be the only person, maybe
+#define KEYW_SORT_MAXCOUNT 999999999
+
+/* End of Keyword Indexer Parameters */
+
+// Position of c in adr, or -1 if not found
+int strcpos(char* adr,char c);
+// qsort() comparison callback for an array of char* strings
+int mystrcmp(const void* _e1,const void* _e2);
+
+// Global variables
+// Non-zero until index_keyword() has removed the previous index files
+int hts_index_init=1;
+// Number of "word count file" lines written to fp_tmpproject
+int hts_primindex_size=0;
+// Temporary file collecting the index entries of every page
+FILE* fp_tmpproject=NULL;
+// Total number of keywords counted so far (all pages)
+int hts_primindex_words=0;
+
+#endif
+
+/*
+  Init index
+  Resets the indexer counters and opens the temporary file that collects
+  the index entries of every page until index_finish() is called.
+  indexpath is not used here: the previous index files are removed by the
+  first index_keyword() call (hts_index_init flag).
+  If tmpfile() fails, fp_tmpproject is NULL and index_keyword() will not
+  record any entry.
+*/
+void index_init(const char* indexpath) {
+#if HTS_MAKE_KEYWORD_INDEX
+  (void) indexpath;
+  hts_index_init=1;
+  hts_primindex_size=0;
+  hts_primindex_words=0;
+  /* Do not leak the temporary file of a previous, unfinished, session */
+  if (fp_tmpproject)
+    fclose(fp_tmpproject);
+  fp_tmpproject=tmpfile();
+#endif
+}
+
+
+/* 
+   Indexing system
+   A little bit dirty, (quick'n dirty, in fact)
+   But should be okay on most cases
+   Tags and javascript handled (ignored)
+
+   Scans the 'size' bytes of html_data, extracts the keywords found outside
+   tags, scripts and comments, counts them in a per-page hash table, then
+   appends one "word count file" line per distinct keyword to fp_tmpproject.
+
+   html_data  page contents
+   size       number of bytes to scan
+   mime       MIME type of the page; only text/html, image/svg+xml,
+              image/svg-xml, application/x-javascript and text/css are
+              processed (the last two start in "script" mode)
+   filename   name of the saved file; the indexpath prefix is cut if present
+   indexpath  directory of the index files
+
+   Returns 0 if a parameter is missing, if the MIME type is not handled, or
+   if the temporary file or the hash table could not be created; 1 otherwise
+   (always 1 when HTS_MAKE_KEYWORD_INDEX is disabled).
+*/
+int index_keyword(const char* html_data,LLint size,const char* mime,const char* filename,const char* indexpath) {
+#if HTS_MAKE_KEYWORD_INDEX
+  int intag=0,inscript=0,incomment=0;
+  char keyword[KEYW_LEN+32];
+  int i=0;
+  //
+  int WordIndexSize=1024;
+  inthash WordIndexHash=NULL;
+  FILE *tmpfp=NULL;
+  //
+
+  // Check parameters
+  if (!html_data)
+    return 0;
+  if (!size)
+    return 0;
+  if (!mime)
+    return 0;
+  if (!filename)
+    return 0;
+
+  // Init ? (first page since index_init(): remove the previous index files)
+  if (hts_index_init) {
+    remove(concat(indexpath,"index.txt"));
+    remove(concat(indexpath,"sindex.html"));
+    hts_index_init=0;
+  }
+
+  // Check MIME type
+  if (strfield2(mime,"text/html")) {
+    inscript=0;
+  } 
+  // FIXME - temporary fix for image/svg+xml (svg)
+  // "IN XML" (html like, in fact :) )
+  else if (
+    (strfield2(mime,"image/svg+xml"))
+    ||
+    (strfield2(mime,"image/svg-xml"))
+    ) {
+    inscript=0;
+  }
+  else if (
+    (strfield2(mime,"application/x-javascript"))
+    || (strfield2(mime,"text/css"))
+    ) {
+    inscript=1;
+  } else
+    return 0;
+
+  // Temporary file (list of the distinct keywords of this page)
+  tmpfp = tmpfile();
+  if (!tmpfp)
+    return 0;
+
+  // Create hash structure
+  // Hash tables rulez da world!
+  WordIndexHash=inthash_new(WordIndexSize);
+  if (!WordIndexHash) {
+    fclose(tmpfp);        /* do not leak the temporary file */
+    return 0;
+  }
+
+  // Start indexing this page
+  keyword[0]='\0';
+  while(i<size) {
+    if (strfield(html_data + i , "<script")) {
+      inscript=1;
+    } 
+    else if (strfield(html_data + i , "<!--")) {
+      incomment=1;
+    }
+    else if (strfield(html_data + i , "</script")) {
+      if (!incomment)
+        inscript=0;
+    } 
+    else if (strfield(html_data + i , "-->")) {
+      incomment=0;
+    }
+    else if (html_data[i]=='<') {
+      if (!inscript)
+        intag=1;
+    }    
+    else if (html_data[i]=='>') {
+      intag=0;
+    }    
+    else {    
+      // Okay, parse keywords
+      if ( (!inscript) && (!incomment) && (!intag) ) {
+        char cchar=html_data[i];
+        int pos;
+        int len=strlen(keyword);
+        
+        // Replace (ignore case, and so on..)
+        // A NUL byte is never transcoded: strchr() would match the
+        // terminator of the table
+        if ( cchar && ((pos=strcpos(KEYW_TRANSCODE_FROM,cchar))>=0) )
+          cchar=KEYW_TRANSCODE_TO[pos];
+        
+        // A NUL byte is never an accepted character (strchr() would match
+        // the terminator of KEYW_ACCEPT); it is handled as a separator below
+        if ( cchar && strchr(KEYW_ACCEPT,cchar) ) {
+          /* Ignore some characters at the beginning */
+          if ((len>0) || (!strchr(KEYW_IGNORE_BEG,cchar))) {
+            keyword[len++]=cchar;
+            keyword[len]='\0';
+          }
+        } else if ( (strchr(KEYW_SPACE,cchar)) || (!cchar) ) {
+
+
+          /* Avoid these words */
+          if (len>0) {
+            if (strchr(KEYW_NOT_BEG,keyword[0])) {
+              keyword[(len=0)]='\0';
+            }
+          }
+
+          /* Strip ending . and so */
+          {
+            int ok=0;
+            while((len=strlen(keyword)) && (!ok)) {
+              if (strchr(KEYW_STRIP_END,keyword[len-1])) {      /* strip it */
+                keyword[len-1]='\0';
+              } else
+                ok=1;
+            }
+          }
+          
+          /* Store it ? */
+          if (len >= KEYW_MIN_LEN ) {
+            hts_primindex_words++;
+            if (inthash_inc(WordIndexHash,keyword)) {   /* added new */
+              fprintf(tmpfp,"%s\n",keyword);
+            }
+          }
+          keyword[(len=0)]='\0';
+        } else      /* Invalid */
+          keyword[(len=0)]='\0';
+
+        /* Too long: drop the word being built */
+        if (len>KEYW_LEN) {
+          keyword[(len=0)]='\0';
+        }
+      }
+      
+    }
+    
+    i++;
+  }
+
+  // Reset temp file
+  fseek(tmpfp,0,SEEK_SET);
+
+  // Process indexing for this page
+  {
+    if (fp_tmpproject) {
+      while(!feof(tmpfp)) {
+        char line[KEYW_LEN + 32];
+        linput(tmpfp,line,KEYW_LEN + 2);
+        if (strnotempty(line)) {
+          unsigned long int e=0;
+          if (inthash_read(WordIndexHash,line,&e)) {
+            char savelst[HTS_URLMAXSIZE*2];
+            e++;          /* 0 means "once" */
+            
+            if (strncmp((const char*)fslash((char*)indexpath),filename,strlen(indexpath))==0)  // cut the indexpath prefix
+              strcpy(savelst,filename+strlen(indexpath));
+            else
+              strcpy(savelst,filename);
+            
+            // Add entry for this file and word
+            // (the count is stored reversed, see KEYW_SORT_MAXCOUNT)
+            fprintf(fp_tmpproject,"%s %d %s\n",line,(int) (KEYW_SORT_MAXCOUNT - e),savelst);
+            hts_primindex_size++;
+          }
+        }
+      }
+    }
+  }
+
+  // Delete temp file
+  fclose(tmpfp);
+  tmpfp=NULL;
+
+  // Clear hash table
+  inthash_delete(&WordIndexHash);
+#endif
+  return 1;
+}
+
+/*
+  Sort index!
+  Loads the whole temporary index (fp_tmpproject) in memory, sorts its lines
+  with qsort()/strcmp(), then writes the final index in indexpath:
+  "index.txt" if mode is 1, "sindex.html" otherwise (the HTML header,
+  letter anchors and table footer are only written if mode is 2).
+  The summary of a word (total hits, "ignored" mark for too common words) is
+  written when the next word starts.
+  NOTE(review): no summary is written for the very last word, and the
+  "ignored" mark is written over the entries without truncating the file -
+  confirm that this is intended.
+  fp_tmpproject is always closed and reset to NULL on return.
+*/
+void index_finish(const char* indexpath,int mode) {
+#if HTS_MAKE_KEYWORD_INDEX
+  char** tab;
+  char* blk;
+  int size;
+  
+  size=fpsize(fp_tmpproject);
+  if (size>0) {
+    if (fp_tmpproject) {
+      tab=(char**)malloct(sizeof(char*) * (hts_primindex_size+2) );
+      if (tab) {
+        blk = malloct(size+4);
+        if (blk) {
+          fseek(fp_tmpproject,0,SEEK_SET);
+          if ((int)fread(blk,1,size,fp_tmpproject) == size) {
+            char *a=blk,*b;
+            int index=0;
+            int i;
+            FILE* fp;
+
+            // fread() does not terminate the buffer: do it, so that the
+            // strchr() scan below can not run past the data read
+            blk[size]='\0';
+
+            // Split the block into lines
+            while( (b=strchr(a,'\n')) && (index < hts_primindex_size) ) {
+              tab[index++]=a;
+              *b='\0';
+              a=b+1;
+            }
+            
+            // Sort it!
+            qsort(tab,index,sizeof(char*),mystrcmp);
+
+            // Delete fp_tmpproject
+            fclose(fp_tmpproject);
+            fp_tmpproject=NULL;
+
+            // Write new file
+            if (mode == 1)      // TEXT
+              fp=fopen(concat(indexpath,"index.txt"),"wb");
+            else                // HTML
+              fp=fopen(concat(indexpath,"sindex.html"),"wb");
+            if (fp) {
+              char current_word[KEYW_LEN + 32];
+              char word[KEYW_LEN + 32];
+              int hit;
+              int total_hit=0;
+              int total_line=0;
+              int last_pos=0;
+              char word0='\0';
+              current_word[0]='\0';
+
+              if (mode == 2) {         // HTML
+                // One link per distinct first letter
+                for(i=0;i<index;i++) {
+                  if (word0 != tab[i][0]) {
+                    word0 = tab[i][0];
+                    fprintf(fp," <a href=\"#%c\">%c</a>\r\n",word0,word0);
+                  }
+                }
+                word0='\0';
+                fprintf(fp,"<br><br>\r\n");
+                fprintf(fp,"<table width=\"100%%\" border=\"0\">\r\n<tr>\r\n<td>word</td>\r\n<td>location\r\n");
+              }
+
+              for(i=0;i<index;i++) {
+                if (sscanf(tab[i],"%s %d",word,&hit) == 2) {
+                  // Locate the file name, after the second space
+                  char*  a=strchr(tab[i],' ');
+                  if (a) a=strchr(a+1,' ');
+                  if (a) {
+                    a++;                                /* skip the space */
+                    hit=KEYW_SORT_MAXCOUNT-hit;         /* back to the real hit count */
+                    if (strcmp(word,current_word)) {    /* New word */
+                      if (total_hit) {
+                        if (mode == 1)      // TEXT
+                          fprintf(fp,"\t=%d\r\n",total_hit);
+                        // Too common word?
+                        if ( 
+                              ( ((total_hit*1000 ) / hts_primindex_words) >= KEYW_USELESS1K   )
+                            ||
+                              ( ((total_line*1000) / index              ) >= KEYW_USELESS1KPG )
+                          ) {
+                          // Go back just after the word header
+                          fseek(fp,last_pos,SEEK_SET);
+                          if (mode == 1)      // TEXT
+                            fprintf(fp,"\tignored (%d)\r\n",((total_hit*1000)/hts_primindex_words));
+                          else
+                            fprintf(fp,"(ignored) [%d hits]<br>\r\n",total_hit);
+                        }
+                        else {
+                          if (mode == 1)      // TEXT
+                            fprintf(fp,"\t(%d)\r\n",((total_hit*1000)/hts_primindex_words));
+                        }
+                      }
+                      if (mode == 1)      // TEXT
+                        fprintf(fp,"%s\r\n",word);
+                      else {              // HTML
+                        fprintf(fp,"</td></tr>\r\n");
+                        if (word0 != word[0]) {
+                          word0 = word[0];
+                          fprintf(fp,"<th>%c</th>\r\n",word0);
+                          fprintf(fp,"<a name=\"%c\"></a>\r\n",word0);
+                        }
+                        fprintf(fp,"<tr>\r\n<td>%s</td>\r\n<td>\r\n",word);
+                      }
+                      // Remember where the entries of this word start
+                      fflush(fp); last_pos=ftell(fp);
+                      strcpy(current_word,word);
+                      total_hit=total_line=0;
+                    }
+                    total_hit+=hit;
+                    total_line++;
+                    if (mode == 1)      // TEXT
+                      fprintf(fp,"\t%d %s\r\n",hit,a);
+                    else                // HTML
+                      fprintf(fp,"<a href=\"%s\">%s</a> [%d hits]<br>\r\n",a,a,hit);
+                  }
+                }
+              }
+              if (mode == 2)         // HTML
+                fprintf(fp,"</td></tr>\r\n</table>\r\n");
+              fclose(fp);
+            }
+            
+          }
+          freet(blk);
+        }
+        freet(tab);
+      }
+
+    }
+  }
+  if (fp_tmpproject)
+    fclose(fp_tmpproject);
+  fp_tmpproject=NULL;
+#endif
+}
+
+
+/* Subroutines */
+
+#if HTS_MAKE_KEYWORD_INDEX
+/* Offset of the first occurrence of c in adr, or -1 if c is not found */
+int strcpos(char* adr,char c) {
+  const char* const hit=strchr(adr,c);
+  return (hit != NULL) ? (int)(hit-adr) : -1;
+}
+
+/* qsort() callback: both arguments point to a char* element; the two strings are ordered by strcmp() */
+int mystrcmp(const void* _e1,const void* _e2) {
+  const char* const left  = *(char* const*)_e1;
+  const char* const right = *(char* const*)_e2;
+  return strcmp(left,right);
+}
+#endif
+
author	Xavier Roche <xroche@users.noreply.github.com>	2012-03-19 12:36:11 +0000
committer	Xavier Roche <xroche@users.noreply.github.com>	2012-03-19 12:36:11 +0000
commit	ad5b7acc19290ff91e0f42a0de448a26760fcf99 (patch)
tree	2d1867758835fd0c4e443ff3cc7e5c774af85874 /src/htsindex.c