From 844ecc37072d515513177c65a8c9dc35c9cdfc1a Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Mon, 19 Mar 2012 12:55:42 +0000 Subject: httrack 3.33.16 --- src/htsparse.c | 4682 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 2448 insertions(+), 2234 deletions(-) (limited to 'src/htsparse.c') diff --git a/src/htsparse.c b/src/htsparse.c index 3d35252..79cc1cc 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -37,12 +37,12 @@ Please visit our Website: http://www.httrack.com /* ------------------------------------------------------------ */ +/* Internal engine bytecode */ +#define HTS_INTERNAL_BYTECODE -#include -#include -#include -#include +#ifndef _WIN32_WCE #include +#endif #include /* File defs */ @@ -92,7 +92,7 @@ Please visit our Website: http://www.httrack.com abortLogFmt("not enough memory for current html document in HT_ADD_CHK : realloct(%d) failed" _ ht_size); \ exit(1); \ } \ - } \ +} \ ht_len+=A; #define HT_ADD_ADR \ if ((opt->getmode & 1) && (ptr>0)) { \ @@ -103,11 +103,35 @@ Please visit our Website: http://www.httrack.com } #define HT_ADD(A) \ if ((opt->getmode & 1) && (ptr>0)) { \ - int i=strlen(A),j=ht_len; \ - if (i) { \ - HT_ADD_CHK(i) \ - memcpy(ht_buff+j, A, i); \ - ht_buff[j+i]='\0'; \ + int i_=strlen(A),j_=ht_len; \ + if (i_) { \ + HT_ADD_CHK(i_) \ + memcpy(ht_buff+j_, A, i_); \ + ht_buff[j_+i_]='\0'; \ + } } +#define HT_ADD_HTMLESCAPED(A) \ + if ((opt->getmode & 1) && (ptr>0)) { \ + int i_, j_; \ + char BIGSTK tempo_[HTS_URLMAXSIZE*2]; \ + escape_for_html_print(A, tempo_); \ + i_=strlen(tempo_); \ + j_=ht_len; \ + if (i_) { \ + HT_ADD_CHK(i_) \ + memcpy(ht_buff+j_, tempo_, i_); \ + ht_buff[j_+i_]='\0'; \ + } } +#define HT_ADD_HTMLESCAPED_FULL(A) \ + if ((opt->getmode & 1) && (ptr>0)) { \ + int i_, j_; \ + char BIGSTK tempo_[HTS_URLMAXSIZE*2]; \ + escape_for_html_print_full(A, tempo_); \ + i_=strlen(tempo_); \ + j_=ht_len; \ + if (i_) { \ + HT_ADD_CHK(i_) \ + memcpy(ht_buff+j_, tempo_, i_); \ + ht_buff[j_+i_]='\0'; \ } } #define HT_ADD_START \ int ht_size=(int)(r->size*5)/4+REALLOC_SIZE; \ @@ -126,12 +150,11 @@ Please visit our Website: http://www.httrack.com #define HT_ADD_END { \ int ok=0;\ if (ht_buff) { \ - INTsys file_len=(INTsys) strlen(ht_buff);\ char digest[32+2];\ digest[0]='\0';\ - domd5mem(ht_buff,file_len,digest,1);\ - if (fsize(fconv(savename))==file_len) { \ - int mlen;\ + domd5mem(ht_buff,ht_len,digest,1);\ + if (fsize(fconv(savename))==ht_len) { \ + int mlen = 0;\ char* mbuff;\ cache_readdata(cache,"//[HTML-MD5]//",savename,&mbuff,&mlen);\ if (mlen) mbuff[mlen]='\0';\ @@ -148,8 +171,8 @@ Please visit our Website: http://www.httrack.com if (!ok) { \ fp=filecreate(savename); \ if (fp) { \ - if (file_len>0) {\ - if ((INTsys)fwrite(ht_buff,1,file_len,fp) != file_len) { \ + if (ht_len>0) {\ + if ((INTsys)fwrite(ht_buff,1,ht_len,fp) != ht_len) { \ int fcheck;\ if ((fcheck=check_fatal_io_errno())) {\ opt->state.exit_xh=-1;\ @@ -186,32 +209,32 @@ Please visit our Website: http://www.httrack.com filenote(savename,NULL); \ }\ if (cache->ndx)\ - cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename,digest,(int)strlen(digest));\ + cache_writedata(cache->ndx,cache->dat,"//[HTML-MD5]//",savename,digest,(int)strlen(digest));\ } \ freet(ht_buff); ht_buff=NULL; \ - } +} #define HT_ADD_FOP // COPY IN HTSCORE.C #define HT_INDEX_END do { \ -if (!makeindex_done) { \ -if (makeindex_fp) { \ - char tempo[1024]; \ + if (!makeindex_done) { \ + if (makeindex_fp) { \ + char BIGSTK tempo[1024]; \ if (makeindex_links == 1) { \ - sprintf(tempo,""CRLF,makeindex_firstlink); \ + sprintf(tempo,""CRLF,makeindex_firstlink); \ } else \ - tempo[0]='\0'; \ + tempo[0]='\0'; \ fprintf(makeindex_fp,template_footer, \ - "", \ - tempo \ - ); \ + "", \ + tempo \ + ); \ fflush(makeindex_fp); \ fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \ makeindex_fp=NULL; \ usercommand(opt,0,NULL,fconcat(opt->path_html,"index.html"),"primary","primary"); \ -} \ -} \ -makeindex_done=1; /* ok c'est fait */ \ + } \ + } \ + makeindex_done=1; /* ok c'est fait */ \ } while(0) // Enregistrement d'un lien: @@ -228,50 +251,50 @@ makeindex_done=1; /* ok c'est fait */ \ // COPIE DE HTSCORE.C #define liens_record(A,F,S,FA,FF) { \ -int notecode=0; \ -int lienurl_len=((sizeof(lien_url)+HTS_ALIGN-1)/HTS_ALIGN)*HTS_ALIGN,\ + int notecode=0; \ + int lienurl_len=((sizeof(lien_url)+HTS_ALIGN-1)/HTS_ALIGN)*HTS_ALIGN,\ adr_len=strlen(A),\ fil_len=strlen(F),\ sav_len=strlen(S),\ cod_len=0,\ former_adr_len=strlen(FA),\ former_fil_len=strlen(FF); \ -if (former_adr_len>0) {\ + if (former_adr_len>0) {\ former_adr_len=(former_adr_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \ former_fil_len=(former_fil_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \ -} else former_adr_len=former_fil_len=0;\ -if (strlen(F)>6) if (strnotempty(codebase)) if (strfield(F+strlen(F)-6,".class")) { notecode=1; \ -cod_len=strlen(codebase); cod_len=(cod_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; } \ -adr_len=(adr_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; fil_len=(fil_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; sav_len=(sav_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \ -if ((int) lien_size < (int) (adr_len+fil_len+sav_len+cod_len+former_adr_len+former_fil_len+lienurl_len)) { \ -lien_buffer=(char*) ((void*) calloct(add_tab_alloc,1)); \ -lien_size=add_tab_alloc; \ -if (lien_buffer!=NULL) { \ -liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \ -liens[lien_tot]->firstblock=1; \ -} \ -} else { \ -liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \ -liens[lien_tot]->firstblock=0; \ -} \ -if (liens[lien_tot]!=NULL) { \ -liens[lien_tot]->adr=lien_buffer; lien_buffer+=adr_len; lien_size-=adr_len; \ -liens[lien_tot]->fil=lien_buffer; lien_buffer+=fil_len; lien_size-=fil_len; \ -liens[lien_tot]->sav=lien_buffer; lien_buffer+=sav_len; lien_size-=sav_len; \ -liens[lien_tot]->cod=NULL; \ -if (notecode) { liens[lien_tot]->cod=lien_buffer; lien_buffer+=cod_len; lien_size-=cod_len; strcpybuff(liens[lien_tot]->cod,codebase); } \ -if (former_adr_len>0) {\ -liens[lien_tot]->former_adr=lien_buffer; lien_buffer+=former_adr_len; lien_size-=former_adr_len; \ -liens[lien_tot]->former_fil=lien_buffer; lien_buffer+=former_fil_len; lien_size-=former_fil_len; \ -strcpybuff(liens[lien_tot]->former_adr,FA); \ -strcpybuff(liens[lien_tot]->former_fil,FF); \ -}\ -strcpybuff(liens[lien_tot]->adr,A); \ -strcpybuff(liens[lien_tot]->fil,F); \ -strcpybuff(liens[lien_tot]->sav,S); \ -liens_record_sav_len(liens[lien_tot]); \ -hash_write(hashptr,lien_tot,opt->urlhack); \ -} \ + } else former_adr_len=former_fil_len=0;\ + if (strlen(F)>6) if (strnotempty(codebase)) if (strfield(F+strlen(F)-6,".class")) { notecode=1; \ + cod_len=strlen(codebase); cod_len=(cod_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; } \ + adr_len=(adr_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; fil_len=(fil_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; sav_len=(sav_len/HTS_ALIGN)*HTS_ALIGN+HTS_ALIGN*2; \ + if ((int) lien_size < (int) (adr_len+fil_len+sav_len+cod_len+former_adr_len+former_fil_len+lienurl_len)) { \ + lien_buffer=(char*) ((void*) calloct(add_tab_alloc,1)); \ + lien_size=add_tab_alloc; \ + if (lien_buffer!=NULL) { \ + liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \ + liens[lien_tot]->firstblock=1; \ + } \ + } else { \ + liens[lien_tot]=(lien_url*) (void*) lien_buffer; lien_buffer+=lienurl_len; lien_size-=lienurl_len; \ + liens[lien_tot]->firstblock=0; \ + } \ + if (liens[lien_tot]!=NULL) { \ + liens[lien_tot]->adr=lien_buffer; lien_buffer+=adr_len; lien_size-=adr_len; \ + liens[lien_tot]->fil=lien_buffer; lien_buffer+=fil_len; lien_size-=fil_len; \ + liens[lien_tot]->sav=lien_buffer; lien_buffer+=sav_len; lien_size-=sav_len; \ + liens[lien_tot]->cod=NULL; \ + if (notecode) { liens[lien_tot]->cod=lien_buffer; lien_buffer+=cod_len; lien_size-=cod_len; strcpybuff(liens[lien_tot]->cod,codebase); } \ + if (former_adr_len>0) {\ + liens[lien_tot]->former_adr=lien_buffer; lien_buffer+=former_adr_len; lien_size-=former_adr_len; \ + liens[lien_tot]->former_fil=lien_buffer; lien_buffer+=former_fil_len; lien_size-=former_fil_len; \ + strcpybuff(liens[lien_tot]->former_adr,FA); \ + strcpybuff(liens[lien_tot]->former_fil,FF); \ + }\ + strcpybuff(liens[lien_tot]->adr,A); \ + strcpybuff(liens[lien_tot]->fil,F); \ + strcpybuff(liens[lien_tot]->sav,S); \ + liens_record_sav_len(liens[lien_tot]); \ + hash_write(hashptr,lien_tot,opt->urlhack); \ + } \ } #define ENGINE_LOAD_CONTEXT() \ @@ -314,32 +337,67 @@ hash_write(hashptr,lien_tot,opt->urlhack); \ #define ENGINE_SAVE_CONTEXT() \ /* Apply changes */ \ - * ( (int*) (str->lien_tot_) ) = lien_tot; \ - * ( (int*) (str->ptr_) ) = ptr; \ - * ( (int*) (str->lien_size_) ) = lien_size; \ - * ( (char**) (str->lien_buffer_) ) = lien_buffer; \ - /* */ \ - * stre->error_ = error; \ - * stre->store_errpage_ = store_errpage; \ - * stre->lien_max_ = lien_max; \ - /* */ \ - *stre->makeindex_done_ = makeindex_done; \ - *stre->makeindex_fp_ = makeindex_fp; \ - *stre->makeindex_links_ = makeindex_links; \ - /* */ \ - *stre->stat_fragment_ = stat_fragment + * ( (int*) (str->lien_tot_) ) = lien_tot; \ + * ( (int*) (str->ptr_) ) = ptr; \ + * ( (int*) (str->lien_size_) ) = lien_size; \ + * ( (char**) (str->lien_buffer_) ) = lien_buffer; \ + /* */ \ + * stre->error_ = error; \ + * stre->store_errpage_ = store_errpage; \ + * stre->lien_max_ = lien_max; \ + /* */ \ + *stre->makeindex_done_ = makeindex_done; \ + *stre->makeindex_fp_ = makeindex_fp; \ + *stre->makeindex_links_ = makeindex_links; \ + /* */ \ + *stre->stat_fragment_ = stat_fragment #define _FILTERS (*opt->filters.filters) #define _FILTERS_PTR (opt->filters.filptr) #define _ROBOTS ((robots_wizard*)opt->robotsptr) +/* Apply current *adr character for the script automate */ +#define AUTOMATE_LOOKUP_CURRENT_ADR() do { \ + if (inscript) { \ + int new_state_pos; \ + new_state_pos=inscript_state[inscript_state_pos][(unsigned char)*adr]; \ + if (new_state_pos < 0) { \ + new_state_pos=inscript_state[inscript_state_pos][INSCRIPT_DEFAULT]; \ + } \ + assertf(new_state_pos >= 0); \ + assertf(new_state_pos*sizeof(inscript_state[0]) < sizeof(inscript_state)); \ + inscript_state_pos=new_state_pos; \ + } \ +} while(0) + +/* Increment current pointer to 'steps' characters, modifying automate if necessary */ +#define INCREMENT_CURRENT_ADR(steps) do { \ + int steps__ = (steps); \ + while(steps__ > 0) { \ + adr++; \ + AUTOMATE_LOOKUP_CURRENT_ADR(); \ + steps__ --; \ + } \ +} while(0) + /* Main parser */ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { /* Load engine variables */ ENGINE_LOAD_CONTEXT(); - + #if HTS_ANALYSTE + { + char* cAddr = r->adr; + int cSize = (int) r->size; + if ( (opt->debug>0) && (opt->log!=NULL) ) { + fspc(opt->log,"info"); fprintf(opt->log,"engine: preprocess-html: %s%s"LF, urladr, urlfil); + } + if (hts_htmlcheck_preprocess(&cAddr, &cSize, urladr, urlfil) == 1) { + r->adr = cAddr; + r->size = cSize; + } + } if (hts_htmlcheck(r->adr,(int)r->size,urladr,urlfil)) { #endif FILE* fp=NULL; // fichier écrit localement @@ -348,8 +406,8 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { if ( (opt->debug>1) && (opt->log!=NULL) ) { fspc(opt->log,"debug"); fprintf(opt->log,"scan file.."LF); test_flush; } - - + + // Indexing! #if HTS_MAKE_KEYWORD_INDEX if (opt->kindex) { @@ -364,13 +422,13 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { } } #endif - + // Now, parsing if ((opt->getmode & 1) && (ptr>0)) { // récupérer les html sur disque // créer le fichier html local HT_ADD_FOP; // écrire peu à peu le fichier } - + if (!error) { int detect_title=0; // détection du title int back_add_stats = opt->state.back_add_stats; @@ -410,10 +468,11 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { char* intag_start=adr; char* intag_startattr=NULL; int intag_start_valid=0; + int intag_ctype=0; // int parent_relative=0; // the parent is the base path (.js, .css..) HT_ADD_START; // débuter - + /* Initialize script automate for comments, quotes.. */ memset(inscript_state, 0xff, sizeof(inscript_state)); inscript_state[INSCRIPT_START][INSCRIPT_DEFAULT]=INSCRIPT_START; /* by default, stay in START */ @@ -444,12 +503,12 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { /* statistics */ if ((opt->getmode & 1) && (ptr>0)) { - /* - HTS_STAT.stat_files++; - HTS_STAT.stat_bytes+=r->size; + /* + HTS_STAT.stat_files++; + HTS_STAT.stat_bytes+=r->size; */ } - + /* Primary list or URLs */ if (ptr == 0) { intag=1; @@ -457,28 +516,46 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { } /* Check is the file is a .js file */ else if ( - (strfield2(r->contenttype,"application/x-javascript")!=0) - || (strfield2(r->contenttype,"text/css")!=0) + (compare_mime(r->contenttype, str->url_file, "application/x-javascript")!=0) + || (compare_mime(r->contenttype, str->url_file, "text/css")!=0) ) { /* JavaScript js file */ - inscript=1; - inscript_name="script"; - intag=1; // because après