diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/htsalias.c | 2 | ||||
-rw-r--r-- | src/htsback.c | 69 | ||||
-rw-r--r-- | src/htscache.c | 35 | ||||
-rw-r--r-- | src/htscore.c | 179 | ||||
-rw-r--r-- | src/htscore.h | 1 | ||||
-rw-r--r-- | src/htscoremain.c | 34 | ||||
-rw-r--r-- | src/htsglobal.h | 4 | ||||
-rw-r--r-- | src/htshelp.c | 3 | ||||
-rw-r--r-- | src/htsindex.c | 6 | ||||
-rw-r--r-- | src/htslib.c | 146 | ||||
-rw-r--r-- | src/htslib.h | 25 | ||||
-rw-r--r-- | src/htsmodules.h | 1 | ||||
-rw-r--r-- | src/htsname.c | 42 | ||||
-rw-r--r-- | src/htsname.h | 11 | ||||
-rw-r--r-- | src/htsopt.h | 2 | ||||
-rw-r--r-- | src/htsparse.c | 47 | ||||
-rw-r--r-- | src/htstools.c | 8 |
17 files changed, 486 insertions, 129 deletions
diff --git a/src/htsalias.c b/src/htsalias.c index d06936f..1a413fe 100644 --- a/src/htsalias.c +++ b/src/htsalias.c @@ -176,6 +176,8 @@ const char* hts_optalias[][4] = { {"disable-module","-%w","param1",""}, {"no-background-on-suspend","-y0","single",""}, {"background-on-suspend","-y","single",""}, + {"utf8-conversion","-%T","single",""}, + {"no-utf8-conversion","-%T0","single",""}, /* */ /* DEPRECATED */ diff --git a/src/htsback.c b/src/htsback.c index a6b19ab..a7fe76c 100644 --- a/src/htsback.c +++ b/src/htsback.c @@ -128,7 +128,7 @@ void back_delete_all(httrackp* opt, cache_back* cache, struct_back* sback) { #ifndef HTS_NO_BACK_ON_DISK char *filename = (char*) item->value.ptr; if (filename != NULL) { - (void) unlink(filename); + (void) UNLINK(filename); } #else /* clear entry content (but not yet the entry) */ @@ -196,7 +196,7 @@ static int back_index_ready(httrackp* opt, struct_back* sback, char* adr, char* FILE *fp; char* fileback = (char*) ptr; char catbuff[CATBUFF_SIZE]; - if (( fp = fopen(fconv(catbuff, fileback), "rb") ) != NULL ) { + if (( fp = FOPEN(fconv(catbuff, fileback), "rb") ) != NULL ) { if (back_unserialize(fp, &itemback) != 0) { if (itemback != NULL) { back_clear_entry(itemback); @@ -217,7 +217,7 @@ static int back_index_ready(httrackp* opt, struct_back* sback, char* adr, char* test_flush; } } - (void) unlink(fileback); + (void) UNLINK(fileback); #else itemback = (lien_back*) ptr; #endif @@ -293,10 +293,10 @@ int back_cleanup_background(httrackp* opt,cache_back* cache,struct_back* sback) if (opt->getmode != 0) { sprintf(filename, "%s.tmp", back[i].url_sav); } else { - sprintf(filename, "%stmpfile%d.tmp", StringBuff(opt->path_html), opt->state.tmpnameid++); + sprintf(filename, "%stmpfile%d.tmp", StringBuff(opt->path_html_utf8), opt->state.tmpnameid++); } /* Security check */ - if (fexist(filename)) { + if (fexist_utf8(filename)) { if (opt->log != NULL) { HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: warning: temporary file %s already exists"LF, filename); test_flush; @@ -323,7 +323,7 @@ int back_cleanup_background(httrackp* opt,cache_back* cache,struct_back* sback) } else { if (opt->log != NULL) { int last_errno = errno; - HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: warning: serialize error for %s%s to %s: open error: %s (%s, %s)"LF, back[i].url_adr, back[i].url_fil, filename, strerror(last_errno), dir_exists(filename) ? "directory exists" : "directory does NOT exist!", fexist(filename) ? "file already exists!" : "file does not exist"); + HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: warning: serialize error for %s%s to %s: open error: %s (%s, %s)"LF, back[i].url_adr, back[i].url_fil, filename, strerror(last_errno), dir_exists(filename) ? "directory exists" : "directory does NOT exist!", fexist_utf8(filename) ? "file already exists!" : "file does not exist"); test_flush; } } @@ -501,7 +501,7 @@ int back_finalize(httrackp* opt,cache_back* cache,struct_back* sback,int p) { back[p].tmpfile=tmpnam(back[p].tmpfile_buffer); #endif if (back[p].tmpfile != NULL && back[p].tmpfile[0] != '\0') { - back[p].r.out=fopen(back[p].tmpfile,"wb"); + back[p].r.out=FOPEN(back[p].tmpfile,"wb"); if (back[p].r.out) { if ((back[p].r.adr) && (back[p].r.size>0)) { if (fwrite(back[p].r.adr,1,(size_t)back[p].r.size,back[p].r.out) != back[p].r.size) { @@ -541,12 +541,12 @@ int back_finalize(httrackp* opt,cache_back* cache,struct_back* sback,int p) { back[p].r.statuscode=STATUSCODE_INVALID; strcpybuff(back[p].r.msg,"Read error when decompressing"); } - unlink(back[p].url_sav); + UNLINK(back[p].url_sav); } } } /* encore that no remaining temporary file exists */ - unlink(back[p].tmpfile); + UNLINK(back[p].tmpfile); back[p].tmpfile = NULL; } // stats @@ -578,10 +578,10 @@ int back_finalize(httrackp* opt,cache_back* cache,struct_back* sback,int p) { if (back[p].r.is_write) { // Written file if (may_be_hypertext_mime(opt,back[p].r.contenttype, back[p].url_fil)) { // to parse! off_t sz; - sz=fsize(back[p].url_sav); + sz=fsize_utf8(back[p].url_sav); if (sz>0) { // ok, exists! if (sz < 8192) { // ok, small file --> to parse! - FILE* fp=fopen(back[p].url_sav,"rb"); + FILE* fp=FOPEN(back[p].url_sav,"rb"); if (fp) { back[p].r.adr=malloct((int)sz + 2); if (back[p].r.adr) { @@ -599,7 +599,7 @@ int back_finalize(httrackp* opt,cache_back* cache,struct_back* sback,int p) { fclose(fp); fp=NULL; // remove (temporary) file! - unlink(fconv(catbuff,back[p].url_sav)); + UNLINK(fconv(catbuff,back[p].url_sav)); } if (fp) fclose(fp); @@ -909,6 +909,7 @@ int back_unserialize(FILE *fp, lien_back** dst) { } /* serialize a reference ; used to store references of files being downloaded in case of broken download */ +/* Note: NOT utf-8 */ int back_serialize_ref(httrackp* opt, const lien_back* src) { char *filename = url_savename_refname_fullpath(opt, src->url_adr, src->url_fil); FILE *fp = fopen(filename, "wb"); @@ -934,7 +935,7 @@ int back_serialize_ref(httrackp* opt, const lien_back* src) { /* unserialize a reference ; used to store references of files being downloaded in case of broken download */ int back_unserialize_ref(httrackp* opt, const char *adr, const char *fil, lien_back** dst) { char *filename = url_savename_refname_fullpath(opt, adr, fil); - FILE *fp = fopen(filename, "rb"); + FILE *fp = FOPEN(filename, "rb"); if (fp != NULL) { int ser = back_unserialize(fp, dst); fclose(fp); @@ -1174,7 +1175,7 @@ int back_flush_output(httrackp* opt, cache_back* cache, struct_back* sback, int /* écrire date "remote" */ if (strnotempty(back[p].url_sav) && strnotempty(back[p].r.lastmodified) - && fexist(back[p].url_sav)) // normalement existe si on a un fichier de sortie + && fexist_utf8(back[p].url_sav)) // normalement existe si on a un fichier de sortie { set_filetime_rfc822(back[p].url_sav,back[p].r.lastmodified); } @@ -1268,7 +1269,7 @@ int back_clear_entry(lien_back* back) { // only for security if (back->tmpfile && back->tmpfile[0] != '\0') { - (void) unlink(back->tmpfile); + (void) UNLINK(back->tmpfile); back->tmpfile = NULL; } @@ -1471,7 +1472,7 @@ int back_add(struct_back* sback,httrackp* opt,cache_back* cache,char* adr,char* if (pos<0) { // pas de mise en cache data, vérifier existence #endif /* note: no check with IS_DELAYED_EXT() enabled - postcheck by client please! */ - if (save[0] != '\0' && !IS_DELAYED_EXT(save) && fsize(fconv(catbuff,save)) <= 0) { // fichier final n'existe pas ou est vide! + if (save[0] != '\0' && !IS_DELAYED_EXT(save) && fsize_utf8(fconv(catbuff,save)) <= 0) { // fichier final n'existe pas ou est vide! int found=0; /* It is possible that the file has been moved due to changes in build structure */ @@ -1483,9 +1484,9 @@ int back_add(struct_back* sback,httrackp* opt,cache_back* cache,char* adr,char* /* Is supposed to be on disk only */ if (r.is_write && previous_save[0] != '\0') { /* Exists, but with another (old) filename: rename (almost) silently */ - if (strcmp(previous_save, save) != 0 && fexist(fconv(catbuff, previous_save))) { + if (strcmp(previous_save, save) != 0 && fexist_utf8(fconv(catbuff, previous_save))) { rename(fconv(catbuff, previous_save), fconv(catbuff2,save)); - if (fexist(fconv(catbuff,save))) { + if (fexist_utf8(fconv(catbuff,save))) { found = 1; if ((opt->debug>1) && (opt->log!=NULL)) { HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"File '%s' has been renamed since last mirror to '%s' ; applying changes"LF, previous_save, save); test_flush; @@ -1511,8 +1512,8 @@ int back_add(struct_back* sback,httrackp* opt,cache_back* cache,char* adr,char* // sinon, le fichier est ok à priori, mais on renverra un if-modified-since pour // en être sûr if (opt->norecatch) { // tester norecatch - if (!fexist(fconv(catbuff,save))) { // fichier existe pas mais déclaré: on l'a effacé - FILE* fp=fopen(fconv(catbuff,save),"wb"); + if (!fexist_utf8(fconv(catbuff,save))) { // fichier existe pas mais déclaré: on l'a effacé + FILE* fp=FOPEN(fconv(catbuff,save),"wb"); if (fp) fclose(fp); if (opt->log!=NULL) { HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Previous file '%s' not found (erased by user ?), ignoring: %s%s"LF,save,back[p].url_adr,back[p].url_fil); test_flush; @@ -1655,7 +1656,7 @@ int back_add(struct_back* sback,httrackp* opt,cache_back* cache,char* adr,char* } /* Not in cache ; maybe in temporary cache ? Warning: non-movable "url_sav" */ else if (back_unserialize_ref(opt, adr, fil, &itemback) == 0) { - const long file_size = fsize(itemback->url_sav); + const long file_size = fsize_utf8(itemback->url_sav); /* Found file on disk */ if (file_size > 0) { char *send_too = back[p].send_too; @@ -1686,8 +1687,8 @@ int back_add(struct_back* sback,httrackp* opt,cache_back* cache,char* adr,char* itemback = NULL; } /* Not in cache or temporary cache ; found on disk ? (hack) */ - else if (fexist(save)) { - off_t sz=fsize(save); + else if (fexist_utf8(save)) { + off_t sz=fsize_utf8(save); // Bon, là il est possible que le fichier ait été partiellement transféré // (s'il l'avait été en totalité il aurait été inscrit dans le cache ET existerait sur disque) // PAS de If-Modified-Since, on a pas connaissance des données à la date du cache @@ -2668,7 +2669,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti else if (back[i].status==STATUS_FTP_TRANSFER) { // en réception ftp if (!fexist(back[i].location_buffer)) { // terminé FILE* fp; - fp=fopen(fconcat(OPT_GET_BUFF(opt), back[i].location_buffer,".ok"),"rb"); + fp=FOPEN(fconcat(OPT_GET_BUFF(opt), back[i].location_buffer,".ok"),"rb"); if (fp) { int j=0; fscanf(fp,"%d ",&(back[i].r.statuscode)); @@ -2679,7 +2680,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti } back[i].r.msg[j++]='\0'; fclose(fp); - unlink(fconcat(OPT_GET_BUFF(opt), back[i].location_buffer,".ok")); + UNLINK(fconcat(OPT_GET_BUFF(opt), back[i].location_buffer,".ok")); strcpybuff(fconcat(OPT_GET_BUFF(opt), back[i].location_buffer,".ok"),""); } else { strcpybuff(back[i].r.msg,"Unknown ftp result, check if file is ok"); @@ -2772,7 +2773,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti back[i].tmpfile=tmpnam(back[p].tmpfile_buffer); #endif if (back[i].tmpfile != NULL && back[i].tmpfile[0]) { - if ((back[i].r.out=fopen(back[i].tmpfile,"wb")) == NULL) { + if ((back[i].r.out=FOPEN(back[i].tmpfile,"wb")) == NULL) { last_errno = errno; } } @@ -3292,7 +3293,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti // with an error : consider a 304 error if (!opt->delete_old) { if (HTTP_IS_ERROR(back[i].r.statuscode) && back[i].is_update && !back[i].testmode) { - if (back[i].url_sav[0] && fexist(back[i].url_sav)) { + if (back[i].url_sav[0] && fexist_utf8(back[i].url_sav)) { if ((opt->debug>1) && (opt->log!=NULL)) { HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Error ignored %d (%s) because of 'no purge' option for %s%s"LF,back[i].r.statuscode,back[i].r.msg,back[i].url_adr,back[i].url_fil); test_flush; } @@ -3350,7 +3351,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti if (back[i].r.statuscode==HTTP_OK && !back[i].testmode) { // 'OK' if (!is_hypertext_mime(opt,back[i].r.contenttype, back[i].url_fil)) { // not HTML if (strnotempty(back[i].url_sav)) { // target found - int size = fsize(back[i].url_sav); // target size + int size = fsize_utf8(back[i].url_sav); // target size if (size >= 0) { if (back[i].r.totalsize == size) { // same size! deletehttp(&back[i].r); back[i].r.soc=INVALID_SOCKET; @@ -3473,11 +3474,11 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti // In case of 'if-unmodified-since' hack, a 304 status can be sent // then, force 'ok' status if (back[i].r.statuscode == STATUSCODE_INVALID) { - if (fexist(back[i].url_sav)) { + if (fexist_utf8(back[i].url_sav)) { back[i].r.statuscode=HTTP_OK; // OK strcpybuff(back[i].r.msg, "OK (cached)"); back[i].r.is_file=1; - back[i].r.totalsize = back[i].r.size = fsize(back[i].url_sav); + back[i].r.totalsize = back[i].r.size = fsize_utf8(back[i].url_sav); get_httptype(opt,back[i].r.contenttype, back[i].url_sav, 1); if ((opt->debug>0) && (opt->log!=NULL)) { HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"Not-modified status without cache guessed: %s%s"LF,back[i].url_adr,back[i].url_fil); test_flush; @@ -3544,7 +3545,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti // traiter 206 (partial content) // xxc SI CHUNK VERIFIER QUE CA MARCHE?? if (back[i].r.statuscode==206) { // on nous envoie un morceau (la fin) coz une partie sur disque! - off_t sz=fsize(back[i].url_sav); + off_t sz=fsize_utf8(back[i].url_sav); #if HDEBUG printf("partial content: "LLintP" on disk..\n",(LLint)sz); #endif @@ -3553,7 +3554,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti if (opt->getmode&2) { // on peut ecrire des non html **sinon ben euhh sera intercepté plus loin, donc rap sur ce qui va sortir** filenote(&opt->state.strc,back[i].url_sav,NULL); // noter fichier comme connu file_notify(opt,back[i].url_adr, back[i].url_fil, back[i].url_sav, 0, 1, back[i].r.notmodified); - back[i].r.out=fopen(fconv(catbuff,back[i].url_sav),"ab"); // append + back[i].r.out=FOPEN(fconv(catbuff,back[i].url_sav),"ab"); // append if (back[i].r.out) { back[i].r.is_write=1; // écrire back[i].r.size=sz; // déja écrit @@ -3577,7 +3578,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti } } } else { // mémoire - FILE* fp=fopen(fconv(catbuff,back[i].url_sav),"rb"); + FILE* fp=FOPEN(fconv(catbuff,back[i].url_sav),"rb"); if (fp) { LLint alloc_mem=sz + 1; if (back[i].r.totalsize>0) @@ -3703,7 +3704,7 @@ void back_wait(struct_back* sback,httrackp* opt,cache_back* cache,TStamp stat_ti #if HTS_REMOVE_BAD_FILES if (back[i].status<0) { if (!back[i].testmode) { // pas en test - unlink(back[i].url_sav); // éliminer fichier (endommagé) + UNLINK(back[i].url_sav); // éliminer fichier (endommagé) //printf("&& %s\n",back[i].url_sav); } } diff --git a/src/htscache.c b/src/htscache.c index 9ff8055..2679111 100644 --- a/src/htscache.c +++ b/src/htscache.c @@ -147,7 +147,7 @@ void cache_mayadd(httrackp* opt,cache_back* cache,htsblk* r,const char* url_adr, // stocker fichiers (et robots.txt) if ( url_save == NULL || (strnotempty(url_save)) || (strcmp(url_fil,"/robots.txt")==0)) { // ajouter le fichier au cache - cache_add(opt,cache,r,url_adr,url_fil,url_save,opt->all_in_cache,StringBuff(opt->path_html)); + cache_add(opt,cache,r,url_adr,url_fil,url_save,opt->all_in_cache,StringBuff(opt->path_html_utf8)); // // store a reference NOT to redo the same test zillions of times! // (problem reported by Lars Clausen) @@ -367,9 +367,9 @@ void cache_add(httrackp* opt,cache_back* cache,const htsblk *r,const char* url_a } else { FILE* fp; // On recopie le fichier->. - off_t file_size=fsize(fconv(catbuff, url_save)); + off_t file_size=fsize_utf8(fconv(catbuff, url_save)); if (file_size>=0) { - fp=fopen(fconv(catbuff, url_save),"rb"); + fp=FOPEN(fconv(catbuff, url_save),"rb"); if (fp!=NULL) { char BIGSTK buff[32768]; size_t nl; @@ -498,10 +498,10 @@ void cache_add(httrackp* opt,cache_back* cache,const htsblk *r,char* url_adr,cha } else { // recopier fichier dans cache FILE* fp; // On recopie le fichier->. - off_t file_size=fsize(fconv(catbuff, url_save)); + off_t file_size=fsize_utf8(fconv(catbuff, url_save)); if (file_size>=0) { if (cache_wLLint(cache_dat,file_size)!=-1) { - fp=fopen(fconv(catbuff, url_save),"rb"); + fp=FOPEN(fconv(catbuff, url_save),"rb"); if (fp!=NULL) { char BIGSTK buff[32768]; ssize_t nl; @@ -678,9 +678,9 @@ static htsblk cache_readex_new(httrackp* opt,cache_back* cache,const char* adr,c /* Previous entry */ if (previous_save_[0] != '\0') { - int pathLen = (int) strlen(StringBuff(opt->path_html)); - if (pathLen != 0 && strncmp(previous_save_, StringBuff(opt->path_html), pathLen) != 0) { // old (<3.40) buggy format - sprintf(previous_save, "%s%s", StringBuff(opt->path_html), previous_save_); + int pathLen = (int) strlen(StringBuff(opt->path_html_utf8)); + if (pathLen != 0 && strncmp(previous_save_, StringBuff(opt->path_html_utf8), pathLen) != 0) { // old (<3.40) buggy format + sprintf(previous_save, "%s%s", StringBuff(opt->path_html_utf8), previous_save_); } else { strcpy(previous_save, previous_save_); } @@ -710,8 +710,8 @@ static htsblk cache_readex_new(httrackp* opt,cache_back* cache,const char* adr,c r.is_write=1; // écrire if (!dataincache) { - if (fexist(fconv(catbuff, save))) { // un fichier existe déja - //if (fsize(fconv(save))==r.size) { // même taille -- NON tant pis (taille mal declaree) + if (fexist_utf8(fconv(catbuff, save))) { // un fichier existe déja + //if (fsize_utf8(fconv(save))==r.size) { // même taille -- NON tant pis (taille mal declaree) ok=1; // plus rien à faire filenote(&opt->state.strc,save,NULL); // noter comme connu file_notify(opt,adr, fil, save, 0, 0, 1); // data in cache @@ -797,8 +797,8 @@ static htsblk cache_readex_new(httrackp* opt,cache_back* cache,const char* adr,c strcpybuff(r.msg,"Previous cache file not found (2)"); } } else { /* Read in memory from cache */ - if (strnotempty(previous_save) && fexist(previous_save)) { - FILE* fp = fopen(fconv(catbuff, previous_save), "rb"); + if (strnotempty(previous_save) && fexist_utf8(previous_save)) { + FILE* fp = FOPEN(fconv(catbuff, previous_save), "rb"); if (fp != NULL) { r.adr = (char*) malloct((int) r.size + 4); if (r.adr != NULL) { @@ -1016,8 +1016,8 @@ static htsblk cache_readex_old(httrackp* opt,cache_back* cache,const char* adr,c int ok=0; r.is_write=1; // écrire - if (fexist(fconv(catbuff, save))) { // un fichier existe déja - //if (fsize(fconv(save))==r.size) { // même taille -- NON tant pis (taille mal declaree) + if (fexist_utf8(fconv(catbuff, save))) { // un fichier existe déja + //if (fsize_utf8(fconv(save))==r.size) { // même taille -- NON tant pis (taille mal declaree) ok=1; // plus rien à faire filenote(&opt->state.strc,save,NULL); // noter comme connu file_notify(opt,adr, fil, save, 0, 0, 0); @@ -1082,8 +1082,8 @@ static htsblk cache_readex_old(httrackp* opt,cache_back* cache,const char* adr,c r.statuscode=STATUSCODE_INVALID; strcpybuff(r.msg,"Previous cache file not found (2)"); } else { /* Read in memory from cache */ - if (strnotempty(return_save) && fexist(return_save)) { - FILE* fp = fopen(fconv(catbuff, return_save), "rb"); + if (strnotempty(return_save) && fexist_utf8(return_save)) { + FILE* fp = FOPEN(fconv(catbuff, return_save), "rb"); if (fp != NULL) { r.adr = (char*) malloct((size_t)r.size + 4); if (r.adr != NULL) { @@ -1685,10 +1685,12 @@ void cache_init(cache_back* cache,httrackp* opt) { // lire un fichier.. (compatible \0) +/* Note: NOT utf-8 */ char* readfile(char* fil) { return readfile2(fil, NULL); } +/* Note: NOT utf-8 */ char* readfile2(char* fil, LLint* size) { char* adr=NULL; char catbuff[CATBUFF_SIZE]; @@ -1714,6 +1716,7 @@ char* readfile2(char* fil, LLint* size) { return adr; } +/* Note: NOT utf-8 */ char* readfile_or(char* fil,char* defaultdata) { char* realfile=fil; char* ret; diff --git a/src/htscore.c b/src/htscore.c index fc352f4..8d62df7 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -64,6 +64,9 @@ Please visit our Website: http://www.httrack.com /* Cache */ #include "htszlib.h" +/* Charset handling */ +#include "htscharset.h" + /* END specific definitions */ @@ -256,7 +259,7 @@ if (makeindex_fp) { \ fflush(makeindex_fp); \ fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \ makeindex_fp=NULL; \ - usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),StringBuff(opt->path_html),"index.html"),"",""); \ + usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),StringBuff(opt->path_html_utf8),"index.html"),"",""); \ } \ } \ makeindex_done=1; /* ok c'est fait */ \ @@ -601,7 +604,7 @@ int httpmirror(char* url1, httrackp* opt) { // lien primaire - liens_record("primary","/primary",fslash(OPT_GET_BUFF(opt),fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),"index.html")),"","",opt->urlhack); + liens_record("primary","/primary",fslash(OPT_GET_BUFF(opt),fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html_utf8),"index.html")),"","",opt->urlhack); if (liens[lien_tot]==NULL) { // erreur, pas de place réservée printf("PANIC! : Not enough memory [%d]\n",__LINE__); if (opt->log) { @@ -890,6 +893,8 @@ int httpmirror(char* url1, httrackp* opt) { str.lien_size_ = &lien_size; str.lien_buffer_ = &lien_buffer; /* */ + str.page_charset_ = NULL; + /* */ /* */ stre.r_ = &r; /* */ @@ -1048,6 +1053,33 @@ int httpmirror(char* url1, httrackp* opt) { (is_hypertext_mime(opt,r.contenttype, urlfil) /* Is HTML or Js, .. */ || may_be_hypertext_mime(opt,r.contenttype, urlfil)) /* Is real media, .. */ ) { + + /* Convert charset to UTF-8 - NOT! (what about links ? remote server side will have troubles with converted names) */ + //if (r.adr != NULL && r.size != 0 && opt->convert_utf8) { + // char *charset; + // char *pos; + // if (r.charset[0] != '\0') { + // charset = strdup(r.charset); + // } else { + // charset = hts_getCharsetFromMeta(r.adr, r.size); + // } + // if (charset != NULL) { + // char *const utf8 = hts_convertStringToUTF8(r.adr, r.size, charset); + // /* Use new buffer */ + // if (utf8 != NULL) { + // freet(r.adr); + // r.size = strlen(utf8); + // r.adr = utf8; + // /* New UTF-8 charset */ + // r.charset[0] = '\0'; + // strcpy(r.charset, "utf-8"); + // } + // /* Free charset */ + // free(charset); + // } + //} + + /* Check bogus chars */ if ((r.adr) && (r.size)) { unsigned int map[256]; int i; @@ -1199,10 +1231,10 @@ int httpmirror(char* url1, httrackp* opt) { // if (r.adr==NULL) { // Written file // if (may_be_hypertext_mime(r.contenttype, urlfil)) { // to parse! // LLint sz; - // sz=fsize(savename); + // sz=fsize_utf8(savename); // if (sz>0) { // ok, exists! // if (sz < 8192) { // ok, small file --> to parse! - // FILE* fp=fopen(savename,"rb"); + // FILE* fp=FOPEN(savename,"rb"); // if (fp) { // r.adr=malloct((int)sz + 2); // if (r.adr) { @@ -1285,6 +1317,8 @@ int httpmirror(char* url1, httrackp* opt) { str.lien_size_ = &lien_size; str.lien_buffer_ = &lien_buffer; /* */ + str.page_charset_ = NULL; + /* */ /* */ stre.r_ = &r; /* */ @@ -1401,6 +1435,7 @@ int httpmirror(char* url1, httrackp* opt) { // -- -- -- -- // Parsing HTML if (!error) { + char page_charset[32]; /* Remove file if being processed */ if (is_loaded_from_file) { @@ -1408,6 +1443,23 @@ int httpmirror(char* url1, httrackp* opt) { is_loaded_from_file = 0; } + /* Detect charset to convert links into proper UTF8 filenames */ + page_charset[0] = '\0'; + if (opt->convert_utf8) { + if (r.charset[0] != '\0') { + if (strlen(r.charset) < sizeof(page_charset)) { + strcpy(page_charset, r.charset); + } + } else if (is_html_mime_type(r.contenttype)) { + char *const charset = hts_getCharsetFromMeta(r.adr, r.size); + if (charset != NULL && strlen(charset) < sizeof(page_charset)) { + strcpy(page_charset, charset); + } + if (charset != NULL) + free(charset); + } + } + /* Info for wrappers */ if ( (opt->debug>0) && (opt->log!=NULL) ) { HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: check-html: %s%s"LF,urladr,urlfil); @@ -1442,6 +1494,8 @@ int httpmirror(char* url1, httrackp* opt) { str.lien_size_ = &lien_size; str.lien_buffer_ = &lien_buffer; /* */ + str.page_charset_ = page_charset[0] != '\0' ? page_charset : NULL; + /* */ /* */ stre.r_ = &r; /* */ @@ -1750,7 +1804,7 @@ int httpmirror(char* url1, httrackp* opt) { HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(Real Media): parsing %s"LF,savename); test_flush; } if (fexist(savename)) { // ok, existe bien! - FILE* fp=fopen(savename,"r+b"); + FILE* fp=FOPEN(savename,"r+b"); if (fp) { if (!fseek(fp,0,SEEK_SET)) { char BIGSTK line[HTS_URLMAXSIZE*2]; @@ -2328,7 +2382,7 @@ static int mkdir_compat(const char *pathname) { /* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */ HTSEXT_API int dir_exists(const char* path) { - struct stat st; + STRUCT_STAT st; char BIGSTK file[HTS_URLMAXSIZE*2]; int i = 0; if (strnotempty(path) == 0) { @@ -2356,7 +2410,7 @@ HTSEXT_API int dir_exists(const char* path) { file[i + 1] = '\0'; /* Check the final dir */ - if (stat(file, &st) == 0 && S_ISDIR(st.st_mode)) { + if (STAT(file, &st) == 0 && S_ISDIR(st.st_mode)) { errno = 0; return 1; /* EXISTS */ } @@ -2365,6 +2419,7 @@ HTSEXT_API int dir_exists(const char* path) { } /* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */ +/* Note: *not* UTF-8 */ HTSEXT_API int structcheck(const char* path) { struct stat st; char BIGSTK tmpbuf[HTS_URLMAXSIZE*2]; @@ -2459,6 +2514,102 @@ HTSEXT_API int structcheck(const char* path) { return 0; } +/* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */ +/* Note: UTF-8 */ +HTSEXT_API int structcheck_utf8(const char* path) { + STRUCT_STAT st; + char BIGSTK tmpbuf[HTS_URLMAXSIZE*2]; + char BIGSTK file[HTS_URLMAXSIZE*2]; + int i = 0; + int npaths; + if (strnotempty(path) == 0) + return 0; + if (strlen(path) > HTS_URLMAXSIZE) { + errno = EINVAL; + return -1; + } + + /* Get a copy */ + strcpybuff(file, path); +#ifdef _WIN32 + /* To system name */ + for(i = 0 ; file[i] != 0 ; i++) { + if (file[i] == '/') { + file[i] = PATH_SEPARATOR; + } + } +#endif + /* Get prefix (note: file can not be empty here) */ + for(i = (int) strlen(file) - 1 ; i > 0 && file[i] != PATH_SEPARATOR ; i--); + for( ; i > 0 && file[i] == PATH_SEPARATOR ; i--); + file[i + 1] = '\0'; + + /* First check the final dir */ + if (STAT(file, &st) == 0 && S_ISDIR(st.st_mode)) { + return 0; /* OK */ + } + + /* Start from the beginning */ + i = 0; + + /* Skip irrelevant part (the root slash, or the drive path) */ +#ifdef _WIN32 + if (file[0] != 0 && file[1] == ':') { /* f:\ */ + i+= 2; + if (file[i] == PATH_SEPARATOR) { /* f:\ */ + i++; + } + } else if (file[0] == PATH_SEPARATOR && file[1] == PATH_SEPARATOR) { /* \\mch */ + i+= 2; + } +#endif + + /* Check paths */ + for(npaths = 1 ; ; npaths++) { + char end_char; + + /* Go to next path */ + + /* Skip separator(s) */ + for( ; file[i] == PATH_SEPARATOR ; i++); + /* Next separator */ + for( ; file[i] != 0 && file[i] != PATH_SEPARATOR ; i++); + + /* Check */ + end_char = file[i]; + if (end_char != 0) { + file[i] = '\0'; + } + if (STAT(file, &st) == 0) { /* Something exists */ + if (!S_ISDIR(st.st_mode)) { +#if HTS_REMOVE_ANNOYING_INDEX + if (S_ISREG(st.st_mode)) { /* Regular file in place ; move it and create directory */ + sprintf(tmpbuf, "%s.txt", file); + if (RENAME(file, tmpbuf) != 0) { /* Can't rename regular file */ + return -1; + } + if (MKDIR(file) != 0) { /* Can't create directory */ + return -1; + } + } +#else +#error Not implemented +#endif + } + } else { /* Nothing exists ; create directory */ + if (MKDIR(file) != 0) { /* Can't create directory */ + return -1; + } + } + if (end_char == 0) { /* End */ + break; + } else { + file[i] = end_char; /* Restore / */ + } + } + return 0; +} + // sauver un fichier int filesave(httrackp* opt,const char* adr,int len,const char* s,const char* url_adr,const char* url_fil) { FILE* fp; @@ -2497,6 +2648,7 @@ int check_fatal_io_errno(void) { // ouvrir un fichier (avec chemin Un*x) +/* Note: utf-8 */ FILE* filecreate(filenote_strc *strc, const char* s) { char BIGSTK fname[HTS_URLMAXSIZE*2]; FILE* fp; @@ -2523,17 +2675,17 @@ FILE* filecreate(filenote_strc *strc, const char* s) { #endif /* Try to open the file */ - fp = fopen(fname, "wb"); + fp = FOPEN(fname, "wb"); /* Error ? Check the directory structure and retry. */ if (fp == NULL) { last_errno = errno; - if (structcheck(s) != 0) { + if (structcheck_utf8(s) != 0) { last_errno = errno; } else { last_errno = 0; } - fp = fopen(fname, "wb"); + fp = FOPEN(fname, "wb"); } if (fp == NULL && last_errno != 0) { errno = last_errno; @@ -2571,7 +2723,7 @@ FILE* fileappend(filenote_strc *strc,const char* s) { #endif // ouvrir - fp=fopen(fname,"ab"); + fp=FOPEN(fname,"ab"); #ifndef _WIN32 if (fp!=NULL) chmod(fname,HTS_ACCESS_FILE); @@ -2616,6 +2768,7 @@ int filenote(filenote_strc *strc, const char* s, filecreate_params* params) { return 1; } +/* Note: utf-8 */ void file_notify(httrackp* opt,const char* adr,const char* fil,const char* save,int create,int modify,int not_updated) { RUN_CALLBACK6(opt, filesave2, adr, fil, save, create, modify, not_updated); } @@ -2681,7 +2834,7 @@ static void postprocess_file(httrackp* opt,const char* save, const char* adr, co int n; if (rsc_fil == NULL) rsc_fil = fil; - if (strncmp(fslash(OPT_GET_BUFF(opt),save), fslash(OPT_GET_BUFF(opt),StringBuff(opt->path_html)), (n = (int)strlen(StringBuff(opt->path_html)))) == 0) { + if (strncmp(fslash(OPT_GET_BUFF(opt),save), fslash(OPT_GET_BUFF(opt),StringBuff(opt->path_html_utf8)), (n = (int)strlen(StringBuff(opt->path_html_utf8)))) == 0) { rsc_save += n; } @@ -2716,7 +2869,7 @@ static void postprocess_file(httrackp* opt,const char* save, const char* adr, co } } if (opt->state.mimehtml_created == 1 && opt->state.mimefp != NULL) { - FILE* fp = fopen(save, "rb"); + FILE* fp = FOPEN(save, "rb"); if (fp != NULL) { char buff[60*100 + 2]; char mimebuff[256]; diff --git a/src/htscore.h b/src/htscore.h index a3467f4..21161a9 100644 --- a/src/htscore.h +++ b/src/htscore.h @@ -340,6 +340,7 @@ void usercommand_exe(const char* cmd,const char* file); int filters_init(char*** ptrfilters, int maxfilter, int filterinc); #ifndef HTTRACK_DEFLIB HTSEXT_API int structcheck(const char* path); +HTSEXT_API int structcheck_utf8(const char* path); HTSEXT_API int dir_exists(const char* path); #endif HTS_INLINE int fspc(httrackp *opt,FILE* fp,const char* type); diff --git a/src/htscoremain.c b/src/htscoremain.c index e7d7ad0..3654c7d 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -47,6 +47,7 @@ Please visit our Website: http://www.httrack.com #include "htswrap.h" #include "htsmodules.h" #include "htszlib.h" +#include "htscharset.h" #include <ctype.h> #if USE_BEGINTHREAD @@ -394,6 +395,22 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp *opt) { } // for + // Convert path to UTF-8 +#ifdef _WIN32 + { + char *const path = hts_convertStringSystemToUTF8(StringBuff(opt->path_html), (int) StringLength(opt->path_html)); + if (path != NULL) { + StringCopy(opt->path_html_utf8, path); + free(path); + } else { + StringCopyN(opt->path_html_utf8, StringBuff(opt->path_html), StringLength(opt->path_html)); + } + } +#else + // Assume UTF-8 filesystem. + StringCopyN(opt->path_html_utf8, StringBuff(opt->path_html), StringLength(opt->path_html)); +#endif + /* if doit.log exists, or if new URL(s) defined, then DO NOT load standard config files */ /* (config files are added in doit.log) */ @@ -1058,6 +1075,7 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp *opt) { case 'i': opt->dir_topindex = 1; if (*(com+1)=='0') { opt->dir_topindex=0; com++; } break; case 'N': opt->savename_delayed = 2; if (isdigit((unsigned char)*(com+1))) { sscanf(com+1,"%d",&opt->savename_delayed); while(isdigit((unsigned char)*(com+1))) com++; } break; case 'D': opt->delayed_cached=1; if (*(com+1)=='0') { opt->delayed_cached=0; com++; } break; // url hack + case 'T': opt->convert_utf8=1; if (*(com+1)=='0') { opt->convert_utf8=0; com++; } break; // convert to utf-8 case '!': opt->bypass_limits = 1; if (*(com+1)=='0') { opt->bypass_limits=0; com++; } break; #if HTS_USEMMS case 'm': sscanf(com+1,"%d",&opt->mms_maxtime); while(isdigit((unsigned char)*(com+1))) com++; break; @@ -2114,10 +2132,20 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp *opt) { t, url); fprintf(opt->log,"("); for(i=0;i<argc;i++) { - if (strchr(argv[i],' ') == NULL || strchr(argv[i],'\"') != NULL) - fprintf(opt->log,"%s ",argv[i]); +#ifdef _WIN32 + char *carg = hts_convertStringSystemToUTF8(argv[i], (int) strlen(argv[i])); + char *arg = carg != NULL ? carg : argv[i]; +#else + const char *arg = argv[i]; +#endif + if (strchr(arg, ' ') == NULL || strchr(arg, '\"') != NULL) + fprintf(opt->log,"%s ", arg); else // entre "" (si espace(s) et pas déja de ") - fprintf(opt->log,"\"%s\" ",argv[i]); + fprintf(opt->log,"\"%s\" ", arg); +#ifdef _WIN32 + if (carg != NULL) + free(carg); +#endif } fprintf(opt->log,")"LF); fprintf(opt->log,LF); diff --git a/src/htsglobal.h b/src/htsglobal.h index 2fc430f..2b10a8a 100644 --- a/src/htsglobal.h +++ b/src/htsglobal.h @@ -40,8 +40,8 @@ Please visit our Website: http://www.httrack.com #define HTTRACK_GLOBAL_DEFH // Version (also check external version information) -#define HTTRACK_VERSION "3.45-4" -#define HTTRACK_VERSIONID "3.45.4" +#define HTTRACK_VERSION "3.46-1" +#define HTTRACK_VERSIONID "3.46.1" #define HTTRACK_AFF_VERSION "3.x" #define HTTRACK_LIB_VERSION "2.0" diff --git a/src/htshelp.c b/src/htshelp.c index 676ed46..6ba34c6 100644 --- a/src/htshelp.c +++ b/src/htshelp.c @@ -262,7 +262,7 @@ void help_wizard(httrackp* opt) { linput(stdin,str,250); if (strnotempty(str)) { if (!((str[0]=='y') || (str[0]=='Y'))) - return 0; + return ; } printf("\n"); @@ -468,6 +468,7 @@ void help(char* app,int more) { infomsg(" o *generate output html file in case of error (404..) (o0 don't generate)"); infomsg(" X *purge old files after update (X0 keep delete)"); infomsg(" %p preserve html files 'as is' (identical to '-K4 -%F \"\"')"); + infomsg(" %T links conversion to UTF-8"); infomsg(""); infomsg("Spider options:"); infomsg(" bN accept cookies in cookies.txt (0=do not accept,* 1=accept)"); diff --git a/src/htsindex.c b/src/htsindex.c index f4a984b..eea1c47 100644 --- a/src/htsindex.c +++ b/src/htsindex.c @@ -142,6 +142,7 @@ void index_init(const char* indexpath) { But should be okay on most cases Tags and javascript handled (ignored) */ +/* Note: utf-8 */ int index_keyword(const char* html_data,LLint size,const char* mime,const char* filename,const char* indexpath) { #if HTS_MAKE_KEYWORD_INDEX char catbuff[CATBUFF_SIZE]; @@ -166,8 +167,8 @@ int index_keyword(const char* html_data,LLint size,const char* mime,const char* // Init ? if (hts_index_init) { - remove(concat(catbuff,indexpath,"index.txt")); - remove(concat(catbuff,indexpath,"sindex.html")); + UNLINK(concat(catbuff,indexpath,"index.txt")); + UNLINK(concat(catbuff,indexpath,"sindex.html")); hts_index_init=0; } @@ -338,6 +339,7 @@ int index_keyword(const char* html_data,LLint size,const char* mime,const char* /* Sort index! */ +/* Note: NOT utf-8 */ void index_finish(const char* indexpath,int mode) { #if HTS_MAKE_KEYWORD_INDEX char catbuff[CATBUFF_SIZE]; diff --git a/src/htslib.c b/src/htslib.c index c2fcc7d..1c1e54e 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -56,6 +56,7 @@ Please visit our Website: http://www.httrack.com #include "htswrap.h" #include "htsmd5.h" #include "htsmodules.h" +#include "htscharset.h" #ifdef _WIN32 #ifndef _WIN32_WCE @@ -745,7 +746,7 @@ int http_xfopen(httrackp *opt,int mode,int treat,int waitconnect,char* xsend,cha else { // Note: On passe par un FILE* (plus propre) //soc=open(fil,O_RDONLY,0); // en lecture seule! - retour->fp=fopen(fconv(OPT_GET_BUFF(opt), unescape_http(OPT_GET_BUFF(opt),fil)),"rb"); // ouvrir + retour->fp=FOPEN(fconv(OPT_GET_BUFF(opt), unescape_http(OPT_GET_BUFF(opt),fil)),"rb"); // ouvrir if (retour->fp==NULL) soc=INVALID_SOCKET; else @@ -861,7 +862,7 @@ int http_sendhead(httrackp *opt,t_cookie* cookie,int mode,char* xsend,char* adr, search_tag=strstr(fil,POSTTOK"file:"); if (search_tag) { // postfile if (mode==0) { // GET! - FILE* fp=fopen(unescape_http(OPT_GET_BUFF(opt),search_tag+strlen(POSTTOK)+5),"rb"); + FILE* fp=FOPEN(unescape_http(OPT_GET_BUFF(opt),search_tag+strlen(POSTTOK)+5),"rb"); if (fp) { char BIGSTK line[1100]; char BIGSTK protocol[256],url[HTS_URLMAXSIZE*2],method[256]; @@ -2710,9 +2711,9 @@ int set_filetime_rfc822(const char* file, const char* date) { } int get_filetime_rfc822(const char* file, char* date) { - struct stat buf; + STRUCT_STAT buf; date[0] = '\0'; - if (stat(file, &buf) == 0) { + if (STAT(file, &buf) == 0) { struct tm* A; time_t tt = buf.st_mtime; A=gmtime(&tt); @@ -4306,6 +4307,7 @@ void fprintfio(FILE* fp,char* buff,char* prefix) { } /* Le fichier existe-t-il? (ou est-il accessible?) */ +/* Note: NOT utf-8 */ int fexist(const char* s) { char catbuff[CATBUFF_SIZE]; struct stat st; @@ -4318,27 +4320,44 @@ int fexist(const char* s) { return 0; } +/* Le fichier existe-t-il? (ou est-il accessible?) */ +/* Note: utf-8 */ +int fexist_utf8(const char* s) { + char catbuff[CATBUFF_SIZE]; + STRUCT_STAT st; + memset(&st, 0, sizeof(st)); + if (STAT(fconv(catbuff,s), &st) == 0) { + if (S_ISREG(st.st_mode)) { + return 1; + } + } + return 0; +} + /* Taille d'un fichier, -1 si n'existe pas */ -/* fp->_cnt ne fonctionne pas sur toute les plate-formes :-(( */ -/* Note: NOT YET READY FOR 64-bit */ +/* Note: NOT utf-8 */ off_t fsize(const char* s) { - char catbuff[CATBUFF_SIZE]; - FILE* fp; - if (strnotempty(s)==0) // nom vide: erreur + struct stat st; + if (!strnotempty(s)) // nom vide: erreur return -1; - fp=fopen(fconv(catbuff,s),"rb"); - if (fp!=NULL) { - off_t i; - fseek(fp,0,SEEK_END); -#ifdef HTS_FSEEKO - i=ftello(fp); -#else - i=ftell(fp); -#endif - fclose(fp); - return i; - } else + if (stat(s, &st) == 0) { + return st.st_size; + } else { return -1; + } +} + +/* Taille d'un fichier, -1 si n'existe pas */ +/* Note: utf-8 */ +off_t fsize_utf8(const char* s) { + STRUCT_STAT st; + if (!strnotempty(s)) // nom vide: erreur + return -1; + if (STAT(s, &st) == 0) { + return st.st_size; + } else { + return -1; + } } off_t fpsize(FILE* fp) { @@ -5029,7 +5048,7 @@ FILE *hts_dgb_(void) { #ifdef _WIN32_WCE hts_dgb_init_fp = fopen("\\Temp\\hts-debug.txt", "wb"); #else - hts_dgb_init_fp = fopen("hts-debug.txt", "wb"); + hts_dgb_init_fp = FOPEN("hts-debug.txt", "wb"); #endif if (hts_dgb_init_fp != NULL) { fprintf(hts_dgb_init_fp, "* Creating file\r\n"); @@ -5296,6 +5315,7 @@ HTSEXT_API httrackp *hts_create_opt(void) { opt->urlhack=1; // url hack (normalizer) StringCopy(opt->footer,HTS_DEFAULT_FOOTER); opt->ftp_proxy=1; // proxy http pour ftp + opt->convert_utf8 = 1; // convert html to UTF-8 StringCopy(opt->filelist,""); StringCopy(opt->lang_iso,"en, *"); StringCopy(opt->mimedefs,"\n"); // aucun filtre mime (\n IMPORTANT) @@ -5308,6 +5328,7 @@ HTSEXT_API httrackp *hts_create_opt(void) { opt->keyboard=0; // StringCopy(opt->path_html,""); + StringCopy(opt->path_html_utf8,""); StringCopy(opt->path_log,""); StringCopy(opt->path_bin,""); // @@ -5420,6 +5441,7 @@ HTSEXT_API void hts_free_opt(httrackp *opt) { StringFree(opt->mod_blacklist); StringFree(opt->path_html); + StringFree(opt->path_html_utf8); StringFree(opt->path_log); StringFree(opt->path_bin); @@ -5690,6 +5712,86 @@ int closedir(DIR *dir) { errno = EBADF; return -1; } + +// UTF-8 aware FILE API + +static void copyWchar(LPWSTR dest, const char *src) { + int i; + for(i = 0 ; src[i] ; i++) { + dest[i] = src[i]; + } + dest[i] = '\0'; +} + +FILE* hts_fopen_utf8(const char *path, const char *mode) { + WCHAR wmode[32]; + LPWSTR wpath = hts_convertUTF8StringToUCS2(path, strlen(path), NULL); + assertf(strlen(mode) < sizeof(wmode) / sizeof(WCHAR)); + copyWchar(wmode, mode); + if (wpath != NULL) { + FILE *const fp = _wfopen(wpath, wmode); + free(wpath); + return fp; + } else { + // Fallback on conversion error. + return fopen(path, mode); + } +} + +int hts_stat_utf8(const char *path, STRUCT_STAT *buf) { + LPWSTR wpath = hts_convertUTF8StringToUCS2(path, strlen(path), NULL); + if (wpath != NULL) { + const int result = _wstat(wpath, buf); + free(wpath); + return result; + } else { + // Fallback on conversion error. + return stat(path, buf); + } +} + +int hts_unlink_utf8(const char *path) { + LPWSTR wpath = hts_convertUTF8StringToUCS2(path, strlen(path), NULL); + if (wpath != NULL) { + const int result = _wunlink(wpath); + free(wpath); + return result; + } else { + // Fallback on conversion error. + return unlink(path); + } +} + +int hts_rename_utf8(const char *oldpath, const char *newpath) { + LPWSTR woldpath = hts_convertUTF8StringToUCS2(oldpath, strlen(oldpath), NULL); + LPWSTR wnewpath = hts_convertUTF8StringToUCS2(newpath, strlen(newpath), NULL); + if (woldpath != NULL && wnewpath != NULL) { + const int result = _wrename(woldpath, wnewpath); + free(woldpath); + free(wnewpath); + return result; + } else { + if (woldpath != NULL) + free(woldpath); + if (wnewpath != NULL) + free(wnewpath); + // Fallback on conversion error. + return rename(oldpath, newpath); + } +} + +int hts_mkdir_utf8(const char *path) { + LPWSTR wpath = hts_convertUTF8StringToUCS2(path, strlen(path), NULL); + if (wpath != NULL) { + const int result = _wmkdir(wpath); + free(wpath); + return result; + } else { + // Fallback on conversion error. + return mkdir(path); + } +} + #endif // Fin diff --git a/src/htslib.h b/src/htslib.h index d9b6a42..521fd3c 100644 --- a/src/htslib.h +++ b/src/htslib.h @@ -411,9 +411,11 @@ int sig_ignore_flag( int setflag ); // flag ignore void cut_path(char* fullpath,char* path,char* pname); int fexist(const char* s); +int fexist_utf8(const char* s); /*LLint fsize(const char* s); */ off_t fpsize(FILE* fp); off_t fsize(const char* s); +off_t fsize_utf8(const char* s); /* root dir */ #ifndef HTTRACK_DEFLIB HTSEXT_API char* hts_rootdir(char* file); @@ -488,6 +490,29 @@ void *hts_get_callback(t_hts_htmlcheck_callbacks *callbacks, const char *name); ) */ +/* UTF-8 aware FILE operations */ +#ifdef _WIN32 +#define FOPEN hts_fopen_utf8 +extern FILE* hts_fopen_utf8(const char *path, const char *mode); +#define STAT hts_stat_utf8 +typedef struct _stat STRUCT_STAT; +extern int hts_stat_utf8(const char *path, STRUCT_STAT *buf); +#define UNLINK hts_unlink_utf8 +extern int hts_unlink_utf8(const char *pathname); +#define RENAME hts_rename_utf8 +extern int hts_rename_utf8(const char *oldpath, const char *newpath); +#define MKDIR(F) hts_mkdir_utf8(F) +extern int hts_mkdir_utf8(const char *pathname); +#else +/* The underlying filesystem charset is supposed to be UTF-8 */ +#define FOPEN fopen +#define STAT stat +typedef struct stat STRUCT_STAT; +#define UNLINK unlink +#define RENAME rename +#define MKDIR(F) mkdir(F, HTS_ACCESS_FOLDER) +#endif + #endif // internals #undef PATH_SEPARATOR diff --git a/src/htsmodules.h b/src/htsmodules.h index 2712b8f..e03354e 100644 --- a/src/htsmodules.h +++ b/src/htsmodules.h @@ -119,6 +119,7 @@ struct htsmoduleStruct { int* ptr_; size_t* lien_size_; char** lien_buffer_; + const char *page_charset_; /* Internal use - please don't touch */ }; diff --git a/src/htsname.c b/src/htsname.c index e5f0cb5..e5b0715 100644 --- a/src/htsname.c +++ b/src/htsname.c @@ -43,6 +43,7 @@ Please visit our Website: http://www.httrack.com #include "md5.h" #include "htsmd5.h" #include "htstools.h" +#include "htscharset.h" #include <ctype.h> #undef test_flush @@ -119,16 +120,28 @@ static void cleanDoubleSlash(char *s) { } } +// legacy version, without page charset +int url_savename(char* adr_complete, char* fil_complete, char* save, + char* former_adr, char* former_fil, + char* referer_adr, char* referer_fil, + httrackp* opt, + lien_url** liens, int lien_tot, + struct_back* sback, cache_back* cache, hash_struct* hash, + int ptr, int numero_passe, const lien_back* headers) { + return url_savename2(adr_complete, fil_complete, save, former_adr, former_fil, + referer_adr, referer_fil, opt, + liens, lien_tot, sback, cache, hash, ptr, numero_passe, headers, /* unknown */ NULL); +} // forme le nom du fichier à sauver (save) à partir de fil et adr // système intelligent, qui renomme en cas de besoin (exemple: deux INDEX.HTML et index.html) -int url_savename(char* adr_complete, char* fil_complete, char* save, - char* former_adr, char* former_fil, - char* referer_adr, char* referer_fil, - httrackp* opt, - lien_url** liens, int lien_tot, - struct_back* sback, cache_back* cache, hash_struct* hash, - int ptr, int numero_passe, const lien_back* headers) { +int url_savename2(char* adr_complete, char* fil_complete, char* save, + char* former_adr, char* former_fil, + char* referer_adr, char* referer_fil, + httrackp* opt, + lien_url** liens, int lien_tot, + struct_back* sback, cache_back* cache, hash_struct* hash, + int ptr, int numero_passe, const lien_back* headers, const char *charset) { char catbuff[CATBUFF_SIZE]; const char* mime_type = ( headers && !HTTP_IS_REDIRECT(headers->r.statuscode) ) ? headers->r.contenttype : NULL; /*const char* mime_type = ( headers && HTTP_IS_OK(headers->r.statuscode) ) ? headers->r.contenttype : NULL;*/ @@ -1306,6 +1319,15 @@ int url_savename(char* adr_complete, char* fil_complete, char* save, /* ensure that there is no ../ (potential vulnerability) */ fil_simplifie(save); + /* convert name to UTF-8 ? */ + if (charset != NULL && charset[0] != '\0') { + char *const s = hts_convertStringToUTF8(save, (int) strlen(save), charset); + if (s != NULL) { + strcpy(save, s); + free(s); + } + } + /* callback */ RUN_CALLBACK5(opt, savename, adr_complete,fil_complete,referer_adr,referer_fil,save); @@ -1333,9 +1355,9 @@ int url_savename(char* adr_complete, char* fil_complete, char* save, } // chemin primaire éventuel A METTRE AVANT - if (strnotempty(StringBuff(opt->path_html))) { + if (strnotempty(StringBuff(opt->path_html_utf8))) { char BIGSTK tempo[HTS_URLMAXSIZE*2]; - strcpybuff(tempo,StringBuff(opt->path_html)); + strcpybuff(tempo,StringBuff(opt->path_html_utf8)); strcatbuff(tempo,save); strcpybuff(save,tempo); } @@ -1531,7 +1553,7 @@ char *url_savename_refname_fullpath(httrackp* opt, const char *adr, const char * /* remove refname if any */ void url_savename_refname_remove(httrackp* opt, const char *adr, const char *fil) { char *filename = url_savename_refname_fullpath(opt, adr, fil); - (void) unlink(filename); + (void) UNLINK(filename); } #undef test_flush diff --git a/src/htsname.h b/src/htsname.h index 225fa92..7cb7dda 100644 --- a/src/htsname.h +++ b/src/htsname.h @@ -96,6 +96,17 @@ int url_savename(char* adr_complete, char* fil_complete, char* save, hash_struct* hash, int ptr, int numero_passe, const lien_back* headers); +int url_savename2(char* adr_complete, char* fil_complete, char* save, + char* former_adr, char* former_fil, + char* referer_adr, char* referer_fil, + httrackp* opt, + lien_url** liens, int lien_tot, + struct_back* sback, + cache_back* cache, + hash_struct* hash, + int ptr, int numero_passe, + const lien_back* headers, + const char *charset); void standard_name(char* b,char* dot_pos,char* nom_pos,char* fil_complete,int short_ver); void url_savename_addstr(char* d,char* s); char* url_md5(char* digest_buffer, char* fil_complete); diff --git a/src/htsopt.h b/src/htsopt.h index b5e0212..2f586bb 100644 --- a/src/htsopt.h +++ b/src/htsopt.h @@ -309,6 +309,7 @@ struct httrackp { String from; // from String path_log; // chemin pour cache et log String path_html; // chemin pour miroir + String path_html_utf8; // chemin pour miroir, UTF-8 String path_bin; // chemin pour templates int retry; // nombre d'essais supplémentaires en cas d'échec int makestat; // mettre à jour un fichier log de statistiques de transfert @@ -349,6 +350,7 @@ struct httrackp { String lang_iso; // en, fr .. String mimedefs; // ext1=mimetype1\next2=mimetype2.. String mod_blacklist; // (3.41) + int convert_utf8; // UTF-8 conversion ; 3.46 // int maxlink; // nombre max de liens int maxfilter; // nombre max de filtres diff --git a/src/htsparse.c b/src/htsparse.c index 7e6bbc4..f127f0d 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -239,7 +239,7 @@ Please visit our Website: http://www.httrack.com fflush(makeindex_fp); \ fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \ makeindex_fp=NULL; \ - usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),"index.html"),"primary","primary"); \ + usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html_utf8),"index.html"),"primary","primary"); \ } \ } \ makeindex_done=1; /* ok c'est fait */ \ @@ -429,7 +429,7 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { // Indexing! #if HTS_MAKE_KEYWORD_INDEX if (opt->kindex) { - if (index_keyword(r->adr,r->size,r->contenttype,savename,StringBuff(opt->path_html))) { + if (index_keyword(r->adr,r->size,r->contenttype,savename,StringBuff(opt->path_html_utf8))) { if ( (opt->debug>1) && (opt->log!=NULL) ) { HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"indexing file..done"LF); test_flush; } @@ -656,9 +656,9 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { if (p) { // ok center if (makeindex_fp==NULL) { - file_notify(opt,"", "", fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),"index.html"), 1, 1, 0); - verif_backblue(opt,StringBuff(opt->path_html)); // générer gif - makeindex_fp=filecreate(&opt->state.strc, fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),"index.html")); + file_notify(opt,"", "", fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html_utf8),"index.html"), 1, 1, 0); + verif_backblue(opt,StringBuff(opt->path_html_utf8)); // générer gif + makeindex_fp=filecreate(&opt->state.strc, fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html_utf8),"index.html")); if (makeindex_fp!=NULL) { // Header @@ -683,7 +683,7 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { b=strchr(a,'<'); // prochain tag } } - if (lienrelatif(tempo,liens[ptr]->sav,concat(OPT_GET_BUFF(opt),StringBuff(opt->path_html),"index.html"))==0) { + if (lienrelatif(tempo,liens[ptr]->sav,concat(OPT_GET_BUFF(opt),StringBuff(opt->path_html_utf8),"index.html"))==0) { detect_title=1; // ok détecté pour cette page! makeindex_links++; // un de plus strcpybuff(makeindex_firstlink,tempo); @@ -753,6 +753,8 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { if (len > 0) { if (strfield(token, "content-type")) { intag_ctype=1; + //NOPE-we do not convert the whole page actually + //intag_start[1] = 'X'; } else if (strfield(token, "refresh")) { intag_ctype=2; @@ -1104,7 +1106,7 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { ) { chpos++; while(is_space(*chpos)) chpod++; - chpos + //chpos } } #endif @@ -2381,7 +2383,7 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { //char last_fil[HTS_URLMAXSIZE*2]=""; strcpybuff(last_adr,adr); // ancienne adresse //strcpybuff(last_fil,fil); // ancien chemin - r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL); + r_sv=url_savename2(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL,str->page_charset_); if (strcmp(jump_identification(last_adr),jump_identification(adr)) != 0) { // a changé // 2e test si moved @@ -2578,7 +2580,7 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { if (patch_it) { char BIGSTK save[HTS_URLMAXSIZE*2]; char BIGSTK tempo[HTS_URLMAXSIZE*2]; - strcpybuff(save,StringBuff(opt->path_html)); + strcpybuff(save,StringBuff(opt->path_html_utf8)); strcatbuff(save,cat_name); if (lienrelatif(tempo,save, relativesavename)==0) { /* Never escape high-chars (we don't know the encoding!!) */ @@ -2626,17 +2628,16 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { // écrire fichier? if (verif_external(opt,cat_nb,1)) { - //if (!fexist(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),cat_name))) { - FILE* fp = filecreate(&opt->state.strc, fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),cat_name)); + FILE* fp = filecreate(&opt->state.strc, fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html_utf8),cat_name)); if (fp) { if (cat_data_len==0) { // texte - verif_backblue(opt,StringBuff(opt->path_html)); + verif_backblue(opt,StringBuff(opt->path_html_utf8)); fprintf(fp,"%s%s","<!-- Created by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"LF,cat_data); } else { // data fwrite(cat_data,cat_data_len,1,fp); } fclose(fp); - usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),cat_name),"",""); + usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html_utf8),cat_name),"",""); } } } else { // écrire normalement le nom de fichier @@ -2769,8 +2770,8 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { uri = save; // .. after stripping the path prefix (ex: "www.example.com\foo4242.html) - if (strnotempty(StringBuff(opt->path_html))) { - uri += StringLength(opt->path_html); + if (strnotempty(StringBuff(opt->path_html_utf8))) { + uri += StringLength(opt->path_html_utf8); for( ; uri[0] == '/' || uri[0] == '\\' ; uri++) ; } @@ -3383,7 +3384,7 @@ int hts_mirror_check_moved(htsmoduleStruct* str, htsmoduleStructExtended* stre) { char BIGSTK mov_sav[HTS_URLMAXSIZE*2]; // calculer lien et éventuellement modifier addresse/fichier - if (url_savename(mov_adr,mov_fil,mov_sav,NULL,NULL,liens[liens[ptr]->precedent]->adr,liens[liens[ptr]->precedent]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL)!=-1) { + if (url_savename2(mov_adr,mov_fil,mov_sav,NULL,NULL,liens[liens[ptr]->precedent]->adr,liens[liens[ptr]->precedent]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL,str->page_charset_)!=-1) { if (hash_read(hash,mov_sav,"",0,0)<0) { // n'existe pas déja // enregistrer lien (MACRO) avec SAV IDENTIQUE liens_record(mov_adr,mov_fil,liens[ptr]->sav,"",""); @@ -3480,9 +3481,9 @@ int hts_mirror_check_moved(htsmoduleStruct* str, htsmoduleStructExtended* stre) (r->statuscode==HTTP_PRECONDITION_FAILED) || (r->statuscode==HTTP_REQUESTED_RANGE_NOT_SATISFIABLE) ) { // Precondition Failed, c'est à dire pour nous redemander TOUT le fichier - if (fexist(liens[ptr]->sav)) { + if (fexist_utf8(liens[ptr]->sav)) { remove(liens[ptr]->sav); // Eliminer - if (!fexist(liens[ptr]->sav)) { // Bien éliminé? (sinon on boucle..) + if (!fexist_utf8(liens[ptr]->sav)) { // Bien éliminé? (sinon on boucle..) #if HDEBUG printf("Partial content NOT up-to-date, reget all file for %s\n",liens[ptr]->sav); #endif @@ -3800,7 +3801,7 @@ void hts_mirror_process_user_interaction(htsmoduleStruct* str, htsmoduleStructEx // noter NOUVEAU lien char BIGSTK add_sav[HTS_URLMAXSIZE*2]; // calculer lien et éventuellement modifier addresse/fichier - if (url_savename(add_adr,add_fil,add_sav,NULL,NULL,NULL,NULL,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL)!=-1) { + if (url_savename2(add_adr,add_fil,add_sav,NULL,NULL,NULL,NULL,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,NULL,str->page_charset_)!=-1) { if (hash_read(hash,add_sav,"",0,0)<0) { // n'existe pas déja // enregistrer lien (MACRO) liens_record(add_adr,add_fil,add_sav,"",""); @@ -4296,7 +4297,7 @@ int hts_wait_delayed(htsmoduleStruct* str, /* Recompute filename with MIME type */ save[0] = '\0'; - r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&back); + r_sv=url_savename2(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&back,str->page_charset_); /* Recompute authorization with MIME type */ { @@ -4364,7 +4365,7 @@ int hts_wait_delayed(htsmoduleStruct* str, /* Recompute filename with MIME type */ save[0] = '\0'; - r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&delayed_back); + r_sv=url_savename2(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&delayed_back,str->page_charset_); /* Recompute authorization with MIME type */ { @@ -4482,7 +4483,7 @@ int hts_wait_delayed(htsmoduleStruct* str, strcpybuff(mov_url, back[b].r.location); // copier URL /* Remove (temporarily created) file if it was created */ - unlink(fconv(OPT_GET_BUFF(opt),back[b].url_sav)); + UNLINK(fconv(OPT_GET_BUFF(opt),back[b].url_sav)); /* Remove slot! */ if (back[b].status == STATUS_READY) { @@ -4553,7 +4554,7 @@ int hts_wait_delayed(htsmoduleStruct* str, /* Recompute filename for hash lookup */ save[0] = '\0'; - r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&delayed_back); + r_sv=url_savename2(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,opt,liens,lien_tot,sback,cache,hash,ptr,numero_passe,&delayed_back,str->page_charset_); } else { if ( opt->log!=NULL ) { HTS_LOG(opt,LOG_WARNING); fprintf(opt->log,"Unable to test %s%s (loop to same filename)"LF,adr,fil); diff --git a/src/htstools.c b/src/htstools.c index d615f23..ebb5e01 100644 --- a/src/htstools.c +++ b/src/htstools.c @@ -67,7 +67,7 @@ struct find_handle_struct { struct find_handle_struct { DIR * hdir; struct dirent* dirp; - struct stat filestat; + STRUCT_STAT filestat; char path[2048]; }; #endif @@ -481,6 +481,7 @@ void longfile_to_83(int mode,char* n83,char* save) { } // écrire backblue.gif +/* Note: utf-8 */ int verif_backblue(httrackp* opt, const char* base) { int* done = &opt->state.verif_backblue_done; int ret=0; @@ -490,7 +491,7 @@ int verif_backblue(httrackp* opt, const char* base) { return 0; } if ( (!*done) - || (fsize(fconcat(OPT_GET_BUFF(opt), base,"backblue.gif")) != HTS_DATA_BACK_GIF_LEN)) { + || (fsize_utf8(fconcat(OPT_GET_BUFF(opt), base,"backblue.gif")) != HTS_DATA_BACK_GIF_LEN)) { FILE* fp = filecreate(&opt->state.strc, fconcat(OPT_GET_BUFF(opt), base,"backblue.gif")); *done=1; if (fp) { @@ -683,6 +684,7 @@ static int sortTopIndexFnc(const void * a_, const void * b_) { HTSEXT_API char* hts_getcategory(const char* filename); +/* Note: NOT utf-8 */ HTSEXT_API int hts_buildtopindex(httrackp* opt,const char* path,const char* binpath) { FILE* fpo; int retval=0; @@ -1006,7 +1008,7 @@ HTSEXT_API int hts_findnext(find_handle find) { memset(&(find->filestat), 0, sizeof(find->filestat)); if ((find->dirp=readdir(find->hdir))) if (find->dirp->d_name) - if (!stat(concat(catbuff, find->path,find->dirp->d_name),&find->filestat)) + if (!STAT(concat(catbuff, find->path,find->dirp->d_name),&find->filestat)) return 1; #endif } |