diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2012-05-07 10:02:58 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2012-05-07 10:02:58 +0000 |
commit | 525118dfe8bb3f6b874db7920dc0ee04eede0585 (patch) | |
tree | c2f57a84561e427c09ea7a248f438c187a004c58 /src | |
parent | aad35562b3293c792e858b36c3ca7d1ab2f386c6 (diff) | |
Charset fixes
Diffstat (limited to 'src')
-rw-r--r-- | src/htsalias.c | 1 | ||||
-rw-r--r-- | src/htsback.c | 19 | ||||
-rw-r--r-- | src/htscache.c | 23 | ||||
-rw-r--r-- | src/htscore.c | 13 | ||||
-rw-r--r-- | src/htscore.h | 1 | ||||
-rw-r--r-- | src/htsjava.c | 17 | ||||
-rw-r--r-- | src/htslib.c | 8 | ||||
-rw-r--r-- | src/htslib.h | 17 | ||||
-rw-r--r-- | src/htsname.c | 1 | ||||
-rw-r--r-- | src/htsparse.c | 5 | ||||
-rw-r--r-- | src/htstools.c | 2 | ||||
-rw-r--r-- | src/htszlib.c | 11 | ||||
-rw-r--r-- | src/httrack-library.h | 27 |
13 files changed, 109 insertions, 36 deletions
diff --git a/src/htsalias.c b/src/htsalias.c index 1a413fe..908bc13 100644 --- a/src/htsalias.c +++ b/src/htsalias.c @@ -451,6 +451,7 @@ const char* optalias_help(const char* token) { allow *.gif deny ad.* */ +/* Note: NOT utf-8 */ int optinclude_file(const char* name, int* argc,char** argv,char* x_argvblk,int* x_ptr) { FILE* fp; diff --git a/src/htsback.c b/src/htsback.c index a7fe76c..16c54d0 100644 --- a/src/htsback.c +++ b/src/htsback.c @@ -501,7 +501,8 @@ int back_finalize(httrackp* opt,cache_back* cache,struct_back* sback,int p) { back[p].tmpfile=tmpnam(back[p].tmpfile_buffer); #endif if (back[p].tmpfile != NULL && back[p].tmpfile[0] != '\0') { - back[p].r.out=FOPEN(back[p].tmpfile,"wb"); + /* note: tmpfile is a local system filename */ + back[p].r.out=fopen(back[p].tmpfile, "wb"); if (back[p].r.out) { if ((back[p].r.adr) && (back[p].r.size>0)) { if (fwrite(back[p].r.adr,1,(size_t)back[p].r.size,back[p].r.out) != back[p].r.size) { @@ -531,22 +532,25 @@ int back_finalize(httrackp* opt,cache_back* cache,struct_back* sback,int p) { LLint size; file_notify(opt,back[p].url_adr, back[p].url_fil, back[p].url_sav, 1, 1, back[p].r.notmodified); filecreateempty(&opt->state.strc, back[p].url_sav); // filenote & co - if ((size = hts_zunpack(back[p].tmpfile,back[p].url_sav))>=0) { + if ((size = hts_zunpack(back[p].tmpfile, back[p].url_sav))>=0) { back[p].r.size=back[p].r.totalsize=size; // fichier -> mémoire if (!back[p].r.is_write) { deleteaddr(&back[p].r); - back[p].r.adr=readfile(back[p].url_sav); + back[p].r.adr = readfile_utf8(back[p].url_sav); if (!back[p].r.adr) { back[p].r.statuscode=STATUSCODE_INVALID; strcpybuff(back[p].r.msg,"Read error when decompressing"); } UNLINK(back[p].url_sav); } - } + } else { + back[p].r.statuscode = STATUSCODE_INVALID; + strcpybuff(back[p].r.msg, "Error when decompressing"); + } } - /* encore that no remaining temporary file exists */ - UNLINK(back[p].tmpfile); + /* ensure that no remaining temporary file exists */ + 
unlink(back[p].tmpfile); back[p].tmpfile = NULL; } // stats @@ -920,6 +924,7 @@ int back_serialize_ref(httrackp* opt, const lien_back* src) { if (mkdir(fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_log), CACHE_REFNAME), S_IRWXU | S_IRWXG | S_IRWXO) == 0) #endif { + /* note: local filename */ filename = url_savename_refname_fullpath(opt, src->url_adr, src->url_fil); fp = fopen(filename, "wb"); } @@ -1269,7 +1274,7 @@ int back_clear_entry(lien_back* back) { // only for security if (back->tmpfile && back->tmpfile[0] != '\0') { - (void) UNLINK(back->tmpfile); + (void) unlink(back->tmpfile); back->tmpfile = NULL; } diff --git a/src/htscache.c b/src/htscache.c index 2679111..e8e7701 100644 --- a/src/htscache.c +++ b/src/htscache.c @@ -1716,6 +1716,29 @@ char* readfile2(char* fil, LLint* size) { return adr; } +/* Note: utf-8 */ +char* readfile_utf8(char* fil) { + char* adr=NULL; + char catbuff[CATBUFF_SIZE]; + const off_t len = fsize_utf8(fil); + if (len >= 0) { // exists + FILE*const fp = FOPEN(fconv(catbuff, fil),"rb"); + if (fp!=NULL) { // n'existe pas (!) + adr = (char*) malloct(len+1); + if (adr!=NULL) { + if (len > 0 && fread(adr,1,len,fp) != len) { // fichier endommagé ? 
+ freet(adr); + adr=NULL; + } else { + adr[len] = '\0'; + } + } + fclose(fp); + } + } + return adr; +} + /* Note: NOT utf-8 */ char* readfile_or(char* fil,char* defaultdata) { char* realfile=fil; diff --git a/src/htscore.c b/src/htscore.c index de202db..6e2b198 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -249,7 +249,10 @@ if (!makeindex_done) { \ if (makeindex_fp) { \ char BIGSTK tempo[1024]; \ if (makeindex_links == 1) { \ - sprintf(tempo,"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,makeindex_firstlink); \ + char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \ + strcpybuff(link_escaped, makeindex_firstlink); \ + escape_check_url(link_escaped); \ + sprintf(tempo,"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF, link_escaped); \ } else \ tempo[0]='\0'; \ fprintf(makeindex_fp,template_footer, \ @@ -1461,6 +1464,12 @@ int httpmirror(char* url1, httrackp* opt) { if (charset != NULL) free(charset); } + /* Could not detect charset: could it be UTF-8 ? */ + if (page_charset[0] == '\0') { + if (is_unicode_utf8(r.adr, r.size)) { + strcpy(page_charset, "utf-8"); + } + } /* Could not detect charset */ if (page_charset[0] == '\0') { if ( (opt->debug>0) && (opt->log!=NULL) ) { @@ -1741,7 +1750,7 @@ int httpmirror(char* url1, httrackp* opt) { // a partir d'ici le slash devient antislash #endif - if ((fp=fopen(tempo,"wb"))!=NULL) { + if ((fp=FOPEN(tempo,"wb"))!=NULL) { fprintf(fp,"Info-file generated by HTTrack Website Copier "HTTRACK_VERSION"%s"CRLF""CRLF, hts_get_version_info(opt)); fprintf(fp,"The file %s has not been scanned by HTS"CRLF,savename); fprintf(fp,"Some links contained in it may be unreachable locally."CRLF); diff --git a/src/htscore.h b/src/htscore.h index 21161a9..2272327 100644 --- a/src/htscore.h +++ b/src/htscore.h @@ -348,6 +348,7 @@ char* next_token(char* p,int flag); // char* readfile(char* fil); char* readfile2(char* fil, LLint* size); +char* readfile_utf8(char* fil); char* readfile_or(char* fil,char* defaultdata); #if 0 void 
check_rate(TStamp stat_timestart,int maxrate); diff --git a/src/htsjava.c b/src/htsjava.c index 335d378..e3af5ea 100644 --- a/src/htsjava.c +++ b/src/htsjava.c @@ -46,6 +46,7 @@ Please visit our Website: http://www.httrack.com #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/stat.h> #if ( defined(_WIN32) ||defined(HAVE_SYS_TYPES_H) ) #include <sys/types.h> #endif @@ -136,16 +137,12 @@ static int hts_detect_java(t_hts_callbackarg *carg, httrackp *opt, } static off_t fsize(const char* s) { - FILE* fp; - fp=fopen(s,"rb"); - if (fp!=NULL) { - off_t i; - fseek(fp,0,SEEK_END); - i = ftell(fp); - fclose(fp); - return i; - } else + STRUCT_STAT st; + if (STAT(s, &st) == 0 && S_ISREG(st.st_mode)) { + return st.st_size; + } else { return -1; + } } static int hts_parse_java(t_hts_callbackarg *carg, httrackp *opt, @@ -174,7 +171,7 @@ static int hts_parse_java(t_hts_callbackarg *carg, httrackp *opt, #if JAVADEBUG printf("fopen\n"); #endif - if ((fpout = fopen(fconv(catbuff, file), "r+b")) == NULL) + if ((fpout = FOPEN(fconv(catbuff, file), "r+b")) == NULL) { //fprintf(stderr, "Cannot open input file.\n"); sprintf(str->err_msg,"Unable to open file %s",file); diff --git a/src/htslib.c b/src/htslib.c index 1c1e54e..feba33a 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -3032,13 +3032,13 @@ typedef struct { // 0 : no // 1 : yes // -1: don't know -int is_unicode_utf8(unsigned char* buffer, unsigned int size) { +int is_unicode_utf8(const unsigned char* buffer, size_t size) { t_auto_seq seq; - unsigned int i; - int is_utf=-1; + size_t i; + int is_utf = -1; seq.pos=0; - for(i=0 ; i < size ; i++) { + for(i = 0 ; i < size ; i++) { unsigned int ok=0; unsigned int inseq=0; unsigned int err=0; diff --git a/src/htslib.h b/src/htslib.h index 521fd3c..9290c78 100644 --- a/src/htslib.h +++ b/src/htslib.h @@ -337,7 +337,7 @@ void rawlinput(FILE* fp,char* s,int max); char* strstrcase(char *s,char *o); int ident_url_absolute(const char* url,char* adr,char* fil); void 
fil_simplifie(char* f); -int is_unicode_utf8(unsigned char* buffer, unsigned int size); +int is_unicode_utf8(const unsigned char* buffer, size_t size); void map_characters(unsigned char* buffer, unsigned int size, unsigned int* map); int ishtml(httrackp *opt,const char* urlfil); int ishtml_ext(const char* a); @@ -490,19 +490,20 @@ void *hts_get_callback(t_hts_htmlcheck_callbacks *callbacks, const char *name); ) */ -/* UTF-8 aware FILE operations */ +/* UTF-8 aware FILE API */ +#ifndef HTS_DEF_FILEAPI #ifdef _WIN32 #define FOPEN hts_fopen_utf8 -extern FILE* hts_fopen_utf8(const char *path, const char *mode); +HTSEXT_API FILE* hts_fopen_utf8(const char *path, const char *mode); #define STAT hts_stat_utf8 typedef struct _stat STRUCT_STAT; -extern int hts_stat_utf8(const char *path, STRUCT_STAT *buf); +HTSEXT_API int hts_stat_utf8(const char *path, STRUCT_STAT *buf); #define UNLINK hts_unlink_utf8 -extern int hts_unlink_utf8(const char *pathname); +HTSEXT_API int hts_unlink_utf8(const char *pathname); #define RENAME hts_rename_utf8 -extern int hts_rename_utf8(const char *oldpath, const char *newpath); +HTSEXT_API int hts_rename_utf8(const char *oldpath, const char *newpath); #define MKDIR(F) hts_mkdir_utf8(F) -extern int hts_mkdir_utf8(const char *pathname); +HTSEXT_API int hts_mkdir_utf8(const char *pathname); #else /* The underlying filesystem charset is supposed to be UTF-8 */ #define FOPEN fopen @@ -512,6 +513,8 @@ typedef struct stat STRUCT_STAT; #define RENAME rename #define MKDIR(F) mkdir(F, HTS_ACCESS_FOLDER) #endif +#define HTS_DEF_FILEAPI +#endif #endif // internals diff --git a/src/htsname.c b/src/htsname.c index e5b0715..74172e2 100644 --- a/src/htsname.c +++ b/src/htsname.c @@ -1544,6 +1544,7 @@ void url_savename_refname(const char *adr, const char *fil, char *filename) { bindigest[12], bindigest[13], bindigest[14], bindigest[15]); } +/* note: return a local filename */ char *url_savename_refname_fullpath(httrackp* opt, const char *adr, const char *fil) { 
char digest_filename[64]; url_savename_refname(adr, fil, digest_filename); diff --git a/src/htsparse.c b/src/htsparse.c index 18059f5..35d1bb9 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -230,7 +230,10 @@ Please visit our Website: http://www.httrack.com if (makeindex_fp) { \ char BIGSTK tempo[1024]; \ if (makeindex_links == 1) { \ - sprintf(tempo,"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,makeindex_firstlink); \ + char BIGSTK link_escaped[HTS_URLMAXSIZE*2]; \ + strcpybuff(link_escaped, makeindex_firstlink); \ + escape_check_url(link_escaped); \ + sprintf(tempo,"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,link_escaped); \ } else \ tempo[0]='\0'; \ fprintf(makeindex_fp,template_footer, \ diff --git a/src/htstools.c b/src/htstools.c index ebb5e01..049fc3c 100644 --- a/src/htstools.c +++ b/src/htstools.c @@ -845,6 +845,7 @@ HTSEXT_API int hts_buildtopindex(httrackp* opt,const char* path,const char* binp return retval; } +/* Note: NOT utf-8 */ HTSEXT_API char* hts_getcategory(const char* filename) { String categ = STRING_EMPTY; if (fexist(filename)) { @@ -867,6 +868,7 @@ HTSEXT_API char* hts_getcategory(const char* filename) { return StringBuffRW(categ); } +/* Note: NOT utf-8 */ HTSEXT_API char* hts_getcategories(char* path, int type) { String categ = STRING_EMPTY; String profiles = STRING_EMPTY; diff --git a/src/htszlib.c b/src/htszlib.c index 4fd33da..d313d89 100644 --- a/src/htszlib.c +++ b/src/htszlib.c @@ -54,13 +54,16 @@ Please visit our Website: http://www.httrack.com Unpack file into a new file Return value: size of the new file, or -1 if an error occured */ +/* Note: utf-8 */ int hts_zunpack(char* filename,char* newfile) { + int ret = -1; char catbuff[CATBUFF_SIZE]; if (gz_is_available && filename && newfile) { if (filename[0] && newfile[0]) { - gzFile gz = gzopen (filename, "rb"); + // not: NOT an UTF-8 filename + gzFile gz = gzopen(filename, "rb"); if (gz) { - FILE* fpout=fopen(fconv(catbuff, newfile),"wb"); + FILE*const 
fpout = FOPEN(fconv(catbuff, newfile), "wb"); int size=0; if (fpout) { int nr; @@ -77,11 +80,11 @@ int hts_zunpack(char* filename,char* newfile) { } else size=-1; gzclose(gz); - return (int) size; + ret = (int) size; } } } - return -1; + return ret; } int hts_extract_meta(const char* path) { diff --git a/src/httrack-library.h b/src/httrack-library.h index 352e2c2..39b00c0 100644 --- a/src/httrack-library.h +++ b/src/httrack-library.h @@ -5,7 +5,7 @@ Copyright (C) Xavier Roche and other contributors This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 3 +as published by the Free Software Foundation; either version 2 of the License, or any later version. This program is distributed in the hope that it will be useful, @@ -214,4 +214,29 @@ HTSEXT_API int hts_findisdir(find_handle find); HTSEXT_API int hts_findisfile(find_handle find); HTSEXT_API int hts_findissystem(find_handle find); +/* UTF-8 aware FILE API */ +#ifndef HTS_DEF_FILEAPI +#ifdef _WIN32 +#define FOPEN hts_fopen_utf8 +HTSEXT_API FILE* hts_fopen_utf8(const char *path, const char *mode); +#define STAT hts_stat_utf8 +typedef struct _stat STRUCT_STAT; +HTSEXT_API int hts_stat_utf8(const char *path, STRUCT_STAT *buf); +#define UNLINK hts_unlink_utf8 +HTSEXT_API int hts_unlink_utf8(const char *pathname); +#define RENAME hts_rename_utf8 +HTSEXT_API int hts_rename_utf8(const char *oldpath, const char *newpath); +#define MKDIR(F) hts_mkdir_utf8(F) +HTSEXT_API int hts_mkdir_utf8(const char *pathname); +#else +#define FOPEN fopen +#define STAT stat +typedef struct stat STRUCT_STAT; +#define UNLINK unlink +#define RENAME rename +#define MKDIR(F) mkdir(F, HTS_ACCESS_FOLDER) +#endif +#define HTS_DEF_FILEAPI +#endif + #endif |