diff options
Diffstat (limited to 'src/htscore.c')
-rw-r--r-- | src/htscore.c | 179 |
1 files changed, 166 insertions, 13 deletions
diff --git a/src/htscore.c b/src/htscore.c index fc352f4..8d62df7 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -64,6 +64,9 @@ Please visit our Website: http://www.httrack.com /* Cache */ #include "htszlib.h" +/* Charset handling */ +#include "htscharset.h" + /* END specific definitions */ @@ -256,7 +259,7 @@ if (makeindex_fp) { \ fflush(makeindex_fp); \ fclose(makeindex_fp); /* à ne pas oublier sinon on passe une nuit blanche */ \ makeindex_fp=NULL; \ - usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),StringBuff(opt->path_html),"index.html"),"",""); \ + usercommand(opt,0,NULL,fconcat(OPT_GET_BUFF(opt),StringBuff(opt->path_html_utf8),"index.html"),"",""); \ } \ } \ makeindex_done=1; /* ok c'est fait */ \ @@ -601,7 +604,7 @@ int httpmirror(char* url1, httrackp* opt) { // lien primaire - liens_record("primary","/primary",fslash(OPT_GET_BUFF(opt),fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html),"index.html")),"","",opt->urlhack); + liens_record("primary","/primary",fslash(OPT_GET_BUFF(opt),fconcat(OPT_GET_BUFF(opt), StringBuff(opt->path_html_utf8),"index.html")),"","",opt->urlhack); if (liens[lien_tot]==NULL) { // erreur, pas de place réservée printf("PANIC! : Not enough memory [%d]\n",__LINE__); if (opt->log) { @@ -890,6 +893,8 @@ int httpmirror(char* url1, httrackp* opt) { str.lien_size_ = &lien_size; str.lien_buffer_ = &lien_buffer; /* */ + str.page_charset_ = NULL; + /* */ /* */ stre.r_ = &r; /* */ @@ -1048,6 +1053,33 @@ int httpmirror(char* url1, httrackp* opt) { (is_hypertext_mime(opt,r.contenttype, urlfil) /* Is HTML or Js, .. */ || may_be_hypertext_mime(opt,r.contenttype, urlfil)) /* Is real media, .. */ ) { + + /* Convert charset to UTF-8 - NOT! (what about links ? remote server side will have troubles with converted names) */ + //if (r.adr != NULL && r.size != 0 && opt->convert_utf8) { + // char *charset; + // char *pos; + // if (r.charset[0] != '\0') { + // charset = strdup(r.charset); + // } else { + // charset = hts_getCharsetFromMeta(r.adr, r.size); + // } + // if (charset != NULL) { + // char *const utf8 = hts_convertStringToUTF8(r.adr, r.size, charset); + // /* Use new buffer */ + // if (utf8 != NULL) { + // freet(r.adr); + // r.size = strlen(utf8); + // r.adr = utf8; + // /* New UTF-8 charset */ + // r.charset[0] = '\0'; + // strcpy(r.charset, "utf-8"); + // } + // /* Free charset */ + // free(charset); + // } + //} + + /* Check bogus chars */ if ((r.adr) && (r.size)) { unsigned int map[256]; int i; @@ -1199,10 +1231,10 @@ int httpmirror(char* url1, httrackp* opt) { // if (r.adr==NULL) { // Written file // if (may_be_hypertext_mime(r.contenttype, urlfil)) { // to parse! // LLint sz; - // sz=fsize(savename); + // sz=fsize_utf8(savename); // if (sz>0) { // ok, exists! // if (sz < 8192) { // ok, small file --> to parse! - // FILE* fp=fopen(savename,"rb"); + // FILE* fp=FOPEN(savename,"rb"); // if (fp) { // r.adr=malloct((int)sz + 2); // if (r.adr) { @@ -1285,6 +1317,8 @@ int httpmirror(char* url1, httrackp* opt) { str.lien_size_ = &lien_size; str.lien_buffer_ = &lien_buffer; /* */ + str.page_charset_ = NULL; + /* */ /* */ stre.r_ = &r; /* */ @@ -1401,6 +1435,7 @@ int httpmirror(char* url1, httrackp* opt) { // -- -- -- -- // Parsing HTML if (!error) { + char page_charset[32]; /* Remove file if being processed */ if (is_loaded_from_file) { @@ -1408,6 +1443,23 @@ int httpmirror(char* url1, httrackp* opt) { is_loaded_from_file = 0; } + /* Detect charset to convert links into proper UTF8 filenames */ + page_charset[0] = '\0'; + if (opt->convert_utf8) { + if (r.charset[0] != '\0') { + if (strlen(r.charset) < sizeof(page_charset)) { + strcpy(page_charset, r.charset); + } + } else if (is_html_mime_type(r.contenttype)) { + char *const charset = hts_getCharsetFromMeta(r.adr, r.size); + if (charset != NULL && strlen(charset) < sizeof(page_charset)) { + strcpy(page_charset, charset); + } + if (charset != NULL) + free(charset); + } + } + /* Info for wrappers */ if ( (opt->debug>0) && (opt->log!=NULL) ) { HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"engine: check-html: %s%s"LF,urladr,urlfil); @@ -1442,6 +1494,8 @@ int httpmirror(char* url1, httrackp* opt) { str.lien_size_ = &lien_size; str.lien_buffer_ = &lien_buffer; /* */ + str.page_charset_ = page_charset[0] != '\0' ? page_charset : NULL; + /* */ /* */ stre.r_ = &r; /* */ @@ -1750,7 +1804,7 @@ int httpmirror(char* url1, httrackp* opt) { HTS_LOG(opt,LOG_DEBUG); fprintf(opt->log,"(Real Media): parsing %s"LF,savename); test_flush; } if (fexist(savename)) { // ok, existe bien! - FILE* fp=fopen(savename,"r+b"); + FILE* fp=FOPEN(savename,"r+b"); if (fp) { if (!fseek(fp,0,SEEK_SET)) { char BIGSTK line[HTS_URLMAXSIZE*2]; @@ -2328,7 +2382,7 @@ static int mkdir_compat(const char *pathname) { /* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */ HTSEXT_API int dir_exists(const char* path) { - struct stat st; + STRUCT_STAT st; char BIGSTK file[HTS_URLMAXSIZE*2]; int i = 0; if (strnotempty(path) == 0) { @@ -2356,7 +2410,7 @@ HTSEXT_API int dir_exists(const char* path) { file[i + 1] = '\0'; /* Check the final dir */ - if (stat(file, &st) == 0 && S_ISDIR(st.st_mode)) { + if (STAT(file, &st) == 0 && S_ISDIR(st.st_mode)) { errno = 0; return 1; /* EXISTS */ } @@ -2365,6 +2419,7 @@ HTSEXT_API int dir_exists(const char* path) { } /* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */ +/* Note: *not* UTF-8 */ HTSEXT_API int structcheck(const char* path) { struct stat st; char BIGSTK tmpbuf[HTS_URLMAXSIZE*2]; @@ -2459,6 +2514,102 @@ HTSEXT_API int structcheck(const char* path) { return 0; } +/* path must end with "/" or with the finename (/tmp/bar/ or /tmp/bar/foo.zip) */ +/* Note: UTF-8 */ +HTSEXT_API int structcheck_utf8(const char* path) { + STRUCT_STAT st; + char BIGSTK tmpbuf[HTS_URLMAXSIZE*2]; + char BIGSTK file[HTS_URLMAXSIZE*2]; + int i = 0; + int npaths; + if (strnotempty(path) == 0) + return 0; + if (strlen(path) > HTS_URLMAXSIZE) { + errno = EINVAL; + return -1; + } + + /* Get a copy */ + strcpybuff(file, path); +#ifdef _WIN32 + /* To system name */ + for(i = 0 ; file[i] != 0 ; i++) { + if (file[i] == '/') { + file[i] = PATH_SEPARATOR; + } + } +#endif + /* Get prefix (note: file can not be empty here) */ + for(i = (int) strlen(file) - 1 ; i > 0 && file[i] != PATH_SEPARATOR ; i--); + for( ; i > 0 && file[i] == PATH_SEPARATOR ; i--); + file[i + 1] = '\0'; + + /* First check the final dir */ + if (STAT(file, &st) == 0 && S_ISDIR(st.st_mode)) { + return 0; /* OK */ + } + + /* Start from the beginning */ + i = 0; + + /* Skip irrelevant part (the root slash, or the drive path) */ +#ifdef _WIN32 + if (file[0] != 0 && file[1] == ':') { /* f:\ */ + i+= 2; + if (file[i] == PATH_SEPARATOR) { /* f:\ */ + i++; + } + } else if (file[0] == PATH_SEPARATOR && file[1] == PATH_SEPARATOR) { /* \\mch */ + i+= 2; + } +#endif + + /* Check paths */ + for(npaths = 1 ; ; npaths++) { + char end_char; + + /* Go to next path */ + + /* Skip separator(s) */ + for( ; file[i] == PATH_SEPARATOR ; i++); + /* Next separator */ + for( ; file[i] != 0 && file[i] != PATH_SEPARATOR ; i++); + + /* Check */ + end_char = file[i]; + if (end_char != 0) { + file[i] = '\0'; + } + if (STAT(file, &st) == 0) { /* Something exists */ + if (!S_ISDIR(st.st_mode)) { +#if HTS_REMOVE_ANNOYING_INDEX + if (S_ISREG(st.st_mode)) { /* Regular file in place ; move it and create directory */ + sprintf(tmpbuf, "%s.txt", file); + if (RENAME(file, tmpbuf) != 0) { /* Can't rename regular file */ + return -1; + } + if (MKDIR(file) != 0) { /* Can't create directory */ + return -1; + } + } +#else +#error Not implemented +#endif + } + } else { /* Nothing exists ; create directory */ + if (MKDIR(file) != 0) { /* Can't create directory */ + return -1; + } + } + if (end_char == 0) { /* End */ + break; + } else { + file[i] = end_char; /* Restore / */ + } + } + return 0; +} + // sauver un fichier int filesave(httrackp* opt,const char* adr,int len,const char* s,const char* url_adr,const char* url_fil) { FILE* fp; @@ -2497,6 +2648,7 @@ int check_fatal_io_errno(void) { // ouvrir un fichier (avec chemin Un*x) +/* Note: utf-8 */ FILE* filecreate(filenote_strc *strc, const char* s) { char BIGSTK fname[HTS_URLMAXSIZE*2]; FILE* fp; @@ -2523,17 +2675,17 @@ FILE* filecreate(filenote_strc *strc, const char* s) { #endif /* Try to open the file */ - fp = fopen(fname, "wb"); + fp = FOPEN(fname, "wb"); /* Error ? Check the directory structure and retry. */ if (fp == NULL) { last_errno = errno; - if (structcheck(s) != 0) { + if (structcheck_utf8(s) != 0) { last_errno = errno; } else { last_errno = 0; } - fp = fopen(fname, "wb"); + fp = FOPEN(fname, "wb"); } if (fp == NULL && last_errno != 0) { errno = last_errno; @@ -2571,7 +2723,7 @@ FILE* fileappend(filenote_strc *strc,const char* s) { #endif // ouvrir - fp=fopen(fname,"ab"); + fp=FOPEN(fname,"ab"); #ifndef _WIN32 if (fp!=NULL) chmod(fname,HTS_ACCESS_FILE); @@ -2616,6 +2768,7 @@ int filenote(filenote_strc *strc, const char* s, filecreate_params* params) { return 1; } +/* Note: utf-8 */ void file_notify(httrackp* opt,const char* adr,const char* fil,const char* save,int create,int modify,int not_updated) { RUN_CALLBACK6(opt, filesave2, adr, fil, save, create, modify, not_updated); } @@ -2681,7 +2834,7 @@ static void postprocess_file(httrackp* opt,const char* save, const char* adr, co int n; if (rsc_fil == NULL) rsc_fil = fil; - if (strncmp(fslash(OPT_GET_BUFF(opt),save), fslash(OPT_GET_BUFF(opt),StringBuff(opt->path_html)), (n = (int)strlen(StringBuff(opt->path_html)))) == 0) { + if (strncmp(fslash(OPT_GET_BUFF(opt),save), fslash(OPT_GET_BUFF(opt),StringBuff(opt->path_html_utf8)), (n = (int)strlen(StringBuff(opt->path_html_utf8)))) == 0) { rsc_save += n; } @@ -2716,7 +2869,7 @@ static void postprocess_file(httrackp* opt,const char* save, const char* adr, co } } if (opt->state.mimehtml_created == 1 && opt->state.mimefp != NULL) { - FILE* fp = fopen(save, "rb"); + FILE* fp = FOPEN(save, "rb"); if (fp != NULL) { char buff[60*100 + 2]; char mimebuff[256]; |