diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/htscharset.c | 93 | ||||
-rw-r--r-- | src/htscharset.h | 6 | ||||
-rw-r--r-- | src/htscore.c | 13 | ||||
-rw-r--r-- | src/htsparse.c | 31 |
4 files changed, 134 insertions, 9 deletions
diff --git a/src/htscharset.c b/src/htscharset.c index 4c97228..1c21590 100644 --- a/src/htscharset.c +++ b/src/htscharset.c @@ -34,6 +34,17 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +int hts_isStringAscii(const char *s, size_t size) { + size_t i; + for(i = 0 ; i < size ; i++) { + const unsigned char c = (const unsigned char) s[i]; + if (c >= 0x80) { + return 0; + } + } + return 1; +} + #ifdef _WIN32 #include <windows.h> @@ -255,12 +266,12 @@ LPWSTR hts_convertUTF8StringToUCS2(const char *s, int size, int* pwsize) { return hts_convertStringToUCS2(s, size, CP_UTF8, pwsize); } -char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize) { - const int usize = WideCharToMultiByte(CP_UTF8, 0, woutput, wsize, NULL, 0, NULL, FALSE); +char *hts_convertUCS2StringToCP(LPWSTR woutput, int wsize, UINT cp) { + const int usize = WideCharToMultiByte(cp, 0, woutput, wsize, NULL, 0, NULL, FALSE); if (usize > 0) { char *const uoutput = malloc((usize + 1)*sizeof(char)); if (uoutput != NULL) { - if (WideCharToMultiByte(CP_UTF8, 0, woutput, wsize, uoutput, usize, NULL, FALSE) == usize) { + if (WideCharToMultiByte(cp, 0, woutput, wsize, uoutput, usize, NULL, FALSE) == usize) { uoutput[usize] = '\0'; return uoutput; } else { @@ -271,13 +282,17 @@ char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize) { return NULL; } +char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize) { + return hts_convertUCS2StringToCP(woutput, wsize, CP_UTF8); +} + char *hts_convertStringCPToUTF8(const char *s, size_t size, UINT cp) { /* Empty string ? */ if (size == 0) { return strndup(s, size); } /* Already UTF-8 ? */ - if (cp == CP_UTF8) { + if (cp == CP_UTF8 || hts_isStringAscii(s, size)) { return strndup(s, size); } /* Other (valid) charset */ @@ -291,6 +306,32 @@ char *hts_convertStringCPToUTF8(const char *s, size_t size, UINT cp) { return uoutput; } } + + /* Error, charset not found! */ + return NULL; +} + +char *hts_convertStringCPFromUTF8(const char *s, size_t size, UINT cp) { + /* Empty string ? */ + if (size == 0) { + return strndup(s, size); + } + /* Already UTF-8 ? */ + if (cp == CP_UTF8 || hts_isStringAscii(s, size)) { + return strndup(s, size); + } + /* Other (valid) charset */ + else if (cp != 0) { + /* Size in wide chars of the output */ + int wsize; + LPWSTR woutput = hts_convertStringToUCS2(s, (int) size, CP_UTF8, &wsize); + if (woutput != NULL) { + char *const uoutput = hts_convertUCS2StringToCP(woutput, wsize, cp); + free(woutput); + return uoutput; + } + } + /* Error, charset not found! */ return NULL; } @@ -300,6 +341,11 @@ char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) { return hts_convertStringCPToUTF8(s, size, cp); } +char *hts_convertStringFromUTF8(const char *s, size_t size, const char *charset) { + const UINT cp = hts_getCodepage(charset); + return hts_convertStringCPFromUTF8(s, size, cp); +} + char *hts_convertStringSystemToUTF8(const char *s, size_t size) { return hts_convertStringCPToUTF8(s, size, GetACP()); } @@ -310,18 +356,18 @@ char *hts_convertStringSystemToUTF8(const char *s, size_t size) { #include <errno.h> #include <iconv.h> -char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) { +char *hts_convertStringToUTF8_(const char *s, size_t size, const char *to, const char *from) { /* Empty string ? */ if (size == 0) { return strdup(""); } - /* Already UTF-8 ? */ - if (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0) { + /* Already on correct charset ? */ + if (strcasecmp(from, to) == 0) { return strndup(s, size); } /* Find codepage */ else { - const iconv_t cp = iconv_open("utf-8", charset); + const iconv_t cp = iconv_open(to, from); if (cp != (iconv_t) -1) { char *inbuf = (char*) s; size_t inbytesleft = size; @@ -373,10 +419,41 @@ char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) { return outbuf; } } + /* Error, charset not found! */ return NULL; } +char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) { + /* Empty string ? */ + if (size == 0) { + return strdup(""); + } + /* Already UTF-8 ? */ + if (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0 || hts_isStringAscii(s, size)) { + return strndup(s, size); + } + /* Find codepage */ + else { + return hts_convertStringToUTF8_(s, size, "utf-8", charset); + } +} + +char *hts_convertStringFromUTF8(const char *s, size_t size, const char *charset) { + /* Empty string ? */ + if (size == 0) { + return strdup(""); + } + /* Already UTF-8 ? */ + if (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0 || hts_isStringAscii(s, size)) { + return strndup(s, size); + } + /* Find codepage */ + else { + return hts_convertStringToUTF8_(s, size, charset, "utf-8"); + } +} + #endif char* hts_getCharsetFromContentType(const char *mime) { diff --git a/src/htscharset.h b/src/htscharset.h index 035f05e..2d76b3c 100644 --- a/src/htscharset.h +++ b/src/htscharset.h @@ -44,6 +44,12 @@ Please visit our Website: http://www.httrack.com extern char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset); /** + * Convert the string "s" from UTF-8 to charset "charset". + * Return NULL upon error. + **/ +extern char *hts_convertStringFromUTF8(const char *s, size_t size, const char *charset); + +/** * Extract the charset from the HTML buffer "html" **/ extern char* hts_getCharsetFromMeta(const char *html, size_t size); diff --git a/src/htscore.c b/src/htscore.c index 8d62df7..de202db 100644 --- a/src/htscore.c +++ b/src/htscore.c @@ -1446,11 +1446,14 @@ int httpmirror(char* url1, httrackp* opt) { /* Detect charset to convert links into proper UTF8 filenames */ page_charset[0] = '\0'; if (opt->convert_utf8) { + /* HTTP charset is prioritary over meta */ if (r.charset[0] != '\0') { if (strlen(r.charset) < sizeof(page_charset)) { strcpy(page_charset, r.charset); } - } else if (is_html_mime_type(r.contenttype)) { + } + /* Attempt to find a meta charset */ + else if (is_html_mime_type(r.contenttype)) { char *const charset = hts_getCharsetFromMeta(r.adr, r.size); if (charset != NULL && strlen(charset) < sizeof(page_charset)) { strcpy(page_charset, charset); @@ -1458,6 +1461,14 @@ int httpmirror(char* url1, httrackp* opt) { if (charset != NULL) free(charset); } + /* Could not detect charset */ + if (page_charset[0] == '\0') { + if ( (opt->debug>0) && (opt->log!=NULL) ) { + HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"Warning: could not detect encoding for: %s%s"LF,urladr,urlfil); + } + /* Fallback to ISO-8859-1 (~== identity) ; accents will look weird */ + strcpy(page_charset, "iso-8859-1"); + } } /* Info for wrappers */ diff --git a/src/htsparse.c b/src/htsparse.c index f127f0d..18059f5 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -54,6 +54,7 @@ Please visit our Website: http://www.httrack.com #include "htsbauth.h" #include "htsmd5.h" #include "htsindex.h" +#include "htscharset.h" /* external modules */ #include "htsmodules.h" @@ -2783,6 +2784,21 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { } } + // convert to local codepage + if (str->page_charset_ != NULL && *str->page_charset_ != '\0') { + char *const local_save = hts_convertStringFromUTF8(tempo, strlen(tempo), str->page_charset_); + if (local_save != NULL) { + strcpybuff(tempo, local_save); + free(local_save); + } else { + if ((opt->debug>1) && (opt->log!=NULL)) { + HTS_LOG(opt,LOG_DEBUG); + fprintf(opt->log, "Warning: could not build local charset representation of '%s' in '%s'"LF, tempo, str->page_charset_); + test_flush; + } + } + } + // put original query string if any (ex: "www.example.com/foo4242.html?q=45) pos = strchr(fil, '?'); if (pos != NULL) { @@ -2872,6 +2888,21 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) { } if ((opt->getmode & 1) && (ptr>0)) { + // convert to local codepage - NOT, already converted into %NN, and passed to the remote server so we do not have anything to do + //if (str->page_charset_ != NULL && *str->page_charset_ != '\0') { + // char *const local_save = hts_convertStringFromUTF8(tempo, strlen(tempo), str->page_charset_); + // if (local_save != NULL) { + // strcpybuff(tempo, local_save); + // free(local_save); + // } else { + // if ((opt->debug>1) && (opt->log!=NULL)) { + // HTS_LOG(opt,LOG_DEBUG); + // fprintf(opt->log, "Warning: could not build local charset representation of '%s' in '%s'"LF, tempo, str->page_charset_); + // test_flush; + // } + // } + //} + // écrire le lien modifié, relatif // Note: escape all chars, even >127 (no UTF) HT_ADD_HTMLESCAPED_FULL(tempo); |