diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/htscharset.c | 239 | ||||
-rw-r--r-- | src/htscharset.h | 15 | ||||
-rw-r--r-- | src/htsparse.c | 11 | ||||
-rw-r--r-- | src/htstools.c | 16 |
4 files changed, 271 insertions, 10 deletions
diff --git a/src/htscharset.c b/src/htscharset.c
index ea96eff..48afe47 100644
--- a/src/htscharset.c
+++ b/src/htscharset.c
@@ -33,8 +33,11 @@ Please visit our Website: http://www.httrack.com
 
 #include "htscharset.h"
 #include "htsbase.h"
+#include "punycode.h"
 
-static int hts_isStringAscii(const char *s, size_t size) {
+#include <assert.h>
+
+int hts_isStringAscii(const char *s, size_t size) {
   size_t i;
 
   for(i = 0; i < size; i++) {
@@ -451,8 +454,7 @@ char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) {
     return strdup("");
   }
   /* Already UTF-8 ? */
-  if (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0
-      || hts_isStringAscii(s, size)) {
+  if (hts_isCharsetUTF8(charset) || hts_isStringAscii(s, size)) {
     return strndup(s, size);
   }
   /* Find codepage */
@@ -467,8 +469,7 @@ char *hts_convertStringFromUTF8(const char *s, size_t size, const char *charset)
     return strdup("");
   }
   /* Already UTF-8 ? */
-  if (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0
-      || hts_isStringAscii(s, size)) {
+  if (hts_isCharsetUTF8(charset) || hts_isStringAscii(s, size)) {
     return strndup(s, size);
   }
   /* Find codepage */
@@ -546,6 +547,12 @@ size_t hts_stringLengthUTF8(const char *s) {
   return len;
 }
 
+int hts_isCharsetUTF8(const char *charset) {
+  return charset != NULL
+    && ( strcasecmp(charset, "utf-8") == 0
+         || strcasecmp(charset, "utf8") == 0 );
+}
+
 char *hts_getCharsetFromMeta(const char *html, size_t size) {
   int i;
 
@@ -601,3 +608,225 @@ char *hts_getCharsetFromMeta(const char *html, size_t size) {
   }
   return NULL;
 }
+
+/**
+ * Convert an UTF-8 host or URL string to its IDNA form.
+ *
+ * The input is split on '.', ':', '/', '?' and on the end of the string.
+ * A segment holding at least one byte >= 0x80 is emitted as "xn--" followed
+ * by its Punycode (RFC 3492) encoding; other segments and all separators
+ * are copied unchanged. Invalid UTF-8 sequences are encoded as U+FFFD.
+ *
+ * @param s the string to convert
+ * @param size the number of bytes of s to convert
+ * @return a NUL-terminated string to be released with free(), or NULL upon
+ *         memory allocation or Punycode encoding failure
+ **/
+char *hts_convertStringUTF8ToIDNA(const char *s, size_t size) {
+  char *dest = NULL;
+  punycode_uint *segInt = NULL;
+  size_t capa = 0, destSize = 0;
+  size_t i, startSeg;
+  int nonAsciiFound;
+
+#undef ADD_BYTE
+#undef INCREASE_CAPA
+#undef FREE_BUFFERS
+  /* release both work buffers (free(NULL) is a no-op) */
+#define FREE_BUFFERS() do { \
+  free(segInt); \
+  free(dest); \
+} while(0)
+  /* grow dest; the old block stays owned by dest until realloc() succeeds */
+#define INCREASE_CAPA() do { \
+  char *newDest; \
+  capa = capa < 16 ? 16 : capa << 1; \
+  newDest = realloc(dest, capa); \
+  if (newDest == NULL) { \
+    FREE_BUFFERS(); \
+    return NULL; \
+  } \
+  dest = newDest; \
+} while(0)
+#define ADD_BYTE(C) do { \
+  if (capa == destSize) { \
+    INCREASE_CAPA(); \
+  } \
+  dest[destSize++] = (char) (C); \
+} while(0)
+
+  for(i = startSeg = 0, nonAsciiFound = 0 ; i <= size ; i++) {
+    const unsigned char c = i < size ? (unsigned char) s[i] : 0;
+    /* separator (ending, url segment, scheme, path segment, query string) */
+    if (c == 0 || c == '.' || c == ':' || c == '/' || c == '?') {
+      /* non-empty segment */
+      if (startSeg != i) {
+        /* IDNA ? */
+        if (nonAsciiFound) {
+          const size_t segSize = i - startSeg;
+          const unsigned char *segData = (const unsigned char*) &s[startSeg];
+          size_t j, utfSeq, segOutputSize;
+          punycode_uint output_length;
+          punycode_status status;
+
+          /* IDNA prefix */
+          ADD_BYTE('x');
+          ADD_BYTE('n');
+          ADD_BYTE('-');
+          ADD_BYTE('-');
+
+          /* copy utf-8 to integers (at most one code point per byte) */
+          segInt = malloc(segSize * sizeof(*segInt));
+          if (segInt == NULL) {
+            FREE_BUFFERS();
+            return NULL;
+          }
+          for(j = 0, segOutputSize = 0, utfSeq = (size_t) -1
+            ; j <= segSize ; j++) {
+            /* j == segSize: virtual terminator committing the last sequence */
+            const unsigned char b = j < segSize ? segData[j] : 0;
+
+            /* character start (ascii, or utf-8 leading sequence) */
+            if (HTS_IS_LEADING_UTF8(b)) {
+              /* commit sequence ? */
+              if (utfSeq != (size_t) -1) {
+                /* unicode character */
+                punycode_uint uc = 0;
+
+                /* utf-8 sequence macro */
+#define SEQ_MATCH(FROM, TO) \
+  (utfSeq < j && segData[utfSeq] >= FROM && segData[utfSeq] <= TO \
+  && (uc *= (TO - FROM + 1), \
+      uc += segData[utfSeq] - FROM, \
+      utfSeq++, \
+      1) \
+  )
+                /* decode UTF-8 sequence */
+                if (SEQ_MATCH(0xC2, 0xDF)) {
+                  if (SEQ_MATCH(0x80, 0xBF)) {
+                    uc += 0x0080;
+                  } else {
+                    uc = 0xfffd;  /* replacement character */
+                  }
+                } else if (SEQ_MATCH(0xE0, 0xE0)) {
+                  if (SEQ_MATCH(0xA0, 0xBF) && SEQ_MATCH(0x80, 0xBF)) {
+                    uc += 0x0800;
+                  } else {
+                    uc = 0xfffd;  /* replacement character */
+                  }
+                } else if (SEQ_MATCH(0xE1, 0xEC)) {
+                  if (SEQ_MATCH(0x80, 0xBF) && SEQ_MATCH(0x80, 0xBF)) {
+                    uc += 0x1000;
+                  } else {
+                    uc = 0xfffd;  /* replacement character */
+                  }
+                } else if (SEQ_MATCH(0xED, 0xED)) {
+                  if (SEQ_MATCH(0x80, 0x9F) && SEQ_MATCH(0x80, 0xBF)) {
+                    uc += 0xD000;
+                  } else {
+                    uc = 0xfffd;  /* replacement character */
+                  }
+                } else if (SEQ_MATCH(0xEE, 0xEF)) {
+                  if (SEQ_MATCH(0x80, 0xBF) && SEQ_MATCH(0x80, 0xBF)) {
+                    uc += 0xE000;
+                  } else {
+                    uc = 0xfffd;  /* replacement character */
+                  }
+                } else if (SEQ_MATCH(0xF0, 0xF0)) {
+                  if (SEQ_MATCH(0x90, 0xBF) && SEQ_MATCH(0x80, 0xBF)
+                    && SEQ_MATCH(0x80, 0xBF)) {
+                    uc += 0x10000;
+                  } else {
+                    uc = 0xfffd;  /* replacement character */
+                  }
+                } else if (SEQ_MATCH(0xF1, 0xF3)) {
+                  if (SEQ_MATCH(0x80, 0xBF) && SEQ_MATCH(0x80, 0xBF)
+                    && SEQ_MATCH(0x80, 0xBF)) {
+                    uc += 0x40000;
+                  } else {
+                    uc = 0xfffd;  /* replacement character */
+                  }
+                } else if (SEQ_MATCH(0xF4, 0xF4)) {
+                  if (SEQ_MATCH(0x80, 0x8F) && SEQ_MATCH(0x80, 0xBF)
+                    && SEQ_MATCH(0x80, 0xBF)) {
+                    uc += 0x100000;
+                  } else {
+                    uc = 0xfffd;  /* replacement character */
+                  }
+                } else {
+                  uc = 0xfffd;  /* replacement character */
+                }
+#undef SEQ_MATCH
+
+                /* copy character */
+                assert(segOutputSize < segSize);
+                segInt[segOutputSize++] = uc;
+
+                /* not anymore in sequence */
+                utfSeq = (size_t) -1;
+              }
+
+              /* virtual terminator: nothing to store, segInt may be full */
+              if (j == segSize) {
+                break;
+              }
+              /* ascii ? */
+              if (b < 0x80) {
+                assert(segOutputSize < segSize);
+                segInt[segOutputSize++] = b;
+              }
+              /* new UTF8 sequence */
+              else {
+                utfSeq = j;
+              }
+            }
+          }
+
+          /* encode; status must receive the punycode_encode() result itself,
+             hence the parentheses around the assignment */
+          output_length = (punycode_uint) ( capa - destSize );
+          while((status = punycode_encode((punycode_uint) segOutputSize,
+            segInt, NULL, &output_length, &dest[destSize]))
+            == punycode_big_output) {
+            INCREASE_CAPA();
+            output_length = (punycode_uint) ( capa - destSize );
+          }
+          free(segInt);
+          segInt = NULL;
+
+          /* success ? */
+          if (status == punycode_success) {
+            destSize += output_length;
+          } else {
+            /* never return a bare "xn--" label */
+            FREE_BUFFERS();
+            return NULL;
+          }
+        }
+        /* copy ascii segment otherwise */
+        else {
+          size_t j;
+          for(j = startSeg ; j < i ; j++) {
+            const unsigned char b = (unsigned char) s[j];
+            ADD_BYTE(b);
+          }
+        }
+      }
+      /* next segment start */
+      startSeg = i + 1;
+      nonAsciiFound = 0;
+      /* add separator (including terminating \0) */
+      ADD_BYTE(c);
+    }
+    /* found non-ascii */
+    else if (c >= 0x80) {
+      nonAsciiFound = 1;
+    }
+  }
+
+#undef ADD_BYTE
+#undef INCREASE_CAPA
+#undef FREE_BUFFERS
+
+  return dest;
+}
diff --git a/src/htscharset.h b/src/htscharset.h
index 2b9238a..cee644c 100644
--- a/src/htscharset.h
+++ b/src/htscharset.h
@@ -64,6 +64,16 @@ extern char *hts_convertStringFromUTF8(const char *s, size_t size,
 extern char *hts_getCharsetFromMeta(const char *html, size_t size);
 
 /**
+ * Is the given string an ASCII string ?
+ **/
+extern int hts_isStringAscii(const char *s, size_t size);
+
+/**
+ * Is the given charset the UTF-8 charset ?
+ **/
+extern int hts_isCharsetUTF8(const char *charset);
+
+/**
  * Get an UTF-8 string length in characters.
  **/
 extern size_t hts_stringLengthUTF8(const char *s);
@@ -85,6 +95,11 @@ extern char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize);
  **/
 extern char *hts_convertStringSystemToUTF8(const char *s, size_t size);
 
+/**
+ * Convert an UTF-8 string to an IDNA (RFC 3492) string.
+ **/
+extern char *hts_convertStringUTF8ToIDNA(const char *s, size_t size);
+
 #endif
 
 #endif
diff --git a/src/htsparse.c b/src/htsparse.c
index fc3b6e1..73f66ac 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -2103,14 +2103,15 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                     // ???? No! escape_spc_url(lien);
                     strcatbuff(lien, query); /* restore */
 
-                    // Charset conversion for the URI filename
-                    // (not for the query string!)
-                    if (hasCharset) {
+                    // Charset conversion for the URI filename,
+                    // and not already UTF-8
+                    // (note: not for the query string!)
+                    if (hasCharset && !hts_isCharsetUTF8(charset)) {
                       char *const s = hts_convertStringToUTF8(lien, (int) strlen(lien), charset);
                       if (s != NULL) {
                         hts_log_print(opt, LOG_DEBUG,
-                          "engine: save-name: charset conversion from '%s' to '%s' using charset '%s'",
-                          lien, s, charset);
+                          "engine: save-name: '%s' charset conversion from '%s' to '%s'",
+                          charset, lien, s);
                         strcpybuff(lien, s);
                         free(s);
                       }
diff --git a/src/htstools.c b/src/htstools.c
index 885bdc8..3a5ca70 100644
--- a/src/htstools.c
+++ b/src/htstools.c
@@ -40,6 +40,7 @@ Please visit our Website: http://www.httrack.com
 #include "htscore.h"
 #include "htstools.h"
 #include "htsstrings.h"
+#include "htscharset.h"
 #ifdef _WIN32
 #include "windows.h"
 #else
@@ -276,6 +277,21 @@ int ident_url_relatif(const char *lien, const char *origin_adr,
     }
   }
 
+  // IDNA / RFC 3492 (Punycode) handling for HTTP(s)
+  if (!link_has_authority(adr) || strfield(adr, "https:")) {
+    char *const a = jump_identification(adr);
+    // Non-ASCII characters (theorically forbidden, but browsers are lenient)
+    if (!hts_isStringAscii(a, strlen(a))) {
+      char *const idna = hts_convertStringUTF8ToIDNA(a, strlen(a));
+      if (idna != NULL) {
+        if (strlen(idna) < HTS_URLMAXSIZE) {
+          strcpybuff(a, idna);
+        }
+        free(idna);
+      }
+    }
+  }
+
   return ok;
 }
 