From bc31ec0da9573d482de24f27241482f50e46e60c Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Fri, 31 May 2013 11:38:53 +0000 Subject: Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14) Rationale: * hostname is ASCII, non-ascii characters shall be encoded with IDNA * URI filenames may embed non-ascii characters, which MUST be UTF-8 encoded * query string may embed non-ascii characters, which are encoded with the page charset into %xx codes --- src/htsencoding.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'src/htsencoding.c') diff --git a/src/htsencoding.c b/src/htsencoding.c index 46c57a4..0fa21fc 100644 --- a/src/htsencoding.c +++ b/src/htsencoding.c @@ -60,7 +60,7 @@ static int get_hex_value(char c) { (HASH) += (C); \ } while(0) -int hts_unescape_entities(const char *src, char *dest, const size_t max) { +int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) { size_t i, j, ampStart, ampStartDest; int uc; int hex; @@ -106,8 +106,29 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) { /* success ? */ if (uc > 0) { + const size_t maxOut = max - ampStartDest; /* write at position */ - len = hts_writeUTF8(uc, &dest[ampStartDest], max - ampStartDest); + if (charset != NULL && hts_isCharsetUTF8(charset)) { + len = hts_writeUTF8(uc, &dest[ampStartDest], maxOut); + } else { + size_t ulen; + char buffer[32]; + len = 0; + if ( ( ulen = hts_writeUTF8(uc, buffer, sizeof(buffer)) ) != 0) { + char *s; + buffer[ulen] = '\0'; + s = hts_convertStringFromUTF8(buffer, strlen(buffer), charset); + if (s != NULL) { + const size_t sLen = strlen(s); + if (sLen < maxOut) { + // Do not copy \0. 
+ memcpy(&dest[ampStartDest], s, sLen); + ulen = sLen; + } + free(s); + } + } + } if (len > 0) { /* new dest position */ j = ampStartDest + len; @@ -174,3 +195,7 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) { return 0; } + +int hts_unescapeEntities(const char *src, char *dest, const size_t max) { + return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8"); +} -- cgit v1.2.3