From 2d4f6880c1f9fe436d5dd4286fa584503c18c98d Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sat, 1 Jun 2013 09:39:09 +0000 Subject: Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14) related to the way non-ascii characters are being decoded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rationale: * inside URI * non-ascii characters are read with the page encoding, and transformed into UTF-8 * url-escaped %xx are considered utf-8 sequences to be decoded, unless they form invalid sequences (in which case we leave them as-is) * html entities (names, or decimal/hex) are decoded as utf-8 characters * inside query string * non-ascii characters are read as binary, and escaped using %xx * url-escaped %xx are left as-is unless decoding them is harmless (alphanum, for example) * html entities (names, or decimal/hex) are decoded as utf-8 characters and encoded back to the page encoding (possibly using %xx) * inside hostnames * non-ascii characters are encoded using IDNA Example: * are equivalent in an iso-8859-1 page: http://foo/café.html http://foo/caf%c3%a9.html http://foo/caf&#xe9;.html --- src/htsparse.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'src/htsparse.c') diff --git a/src/htsparse.c b/src/htsparse.c index 1619041..b6aa3b5 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -2109,15 +2109,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { query[0] = '\0'; } - // décoder l'inutile (%2E par exemple) et coder espaces - // Unescape high-chars for UTF-8 conversion - strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset)); /* note: '%' is still escaped */ + // Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8) + strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1)); /* note: '%' is still escaped */ + + // Force to encode non-printable chars (should never happen) escape_remove_control(lien); - // we need to encode query string 
non-ascii chars, - // leaving the encoding as-is (unlike the file part) - escape_check_url(query); - // charset conversion for the URI filename, // and not already UTF-8 // (note: not for the query string!) @@ -2148,6 +2145,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { } } + // Decode remaining %XX high characters with UTF-8 + // but only when this leads to valid UTF-8. + // Otherwise, leave them escaped (as-is). + if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) { + strcpybuff(lien, catbuff); + } else { + hts_log_print(opt, LOG_WARNING, + "could not URL-decode string '%s'", lien); + } + + // we need to encode query string non-ascii chars, + // leaving the encoding as-is (unlike the file part) + escape_check_url(query); + // copy back query strcatbuff(lien, query); /* restore */ } -- cgit v1.2.3