diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-06-01 09:39:09 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-06-01 09:39:09 +0000 |
commit | 2d4f6880c1f9fe436d5dd4286fa584503c18c98d (patch) | |
tree | 07fdf1a70871e323c8e280c313449146ae9865c3 /src | |
parent | 7b5c1c5a8487fe9dfcd2799359a5395ccf797372 (diff) |
Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14) related to the way non-ascii characters are being decoded
Rationale:
* inside URI
* non-ascii characters are read with the page encoding, and transformed into UTF-8
* url-escaped %xx are considered UTF-8 sequences to be decoded, unless they form invalid sequences (in which case we leave them as-is)
* html entities (names, or decimal/hex) are decoded as utf-8 characters
* inside query string
* non-ascii characters are read as binary, and escaped using %xx
* url-escaped %xx are left as-is, unless decoding them is harmless (alphanumeric characters, for example)
* html entities (names, or decimal/hex) are decoded as utf-8 characters and encoded back to the page encoding (possibly using %xx)
* inside hostnames
* non-ascii characters are encoded using IDNA
Example:
* the following are equivalent in an ISO-8859-1 page: http://foo/café.html http://foo/caf%c3%a9.html http://foo/caf&#xe9;.html
Diffstat (limited to 'src')
-rw-r--r-- | src/htsparse.c | 25 |
1 files changed, 18 insertions, 7 deletions
diff --git a/src/htsparse.c b/src/htsparse.c index 1619041..b6aa3b5 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -2109,15 +2109,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { query[0] = '\0'; } - // décoder l'inutile (%2E par exemple) et coder espaces - // Unescape high-chars for UTF-8 conversion - strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset)); /* note: '%' is still escaped */ + // Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8) + strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1)); /* note: '%' is still escaped */ + + // Force to encode non-printable chars (should never happend) escape_remove_control(lien); - // we need to encode query string non-ascii chars, - // leaving the encoding as-is (unlike the file part) - escape_check_url(query); - // charset conversion for the URI filename, // and not already UTF-8 // (note: not for the query string!) @@ -2148,6 +2145,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { } } + // Decode remaining %XX high characters with UTF-8 + // but only when this leads to valid UTF-8. + // Otherwise, leave them unescaped. + if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) { + strcpybuff(lien, catbuff); + } else { + hts_log_print(opt, LOG_WARNING, + "could not URL-decode string '%s'", lien); + } + + // we need to encode query string non-ascii chars, + // leaving the encoding as-is (unlike the file part) + escape_check_url(query); + // copy back query strcatbuff(lien, query); /* restore */ } |