summaryrefslogtreecommitdiff
path: root/src/htsparse.c
diff options
context:
space:
mode:
authorXavier Roche <xroche@users.noreply.github.com>2013-06-01 09:39:09 +0000
committerXavier Roche <xroche@users.noreply.github.com>2013-06-01 09:39:09 +0000
commit2d4f6880c1f9fe436d5dd4286fa584503c18c98d (patch)
tree07fdf1a70871e323c8e280c313449146ae9865c3 /src/htsparse.c
parent7b5c1c5a8487fe9dfcd2799359a5395ccf797372 (diff)
Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14) related to the way non-ascii characters are being decoded
Rationale: * inside the URI * non-ASCII characters are read with the page encoding, and transformed into UTF-8 * url-escaped %xx are considered UTF-8 sequences to be decoded, unless they form invalid sequences (in which case we leave them as-is) * HTML entities (names, or decimal/hex) are decoded as UTF-8 characters * inside the query string * non-ASCII characters are read as binary, and escaped using %xx * url-escaped %xx are left as-is unless they are harmless to decode (alphanumerics, for example) * HTML entities (names, or decimal/hex) are decoded as UTF-8 characters and encoded back to the page encoding (possibly using %xx) * inside hostnames * non-ASCII characters are encoded using IDNA Example: * the following are equivalent in an ISO-8859-1 page: http://foo/café.html http://foo/caf%c3%a9.html http://foo/caf&#xe9;.html
Diffstat (limited to 'src/htsparse.c')
-rw-r--r--src/htsparse.c25
1 files changed, 18 insertions, 7 deletions
diff --git a/src/htsparse.c b/src/htsparse.c
index 1619041..b6aa3b5 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -2109,15 +2109,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
query[0] = '\0';
}
- // décoder l'inutile (%2E par exemple) et coder espaces
- // Unescape high-chars for UTF-8 conversion
- strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset)); /* note: '%' is still escaped */
+ // Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8)
+ strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1)); /* note: '%' is still escaped */
+
+ // Force encoding of non-printable chars (should never happen)
escape_remove_control(lien);
- // we need to encode query string non-ascii chars,
- // leaving the encoding as-is (unlike the file part)
- escape_check_url(query);
-
// charset conversion for the URI filename,
// and not already UTF-8
// (note: not for the query string!)
@@ -2148,6 +2145,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
}
}
+ // Decode remaining %XX high characters with UTF-8
+ // but only when this leads to valid UTF-8.
+ // Otherwise, leave them as-is (still %XX-escaped).
+ if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) {
+ strcpybuff(lien, catbuff);
+ } else {
+ hts_log_print(opt, LOG_WARNING,
+ "could not URL-decode string '%s'", lien);
+ }
+
+ // we need to encode query string non-ascii chars,
+ // leaving the encoding as-is (unlike the file part)
+ escape_check_url(query);
+
// copy back query
strcatbuff(lien, query); /* restore */
}