From 2d4f6880c1f9fe436d5dd4286fa584503c18c98d Mon Sep 17 00:00:00 2001
From: Xavier Roche <xroche@users.noreply.github.com>
Date: Sat, 1 Jun 2013 09:39:09 +0000
Subject: Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14)
 related to the way non-ascii characters are being decoded
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rationale:
  * inside URI
    * non-ascii characters are read with the page encoding, and transformed into UTF-8
    * url-escaped %xx are considered utf-8 sequences to be decoded, unless they form invalid sequences (in which case we leave them as-is)
    * html entities (names, or decimal/hex) are decoded as utf-8 characters
  * inside query string
    * non-ascii characters are read as binary, and escaped using %xx
    * url-escaped %xx are left as-is, unless they are harmless (alphanum, for example)
    * html entities (names, or decimal/hex) are decoded as utf-8 characters and encoded back to the page encoding (possibly using %xx)
  * inside hostnames
    * non-ascii characters are encoded using IDNA

Example:
  * are equivalent in an iso-8859-1 page: http://foo/café.html http://foo/caf%c3%a9.html http://foo/caf&#xe9;.html
---
 src/htsparse.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'src/htsparse.c')

diff --git a/src/htsparse.c b/src/htsparse.c
index 1619041..b6aa3b5 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -2109,15 +2109,12 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                     query[0] = '\0';
                   }
 
-                  // décoder l'inutile (%2E par exemple) et coder espaces
-                  // Unescape high-chars for UTF-8 conversion
-                  strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset));     /* note: '%' is still escaped */
+                  // Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8)
+                  strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1));     /* note: '%' is still escaped */
+
+                  // Force to encode non-printable chars (should never happen)
                   escape_remove_control(lien);
                   
-                  // we need to encode query string non-ascii chars, 
-                  // leaving the encoding as-is (unlike the file part)
-                  escape_check_url(query);
-
                   // charset conversion for the URI filename, 
                   // and not already UTF-8
                   // (note: not for the query string!)
@@ -2148,6 +2145,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                     }
                   }
 
+                  // Decode remaining %XX high characters with UTF-8 
+                  // but only when this leads to valid UTF-8.
+                  // Otherwise, leave them as-is (still %XX-escaped).
+                  if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) {
+                    strcpybuff(lien, catbuff);
+                  } else {
+                    hts_log_print(opt, LOG_WARNING,
+                      "could not URL-decode string '%s'", lien);
+                  }
+
+                  // we need to encode query string non-ascii chars, 
+                  // leaving the encoding as-is (unlike the file part)
+                  escape_check_url(query);
+
                   // copy back query
                   strcatbuff(lien, query);      /* restore */
                 }
-- 
cgit v1.2.3