Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14)

Rationale:
* the hostname is ASCII; non-ASCII characters shall be encoded with IDNA
* URI filenames may embed non-ASCII characters, which MUST be UTF-8 encoded
* the query string may embed non-ASCII characters, which are encoded with the page charset into %xx codes
author: Xavier Roche <xroche@users.noreply.github.com> 2013-05-31 11:38:53 +0000
committer: Xavier Roche <xroche@users.noreply.github.com> 2013-05-31 11:38:53 +0000
commit: bc31ec0da9573d482de24f27241482f50e46e60c (patch)
tree: e5e80dd055b2e4790802728d4e3b4b5b8c361277 /src/htsparse.c
parent: 8767fd0e750b70a121d95e3ecf7e59bcec499d95 (diff)
1 files changed, 32 insertions, 9 deletions
diff --git a/src/htsparse.c b/src/htsparse.c
index 52445b3..caace62 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -51,6 +51,7 @@ Please visit our Website: http://www.httrack.com
 #include "htsmd5.h"
 #include "htsindex.h"
 #include "htscharset.h"
+#include "htsencoding.h"
 
 /* external modules */
 #include "htsmodules.h"
@@ -2081,25 +2082,31 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
 
                 /* Unescape/escape %20 and other &nbsp; */
                 {
+                  // Note: always true (iso-8859-1 as default)
                   const char *const charset = str->page_charset_;
                   const int hasCharset = charset != NULL 
                     && *charset != '\0';
                   char BIGSTK query[HTS_URLMAXSIZE * 2];
-                  char *a = strchr(lien, '?');
+                  char *const a = strchr(lien, '?');
 
-                  if (a) {
+                  // cut query string
+                  if (a != NULL) {
                     strcpybuff(query, a);
                     *a = '\0';
-                  } else
+                  } else {
                     query[0] = '\0';
+                  }
+
                   // décoder l'inutile (%2E par exemple) et coder espaces
                   // Unescape high-chars for UTF-8 conversion
                   strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset));     /* note: '%' is still escaped */
                   escape_remove_control(lien);
-                  // ???? No! escape_spc_url(lien);
-                  strcatbuff(lien, query);      /* restore */
+                  
+                  // we need to encode query string non-ascii chars, 
+                  // leaving the encoding as-is (unlike the file part)
+                  escape_check_url(query);
 
-                  // Charset conversion for the URI filename, 
+                  // charset conversion for the URI filename, 
                   // and not already UTF-8
                   // (note: not for the query string!)
                   if (hasCharset && !hts_isCharsetUTF8(charset)) {
@@ -2112,9 +2119,25 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                       free(s);
                     }
                   }
-                  // conversion entities
-                  unescape_amp(lien);
-                  unescape_amp(query);
+
+                  // decode URI entities with UTF-8 charset
+                  if (!hts_unescapeEntities(lien, lien, strlen(lien))) {
+                    hts_log_print(opt, LOG_WARNING,
+                      "could not decode URI '%s' with charset '%s'", lien, charset);
+                  }
+
+                  // decode query string entities with page charset
+                  if (hasCharset) {
+                    if (!hts_unescapeEntitiesWithCharset(query, 
+                                                         query, strlen(query),
+                                                         charset)) {
+                        hts_log_print(opt, LOG_WARNING,
+                          "could not decode query string '%s' with charset '%s'", query, charset);
+                    }
+                  }
+
+                  // copy back query
+                  strcatbuff(lien, query);      /* restore */
                 }
 
                 // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance!
author	Xavier Roche <xroche@users.noreply.github.com>	2013-05-31 11:38:53 +0000
committer	Xavier Roche <xroche@users.noreply.github.com>	2013-05-31 11:38:53 +0000
commit	bc31ec0da9573d482de24f27241482f50e46e60c (patch)
tree	e5e80dd055b2e4790802728d4e3b4b5b8c361277 /src/htsparse.c
parent	8767fd0e750b70a121d95e3ecf7e59bcec499d95 (diff)