5 files changed, 80 insertions, 18 deletions
diff --git a/src/htscoremain.c b/src/htscoremain.c
index 534c469..c2ff520 100644
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -2342,10 +2342,13 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) {
                 htsmain_free();
                 return 0;
                 break;
-              case '6':  // entities: httrack -#6
+              case '6':  // entities: httrack -#6 "&foo;" ["encoding"]
                 if (++na < argc) {
                   char *const s = strdup(argv[na]);
-                  if (s != NULL && hts_unescape_entities(s, s, strlen(s)) == 0) {
+                  const char *const enc = na + 1 < argc ? argv[na + 1] : "UTF-8";
+                  if (s != NULL  /* size includes the '\0', as in unescape_amp() */
+                    && hts_unescapeEntitiesWithCharset(s, s, strlen(s) + 1,
+                                                       enc) == 0) {
                     printf("%s\n", s);
                     free(s);
                   } else {
diff --git a/src/htsencoding.c b/src/htsencoding.c
index 46c57a4..0fa21fc 100644
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -60,7 +60,7 @@ static int get_hex_value(char c) {
     (HASH) += (C);                              \
   } while(0)
 
-int hts_unescape_entities(const char *src, char *dest, const size_t max) {
+int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
   size_t i, j, ampStart, ampStartDest;
   int uc;
   int hex;
@@ -106,8 +106,29 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) {
         
         /* success ? */
         if (uc > 0) {
+          const size_t maxOut = max - ampStartDest;
           /* write at position */
-          len = hts_writeUTF8(uc, &dest[ampStartDest], max - ampStartDest);
+          if (charset != NULL && hts_isCharsetUTF8(charset)) {
+            len = hts_writeUTF8(uc, &dest[ampStartDest], maxOut);
+          } else {  /* encode as UTF-8 first, then convert to 'charset' */
+            size_t ulen;
+            char buffer[32];
+            len = 0;  /* stays 0 (nothing written) if the conversion fails */
+            if ( ( ulen = hts_writeUTF8(uc, buffer, sizeof(buffer) - 1) ) != 0) {
+              char *s;
+              buffer[ulen] = '\0';  /* in bounds: one byte was reserved */
+              s = hts_convertStringFromUTF8(buffer, ulen, charset);
+              if (s != NULL) {
+                const size_t sLen = strlen(s);
+                if (sLen < maxOut) {
+                  /* do not copy \0; report the converted length in len */
+                  memcpy(&dest[ampStartDest], s, sLen);
+                  len = sLen;
+                }
+                free(s);
+              }
+            }
+          }
           if (len > 0) {
             /* new dest position */
             j = ampStartDest + len;
@@ -174,3 +195,7 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) {
 
   return 0;
 }
+
+int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
+  return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");
+}
diff --git a/src/htsencoding.h b/src/htsencoding.h
index 4dfd367..cd35a00 100644
--- a/src/htsencoding.h
+++ b/src/htsencoding.h
@@ -31,8 +31,8 @@ Please visit our Website: http://www.httrack.com
 /* Author: Xavier Roche                                         */
 /* ------------------------------------------------------------ */
 
-#ifndef HTS_CHARSET_DEFH
-#define HTS_CHARSET_DEFH
+#ifndef HTS_ENCODING_DEFH
+#define HTS_ENCODING_DEFH
 
 /** Standard includes. **/
 #include <stdlib.h>
@@ -48,8 +48,19 @@ Please visit our Website: http://www.httrack.com
  * needs to hold as space as the source.
  * Returns 0 upon success.
  **/
-extern int hts_unescape_entities(const char *src,
-                                 char *dest, const size_t max);
+extern int hts_unescapeEntities(const char *src,
+                                char *dest, const size_t max);
+
+/**
+ * Unescape HTML entities (as per HTML 4.0 Specification)
+ * and replace them in-place by their equivalents in the given charset.
+ * Note: source and destination may be the same, and the destination only
+ * needs to hold as much space as the source.
+ * Returns 0 upon success.
+ **/
+extern int hts_unescapeEntitiesWithCharset(const char *src,
+                                           char *dest, const size_t max,
+                                           const char *charset);
 
 #endif
 
diff --git a/src/htslib.c b/src/htslib.c
index 8b53b88..f74efe1 100644
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -3753,7 +3753,7 @@ static int ehex(const char *s) {
 }
 
 void unescape_amp(char *s) {
-  if (hts_unescape_entities(s, s, strlen(s) + 1) != 0) {
+  if (hts_unescapeEntities(s, s, strlen(s) + 1) != 0) {
     assertf(! "error escaping html entities");
   }
 }
diff --git a/src/htsparse.c b/src/htsparse.c
index 52445b3..caace62 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -51,6 +51,7 @@ Please visit our Website: http://www.httrack.com
 #include "htsmd5.h"
 #include "htsindex.h"
 #include "htscharset.h"
+#include "htsencoding.h"
 
 /* external modules */
 #include "htsmodules.h"
@@ -2081,25 +2082,31 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
 
                 /* Unescape/escape %20 and other &nbsp; */
                 {
+                  // Note: hasCharset is always true (iso-8859-1 is the default)
                   const char *const charset = str->page_charset_;
                   const int hasCharset = charset != NULL 
                     && *charset != '\0';
                   char BIGSTK query[HTS_URLMAXSIZE * 2];
-                  char *a = strchr(lien, '?');
+                  char *const a = strchr(lien, '?');
 
-                  if (a) {
+                  // cut query string
+                  if (a != NULL) {
                     strcpybuff(query, a);
                     *a = '\0';
-                  } else
+                  } else {
                     query[0] = '\0';
+                  }
+
                   // décoder l'inutile (%2E par exemple) et coder espaces
                   // Unescape high-chars for UTF-8 conversion
                   strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset));     /* note: '%' is still escaped */
                   escape_remove_control(lien);
-                  // ???? No! escape_spc_url(lien);
-                  strcatbuff(lien, query);      /* restore */
+                  
+                  // we need to encode query string non-ascii chars, 
+                  // leaving the encoding as-is (unlike the file part)
+                  escape_check_url(query);
 
-                  // Charset conversion for the URI filename, 
+                  // charset conversion for the URI filename, 
                   // and not already UTF-8
                   // (note: not for the query string!)
                   if (hasCharset && !hts_isCharsetUTF8(charset)) {
@@ -2112,9 +2119,25 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                       free(s);
                     }
                   }
-                  // conversion entities
-                  unescape_amp(lien);
-                  unescape_amp(query);
+
+                  // decode URI entities with UTF-8 charset (returns 0 on success)
+                  if (hts_unescapeEntities(lien, lien, strlen(lien) + 1) != 0) {
+                    hts_log_print(opt, LOG_WARNING,
+                      "could not decode URI '%s' with charset '%s'", lien, "UTF-8");
+                  }
+
+                  // decode query string entities with page charset
+                  // (returns 0 on success; size includes the '\0', as in unescape_amp)
+                  if (hasCharset
+                      && hts_unescapeEntitiesWithCharset(query, query,
+                                                         strlen(query) + 1,
+                                                         charset) != 0) {
+                    hts_log_print(opt, LOG_WARNING,
+                      "could not decode query string '%s' with charset '%s'", query, charset);
+                  }
+
+                  // copy back query
+                  strcatbuff(lien, query);      /* restore */
                 }
 
                 // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance!