4 files changed, 236 insertions, 10 deletions
diff --git a/src/htscharset.c b/src/htscharset.c
index ea96eff..48afe47 100644
--- a/src/htscharset.c
+++ b/src/htscharset.c
@@ -33,8 +33,11 @@ Please visit our Website: http://www.httrack.com
 
 #include "htscharset.h"
 #include "htsbase.h"
+#include "punycode.h"
 
-static int hts_isStringAscii(const char *s, size_t size) {
+#include <assert.h>
+
+int hts_isStringAscii(const char *s, size_t size) {
   size_t i;
 
   for(i = 0; i < size; i++) {
@@ -451,8 +454,7 @@ char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) {
     return strdup("");
   }
   /* Already UTF-8 ? */
-  if (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0
-      || hts_isStringAscii(s, size)) {
+  if (hts_isCharsetUTF8(charset) || hts_isStringAscii(s, size)) {
     return strndup(s, size);
   }
   /* Find codepage */
@@ -467,8 +469,7 @@ char *hts_convertStringFromUTF8(const char *s, size_t size, const char *charset)
     return strdup("");
   }
   /* Already UTF-8 ? */
-  if (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0
-      || hts_isStringAscii(s, size)) {
+  if (hts_isCharsetUTF8(charset) || hts_isStringAscii(s, size)) {
     return strndup(s, size);
   }
   /* Find codepage */
@@ -546,6 +547,12 @@ size_t hts_stringLengthUTF8(const char *s) {
   return len;
 }
 
+/* Returns 1 if charset names UTF-8 ("utf-8"/"utf8", any case), else 0 (NULL too). */
+int hts_isCharsetUTF8(const char *charset) {
+  return charset == NULL ? 0
+    : (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0);
+}
+
 char *hts_getCharsetFromMeta(const char *html, size_t size) {
   int i;
 
@@ -601,3 +608,190 @@ char *hts_getCharsetFromMeta(const char *html, size_t size) {
   }
   return NULL;
 }
+
+/* Convert the UTF-8 string s (size bytes, no terminator needed) into its IDNA
+   (RFC 3492) form.
+   The input is split on '.', ':', '/', '?' and NUL; a segment holding
+   non-ASCII bytes becomes "xn--" + Punycode, any other one is copied as is.
+   Malformed UTF-8 sequences are decoded as U+FFFD; stray continuation bytes
+   are dropped.
+   Returns a NUL-terminated malloc'ed string to be free()'d by the caller, or
+   NULL if memory runs out or a segment can not be encoded. */
+char *hts_convertStringUTF8ToIDNA(const char *s, size_t size) {
+  char *dest = NULL;
+  punycode_uint *segInt = NULL;         /* code points of current segment */
+  size_t capa = 0, destSize = 0;
+  size_t i, startSeg;
+  int nonAsciiFound;
+
+#undef ADD_BYTE
+#undef INCREASE_CAPA
+/* Grow dest. realloc()'s result goes through a temporary so that a failure
+   can still release the old buffer (and the segment scratch) before bailing. */
+#define INCREASE_CAPA() do { \
+  char *grown_; \
+  capa = capa < 16 ? 16 : capa << 1; \
+  grown_ = realloc(dest, capa); \
+  if (grown_ == NULL) { \
+    free(segInt); \
+    free(dest); \
+    return NULL; \
+  } \
+  dest = grown_; \
+} while(0)
+#define ADD_BYTE(C) do { \
+  if (capa == destSize) { \
+    INCREASE_CAPA(); \
+  } \
+  dest[destSize++] = (char) (C); \
+} while(0)
+
+  for(i = startSeg = 0, nonAsciiFound = 0 ; i <= size ; i++) {
+    const unsigned char c = i < size ? (unsigned char) s[i] : 0;
+    /* separator (ending, url segment, scheme, path segment, query string) */
+    if (c == 0 || c == '.' || c == ':' || c == '/' || c == '?') {
+      /* non-empty segment */
+      if (startSeg != i) {
+        /* IDNA ? */
+        if (nonAsciiFound) {
+          const size_t segSize = i - startSeg;
+          const unsigned char *segData = (const unsigned char*) &s[startSeg];
+          size_t j, utfSeq, segOutputSize;
+          punycode_uint output_length;
+          punycode_status status;
+
+          /* IDNA prefix */
+          ADD_BYTE('x');
+          ADD_BYTE('n');
+          ADD_BYTE('-');
+          ADD_BYTE('-');
+
+          /* copy utf-8 to integers (at most one code point per byte) */
+          segInt = malloc(segSize * sizeof(*segInt));
+          if (segInt == NULL) {
+            free(dest);
+            return NULL;
+          }
+          for(j = 0, segOutputSize = 0, utfSeq = (size_t) -1
+            ; j <= segSize ; j++) {
+            /* the virtual trailing 0 only flushes the last sequence */
+            const unsigned char b = j < segSize ? segData[j] : 0;
+
+            /* character start (ascii, or utf-8 leading sequence) */
+            if (HTS_IS_LEADING_UTF8(b)) {
+              /* commit sequence ? */
+              if (utfSeq != (size_t) -1) {
+                punycode_uint uc = 0;     /* payload bits decoded so far */
+                punycode_uint base = 0;   /* first code point of the range */
+                int valid;
+
+                /* utf-8 sequence macro: consume one byte within [FROM, TO] */
+#define SEQ_MATCH(FROM, TO) \
+  (utfSeq < j && segData[utfSeq] >= (FROM) && segData[utfSeq] <= (TO) \
+    && (uc *= ((TO) - (FROM) + 1), \
+        uc += segData[utfSeq] - (FROM), \
+        utfSeq++, \
+        1) \
+  )
+                /* decode UTF-8 sequence */
+                if (SEQ_MATCH(0xC2, 0xDF)) {
+                  valid = SEQ_MATCH(0x80, 0xBF);
+                  base = 0x0080;
+                } else if (SEQ_MATCH(0xE0, 0xE0)) {
+                  valid = SEQ_MATCH(0xA0, 0xBF) && SEQ_MATCH(0x80, 0xBF);
+                  base = 0x0800;
+                } else if (SEQ_MATCH(0xE1, 0xEC)) {
+                  valid = SEQ_MATCH(0x80, 0xBF) && SEQ_MATCH(0x80, 0xBF);
+                  base = 0x1000;
+                } else if (SEQ_MATCH(0xED, 0xED)) {
+                  valid = SEQ_MATCH(0x80, 0x9F) && SEQ_MATCH(0x80, 0xBF);
+                  base = 0xD000;
+                } else if (SEQ_MATCH(0xEE, 0xEF)) {
+                  valid = SEQ_MATCH(0x80, 0xBF) && SEQ_MATCH(0x80, 0xBF);
+                  base = 0xE000;
+                } else if (SEQ_MATCH(0xF0, 0xF0)) {
+                  valid = SEQ_MATCH(0x90, 0xBF) && SEQ_MATCH(0x80, 0xBF)
+                    && SEQ_MATCH(0x80, 0xBF);
+                  base = 0x10000;
+                } else if (SEQ_MATCH(0xF1, 0xF3)) {
+                  valid = SEQ_MATCH(0x80, 0xBF) && SEQ_MATCH(0x80, 0xBF)
+                    && SEQ_MATCH(0x80, 0xBF);
+                  base = 0x40000;
+                } else if (SEQ_MATCH(0xF4, 0xF4)) {
+                  valid = SEQ_MATCH(0x80, 0x8F) && SEQ_MATCH(0x80, 0xBF)
+                    && SEQ_MATCH(0x80, 0xBF);
+                  base = 0x100000;
+                } else {
+                  valid = 0;
+                }
+#undef SEQ_MATCH
+
+                /* copy character, or the replacement character if invalid */
+                assert(segOutputSize < segSize);
+                segInt[segOutputSize++] = valid ? uc + base : 0xfffd;
+
+                /* not anymore in sequence */
+                utfSeq = (size_t) -1;
+              }
+
+              /* ascii ? the virtual terminator is never stored: segInt has
+                 no room for it when every byte produced a code point */
+              if (b < 0x80) {
+                if (b != 0) {
+                  assert(segOutputSize < segSize);
+                  segInt[segOutputSize++] = b;
+                }
+              }
+              /* new UTF8 sequence */
+              else {
+                utfSeq = j;
+              }
+            }
+          }
+
+          /* encode; status must take the call's result, hence the inner
+             parentheses ('==' binds tighter than '=') */
+          output_length = (punycode_uint) ( capa - destSize );
+          while((status = punycode_encode((punycode_uint) segOutputSize,
+            segInt, NULL, &output_length, &dest[destSize]))
+            == punycode_big_output) {
+              INCREASE_CAPA();
+              output_length = (punycode_uint) ( capa - destSize );
+          }
+
+          /* scratch buffer no longer needed */
+          free(segInt);
+          segInt = NULL;
+
+          /* failure: give up rather than emit a bare "xn--" label */
+          if (status != punycode_success) {
+            free(dest);
+            return NULL;
+          }
+          destSize += output_length;
+        }
+        /* copy ascii segment otherwise */
+        else {
+          size_t j;
+          for(j = startSeg ; j < i ; j++) {
+            ADD_BYTE(s[j]);
+          }
+        }
+      }
+      /* next segment start */
+      startSeg = i + 1;
+      nonAsciiFound = 0;
+      /* add separator (including terminating \0) */
+      ADD_BYTE(c);
+    }
+    /* found non-ascii */
+    else if (c >= 0x80) {
+      nonAsciiFound = 1;
+    }
+  }
+
+#undef ADD_BYTE
+#undef INCREASE_CAPA
+
+  return dest;
+}
diff --git a/src/htscharset.h b/src/htscharset.h
index 2b9238a..cee644c 100644
--- a/src/htscharset.h
+++ b/src/htscharset.h
@@ -64,6 +64,16 @@ extern char *hts_convertStringFromUTF8(const char *s, size_t size,
 extern char *hts_getCharsetFromMeta(const char *html, size_t size);
 
 /**
+ * Is the given string an ASCII string ?
+ **/
+extern int hts_isStringAscii(const char *s, size_t size);
+
+/**
+ * Is the given charset the UTF-8 charset ?
+ **/
+extern int hts_isCharsetUTF8(const char *charset);
+
+/**
  * Get an UTF-8 string length in characters.
  **/
 extern size_t hts_stringLengthUTF8(const char *s);
@@ -85,6 +95,11 @@ extern char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize);
  **/
 extern char *hts_convertStringSystemToUTF8(const char *s, size_t size);
 
+/**
+ * Convert a UTF-8 string to a malloc'ed IDNA (RFC 3492) string, NULL on error.
+ **/
+extern char *hts_convertStringUTF8ToIDNA(const char *s, size_t size);
+
 #endif
 
 #endif
diff --git a/src/htsparse.c b/src/htsparse.c
index fc3b6e1..73f66ac 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -2103,14 +2103,15 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                   // ???? No! escape_spc_url(lien);
                   strcatbuff(lien, query);      /* restore */
 
-                  // Charset conversion for the URI filename
-                  // (not for the query string!)
-                  if (hasCharset) {
+                  // Charset conversion for the URI filename, unless the
+                  // declared charset is already UTF-8
+                  // (note: not for the query string!)
+                  if (hasCharset && !hts_isCharsetUTF8(charset)) {
                     char *const s = hts_convertStringToUTF8(lien, (int) strlen(lien), charset);
                     if (s != NULL) {
                       hts_log_print(opt, LOG_DEBUG,
-                        "engine: save-name: charset conversion from '%s' to '%s' using charset '%s'",
-                        lien, s, charset);
+                        "engine: save-name: '%s' charset conversion from '%s' to '%s'",
+                        charset, lien, s);
                       strcpybuff(lien, s);
                       free(s);
                     }
diff --git a/src/htstools.c b/src/htstools.c
index 885bdc8..3a5ca70 100644
--- a/src/htstools.c
+++ b/src/htstools.c
@@ -40,6 +40,7 @@ Please visit our Website: http://www.httrack.com
 #include "htscore.h"
 #include "htstools.h"
 #include "htsstrings.h"
+#include "htscharset.h"
 #ifdef _WIN32
 #include "windows.h"
 #else
@@ -276,6 +277,21 @@ int ident_url_relatif(const char *lien, const char *origin_adr,
     }
   }
 
+  // IDNA / RFC 3492 (Punycode) handling for HTTP(s)
+  if (!link_has_authority(adr) || strfield(adr, "https:")) {
+    char *const a = jump_identification(adr);
+    // Non-ASCII characters (theoretically forbidden, but browsers are lenient)
+    if (!hts_isStringAscii(a, strlen(a))) {
+      char *const idna = hts_convertStringUTF8ToIDNA(a, strlen(a));
+      if (idna != NULL) {
+        if (strlen(idna) < HTS_URLMAXSIZE) {
+          strcpybuff(a, idna);
+        }
+        free(idna);
+      }
+    }
+  }
+
   return ok;
 }