Fixed issue 25 regarding un-encoding of characters such as # in the filename.

author: Xavier Roche <xroche@users.noreply.github.com> 2013-08-17 09:09:13 +0000
committer: Xavier Roche <xroche@users.noreply.github.com> 2013-08-17 09:09:13 +0000
commit: 2d6017ad06ed6ea571384f51705ce1e53aefb2da (patch)
tree: 4ee026c01b5f68204837abc898c0d7d490528b0f
parent: e0022540014d498ee2ba366000c91c118db52b36 (diff)
5 files changed, 55 insertions, 13 deletions
diff --git a/src/htsencoding.c b/src/htsencoding.c
index 184cca6..4160fa2 100644
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -204,7 +204,8 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
   return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");
 }
 
-int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
+int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
+                           const int flags) {
   size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
   int seenQuery = 0;
   char utfBuffer[32];
@@ -239,7 +240,10 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
         cUtf = (unsigned char) ec;
 
         /* Shortcut for ASCII (do not unescape non-printable) */
-        if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) {
+        if (
+            (cUtf < 0x80 && cUtf >= 32)
+            && ( flags & UNESCAPE_URL_NO_ASCII ) == 0
+            ) {
           /* Rollback new write position and character */
           j = lastJ;
           c = ec;
@@ -251,7 +255,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
     /* ASCII (and not in %xx) */
     else if (cUtf < 0x80 && i != lastI + 1) {
       k = 0;  /* cancel any sequence */
-      if (!seenQuery && c == '?') {
+      if (c == '?' && !seenQuery) {
         seenQuery = 1;
       }
     }
@@ -316,3 +320,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
 
   return 0;
 }
+
+int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
+  return hts_unescapeUrlSpecial(src, dest, max, 0);
+}
diff --git a/src/htsencoding.h b/src/htsencoding.h
index 4ab660b..e119c4a 100644
--- a/src/htsencoding.h
+++ b/src/htsencoding.h
@@ -42,6 +42,14 @@ Please visit our Website: http://www.httrack.com
 #endif
 
 /**
+ * Flags for hts_unescapeUrlSpecial().
+ **/
+typedef enum unescapeFlags {
+  /** Do not decode ASCII. **/
+  UNESCAPE_URL_NO_ASCII = 1
+} unescapeFlags;
+
+/**
  * Unescape HTML entities (as per HTML 4.0 Specification)
  * and replace them in-place by their UTF-8 equivalents.
  * Note: source and destination may be the same, and the destination only
@@ -71,4 +79,16 @@ extern int hts_unescapeEntitiesWithCharset(const char *src,
  **/
 extern int hts_unescapeUrl(const char *src, char *dest, const size_t max);
 
+/**
+ * Unescape a URL-encoded string. The implicit charset is UTF-8.
+ * In case of UTF-8 decoding error inside URL-encoded characters,
+ * the characters are left undecoded.
+ * "flags" is a mask composed of UNESCAPE_URL_XXX constants.
+ * Note: source and destination MUST NOT be the same.
+ * Returns 0 upon success, -1 upon overflow or error.
+ **/
+extern int hts_unescapeUrlSpecial(const char *src,
+                                  char *dest, const size_t max,
+                                  int flags);
+
 #endif
diff --git a/src/htslib.c b/src/htslib.c
index 63a3abb..bb46f94 100644
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -3708,6 +3708,8 @@ HTSEXT_API char *unescape_http(char *catbuff, const char *s) {
 
 // unescape in URL/URI ONLY what has to be escaped, to form a standard URL/URI
 // DOES NOT DECODE %25 (part of CHAR_DELIM)
+// no_high & 1: do NOT decode high chars (they are left %-escaped)
+// no_high & 2: allow decoding of space
 HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) {
   size_t i, j;
 
@@ -3720,8 +3722,8 @@ HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high)
         || CHAR_DELIM(nchar)
         || CHAR_UNWISE(nchar)
         || CHAR_LOW(nchar)    /* CHAR_SPECIAL */
-        || CHAR_XXAVOID(nchar)
-        || ( no_high && CHAR_HIG(nchar) )
+        || ( CHAR_XXAVOID(nchar) && ( nchar != ' ' || ( no_high & 2) == 0 ) )
+        || ( ( no_high & 1 ) && CHAR_HIG(nchar) )
         ;
 
       if (!test && nchar >= 0) {  /* can safely unescape */
diff --git a/src/htsparse.c b/src/htsparse.c
index 711165c..819c25f 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -2100,18 +2100,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                   const int hasCharset = charset != NULL 
                     && *charset != '\0';
                   char BIGSTK query[HTS_URLMAXSIZE * 2];
-                  char *const a = strchr(lien, '?');
 
                   // cut query string
-                  if (a != NULL) {
-                    strcpybuff(query, a);
-                    *a = '\0';
-                  } else {
-                    query[0] = '\0';
+                  {
+                    char *const a = strchr(lien, '?');
+                    if (a != NULL) {
+                      strcpybuff(query, a);
+                      *a = '\0';
+                    } else {
+                      query[0] = '\0';
+                    }
                   }
 
                   // Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8)
-                  strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1));     /* note: '%' is still escaped */
+                  strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1 | 2));     /* note: '%' is still escaped */
 
                   // Force to encode non-printable chars (should never happend)
                   escape_remove_control(lien);
@@ -2149,7 +2151,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
                   // Decode remaining %XX high characters with UTF-8 
                   // but only when this leads to valid UTF-8.
                   // Otherwise, leave them unescaped.
-                  if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) {
+                  if (hts_unescapeUrlSpecial(lien, catbuff, sizeof(catbuff),
+                                             UNESCAPE_URL_NO_ASCII) == 0) {
                     strcpybuff(lien, catbuff);
                   } else {
                     hts_log_print(opt, LOG_WARNING,
diff --git a/tests/11_crawl-parsing.test b/tests/11_crawl-parsing.test
index 39aeb03..d0f092d 100755
--- a/tests/11_crawl-parsing.test
+++ b/tests/11_crawl-parsing.test
@@ -34,3 +34,12 @@ bash crawl-test.sh --errors 0 --files 6 \
 	--found "ut.httrack.com/parsing/foo barae52.html" \
 	--found "ut.httrack.com/parsing/foo bar7b30.html" \
 	httrack http://ut.httrack.com/parsing/escaping.html
+
+# handling of # encoded in filename
+# see http://code.google.com/p/httrack/issues/detail?id=25
+bash crawl-test.sh --errors 2 --files 4 \
+	--found "ut.httrack.com/parsing/escaping2.html" \
+	--found "ut.httrack.com/parsing/++foo++bar++plus++.html" \
+	--found "ut.httrack.com/parsing/foo#bar#.html" \
+	--found "ut.httrack.com/parsing/foo bar.html" \
+	httrack http://ut.httrack.com/parsing/escaping2.html
author	Xavier Roche <xroche@users.noreply.github.com>	2013-08-17 09:09:13 +0000
committer	Xavier Roche <xroche@users.noreply.github.com>	2013-08-17 09:09:13 +0000
commit	2d6017ad06ed6ea571384f51705ce1e53aefb2da (patch)
tree	4ee026c01b5f68204837abc898c0d7d490528b0f
parent	e0022540014d498ee2ba366000c91c118db52b36 (diff)