diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-08-17 09:09:13 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-08-17 09:09:13 +0000 |
commit | 2d6017ad06ed6ea571384f51705ce1e53aefb2da (patch) | |
tree | 4ee026c01b5f68204837abc898c0d7d490528b0f | |
parent | e0022540014d498ee2ba366000c91c118db52b36 (diff) |
Fixed issue 25 regarding un-encoding of characters such as # in the filename.
-rw-r--r-- | src/htsencoding.c | 14 | ||||
-rw-r--r-- | src/htsencoding.h | 20 | ||||
-rw-r--r-- | src/htslib.c | 6 | ||||
-rw-r--r-- | src/htsparse.c | 19 | ||||
-rwxr-xr-x | tests/11_crawl-parsing.test | 9 |
5 files changed, 55 insertions, 13 deletions
diff --git a/src/htsencoding.c b/src/htsencoding.c index 184cca6..4160fa2 100644 --- a/src/htsencoding.c +++ b/src/htsencoding.c @@ -204,7 +204,8 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) { return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8"); } -int hts_unescapeUrl(const char *src, char *dest, const size_t max) { +int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max, + const int flags) { size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize; int seenQuery = 0; char utfBuffer[32]; @@ -239,7 +240,10 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) { cUtf = (unsigned char) ec; /* Shortcut for ASCII (do not unescape non-printable) */ - if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) { + if ( + (cUtf < 0x80 && cUtf >= 32) + && ( flags & UNESCAPE_URL_NO_ASCII ) == 0 + ) { /* Rollback new write position and character */ j = lastJ; c = ec; @@ -251,7 +255,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) { /* ASCII (and not in %xx) */ else if (cUtf < 0x80 && i != lastI + 1) { k = 0; /* cancel any sequence */ - if (!seenQuery && c == '?') { + if (c == '?' && !seenQuery) { seenQuery = 1; } } @@ -316,3 +320,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) { return 0; } + +int hts_unescapeUrl(const char *src, char *dest, const size_t max) { + return hts_unescapeUrlSpecial(src, dest, max, 0); +} diff --git a/src/htsencoding.h b/src/htsencoding.h index 4ab660b..e119c4a 100644 --- a/src/htsencoding.h +++ b/src/htsencoding.h @@ -42,6 +42,14 @@ Please visit our Website: http://www.httrack.com #endif /** + * Flags for hts_unescapeUrlSpecial(). + **/ +typedef enum unescapeFlags { + /** Do not decode ASCII. **/ + UNESCAPE_URL_NO_ASCII = 1 +} unescapeFlags; + +/** * Unescape HTML entities (as per HTML 4.0 Specification) * and replace them in-place by their UTF-8 equivalents. 
* Note: source and destination may be the same, and the destination only @@ -71,4 +79,16 @@ extern int hts_unescapeEntitiesWithCharset(const char *src, **/ extern int hts_unescapeUrl(const char *src, char *dest, const size_t max); +/** + * Unescape an URL-encoded string. The implicit charset is UTF-8. + * In case of UTF-8 decoding error inside URL-encoded characters, + * the characters are left undecoded. + * "flags" is a mask composed of UNESCAPE_URL_XXX constants. + * Note: source and destination MUST NOT be the same. + * Returns 0 upon success, -1 upon overflow or error. + **/ +extern int hts_unescapeUrlSpecial(const char *src, + char *dest, const size_t max, + int flags); + #endif diff --git a/src/htslib.c b/src/htslib.c index 63a3abb..bb46f94 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -3708,6 +3708,8 @@ HTSEXT_API char *unescape_http(char *catbuff, const char *s) { // unescape in URL/URI ONLY what has to be escaped, to form a standard URL/URI // DOES NOT DECODE %25 (part of CHAR_DELIM) +// no_high & 1: do not decode high chars +// no_high & 2: decode space HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) { size_t i, j; @@ -3720,8 +3722,8 @@ HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) || CHAR_DELIM(nchar) || CHAR_UNWISE(nchar) || CHAR_LOW(nchar) /* CHAR_SPECIAL */ - || CHAR_XXAVOID(nchar) - || ( no_high && CHAR_HIG(nchar) ) + || ( CHAR_XXAVOID(nchar) && ( nchar != ' ' || ( no_high & 2) == 0 ) ) + || ( ( no_high & 1 ) && CHAR_HIG(nchar) ) ; if (!test && nchar >= 0) { /* can safely unescape */ diff --git a/src/htsparse.c b/src/htsparse.c index 711165c..819c25f 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -2100,18 +2100,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { const int hasCharset = charset != NULL && *charset != '\0'; char BIGSTK query[HTS_URLMAXSIZE * 2]; - char *const a = strchr(lien, '?'); // cut query string - if (a != NULL) { - strcpybuff(query, a); - 
*a = '\0'; - } else { - query[0] = '\0'; + { + char *const a = strchr(lien, '?'); + if (a != NULL) { + strcpybuff(query, a); + *a = '\0'; + } else { + query[0] = '\0'; + } } // Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8) - strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1)); /* note: '%' is still escaped */ + strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1 | 2)); /* note: '%' is still escaped */ // Force to encode non-printable chars (should never happend) escape_remove_control(lien); @@ -2149,7 +2151,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { // Decode remaining %XX high characters with UTF-8 // but only when this leads to valid UTF-8. // Otherwise, leave them unescaped. - if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) { + if (hts_unescapeUrlSpecial(lien, catbuff, sizeof(catbuff), + UNESCAPE_URL_NO_ASCII) == 0) { strcpybuff(lien, catbuff); } else { hts_log_print(opt, LOG_WARNING, diff --git a/tests/11_crawl-parsing.test b/tests/11_crawl-parsing.test index 39aeb03..d0f092d 100755 --- a/tests/11_crawl-parsing.test +++ b/tests/11_crawl-parsing.test @@ -34,3 +34,12 @@ bash crawl-test.sh --errors 0 --files 6 \ --found "ut.httrack.com/parsing/foo barae52.html" \ --found "ut.httrack.com/parsing/foo bar7b30.html" \ httrack http://ut.httrack.com/parsing/escaping.html + +# handling of # encoded in filename +# see http://code.google.com/p/httrack/issues/detail?id=25 +bash crawl-test.sh --errors 2 --files 4 \ + --found "ut.httrack.com/parsing/escaping2.html" \ + --found "ut.httrack.com/parsing/++foo++bar++plus++.html" \ + --found "ut.httrack.com/parsing/foo#bar#.html" \ + --found "ut.httrack.com/parsing/foo bar.html" \ + httrack http://ut.httrack.com/parsing/escaping2.html |