diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/htsencoding.c | 14 | ||||
-rw-r--r-- | src/htsencoding.h | 20 | ||||
-rw-r--r-- | src/htslib.c | 6 | ||||
-rw-r--r-- | src/htsparse.c | 19 |
4 files changed, 46 insertions, 13 deletions
diff --git a/src/htsencoding.c b/src/htsencoding.c index 184cca6..4160fa2 100644 --- a/src/htsencoding.c +++ b/src/htsencoding.c @@ -204,7 +204,8 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) { return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8"); } -int hts_unescapeUrl(const char *src, char *dest, const size_t max) { +int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max, + const int flags) { size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize; int seenQuery = 0; char utfBuffer[32]; @@ -239,7 +240,10 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) { cUtf = (unsigned char) ec; /* Shortcut for ASCII (do not unescape non-printable) */ - if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) { + if ( + (cUtf < 0x80 && cUtf >= 32) + && ( flags & UNESCAPE_URL_NO_ASCII ) == 0 + ) { /* Rollback new write position and character */ j = lastJ; c = ec; @@ -251,7 +255,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) { /* ASCII (and not in %xx) */ else if (cUtf < 0x80 && i != lastI + 1) { k = 0; /* cancel any sequence */ - if (!seenQuery && c == '?') { + if (c == '?' && !seenQuery) { seenQuery = 1; } } @@ -316,3 +320,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) { return 0; } + +int hts_unescapeUrl(const char *src, char *dest, const size_t max) { + return hts_unescapeUrlSpecial(src, dest, max, 0); +} diff --git a/src/htsencoding.h b/src/htsencoding.h index 4ab660b..e119c4a 100644 --- a/src/htsencoding.h +++ b/src/htsencoding.h @@ -42,6 +42,14 @@ Please visit our Website: http://www.httrack.com #endif /** + * Flags for hts_unescapeUrlSpecial(). + **/ +typedef enum unescapeFlags { + /** Do not decode ASCII. **/ + UNESCAPE_URL_NO_ASCII = 1 +} unescapeFlags; + +/** * Unescape HTML entities (as per HTML 4.0 Specification) * and replace them in-place by their UTF-8 equivalents. 
* Note: source and destination may be the same, and the destination only @@ -71,4 +79,16 @@ extern int hts_unescapeEntitiesWithCharset(const char *src, **/ extern int hts_unescapeUrl(const char *src, char *dest, const size_t max); +/** + * Unescape a URL-encoded string. The implicit charset is UTF-8. + * In case of UTF-8 decoding error inside URL-encoded characters, + * the characters are left undecoded. + * "flags" is a mask composed of UNESCAPE_URL_XXX constants. + * Note: source and destination MUST NOT be the same. + * Returns 0 upon success, -1 upon overflow or error. + **/ +extern int hts_unescapeUrlSpecial(const char *src, + char *dest, const size_t max, + int flags); + #endif diff --git a/src/htslib.c b/src/htslib.c index 63a3abb..bb46f94 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -3708,6 +3708,8 @@ HTSEXT_API char *unescape_http(char *catbuff, const char *s) { // unescape in URL/URI ONLY what has to be escaped, to form a standard URL/URI // DOES NOT DECODE %25 (part of CHAR_DELIM) +// no_high & 1: do not decode high chars +// no_high & 2: decode space HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) { size_t i, j; @@ -3720,8 +3722,8 @@ HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) || CHAR_DELIM(nchar) || CHAR_UNWISE(nchar) || CHAR_LOW(nchar) /* CHAR_SPECIAL */ - || CHAR_XXAVOID(nchar) - || ( no_high && CHAR_HIG(nchar) ) + || ( CHAR_XXAVOID(nchar) && ( nchar != ' ' || ( no_high & 2) == 0 ) ) + || ( ( no_high & 1 ) && CHAR_HIG(nchar) ) ; if (!test && nchar >= 0) { /* can safely unescape */ diff --git a/src/htsparse.c b/src/htsparse.c index 711165c..819c25f 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -2100,18 +2100,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { const int hasCharset = charset != NULL && *charset != '\0'; char BIGSTK query[HTS_URLMAXSIZE * 2]; - char *const a = strchr(lien, '?'); // cut query string - if (a != NULL) { - strcpybuff(query, a); - 
*a = '\0'; - } else { - query[0] = '\0'; + { + char *const a = strchr(lien, '?'); + if (a != NULL) { + strcpybuff(query, a); + *a = '\0'; + } else { + query[0] = '\0'; + } } // Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8) - strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1)); /* note: '%' is still escaped */ + strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1 | 2)); /* note: '%' is still escaped */ // Force to encode non-printable chars (should never happend) escape_remove_control(lien); @@ -2149,7 +2151,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { // Decode remaining %XX high characters with UTF-8 // but only when this leads to valid UTF-8. // Otherwise, leave them unescaped. - if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) { + if (hts_unescapeUrlSpecial(lien, catbuff, sizeof(catbuff), + UNESCAPE_URL_NO_ASCII) == 0) { strcpybuff(lien, catbuff); } else { hts_log_print(opt, LOG_WARNING, |