summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/htsencoding.c14
-rw-r--r--src/htsencoding.h20
-rw-r--r--src/htslib.c6
-rw-r--r--src/htsparse.c19
-rwxr-xr-xtests/11_crawl-parsing.test9
5 files changed, 55 insertions, 13 deletions
diff --git a/src/htsencoding.c b/src/htsencoding.c
index 184cca6..4160fa2 100644
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -204,7 +204,8 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");
}
-int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
+int hts_unescapeUrlSpecial(const char *src, char *dest, const size_t max,
+ const int flags) {
size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
int seenQuery = 0;
char utfBuffer[32];
@@ -239,7 +240,10 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
cUtf = (unsigned char) ec;
/* Shortcut for ASCII (do not unescape non-printable) */
- if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) {
+ if (
+ (cUtf < 0x80 && cUtf >= 32)
+ && ( flags & UNESCAPE_URL_NO_ASCII ) == 0
+ ) {
/* Rollback new write position and character */
j = lastJ;
c = ec;
@@ -251,7 +255,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
/* ASCII (and not in %xx) */
else if (cUtf < 0x80 && i != lastI + 1) {
k = 0; /* cancel any sequence */
- if (!seenQuery && c == '?') {
+ if (c == '?' && !seenQuery) {
seenQuery = 1;
}
}
@@ -316,3 +320,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
return 0;
}
+
+int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
+ return hts_unescapeUrlSpecial(src, dest, max, 0);
+}
diff --git a/src/htsencoding.h b/src/htsencoding.h
index 4ab660b..e119c4a 100644
--- a/src/htsencoding.h
+++ b/src/htsencoding.h
@@ -42,6 +42,14 @@ Please visit our Website: http://www.httrack.com
#endif
/**
+ * Flags for hts_unescapeUrlSpecial().
+ **/
+typedef enum unescapeFlags {
+ /** Do not decode %xx-escaped printable ASCII characters (0x20..0x7F). **/
+ UNESCAPE_URL_NO_ASCII = 1
+} unescapeFlags;
+
+/**
* Unescape HTML entities (as per HTML 4.0 Specification)
* and replace them in-place by their UTF-8 equivalents.
* Note: source and destination may be the same, and the destination only
@@ -71,4 +79,16 @@ extern int hts_unescapeEntitiesWithCharset(const char *src,
**/
extern int hts_unescapeUrl(const char *src, char *dest, const size_t max);
+/**
+ * Unescape a URL-encoded string. The implicit charset is UTF-8.
+ * In case of UTF-8 decoding error inside URL-encoded characters,
+ * the characters are left undecoded.
+ * "flags" is a mask composed of UNESCAPE_URL_XXX constants.
+ * Note: source and destination MUST NOT be the same.
+ * Returns 0 upon success, -1 upon overflow or error.
+ **/
+extern int hts_unescapeUrlSpecial(const char *src,
+ char *dest, const size_t max,
+ int flags);
+
#endif
diff --git a/src/htslib.c b/src/htslib.c
index 63a3abb..bb46f94 100644
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -3708,6 +3708,8 @@ HTSEXT_API char *unescape_http(char *catbuff, const char *s) {
// unescape in URL/URI ONLY what has to be escaped, to form a standard URL/URI
// DOES NOT DECODE %25 (part of CHAR_DELIM)
+// no_high & 1: do NOT decode high chars (leave them escaped)
+// no_high & 2: decode space
HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high) {
size_t i, j;
@@ -3720,8 +3722,8 @@ HTSEXT_API char *unescape_http_unharm(char *catbuff, const char *s, int no_high)
|| CHAR_DELIM(nchar)
|| CHAR_UNWISE(nchar)
|| CHAR_LOW(nchar) /* CHAR_SPECIAL */
- || CHAR_XXAVOID(nchar)
- || ( no_high && CHAR_HIG(nchar) )
+ || ( CHAR_XXAVOID(nchar) && ( nchar != ' ' || ( no_high & 2) == 0 ) )
+ || ( ( no_high & 1 ) && CHAR_HIG(nchar) )
;
if (!test && nchar >= 0) { /* can safely unescape */
diff --git a/src/htsparse.c b/src/htsparse.c
index 711165c..819c25f 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -2100,18 +2100,20 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
const int hasCharset = charset != NULL
&& *charset != '\0';
char BIGSTK query[HTS_URLMAXSIZE * 2];
- char *const a = strchr(lien, '?');
// cut query string
- if (a != NULL) {
- strcpybuff(query, a);
- *a = '\0';
- } else {
- query[0] = '\0';
+ {
+ char *const a = strchr(lien, '?');
+ if (a != NULL) {
+ strcpybuff(query, a);
+ *a = '\0';
+ } else {
+ query[0] = '\0';
+ }
}
// Unescape %XX, but not yet high-chars (supposedly encoded with UTF-8)
- strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1)); /* note: '%' is still escaped */
+ strcpybuff(lien, unescape_http_unharm(catbuff, lien, 1 | 2)); /* note: '%' is still escaped */
// Force to encode non-printable chars (should never happend)
escape_remove_control(lien);
@@ -2149,7 +2151,8 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
// Decode remaining %XX high characters with UTF-8
// but only when this leads to valid UTF-8.
// Otherwise, leave them unescaped.
- if (hts_unescapeUrl(lien, catbuff, sizeof(catbuff)) == 0) {
+ if (hts_unescapeUrlSpecial(lien, catbuff, sizeof(catbuff),
+ UNESCAPE_URL_NO_ASCII) == 0) {
strcpybuff(lien, catbuff);
} else {
hts_log_print(opt, LOG_WARNING,
diff --git a/tests/11_crawl-parsing.test b/tests/11_crawl-parsing.test
index 39aeb03..d0f092d 100755
--- a/tests/11_crawl-parsing.test
+++ b/tests/11_crawl-parsing.test
@@ -34,3 +34,12 @@ bash crawl-test.sh --errors 0 --files 6 \
--found "ut.httrack.com/parsing/foo barae52.html" \
--found "ut.httrack.com/parsing/foo bar7b30.html" \
httrack http://ut.httrack.com/parsing/escaping.html
+
+# handling of # encoded in filename
+# see http://code.google.com/p/httrack/issues/detail?id=25
+bash crawl-test.sh --errors 2 --files 4 \
+ --found "ut.httrack.com/parsing/escaping2.html" \
+ --found "ut.httrack.com/parsing/++foo++bar++plus++.html" \
+ --found "ut.httrack.com/parsing/foo#bar#.html" \
+ --found "ut.httrack.com/parsing/foo bar.html" \
+ httrack http://ut.httrack.com/parsing/escaping2.html