summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Xavier Roche <xroche@users.noreply.github.com> 2013-07-05 17:53:54 +0000
committer: Xavier Roche <xroche@users.noreply.github.com> 2013-07-05 17:53:54 +0000
commit: d2a3d7a3ffd60e1e28d2fcd631b38bf40328dde6 (patch)
tree: dffb8d2574c1404cb2ccd2664c857a2a598d5433
parent: 00fe2d4432ca8f994275af2bf32931a05f6c8132 (diff)
Do not unescape '+' before the query string
Fixed issue 18
-rw-r--r--  src/htsencoding.c  6
-rwxr-xr-x  tests/11_crawl-parsing.test  10
2 files changed, 15 insertions, 1 deletion
diff --git a/src/htsencoding.c b/src/htsencoding.c
index e2a8fb2..184cca6 100644
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -206,6 +206,7 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
+ int seenQuery = 0;
char utfBuffer[32];
assert(src != dest);
@@ -218,7 +219,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
unsigned char cUtf = (unsigned char) c;
/* Replacement for ' ' */
- if (c == '+') {
+ if (c == '+' && seenQuery) {
c = cUtf = ' ';
k = 0; /* cancel any sequence */
}
@@ -250,6 +251,9 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
/* ASCII (and not in %xx) */
else if (cUtf < 0x80 && i != lastI + 1) {
k = 0; /* cancel any sequence */
+ if (!seenQuery && c == '?') {
+ seenQuery = 1;
+ }
}
/* UTF-8 sequence in progress (either a raw or a %xx character) */
diff --git a/tests/11_crawl-parsing.test b/tests/11_crawl-parsing.test
index 3c75c24..1d8e3d9 100755
--- a/tests/11_crawl-parsing.test
+++ b/tests/11_crawl-parsing.test
@@ -24,3 +24,13 @@ bash crawl-test.sh --errors 0 --files 3 \
--found ut.httrack.com/parsing/fade.gif \
--found ut.httrack.com/parsing/javascript.html \
httrack http://ut.httrack.com/parsing/javascript.html
+
+# handling of + before query string
+bash crawl-test.sh --errors 0 --files 6 \
+ --found ut.httrack.com/parsing/escaping.html \
+ --found "ut.httrack.com/parsing/foo bar30f4.html" \
+ --found "ut.httrack.com/parsing/foo bar5e1f.html" \
+ --found "ut.httrack.com/parsing/foo+bar3860.html" \
+ --found "ut.httrack.com/parsing/foo barae52.html" \
+ --found "ut.httrack.com/parsing/foo bar7b30.html" \
+ httrack http://ut.httrack.com/parsing/escaping.html