diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-07-05 17:53:54 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-07-05 17:53:54 +0000 |
commit | d2a3d7a3ffd60e1e28d2fcd631b38bf40328dde6 (patch) | |
tree | dffb8d2574c1404cb2ccd2664c857a2a598d5433 | |
parent | 00fe2d4432ca8f994275af2bf32931a05f6c8132 (diff) |
Do not unescape '+' before the query string
Fixed issue 18
-rw-r--r-- | src/htsencoding.c | 6 | ||||
-rwxr-xr-x | tests/11_crawl-parsing.test | 10 |
2 files changed, 15 insertions, 1 deletion
```diff
diff --git a/src/htsencoding.c b/src/htsencoding.c
index e2a8fb2..184cca6 100644
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -206,6 +206,7 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
 
 int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
   size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
+  int seenQuery = 0;
   char utfBuffer[32];
 
   assert(src != dest);
@@ -218,7 +219,7 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
     unsigned char cUtf = (unsigned char) c;
 
     /* Replacement for ' ' */
-    if (c == '+') {
+    if (c == '+' && seenQuery) {
       c = cUtf = ' ';
       k = 0; /* cancel any sequence */
     }
@@ -250,6 +251,9 @@ int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
     /* ASCII (and not in %xx) */
     else if (cUtf < 0x80 && i != lastI + 1) {
       k = 0; /* cancel any sequence */
+      if (!seenQuery && c == '?') {
+        seenQuery = 1;
+      }
     }
 
     /* UTF-8 sequence in progress (either a raw or a %xx character) */
diff --git a/tests/11_crawl-parsing.test b/tests/11_crawl-parsing.test
index 3c75c24..1d8e3d9 100755
--- a/tests/11_crawl-parsing.test
+++ b/tests/11_crawl-parsing.test
@@ -24,3 +24,13 @@ bash crawl-test.sh --errors 0 --files 3 \
   --found ut.httrack.com/parsing/fade.gif \
   --found ut.httrack.com/parsing/javascript.html \
   httrack http://ut.httrack.com/parsing/javascript.html
+
+# handling of + before query string
+bash crawl-test.sh --errors 0 --files 6 \
+  --found ut.httrack.com/parsing/escaping.html \
+  --found "ut.httrack.com/parsing/foo bar30f4.html" \
+  --found "ut.httrack.com/parsing/foo bar5e1f.html" \
+  --found "ut.httrack.com/parsing/foo+bar3860.html" \
+  --found "ut.httrack.com/parsing/foo barae52.html" \
+  --found "ut.httrack.com/parsing/foo bar7b30.html" \
+  httrack http://ut.httrack.com/parsing/escaping.html
```