diff options
-rw-r--r-- | src/htsencoding.c | 83 | ||||
-rw-r--r-- | src/htsencoding.h | 14 |
2 files changed, 93 insertions, 4 deletions
diff --git a/src/htsencoding.c b/src/htsencoding.c index 9e171fe..08f16d7 100644 --- a/src/htsencoding.c +++ b/src/htsencoding.c @@ -31,6 +31,8 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ +#include <assert.h> + #include "htscharset.h" #include "htsencoding.h" @@ -65,6 +67,8 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma int uc; int hex; unsigned int hash; + + assert(max != 0); for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0, uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) { /* start of entity */ @@ -121,7 +125,7 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma if (s != NULL) { const size_t sLen = strlen(s); if (sLen < maxOut) { - // Do not copy \0. + /* Do not copy \0. */ memcpy(&dest[ampStartDest], s, sLen); len = sLen; } @@ -199,3 +203,80 @@ int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t ma int hts_unescapeEntities(const char *src, char *dest, const size_t max) { return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8"); } + +int hts_unescapeUrl(const char *src, char *dest, const size_t max) { + size_t i, j, lastJ, k, utfBufferJ; + char utfBuffer[32]; + + assert(src != dest); + assert(max != 0); + + for(i = 0, j = 0, k = 0, utfBufferJ = 0, lastJ = (size_t) -1 + ; src[i] != '\0' ; i++) { + char c = src[i]; + + /* Replacement for ' ' */ + if (c == '+') { + c = ' '; + } + /* Escape sequence start */ + else if (c == '%') { + /* last known position of % written on destination + copy blindly c, we'll rollback later */ + lastJ = j; + } + /* End of sequence seen */ + else if (i >= 2 && i == lastJ + 2) { + const int a1 = get_hex_value(src[i - 1]); + const int a2 = get_hex_value(src[i - 0]); + if (a1 != -1 && a2 != -1) { + const char ec = a1*16 + a2; /* new character */ + + /* New leading character ? Flush UTF-8 buffer now. */ + if (k != 0 && HTS_IS_LEADING_UTF8(ec)) { + const size_t utfBufferSize = k; + const size_t nRead = hts_readUTF8(utfBuffer, utfBufferSize, NULL); + const size_t destPos = utfBufferJ; + + /* Reset UTF-8 buffer in all cases. */ + k = 0; + /* New destination-centric offset of utf-8 buffer beginning */ + utfBufferJ = lastJ; + + /* Was the character read successfully ? */ + if (nRead == utfBufferSize) { + /* Rollback and copy */ + j = destPos; + memcpy(&dest[j], utfBuffer, utfBufferSize); + j += utfBufferSize; + /* Skip current character */ + continue; + } + } + + /* Shortcut for ASCII (do not unescape non-printable) */ + if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) { + assert(k == 0); + /* Rollback new write position and character */ + j = lastJ; + c = ec; + } + /* Copy if no overflow */ + else if (k < sizeof(utfBuffer)) { + utfBuffer[k++] = ec; + } + } + } + + /* Check for overflow */ + if (j + 1 > max) { + return -1; + } + + /* Copy current */ + dest[j++] = c; + } + dest[j] = '\0'; + + return 0; +} diff --git a/src/htsencoding.h b/src/htsencoding.h index cd35a00..4ab660b 100644 --- a/src/htsencoding.h +++ b/src/htsencoding.h @@ -46,7 +46,7 @@ Please visit our Website: http://www.httrack.com * and replace them in-place by their UTF-8 equivalents. * Note: source and destination may be the same, and the destination only * needs to hold as space as the source. - * Returns 0 upon success. + * Returns 0 upon success, -1 upon overflow or error. **/ extern int hts_unescapeEntities(const char *src, char *dest, const size_t max); @@ -56,11 +56,19 @@ extern int hts_unescapeEntities(const char *src, * and replace them in-place by their charset equivalents. * Note: source and destination may be the same, and the destination only * needs to hold as space as the source. - * Returns 0 upon success. + * Returns 0 upon success, -1 upon overflow or error. **/ extern int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset); -#endif +/** + * Unescape an URL-encoded string. The implicit charset is UTF-8. + * In case of UTF-8 decoding error inside URL-encoded characters, + * the characters are left undecoded. + * Note: source and destination MUST NOT be the same. + * Returns 0 upon success, -1 upon overflow or error. + **/ +extern int hts_unescapeUrl(const char *src, char *dest, const size_t max); +#endif |