diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/htscoremain.c | 7 | ||||
-rw-r--r-- | src/htsencoding.c | 29 | ||||
-rw-r--r-- | src/htsencoding.h | 19 | ||||
-rw-r--r-- | src/htslib.c | 2 | ||||
-rw-r--r-- | src/htsparse.c | 41 |
5 files changed, 80 insertions, 18 deletions
diff --git a/src/htscoremain.c b/src/htscoremain.c index 534c469..c2ff520 100644 --- a/src/htscoremain.c +++ b/src/htscoremain.c @@ -2342,10 +2342,13 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) { htsmain_free(); return 0; break; - case '6': // entities: httrack -#6 + case '6': // entities: httrack -#6 "&foo;" ["encoding"] if (++na < argc) { char *const s = strdup(argv[na]); - if (s != NULL && hts_unescape_entities(s, s, strlen(s)) == 0) { + const char *const enc = na + 1 < argc ? argv[na + 1] : "UTF-8"; + if (s != NULL + && hts_unescapeEntitiesWithCharset(s, s, strlen(s), + enc) == 0) { printf("%s\n", s); free(s); } else { diff --git a/src/htsencoding.c b/src/htsencoding.c index 46c57a4..0fa21fc 100644 --- a/src/htsencoding.c +++ b/src/htsencoding.c @@ -60,7 +60,7 @@ static int get_hex_value(char c) { (HASH) += (C); \ } while(0) -int hts_unescape_entities(const char *src, char *dest, const size_t max) { +int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) { size_t i, j, ampStart, ampStartDest; int uc; int hex; @@ -106,8 +106,29 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) { /* success ? */ if (uc > 0) { + const size_t maxOut = max - ampStartDest; /* write at position */ - len = hts_writeUTF8(uc, &dest[ampStartDest], max - ampStartDest); + if (charset != NULL && hts_isCharsetUTF8(charset)) { + len = hts_writeUTF8(uc, &dest[ampStartDest], maxOut); + } else { + size_t ulen; + char buffer[32]; + len = 0; + if ( ( ulen = hts_writeUTF8(uc, buffer, sizeof(buffer)) ) != 0) { + char *s; + buffer[ulen] = '\0'; + s = hts_convertStringFromUTF8(buffer, strlen(buffer), charset); + if (s != NULL) { + const size_t sLen = strlen(s); + if (sLen < maxOut) { + // Do not copy \0. + memcpy(&dest[ampStartDest], s, sLen); + ulen = sLen; + } + free(s); + } + } + } if (len > 0) { /* new dest position */ j = ampStartDest + len; @@ -174,3 +195,7 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) { return 0; } + +int hts_unescapeEntities(const char *src, char *dest, const size_t max) { + return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8"); +} diff --git a/src/htsencoding.h b/src/htsencoding.h index 4dfd367..cd35a00 100644 --- a/src/htsencoding.h +++ b/src/htsencoding.h @@ -31,8 +31,8 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ -#ifndef HTS_CHARSET_DEFH -#define HTS_CHARSET_DEFH +#ifndef HTS_ENCODING_DEFH +#define HTS_ENCODING_DEFH /** Standard includes. **/ #include <stdlib.h> @@ -48,8 +48,19 @@ Please visit our Website: http://www.httrack.com * needs to hold as space as the source. * Returns 0 upon success. **/ -extern int hts_unescape_entities(const char *src, - char *dest, const size_t max); +extern int hts_unescapeEntities(const char *src, + char *dest, const size_t max); + +/** + * Unescape HTML entities (as per HTML 4.0 Specification) + * and replace them in-place by their charset equivalents. + * Note: source and destination may be the same, and the destination only + * needs to hold as space as the source. + * Returns 0 upon success. + **/ +extern int hts_unescapeEntitiesWithCharset(const char *src, + char *dest, const size_t max, + const char *charset); #endif diff --git a/src/htslib.c b/src/htslib.c index 8b53b88..f74efe1 100644 --- a/src/htslib.c +++ b/src/htslib.c @@ -3753,7 +3753,7 @@ static int ehex(const char *s) { } void unescape_amp(char *s) { - if (hts_unescape_entities(s, s, strlen(s) + 1) != 0) { + if (hts_unescapeEntities(s, s, strlen(s) + 1) != 0) { assertf(! "error escaping html entities"); } } diff --git a/src/htsparse.c b/src/htsparse.c index 52445b3..caace62 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -51,6 +51,7 @@ Please visit our Website: http://www.httrack.com #include "htsmd5.h" #include "htsindex.h" #include "htscharset.h" +#include "htsencoding.h" /* external modules */ #include "htsmodules.h" @@ -2081,25 +2082,31 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { /* Unescape/escape %20 and other */ { + // Note: always true (iso-8859-1 as default) const char *const charset = str->page_charset_; const int hasCharset = charset != NULL && *charset != '\0'; char BIGSTK query[HTS_URLMAXSIZE * 2]; - char *a = strchr(lien, '?'); + char *const a = strchr(lien, '?'); - if (a) { + // cut query string + if (a != NULL) { strcpybuff(query, a); *a = '\0'; - } else + } else { query[0] = '\0'; + } + // décoder l'inutile (%2E par exemple) et coder espaces // Unescape high-chars for UTF-8 conversion strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset)); /* note: '%' is still escaped */ escape_remove_control(lien); - // ???? No! escape_spc_url(lien); - strcatbuff(lien, query); /* restore */ + + // we need to encode query string non-ascii chars, + // leaving the encoding as-is (unlike the file part) + escape_check_url(query); - // Charset conversion for the URI filename, + // charset conversion for the URI filename, // and not already UTF-8 // (note: not for the query string!) if (hasCharset && !hts_isCharsetUTF8(charset)) { @@ -2112,9 +2119,25 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { free(s); } } - // conversion entities - unescape_amp(lien); - unescape_amp(query); + + // decode URI entities with UTF-8 charset + if (!hts_unescapeEntities(lien, lien, strlen(lien))) { + hts_log_print(opt, LOG_WARNING, + "could not decode URI '%s' with charset '%s'", lien, charset); + } + + // decode query string entities with page charset + if (hasCharset) { + if (!hts_unescapeEntitiesWithCharset(query, + query, strlen(query), + charset)) { + hts_log_print(opt, LOG_WARNING, + "could not decode query string '%s' with charset '%s'", query, charset); + } + } + + // copy back query + strcatbuff(lien, query); /* restore */ } // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance! |