diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-05-31 11:38:53 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-05-31 11:38:53 +0000 |
commit | bc31ec0da9573d482de24f27241482f50e46e60c (patch) | |
tree | e5e80dd055b2e4790802728d4e3b4b5b8c361277 /src/htsparse.c | |
parent | 8767fd0e750b70a121d95e3ecf7e59bcec499d95 (diff) |
Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14)
Rationale:
* hostname is ASCII, non-ascii characters shall be encoded with IDNA
* URI filenames may embed non-ascii characters, which MUST be UTF-8 encoded
* query string may embed non-ascii characters, which are encoded with the page charset into %xx codes
Diffstat (limited to 'src/htsparse.c')
-rw-r--r-- | src/htsparse.c | 41 |
1 files changed, 32 insertions, 9 deletions
diff --git a/src/htsparse.c b/src/htsparse.c index 52445b3..caace62 100644 --- a/src/htsparse.c +++ b/src/htsparse.c @@ -51,6 +51,7 @@ Please visit our Website: http://www.httrack.com #include "htsmd5.h" #include "htsindex.h" #include "htscharset.h" +#include "htsencoding.h" /* external modules */ #include "htsmodules.h" @@ -2081,25 +2082,31 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { /* Unescape/escape %20 and other */ { + // Note: always true (iso-8859-1 as default) const char *const charset = str->page_charset_; const int hasCharset = charset != NULL && *charset != '\0'; char BIGSTK query[HTS_URLMAXSIZE * 2]; - char *a = strchr(lien, '?'); + char *const a = strchr(lien, '?'); - if (a) { + // cut query string + if (a != NULL) { strcpybuff(query, a); *a = '\0'; - } else + } else { query[0] = '\0'; + } + // décoder l'inutile (%2E par exemple) et coder espaces // Unescape high-chars for UTF-8 conversion strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset)); /* note: '%' is still escaped */ escape_remove_control(lien); - // ???? No! escape_spc_url(lien); - strcatbuff(lien, query); /* restore */ + + // we need to encode query string non-ascii chars, + // leaving the encoding as-is (unlike the file part) + escape_check_url(query); - // Charset conversion for the URI filename, + // charset conversion for the URI filename, // and not already UTF-8 // (note: not for the query string!) 
if (hasCharset && !hts_isCharsetUTF8(charset)) { @@ -2112,9 +2119,25 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) { free(s); } } - // conversion entities - unescape_amp(lien); - unescape_amp(query); + + // decode URI entities with UTF-8 charset + if (!hts_unescapeEntities(lien, lien, strlen(lien))) { + hts_log_print(opt, LOG_WARNING, + "could not decode URI '%s' with charset '%s'", lien, charset); + } + + // decode query string entities with page charset + if (hasCharset) { + if (!hts_unescapeEntitiesWithCharset(query, + query, strlen(query), + charset)) { + hts_log_print(opt, LOG_WARNING, + "could not decode query string '%s' with charset '%s'", query, charset); + } + } + + // copy back query + strcatbuff(lien, query); /* restore */ } // convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance! |