summaryrefslogtreecommitdiff
path: root/src/htsencoding.c
diff options
context:
space:
mode:
author Xavier Roche <xroche@users.noreply.github.com> 2013-05-31 11:38:53 +0000
committer Xavier Roche <xroche@users.noreply.github.com> 2013-05-31 11:38:53 +0000
commit bc31ec0da9573d482de24f27241482f50e46e60c (patch)
tree e5e80dd055b2e4790802728d4e3b4b5b8c361277 /src/htsencoding.c
parent 8767fd0e750b70a121d95e3ecf7e59bcec499d95 (diff)
Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14)
Rationale: * the hostname is ASCII; non-ASCII characters shall be encoded with IDNA * URI filenames may embed non-ASCII characters, which MUST be UTF-8 encoded * the query string may embed non-ASCII characters, which are encoded with the page charset into %xx codes
Diffstat (limited to 'src/htsencoding.c')
-rw-r--r--src/htsencoding.c29
1 files changed, 27 insertions, 2 deletions
diff --git a/src/htsencoding.c b/src/htsencoding.c
index 46c57a4..0fa21fc 100644
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -60,7 +60,7 @@ static int get_hex_value(char c) {
(HASH) += (C); \
} while(0)
-int hts_unescape_entities(const char *src, char *dest, const size_t max) {
+int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
size_t i, j, ampStart, ampStartDest;
int uc;
int hex;
@@ -106,8 +106,29 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) {
/* success ? */
if (uc > 0) {
+ const size_t maxOut = max - ampStartDest;
/* write at position */
- len = hts_writeUTF8(uc, &dest[ampStartDest], max - ampStartDest);
+ if (charset != NULL && hts_isCharsetUTF8(charset)) {
+ len = hts_writeUTF8(uc, &dest[ampStartDest], maxOut);
+ } else {
+ size_t ulen;
+ char buffer[32];
+ len = 0;
+ if ( ( ulen = hts_writeUTF8(uc, buffer, sizeof(buffer)) ) != 0) {
+ char *s;
+ buffer[ulen] = '\0';
+ s = hts_convertStringFromUTF8(buffer, strlen(buffer), charset);
+ if (s != NULL) {
+ const size_t sLen = strlen(s);
+ if (sLen < maxOut) {
+ // Do not copy \0.
+ memcpy(&dest[ampStartDest], s, sLen);
+ ulen = sLen;
+ }
+ free(s);
+ }
+ }
+ }
if (len > 0) {
/* new dest position */
j = ampStartDest + len;
@@ -174,3 +195,7 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) {
return 0;
}
+
+int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
+ return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");
+}