summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/htscoremain.c7
-rw-r--r--src/htsencoding.c29
-rw-r--r--src/htsencoding.h19
-rw-r--r--src/htslib.c2
-rw-r--r--src/htsparse.c41
5 files changed, 80 insertions, 18 deletions
diff --git a/src/htscoremain.c b/src/htscoremain.c
index 534c469..c2ff520 100644
--- a/src/htscoremain.c
+++ b/src/htscoremain.c
@@ -2342,10 +2342,13 @@ HTSEXT_API int hts_main2(int argc, char **argv, httrackp * opt) {
htsmain_free();
return 0;
break;
- case '6': // entities: httrack -#6
+ case '6': // entities: httrack -#6 "&foo;" ["encoding"]
if (++na < argc) {
char *const s = strdup(argv[na]);
- if (s != NULL && hts_unescape_entities(s, s, strlen(s)) == 0) {
+ const char *const enc = na + 1 < argc ? argv[na + 1] : "UTF-8";
+ if (s != NULL
+ && hts_unescapeEntitiesWithCharset(s, s, strlen(s),
+ enc) == 0) {
printf("%s\n", s);
free(s);
} else {
diff --git a/src/htsencoding.c b/src/htsencoding.c
index 46c57a4..0fa21fc 100644
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -60,7 +60,7 @@ static int get_hex_value(char c) {
(HASH) += (C); \
} while(0)
-int hts_unescape_entities(const char *src, char *dest, const size_t max) {
+int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
size_t i, j, ampStart, ampStartDest;
int uc;
int hex;
@@ -106,8 +106,29 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) {
/* success ? */
if (uc > 0) {
+ const size_t maxOut = max - ampStartDest;
/* write at position */
- len = hts_writeUTF8(uc, &dest[ampStartDest], max - ampStartDest);
+ if (charset != NULL && hts_isCharsetUTF8(charset)) {
+ len = hts_writeUTF8(uc, &dest[ampStartDest], maxOut);
+ } else {
+ size_t ulen;
+ char buffer[32];
+ len = 0;
+ if ( ( ulen = hts_writeUTF8(uc, buffer, sizeof(buffer)) ) != 0) {
+ char *s;
+ buffer[ulen] = '\0';
+ s = hts_convertStringFromUTF8(buffer, strlen(buffer), charset);
+ if (s != NULL) {
+ const size_t sLen = strlen(s);
+ if (sLen < maxOut) {
+ // Do not copy \0.
+ memcpy(&dest[ampStartDest], s, sLen);
+ ulen = sLen;
+ }
+ free(s);
+ }
+ }
+ }
if (len > 0) {
/* new dest position */
j = ampStartDest + len;
@@ -174,3 +195,7 @@ int hts_unescape_entities(const char *src, char *dest, const size_t max) {
return 0;
}
+
+int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
+ return hts_unescapeEntitiesWithCharset(src, dest, max, "UTF-8");
+}
diff --git a/src/htsencoding.h b/src/htsencoding.h
index 4dfd367..cd35a00 100644
--- a/src/htsencoding.h
+++ b/src/htsencoding.h
@@ -31,8 +31,8 @@ Please visit our Website: http://www.httrack.com
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
-#ifndef HTS_CHARSET_DEFH
-#define HTS_CHARSET_DEFH
+#ifndef HTS_ENCODING_DEFH
+#define HTS_ENCODING_DEFH
/** Standard includes. **/
#include <stdlib.h>
@@ -48,8 +48,19 @@ Please visit our Website: http://www.httrack.com
* needs to hold as space as the source.
* Returns 0 upon success.
**/
-extern int hts_unescape_entities(const char *src,
- char *dest, const size_t max);
+extern int hts_unescapeEntities(const char *src,
+ char *dest, const size_t max);
+
+/**
+ * Unescape HTML entities (as per HTML 4.0 Specification)
+ * and replace them in-place by their charset equivalents.
+ * Note: source and destination may be the same, and the destination only
+ * needs to hold as much space as the source.
+ * Returns 0 upon success.
+ **/
+extern int hts_unescapeEntitiesWithCharset(const char *src,
+ char *dest, const size_t max,
+ const char *charset);
#endif
diff --git a/src/htslib.c b/src/htslib.c
index 8b53b88..f74efe1 100644
--- a/src/htslib.c
+++ b/src/htslib.c
@@ -3753,7 +3753,7 @@ static int ehex(const char *s) {
}
void unescape_amp(char *s) {
- if (hts_unescape_entities(s, s, strlen(s) + 1) != 0) {
+ if (hts_unescapeEntities(s, s, strlen(s) + 1) != 0) {
assertf(! "error escaping html entities");
}
}
diff --git a/src/htsparse.c b/src/htsparse.c
index 52445b3..caace62 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -51,6 +51,7 @@ Please visit our Website: http://www.httrack.com
#include "htsmd5.h"
#include "htsindex.h"
#include "htscharset.h"
+#include "htsencoding.h"
/* external modules */
#include "htsmodules.h"
@@ -2081,25 +2082,31 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
/* Unescape/escape %20 and other &nbsp; */
{
+ // Note: hasCharset is always true (iso-8859-1 as default)
const char *const charset = str->page_charset_;
const int hasCharset = charset != NULL
&& *charset != '\0';
char BIGSTK query[HTS_URLMAXSIZE * 2];
- char *a = strchr(lien, '?');
+ char *const a = strchr(lien, '?');
- if (a) {
+ // cut query string
+ if (a != NULL) {
strcpybuff(query, a);
*a = '\0';
- } else
+ } else {
query[0] = '\0';
+ }
+
// décoder l'inutile (%2E par exemple) et coder espaces
// Unescape high-chars for UTF-8 conversion
strcpybuff(lien, unescape_http_unharm(catbuff, lien, !hasCharset)); /* note: '%' is still escaped */
escape_remove_control(lien);
- // ???? No! escape_spc_url(lien);
- strcatbuff(lien, query); /* restore */
+
+ // we need to encode query string non-ascii chars,
+ // leaving the encoding as-is (unlike the file part)
+ escape_check_url(query);
- // Charset conversion for the URI filename,
+ // charset conversion for the URI filename,
// and not already UTF-8
// (note: not for the query string!)
if (hasCharset && !hts_isCharsetUTF8(charset)) {
@@ -2112,9 +2119,25 @@ int htsparse(htsmoduleStruct * str, htsmoduleStructExtended * stre) {
free(s);
}
}
- // conversion entities
- unescape_amp(lien);
- unescape_amp(query);
+
+ // decode URI entities with UTF-8 charset
+ if (!hts_unescapeEntities(lien, lien, strlen(lien))) {
+ hts_log_print(opt, LOG_WARNING,
+ "could not decode URI '%s' with charset '%s'", lien, charset);
+ }
+
+ // decode query string entities with page charset
+ if (hasCharset) {
+ if (!hts_unescapeEntitiesWithCharset(query,
+ query, strlen(query),
+ charset)) {
+ hts_log_print(opt, LOG_WARNING,
+ "could not decode query string '%s' with charset '%s'", query, charset);
+ }
+ }
+
+ // copy back query
+ strcatbuff(lien, query); /* restore */
}
// convertir les éventuels \ en des / pour éviter des problèmes de reconnaissance!