From bc31ec0da9573d482de24f27241482f50e46e60c Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Fri, 31 May 2013 11:38:53 +0000 Subject: Fixed issue 14 (http://code.google.com/p/httrack/issues/detail?id=14) Rationale: * hostname is ASCII, non-ascii characters shall be encoded with IDNA * URI filenames may embed non-ascii characters, which MUST be UTF-8 encoded * query string may embed non-ascii characters, which are encoded with the page charset into %xx codes --- src/htsencoding.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'src/htsencoding.h') diff --git a/src/htsencoding.h b/src/htsencoding.h index 4dfd367..cd35a00 100644 --- a/src/htsencoding.h +++ b/src/htsencoding.h @@ -31,8 +31,8 @@ Please visit our Website: http://www.httrack.com /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ -#ifndef HTS_CHARSET_DEFH -#define HTS_CHARSET_DEFH +#ifndef HTS_ENCODING_DEFH +#define HTS_ENCODING_DEFH /** Standard includes. **/ #include @@ -48,8 +48,19 @@ Please visit our Website: http://www.httrack.com * needs to hold as space as the source. * Returns 0 upon success. **/ -extern int hts_unescape_entities(const char *src, - char *dest, const size_t max); +extern int hts_unescapeEntities(const char *src, + char *dest, const size_t max); + +/** + * Unescape HTML entities (as per HTML 4.0 Specification) + * and replace them in-place by their charset equivalents. + * Note: source and destination may be the same, and the destination only + * needs to hold as space as the source. + * Returns 0 upon success. + **/ +extern int hts_unescapeEntitiesWithCharset(const char *src, + char *dest, const size_t max, + const char *charset); #endif -- cgit v1.2.3