summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/htscharset.c93
-rw-r--r--src/htscharset.h6
-rw-r--r--src/htscore.c13
-rw-r--r--src/htsparse.c31
4 files changed, 134 insertions, 9 deletions
diff --git a/src/htscharset.c b/src/htscharset.c
index 4c97228..1c21590 100644
--- a/src/htscharset.c
+++ b/src/htscharset.c
@@ -34,6 +34,17 @@ Please visit our Website: http://www.httrack.com
/* Author: Xavier Roche */
/* ------------------------------------------------------------ */
+/**
+ * Tell whether the first "size" bytes of "s" are all 7-bit (ASCII) bytes.
+ * Return 1 when no byte is >= 0x80 (this includes size == 0), else 0.
+ **/
+int hts_isStringAscii(const char *s, size_t size) {
+  const unsigned char *bytes = (const unsigned char *) s;
+  size_t remaining;
+  for(remaining = size ; remaining != 0 ; remaining--, bytes++) {
+    /* High bit set: not a 7-bit character */
+    if ((*bytes & 0x80) != 0) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
#ifdef _WIN32
#include <windows.h>
@@ -255,12 +266,12 @@ LPWSTR hts_convertUTF8StringToUCS2(const char *s, int size, int* pwsize) {
return hts_convertStringToUCS2(s, size, CP_UTF8, pwsize);
}
-char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize) {
- const int usize = WideCharToMultiByte(CP_UTF8, 0, woutput, wsize, NULL, 0, NULL, FALSE);
+char *hts_convertUCS2StringToCP(LPWSTR woutput, int wsize, UINT cp) {
+ const int usize = WideCharToMultiByte(cp, 0, woutput, wsize, NULL, 0, NULL, FALSE);
if (usize > 0) {
char *const uoutput = malloc((usize + 1)*sizeof(char));
if (uoutput != NULL) {
- if (WideCharToMultiByte(CP_UTF8, 0, woutput, wsize, uoutput, usize, NULL, FALSE) == usize) {
+ if (WideCharToMultiByte(cp, 0, woutput, wsize, uoutput, usize, NULL, FALSE) == usize) {
uoutput[usize] = '\0';
return uoutput;
} else {
@@ -271,13 +282,17 @@ char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize) {
return NULL;
}
+/**
+ * Convert the wide string "woutput" ("wsize" wide chars) to UTF-8.
+ * Shorthand for hts_convertUCS2StringToCP() with the CP_UTF8 codepage;
+ * the result (possibly NULL) is returned unchanged.
+ **/
+char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize) {
+  char *const utf8 = hts_convertUCS2StringToCP(woutput, wsize, CP_UTF8);
+  return utf8;
+}
+
char *hts_convertStringCPToUTF8(const char *s, size_t size, UINT cp) {
/* Empty string ? */
if (size == 0) {
return strndup(s, size);
}
/* Already UTF-8 ? */
- if (cp == CP_UTF8) {
+ if (cp == CP_UTF8 || hts_isStringAscii(s, size)) {
return strndup(s, size);
}
/* Other (valid) charset */
@@ -291,6 +306,32 @@ char *hts_convertStringCPToUTF8(const char *s, size_t size, UINT cp) {
return uoutput;
}
}
+
+ /* Error, charset not found! */
+ return NULL;
+}
+
+/**
+ * Convert "s" ("size" bytes of UTF-8) to the codepage "cp".
+ * Return the converted copy, or NULL upon error (cp == 0, or a failed
+ * intermediate conversion).
+ **/
+char *hts_convertStringCPFromUTF8(const char *s, size_t size, UINT cp) {
+  /* Nothing to convert: empty input, UTF-8 target, or 7-bit only input */
+  if (size == 0 || cp == CP_UTF8 || hts_isStringAscii(s, size)) {
+    return strndup(s, size);
+  }
+  /* Known codepage: go through UCS2 (UTF-8 -> UCS2 -> cp) */
+  if (cp != 0) {
+    /* Number of wide chars produced by the first step */
+    int wideLength;
+    LPWSTR const wide = hts_convertStringToUCS2(s, (int) size, CP_UTF8, &wideLength);
+    if (wide != NULL) {
+      char *const converted = hts_convertUCS2StringToCP(wide, wideLength, cp);
+      /* The wide buffer is only an intermediate step */
+      free(wide);
+      return converted;
+    }
+  }
+
/* Error, charset not found! */
return NULL;
}
@@ -300,6 +341,11 @@ char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) {
return hts_convertStringCPToUTF8(s, size, cp);
}
+/**
+ * Convert "s" ("size" bytes of UTF-8) to the charset named "charset".
+ * The name is resolved through hts_getCodepage(), and the conversion is
+ * delegated to hts_convertStringCPFromUTF8().
+ **/
+char *hts_convertStringFromUTF8(const char *s, size_t size, const char *charset) {
+  return hts_convertStringCPFromUTF8(s, size, hts_getCodepage(charset));
+}
+
/**
 * Convert "s" ("size" bytes) to UTF-8, using the code page returned by
 * GetACP() as the source charset.
 * The result is whatever hts_convertStringCPToUTF8() returns for it.
 **/
char *hts_convertStringSystemToUTF8(const char *s, size_t size) {
return hts_convertStringCPToUTF8(s, size, GetACP());
}
@@ -310,18 +356,18 @@ char *hts_convertStringSystemToUTF8(const char *s, size_t size) {
#include <errno.h>
#include <iconv.h>
-char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) {
+char *hts_convertStringToUTF8_(const char *s, size_t size, const char *to, const char *from) {
/* Empty string ? */
if (size == 0) {
return strdup("");
}
- /* Already UTF-8 ? */
- if (strcasecmp(charset, "utf-8") == 0 || strcasecmp(charset, "utf8") == 0) {
+ /* Already on correct charset ? */
+ if (strcasecmp(from, to) == 0) {
return strndup(s, size);
}
/* Find codepage */
else {
- const iconv_t cp = iconv_open("utf-8", charset);
+ const iconv_t cp = iconv_open(to, from);
if (cp != (iconv_t) -1) {
char *inbuf = (char*) s;
size_t inbytesleft = size;
@@ -373,10 +419,41 @@ char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) {
return outbuf;
}
}
+
/* Error, charset not found! */
return NULL;
}
+/**
+ * Convert "s" ("size" bytes, encoded in "charset") to UTF-8.
+ * Return the converted copy, or NULL upon error.
+ *
+ * NOTE(review): the 7-bit shortcut below returns the input unconverted
+ * whenever every byte is < 0x80; this presumes "charset" is an ASCII
+ * superset (UTF-16 or ISO-2022-JP input would not be) -- to be confirmed.
+ **/
+char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) {
+  int passthrough;
+
+  /* Nothing to convert */
+  if (size == 0) {
+    return strdup("");
+  }
+  /* Declared as UTF-8, or only 7-bit bytes: a plain copy is enough */
+  passthrough = strcasecmp(charset, "utf-8") == 0
+    || strcasecmp(charset, "utf8") == 0
+    || hts_isStringAscii(s, size);
+  if (passthrough) {
+    return strndup(s, size);
+  }
+  /* Real conversion, "charset" -> "utf-8" */
+  return hts_convertStringToUTF8_(s, size, "utf-8", charset);
+}
+
+/**
+ * Convert "s" ("size" bytes of UTF-8) to the charset "charset".
+ * Return the converted copy, or NULL upon error.
+ *
+ * NOTE(review): the 7-bit shortcut below returns the input unconverted
+ * whenever every byte is < 0x80; this presumes "charset" is an ASCII
+ * superset (a UTF-16 target would not be) -- to be confirmed.
+ **/
+char *hts_convertStringFromUTF8(const char *s, size_t size, const char *charset) {
+  int passthrough;
+
+  /* Nothing to convert */
+  if (size == 0) {
+    return strdup("");
+  }
+  /* Target is UTF-8, or only 7-bit bytes: a plain copy is enough */
+  passthrough = strcasecmp(charset, "utf-8") == 0
+    || strcasecmp(charset, "utf8") == 0
+    || hts_isStringAscii(s, size);
+  if (passthrough) {
+    return strndup(s, size);
+  }
+  /* Real conversion, "utf-8" -> "charset" */
+  return hts_convertStringToUTF8_(s, size, charset, "utf-8");
+}
+
#endif
char* hts_getCharsetFromContentType(const char *mime) {
diff --git a/src/htscharset.h b/src/htscharset.h
index 035f05e..2d76b3c 100644
--- a/src/htscharset.h
+++ b/src/htscharset.h
@@ -44,6 +44,12 @@ Please visit our Website: http://www.httrack.com
extern char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset);
/**
+ * Convert the string "s" from UTF-8 to charset "charset".
+ * Return NULL upon error.
+ **/
+extern char *hts_convertStringFromUTF8(const char *s, size_t size, const char *charset);
+
+/**
* Extract the charset from the HTML buffer "html"
**/
extern char* hts_getCharsetFromMeta(const char *html, size_t size);
diff --git a/src/htscore.c b/src/htscore.c
index 8d62df7..de202db 100644
--- a/src/htscore.c
+++ b/src/htscore.c
@@ -1446,11 +1446,14 @@ int httpmirror(char* url1, httrackp* opt) {
/* Detect charset to convert links into proper UTF8 filenames */
page_charset[0] = '\0';
if (opt->convert_utf8) {
+ /* HTTP charset takes priority over meta */
if (r.charset[0] != '\0') {
if (strlen(r.charset) < sizeof(page_charset)) {
strcpy(page_charset, r.charset);
}
- } else if (is_html_mime_type(r.contenttype)) {
+ }
+ /* Attempt to find a meta charset */
+ else if (is_html_mime_type(r.contenttype)) {
char *const charset = hts_getCharsetFromMeta(r.adr, r.size);
if (charset != NULL && strlen(charset) < sizeof(page_charset)) {
strcpy(page_charset, charset);
@@ -1458,6 +1461,14 @@ int httpmirror(char* url1, httrackp* opt) {
if (charset != NULL)
free(charset);
}
+ /* Could not detect charset */
+ if (page_charset[0] == '\0') {
+ if ( (opt->debug>0) && (opt->log!=NULL) ) {
+ HTS_LOG(opt,LOG_INFO); fprintf(opt->log,"Warning: could not detect encoding for: %s%s"LF,urladr,urlfil);
+ }
+ /* Fallback to ISO-8859-1 (~== identity) ; accents will look weird */
+ strcpy(page_charset, "iso-8859-1");
+ }
}
/* Info for wrappers */
diff --git a/src/htsparse.c b/src/htsparse.c
index f127f0d..18059f5 100644
--- a/src/htsparse.c
+++ b/src/htsparse.c
@@ -54,6 +54,7 @@ Please visit our Website: http://www.httrack.com
#include "htsbauth.h"
#include "htsmd5.h"
#include "htsindex.h"
+#include "htscharset.h"
/* external modules */
#include "htsmodules.h"
@@ -2783,6 +2784,21 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) {
}
}
+ // convert to local codepage
+ if (str->page_charset_ != NULL && *str->page_charset_ != '\0') {
+ char *const local_save = hts_convertStringFromUTF8(tempo, strlen(tempo), str->page_charset_);
+ if (local_save != NULL) {
+ strcpybuff(tempo, local_save);
+ free(local_save);
+ } else {
+ if ((opt->debug>1) && (opt->log!=NULL)) {
+ HTS_LOG(opt,LOG_DEBUG);
+ fprintf(opt->log, "Warning: could not build local charset representation of '%s' in '%s'"LF, tempo, str->page_charset_);
+ test_flush;
+ }
+ }
+ }
+
// put original query string if any (ex: "www.example.com/foo4242.html?q=45)
pos = strchr(fil, '?');
if (pos != NULL) {
@@ -2872,6 +2888,21 @@ int htsparse(htsmoduleStruct* str, htsmoduleStructExtended* stre) {
}
if ((opt->getmode & 1) && (ptr>0)) {
+ // do NOT convert to local codepage here: already converted into %NN, and passed to the remote server, so we do not have anything to do
+ //if (str->page_charset_ != NULL && *str->page_charset_ != '\0') {
+ // char *const local_save = hts_convertStringFromUTF8(tempo, strlen(tempo), str->page_charset_);
+ // if (local_save != NULL) {
+ // strcpybuff(tempo, local_save);
+ // free(local_save);
+ // } else {
+ // if ((opt->debug>1) && (opt->log!=NULL)) {
+ // HTS_LOG(opt,LOG_DEBUG);
+ // fprintf(opt->log, "Warning: could not build local charset representation of '%s' in '%s'"LF, tempo, str->page_charset_);
+ // test_flush;
+ // }
+ // }
+ //}
+
// écrire le lien modifié, relatif
// Note: escape all chars, even >127 (no UTF)
HT_ADD_HTMLESCAPED_FULL(tempo);