diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2012-05-06 14:23:07 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2012-05-06 14:23:07 +0000 |
commit | 06000077987fd107a3a69761f39b9a83eb1d8af0 (patch) | |
tree | 7d31d768d33955ed1cae99c79bfa5087117a4555 /src | |
parent | 8368e4772fdd69ae04d95b08e042fe43eae3392c (diff) |
Missing files
Diffstat (limited to 'src')
-rw-r--r-- | src/htscharset.c | 451 | ||||
-rw-r--r-- | src/htscharset.h | 70 |
2 files changed, 521 insertions, 0 deletions
/* ============================================================ */
/* File added by this commit: src/htscharset.c (mode 100644)    */
/* ============================================================ */

/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) Xavier Roche and other contributors

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 3
of the License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


Important notes:

- We hereby ask people using this source NOT to use it in purpose of grabbing
emails addresses, or collecting any other private information on persons.
This would disgrace our work, and spoil the many hours we spent on it.


Please visit our Website: http://www.httrack.com
*/


/* ------------------------------------------------------------ */
/* File: Charset conversion functions                           */
/* Author: Xavier Roche                                         */
/* ------------------------------------------------------------ */

#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* ---- Helpers shared by the Windows and the iconv back-ends ---- */

/* Lowercase an ASCII letter; any other value is returned unchanged. */
static int hts_asciiLower(int c) {
  return (c >= 'A' && c <= 'Z') ? (c + 'a' - 'A') : c;
}

/*
 * Duplicate the first "size" bytes of "s" into a freshly malloc'ed,
 * NUL-terminated buffer. "s" is not read when "size" is 0.
 * Return NULL if the allocation fails. The caller frees the result.
 */
static char *hts_dupBytes(const char *s, size_t size) {
  char *const dest = malloc(size + 1);
  if (dest == NULL) {
    return NULL;
  }
  if (size != 0) {
    memcpy(dest, s, size);
  }
  dest[size] = '\0';
  return dest;
}

#ifdef _WIN32

#include <windows.h>

typedef struct wincodepage_t wincodepage_t;
struct wincodepage_t {
  UINT codepage;      /* Windows code page identifier */
  const char *name;   /* charset name associated with the identifier */
};

/* See <http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx> */
/* The table is terminated by a {0, NULL} entry. Some names appear twice
   (e.g. "euc-jp", "iso-2022-jp"); lookups return the first matching entry. */
static const wincodepage_t codepages[] = {
  {37, "ibm037"},
  {437, "ibm437"},
  {500, "ibm500"},
  {708, "asmo-708"},
  {720, "dos-720"},
  {737, "ibm737"},
  {775, "ibm775"},
  {850, "ibm850"},
  {852, "ibm852"},
  {855, "ibm855"},
  {857, "ibm857"},
  {858, "ibm00858"},
  {860, "ibm860"},
  {861, "ibm861"},
  {862, "dos-862"},
  {863, "ibm863"},
  {864, "ibm864"},
  {865, "ibm865"},
  {866, "cp866"},
  {869, "ibm869"},
  {870, "ibm870"},
  {874, "windows-874"},
  {875, "cp875"},
  {932, "shift_jis"},
  {936, "gb2312"},
  {949, "ks_c_5601-1987"},
  {950, "big5"},
  {1026, "ibm1026"},
  {1047, "ibm01047"},
  {1140, "ibm01140"},
  {1141, "ibm01141"},
  {1142, "ibm01142"},
  {1143, "ibm01143"},
  {1144, "ibm01144"},
  {1145, "ibm01145"},
  {1146, "ibm01146"},
  {1147, "ibm01147"},
  {1148, "ibm01148"},
  {1149, "ibm01149"},
  {1200, "utf-16"},
  {1201, "unicodefffe"},
  {1250, "windows-1250"},
  {1251, "windows-1251"},
  {1252, "windows-1252"},
  {1253, "windows-1253"},
  {1254, "windows-1254"},
  {1255, "windows-1255"},
  {1256, "windows-1256"},
  {1257, "windows-1257"},
  {1258, "windows-1258"},
  {1361, "johab"},
  {10000, "macintosh"},
  {10001, "x-mac-japanese"},
  {10002, "x-mac-chinesetrad"},
  {10003, "x-mac-korean"},
  {10004, "x-mac-arabic"},
  {10005, "x-mac-hebrew"},
  {10006, "x-mac-greek"},
  {10007, "x-mac-cyrillic"},
  {10008, "x-mac-chinesesimp"},
  {10010, "x-mac-romanian"},
  {10017, "x-mac-ukrainian"},
  {10021, "x-mac-thai"},
  {10029, "x-mac-ce"},
  {10079, "x-mac-icelandic"},
  {10081, "x-mac-turkish"},
  {10082, "x-mac-croatian"},
  {12000, "utf-32"},
  {12001, "utf-32be"},
  {20000, "x-chinese_cns"},
  {20001, "x-cp20001"},
  {20002, "x_chinese-eten"},
  {20003, "x-cp20003"},
  {20004, "x-cp20004"},
  {20005, "x-cp20005"},
  {20105, "x-ia5"},
  {20106, "x-ia5-german"},
  {20107, "x-ia5-swedish"},
  {20108, "x-ia5-norwegian"},
  {20127, "us-ascii"},
  {20261, "x-cp20261"},
  {20269, "x-cp20269"},
  {20273, "ibm273"},
  {20277, "ibm277"},
  {20278, "ibm278"},
  {20280, "ibm280"},
  {20284, "ibm284"},
  {20285, "ibm285"},
  {20290, "ibm290"},
  {20297, "ibm297"},
  {20420, "ibm420"},
  {20423, "ibm423"},
  {20424, "ibm424"},
  {20833, "x-ebcdic-koreanextended"},
  {20838, "ibm-thai"},
  {20866, "koi8-r"},
  {20871, "ibm871"},
  {20880, "ibm880"},
  {20905, "ibm905"},
  {20924, "ibm00924"},
  {20932, "euc-jp"},
  {20936, "x-cp20936"},
  {20949, "x-cp20949"},
  {21025, "cp1025"},
  {21866, "koi8-u"},
  {28591, "iso-8859-1"},
  {28592, "iso-8859-2"},
  {28593, "iso-8859-3"},
  {28594, "iso-8859-4"},
  {28595, "iso-8859-5"},
  {28596, "iso-8859-6"},
  {28597, "iso-8859-7"},
  {28598, "iso-8859-8"},
  {28599, "iso-8859-9"},
  {28603, "iso-8859-13"},
  {28605, "iso-8859-15"},
  {29001, "x-europa"},
  {38598, "iso-8859-8-i"},
  {50220, "iso-2022-jp"},
  {50221, "csiso2022jp"},
  {50222, "iso-2022-jp"},
  {50225, "iso-2022-kr"},
  {50227, "x-cp50227"},
  {50229, "iso-2022-cn"},
  {51932, "euc-jp"},
  {51936, "euc-cn"},
  {51949, "euc-kr"},
  {52936, "hz-gb-2312"},
  {54936, "gb18030"},
  {57002, "x-iscii-de"},
  {57003, "x-iscii-be"},
  {57004, "x-iscii-ta"},
  {57005, "x-iscii-te"},
  {57006, "x-iscii-as"},
  {57007, "x-iscii-or"},
  {57008, "x-iscii-ka"},
  {57009, "x-iscii-ma"},
  {57010, "x-iscii-gu"},
  {57011, "x-iscii-pa"},
  {65000, "utf-7"},
  {65001, "utf-8"},
  {0, NULL}
};

/* Return non-zero if "c" is an ASCII letter or digit. */
static int hts_isAsciiAlnum(int c) {
  return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
    || (c >= '0' && c <= '9');
}

/*
 * Get a Windows codepage, by its name. Return 0 upon error (unknown or
 * NULL name).
 * Names are compared on their ASCII letters and digits only, ignoring case,
 * so that "ISO88591" matches "iso-8859-1".
 */
UINT hts_getCodepage(const char *name) {
  size_t id;

  if (name == NULL) {
    return 0;
  }
  for(id = 0 ; codepages[id].name != NULL ; id++) {
    const char *a = name;
    const char *b = codepages[id].name;
    for(;;) {
      /* Skip non-alnum on both sides */
      while(*a != '\0' && !hts_isAsciiAlnum((unsigned char) *a)) {
        a++;
      }
      while(*b != '\0' && !hts_isAsciiAlnum((unsigned char) *b)) {
        b++;
      }
      /* Compare */
      if (hts_asciiLower((unsigned char) *a)
          != hts_asciiLower((unsigned char) *b)) {
        break;
      }
      /* End of both strings: full match */
      if (*a == '\0') {
        return codepages[id].codepage;
      }
      a++;
      b++;
    }
  }
  /* Not found */
  return 0;
}

/*
 * Convert "size" bytes of "s", encoded in codepage "cp", to wide chars.
 * Return a malloc'ed buffer terminated by a 0 wide char, or NULL upon error;
 * the caller frees it. When "pwsize" is not NULL, it receives the number of
 * wide chars written (terminator excluded).
 */
LPWSTR hts_convertStringToUCS2(const char *s, int size, UINT cp, int *pwsize) {
  int wsize;
  LPWSTR woutput;

  if (s == NULL) {
    return NULL;
  }
  /* Size in wide chars of the output */
  wsize = MultiByteToWideChar(cp, 0, (LPCSTR) s, size, NULL, 0);
  if (wsize <= 0) {
    return NULL;
  }
  woutput = malloc(((size_t) wsize + 1) * sizeof(WCHAR));
  if (woutput == NULL) {
    return NULL;
  }
  if (MultiByteToWideChar(cp, 0, (LPCSTR) s, size, woutput, wsize) != wsize) {
    free(woutput);
    return NULL;
  }
  woutput[wsize] = 0x0;
  if (pwsize != NULL) {
    *pwsize = wsize;
  }
  return woutput;
}

/* Convert "size" bytes of UTF-8 to wide chars
   (see hts_convertStringToUCS2). */
LPWSTR hts_convertUTF8StringToUCS2(const char *s, int size, int *pwsize) {
  return hts_convertStringToUCS2(s, size, CP_UTF8, pwsize);
}

/*
 * Convert "wsize" wide chars of "woutput" to UTF-8.
 * Return a malloc'ed NUL-terminated string, or NULL upon error; the caller
 * frees it.
 */
char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize) {
  int usize;
  char *uoutput;

  if (woutput == NULL) {
    return NULL;
  }
  /* Size in bytes of the output */
  usize = WideCharToMultiByte(CP_UTF8, 0, woutput, wsize, NULL, 0, NULL, NULL);
  if (usize <= 0) {
    return NULL;
  }
  uoutput = malloc((size_t) usize + 1);
  if (uoutput == NULL) {
    return NULL;
  }
  if (WideCharToMultiByte(CP_UTF8, 0, woutput, wsize, uoutput, usize,
                          NULL, NULL) != usize) {
    free(uoutput);
    return NULL;
  }
  uoutput[usize] = '\0';
  return uoutput;
}

/*
 * Convert "size" bytes of "s", encoded in codepage "cp", to UTF-8.
 * Return a malloc'ed NUL-terminated string (caller frees), or NULL upon
 * error ("cp" is 0, input too large for the Win32 API, conversion or
 * allocation failure).
 */
char *hts_convertStringCPToUTF8(const char *s, size_t size, UINT cp) {
  int wsize = 0;
  LPWSTR woutput;
  char *uoutput;

  /* Empty string ? */
  if (size == 0) {
    return hts_dupBytes(s, 0);
  }
  if (s == NULL) {
    return NULL;
  }
  /* Already UTF-8 ? */
  if (cp == CP_UTF8) {
    return hts_dupBytes(s, size);
  }
  /* Charset not found, or size not representable as "int" */
  if (cp == 0 || size > (size_t) INT_MAX) {
    return NULL;
  }
  /* Other (valid) charset: go through wide chars */
  woutput = hts_convertStringToUCS2(s, (int) size, cp, &wsize);
  if (woutput == NULL) {
    return NULL;
  }
  uoutput = hts_convertUCS2StringToUTF8(woutput, wsize);
  free(woutput);
  return uoutput;
}

/*
 * Convert the string "s" from charset "charset" to UTF-8.
 * Return a malloc'ed NUL-terminated string (caller frees), or NULL upon
 * error.
 */
char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) {
  const UINT cp = hts_getCodepage(charset);
  return hts_convertStringCPToUTF8(s, size, cp);
}

/* Convert "s" from the codepage returned by GetACP() to UTF-8
   (see hts_convertStringCPToUTF8). */
char *hts_convertStringSystemToUTF8(const char *s, size_t size) {
  return hts_convertStringCPToUTF8(s, size, GetACP());
}

#else

#include <iconv.h>

/* Return non-zero if "a" and "b" are equal, ignoring ASCII case. */
static int hts_equalsNoCase(const char *a, const char *b) {
  size_t i;
  for(i = 0 ; ; i++) {
    if (hts_asciiLower((unsigned char) a[i])
        != hts_asciiLower((unsigned char) b[i])) {
      return 0;
    }
    if (a[i] == '\0') {
      return 1;
    }
  }
}

/*
 * Run the conversion descriptor "cd" over "size" bytes of "s".
 * Return a malloc'ed NUL-terminated string, or NULL upon conversion or
 * allocation error. "cd" is left open.
 */
static char *hts_iconvToUTF8(iconv_t cd, const char *s, size_t size) {
  /* iconv() takes a non-const pointer, hence the cast */
  char *src = (char *) s;
  size_t srcLeft = size;
  size_t capa;
  size_t used = 0;
  char *out;

  if (size > SIZE_MAX / 2) {
    return NULL;
  }
  capa = size + 16;
  out = malloc(capa);
  if (out == NULL) {
    return NULL;
  }

  for(;;) {
    /* iconv() advances the output pointer: always work on a copy, so that
       "out" remains the pointer returned by malloc()/realloc() */
    char *dst = out + used;
    size_t dstLeft = capa - used - 1;     /* keep one byte for the NUL */
    /* Once the input is consumed, a call with a NULL input asks iconv()
       to emit any pending shift sequence */
    const int flushing = (srcLeft == 0);
    const size_t ret = flushing
      ? iconv(cd, NULL, NULL, &dst, &dstLeft)
      : iconv(cd, &src, &srcLeft, &dst, &dstLeft);

    used = (size_t) (dst - out);
    if (ret != (size_t) -1) {
      if (flushing) {
        break;                            /* all done */
      }
      if (srcLeft != 0) {                 /* defensive: no progress */
        free(out);
        return NULL;
      }
      continue;
    }

    /* Only "output buffer too small" is recoverable; invalid or truncated
       input (EILSEQ, EINVAL) is an error. */
    if (errno != E2BIG || capa > SIZE_MAX / 2) {
      free(out);
      return NULL;
    } else {
      char *const grown = realloc(out, capa * 2);
      if (grown == NULL) {
        free(out);
        return NULL;
      }
      out = grown;
      capa *= 2;
    }
  }

  out[used] = '\0';
  return out;
}

/*
 * Convert the string "s" from charset "charset" to UTF-8.
 * Return a malloc'ed NUL-terminated string (caller frees), or NULL upon
 * error (NULL argument, charset rejected by iconv_open(), input that can not
 * be converted, allocation failure).
 */
char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset) {
  iconv_t cd;
  char *result;

  /* Empty string ? */
  if (size == 0) {
    return hts_dupBytes(s, 0);
  }
  if (s == NULL || charset == NULL) {
    return NULL;
  }
  /* Already UTF-8 ? */
  if (hts_equalsNoCase(charset, "utf-8") || hts_equalsNoCase(charset, "utf8")) {
    return hts_dupBytes(s, size);
  }
  /* Find codepage */
  cd = iconv_open("UTF-8", charset);
  if (cd == (iconv_t) -1) {
    /* Error, charset not found! */
    return NULL;
  }
  result = hts_iconvToUTF8(cd, s, size);

  /* Close codepage */
  iconv_close(cd);

  /* Return result (may be NULL) */
  return result;
}

#endif

/* ---- Charset detection (platform independent) ---- */

static int is_space(char c) {
  return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}

static int is_space_or_equal(char c) {
  return is_space(c) || c == '=';
}

static int is_space_or_equal_or_quote(char c) {
  return is_space_or_equal(c) || c == '"' || c == '\'';
}

/* Return buf[pos], or '\0' when "pos" is outside the "size" bytes of "buf". */
static char hts_charAt(const char *buf, size_t size, size_t pos) {
  return pos < size ? buf[pos] : '\0';
}

/*
 * Return non-zero if "token" is found at offset "pos" of "buf", ignoring
 * ASCII case. Never reads beyond the first "size" bytes of "buf".
 */
static int hts_matchNoCase(const char *buf, size_t size, size_t pos,
                           const char *token) {
  size_t k;
  for(k = 0 ; token[k] != '\0' ; k++) {
    if (pos + k >= size) {
      return 0;
    }
    if (hts_asciiLower((unsigned char) buf[pos + k])
        != hts_asciiLower((unsigned char) token[k])) {
      return 0;
    }
  }
  return 1;
}

/*
 * Extract the charset from a Content-Type value such as
 * "text/html; charset=utf-8".
 * Return a malloc'ed NUL-terminated string (caller frees), or NULL if no
 * "charset=<value>" parameter is found or upon allocation failure.
 */
char *hts_getCharsetFromContentType(const char *mime) {
  static const char key[] = "charset";
  size_t size, pos, len;
  int eq = 0;

  if (mime == NULL) {
    return NULL;
  }
  size = strlen(mime);

  /* Locate the parameter name (case-insensitive) */
  for(pos = 0 ; pos < size && !hts_matchNoCase(mime, size, pos, key) ; pos++) ;
  if (pos >= size) {
    return NULL;
  }

  /* Skip spaces, '=' and quotes; a '=' is mandatory.
     The loop stops at the terminating NUL at the latest. */
  for(pos += sizeof(key) - 1 ;
      mime[pos] == ' ' || mime[pos] == '=' || mime[pos] == '"'
      || mime[pos] == '\'' ;
      pos++) {
    if (mime[pos] == '=') {
      eq = 1;
    }
  }
  if (!eq) {
    return NULL;
  }

  /* The value runs until a space, a ';', a quote or the end of string */
  for(len = 0 ;
      mime[pos + len] != '\0' && !is_space(mime[pos + len])
      && mime[pos + len] != ';' && mime[pos + len] != '"'
      && mime[pos + len] != '\'' ;
      len++) ;
  if (len == 0) {
    return NULL;
  }
  return hts_dupBytes(&mime[pos], len);
}

/*
 * Parse the attributes of a <meta> tag, starting at offset "p" (just after
 * "<meta"), expecting in this order:
 *   HTTP-EQUIV="CONTENT-TYPE" CONTENT="<type>; charset=<value>"
 * Return the malloc'ed charset value, or NULL if the tag does not match.
 */
static char *hts_parseMetaCharset(const char *html, size_t size, size_t p) {
  size_t len;
  char c;

  /* Skip spaces */
  for( ; is_space(hts_charAt(html, size, p)) ; p++) ;

  if (!hts_matchNoCase(html, size, p, "HTTP-EQUIV")
      || !is_space_or_equal(hts_charAt(html, size, p + 10))) {
    return NULL;
  }
  for(p += 10 ; is_space_or_equal_or_quote(hts_charAt(html, size, p)) ; p++) ;

  if (!hts_matchNoCase(html, size, p, "CONTENT-TYPE")) {
    return NULL;
  }
  for(p += 12 ; is_space_or_equal_or_quote(hts_charAt(html, size, p)) ; p++) ;

  if (!hts_matchNoCase(html, size, p, "CONTENT")
      || !is_space_or_equal(hts_charAt(html, size, p + 7))) {
    return NULL;
  }
  for(p += 7 ; is_space_or_equal_or_quote(hts_charAt(html, size, p)) ; p++) ;

  /* Skip content-type; do not run past the end of the attribute or tag */
  for( ; p < size ; p++) {
    c = html[p];
    if (c == ';' || c == '"' || c == '\'' || c == '>') {
      break;
    }
  }
  /* Expect charset attribute here */
  if (hts_charAt(html, size, p) != ';') {
    return NULL;
  }
  for(p++ ; is_space(hts_charAt(html, size, p)) ; p++) ;

  /* Look for charset */
  if (!hts_matchNoCase(html, size, p, "charset")
      || !is_space_or_equal(hts_charAt(html, size, p + 7))) {
    return NULL;
  }
  for(p += 7 ;
      is_space_or_equal(hts_charAt(html, size, p))
      || hts_charAt(html, size, p) == '\'' ;
      p++) ;

  /* Charset value */
  for(len = 0 ; p + len < size ; len++) {
    c = html[p + len];
    if (c == '"' || c == '\'' || c == ';' || c == '>' || is_space(c)) {
      break;
    }
  }
  if (len == 0) {
    return NULL;
  }
  return hts_dupBytes(&html[p], len);
}

/*
 * Extract the charset from the first "size" bytes of the HTML buffer "html",
 * looking for a tag of the form
 *   <META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf-8" >
 * (case-insensitive). The buffer does not need to be NUL-terminated.
 * Return a malloc'ed NUL-terminated string (caller frees), or NULL if no
 * such tag is found.
 */
char *hts_getCharsetFromMeta(const char *html, size_t size) {
  size_t i;

  if (html == NULL) {
    return NULL;
  }
  for(i = 0 ; i < size ; i++) {
    if (html[i] == '<' && hts_matchNoCase(html, size, i + 1, "meta")
        && is_space(hts_charAt(html, size, i + 5))) {
      /* Parse from a copy of the position, so that a tag that does not
         match never hides a later one */
      char *const charset = hts_parseMetaCharset(html, size, i + 5);
      if (charset != NULL) {
        return charset;
      }
    }
  }
  return NULL;
}

/* ============================================================ */
/* File added by this commit: src/htscharset.h (mode 100644)    */
/* ============================================================ */

/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) Xavier Roche and other contributors

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 3
of the License, or any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


Important notes:

- We hereby ask people using this source NOT to use it in purpose of grabbing
emails addresses, or collecting any other private information on persons.
This would disgrace our work, and spoil the many hours we spent on it.


Please visit our Website: http://www.httrack.com
*/


/* ------------------------------------------------------------ */
/* File: Charset conversion functions                           */
/* Author: Xavier Roche                                         */
/* ------------------------------------------------------------ */

#ifndef HTS_CHARSET_DEFH
#define HTS_CHARSET_DEFH

#include <stddef.h>   /* size_t */

/**
 * Convert the string "s" from charset "charset" to UTF-8.
 * Return NULL upon error.
 * The returned string is allocated with malloc() and must be freed by the
 * caller.
 **/
extern char *hts_convertStringToUTF8(const char *s, size_t size, const char *charset);

/**
 * Extract the charset from the HTML buffer "html".
 * Return NULL if none is found; otherwise the returned string is allocated
 * with malloc() and must be freed by the caller.
 **/
extern char *hts_getCharsetFromMeta(const char *html, size_t size);

#ifdef _WIN32

/* Note: the declarations below use LPWSTR; <windows.h> must be included
   before this header. */

/**
 * Convert UTF-8 to WCHAR.
 **/
extern LPWSTR hts_convertUTF8StringToUCS2(const char *s, int size, int *pwsize);

/**
 * Convert from WCHAR.
 **/
extern char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize);

/**
 * Convert current system codepage to UTF-8.
 **/
extern char *hts_convertStringSystemToUTF8(const char *s, size_t size);

#endif

#endif