summaryrefslogtreecommitdiff
path: root/src/htscharset.h
blob: 32124710062287db488735120b1f1eff5df293b5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/* ------------------------------------------------------------ */
/*
HTTrack Website Copier, Offline Browser for Windows and Unix
Copyright (C) 1998-2017 Xavier Roche and other contributors

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

Important notes:

- We hereby ask people using this source NOT to use it in purpose of grabbing
emails addresses, or collecting any other private information on persons.
This would disgrace our work, and spoil the many hours we spent on it.

Please visit our Website: http://www.httrack.com
*/

/* ------------------------------------------------------------ */
/* File: Charset conversion functions                           */
/* Author: Xavier Roche                                         */
/* ------------------------------------------------------------ */

#ifndef HTS_CHARSET_DEFH
#define HTS_CHARSET_DEFH

/** Standard includes. **/
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <windows.h>
#endif

/**
 * UCS4 type: holds one Unicode character, as used by the UCS4/UTF-8
 * functions declared in this header.
 * NOTE(review): UCS-4 needs 32 bits; this assumes 'unsigned int' is at least
 * 32 bits wide on every supported platform -- confirm.
 **/
typedef unsigned int hts_UCS4;

/**
 * Leading character (ASCII or leading UTF-8 sequence).
 *
 * Evaluates to 1 when the byte C can start a UTF-8 sequence, that is when it
 * is an ASCII byte (< 0x80) or a multi-byte lead byte (>= 0xc0), and to 0
 * when it is a continuation byte (0x80..0xbf, bit pattern 10xxxxxx).
 *
 * The argument is converted to unsigned char first, so plain (possibly
 * signed) char values are handled correctly. C is evaluated exactly once,
 * which makes expressions with side effects such as *p++ safe to pass.
 **/
#define HTS_IS_LEADING_UTF8(C) (((unsigned char)(C) & 0xc0) != 0x80)

/**
 * Convert the string "s" from charset "charset" to UTF-8.
 *
 * s:       source buffer, encoded in "charset".
 * size:    size of "s" (presumably in bytes, not counting any terminating
 *          \0 -- TODO confirm against the implementation).
 * charset: name of the source charset.
 *
 * Return the converted string, or NULL upon error.
 * NOTE(review): the result is a non-const char*, presumably a newly
 * allocated buffer owned by the caller -- confirm who frees it.
 **/
extern char *hts_convertStringToUTF8(const char *s, size_t size,
                                     const char *charset);

/**
 * Convert the string "s" from UTF-8 to charset "charset".
 *
 * s:       source buffer, encoded in UTF-8.
 * size:    size of "s" (presumably in bytes, not counting any terminating
 *          \0 -- TODO confirm against the implementation).
 * charset: name of the destination charset.
 *
 * Return the converted string, or NULL upon error.
 * NOTE(review): the result is a non-const char*, presumably a newly
 * allocated buffer owned by the caller -- confirm who frees it.
 **/
extern char *hts_convertStringFromUTF8(const char *s, size_t size,
                                       const char *charset);

/**
 * Convert a UTF-8 string to an IDNA (Punycode, RFC 3492) string.
 *
 * s:    source buffer, encoded in UTF-8.
 * size: size of "s" (presumably in bytes -- TODO confirm).
 *
 * NOTE(review): error reporting and ownership of the returned buffer are not
 * stated here; presumably NULL upon error and caller-owned, like the other
 * conversion functions of this header -- confirm.
 **/
extern char *hts_convertStringUTF8ToIDNA(const char *s, size_t size);

/**
 * Convert an IDNA (Punycode, RFC 3492) string to a UTF-8 string.
 *
 * s:    source buffer holding the IDNA-encoded string.
 * size: size of "s" (presumably in bytes -- TODO confirm).
 *
 * NOTE(review): error reporting and ownership of the returned buffer are not
 * stated here; presumably NULL upon error and caller-owned, like the other
 * conversion functions of this header -- confirm.
 **/
extern char *hts_convertStringIDNAToUTF8(const char *s, size_t size);

/**
 * Does the given string contain any IDNA segments?
 *
 * s:    buffer to inspect.
 * size: size of "s" (presumably in bytes -- TODO confirm).
 *
 * NOTE(review): the int result is presumably non-zero for "yes" and 0 for
 * "no" -- confirm against the implementation.
 **/
extern int hts_isStringIDNA(const char *s, size_t size);

/**
 * Extract the charset from the HTML buffer "html".
 *
 * html: buffer holding the HTML document (or its beginning).
 * size: size of "html" (presumably in bytes -- TODO confirm).
 *
 * NOTE(review): judging by the name, the charset is looked up in the
 * document's meta information; the result is presumably NULL when no charset
 * is found and a caller-owned string otherwise -- confirm.
 **/
extern char *hts_getCharsetFromMeta(const char *html, size_t size);

/**
 * Is the given string an ASCII string?
 *
 * s:    buffer to inspect.
 * size: size of "s" (presumably in bytes -- TODO confirm).
 *
 * NOTE(review): the int result is presumably non-zero for "yes" and 0 for
 * "no" -- confirm against the implementation.
 **/
extern int hts_isStringAscii(const char *s, size_t size);

/**
 * Is the given string a UTF-8 string?
 *
 * s:    buffer to inspect.
 * size: size of "s" (presumably in bytes -- TODO confirm).
 *
 * NOTE(review): the int result is presumably non-zero for "yes" and 0 for
 * "no" -- confirm against the implementation.
 **/
extern int hts_isStringUTF8(const char *s, size_t size);

/**
 * Is the given charset the UTF-8 charset?
 *
 * charset: charset name to test.
 *
 * NOTE(review): the int result is presumably non-zero for "yes" and 0 for
 * "no"; whether the comparison ignores case or accepts aliases cannot be
 * told from this header -- confirm.
 **/
extern int hts_isCharsetUTF8(const char *charset);

/**
 * Get a UTF-8 string length in characters (as opposed to bytes).
 *
 * s: UTF-8 string; no size is passed, so it is presumably \0-terminated
 *    -- TODO confirm.
 **/
extern size_t hts_stringLengthUTF8(const char *s);

/**
 * Copy at most 'nBytes' bytes from src to dest, not truncating UTF-8
 * sequences.
 * Returns the number of bytes copied, not including the terminating \0.
 *
 * NOTE(review): since the count excludes the terminating \0, 'dest'
 * presumably needs room for nBytes + 1 bytes -- confirm against the
 * implementation before sizing buffers.
 **/
extern size_t hts_copyStringUTF8(char *dest, const char *src, 
                                 size_t nBytes);

/**
 * Append at most 'nBytes' bytes from src to dest, not truncating UTF-8
 * sequences.
 * Returns the number of bytes appended, not including the terminating \0.
 *
 * NOTE(review): 'dest' presumably already holds a \0-terminated string and
 * needs room for nBytes + 1 additional bytes -- confirm against the
 * implementation before sizing buffers.
 **/
extern size_t hts_appendStringUTF8(char *dest, const char *src, 
                                   size_t nBytes);

/**
 * Convert a UTF-8 string into a Unicode (UCS4) string (0-terminated).
 *
 * s:      source buffer, encoded in UTF-8.
 * size:   size of "s" (presumably in bytes -- TODO confirm).
 * nChars: output parameter; presumably receives the number of characters in
 *         the result -- confirm, including whether NULL is accepted.
 *
 * NOTE(review): error reporting and ownership of the returned array are not
 * stated here; presumably NULL upon error and caller-owned -- confirm.
 **/
extern hts_UCS4* hts_convertUTF8StringToUCS4(const char *s, size_t size, 
                                             size_t *nChars);

/**
 * Convert a Unicode (UCS4) string into a UTF-8 string.
 *
 * s:      source array of UCS4 characters.
 * nChars: number of characters to convert from "s".
 *
 * NOTE(review): error reporting and ownership of the returned buffer are not
 * stated here; presumably NULL upon error and caller-owned -- confirm.
 **/
extern char *hts_convertUCS4StringToUTF8(const hts_UCS4 *s, size_t nChars);

/**
 * Return the length (in characters) of a UCS4 string terminated by 0.
 *
 * NOTE(review): the terminating 0 is presumably not counted, as with
 * strlen() -- confirm.
 **/
extern size_t hts_stringLengthUCS4(const hts_UCS4 *s);

/**
 * Write the Unicode character 'uc', as UTF-8, in 'dest' of maximum size
 * 'size'.
 *
 * uc:   character to write.
 * dest: destination buffer.
 * size: maximum number of bytes that may be written to 'dest'.
 *
 * Return the number of bytes written, or 0 upon error.
 * Note: does not \0-terminate the destination buffer.
 **/
extern size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size);

/**
 * Read the next Unicode character within 'src' of size 'size' and, upon
 * successful reading, return the number of bytes read and place the
 * character in 'puc'.
 * Return 0 upon error.
 *
 * NOTE(review): whether 'puc' may be NULL (to only measure the sequence)
 * cannot be told from this header -- confirm before relying on it.
 **/
extern size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc);

/**
 * Given the first byte ('lead') of a UTF-8 sequence, get the total number
 * of bytes (chars) in the sequence (1 for ASCII).
 * Return 0 upon error (not a leading character).
 **/
extern size_t hts_getUTF8SequenceLength(const char lead);

/** WIN32 specific functions. **/
#ifdef _WIN32
/**
 * Convert UTF-8 to WCHAR.
 * This function is WIN32 specific.
 *
 * s:      source buffer, encoded in UTF-8.
 * size:   size of "s" (an int here, unlike the size_t used by the portable
 *         functions; presumably in bytes -- TODO confirm).
 * pwsize: output parameter; presumably receives the size of the result in
 *         wide characters -- confirm.
 *
 * NOTE(review): error reporting and ownership of the returned buffer are not
 * stated here; presumably NULL upon error and caller-owned -- confirm.
 **/
extern LPWSTR hts_convertUTF8StringToUCS2(const char *s, int size, int *pwsize);

/**
 * Convert from WCHAR to UTF-8.
 * This function is WIN32 specific.
 *
 * woutput: source wide-character string (despite its name, it is the input
 *          of this function).
 * wsize:   size of "woutput" (presumably in wide characters -- TODO
 *          confirm).
 *
 * NOTE(review): error reporting and ownership of the returned buffer are not
 * stated here; presumably NULL upon error and caller-owned -- confirm.
 **/
extern char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize);

/**
 * Convert a string from the current system codepage to UTF-8.
 * This function is WIN32 specific.
 *
 * s:    source buffer, encoded in the current system codepage.
 * size: size of "s" (presumably in bytes -- TODO confirm).
 *
 * NOTE(review): error reporting and ownership of the returned buffer are not
 * stated here; presumably NULL upon error and caller-owned -- confirm.
 **/
extern char *hts_convertStringSystemToUTF8(const char *s, size_t size);
#endif

#endif