summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/htscharset.c30
-rw-r--r--src/htscharset.h14
2 files changed, 39 insertions, 5 deletions
diff --git a/src/htscharset.c b/src/htscharset.c
index 932c01e..405a9aa 100644
--- a/src/htscharset.c
+++ b/src/htscharset.c
@@ -540,7 +540,7 @@ size_t hts_stringLengthUTF8(const char *s) {
for(i = 0, len = 0; bytes[i] != '\0'; i++) {
const unsigned char c = bytes[i];
- if (HTS_IS_LEADING_UTF8(c)) { // ASCII or leading byte
+ if (HTS_IS_LEADING_UTF8(c)) { /* ASCII or leading byte */
len++;
}
}
@@ -578,7 +578,7 @@ int hts_isCharsetUTF8(const char *charset) {
char *hts_getCharsetFromMeta(const char *html, size_t size) {
int i;
- // <META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf-8" >
+ /* <META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf-8" > */
for(i = 0; i < size; i++) {
if (html[i] == '<' && strncasecmp(&html[i + 1], "meta", 4) == 0
&& is_space(html[i + 5])) {
@@ -1170,6 +1170,32 @@ size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size) {
return offs;
}
+size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc) {
+ size_t i = 0;
+ int uc = -1;
+ /* Decode one character from the first 'size' bytes of 'src'; return bytes consumed, or 0 on failure. */
+ /* Reader: next byte as 0..255 (cast keeps bytes >= 0x80 from turning negative / colliding with -1), or -1 at end. */
+#define RD ( i < size ? (unsigned char) src[i++] : -1 )
+
+ /* Writer: store the value emitted by READ_UNICODE; 'uc' stays -1 if nothing is emitted. */
+#define WR(C) uc = (C)
+
+ /* Read Unicode character. */
+ READ_UNICODE(RD, WR);
+#undef RD
+#undef WR
+
+ /* Success: store the character in 'puc' (when non-NULL) and report the bytes consumed. */
+ if (uc != -1) {
+ if (puc != NULL) {
+ *puc = (hts_UCS4) uc;
+ }
+ return i;
+ }
+ /* Nothing was emitted by READ_UNICODE: report failure with 0. */
+ return 0;
+}
+
size_t hts_stringLengthUCS4(const hts_UCS4 *s) {
size_t i;
for(i = 0 ; s[i] != 0 ; i++) ;
diff --git a/src/htscharset.h b/src/htscharset.h
index c22978e..0453788 100644
--- a/src/htscharset.h
+++ b/src/htscharset.h
@@ -135,25 +135,33 @@ extern size_t hts_stringLengthUCS4(const hts_UCS4 *s);
**/
extern size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size);
-/* WIN32 specific. */
+/**
+ * Read the next Unicode character within 'src' of size 'size' and, upon
+ * successful reading, return the number of bytes read and place the
+ * character in 'puc'.
+ * Return 0 upon error.
+ **/
+extern size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc);
+/** WIN32 specific functions. **/
#ifdef _WIN32
-
/**
* Convert UTF-8 to WCHAR.
+ * This function is WIN32 specific.
**/
extern LPWSTR hts_convertUTF8StringToUCS2(const char *s, int size, int *pwsize);
/**
* Convert from WCHAR.
+ * This function is WIN32 specific.
**/
extern char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize);
/**
* Convert current system codepage to UTF-8.
+ * This function is WIN32 specific.
**/
extern char *hts_convertStringSystemToUTF8(const char *s, size_t size);
-
#endif
#endif