From 7f82ef46816b65268b231e75eb403faf2801e52a Mon Sep 17 00:00:00 2001 From: Xavier Roche Date: Sat, 1 Jun 2013 09:31:43 +0000 Subject: Added hts_readUTF8() --- src/htscharset.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'src/htscharset.c') diff --git a/src/htscharset.c b/src/htscharset.c index 932c01e..405a9aa 100644 --- a/src/htscharset.c +++ b/src/htscharset.c @@ -540,7 +540,7 @@ size_t hts_stringLengthUTF8(const char *s) { for(i = 0, len = 0; bytes[i] != '\0'; i++) { const unsigned char c = bytes[i]; - if (HTS_IS_LEADING_UTF8(c)) { // ASCII or leading byte + if (HTS_IS_LEADING_UTF8(c)) { /* ASCII or leading byte */ len++; } } @@ -578,7 +578,7 @@ int hts_isCharsetUTF8(const char *charset) { char *hts_getCharsetFromMeta(const char *html, size_t size) { int i; - // + /* */ for(i = 0; i < size; i++) { if (html[i] == '<' && strncasecmp(&html[i + 1], "meta", 4) == 0 && is_space(html[i + 5])) { @@ -1170,6 +1170,32 @@ size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size) { return offs; } +size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc) { + size_t i = 0; + int uc = -1; + + /* Reader: yields the next byte of src while i < size, else -1. NOTE(review): src is plain char, so where char is signed a 0xFF byte also reads as -1 - confirm READ_UNICODE copes */ +#define RD ( i < size ? src[i++] : -1 ) + + /* Writer: stores the value given by READ_UNICODE into uc. NOTE(review): the old note said errors yield FFFD (replacement character); nothing visible here does that - confirm in READ_UNICODE */ +#define WR(C) uc = (C) + + /* Read Unicode character. */ + READ_UNICODE(RD, WR); +#undef RD +#undef WR + + /* Return the number of bytes consumed from src, handing the character back through puc when it is non-NULL; 0 when uc is still -1 */ + if (uc != -1) { + if (puc != NULL) { + *puc = (hts_UCS4) uc; + } + return i; + } + + return 0; +} + size_t hts_stringLengthUCS4(const hts_UCS4 *s) { size_t i; for(i = 0 ; s[i] != 0 ; i++) ; -- cgit v1.2.3