diff options
author | Xavier Roche <xroche@users.noreply.github.com> | 2013-06-02 13:14:22 +0000 |
---|---|---|
committer | Xavier Roche <xroche@users.noreply.github.com> | 2013-06-02 13:14:22 +0000 |
commit | 3e0b1c7f208f7d5cc830ae78e66c781d48159737 (patch) | |
tree | 5a737ed3fe7dac8f4beb80cfb705ab3dc7d602fa /src | |
parent | d4c81f4e8592bcf3d07cf4bb93e0351201413008 (diff) |
Added hts_getUTF8SequenceLength()
Diffstat (limited to 'src')
-rw-r--r-- | src/htscharset.c | 33 | ||||
-rw-r--r-- | src/htscharset.h | 7 |
2 files changed, 36 insertions, 4 deletions
```diff
diff --git a/src/htscharset.c b/src/htscharset.c
index 405a9aa..5dd58ad 100644
--- a/src/htscharset.c
+++ b/src/htscharset.c
@@ -730,7 +730,7 @@ static unsigned int nlz8(unsigned char x) {
     int i_; \
     /* loop should be unrolled by compiler */ \
     for(i_ = 0 ; i_ < 7 - CLEARED ; i_++) { \
-      const int c_ = READER; \
+      const int c_ = (READER); \
       /* continuation byte 10xxxxxx */ \
       if (c_ != -1 && ( c_ >> 6 ) == 0x2) { \
         uc_ <<= 6; \
@@ -747,7 +747,7 @@ static unsigned int nlz8(unsigned char x) {
    EMITTER is a macro function taking an int (-1 for error). */
 #define READ_UNICODE(READER, EMITTER) do { \
   const unsigned int f_ = \
-    (unsigned int) READER; \
+    (unsigned int) (READER); \
   /* 1..8 */ \
   const unsigned int c_ = \
     nlz8((unsigned char)~f_); \
@@ -1101,6 +1101,7 @@ char *hts_convertStringIDNAToUTF8(const char *s, size_t size) {
 }
 
 hts_UCS4* hts_convertUTF8StringToUCS4(const char *s, size_t size, size_t *nChars) {
+  const unsigned char *const data = (const unsigned char*) s;
   size_t i;
   hts_UCS4 *dest = NULL;
   size_t capa = 0, destSize = 0;
@@ -1112,7 +1113,7 @@ hts_UCS4* hts_convertUTF8StringToUCS4(const char *s, size_t size, size_t *nChars
     hts_UCS4 uc;
 
     /* Reader: can read bytes up to j */
-#define RD ( i < size ? s[i++] : -1 )
+#define RD ( i < size ? data[i++] : -1 )
 
     /* Writer: upon error, return FFFD (replacement character) */
 #define WR(C) uc = (C) != -1 ? (hts_UCS4) (C) : (hts_UCS4) 0xfffd
@@ -1171,11 +1172,12 @@ size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size) {
 }
 
 size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc) {
+  const unsigned char *const data = (const unsigned char*) src;
   size_t i = 0;
   int uc = -1;
 
   /* Reader: can read bytes up to j */
-#define RD ( i < size ? src[i++] : -1 )
+#define RD ( i < size ? data[i++] : -1 )
 
   /* Writer: upon error, return FFFD (replacement character) */
 #define WR(C) uc = (C)
@@ -1196,6 +1198,29 @@ size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc) {
   return 0;
 }
 
+size_t hts_getUTF8SequenceLength(const char lead) {
+  const unsigned char f = (unsigned char) lead;
+  const unsigned int c = nlz8(~f);
+  switch(c) {
+  case 0:
+    /* ASCII */
+    return 1;
+    break;
+  case 2:
+  case 3:
+  case 4:
+  case 5:
+  case 6:
+    /* UTF-8 */
+    return c;
+    break;
+  default:
+    /* ERROR */
+    return 0;
+    break;
+  }
+}
+
 size_t hts_stringLengthUCS4(const hts_UCS4 *s) {
   size_t i;
   for(i = 0 ; s[i] != 0 ; i++) ;
diff --git a/src/htscharset.h b/src/htscharset.h
index 0453788..92f8b7c 100644
--- a/src/htscharset.h
+++ b/src/htscharset.h
@@ -143,6 +143,13 @@ extern size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size);
  **/
 extern size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc);
 
+/**
+ * Given the first UTF-8 sequence character, get the total number of
+ * characters in the sequence (1 for ASCII).
+ * Return 0 upon error (not a leading character).
+ **/
+extern size_t hts_getUTF8SequenceLength(const char lead);
+
 /** WIN32 specific functions. **/
 #ifdef _WIN32
 /**
```