summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/htscharset.c33
-rw-r--r--src/htscharset.h7
2 files changed, 36 insertions, 4 deletions
diff --git a/src/htscharset.c b/src/htscharset.c
index 405a9aa..5dd58ad 100644
--- a/src/htscharset.c
+++ b/src/htscharset.c
@@ -730,7 +730,7 @@ static unsigned int nlz8(unsigned char x) {
int i_; \
/* loop should be unrolled by compiler */ \
for(i_ = 0 ; i_ < 7 - CLEARED ; i_++) { \
- const int c_ = READER; \
+ const int c_ = (READER); \
/* continuation byte 10xxxxxx */ \
if (c_ != -1 && ( c_ >> 6 ) == 0x2) { \
uc_ <<= 6; \
@@ -747,7 +747,7 @@ static unsigned int nlz8(unsigned char x) {
EMITTER is a macro function taking an int (-1 for error). */
#define READ_UNICODE(READER, EMITTER) do { \
const unsigned int f_ = \
- (unsigned int) READER; \
+ (unsigned int) (READER); \
/* 1..8 */ \
const unsigned int c_ = \
nlz8((unsigned char)~f_); \
@@ -1101,6 +1101,7 @@ char *hts_convertStringIDNAToUTF8(const char *s, size_t size) {
}
hts_UCS4* hts_convertUTF8StringToUCS4(const char *s, size_t size, size_t *nChars) {
+ const unsigned char *const data = (const unsigned char*) s;
size_t i;
hts_UCS4 *dest = NULL;
size_t capa = 0, destSize = 0;
@@ -1112,7 +1113,7 @@ hts_UCS4* hts_convertUTF8StringToUCS4(const char *s, size_t size, size_t *nChars
hts_UCS4 uc;
/* Reader: can read bytes up to j */
-#define RD ( i < size ? s[i++] : -1 )
+#define RD ( i < size ? data[i++] : -1 )
/* Writer: upon error, return FFFD (replacement character) */
#define WR(C) uc = (C) != -1 ? (hts_UCS4) (C) : (hts_UCS4) 0xfffd
@@ -1171,11 +1172,12 @@ size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size) {
}
size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc) {
+ const unsigned char *const data = (const unsigned char*) src;
size_t i = 0;
int uc = -1;
/* Reader: can read bytes up to j */
-#define RD ( i < size ? src[i++] : -1 )
+#define RD ( i < size ? data[i++] : -1 )
/* Writer: upon error, return FFFD (replacement character) */
#define WR(C) uc = (C)
@@ -1196,6 +1198,29 @@ size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc) {
return 0;
}
+/* Byte length of the UTF-8 sequence started by 'lead': 1 for ASCII,
+   2..6 for a multi-byte leader, 0 if 'lead' can not start a sequence. */
+size_t hts_getUTF8SequenceLength(const char lead) {
+  /* nlz8() (defined earlier in this file) is applied to the complemented
+     byte, as READ_UNICODE does -- presumably yielding the number of
+     leading 1-bits of 'lead'; confirm against nlz8(). */
+  const unsigned char inverted = (unsigned char) ~(unsigned char) lead;
+  const unsigned int count = nlz8(inverted);
+
+  if (count == 0) {
+    /* ASCII */
+    return 1;
+  }
+  if (count >= 2 && count <= 6) {
+    /* UTF-8 */
+    return count;
+  }
+
+  /* ERROR */
+  /* (count is 1 or above 6: not a valid leading byte) */
+  return 0;
+}
+
size_t hts_stringLengthUCS4(const hts_UCS4 *s) {
size_t i;
for(i = 0 ; s[i] != 0 ; i++) ;
diff --git a/src/htscharset.h b/src/htscharset.h
index 0453788..92f8b7c 100644
--- a/src/htscharset.h
+++ b/src/htscharset.h
@@ -143,6 +143,13 @@ extern size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size);
**/
extern size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc);
+/**
+ * Given the first byte of a UTF-8 sequence, get the total number of
+ * bytes in the sequence (1 for ASCII, 2 to 6 for a multi-byte sequence).
+ * Return 0 upon error (the byte is not a leading byte).
+ **/
+extern size_t hts_getUTF8SequenceLength(const char lead);
+
/** WIN32 specific functions. **/
#ifdef _WIN32
/**