2 files changed, 39 insertions, 5 deletions
diff --git a/src/htscharset.c b/src/htscharset.c
index 932c01e..405a9aa 100644
--- a/src/htscharset.c
+++ b/src/htscharset.c
@@ -540,7 +540,7 @@ size_t hts_stringLengthUTF8(const char *s) {
   for(i = 0, len = 0; bytes[i] != '\0'; i++) {
     const unsigned char c = bytes[i];
 
-    if (HTS_IS_LEADING_UTF8(c)) {       // ASCII or leading byte
+    if (HTS_IS_LEADING_UTF8(c)) {       /* ASCII or leading byte */
       len++;
     }
   }
@@ -578,7 +578,7 @@ int hts_isCharsetUTF8(const char *charset) {
 char *hts_getCharsetFromMeta(const char *html, size_t size) {
   int i;
 
-  // <META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf-8" >
+  /* <META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf-8" > */
   for(i = 0; i < size; i++) {
     if (html[i] == '<' && strncasecmp(&html[i + 1], "meta", 4) == 0
         && is_space(html[i + 5])) {
@@ -1170,6 +1170,32 @@ size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size) {
   return offs; 
 }
 
+size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc) {
+  size_t i = 0;
+  int uc = -1;
+
+  /* Reader: next byte as 0..255 (char may be signed), or -1 at end of 'src' */
+#define RD ( i < size ? (int) (unsigned char) src[i++] : -1 )
+
+  /* Writer: record the decoded character in 'uc' (-1: nothing was written) */
+#define WR(C) uc = (C)
+
+  /* Read Unicode character. */
+  READ_UNICODE(RD, WR);
+#undef RD
+#undef WR
+
+  /* Return the number of bytes consumed, or 0 if no character was decoded */
+  if (uc != -1) {
+    if (puc != NULL) {
+      *puc = (hts_UCS4) uc;
+    }
+    return i;
+  }
+
+  return 0;
+}
+
 size_t hts_stringLengthUCS4(const hts_UCS4 *s) {
   size_t i;
   for(i = 0 ; s[i] != 0 ; i++) ;
diff --git a/src/htscharset.h b/src/htscharset.h
index c22978e..0453788 100644
--- a/src/htscharset.h
+++ b/src/htscharset.h
@@ -135,25 +135,33 @@ extern size_t hts_stringLengthUCS4(const hts_UCS4 *s);
  **/
 extern size_t hts_writeUTF8(hts_UCS4 uc, char *dest, size_t size);
 
-/* WIN32 specific. */
+/**
+ * Read the next Unicode character within 'src' of size 'size' and, upon
+ * successful reading, return the number of bytes read and place the
+ * character in 'puc'.
+ * Return 0 upon error.
+ **/
+extern size_t hts_readUTF8(const char *src, size_t size, hts_UCS4 *puc);
 
+/** WIN32 specific functions. **/
 #ifdef _WIN32
-
 /**
  * Convert UTF-8 to WCHAR.
+ * This function is WIN32 specific.
  **/
 extern LPWSTR hts_convertUTF8StringToUCS2(const char *s, int size, int *pwsize);
 
 /**
  * Convert from WCHAR.
+ * This function is WIN32 specific.
  **/
 extern char *hts_convertUCS2StringToUTF8(LPWSTR woutput, int wsize);
 
 /**
  * Convert current system codepage to UTF-8.
+ * This function is WIN32 specific.
  **/
 extern char *hts_convertStringSystemToUTF8(const char *s, size_t size);
-
 #endif
 
 #endif