summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/htsencoding.c 86
1 files changed, 59 insertions, 27 deletions
diff --git a/src/htsencoding.c b/src/htsencoding.c
index 08f16d7..e2a8fb2 100644
--- a/src/htsencoding.c
+++ b/src/htsencoding.c
@@ -205,66 +205,98 @@ int hts_unescapeEntities(const char *src, char *dest, const size_t max) {
}
int hts_unescapeUrl(const char *src, char *dest, const size_t max) {
- size_t i, j, lastJ, k, utfBufferJ;
+ size_t i, j, lastI, lastJ, k, utfBufferJ, utfBufferSize;
char utfBuffer[32];
assert(src != dest);
assert(max != 0);
- for(i = 0, j = 0, k = 0, utfBufferJ = 0, lastJ = (size_t) -1
+ for(i = 0, j = 0, k = 0, utfBufferJ = 0, utfBufferSize = 0,
+ lastI = (size_t) -1, lastJ = (size_t) -1
; src[i] != '\0' ; i++) {
char c = src[i];
+ unsigned char cUtf = (unsigned char) c;
/* Replacement for ' ' */
if (c == '+') {
- c = ' ';
+ c = cUtf = ' ';
+ k = 0; /* cancel any sequence */
}
/* Escape sequence start */
else if (c == '%') {
/* last known position of % written on destination
copy blindly c, we'll rollback later */
+ lastI = i;
lastJ = j;
}
/* End of sequence seen */
- else if (i >= 2 && i == lastJ + 2) {
- const int a1 = get_hex_value(src[i - 1]);
- const int a2 = get_hex_value(src[i - 0]);
+ else if (i >= 2 && i == lastI + 2) {
+ const int a1 = get_hex_value(src[lastI + 1]);
+ const int a2 = get_hex_value(src[lastI + 2]);
if (a1 != -1 && a2 != -1) {
const char ec = a1*16 + a2; /* new character */
+ cUtf = (unsigned char) ec;
- /* New leading character ? Flush UTF-8 buffer now. */
- if (k != 0 && HTS_IS_LEADING_UTF8(ec)) {
- const size_t utfBufferSize = k;
+ /* Shortcut for ASCII (do not unescape non-printable) */
+ if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) {
+ /* Rollback new write position and character */
+ j = lastJ;
+ c = ec;
+ }
+ } else {
+ k = 0; /* cancel any sequence */
+ }
+ }
+ /* ASCII (and not in %xx) */
+ else if (cUtf < 0x80 && i != lastI + 1) {
+ k = 0; /* cancel any sequence */
+ }
+
+ /* UTF-8 sequence in progress (either a raw or a %xx character) */
+ if (cUtf >= 0x80) {
+ /* Leading UTF ? */
+ if (HTS_IS_LEADING_UTF8(cUtf)) {
+ k = 0; /* cancel any sequence */
+ }
+
+ /* Copy */
+ if (k < sizeof(utfBuffer)) {
+ /* First character */
+ if (k == 0) {
+ /* New destination-centric offset of utf-8 buffer beginning */
+ if (i == lastI + 2) { /* just read a %xx */
+ utfBufferJ = lastJ; /* position of % */
+ } else {
+ utfBufferJ = j; /* current position otherwise */
+ }
+
+ /* Sequence length */
+ utfBufferSize = hts_getUTF8SequenceLength(cUtf);
+ }
+
+ /* Copy */
+ utfBuffer[k++] = cUtf;
+
+ /* Flush UTF-8 buffer when completed. */
+ if (k == utfBufferSize) {
const size_t nRead = hts_readUTF8(utfBuffer, utfBufferSize, NULL);
- const size_t destPos = utfBufferJ;
-
+
/* Reset UTF-8 buffer in all cases. */
k = 0;
- /* New destination-centric offset of utf-8 buffer beginning */
- utfBufferJ = lastJ;
/* Was the character read successfully ? */
if (nRead == utfBufferSize) {
- /* Rollback and copy */
- j = destPos;
+ /* Rollback write position to sequence start write position */
+ j = utfBufferJ;
+
+ /* Copy full character sequence */
memcpy(&dest[j], utfBuffer, utfBufferSize);
j += utfBufferSize;
+
/* Skip current character */
continue;
}
}
-
- /* Shortcut for ASCII (do not unescape non-printable) */
- if ((unsigned char) ec < 0x80 && (unsigned char) ec >= 32) {
- assert(k == 0);
- /* Rollback new write position and character */
- j = lastJ;
- c = ec;
- }
- /* Copy if no overflow */
- else if (k < sizeof(utfBuffer)) {
- utfBuffer[k++] = ec;
- }
}
}