summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Thiago Padilha <thiago@padilha.cc> 2021-05-08 18:31:40 -0300
committer: GitHub <noreply@github.com> 2021-05-08 23:31:40 +0200
commit: 18a684ab1c20914e13c27bc10e20bda6396ea38d (patch)
tree: 42389bca4d7ec23ac3e310bf4cb99478815b4a77
parent: a051a7f7bc8dab2a4360c146d08b549cbcf17b8d (diff)
fix: TextEncoder#encodeInto spec compliance + perf gains (#10129)
-rw-r--r-- cli/bench/main.rs 5
-rw-r--r-- cli/tests/text_encoder_into_perf.js 34
-rw-r--r-- cli/tests/unit/text_encoding_test.ts 56
-rw-r--r-- extensions/web/08_text_encoding.js 187
-rw-r--r-- tools/wpt/expectation.json 24
5 files changed, 215 insertions, 91 deletions
diff --git a/cli/bench/main.rs b/cli/bench/main.rs
index f2ade54d8..b98a9d141 100644
--- a/cli/bench/main.rs
+++ b/cli/bench/main.rs
@@ -90,6 +90,11 @@ const EXEC_TIME_BENCHMARKS: &[(&str, &[&str], Option<i32>)] = &[
None,
),
(
+ "text_encoder_into",
+ &["run", "cli/tests/text_encoder_into_perf.js"],
+ None,
+ ),
+ (
"check",
&[
"cache",
diff --git a/cli/tests/text_encoder_into_perf.js b/cli/tests/text_encoder_into_perf.js
new file mode 100644
index 000000000..8d60e9f00
--- /dev/null
+++ b/cli/tests/text_encoder_into_perf.js
@@ -0,0 +1,34 @@
+const mixed = "@Ā๐😀";
+
+function generateRandom(bytes) {
+ let result = "";
+ let i = 0;
+ while (i < bytes) {
+ const toAdd = Math.floor(Math.random() * Math.min(4, bytes - i));
+ switch (toAdd) {
+ case 0:
+ result += mixed[0];
+ i++;
+ break;
+ case 1:
+ result += mixed[1];
+ i++;
+ break;
+ case 2:
+ result += mixed[2];
+ i++;
+ break;
+ case 3:
+ result += mixed[3];
+ result += mixed[4];
+ i += 2;
+ break;
+ }
+ }
+ return result;
+}
+
+const randomData = generateRandom(1024);
+const encoder = new TextEncoder();
+const targetBuffer = new Uint8Array(randomData.length * 4);
+for (let i = 0; i < 10_000; i++) encoder.encodeInto(randomData, targetBuffer);
diff --git a/cli/tests/unit/text_encoding_test.ts b/cli/tests/unit/text_encoding_test.ts
index 7a15c9376..42c221cb2 100644
--- a/cli/tests/unit/text_encoding_test.ts
+++ b/cli/tests/unit/text_encoding_test.ts
@@ -157,6 +157,62 @@ unitTest(function textEncodeInto3(): void {
]);
});
+unitTest(function loneSurrogateEncodeInto(): void {
+ const fixture = "lone𝄞\ud888surrogate";
+ const encoder = new TextEncoder();
+ const bytes = new Uint8Array(20);
+ const result = encoder.encodeInto(fixture, bytes);
+ assertEquals(result.read, 16);
+ assertEquals(result.written, 20);
+ // deno-fmt-ignore
+ assertEquals(Array.from(bytes), [
+ 0x6c, 0x6f, 0x6e, 0x65,
+ 0xf0, 0x9d, 0x84, 0x9e,
+ 0xef, 0xbf, 0xbd, 0x73,
+ 0x75, 0x72, 0x72, 0x6f,
+ 0x67, 0x61, 0x74, 0x65
+ ]);
+});
+
+unitTest(function loneSurrogateEncodeInto2(): void {
+ const fixture = "\ud800";
+ const encoder = new TextEncoder();
+ const bytes = new Uint8Array(3);
+ const result = encoder.encodeInto(fixture, bytes);
+ assertEquals(result.read, 1);
+ assertEquals(result.written, 3);
+ // deno-fmt-ignore
+ assertEquals(Array.from(bytes), [
+ 0xef, 0xbf, 0xbd
+ ]);
+});
+
+unitTest(function loneSurrogateEncodeInto3(): void {
+ const fixture = "\udc00";
+ const encoder = new TextEncoder();
+ const bytes = new Uint8Array(3);
+ const result = encoder.encodeInto(fixture, bytes);
+ assertEquals(result.read, 1);
+ assertEquals(result.written, 3);
+ // deno-fmt-ignore
+ assertEquals(Array.from(bytes), [
+ 0xef, 0xbf, 0xbd
+ ]);
+});
+
+unitTest(function swappedSurrogatePairEncodeInto4(): void {
+ const fixture = "\udc00\ud800";
+ const encoder = new TextEncoder();
+ const bytes = new Uint8Array(8);
+ const result = encoder.encodeInto(fixture, bytes);
+ assertEquals(result.read, 2);
+ assertEquals(result.written, 6);
+ // deno-fmt-ignore
+ assertEquals(Array.from(bytes), [
+ 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, 0x00, 0x00
+ ]);
+});
+
unitTest(function textDecoderSharedUint8Array(): void {
const ab = new SharedArrayBuffer(6);
const dataView = new DataView(ab);
diff --git a/extensions/web/08_text_encoding.js b/extensions/web/08_text_encoding.js
index 7e7d4a573..c293633c3 100644
--- a/extensions/web/08_text_encoding.js
+++ b/extensions/web/08_text_encoding.js
@@ -48,51 +48,129 @@
return inRange(a, 0x00, 0x7f);
}
- function stringToCodePoints(input) {
- const u = [];
- for (const c of input) {
- u.push(c.codePointAt(0));
- }
- return u;
- }
-
- class UTF8Encoder {
- handler(codePoint) {
- if (codePoint === END_OF_STREAM) {
- return "finished";
+ // Minor Unicode reference for readers.
+ //
+ // Unicode code points are integers in the range 0x0 - 0x10ffff (using at
+ // most 21 bits). These integers are what rendering engines use to decide what
+ // glyphs are displayed on the screen. Since most code points need fewer than
+ // 21 bits, there are encodings that can represent code points more
+ // efficiently.
+ //
+ // UTF-16 is one such encoding, and is used by JavaScript engines to store
+ // strings internally. UTF-16 uses 1 or 2 16-bit integers (2 or 4 bytes) to
+ // represent a single code point.
+ //
+ // UTF-8 is another encoding, and uses 1, 2, 3 or 4 bytes to represent a
+ // single code point.
+ //
+ // The goal of the function below is to transform UTF-16 into UTF-8 without
+ // allocating any memory (writing to the buffer passed as parameter). The
+ // conversion loop is roughly divided into 3 steps:
+ //
+ // - Decode the UTF-16 code unit(s) into a Unicode code point.
+ // - Check if there's still enough space in the output buffer. If not, break
+ // out of the loop.
+ // - Encode the code point as UTF-8 into the output buffer.
+ //
+ // Some references to learn more about the topic:
+ // - https://dmitripavlutin.com/what-every-javascript-developer-should-know-about-unicode
+ // - https://en.wikipedia.org/wiki/UTF-8
+ // - https://en.wikipedia.org/wiki/UTF-16
+ function encodeUtf8(input, output, state) {
+ let { read, written } = state;
+ const inLen = input.length;
+ const outLen = output.length;
+ while (read < inLen) {
+ // Step 1: Decode the UTF-16 code unit(s) into a Unicode code point.
+ //
+ // There are three possibilities here:
+ // - The code unit is outside the surrogate range (0xd800 - 0xdfff) and is
+ // used directly as the code point.
+ // - The code unit is in the high surrogate range and the next one
+ // is in the low surrogate range. The surrogate pair is combined into
+ // the final code point.
+ // - The code unit is a lone surrogate (high or low), which is invalid in
+ // UTF-16. In this case it is replaced by 0xfffd (the replacement character, �).
+ const badCodePoint = 0xfffd;
+ const codeUnit = input.charCodeAt(read++);
+ const surrogateMask = codeUnit & 0xfc00;
+ let codePoint = codeUnit;
+ if (surrogateMask === 0xd800) {
+ // codeUnit is a high surrogate, check if there's a next character
+ if (read < inLen) {
+ // check if the next one is a low surrogate
+ const nextCodeUnit = input.charCodeAt(read);
+ if ((nextCodeUnit & 0xfc00) === 0xdc00) {
+ // low surrogate, advance input offset and compute code point
+ codePoint = 0x10000 +
+ ((codeUnit & 0x3ff) << 10) + (nextCodeUnit & 0x3ff);
+ read++;
+ } else {
+ // lone high surrogate
+ codePoint = badCodePoint;
+ }
+ } else {
+ // lone high surrogate
+ codePoint = badCodePoint;
+ }
+ } else if (surrogateMask === 0xdc00) {
+ // lone low surrogate
+ codePoint = badCodePoint;
}
- if (inRange(codePoint, 0x00, 0x7f)) {
- return [codePoint];
+ // Step 2: Check if there's available space to encode the code point as
+ // UTF-8. A code point takes at most 4 bytes, so the detailed per-range check
+ // is only needed when fewer than 4 bytes remain in the output buffer.
+ const availableSpace = outLen - written;
+ if (availableSpace < 4) {
+ // Possibly not enough space, make the final decision based on the code
+ // point range.
+ if (
+ availableSpace < 1 ||
+ (availableSpace < 2 && codePoint >= 0x80) ||
+ (availableSpace < 3 && codePoint >= 0x800) ||
+ codePoint >= 0x10000
+ ) {
+ // Not enough space. Rewind read offset and bail out
+ const isSurrogatePair = codePoint !== codeUnit &&
+ codePoint !== badCodePoint;
+ read -= isSurrogatePair ? 2 : 1;
+ break;
+ }
}
- let count;
- let offset;
- if (inRange(codePoint, 0x0080, 0x07ff)) {
- count = 1;
- offset = 0xc0;
- } else if (inRange(codePoint, 0x0800, 0xffff)) {
- count = 2;
- offset = 0xe0;
- } else if (inRange(codePoint, 0x10000, 0x10ffff)) {
- count = 3;
- offset = 0xf0;
+ // Step 3: Encode the code point as UTF-8 into the output buffer.
+ if (codePoint < 0x80) {
+ // 7 bits, encoded in 1 byte directly (0xxxxxxx).
+ output[written++] = codePoint;
+ } else if (codePoint < 0x800) {
+ // 11 bits, encode in 2 bytes where:
+ // byte 1: 110xxxxx (5 bits)
+ // byte 2: 10xxxxxx (6 bits)
+ output[written++] = 0xc0 | (0x1f & (codePoint >> 6));
+ output[written++] = 0x80 | (0x3f & (codePoint));
+ } else if (codePoint < 0x10000) {
+ // 16 bits, encode in 3 bytes where:
+ // byte 1: 1110xxxx (4 bits)
+ // byte 2: 10xxxxxx (6 bits)
+ // byte 3: 10xxxxxx (6 bits)
+ output[written++] = 0xe0 | (0x0f & (codePoint >> 12));
+ output[written++] = 0x80 | (0x3f & (codePoint >> 6));
+ output[written++] = 0x80 | (0x3f & (codePoint));
} else {
- throw TypeError(
- `Code point out of range: \\x${codePoint.toString(16)}`,
- );
+ // 21 bits, encode in 4 bytes where:
+ // byte 1: 11110xxx (3 bits)
+ // byte 2: 10xxxxxx (6 bits)
+ // byte 3: 10xxxxxx (6 bits)
+ // byte 4: 10xxxxxx (6 bits)
+ output[written++] = 0xf0 | (0x07 & (codePoint >> 18));
+ output[written++] = 0x80 | (0x3f & (codePoint >> 12));
+ output[written++] = 0x80 | (0x3f & (codePoint >> 6));
+ output[written++] = 0x80 | (0x3f & (codePoint));
}
-
- const bytes = [(codePoint >> (6 * count)) + offset];
-
- while (count > 0) {
- const temp = codePoint >> (6 * (count - 1));
- bytes.push(0x80 | (temp & 0x3f));
- count--;
- }
-
- return bytes;
}
+ state.read = read;
+ state.written = written;
}
function atob(s) {
@@ -4221,37 +4299,12 @@
"2nd argument to TextEncoder.encodeInto must be Uint8Array",
);
}
+ const state = { read: 0, written: 0 };
if (dest.byteLength === 0) {
- return { read: 0, written: 0 };
+ return state;
}
- const encoder = new UTF8Encoder();
- const inputStream = new Stream(stringToCodePoints(input));
-
- let written = 0;
- let read = 0;
- while (true) {
- const item = inputStream.read();
- const result = encoder.handler(item);
- if (result === "finished") {
- break;
- }
- if (dest.length - written >= result.length) {
- read++;
- if (item > 0xFFFF) {
- // increment read a second time if greater than U+FFFF
- read++;
- }
- dest.set(result, written);
- written += result.length;
- } else {
- break;
- }
- }
-
- return {
- read,
- written,
- };
+ encodeUtf8(input, dest, state);
+ return state;
}
get [Symbol.toStringTag]() {
return "TextEncoder";
diff --git a/tools/wpt/expectation.json b/tools/wpt/expectation.json
index 407e301bb..6eec573cb 100644
--- a/tools/wpt/expectation.json
+++ b/tools/wpt/expectation.json
@@ -39,30 +39,6 @@
"api-replacement-encodings.any.js": true,
"api-surrogates-utf8.any.js": true,
"encodeInto.any.js": [
- "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0",
- "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 0",
- "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0",
- "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 0",
- "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128",
- "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler 128",
- "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128",
- "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler 128",
- "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random",
- "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 0, filler random",
- "encodeInto() into ArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random",
- "encodeInto() into SharedArrayBuffer with \ud834A\udf06A¥Hi and destination length 10, offset 4, filler random",
- "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0",
- "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 0",
- "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0",
- "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 0",
- "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128",
- "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler 128",
- "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128",
- "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler 128",
- "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 0, filler random",
- "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 0, filler random",
- "encodeInto() into ArrayBuffer with A\udf06 and destination length 4, offset 4, filler random",
- "encodeInto() into SharedArrayBuffer with A\udf06 and destination length 4, offset 4, filler random",
"encodeInto() and a detached output buffer"
],
"idlharness.any.js": [