summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andreu Botella <abb@randomunok.com> 2021-06-01 11:24:16 +0200
committer GitHub <noreply@github.com> 2021-06-01 11:24:16 +0200
commit e466a6fc9ade08c8ca17322d47017709eecc4444 (patch)
tree e8303c17e404e6fb55f311e789eda3c583b61671
parent 6dd7a7ecd9d9b09f5112c73c67aa5f94480c8196 (diff)
fix: Support the stream option to TextDecoder#decode (#10805)
-rw-r--r-- extensions/web/08_text_encoding.js 104
-rw-r--r-- extensions/web/lib.deno_web.d.ts 2
-rw-r--r-- tools/wpt/expectation.json 6
3 files changed, 75 insertions, 37 deletions
diff --git a/extensions/web/08_text_encoding.js b/extensions/web/08_text_encoding.js
index 2804ff6a0..6290ee982 100644
--- a/extensions/web/08_text_encoding.js
+++ b/extensions/web/08_text_encoding.js
@@ -222,12 +222,18 @@
return result;
}
- function Big5Decoder(big5, bytes, fatal = false, ignoreBOM = false) {
+ function Big5Decoder(
+ big5,
+ bytes,
+ fatal = false,
+ ignoreBOM = false,
+ stream = false,
+ lead = 0x00,
+ ) {
if (ignoreBOM) {
throw new TypeError("Ignoring the BOM is available only with utf-8.");
}
const res = [];
- let lead = 0x00;
for (let i = 0; i < bytes.length; i++) {
const byte = bytes[i];
if (lead !== 0x00) {
@@ -276,11 +282,11 @@
res.push(decoderError(fatal));
continue;
}
- if (lead !== 0x00) {
+ if (!stream && lead !== 0x00) {
lead = 0x00;
res.push(decoderError(fatal));
}
- return res;
+ return [res, lead];
}
function Utf16ByteDecoder(
@@ -288,9 +294,9 @@
be = false,
fatal = false,
ignoreBOM = false,
+ stream = false,
+ { leadByte = null, leadSurrogate = null } = {},
) {
- let leadByte = null;
- let leadSurrogate = null;
const result = [];
for (let i = 0; i < bytes.length; i++) {
@@ -327,10 +333,10 @@
}
result.push(codeUnit);
}
- if (!(leadByte === null && leadSurrogate === null)) {
+ if (!stream && !(leadByte === null && leadSurrogate === null)) {
result.push(decoderError(fatal));
}
- return result;
+ return [result, { leadByte, leadSurrogate }];
}
const gb18030Ranges = {
@@ -587,14 +593,13 @@
bytes,
fatal = false,
ignoreBOM = false,
+ stream = false,
+ { first = 0x00, second = 0x00, third = 0x00 } = {},
) {
if (ignoreBOM) {
throw new TypeError("Ignoring the BOM is available only with utf-8.");
}
const result = [];
- let first = 0x00;
- let second = 0x00;
- let third = 0x00;
for (let i = 0; i < bytes.length; i++) {
const byte = bytes[i];
if (third !== 0x00) {
@@ -667,10 +672,10 @@
}
result.push(decoderError(fatal));
}
- if (!(first === 0x00 && second === 0x00 && third === 0x00)) {
+ if (!stream && !(first === 0x00 && second === 0x00 && third === 0x00)) {
result.push(decoderError(fatal));
}
- return result;
+ return [result, { first, second, third }];
}
class SingleByteDecoder {
@@ -4153,6 +4158,7 @@
class TextDecoder {
#encoding = "";
+ #state;
get encoding() {
return this.#encoding;
@@ -4186,9 +4192,11 @@
}
decode(input, options = { stream: false }) {
- if (options.stream) {
- throw new TypeError("Stream not supported.");
- }
+ const stream = Boolean(options.stream);
+
+ // If we're decoding anything other than the first chunk of a stream,
+ // we will not ignore a BOM.
+ const ignoreBOM = this.ignoreBOM && this.#state === undefined;
let bytes;
if (input instanceof Uint8Array) {
@@ -4216,7 +4224,9 @@
if (
this.#encoding === "utf-8" &&
this.fatal === false &&
- this.ignoreBOM === false
+ ignoreBOM === false &&
+ stream === false &&
+ this.#state === undefined
) {
return core.decode(bytes);
}
@@ -4224,42 +4234,59 @@
// For performance reasons we utilise a highly optimised decoder instead of
// the general decoder.
if (this.#encoding === "utf-8") {
- return decodeUtf8(bytes, this.fatal, this.ignoreBOM);
+ const [result, state] = decodeUtf8(
+ bytes,
+ this.fatal,
+ ignoreBOM,
+ stream,
+ this.#state,
+ );
+ this.#state = stream ? state : undefined;
+ return result;
}
if (this.#encoding === "utf-16le" || this.#encoding === "utf-16be") {
- const result = Utf16ByteDecoder(
+ const [result, state] = Utf16ByteDecoder(
bytes,
this.#encoding.endsWith("be"),
this.fatal,
- this.ignoreBOM,
+ ignoreBOM,
+ stream,
+ this.#state,
);
+ this.#state = stream ? state : undefined;
return String.fromCharCode.apply(null, result);
}
if (this.#encoding === "big5") {
- const result = Big5Decoder(
+ const [result, state] = Big5Decoder(
encodingIndexes.get("big5"),
bytes,
this.fatal,
- this.ignoreBOM,
+ ignoreBOM,
+ stream,
+ this.#state,
);
+ this.#state = stream ? state : undefined;
return String.fromCharCode.apply(null, result);
}
if (this.#encoding === "gbk" || this.#encoding === "gb18030") {
- const result = gb18030Decoder(
+ const [result, state] = gb18030Decoder(
encodingIndexes.get("gb18030"),
bytes,
this.fatal,
- this.ignoreBOM,
+ ignoreBOM,
+ stream,
+ this.#state,
);
+ this.#state = stream ? state : undefined;
return String.fromCodePoint.apply(null, result);
}
const decoder = decoders.get(this.#encoding)({
fatal: this.fatal,
- ignoreBOM: this.ignoreBOM,
+ ignoreBOM,
});
const inputStream = new Stream(bytes);
const output = [];
@@ -4333,17 +4360,27 @@
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
- function decodeUtf8(input, fatal, ignoreBOM) {
+ function decodeUtf8(
+ input,
+ fatal,
+ ignoreBOM,
+ stream,
+ { state = 0, codepoint = 0 } = {},
+ ) {
let outString = "";
// Prepare a buffer so that we don't have to do a lot of string concats, which
// are very slow.
- const outBufferLength = Math.min(1024, input.length);
+ // When decoding non-streaming UTF-8, the maximum output string length is
+ // input.length, but if state !== 0, there might be one additional code
+ // point.
+ const outBufferLength = Math.min(
+ 1024,
+ input.length + (state === 0 ? 0 : 2),
+ );
const outBuffer = new Uint16Array(outBufferLength);
let outIndex = 0;
- let state = 0;
- let codepoint = 0;
let type;
let i =
@@ -4416,9 +4453,10 @@
}
}
- // Add a replacement character if we ended in the middle of a sequence or
- // encountered an invalid code at the end.
- if (state !== 0) {
+ // Add a replacement character if we ended in the middle of a sequence and
+ // we aren't in streaming mode, or if we encountered an invalid code at the
+ // end.
+ if (state === 12 || (!stream && state !== 0)) {
if (fatal) throw new TypeError(`Decoder error. Unexpected end of data.`);
outBuffer[outIndex++] = 0xfffd; // Replacement character
}
@@ -4429,7 +4467,7 @@
outBuffer.subarray(0, outIndex),
);
- return outString;
+ return [outString, { state, codepoint }];
}
// Following code is forked from https://github.com/beatgammit/base64-js
diff --git a/extensions/web/lib.deno_web.d.ts b/extensions/web/lib.deno_web.d.ts
index 0fce6ce6b..9ede84994 100644
--- a/extensions/web/lib.deno_web.d.ts
+++ b/extensions/web/lib.deno_web.d.ts
@@ -189,7 +189,7 @@ declare class TextDecoder {
options?: { fatal?: boolean; ignoreBOM?: boolean },
);
/** Returns the result of running encoding's decoder. */
- decode(input?: BufferSource, options?: { stream?: false }): string;
+ decode(input?: BufferSource, options?: { stream?: boolean }): string;
readonly [Symbol.toStringTag]: string;
}
diff --git a/tools/wpt/expectation.json b/tools/wpt/expectation.json
index e233dbdef..b7fbc13cf 100644
--- a/tools/wpt/expectation.json
+++ b/tools/wpt/expectation.json
@@ -91,6 +91,7 @@
"encode-utf8.any.html": false,
"readable-writable-properties.any.html": false
},
+ "textdecoder-arguments.any.html": true,
"textdecoder-byte-order-marks.any.html": true,
"textdecoder-copy.any.html": false,
"textdecoder-fatal-single-byte.any.html?1-1000": true,
@@ -132,7 +133,7 @@
"windows-949 => EUC-KR",
"x-user-defined => x-user-defined"
],
- "textdecoder-streaming.any.html": false,
+ "textdecoder-streaming.any.html": true,
"textdecoder-utf16-surrogates.any.html": true,
"textencoder-constructor-non-utf.any.html": [
"Encoding argument supported for decode: EUC-JP",
@@ -142,8 +143,7 @@
"Encoding argument supported for decode: x-user-defined"
],
"textencoder-utf16-surrogates.any.html": true,
- "unsupported-encodings.any.html": false,
- "textdecoder-arguments.any.html": false
+ "unsupported-encodings.any.html": false
},
"hr-time": {
"monotonic-clock.any.html": true,