perf(web): optimize single pass utf8 decoding (#16593)

- [x] Avoid copying buffers.

  https://encoding.spec.whatwg.org/#dom-textdecoder-decode

  > Implementations are strongly encouraged to use an implementation strategy that avoids this copy. When doing so they will have to make sure that changes to input do not affect future calls to [decode()](https://encoding.spec.whatwg.org/#dom-textdecoder-decode).

- [x] Special op to avoid string label deserialization and parsing. (Ideally we should map labels to integers in JS.)
- [x] Avoid webidl `Object.assign` when options is undefined.
author: Divy Srivastava <dj.srivastava23@gmail.com> 2022-11-11 06:37:18 -0800
committer: GitHub <noreply@github.com> 2022-11-11 20:07:18 +0530
commit: 38f0b41e7d16db24c1ba7c7cc7b536f4d7e169e9 (patch)
tree: c8688f12f5808bd78f03c9c3ad2d078e974aa185 /ext/web
parent: 5b9620df7ac655449abd2cce5292bd4669b1f211 (diff)
2 files changed, 69 insertions, 26 deletions
diff --git a/ext/web/08_text_encoding.js b/ext/web/08_text_encoding.js
index 4477d9b9e..bf4b33808 100644
--- a/ext/web/08_text_encoding.js
+++ b/ext/web/08_text_encoding.js
@@ -16,14 +16,14 @@
   const ops = core.ops;
   const webidl = window.__bootstrap.webidl;
   const {
-    ArrayBufferIsView,
-    ObjectPrototypeIsPrototypeOf,
     PromiseReject,
     PromiseResolve,
     StringPrototypeCharCodeAt,
     StringPrototypeSlice,
     TypedArrayPrototypeSubarray,
     Uint8Array,
+    ObjectPrototypeIsPrototypeOf,
+    ArrayBufferIsView,
     Uint32Array,
   } = window.__bootstrap.primordials;
 
@@ -34,6 +34,8 @@
     #fatal;
     /** @type {boolean} */
     #ignoreBOM;
+    /** @type {boolean} */
+    #utf8SinglePass;
 
     /** @type {number | null} */
     #rid = null;
@@ -56,6 +58,7 @@
       this.#encoding = encoding;
       this.#fatal = options.fatal;
       this.#ignoreBOM = options.ignoreBOM;
+      this.#utf8SinglePass = encoding === "utf-8" && !options.fatal;
       this[webidl.brand] = webidl.brand;
     }
 
@@ -81,7 +84,7 @@
      * @param {BufferSource} [input]
      * @param {TextDecodeOptions} options
      */
-    decode(input = new Uint8Array(), options = {}) {
+    decode(input = new Uint8Array(), options = undefined) {
       webidl.assertBranded(this, TextDecoderPrototype);
       const prefix = "Failed to execute 'decode' on 'TextDecoder'";
       if (input !== undefined) {
@@ -91,40 +94,46 @@
           allowShared: true,
         });
       }
-      options = webidl.converters.TextDecodeOptions(options, {
-        prefix,
-        context: "Argument 2",
-      });
+      let stream = false;
+      if (options !== undefined) {
+        options = webidl.converters.TextDecodeOptions(options, {
+          prefix,
+          context: "Argument 2",
+        });
+        stream = options.stream;
+      }
 
       try {
-        try {
-          if (ArrayBufferIsView(input)) {
-            input = new Uint8Array(
-              input.buffer,
-              input.byteOffset,
-              input.byteLength,
-            );
-          } else {
-            input = new Uint8Array(input);
-          }
-        } catch {
-          // If the buffer is detached, just create a new empty Uint8Array.
-          input = new Uint8Array();
-        }
+        // Note from spec: implementations are strongly encouraged to use an implementation strategy that avoids this copy.
+        // When doing so they will have to make sure that changes to input do not affect future calls to decode().
         if (
           ObjectPrototypeIsPrototypeOf(
             SharedArrayBuffer.prototype,
-            input.buffer,
+            input || input.buffer,
           )
         ) {
           // We clone the data into a non-shared ArrayBuffer so we can pass it
           // to Rust.
           // `input` is now a Uint8Array, and calling the TypedArray constructor
           // with a TypedArray argument copies the data.
-          input = new Uint8Array(input);
+          if (ArrayBufferIsView(input)) {
+            input = new Uint8Array(
+              input.buffer,
+              input.byteOffset,
+              input.byteLength,
+            );
+          } else {
+            input = new Uint8Array(input);
+          }
         }
 
-        if (!options.stream && this.#rid === null) {
+        // Fast path for single pass decoding.
+        if (!stream && this.#rid === null) {
+          // Fast path for utf8 single pass decoding.
+          if (this.#utf8SinglePass) {
+            return ops.op_encoding_decode_utf8(input, this.#ignoreBOM);
+          }
+
           return ops.op_encoding_decode_single(
             input,
             this.#encoding,
@@ -140,9 +149,9 @@
             this.#ignoreBOM,
           );
         }
-        return ops.op_encoding_decode(input, this.#rid, options.stream);
+        return ops.op_encoding_decode(input, this.#rid, stream);
       } finally {
-        if (!options.stream && this.#rid !== null) {
+        if (!stream && this.#rid !== null) {
           core.close(this.#rid);
           this.#rid = null;
         }
diff --git a/ext/web/lib.rs b/ext/web/lib.rs
index 588a3adfd..f799f02e7 100644
--- a/ext/web/lib.rs
+++ b/ext/web/lib.rs
@@ -91,6 +91,7 @@ pub fn init<P: TimersPermission + 'static>(
       op_base64_btoa::decl(),
       op_encoding_normalize_label::decl(),
       op_encoding_decode_single::decl(),
+      op_encoding_decode_utf8::decl(),
       op_encoding_new_decoder::decl(),
       op_encoding_decode::decl(),
       op_encoding_encode_into::decl(),
@@ -179,6 +180,39 @@ fn op_encoding_normalize_label(label: String) -> Result<String, AnyError> {
   Ok(encoding.name().to_lowercase())
 }
 
+/// Single-pass UTF-8 decoding: builds the V8 string directly from
+/// `zero_copy` via `v8::String::new_from_utf8`.
+///
+/// A leading UTF-8 byte order mark (`EF BB BF`) is skipped unless
+/// `ignore_bom` is set. Fails with
+/// `type_error("buffer exceeds maximum length")` when V8 returns `None`.
+#[op(v8)]
+fn op_encoding_decode_utf8<'a>(
+  scope: &mut v8::HandleScope<'a>,
+  zero_copy: &[u8],
+  ignore_bom: bool,
+) -> Result<serde_v8::Value<'a>, AnyError> {
+  let buf = if ignore_bom {
+    zero_copy
+  } else {
+    // Drop the BOM when present; otherwise decode the input untouched.
+    zero_copy.strip_prefix(b"\xef\xbb\xbf").unwrap_or(zero_copy)
+  };
+
+  // If `String::new_from_utf8()` returns `None`, this means that the
+  // length of the decoded string would be longer than what V8 can
+  // handle. In this case we fail with the `type_error` below.
+  //
+  // For more details see:
+  // - https://encoding.spec.whatwg.org/#dom-textdecoder-decode
+  // - https://github.com/denoland/deno/issues/6649
+  // - https://github.com/v8/v8/blob/d68fb4733e39525f9ff0a9222107c02c28096e2a/include/v8.h#L3277-L3278
+  match v8::String::new_from_utf8(scope, buf, v8::NewStringType::Normal) {
+    Some(text) => Ok(serde_v8::from_v8(scope, text.into())?),
+    None => Err(type_error("buffer exceeds maximum length")),
+  }
+}
+
 #[op]
 fn op_encoding_decode_single(
   data: &[u8],
author	Divy Srivastava <dj.srivastava23@gmail.com>	2022-11-11 06:37:18 -0800
committer	GitHub <noreply@github.com>	2022-11-11 20:07:18 +0530
commit	38f0b41e7d16db24c1ba7c7cc7b536f4d7e169e9 (patch)
tree	c8688f12f5808bd78f03c9c3ad2d078e974aa185 /ext/web
parent	5b9620df7ac655449abd2cce5292bd4669b1f211 (diff)