feat(ext/node): buffer.transcode() (#25972)

Closes https://github.com/denoland/deno/issues/25911
author: Satya Rohith <me@satyarohith.com> 2024-10-02 13:53:14 +0530
committer: GitHub <noreply@github.com> 2024-10-02 08:23:14 +0000
commit: 32c12787361b65bbc55a7b9c1fe43689cb0a8b98 (patch)
tree: 047f47b5146b9192d74bd8ef1d9af9d183b7aad1
parent: 620e6b43a66c2af44ae4aea62417af408309f61c (diff)
7 files changed, 255 insertions, 3 deletions
diff --git a/ext/node/lib.rs b/ext/node/lib.rs
index 0c821ecf8..d23c07204 100644
--- a/ext/node/lib.rs
+++ b/ext/node/lib.rs
@@ -167,6 +167,7 @@ deno_core::extension!(deno_node,
 
     ops::buffer::op_is_ascii,
     ops::buffer::op_is_utf8,
+    ops::buffer::op_transcode,
     ops::crypto::op_node_check_prime_async,
     ops::crypto::op_node_check_prime_bytes_async,
     ops::crypto::op_node_check_prime_bytes,
diff --git a/ext/node/ops/buffer.rs b/ext/node/ops/buffer.rs
index 74a011ab8..01f878ec1 100644
--- a/ext/node/ops/buffer.rs
+++ b/ext/node/ops/buffer.rs
@@ -1,5 +1,7 @@
 // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
 
+use deno_core::anyhow::anyhow;
+use deno_core::anyhow::Result;
 use deno_core::op2;
 
 #[op2(fast)]
@@ -11,3 +13,107 @@ pub fn op_is_ascii(#[buffer] buf: &[u8]) -> bool {
 pub fn op_is_utf8(#[buffer] buf: &[u8]) -> bool {
   std::str::from_utf8(buf).is_ok()
 }
+
+/// Dispatches a `buffer.transcode()` request to the matching converter.
+/// Unsupported pairs fail with "Unable to transcode Buffer {from}->{to}".
+#[op2]
+#[buffer]
+pub fn op_transcode(
+  #[buffer] source: &[u8],
+  #[string] from_encoding: &str,
+  #[string] to_encoding: &str,
+) -> Result<Vec<u8>> {
+  match (from_encoding, to_encoding) {
+    ("utf8", "ascii") => Ok(utf8_to_ascii(source)),
+    ("utf8", "latin1") => Ok(utf8_to_latin1(source)),
+    ("utf8", "utf16le") => utf8_to_utf16le(source),
+    ("utf16le", "utf8") => utf16le_to_utf8(source),
+    ("latin1" | "ascii", "utf16le") => Ok(latin1_ascii_to_utf16le(source)),
+    (from, to) => Err(anyhow!("Unable to transcode Buffer {from}->{to}")),
+  }
+}
+
+/// Widens each Latin-1/ASCII byte into one UTF-16LE code unit.
+/// Code points 0x00..=0xFF map 1:1 onto UTF-16, so the high byte is always 0.
+fn latin1_ascii_to_utf16le(source: &[u8]) -> Vec<u8> {
+  let mut widened = Vec::with_capacity(source.len() * 2);
+  // Low byte first (little-endian), then the zero high byte.
+  widened.extend(source.iter().flat_map(|&unit| [unit, 0]));
+  widened
+}
+
+/// Decodes UTF-16LE `source` into UTF-8; fails on invalid UTF-16.
+fn utf16le_to_utf8(source: &[u8]) -> Result<Vec<u8>> {
+  // `chunks_exact` skips a dangling odd byte; `chunks` panicked on `chunk[1]`.
+  let pairs = source.chunks_exact(2);
+  let units: Vec<_> = pairs.map(|p| u16::from_le_bytes([p[0], p[1]])).collect();
+  String::from_utf16(&units)
+    .map(String::into_bytes)
+    .map_err(|e| anyhow!("Invalid UTF-16 sequence: {}", e))
+}
+
+/// Re-encodes UTF-8 bytes as UTF-16LE; errors if `source` is not valid UTF-8.
+fn utf8_to_utf16le(source: &[u8]) -> Result<Vec<u8>> {
+  let text = std::str::from_utf8(source)?;
+  // Emit every UTF-16 code unit low byte first.
+  Ok(text.encode_utf16().flat_map(u16::to_le_bytes).collect())
+}
+
+/// Converts UTF-8 bytes to Latin-1; characters above U+00FF and malformed
+/// sequences are each replaced by a single `?`.
+fn utf8_to_latin1(source: &[u8]) -> Vec<u8> {
+  let is_continuation = |byte: u8| (byte & 0xC0) == 0x80;
+  let mut out = Vec::with_capacity(source.len());
+  let mut i = 0;
+  while i < source.len() {
+    match (source[i], source.get(i + 1).copied()) {
+      (byte @ 0x00..=0x7F, _) => {
+        // ASCII character
+        out.push(byte);
+        i += 1;
+      }
+      // 2-byte sequence. Requiring a real continuation byte keeps a stray
+      // lead byte from swallowing the character that follows it.
+      (lead @ 0xC2..=0xDF, Some(next)) if is_continuation(next) => {
+        let codepoint = ((lead as u16 & 0x1F) << 6) | (next as u16 & 0x3F);
+        // Only leads 0xC2/0xC3 decode into the Latin-1 range.
+        out.push(if codepoint <= 0xFF { codepoint as u8 } else { b'?' });
+        i += 2;
+      }
+      _ => {
+        // 3-byte or 4-byte UTF-8 sequence, or invalid UTF-8
+        out.push(b'?');
+        // Skip to the next valid UTF-8 start byte
+        i += 1;
+        while i < source.len() && is_continuation(source[i]) {
+          i += 1;
+        }
+      }
+    }
+  }
+  out
+}
+
+/// Converts UTF-8 bytes to ASCII, emitting one `?` per non-ASCII character.
+///
+/// Continuation bytes (`0b10xx_xxxx`) that follow a replaced byte are dropped,
+/// so a multi-byte sequence collapses to a single `?`.
+fn utf8_to_ascii(source: &[u8]) -> Vec<u8> {
+  let mut out = Vec::with_capacity(source.len());
+  // Set once a `?` has been emitted for the sequence currently being skipped.
+  let mut replaced = false;
+  for &byte in source {
+    if byte.is_ascii() {
+      out.push(byte);
+      replaced = false;
+    } else if replaced && (byte & 0xC0) == 0x80 {
+      // Continuation byte of a sequence that already produced its `?`.
+      continue;
+    } else {
+      // Lead byte of a non-ASCII sequence, or a stray invalid byte.
+      out.push(b'?');
+      replaced = true;
+    }
+  }
+  out
+}
diff --git a/ext/node/polyfills/buffer.ts b/ext/node/polyfills/buffer.ts
index 8986cf53d..efe3b07a9 100644
--- a/ext/node/polyfills/buffer.ts
+++ b/ext/node/polyfills/buffer.ts
@@ -13,4 +13,5 @@ export {
   kMaxLength,
   kStringMaxLength,
   SlowBuffer,
+  transcode,
 } from "ext:deno_node/internal/buffer.mjs";
diff --git a/ext/node/polyfills/internal/buffer.mjs b/ext/node/polyfills/internal/buffer.mjs
index 6e43a4903..6687f7394 100644
--- a/ext/node/polyfills/internal/buffer.mjs
+++ b/ext/node/polyfills/internal/buffer.mjs
@@ -6,7 +6,7 @@
 // deno-lint-ignore-file prefer-primordials
 
 import { core } from "ext:core/mod.js";
-import { op_is_ascii, op_is_utf8 } from "ext:core/ops";
+import { op_is_ascii, op_is_utf8, op_transcode } from "ext:core/ops";
 
 import { TextDecoder, TextEncoder } from "ext:deno_web/08_text_encoding.js";
 import { codes } from "ext:deno_node/internal/error_codes.ts";
@@ -32,7 +32,11 @@ import {
 import { normalizeEncoding } from "ext:deno_node/internal/util.mjs";
 import { validateBuffer } from "ext:deno_node/internal/validators.mjs";
 import { isUint8Array } from "ext:deno_node/internal/util/types.ts";
-import { ERR_INVALID_STATE, NodeError } from "ext:deno_node/internal/errors.ts";
+import {
+  ERR_INVALID_STATE,
+  genericNodeError,
+  NodeError,
+} from "ext:deno_node/internal/errors.ts";
 import {
   forgivingBase64Encode,
   forgivingBase64UrlEncode,
@@ -2598,6 +2602,48 @@ export function isAscii(input) {
   ], input);
 }
 
+/**
+ * Re-encodes `source` from `fromEnco` to `toEnco` and returns a Buffer.
+ * Throws ERR_INVALID_ARG_TYPE when `source` is not a Uint8Array, and a
+ * U_ILLEGAL_ARGUMENT_ERROR for unknown or unsupported encodings.
+ */
+export function transcode(source, fromEnco, toEnco) {
+  if (!isUint8Array(source)) {
+    const expected = ["Buffer", "Uint8Array"];
+    throw new codes.ERR_INVALID_ARG_TYPE("source", expected, source);
+  }
+  if (source.length === 0) {
+    return Buffer.alloc(0);
+  }
+  const code = "U_ILLEGAL_ARGUMENT_ERROR";
+  const illegalArgumentError = genericNodeError(
+    `Unable to transcode Buffer [${code}]`,
+    { code, errno: 1 },
+  );
+  fromEnco = normalizeEncoding(fromEnco);
+  toEnco = normalizeEncoding(toEnco);
+  if (!fromEnco || !toEnco) {
+    throw illegalArgumentError;
+  }
+  // These pairs need no conversion (same encoding, or ASCII into one of its
+  // supersets), so the input bytes are returned unconverted.
+  const asciiSuperset = toEnco === "utf8" || toEnco === "latin1";
+  if (fromEnco === toEnco || (fromEnco === "ascii" && asciiSuperset)) {
+    return Buffer.from(source);
+  }
+
+  try {
+    const result = op_transcode(new Uint8Array(source), fromEnco, toEnco);
+    return Buffer.from(result, toEnco);
+  } catch (err) {
+    // Map the op's "unsupported pair" failure onto the ICU-style error.
+    if (err.message.includes("Unable to transcode Buffer")) {
+      throw illegalArgumentError;
+    }
+    throw err;
+  }
+}
+
 export default {
   atob,
   btoa,
@@ -2610,4 +2656,5 @@ export default {
   kMaxLength,
   kStringMaxLength,
   SlowBuffer,
+  transcode,
 };
diff --git a/tests/node_compat/config.jsonc b/tests/node_compat/config.jsonc
index 2f94fa2f2..bc9bf476b 100644
--- a/tests/node_compat/config.jsonc
+++ b/tests/node_compat/config.jsonc
@@ -406,6 +406,7 @@
       "test-http-outgoing-settimeout.js",
       "test-http-url.parse-https.request.js",
       "test-http-url.parse-only-support-http-https-protocol.js",
+      "test-icu-transcode.js",
       "test-net-access-byteswritten.js",
       "test-net-better-error-messages-listen-path.js",
       "test-net-better-error-messages-path.js",
diff --git a/tests/node_compat/runner/TODO.md b/tests/node_compat/runner/TODO.md
index 24e827182..99258f5a5 100644
--- a/tests/node_compat/runner/TODO.md
+++ b/tests/node_compat/runner/TODO.md
@@ -1632,7 +1632,6 @@ NOTE: This file should not be manually edited. Please edit `tests/node_compat/co
 - [parallel/test-icu-minimum-version.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-minimum-version.js)
 - [parallel/test-icu-punycode.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-punycode.js)
 - [parallel/test-icu-stringwidth.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-stringwidth.js)
-- [parallel/test-icu-transcode.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-transcode.js)
 - [parallel/test-inspect-address-in-use.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-address-in-use.js)
 - [parallel/test-inspect-async-hook-setup-at-inspect.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-async-hook-setup-at-inspect.js)
 - [parallel/test-inspect-publish-uid.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-publish-uid.js)
diff --git a/tests/node_compat/test/parallel/test-icu-transcode.js b/tests/node_compat/test/parallel/test-icu-transcode.js
new file mode 100644
index 000000000..1f5aeb535
--- /dev/null
+++ b/tests/node_compat/test/parallel/test-icu-transcode.js
@@ -0,0 +1,97 @@
+// deno-fmt-ignore-file
+// deno-lint-ignore-file
+
+// Copyright Joyent and Node contributors. All rights reserved. MIT license.
+// Taken from Node 18.12.1
+// This file is automatically generated by `tests/node_compat/runner/setup.ts`. Do not modify this file manually.
+
+'use strict';
+
+const common = require('../common');
+
+if (!common.hasIntl)
+  common.skip('missing Intl');
+
+const buffer = require('buffer');
+const assert = require('assert');
+const orig = Buffer.from('těst ☕', 'utf8');
+
+// Test Transcoding
+const tests = {
+  'latin1': [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f],
+  'ascii': [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f],
+  'ucs2': [0x74, 0x00, 0x1b, 0x01, 0x73,
+           0x00, 0x74, 0x00, 0x20, 0x00,
+           0x15, 0x26]
+};
+
+for (const test in tests) {
+  const dest = buffer.transcode(orig, 'utf8', test);
+  assert.strictEqual(dest.length, tests[test].length, `utf8->${test} length`);
+  for (let n = 0; n < tests[test].length; n++)
+    assert.strictEqual(dest[n], tests[test][n], `utf8->${test} char ${n}`);
+}
+
+{
+  const dest = buffer.transcode(Buffer.from(tests.ucs2), 'ucs2', 'utf8');
+  assert.strictEqual(dest.toString(), orig.toString());
+}
+
+{
+  const utf8 = Buffer.from('€'.repeat(4000), 'utf8');
+  const ucs2 = Buffer.from('€'.repeat(4000), 'ucs2');
+  const utf8_to_ucs2 = buffer.transcode(utf8, 'utf8', 'ucs2');
+  const ucs2_to_utf8 = buffer.transcode(ucs2, 'ucs2', 'utf8');
+  assert.deepStrictEqual(utf8, ucs2_to_utf8);
+  assert.deepStrictEqual(ucs2, utf8_to_ucs2);
+  assert.strictEqual(ucs2_to_utf8.toString('utf8'),
+                     utf8_to_ucs2.toString('ucs2'));
+}
+
+assert.throws(
+  () => buffer.transcode(null, 'utf8', 'ascii'),
+  {
+    name: 'TypeError',
+    code: 'ERR_INVALID_ARG_TYPE',
+    message: 'The "source" argument must be an instance of Buffer ' +
+             'or Uint8Array. Received null'
+  }
+);
+
+assert.throws(
+  () => buffer.transcode(Buffer.from('a'), 'b', 'utf8'),
+  /^Error: Unable to transcode Buffer \[U_ILLEGAL_ARGUMENT_ERROR\]/
+);
+
+assert.throws(
+  () => buffer.transcode(Buffer.from('a'), 'uf8', 'b'),
+  /^Error: Unable to transcode Buffer \[U_ILLEGAL_ARGUMENT_ERROR\]$/
+);
+
+assert.deepStrictEqual(
+  buffer.transcode(Buffer.from('hi', 'ascii'), 'ascii', 'utf16le'),
+  Buffer.from('hi', 'utf16le'));
+assert.deepStrictEqual(
+  buffer.transcode(Buffer.from('hi', 'latin1'), 'latin1', 'utf16le'),
+  Buffer.from('hi', 'utf16le'));
+assert.deepStrictEqual(
+  buffer.transcode(Buffer.from('hä', 'latin1'), 'latin1', 'utf16le'),
+  Buffer.from('hä', 'utf16le'));
+
+// Test that Uint8Array arguments are okay.
+{
+  const uint8array = new Uint8Array([...Buffer.from('hä', 'latin1')]);
+  assert.deepStrictEqual(
+    buffer.transcode(uint8array, 'latin1', 'utf16le'),
+    Buffer.from('hä', 'utf16le'));
+}
+
+{
+  const dest = buffer.transcode(new Uint8Array(), 'utf8', 'latin1');
+  assert.strictEqual(dest.length, 0);
+}
+
+// Test that it doesn't crash
+{
+  buffer.transcode(new buffer.SlowBuffer(1), 'utf16le', 'ucs2');
+}
author	Satya Rohith <me@satyarohith.com>	2024-10-02 13:53:14 +0530
committer	GitHub <noreply@github.com>	2024-10-02 08:23:14 +0000
commit	32c12787361b65bbc55a7b9c1fe43689cb0a8b98 (patch)
tree	047f47b5146b9192d74bd8ef1d9af9d183b7aad1
parent	620e6b43a66c2af44ae4aea62417af408309f61c (diff)