From 9ba2c4c42fcbadea1f19c67c88b5cbc4c97804f3 Mon Sep 17 00:00:00 2001
From: Andreu Botella <andreu@andreubotella.com>
Date: Tue, 16 May 2023 17:49:35 +0200
Subject: fix(fetch): Correctly decode `multipart/form-data` names and
 filenames (#19145)

Currently the `multipart/form-data` parser in
`Request.prototype.formData` and `Response.prototype.formData` decodes
non-ASCII filenames incorrectly, as if they were encoded in Latin-1
rather than UTF-8. This happens because the header section of each
`multipart/form-data` entry is decoded as Latin-1 in order to be parsed
with `Headers`, which only allows `ByteString`s, but the names and
filenames are never decoded correctly. This PR fixes this as a
post-processing step.

Note that the `multipart/form-data` parsing for these APIs in the Fetch
spec is very much underspecified, and it does not specify that names and
filenames must be decoded as UTF-8. However, it does require that the
bodies of non-`File` entries are decoded as UTF-8, and in browsers,
names and filenames always use the same encoding as the body.

Closes #19142.
---
 ext/fetch/21_formdata.js | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'ext/fetch/21_formdata.js')

diff --git a/ext/fetch/21_formdata.js b/ext/fetch/21_formdata.js
index 330ed92e6..1ddd5f656 100644
--- a/ext/fetch/21_formdata.js
+++ b/ext/fetch/21_formdata.js
@@ -41,6 +41,7 @@ const {
   StringPrototypeReplaceAll,
   TypeError,
   TypedArrayPrototypeSubarray,
+  Uint8Array,
 } = primordials;
 
 const entryList = Symbol("entry list");
@@ -358,6 +359,20 @@ function parseContentDisposition(value) {
   return params;
 }
 
+/**
+ * Recovers a UTF-8 string that was mistakenly decoded as Latin-1 by mapping
+ * each code unit back to a byte and decoding those bytes with `core.decode`.
+ * @param {string} latin1String UTF-8 bytes that were decoded as Latin-1
+ * @returns {string} the re-decoded string
+ */
+function decodeLatin1StringAsUtf8(latin1String) {
+  const buffer = new Uint8Array(latin1String.length);
+  for (let i = 0; i < latin1String.length; i++) {
+    buffer[i] = latin1String.charCodeAt(i);
+  }
+  return core.decode(buffer);
+}
+
 const CRLF = "\r\n";
 const LF = StringPrototypeCodePointAt(CRLF, 1);
 const CR = StringPrototypeCodePointAt(CRLF, 0);
@@ -465,23 +480,31 @@ class MultipartParser {
             i - boundaryIndex - 1,
           );
           // https://fetch.spec.whatwg.org/#ref-for-dom-body-formdata
-          const filename = MapPrototypeGet(disposition, "filename");
-          const name = MapPrototypeGet(disposition, "name");
+          // These are UTF-8 bytes decoded as if they were Latin-1.
+          // TODO(@andreubotella): Maybe we shouldn't be parsing entry headers
+          // as Latin-1.
+          const latin1Filename = MapPrototypeGet(disposition, "filename");
+          const latin1Name = MapPrototypeGet(disposition, "name");
 
           state = 5;
           // Reset
           boundaryIndex = 0;
           headerText = "";
 
-          if (!name) {
+          if (!latin1Name) {
             continue; // Skip, unknown name
           }
 
-          if (filename) {
+          const name = decodeLatin1StringAsUtf8(latin1Name);
+          if (latin1Filename) {
             const blob = new Blob([content], {
               type: headers.get("Content-Type") || "application/octet-stream",
             });
-            formData.append(name, blob, filename);
+            formData.append(
+              name,
+              blob,
+              decodeLatin1StringAsUtf8(latin1Filename),
+            );
           } else {
             formData.append(name, core.decode(content));
           }
-- 
cgit v1.2.3