diff options
Diffstat (limited to 'op_crates/web/08_text_encoding.js')
-rw-r--r-- | op_crates/web/08_text_encoding.js | 135 |
1 file changed, 97 insertions, 38 deletions
diff --git a/op_crates/web/08_text_encoding.js b/op_crates/web/08_text_encoding.js index 13e256982..59c1aba8f 100644 --- a/op_crates/web/08_text_encoding.js +++ b/op_crates/web/08_text_encoding.js @@ -169,26 +169,14 @@ // The encodingMap is a hash of labels that are indexed by the conical // encoding. const encodingMap = { - "windows-1252": [ - "ansi_x3.4-1968", - "ascii", - "cp1252", - "cp819", - "csisolatin1", - "ibm819", - "iso-8859-1", - "iso-ir-100", - "iso8859-1", - "iso88591", - "iso_8859-1", - "iso_8859-1:1987", - "l1", - "latin1", - "us-ascii", - "windows-1252", - "x-cp1252", + "utf-8": [ + "unicode-1-1-utf-8", + "unicode11utf8", + "unicode20utf8", + "utf-8", + "utf8", + "x-unicode20utf8", ], - "utf-8": ["unicode-1-1-utf-8", "utf-8", "utf8"], ibm866: ["866", "cp866", "csibm866", "ibm866"], "iso-8859-2": [ "csisolatin2", @@ -276,6 +264,11 @@ "iso_8859-8:1988", "visual", ], + "iso-8859-8-i": [ + "csiso88598i", + "iso-8859-8-i", + "logical", + ], "iso-8859-10": [ "csisolatin6", "iso-8859-10", @@ -296,19 +289,6 @@ "l9", ], "iso-8859-16": ["iso-8859-16"], - gbk: [ - "chinese", - "csgb2312", - "csiso58gb231280", - "gb2312", - "gb_2312", - "gb_2312-80", - "gbk", - "iso-ir-58", - "x-gbk", - ], - gb18030: ["gb18030"], - big5: ["big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"], "koi8-r": ["cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"], "koi8-u": ["koi8-ru", "koi8-u"], macintosh: ["csmacintosh", "mac", "macintosh", "x-mac-roman"], @@ -322,6 +302,25 @@ ], "windows-1250": ["cp1250", "windows-1250", "x-cp1250"], "windows-1251": ["cp1251", "windows-1251", "x-cp1251"], + "windows-1252": [ + "ansi_x3.4-1968", + "ascii", + "cp1252", + "cp819", + "csisolatin1", + "ibm819", + "iso-8859-1", + "iso-ir-100", + "iso8859-1", + "iso88591", + "iso_8859-1", + "iso_8859-1:1987", + "l1", + "latin1", + "us-ascii", + "windows-1252", + "x-cp1252", + ], "windows-1253": ["cp1253", "windows-1253", "x-cp1253"], "windows-1254": [ "cp1254", @@ -342,6 +341,19 @@ "windows-1257": ["cp1257", 
"windows-1257", "x-cp1257"], "windows-1258": ["cp1258", "windows-1258", "x-cp1258"], "x-mac-cyrillic": ["x-mac-cyrillic", "x-mac-ukrainian"], + gbk: [ + "chinese", + "csgb2312", + "csiso58gb231280", + "gb2312", + "gb_2312", + "gb_2312-80", + "gbk", + "iso-ir-58", + "x-gbk", + ], + gb18030: ["gb18030"], + big5: ["big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"], }; // We convert these into a Map where every label resolves to its canonical // encoding type. @@ -540,6 +552,26 @@ ]); // deno-fmt-ignore + encodingIndexes.set("iso-8859-8-i", [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, null, 162, 163, 164, 165, 166, 167, + 168, 169, 215, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 247, 187, 188, 189, 190, null, + null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, 8215, + 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, + 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503, + 1504, 1505, 1506, 1507, 1508, 1509, 1510, 1511, + 1512, 1513, 1514, null, null, 8206, 8207, null, + ]); + + // deno-fmt-ignore encodingIndexes.set("iso-8859-10", [ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, @@ -957,6 +989,26 @@ ); } + const whitespace = [" ", "\t", "\n", "\f", "\r"]; + function trimAsciiWhitespace(label) { + let start = 0; + for (const i in label) { + if (!whitespace.includes(label[i])) { + start = i; + break; + } + } + let end = label.length - 1; + for (const _i in label) { + const i = end - _i; + if (!whitespace.includes(label[i])) { + end = i; + break; + } + } + return label.substring(start, end + 1); + } + class TextDecoder { #encoding = ""; @@ -973,7 +1025,7 @@ if (options.fatal) { this.fatal = true; } - const _label = 
String(label).trim().toLowerCase(); + const _label = trimAsciiWhitespace(String(label)).toLowerCase(); const encoding = encodings.get(_label); if (!encoding) { throw new RangeError( @@ -1085,21 +1137,28 @@ const encoder = new UTF8Encoder(); const inputStream = new Stream(stringToCodePoints(input)); + if (!(dest instanceof Uint8Array)) { + throw new TypeError( + "2nd argument to TextEncoder.encodeInto must be Uint8Array", + ); + } + let written = 0; let read = 0; while (true) { - const result = encoder.handler(inputStream.read()); + const item = inputStream.read(); + const result = encoder.handler(item); if (result === "finished") { break; } if (dest.length - written >= result.length) { read++; - dest.set(result, written); - written += result.length; - if (result.length > 3) { + if (item > 0xFFFF) { // increment read a second time if greater than U+FFFF read++; } + dest.set(result, written); + written += result.length; } else { break; } @@ -1151,7 +1210,7 @@ let type; let i = - ignoreBOM && input[0] === 0xef && input[1] === 0xbb && input[2] === 0xbf + !ignoreBOM && input[0] === 0xef && input[1] === 0xbb && input[2] === 0xbf ? 3 : 0; |