diff options
-rw-r--r-- | cli/js/encode_utf8.ts | 80 | ||||
-rw-r--r-- | cli/js/text_encoding.ts | 7 |
2 files changed, 87 insertions, 0 deletions
diff --git a/cli/js/encode_utf8.ts b/cli/js/encode_utf8.ts new file mode 100644 index 000000000..04e2560b7 --- /dev/null +++ b/cli/js/encode_utf8.ts @@ -0,0 +1,80 @@ +// Copyright 2018-2020 the Deno authors. All rights reserved. MIT license. +// The following code is based off: +// https://github.com/samthor/fast-text-encoding +// +// Copyright 2017 Sam Thorogood. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy of +// the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. +// + +export function encodeUtf8(input: string): Uint8Array { + let pos = 0; + const len = input.length; + + let at = 0; // output position + let tlen = Math.max(32, len + (len >> 1) + 7); // 1.5x size + let target = new Uint8Array((tlen >> 3) << 3); // ... but at 8 byte offset + + while (pos < len) { + let value = input.charCodeAt(pos++); + if (value >= 0xd800 && value <= 0xdbff) { + // high surrogate + if (pos < len) { + const extra = input.charCodeAt(pos); + if ((extra & 0xfc00) === 0xdc00) { + ++pos; + value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000; + } + } + if (value >= 0xd800 && value <= 0xdbff) { + continue; // drop lone surrogate + } + } + + // expand the buffer if we couldn't write 4 bytes + if (at + 4 > target.length) { + tlen += 8; // minimum extra + tlen *= 1.0 + (pos / input.length) * 2; // take 2x the remaining + tlen = (tlen >> 3) << 3; // 8 byte offset + + const update = new Uint8Array(tlen); + update.set(target); + target = update; + } + + if ((value & 0xffffff80) === 0) { + // 1-byte + target[at++] = value; // ASCII + continue; + } else if ((value & 0xfffff800) === 0) { + // 2-byte + target[at++] = ((value >> 6) & 0x1f) | 0xc0; + } else if ((value & 0xffff0000) === 0) { + // 3-byte + target[at++] = ((value >> 12) & 0x0f) | 0xe0; + target[at++] = ((value >> 6) & 0x3f) | 0x80; + } else if ((value & 0xffe00000) === 0) { + // 4-byte + target[at++] = ((value >> 18) & 0x07) | 0xf0; + target[at++] = ((value >> 12) & 0x3f) | 0x80; + target[at++] = ((value >> 6) & 0x3f) | 0x80; + } else { + // FIXME: do we care + continue; + } + + target[at++] = (value & 0x3f) | 0x80; + } + + return target.slice(0, at); +} diff --git a/cli/js/text_encoding.ts b/cli/js/text_encoding.ts index 6ca7e642f..ceb2f3fdc 100644 --- a/cli/js/text_encoding.ts +++ b/cli/js/text_encoding.ts @@ -26,6 +26,7 @@ import * as base64 from "./base64.ts"; import { decodeUtf8 } from "./decode_utf8.ts"; import * as domTypes from "./dom_types.ts"; +import { encodeUtf8 } from "./encode_utf8.ts"; import { DenoError, ErrorKind } from "./errors.ts"; const CONTINUE = null; @@ -405,6 +406,12 @@ export class TextEncoder { readonly encoding = "utf-8"; /** Returns the result of running UTF-8's encoder. */ encode(input = ""): Uint8Array { + // For performance reasons we utilise a highly optimised decoder instead of + // the general decoder. + if (this.encoding === "utf-8") { + return encodeUtf8(input); + } + const encoder = new UTF8Encoder(); const inputStream = new Stream(stringToCodePoints(input)); const output: number[] = []; |