diff options
author | Ryan Dahl <ry@tinyclouds.org> | 2019-10-04 20:28:51 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-10-04 20:28:51 -0400 |
commit | b81e5db17aa8b3088d6034ddf86b79c69410f012 (patch) | |
tree | 579e4c23d60d1b0d038156bc28a04f74ea87b2f0 /cli/js/text_encoding.ts | |
parent | 9049213867d30f7df090a83b6baf3e0717a4d2d2 (diff) |
Merge deno_cli_snapshots into deno_cli (#3064)
Diffstat (limited to 'cli/js/text_encoding.ts')
-rw-r--r-- | cli/js/text_encoding.ts | 554 |
1 files changed, 554 insertions, 0 deletions
diff --git a/cli/js/text_encoding.ts b/cli/js/text_encoding.ts new file mode 100644 index 000000000..8386ff8b0 --- /dev/null +++ b/cli/js/text_encoding.ts @@ -0,0 +1,554 @@ +// Copyright 2018-2019 the Deno authors. All rights reserved. MIT license. +// The following code is based off of text-encoding at: +// https://github.com/inexorabletash/text-encoding +// +// Anyone is free to copy, modify, publish, use, compile, sell, or +// distribute this software, either in source code form or as a compiled +// binary, for any purpose, commercial or non-commercial, and by any +// means. +// +// In jurisdictions that recognize copyright laws, the author or authors +// of this software dedicate any and all copyright interest in the +// software to the public domain. We make this dedication for the benefit +// of the public at large and to the detriment of our heirs and +// successors. We intend this dedication to be an overt act of +// relinquishment in perpetuity of all present and future rights to this +// software under copyright law. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
import * as base64 from "./base64.ts";
import * as domTypes from "./dom_types.ts";
import { DenoError, ErrorKind } from "./errors.ts";

// Sentinels exchanged between the codec handlers and their driving loops.
// CONTINUE: the handler consumed the token but has nothing to emit yet.
const CONTINUE = null;
// END_OF_STREAM: what Stream.read() yields once it has no tokens left.
const END_OF_STREAM = -1;
// FINISHED: what a handler returns once it has seen END_OF_STREAM.
const FINISHED = -1;

/** Reports a decoding error: throws a `TypeError` when `fatal` is set,
 * otherwise returns U+FFFD (the replacement character). */
function decoderError(fatal: boolean): number | never {
  if (fatal) {
    throw new TypeError("Decoder error.");
  }
  return 0xfffd; // default code point
}

/** Returns `true` when `min <= a <= max` (both bounds inclusive). */
function inRange(a: number, min: number, max: number): boolean {
  return min <= a && a <= max;
}

/** Returns `true` when `a` lies in the ASCII range 0x00..0x7F. */
function isASCIIByte(a: number): boolean {
  return inRange(a, 0x00, 0x7f);
}

/** Splits `input` into Unicode code points.
 *
 * Iterating the string joins valid surrogate pairs into a single code point.
 * An unpaired surrogate is replaced with U+FFFD so the UTF-8 encoder is never
 * asked to serialise a surrogate, which would produce invalid UTF-8. */
function stringToCodePoints(input: string): number[] {
  const u: number[] = [];
  for (const c of input) {
    const codePoint = c.codePointAt(0)!;
    u.push(inRange(codePoint, 0xd800, 0xdfff) ? 0xfffd : codePoint);
  }
  return u;
}

/** Incremental UTF-8 decoder.
 *
 * `handler` is fed one byte per call and returns a decoded code point,
 * `CONTINUE` when more bytes are required, or `FINISHED` at end of stream. */
class UTF8Decoder implements Decoder {
  // Code point accumulated so far for the current multi-byte sequence.
  private _codePoint = 0;
  // Continuation bytes consumed for the current sequence.
  private _bytesSeen = 0;
  // Continuation bytes the current lead byte announced.
  private _bytesNeeded = 0;
  private _fatal: boolean;
  private _ignoreBOM: boolean;
  // Inclusive range the next continuation byte must fall into.
  private _lowerBoundary = 0x80;
  private _upperBoundary = 0xbf;

  constructor(options: DecoderOptions) {
    this._fatal = options.fatal || false;
    this._ignoreBOM = options.ignoreBOM || false;
  }

  /** Consumes one byte (or `END_OF_STREAM`) read from `stream`.
   *
   * An invalid continuation byte is pushed back onto `stream` so that it is
   * re-examined as the start of a new sequence.
   *
   * Throws `TypeError` on malformed input when the decoder is fatal;
   * otherwise malformed input yields U+FFFD. */
  handler(stream: Stream, byte: number): number | null {
    if (byte === END_OF_STREAM && this._bytesNeeded !== 0) {
      // Input ended in the middle of a multi-byte sequence.
      this._bytesNeeded = 0;
      return decoderError(this._fatal);
    }

    if (byte === END_OF_STREAM) {
      return FINISHED;
    }

    // NOTE(review): `_bytesSeen` is only advanced by continuation bytes, so
    // after the 0xEF lead byte it is still 0 and the first test below appears
    // to clear `_ignoreBOM` on the second BOM byte, making the
    // `_bytesSeen === 2` branch unreachable. TextDecoder.decode() drops a
    // leading U+FEFF regardless. Confirm the intended ignoreBOM semantics
    // before changing this.
    if (this._ignoreBOM) {
      if (
        (this._bytesSeen === 0 && byte !== 0xef) ||
        (this._bytesSeen === 1 && byte !== 0xbb)
      ) {
        this._ignoreBOM = false;
      }

      if (this._bytesSeen === 2) {
        this._ignoreBOM = false;
        if (byte === 0xbf) {
          //Ignore BOM
          this._codePoint = 0;
          this._bytesNeeded = 0;
          this._bytesSeen = 0;
          return CONTINUE;
        }
      }
    }

    if (this._bytesNeeded === 0) {
      if (isASCIIByte(byte)) {
        // Single byte code point
        return byte;
      } else if (inRange(byte, 0xc2, 0xdf)) {
        // Two byte code point
        this._bytesNeeded = 1;
        this._codePoint = byte & 0x1f;
      } else if (inRange(byte, 0xe0, 0xef)) {
        // Three byte code point
        if (byte === 0xe0) {
          // Narrow the first continuation byte to exclude overlong forms.
          this._lowerBoundary = 0xa0;
        } else if (byte === 0xed) {
          // Narrow the first continuation byte to exclude surrogates.
          this._upperBoundary = 0x9f;
        }
        this._bytesNeeded = 2;
        this._codePoint = byte & 0xf;
      } else if (inRange(byte, 0xf0, 0xf4)) {
        // Four byte code point
        if (byte === 0xf0) {
          // Narrow the first continuation byte to exclude overlong forms.
          this._lowerBoundary = 0x90;
        } else if (byte === 0xf4) {
          // Narrow the first continuation byte to stay at or below U+10FFFF.
          this._upperBoundary = 0x8f;
        }
        this._bytesNeeded = 3;
        this._codePoint = byte & 0x7;
      } else {
        // Not a valid lead byte.
        return decoderError(this._fatal);
      }
      return CONTINUE;
    }

    if (!inRange(byte, this._lowerBoundary, this._upperBoundary)) {
      // Byte out of range, so encoding error. Reset *all* sequence state,
      // including the continuation boundaries: a boundary tightened by the
      // failed lead byte must not constrain the next sequence.
      this._codePoint = 0;
      this._bytesNeeded = 0;
      this._bytesSeen = 0;
      this._lowerBoundary = 0x80;
      this._upperBoundary = 0xbf;
      stream.prepend(byte);
      return decoderError(this._fatal);
    }

    // Only the first continuation byte has a special range.
    this._lowerBoundary = 0x80;
    this._upperBoundary = 0xbf;

    this._codePoint = (this._codePoint << 6) | (byte & 0x3f);

    this._bytesSeen++;

    if (this._bytesSeen !== this._bytesNeeded) {
      return CONTINUE;
    }

    const codePoint = this._codePoint;

    this._codePoint = 0;
    this._bytesNeeded = 0;
    this._bytesSeen = 0;

    return codePoint;
  }
}

/** UTF-8 encoder: converts one code point per `handler` call. */
class UTF8Encoder implements Encoder {
  /** Returns `FINISHED` for `END_OF_STREAM`, the code point itself for ASCII,
   * or an array of 2-4 bytes for anything else.
   *
   * Throws `TypeError` when `codePoint` is outside 0..0x10FFFF. */
  handler(codePoint: number): number | number[] {
    if (codePoint === END_OF_STREAM) {
      return FINISHED;
    }

    if (inRange(codePoint, 0x00, 0x7f)) {
      return codePoint;
    }

    // `count` is the number of continuation bytes; `offset` carries the
    // marker bits of the lead byte.
    let count: number;
    let offset: number;
    if (inRange(codePoint, 0x0080, 0x07ff)) {
      count = 1;
      offset = 0xc0;
    } else if (inRange(codePoint, 0x0800, 0xffff)) {
      count = 2;
      offset = 0xe0;
    } else if (inRange(codePoint, 0x10000, 0x10ffff)) {
      count = 3;
      offset = 0xf0;
    } else {
      throw TypeError(`Code point out of range: \\x${codePoint.toString(16)}`);
    }

    const bytes = [(codePoint >> (6 * count)) + offset];

    // Emit the remaining bits six at a time, most significant group first.
    while (count > 0) {
      const temp = codePoint >> (6 * (count - 1));
      bytes.push(0x80 | (temp & 0x3f));
      count--;
    }

    return bytes;
  }
}

/** Decodes a string of data which has been encoded using base-64.
 *
 * ASCII whitespace is removed first and trailing padding is optional. Each
 * decoded byte becomes one character of the returned string.
 *
 * Throws `DenoError` (`ErrorKind.InvalidInput`) when the input contains
 * characters outside the base-64 alphabet or has an impossible length. */
export function atob(s: string): string {
  s = String(s);
  s = s.replace(/[\t\n\f\r ]/g, "");

  // Padding is only meaningful when the padded length is a multiple of 4.
  if (s.length % 4 === 0) {
    s = s.replace(/==?$/, "");
  }

  const rem = s.length % 4;
  if (rem === 1 || /[^+/0-9A-Za-z]/.test(s)) {
    // TODO: throw `DOMException`
    throw new DenoError(
      ErrorKind.InvalidInput,
      "The string to be decoded is not correctly encoded"
    );
  }

  // base64-js requires the length to be an exact multiple of 4
  if (rem > 0) {
    s = s.padEnd(s.length + (4 - rem), "=");
  }

  const byteArray: Uint8Array = base64.toByteArray(s);
  let result = "";
  for (let i = 0; i < byteArray.length; i++) {
    result += String.fromCharCode(byteArray[i]);
  }
  return result;
}

/** Creates a base-64 ASCII string from the input string.
 *
 * Every UTF-16 code unit of `s` is treated as one byte.
 *
 * Throws `DenoError` (`ErrorKind.InvalidInput`) when a code unit is greater
 * than 0xFF. */
export function btoa(s: string): string {
  const byteArray: number[] = [];
  for (let i = 0; i < s.length; i++) {
    const charCode = s.charCodeAt(i);
    if (charCode > 0xff) {
      throw new DenoError(
        ErrorKind.InvalidInput,
        "The string to be encoded contains characters " +
          "outside of the Latin1 range."
      );
    }
    byteArray.push(charCode);
  }
  const result = base64.fromByteArray(Uint8Array.from(byteArray));
  return result;
}

/** Options handed to the internal decoder factories. */
interface DecoderOptions {
  fatal?: boolean;
  ignoreBOM?: boolean;
}

/** A byte-at-a-time decoder; see `UTF8Decoder.handler` for the protocol. */
interface Decoder {
  handler(stream: Stream, byte: number): number | null;
}

/** A code-point-at-a-time encoder returning one byte or a byte array. */
interface Encoder {
  handler(codePoint: number): number | number[];
}

/** Decoder for single-byte encodings, driven by a lookup table that maps
 * bytes 0x80..0xFF to code points. ASCII bytes map to themselves. */
class SingleByteDecoder implements Decoder {
  private _index: number[];
  private _fatal: boolean;

  /** Throws `TypeError` when `options.ignoreBOM` is set. */
  constructor(index: number[], options: DecoderOptions) {
    if (options.ignoreBOM) {
      throw new TypeError("Ignoring the BOM is available only with utf-8.");
    }
    this._fatal = options.fatal || false;
    this._index = index;
  }
  handler(stream: Stream, byte: number): number {
    if (byte === END_OF_STREAM) {
      return FINISHED;
    }
    if (isASCIIByte(byte)) {
      return byte;
    }
    const codePoint = this._index[byte - 0x80];

    // A hole in the table means the byte has no mapping in this encoding.
    if (codePoint == null) {
      return decoderError(this._fatal);
    }

    return codePoint;
  }
}

// The encodingMap is a hash of labels that are indexed by the canonical
// encoding.
const encodingMap: { [key: string]: string[] } = {
  "windows-1252": [
    "ansi_x3.4-1968",
    "ascii",
    "cp1252",
    "cp819",
    "csisolatin1",
    "ibm819",
    "iso-8859-1",
    "iso-ir-100",
    "iso8859-1",
    "iso88591",
    "iso_8859-1",
    "iso_8859-1:1987",
    "l1",
    "latin1",
    "us-ascii",
    "windows-1252",
    "x-cp1252"
  ],
  "utf-8": ["unicode-1-1-utf-8", "utf-8", "utf8"]
};
// We convert these into a Map where every label resolves to its canonical
// encoding type.
const encodings = new Map<string, string>();
for (const key of Object.keys(encodingMap)) {
  const labels = encodingMap[key];
  for (const label of labels) {
    encodings.set(label, key);
  }
}

// A map of functions that return new instances of a decoder indexed by the
// encoding type.
const decoders = new Map<string, (options: DecoderOptions) => Decoder>();
decoders.set(
  "utf-8",
  (options: DecoderOptions): UTF8Decoder => {
    return new UTF8Decoder(options);
  }
);

// Single byte decoders are an array of code point lookups
const encodingIndexes = new Map<string, number[]>();
// prettier-ignore
encodingIndexes.set("windows-1252", [8364,129,8218,402,8222,8230,8224,8225,710,8240,352,8249,338,141,381,143,144,8216,8217,8220,8221,8226,8211,8212,732,8482,353,8250,339,157,382,376,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255]);
// Register one SingleByteDecoder factory per lookup table.
for (const [key, index] of encodingIndexes) {
  decoders.set(
    key,
    (options: DecoderOptions): SingleByteDecoder => {
      return new SingleByteDecoder(index, options);
    }
  );
}

/** Builds a string from a list of Unicode code points. */
function codePointsToString(codePoints: number[]): string {
  let s = "";
  for (const cp of codePoints) {
    s += String.fromCodePoint(cp);
  }
  return s;
}

/** A queue of numeric tokens (bytes or code points) consumed by the codecs.
 *
 * Tokens are stored reversed so that the next token to read is the last
 * array element and can be removed with `pop()`. */
class Stream {
  private _tokens: number[];
  constructor(tokens: number[] | Uint8Array) {
    // Copy, so that neither reversing nor reading touches the caller's data.
    this._tokens = Array.from(tokens);
    this._tokens.reverse();
  }

  /** Returns `true` when no tokens are left. */
  endOfStream(): boolean {
    return !this._tokens.length;
  }

  /** Removes and returns the next token, or `END_OF_STREAM` when empty. */
  read(): number {
    return !this._tokens.length ? END_OF_STREAM : this._tokens.pop()!;
  }

  /** Puts token(s) back at the front of the stream, so they are read next.
   * An array argument is emptied in the process. */
  prepend(token: number | number[]): void {
    if (Array.isArray(token)) {
      while (token.length) {
        this._tokens.push(token.pop()!);
      }
    } else {
      this._tokens.push(token);
    }
  }

  /** Appends token(s) at the back of the stream, so they are read last.
   * An array argument is emptied in the process. */
  push(token: number | number[]): void {
    if (Array.isArray(token)) {
      while (token.length) {
        this._tokens.unshift(token.shift()!);
      }
    } else {
      this._tokens.unshift(token);
    }
  }
}

export interface TextDecodeOptions {
  // Streaming is not implemented; decode() throws if this is truthy.
  stream?: false;
}

export interface TextDecoderOptions {
  fatal?: boolean;
  ignoreBOM?: boolean;
}

type EitherArrayBuffer = SharedArrayBuffer | ArrayBuffer;

// eslint-disable-next-line @typescript-eslint/no-explicit-any
function isEitherArrayBuffer(x: any): x is EitherArrayBuffer {
  return x instanceof SharedArrayBuffer || x instanceof ArrayBuffer;
}

export class TextDecoder {
  private _encoding: string;

  /** Returns encoding's name, lowercased. */
  get encoding(): string {
    return this._encoding;
  }
  /** Returns `true` if error mode is "fatal", and `false` otherwise. */
  readonly fatal: boolean = false;
  /** Returns `true` if ignore BOM flag is set, and `false` otherwise. */
  readonly ignoreBOM: boolean = false;

  /** Creates a decoder for the encoding named by `label`.
   *
   * `label` is trimmed and lowercased before it is resolved to a canonical
   * encoding name. Throws `RangeError` for an unknown label and `TypeError`
   * when no decoder is registered for the resolved encoding. */
  constructor(label = "utf-8", options: TextDecoderOptions = { fatal: false }) {
    if (options.ignoreBOM) {
      this.ignoreBOM = true;
    }
    if (options.fatal) {
      this.fatal = true;
    }
    label = String(label)
      .trim()
      .toLowerCase();
    const encoding = encodings.get(label);
    if (!encoding) {
      throw new RangeError(
        `The encoding label provided ('${label}') is invalid.`
      );
    }
    if (!decoders.has(encoding)) {
      throw new TypeError(`Internal decoder ('${encoding}') not found.`);
    }
    this._encoding = encoding;
  }

  /** Returns the result of running encoding's decoder.
   *
   * Accepts a `Uint8Array`, an `ArrayBuffer`/`SharedArrayBuffer`, or any
   * object exposing such a buffer through `buffer`/`byteOffset`/`byteLength`.
   * Other inputs, including `undefined`, decode as empty input. A leading
   * U+FEFF in the decoded output is dropped.
   *
   * Throws `TypeError` when `options.stream` is set, or on malformed input
   * when the decoder is fatal. */
  decode(
    input?: domTypes.BufferSource,
    options: TextDecodeOptions = { stream: false }
  ): string {
    if (options.stream) {
      throw new TypeError("Stream not supported.");
    }

    let bytes: Uint8Array;
    if (input instanceof Uint8Array) {
      bytes = input;
    } else if (isEitherArrayBuffer(input)) {
      bytes = new Uint8Array(input);
    } else if (
      typeof input === "object" &&
      "buffer" in input &&
      isEitherArrayBuffer(input.buffer)
    ) {
      // Other views: reinterpret exactly the viewed window as bytes.
      bytes = new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
    } else {
      bytes = new Uint8Array(0);
    }

    // A fresh decoder per call: no state is carried between decode() calls.
    const decoder = decoders.get(this._encoding)!({
      fatal: this.fatal,
      ignoreBOM: this.ignoreBOM
    });
    const inputStream = new Stream(bytes);
    const output: number[] = [];

    while (true) {
      const result = decoder.handler(inputStream, inputStream.read());
      if (result === FINISHED) {
        break;
      }

      if (result !== CONTINUE) {
        output.push(result);
      }
    }

    if (output.length > 0 && output[0] === 0xfeff) {
      output.shift();
    }

    return codePointsToString(output);
  }
  get [Symbol.toStringTag](): string {
    return "TextDecoder";
  }
}

interface TextEncoderEncodeIntoResult {
  // Number of UTF-16 code units of the input that were consumed.
  read: number;
  // Number of bytes stored into the destination.
  written: number;
}

export class TextEncoder {
  /** Returns "utf-8". */
  readonly encoding = "utf-8";
  /** Returns the result of running UTF-8's encoder. */
  encode(input = ""): Uint8Array {
    const encoder = new UTF8Encoder();
    const inputStream = new Stream(stringToCodePoints(input));
    const output: number[] = [];

    while (true) {
      const result = encoder.handler(inputStream.read());
      if (result === FINISHED) {
        break;
      }
      if (Array.isArray(result)) {
        output.push(...result);
      } else {
        output.push(result);
      }
    }

    return new Uint8Array(output);
  }
  /** Encodes `input` as UTF-8 into `dest`, starting at index 0.
   *
   * Encoding stops at the first code point whose bytes no longer fit into
   * `dest`; a code point is never written partially. Returns how many UTF-16
   * code units of `input` were consumed (`read`) and how many bytes were
   * stored (`written`). */
  encodeInto(input: string, dest: Uint8Array): TextEncoderEncodeIntoResult {
    const encoder = new UTF8Encoder();
    const inputStream = new Stream(stringToCodePoints(input));

    let written = 0;
    let read = 0;
    while (true) {
      const result = encoder.handler(inputStream.read());
      if (result === FINISHED) {
        break;
      }
      const bytes = Array.isArray(result) ? result : [result];
      if (written + bytes.length > dest.length) {
        // Not enough room left for this code point: stop before it.
        break;
      }
      dest.set(bytes, written);
      written += bytes.length;
      // A four byte sequence is a code point above U+FFFF, which occupies
      // two UTF-16 code units in the input string.
      read += bytes.length > 3 ? 2 : 1;
    }

    return {
      read,
      written
    };
  }
  get [Symbol.toStringTag](): string {
    return "TextEncoder";
  }
}