summary | refs | log | tree | commit | diff
path: root/std/encoding
diff options
context:
space:
mode:
Diffstat (limited to 'std/encoding')
-rw-r--r-- std/encoding/csv.ts 172
-rw-r--r-- std/encoding/csv_test.ts 77
2 files changed, 167 insertions, 82 deletions
diff --git a/std/encoding/csv.ts b/std/encoding/csv.ts
index c8c7719ca..711e27772 100644
--- a/std/encoding/csv.ts
+++ b/std/encoding/csv.ts
@@ -1,5 +1,7 @@
// Ported from Go:
// https://github.com/golang/go/blob/go1.12.5/src/encoding/csv/
+// Copyright 2011 The Go Authors. All rights reserved. BSD license.
+// https://github.com/golang/go/blob/master/LICENSE
// Copyright 2018-2020 the Deno authors. All rights reserved. MIT license.
import { BufReader } from "../io/bufio.ts";
@@ -9,6 +11,11 @@ import { assert } from "../testing/asserts.ts";
const INVALID_RUNE = ["\r", "\n", '"'];
+export const ERR_BARE_QUOTE = 'bare " in non-quoted-field';
+export const ERR_QUOTE = 'extraneous or missing " in quoted-field';
+export const ERR_INVALID_DELIM = "Invalid Delimiter";
+export const ERR_FIELD_COUNT = "wrong number of fields";
+
export class ParseError extends Error {
StartLine: number;
Line: number;
@@ -49,70 +56,159 @@ function chkOptions(opt: ReadOptions): void {
(typeof opt.comment === "string" && INVALID_RUNE.includes(opt.comment)) ||
opt.comma === opt.comment
) {
- throw new Error("Invalid Delimiter");
+ throw new Error(ERR_INVALID_DELIM);
}
}
-async function read(
+async function readRecord(
Startline: number,
reader: BufReader,
opt: ReadOptions = { comma: ",", trimLeadingSpace: false }
): Promise<string[] | Deno.EOF> {
const tp = new TextProtoReader(reader);
- let line: string;
- let result: string[] = [];
const lineIndex = Startline;
+ let line = await readLine(tp);
- const r = await tp.readLine();
- if (r === Deno.EOF) return Deno.EOF;
- line = r;
- // Normalize \r\n to \n on all input lines.
- if (
- line.length >= 2 &&
- line[line.length - 2] === "\r" &&
- line[line.length - 1] === "\n"
- ) {
- line = line.substring(0, line.length - 2);
- line = line + "\n";
- }
-
- const trimmedLine = line.trimLeft();
- if (trimmedLine.length === 0) {
+ if (line === Deno.EOF) return Deno.EOF;
+ if (line.length === 0) {
return [];
}
-
// line starting with comment character is ignored
- if (opt.comment && trimmedLine[0] === opt.comment) {
+ if (opt.comment && line[0] === opt.comment) {
return [];
}
assert(opt.comma != null);
- result = line.split(opt.comma);
- let quoteError = false;
- result = result.map((r): string => {
+ let quoteError: string | null = null;
+ const quote = '"';
+ const quoteLen = quote.length;
+ const commaLen = opt.comma.length;
+ let recordBuffer = "";
+ const fieldIndexes = [] as number[];
+ parseField: for (;;) {
if (opt.trimLeadingSpace) {
- r = r.trimLeft();
- }
- if (r[0] === '"' && r[r.length - 1] === '"') {
- r = r.substring(1, r.length - 1);
- } else if (r[0] === '"') {
- r = r.substring(1, r.length);
+ line = line.trimLeft();
}
- if (!opt.lazyQuotes) {
- if (r[0] !== '"' && r.indexOf('"') !== -1) {
- quoteError = true;
+ if (line.length === 0 || !line.startsWith(quote)) {
+ // Non-quoted string field
+ const i = line.indexOf(opt.comma);
+ let field = line;
+ if (i >= 0) {
+ field = field.substring(0, i);
+ }
+ // Check to make sure a quote does not appear in field.
+ if (!opt.lazyQuotes) {
+ const j = field.indexOf(quote);
+ if (j >= 0) {
+ quoteError = ERR_BARE_QUOTE;
+ break parseField;
+ }
+ }
+ recordBuffer += field;
+ fieldIndexes.push(recordBuffer.length);
+ if (i >= 0) {
+ line = line.substring(i + commaLen);
+ continue parseField;
+ }
+ break parseField;
+ } else {
+ // Quoted string field
+ line = line.substring(quoteLen);
+ for (;;) {
+ const i = line.indexOf(quote);
+ if (i >= 0) {
+ // Hit next quote.
+ recordBuffer += line.substring(0, i);
+ line = line.substring(i + quoteLen);
+ if (line.startsWith(quote)) {
+ // `""` sequence (append quote).
+ recordBuffer += quote;
+ line = line.substring(quoteLen);
+ } else if (line.startsWith(opt.comma)) {
+ // `","` sequence (end of field).
+ line = line.substring(commaLen);
+ fieldIndexes.push(recordBuffer.length);
+ continue parseField;
+ } else if (0 === line.length) {
+ // `"\n` sequence (end of line).
+ fieldIndexes.push(recordBuffer.length);
+ break parseField;
+ } else if (opt.lazyQuotes) {
+ // `"` sequence (bare quote).
+ recordBuffer += quote;
+ } else {
+ // `"*` sequence (invalid non-escaped quote).
+ quoteError = ERR_QUOTE;
+ break parseField;
+ }
+ } else if (line.length > 0 || !(await isEOF(tp))) {
+ // Hit end of line (copy all data so far).
+ recordBuffer += line;
+ const r = await readLine(tp);
+ if (r === Deno.EOF) {
+ if (!opt.lazyQuotes) {
+ quoteError = ERR_QUOTE;
+ break parseField;
+ }
+ fieldIndexes.push(recordBuffer.length);
+ break parseField;
+ }
+ recordBuffer += "\n"; // preserve line feed (This is because TextProtoReader removes it.)
+ line = r;
+ } else {
+ // Abrupt end of file (EOF or error).
+ if (!opt.lazyQuotes) {
+ quoteError = ERR_QUOTE;
+ break parseField;
+ }
+ fieldIndexes.push(recordBuffer.length);
+ break parseField;
+ }
}
}
- return r;
- });
+ }
if (quoteError) {
- throw new ParseError(Startline, lineIndex, 'bare " in non-quoted-field');
+ throw new ParseError(Startline, lineIndex, quoteError);
+ }
+ const result = [] as string[];
+ let preIdx = 0;
+ for (const i of fieldIndexes) {
+ result.push(recordBuffer.slice(preIdx, i));
+ preIdx = i;
}
return result;
}
+async function isEOF(tp: TextProtoReader): Promise<boolean> {
+ return (await tp.r.peek(0)) === Deno.EOF;
+}
+
+async function readLine(tp: TextProtoReader): Promise<string | Deno.EOF> {
+ let line: string;
+ const r = await tp.readLine();
+ if (r === Deno.EOF) return Deno.EOF;
+ line = r;
+
+ // For backwards compatibility, drop trailing \r before EOF.
+ if ((await isEOF(tp)) && line.length > 0 && line[line.length - 1] === "\r") {
+ line = line.substring(0, line.length - 1);
+ }
+
+ // Normalize \r\n to \n on all input lines.
+ if (
+ line.length >= 2 &&
+ line[line.length - 2] === "\r" &&
+ line[line.length - 1] === "\n"
+ ) {
+ line = line.substring(0, line.length - 2);
+ line = line + "\n";
+ }
+
+ return line;
+}
+
export async function readMatrix(
reader: BufReader,
opt: ReadOptions = {
@@ -129,7 +225,7 @@ export async function readMatrix(
chkOptions(opt);
for (;;) {
- const r = await read(lineIndex, reader, opt);
+ const r = await readRecord(lineIndex, reader, opt);
if (r === Deno.EOF) break;
lineResult = r;
lineIndex++;
@@ -148,7 +244,7 @@ export async function readMatrix(
if (lineResult.length > 0) {
if (_nbFields && _nbFields !== lineResult.length) {
- throw new ParseError(lineIndex, lineIndex, "wrong number of fields");
+ throw new ParseError(lineIndex, lineIndex, ERR_FIELD_COUNT);
}
result.push(lineResult);
}
diff --git a/std/encoding/csv_test.ts b/std/encoding/csv_test.ts
index cb61de433..b3d4ec0c9 100644
--- a/std/encoding/csv_test.ts
+++ b/std/encoding/csv_test.ts
@@ -1,15 +1,21 @@
// Test ported from Golang
// https://github.com/golang/go/blob/2cc15b1/src/encoding/csv/reader_test.go
+// Copyright 2011 The Go Authors. All rights reserved. BSD license.
+// https://github.com/golang/go/blob/master/LICENSE
+// Copyright 2018-2020 the Deno authors. All rights reserved. MIT license.
+
import { assertEquals, assert } from "../testing/asserts.ts";
-import { readMatrix, parse } from "./csv.ts";
+import {
+ readMatrix,
+ parse,
+ ERR_BARE_QUOTE,
+ ERR_QUOTE,
+ ERR_INVALID_DELIM,
+ ERR_FIELD_COUNT,
+} from "./csv.ts";
import { StringReader } from "../io/readers.ts";
import { BufReader } from "../io/bufio.ts";
-const ErrInvalidDelim = "Invalid Delimiter";
-const ErrFieldCount = "wrong number of fields";
-const ErrBareQuote = 'bare " in non-quoted-field';
-
-// TODO(zekth): Activate remaining tests
const testCases = [
{
Name: "Simple",
@@ -43,7 +49,6 @@ zzz,yyy,xxx`,
["a,a", `bbb`, "ccc"],
["zzz", "yyy", "xxx"],
],
- ignore: true,
},
{
Name: "NoEOLTest",
@@ -62,8 +67,7 @@ zzz,yyy,xxx`,
line","one line","three
line
field"`,
- Output: [["two\nline"], ["one line"], ["three\nline\nfield"]],
- ignore: true,
+ Output: [["two\nline", "one line", "three\nline\nfield"]],
},
{
Name: "BlankLine",
@@ -129,7 +133,7 @@ field"`,
{
Name: "BadDoubleQuotes",
Input: `a""b,c`,
- Error: ErrBareQuote,
+ Error: ERR_BARE_QUOTE,
// Error: &ParseError{StartLine: 1, Line: 1, Column: 1, Err: ErrBareQuote},
},
{
@@ -141,23 +145,23 @@ field"`,
{
Name: "BadBareQuote",
Input: `a "word","b"`,
- Error: ErrBareQuote,
+ Error: ERR_BARE_QUOTE,
// &ParseError{StartLine: 1, Line: 1, Column: 2, Err: ErrBareQuote}
},
{
Name: "BadTrailingQuote",
Input: `"a word",b"`,
- Error: ErrBareQuote,
+ Error: ERR_BARE_QUOTE,
},
{
Name: "ExtraneousQuote",
Input: `"a "word","b"`,
- Error: ErrBareQuote,
+ Error: ERR_QUOTE,
},
{
Name: "BadFieldCount",
Input: "a,b,c\nd,e",
- Error: ErrFieldCount,
+ Error: ERR_FIELD_COUNT,
UseFieldsPerRecord: true,
FieldsPerRecord: 0,
},
@@ -167,7 +171,7 @@ field"`,
// Error: &ParseError{StartLine: 1, Line: 1, Err: ErrFieldCount},
UseFieldsPerRecord: true,
FieldsPerRecord: 2,
- Error: ErrFieldCount,
+ Error: ERR_FIELD_COUNT,
},
{
Name: "FieldCount",
@@ -261,22 +265,19 @@ x,,,
{
Name: "StartLine1", // Issue 19019
Input: 'a,"b\nc"d,e',
- Error: true,
+ Error: ERR_QUOTE,
// Error: &ParseError{StartLine: 1, Line: 2, Column: 1, Err: ErrQuote},
- ignore: true,
},
{
Name: "StartLine2",
Input: 'a,b\n"d\n\n,e',
- Error: true,
+ Error: ERR_QUOTE,
// Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote},
- ignore: true,
},
{
Name: "CRLFInQuotedField", // Issue 21201
Input: 'A,"Hello\r\nHi",B\r\n',
Output: [["A", "Hello\nHi", "B"]],
- ignore: true,
},
{
Name: "BinaryBlobField", // Issue 19410
@@ -287,32 +288,27 @@ x,,,
Name: "TrailingCR",
Input: "field1,field2\r",
Output: [["field1", "field2"]],
- ignore: true,
},
{
Name: "QuotedTrailingCR",
Input: '"field"\r',
- Output: [['"field"']],
- ignore: true,
+ Output: [["field"]],
},
{
Name: "QuotedTrailingCRCR",
Input: '"field"\r\r',
- Error: true,
+ Error: ERR_QUOTE,
// Error: &ParseError{StartLine: 1, Line: 1, Column: 6, Err: ErrQuote},
- ignore: true,
},
{
Name: "FieldCR",
Input: "field\rfield\r",
Output: [["field\rfield"]],
- ignore: true,
},
{
Name: "FieldCRCR",
Input: "field\r\rfield\r\r",
Output: [["field\r\rfield\r"]],
- ignore: true,
},
{
Name: "FieldCRCRLF",
@@ -328,7 +324,6 @@ x,,,
Name: "FieldCRCRLFCRCR",
Input: "field\r\r\n\r\rfield\r\r\n\r\r",
Output: [["field\r"], ["\r\rfield\r"], ["\r"]],
- ignore: true,
},
{
Name: "MultiFieldCRCRLFCRCR",
@@ -338,7 +333,6 @@ x,,,
["\r\rfield1", "field2\r"],
["\r\r", ""],
],
- ignore: true,
},
{
Name: "NonASCIICommaAndComment",
@@ -374,12 +368,11 @@ x,,,
Name: "QuotedFieldMultipleLF",
Input: '"\n\n\n\n"',
Output: [["\n\n\n\n"]],
- ignore: true,
},
{
Name: "MultipleCRLF",
Input: "\r\n\r\n\r\n\r\n",
- ignore: true,
+ Output: [],
},
/**
* The implementation may read each line in several chunks if
@@ -392,12 +385,12 @@ x,,,
"#ignore\n".repeat(10000) + "@".repeat(5000) + "," + "*".repeat(5000),
Output: [["@".repeat(5000), "*".repeat(5000)]],
Comment: "#",
- ignore: true,
+ ignore: true, // TODO(#4521)
},
{
Name: "QuoteWithTrailingCRLF",
Input: '"foo"bar"\r\n',
- Error: ErrBareQuote,
+ Error: ERR_QUOTE,
// Error: &ParseError{StartLine: 1, Line: 1, Column: 4, Err: ErrQuote},
},
{
@@ -410,58 +403,54 @@ x,,,
Name: "DoubleQuoteWithTrailingCRLF",
Input: '"foo""bar"\r\n',
Output: [[`foo"bar`]],
- ignore: true,
},
{
Name: "EvenQuotes",
Input: `""""""""`,
Output: [[`"""`]],
- ignore: true,
},
{
Name: "OddQuotes",
Input: `"""""""`,
- Error: true,
+ Error: ERR_QUOTE,
// Error:" &ParseError{StartLine: 1, Line: 1, Column: 7, Err: ErrQuote}",
- ignore: true,
},
{
Name: "LazyOddQuotes",
Input: `"""""""`,
Output: [[`"""`]],
LazyQuotes: true,
- ignore: true,
},
{
Name: "BadComma1",
Comma: "\n",
- Error: ErrInvalidDelim,
+ Error: ERR_INVALID_DELIM,
},
{
Name: "BadComma2",
Comma: "\r",
- Error: ErrInvalidDelim,
+ Error: ERR_INVALID_DELIM,
},
{
Name: "BadComma3",
Comma: '"',
- Error: ErrInvalidDelim,
+ Error: ERR_INVALID_DELIM,
},
{
Name: "BadComment1",
Comment: "\n",
- Error: ErrInvalidDelim,
+ Error: ERR_INVALID_DELIM,
},
{
Name: "BadComment2",
Comment: "\r",
- Error: ErrInvalidDelim,
+ Error: ERR_INVALID_DELIM,
},
{
Name: "BadCommaComment",
Comma: "X",
Comment: "X",
- Error: ErrInvalidDelim,
+ Error: ERR_INVALID_DELIM,
},
];
for (const t of testCases) {