diff --git a/packages/csv-parse/lib/api/index.js b/packages/csv-parse/lib/api/index.js
index 80ae8056..3f694d89 100644
--- a/packages/csv-parse/lib/api/index.js
+++ b/packages/csv-parse/lib/api/index.js
@@ -747,18 +747,19 @@ const transform = function (original_options = {}) {
     },
     // Helper to test if a character is trimable
     __isCharTrimable: function (buf, pos) {
-      const isTrim = (buf, pos) => {
-        const { timchars } = this.state;
-        loop1: for (let i = 0; i < timchars.length; i++) {
-          const timchar = timchars[i];
-          for (let j = 0; j < timchar.length; j++) {
-            if (timchar[j] !== buf[pos + j]) continue loop1;
-          }
-          return timchar.length;
+      const { timchars, timcharFirstBytes } = this.state;
+      // Fast bail-out: non-whitespace bytes (the common case) are rejected
+      // without scanning the full timchar list.
+      const first = buf[pos];
+      if (first === undefined || timcharFirstBytes[first] === 0) return 0;
+      loop1: for (let i = 0; i < timchars.length; i++) {
+        const timchar = timchars[i];
+        for (let j = 0; j < timchar.length; j++) {
+          if (timchar[j] !== buf[pos + j]) continue loop1;
         }
-        return 0;
-      };
-      return isTrim(buf, pos);
+        return timchar.length;
+      }
+      return 0;
     },
     __isDelimiter: function (buf, pos, chr) {
       const { delimiter, ignore_last_delimiters } = this.options;
diff --git a/packages/csv-parse/lib/api/init_state.js b/packages/csv-parse/lib/api/init_state.js
index dce8b258..0ce050c3 100644
--- a/packages/csv-parse/lib/api/init_state.js
+++ b/packages/csv-parse/lib/api/init_state.js
@@ -1,16 +1,61 @@
 import ResizeableBuffer from "../utils/ResizeableBuffer.js";
 
-// white space characters
-// https://en.wikipedia.org/wiki/Whitespace_character
-// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Character_Classes#Types
-// \f\n\r\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff
-const np = 12;
-const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal
-const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
-const space = 32;
-const tab = 9;
-
 const init_state = function (options) {
+  // ECMAScript WhiteSpace + LineTerminator codepoints, encoded under
+  // `options.encoding`. Aligns trimming with `String.prototype.trim()`.
+  // https://tc39.es/ecma262/#sec-white-space
+  // https://tc39.es/ecma262/#sec-line-terminators
+  //
+  // Code points that do not survive an encode/decode round trip under the
+  // target encoding are dropped. Node's Buffer truncates them to their low
+  // byte for `latin1`/`ascii` (U+2028 becomes 0x28, `(`), and keeping those
+  // would trim literal `(`, `)`, `/` or `_` bytes from the input.
+  // A falsy `options.encoding` falls back to UTF-8, the default `Buffer.from`
+  // applies anyway, so encoding and decoding below use the same encoding.
+  const encoding = options.encoding || "utf8";
+  const timchars = [
+    // Basic Latin
+    0x0020, // [Space](https://www.fileformat.info/info/unicode/char/0020/index.htm)
+    0x0009, // [CHARACTER TABULATION (HT)](https://www.fileformat.info/info/unicode/char/0009/index.htm)
+    0x000a, // [LINE FEED (LF)](https://www.fileformat.info/info/unicode/char/000a/index.htm)
+    0x000d, // [CARRIAGE RETURN (CR)](https://www.fileformat.info/info/unicode/char/000d/index.htm)
+    0x000c, // [FORM FEED (FF)](https://www.fileformat.info/info/unicode/char/000c/index.htm)
+    0x000b, // [LINE TABULATION (VT)](https://www.fileformat.info/info/unicode/char/000b/index.htm)
+    // Latin-1 Supplement
+    0x00a0, // [NO-BREAK SPACE (NBSP)](https://www.fileformat.info/info/unicode/char/00a0/index.htm)
+    // Ogham
+    0x1680, // [OGHAM SPACE MARK](https://www.fileformat.info/info/unicode/char/1680/index.htm)
+    // General Punctuation
+    0x2000, // [EN QUAD](https://www.fileformat.info/info/unicode/char/2000/index.htm)
+    0x2001, // [EM QUAD](https://www.fileformat.info/info/unicode/char/2001/index.htm)
+    0x2002, // [EN SPACE](https://www.fileformat.info/info/unicode/char/2002/index.htm)
+    0x2003, // [EM SPACE](https://www.fileformat.info/info/unicode/char/2003/index.htm)
+    0x2004, // [THREE-PER-EM SPACE](https://www.fileformat.info/info/unicode/char/2004/index.htm)
+    0x2005, // [FOUR-PER-EM SPACE](https://www.fileformat.info/info/unicode/char/2005/index.htm)
+    0x2006, // [SIX-PER-EM SPACE](https://www.fileformat.info/info/unicode/char/2006/index.htm)
+    0x2007, // [FIGURE SPACE](https://www.fileformat.info/info/unicode/char/2007/index.htm)
+    0x2008, // [PUNCTUATION SPACE](https://www.fileformat.info/info/unicode/char/2008/index.htm)
+    0x2009, // [THIN SPACE](https://www.fileformat.info/info/unicode/char/2009/index.htm)
+    0x200a, // [HAIR SPACE](https://www.fileformat.info/info/unicode/char/200a/index.htm)
+    0x2028, // [LINE SEPARATOR](https://www.fileformat.info/info/unicode/char/2028/index.htm)
+    0x2029, // [PARAGRAPH SEPARATOR](https://www.fileformat.info/info/unicode/char/2029/index.htm)
+    0x202f, // [NARROW NO-BREAK SPACE (NNBSP)](https://www.fileformat.info/info/unicode/char/202f/index.htm)
+    0x205f, // [MEDIUM MATHEMATICAL SPACE (MMSP)](https://www.fileformat.info/info/unicode/char/205f/index.htm)
+    0x3000, // [IDEOGRAPHIC SPACE](https://www.fileformat.info/info/unicode/char/3000/index.htm)
+    0xfeff, // [ZERO WIDTH NO-BREAK SPACE (BOM)](https://www.fileformat.info/info/unicode/char/feff/index.htm)
+  ].reduce((acc, codepoint) => {
+    const char = String.fromCharCode(codepoint);
+    const encoded = Buffer.from(char, encoding);
+    if (encoded.length === 0 || encoded.toString(encoding) !== char) {
+      return acc;
+    }
+    acc.push(encoded);
+    return acc;
+  }, []);
+  // First-byte lookup table for `__isCharTrimable`. Non-whitespace bytes
+  // (the common case) bail out in O(1) without scanning every timchar.
+  const timcharFirstBytes = new Uint8Array(256);
+  for (const t of timchars) timcharFirstBytes[t[0]] = 1;
   return {
     bomSkipped: false,
     bufBytesStart: 0,
@@ -37,6 +82,8 @@ const init_state = function (options) {
       ...options.delimiter.map((delimiter) => delimiter.length),
       // Skip if the remaining buffer can be escape sequence
       options.quote !== null ? options.quote.length : 0,
+      // Skip if the remaining buffer can be a multi-byte trim character
+      ...timchars.map((t) => t.length),
     ),
     previousBuf: undefined,
     quoting: false,
@@ -55,13 +102,8 @@ const init_state = function (options) {
     ],
     wasQuoting: false,
     wasRowDelimiter: false,
-    timchars: [
-      Buffer.from(Buffer.from([cr], "utf8").toString(), options.encoding),
-      Buffer.from(Buffer.from([nl], "utf8").toString(), options.encoding),
-      Buffer.from(Buffer.from([np], "utf8").toString(), options.encoding),
-      Buffer.from(Buffer.from([space], "utf8").toString(), options.encoding),
-      Buffer.from(Buffer.from([tab], "utf8").toString(), options.encoding),
-    ],
+    timchars: timchars,
+    timcharFirstBytes: timcharFirstBytes,
   };
 };
 
diff --git a/packages/csv-parse/test/option.trim.ts b/packages/csv-parse/test/option.trim.ts
index 353cb2ed..8975f810 100644
--- a/packages/csv-parse/test/option.trim.ts
+++ b/packages/csv-parse/test/option.trim.ts
@@ -295,4 +295,107 @@ describe("Option `trim`", function () {
       );
     });
   });
+
+  describe("unicode whitespace", function () {
+    const ws = (codepoint: number): string => String.fromCharCode(codepoint);
+
+    it("trim ideographic space U+3000", function (next) {
+      const sp = ws(0x3000);
+      parse(`${sp}a${sp},${sp}b${sp}`, { trim: true }, (err, records) => {
+        if (err) return next(err);
+        records.should.eql([["a", "b"]]);
+        next();
+      });
+    });
+
+    it("trim vertical tab U+000B", function (next) {
+      const sp = ws(0x000b);
+      parse(
+        `${sp}a${sp},${sp}b${sp}`,
+        { trim: true, record_delimiter: "|" },
+        (err, records) => {
+          if (err) return next(err);
+          records.should.eql([["a", "b"]]);
+          next();
+        },
+      );
+    });
+
+    it("trim no-break space U+00A0", function (next) {
+      const sp = ws(0x00a0);
+      parse(`${sp}a${sp},${sp}b${sp}`, { trim: true }, (err, records) => {
+        if (err) return next(err);
+        records.should.eql([["a", "b"]]);
+        next();
+      });
+    });
+
+    it("trim mixed ECMAScript whitespace at field boundaries", function (next) {
+      // U+2028 and U+2029 are excluded because they act as record delimiters.
+      const codepoints = [
+        0x00a0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006,
+        0x2007, 0x2008, 0x2009, 0x200a, 0x202f, 0x205f, 0x3000, 0xfeff,
+      ];
+      const surround = codepoints.map(ws).join("");
+      parse(
+        `${surround}field-1${surround},${surround}field-2${surround}`,
+        { trim: true },
+        (err, records) => {
+          if (err) return next(err);
+          records.should.eql([["field-1", "field-2"]]);
+          next();
+        },
+      );
+    });
+
+    it("does not trim '?' under latin1 encoding", function (next) {
+      parse(
+        Buffer.from("?a?,?b?", "latin1"),
+        { encoding: "latin1", trim: true },
+        (err, records) => {
+          if (err) return next(err);
+          records.should.eql([["?a?", "?b?"]]);
+          next();
+        },
+      );
+    });
+
+    it("keeps '(', ')', '/' and '_' under latin1 encoding", function (next) {
+      // Low bytes of U+2028, U+2029, U+202F and U+205F under latin1.
+      parse(
+        Buffer.from("(a),/b_", "latin1"),
+        { encoding: "latin1", trim: true },
+        (err, records) => {
+          if (err) return next(err);
+          records.should.eql([["(a)", "/b_"]]);
+          next();
+        },
+      );
+    });
+
+    it("trim multi-byte whitespace split across writes", function (next) {
+      const records: string[][] = [];
+      const parser = parse({ trim: true });
+      parser.on("readable", () => {
+        let d;
+        while ((d = parser.read())) {
+          records.push(d);
+        }
+      });
+      parser.on("end", () => {
+        records.should.eql([["a", "b"]]);
+        next();
+      });
+      const sp = Buffer.from(ws(0x3000), "utf8");
+      parser.write(sp.subarray(0, 1));
+      parser.write(sp.subarray(1));
+      parser.write("a");
+      parser.write(sp.subarray(0, 2));
+      parser.write(sp.subarray(2));
+      parser.write(",");
+      parser.write("b");
+      parser.write(sp);
+      parser.end();
+    });
+  });
 });