From 3cbe276b4e7786def4c00f9d0bc5c00c7e945b1e Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Thu, 7 May 2026 13:57:41 +0200 Subject: [PATCH 1/2] feat: support multi-character input delimiters (#86) Extend -d/--delimiter to accept strings of 1-8 bytes instead of a single character. Common real-world separators like '||', ';;', or two spaces now work without preprocessing. - CsvReader: delimiter field changed from u8 to []const u8; added partial_delim: usize to track in-progress multi-byte matches in the streaming state machine - parseDelimiter: returns []const u8, rejects empty and >8-byte values - writeField / printRow / printHeaderRow: delimiter type updated to []const u8; quoting detection uses std.mem.indexOf instead of byte-by-byte comparison - New unit tests: 2-char (||), 3-char (;;;), partial-match false positive, quoted field containing multi-char delimiter - New integration tests 95-98: double-pipe, three-char, empty-error, too-long-error - README and man page updated to describe the 1-8 char constraint --- README.md | 5 +- build.zig | 34 ++++++++ docs/sql-pipe.1.scd | 9 ++- src/csv.zig | 191 +++++++++++++++++++++++++++++++++++++++----- src/main.zig | 80 +++++++++---------- 5 files changed, 253 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index e87aea9..b87f5bb 100644 --- a/README.md +++ b/README.md @@ -173,13 +173,16 @@ Column names with spaces work — quote them in SQL: $ cat report.csv | sql-pipe 'SELECT "first name", "last name" FROM t WHERE "dept id" = "42"' ``` -Use a custom input delimiter with `-d` / `--delimiter` (single character), or `--tsv` for tab-separated files: +Use a custom input delimiter with `-d` / `--delimiter` (1–8 characters), or `--tsv` for tab-separated files: ```sh $ cat data.psv | sql-pipe -d '|' 'SELECT * FROM t' $ cat data.tsv | sql-pipe --tsv 'SELECT * FROM t' # equivalent: $ cat data.tsv | sql-pipe --delimiter '\t' 'SELECT * FROM t' +# multi-character delimiters: +$ cat data.psv | sql-pipe -d '||' 'SELECT * FROM t' +$ cat report.txt | sql-pipe --delimiter ' ' 'SELECT * FROM t' # two spaces ``` Output results as a JSON array of objects with `--json`: diff --git a/build.zig b/build.zig index dd27511..1e497f6 100644 --- a/build.zig +++ b/build.zig @@ -977,6 +977,40 @@ pub fn build(b: *std.Build) void { test_sample_fewer_rows.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_sample_fewer_rows.step); + // Integration test 95: 2-char delimiter (||) splits fields correctly + const test_delimiter_double_pipe = b.addSystemCommand(&.{ + "bash", "-c", + \\printf 'name||age\nAlice||30\nBob||25\n' | ./zig-out/bin/sql-pipe --delimiter '||' 'SELECT name, age FROM t ORDER BY age' | diff - <(printf 'Bob,25\nAlice,30\n') + }); + test_delimiter_double_pipe.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_delimiter_double_pipe.step); + + // Integration test 96: 3-char delimiter (;;;) splits fields correctly + const test_delimiter_three_char = b.addSystemCommand(&.{ + "bash", "-c", + \\printf 'name;;;age\nAlice;;;30\nBob;;;25\n' | ./zig-out/bin/sql-pipe --delimiter ';;;' 'SELECT name, age FROM t ORDER BY age' | diff - <(printf 'Bob,25\nAlice,30\n') + }); + test_delimiter_three_char.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_delimiter_three_char.step); + + // Integration test 97: empty delimiter string exits 1 with error + const test_delimiter_empty_error = b.addSystemCommand(&.{ + "bash", "-c", + \\msg=$(printf 'a,b\n1,2\n' | ./zig-out/bin/sql-pipe -d '' 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") + \\echo "$msg" | grep -q 'EXIT:1' + }); + test_delimiter_empty_error.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_delimiter_empty_error.step); + + // Integration test 98: delimiter longer than 8 chars exits 1 with error + const test_delimiter_too_long_error = b.addSystemCommand(&.{ + "bash", "-c", + \\msg=$(printf 'a,b\n1,2\n' | ./zig-out/bin/sql-pipe -d '123456789' 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") + \\echo "$msg" | grep -q 'EXIT:1' + }); + test_delimiter_too_long_error.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_delimiter_too_long_error.step); + // Unit tests for the RFC 4180 CSV parser (src/csv.zig) const unit_tests = b.addTest(.{ .root_module = b.createModule(.{ diff --git a/docs/sql-pipe.1.scd b/docs/sql-pipe.1.scd index ae5890a..ecb88bc 100644 --- a/docs/sql-pipe.1.scd +++ b/docs/sql-pipe.1.scd @@ -22,17 +22,18 @@ DESCRIPTION behavior and treat all columns as TEXT. By default, input fields are parsed as comma-separated values. Use - *--delimiter* (or *-d*) to parse other single-character delimiters, or *--tsv* + *--delimiter* (or *-d*) to parse other delimiters (1–8 characters), or *--tsv* for tab-separated input. Exit codes follow a structured convention for scripting integration (see EXIT CODES section). OPTIONS - *-d, --delimiter* - Use a custom single-character input field delimiter instead of comma. + *-d, --delimiter* + Use a custom input field delimiter (1–8 bytes) instead of comma. Examples: *-d '|'* for pipe-separated files, *--delimiter '\\t'* for - tab-separated files. + tab-separated files, *-d '||'* for double-pipe-separated files. + Produces a usage error if the value is empty or longer than 8 bytes. *--tsv* Alias for *--delimiter '\\t'*. Parses tab-separated input. diff --git a/src/csv.zig b/src/csv.zig index 7901084..9d88322 100644 --- a/src/csv.zig +++ b/src/csv.zig @@ -3,11 +3,16 @@ //! No full-input buffering: every byte is processed exactly once. //! Supports: //! - Quoted fields enclosed in double-quotes -//! - Embedded delimiters inside quoted fields +//! - Embedded delimiters inside quoted fields (single or multi-character) //! - Escaped double-quotes ("") inside quoted fields → decoded to " //! - Embedded newlines (\n, \r\n) inside quoted fields → multi-line value //! - Both \r\n and \n record terminators (outside quoted fields) //! - Unchanged behaviour for unquoted fields +//! - Multi-character delimiters up to 8 bytes (e.g. "||", ";;", " ") +//! +//! Note: multi-character delimiter matching uses a simple byte-by-byte +//! partial-match approach. Overlapping delimiter patterns (e.g. delimiter "aa" +//! within "aaa") follow a greedy left-to-right strategy. const std = @import("std"); @@ -55,10 +60,14 @@ const State = enum { pub const CsvReader = struct { reader: *std.Io.Reader, allocator: std.mem.Allocator, - delimiter: u8, + /// Field delimiter — one to eight bytes (e.g. ",", "||", "\t"). + delimiter: []const u8, done: bool = false, + /// Number of delimiter bytes matched in the current in-progress match attempt. + /// Zero when no match is in progress. + partial_delim: usize = 0, - pub fn init(allocator: std.mem.Allocator, reader: *std.Io.Reader, delimiter: u8) CsvReader { + pub fn init(allocator: std.mem.Allocator, reader: *std.Io.Reader, delimiter: []const u8) CsvReader { return .{ .reader = reader, .allocator = allocator, .delimiter = delimiter }; } @@ -99,6 +108,11 @@ pub const CsvReader = struct { while (true) { const byte = self.reader.takeByte() catch |err| switch (err) { error.EndOfStream => { + // Flush any pending partial delimiter bytes as field content. + if (self.partial_delim > 0) { + try field.appendSlice(self.allocator, self.delimiter[0..self.partial_delim]); + self.partial_delim = 0; + } // EOF: flush whatever pending data we have. if (!has_data and fields.items.len == 0) { field.deinit(self.allocator); @@ -122,10 +136,16 @@ pub const CsvReader = struct { switch (state) { .field_start => { - if (byte == self.delimiter) { - // Empty unquoted field before delimiter. - try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); - state = .field_start; + if (byte == self.delimiter[0]) { + if (self.delimiter.len == 1) { + // Single-char delimiter: immediate match → empty field. + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + state = .field_start; + } else { + // First byte of a potential multi-char delimiter. + self.partial_delim = 1; + state = .unquoted; + } } else switch (byte) { '"' => { state = .quoted; @@ -146,9 +166,45 @@ pub const CsvReader = struct { }, .unquoted => { - if (byte == self.delimiter) { - try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); - state = .field_start; + if (self.partial_delim > 0) { + // Ongoing potential delimiter match. + if (byte == self.delimiter[self.partial_delim]) { + self.partial_delim += 1; + if (self.partial_delim == self.delimiter.len) { + // Full delimiter matched: flush field, return to field_start. + self.partial_delim = 0; + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + state = .field_start; + } + // else: keep accumulating; byte is not yet emitted to field. + } else { + // Partial match failed: emit consumed prefix as field content. + try field.appendSlice(self.allocator, self.delimiter[0..self.partial_delim]); + self.partial_delim = 0; + // Process current byte as fresh unquoted input. + if (byte == self.delimiter[0]) { + if (self.delimiter.len == 1) { + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + state = .field_start; + } else { + self.partial_delim = 1; + } + } else switch (byte) { + '\r' => {}, + '\n' => { + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + return try fields.toOwnedSlice(self.allocator); + }, + else => try field.append(self.allocator, byte), + } + } + } else if (byte == self.delimiter[0]) { + if (self.delimiter.len == 1) { + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + state = .field_start; + } else { + self.partial_delim = 1; + } } else switch (byte) { '\r' => { // Strip \r before the \n record terminator. @@ -175,10 +231,47 @@ pub const CsvReader = struct { }, .quote_saw => { - if (byte == self.delimiter) { - // Closing quote followed by field delimiter. - try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); - state = .field_start; + if (self.partial_delim > 0) { + // Ongoing delimiter match started after a closing quote. + if (byte == self.delimiter[self.partial_delim]) { + self.partial_delim += 1; + if (self.partial_delim == self.delimiter.len) { + // Full delimiter matched. + self.partial_delim = 0; + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + state = .field_start; + } + } else { + // Partial match failed: emit prefix, continue as unquoted. + try field.appendSlice(self.allocator, self.delimiter[0..self.partial_delim]); + self.partial_delim = 0; + state = .unquoted; + // Process current byte as fresh unquoted input. + if (byte == self.delimiter[0]) { + if (self.delimiter.len == 1) { + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + state = .field_start; + } else { + self.partial_delim = 1; + } + } else switch (byte) { + '\r' => {}, + '\n' => { + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + return try fields.toOwnedSlice(self.allocator); + }, + else => try field.append(self.allocator, byte), + } + } + } else if (byte == self.delimiter[0]) { + if (self.delimiter.len == 1) { + // Closing quote followed by field delimiter. + try fields.append(self.allocator, try field.toOwnedSlice(self.allocator)); + state = .field_start; + } else { + // Start of potential multi-char delimiter after closing quote. + self.partial_delim = 1; + } } else switch (byte) { '"' => { // Escaped double-quote: "" → single " @@ -219,11 +312,11 @@ pub const CsvReader = struct { /// Convenience constructor — comma delimiter. pub fn csvReader(allocator: std.mem.Allocator, reader: *std.Io.Reader) CsvReader { - return csvReaderWithDelimiter(allocator, reader, ','); + return csvReaderWithDelimiter(allocator, reader, ","); } -/// Convenience constructor with custom input delimiter. -pub fn csvReaderWithDelimiter(allocator: std.mem.Allocator, reader: *std.Io.Reader, delimiter: u8) CsvReader { +/// Convenience constructor with custom input delimiter (1–8 bytes). +pub fn csvReaderWithDelimiter(allocator: std.mem.Allocator, reader: *std.Io.Reader, delimiter: []const u8) CsvReader { return CsvReader.init(allocator, reader, delimiter); } @@ -371,7 +464,7 @@ test "entirely empty input returns null" { test "custom pipe delimiter" { const input = "a|b|c\n1|2|3\n"; var input_reader: std.Io.Reader = .fixed(input); - var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, '|'); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "|"); const r1 = (try csv.nextRecord()).?; defer csv.freeRecord(r1); @@ -391,7 +484,7 @@ test "custom pipe delimiter" { test "custom tab delimiter" { const input = "name\tage\nAlice\t30\n"; var input_reader: std.Io.Reader = .fixed(input); - var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, '\t'); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "\t"); const r1 = (try csv.nextRecord()).?; defer csv.freeRecord(r1); @@ -405,3 +498,63 @@ test "custom tab delimiter" { try std.testing.expectEqualStrings("Alice", r2[0]); try std.testing.expectEqualStrings("30", r2[1]); } + +test "2-char delimiter (||) splits fields correctly" { + const input = "a||b||c\n1||2||3\n"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r1 = (try csv.nextRecord()).?; + defer csv.freeRecord(r1); + try std.testing.expectEqual(@as(usize, 3), r1.len); + try std.testing.expectEqualStrings("a", r1[0]); + try std.testing.expectEqualStrings("b", r1[1]); + try std.testing.expectEqualStrings("c", r1[2]); + + const r2 = (try csv.nextRecord()).?; + defer csv.freeRecord(r2); + try std.testing.expectEqual(@as(usize, 3), r2.len); + try std.testing.expectEqualStrings("1", r2[0]); + try std.testing.expectEqualStrings("2", r2[1]); + try std.testing.expectEqualStrings("3", r2[2]); + + try std.testing.expectEqual(@as(?[][]u8, null), try csv.nextRecord()); +} + +test "3-char delimiter (;;;) splits fields correctly" { + const input = "foo;;;bar;;;baz\n"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, ";;;"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 3), r.len); + try std.testing.expectEqualStrings("foo", r[0]); + try std.testing.expectEqualStrings("bar", r[1]); + try std.testing.expectEqualStrings("baz", r[2]); +} + +test "multi-char delimiter: partial match bytes emitted as field content" { + // '|' alone is NOT a delimiter; only '||' is. + const input = "a|b||c\n"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 2), r.len); + try std.testing.expectEqualStrings("a|b", r[0]); + try std.testing.expectEqualStrings("c", r[1]); +} + +test "quoted field containing multi-char delimiter is preserved" { + const input = "\"a||b\"||c\n"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 2), r.len); + try std.testing.expectEqualStrings("a||b", r[0]); + try std.testing.expectEqualStrings("c", r[1]); +} diff --git a/src/main.zig b/src/main.zig index 86ebecc..8996bd2 100644 --- a/src/main.zig +++ b/src/main.zig @@ -81,8 +81,8 @@ const ParsedArgs = struct { query: []const u8, /// Infer column types from the first 100 buffered rows when true. type_inference: bool, - /// CSV field delimiter (default: ','). - delimiter: u8, + /// CSV field delimiter — 1 to 8 bytes (default: ","). + delimiter: []const u8, /// Emit column names as first output row when true (CSV output only). header: bool, /// Input format (default: csv). @@ -102,8 +102,8 @@ const ParsedArgs = struct { /// Arguments for `--columns` mode. const ColumnsArgs = struct { - /// CSV field delimiter (default: ','). - delimiter: u8, + /// CSV field delimiter — 1 to 8 bytes (default: ","). + delimiter: []const u8, /// Show inferred type alongside name when true. verbose: bool, /// Input format (default: csv). @@ -112,8 +112,8 @@ const ColumnsArgs = struct { /// Arguments for `--validate` mode. const ValidateArgs = struct { - /// CSV field delimiter (default: ','). - delimiter: u8, + /// CSV field delimiter — 1 to 8 bytes (default: ","). + delimiter: []const u8, /// Infer column types from the first 100 buffered rows when true. type_inference: bool, /// Input format (default: csv). @@ -122,8 +122,8 @@ const ValidateArgs = struct { /// Arguments for `--sample` mode. const SampleArgs = struct { - /// CSV field delimiter (default: ','). - delimiter: u8, + /// CSV field delimiter — 1 to 8 bytes (default: ","). + delimiter: []const u8, /// Input format (default: csv). input_format: InputFormat, /// Number of sample rows to print (default: 10). @@ -161,7 +161,7 @@ fn printUsage(writer: *std.Io.Writer) !void { \\runs , and prints results to stdout. \\ \\Options: - \\ -d, --delimiter Input field delimiter for CSV (default: ,) + \\ -d, --delimiter Input field delimiter for CSV: 1–8 chars (default: ,) \\ --tsv Alias for --delimiter '\t' \\ -I, --input-format Input format: csv (default), tsv, json, ndjson \\ -O, --output-format Output format: csv (default), tsv, json, ndjson @@ -207,14 +207,15 @@ fn printUsage(writer: *std.Io.Writer) !void { ); } -/// parseDelimiter(value) → u8 +/// parseDelimiter(value) → []const u8 /// Pre: value is the delimiter token provided by the user -/// Post: result is a single-byte delimiter, or '\t' when value = "\\t" -/// error.InvalidDelimiter when value is empty or has more than one char -fn parseDelimiter(value: []const u8) SqlPipeError!u8 { - if (std.mem.eql(u8, value, "\\t")) return '\t'; - if (value.len != 1) return error.InvalidDelimiter; - return value[0]; +/// Post: result is a 1–8 byte delimiter string, or "\t" when value = "\\t" +/// error.InvalidDelimiter when value is empty or longer than 8 bytes +fn parseDelimiter(value: []const u8) SqlPipeError![]const u8 { + if (std.mem.eql(u8, value, "\\t")) return "\t"; + if (value.len == 0) return error.InvalidDelimiter; + if (value.len > 8) return error.InvalidDelimiter; + return value; } /// parseInputFormat(s) → InputFormat @@ -253,7 +254,7 @@ fn parseOutputFormat(s: []const u8) SqlPipeError!OutputFormat { fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { var query: ?[]const u8 = null; var type_inference = true; - var delimiter: u8 = ','; + var delimiter: []const u8 = ","; var header = false; var input_format: InputFormat = .csv; var output_format: OutputFormat = .csv; @@ -284,7 +285,7 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { } else if (std.mem.eql(u8, arg, "--version") or std.mem.eql(u8, arg, "-V")) { return .version; } else if (std.mem.eql(u8, arg, "--tsv")) { - delimiter = '\t'; + delimiter = "\t"; } else if (std.mem.eql(u8, arg, "-d") or std.mem.eql(u8, arg, "--delimiter")) { i += 1; if (i >= args.len) return error.InvalidDelimiter; @@ -789,20 +790,20 @@ fn insertRowTyped( /// printRow(stmt, col_count, writer, delimiter) → !void /// Pre: sqlite3_step returned SQLITE_ROW for stmt /// col_count = sqlite3_column_count(stmt) > 0 -/// delimiter is the field separator character (e.g. ',' or '\t') +/// delimiter is the field separator string (e.g. "," or "\t") /// Post: one delimited line written to writer with col_count values; /// NULL cells rendered as the literal string "NULL" fn printRow( stmt: *c.sqlite3_stmt, col_count: c_int, writer: *std.Io.Writer, - delimiter: u8, + delimiter: []const u8, ) !void { // Loop invariant I: columns 0..i-1 have been written, separated by delimiter // Bounding function: col_count - i var i: c_int = 0; while (i < col_count) : (i += 1) { - if (i > 0) try writer.writeByte(delimiter); + if (i > 0) try writer.writeAll(delimiter); if (c.sqlite3_column_type(stmt, i) == c.SQLITE_NULL) { try writer.writeAll("NULL"); } else { @@ -819,19 +820,14 @@ fn printRow( /// writeField(writer, value, delimiter) → !void /// Pre: writer is a valid writer, value is a valid UTF-8 slice -/// delimiter is the field separator character (e.g. ',' or '\t') +/// delimiter is the field separator string (e.g. "," or "\t" or "||") /// Post: value is written to writer as a single delimited field: -/// if value contains the delimiter, double-quote, or newline, it is enclosed -/// in double-quotes with internal quotes escaped as "" (RFC 4180); +/// if value contains the delimiter string, double-quote, or newline, it is +/// enclosed in double-quotes with internal quotes escaped as "" (RFC 4180); /// otherwise it is written verbatim -fn writeField(writer: *std.Io.Writer, value: []const u8, delimiter: u8) !void { - var needs_quoting = false; - for (value) |ch| { - if (ch == delimiter or ch == '"' or ch == '\n' or ch == '\r') { - needs_quoting = true; - break; - } - } +fn writeField(writer: *std.Io.Writer, value: []const u8, delimiter: []const u8) !void { + const needs_quoting = std.mem.indexOf(u8, value, delimiter) != null or + std.mem.indexOfAny(u8, value, "\"\n\r") != null; if (needs_quoting) { try writer.writeByte('"'); for (value) |ch| { @@ -846,7 +842,7 @@ fn writeField(writer: *std.Io.Writer, value: []const u8, delimiter: u8) !void { /// printHeaderRow(stmt, col_count, writer, delimiter) → !void /// Pre: stmt is a prepared statement, col_count > 0 -/// delimiter is the field separator character (e.g. ',' or '\t') +/// delimiter is the field separator string (e.g. "," or "\t") /// Post: one delimited line with col_count column names written to writer; /// names are obtained from sqlite3_column_name (alias or original); /// fields are RFC 4180 quoted when they contain special characters @@ -854,13 +850,13 @@ fn printHeaderRow( stmt: *c.sqlite3_stmt, col_count: c_int, writer: *std.Io.Writer, - delimiter: u8, + delimiter: []const u8, ) !void { // Loop invariant I: columns 0..i-1 names have been written, separated by delimiter // Bounding function: col_count - i var i: c_int = 0; while (i < col_count) : (i += 1) { - if (i > 0) try writer.writeByte(delimiter); + if (i > 0) try writer.writeAll(delimiter); const name_ptr = c.sqlite3_column_name(stmt, i); if (name_ptr != null) { const name = std.mem.span(@as([*:0]const u8, @ptrCast(name_ptr))); @@ -934,7 +930,7 @@ fn execQuery( } }, .csv, .tsv => { - const out_delim: u8 = if (output_format == .tsv) '\t' else ','; + const out_delim: []const u8 = if (output_format == .tsv) "\t" else ","; // When header is requested, print column names before data rows if (header and col_count > 0) { @@ -1293,7 +1289,7 @@ fn runColumns( ) void { switch (args.input_format) { .csv, .tsv => { - const col_delim: u8 = if (args.input_format == .tsv) '\t' else args.delimiter; + const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; var stdin_buf: [4096]u8 = undefined; var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); var csv_reader = csv.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); @@ -1456,7 +1452,7 @@ fn runValidate( ) void { switch (args.input_format) { .csv, .tsv => { - const col_delim: u8 = if (args.input_format == .tsv) '\t' else args.delimiter; + const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; var stdin_buf: [4096]u8 = undefined; var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); var csv_reader = csv.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); @@ -1732,7 +1728,7 @@ fn runSample( .{}, ), .csv, .tsv => { - const col_delim: u8 = if (args.input_format == .tsv) '\t' else args.delimiter; + const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; var stdin_buf: [4096]u8 = undefined; var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); var csv_reader = csv.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); @@ -1828,7 +1824,7 @@ fn runSample( // Loop invariant I: cols[0..i] names have been written, separated by col_delim // Bounding function: cols.len - i for (cols, 0..) |col, i| { - if (i > 0) stdout_writer.writeByte(col_delim) catch + if (i > 0) stdout_writer.writeAll(col_delim) catch fatal("failed to write header", stderr_writer, .csv_error, .{}); writeField(stdout_writer, col, col_delim) catch fatal("failed to write header", stderr_writer, .csv_error, .{}); @@ -1845,7 +1841,7 @@ fn runSample( // Loop invariant I: cols[0..col_idx] fields have been written for this row // Bounding function: cols.len - col_idx while (col_idx < cols.len) : (col_idx += 1) { - if (col_idx > 0) stdout_writer.writeByte(col_delim) catch + if (col_idx > 0) stdout_writer.writeAll(col_delim) catch fatal("failed to write field separator", stderr_writer, .csv_error, .{}); const val: []const u8 = if (col_idx < row.len) row[col_idx] else ""; writeField(stdout_writer, val, col_delim) catch @@ -1885,7 +1881,7 @@ fn run( .tsv => blk: { // TSV is CSV with tab delimiter; override delimiter and reuse the CSV loader var tsv_parsed = parsed; - tsv_parsed.delimiter = '\t'; + tsv_parsed.delimiter = "\t"; break :blk loadCsvInput(allocator, io, db, tsv_parsed, stderr_writer); }, .json => blk: { From cb664d92185cac319cb8935f7de701efeb895fd0 Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Thu, 7 May 2026 14:21:36 +0200 Subject: [PATCH 2/2] test: add edge-case unit tests for multi-char delimiters - Reset partial_delim at start of nextRecord (latent correctness fix: avoids stale state if a previous call exited via a non-fatal error) - Add 6 unit tests covering: empty first field, empty last field, only-delimiter input, EOF without newline, partial delimiter at EOF treated as field content, and greedy left-to-right matching behavior --- src/csv.zig | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/csv.zig b/src/csv.zig index 9d88322..00c28cf 100644 --- a/src/csv.zig +++ b/src/csv.zig @@ -84,6 +84,8 @@ pub const CsvReader = struct { /// All returned memory must be freed with freeRecord. pub fn nextRecord(self: *CsvReader) !?[][]u8 { if (self.done) return null; + // Reset any stale partial-delimiter state from a prior error-interrupted call. + self.partial_delim = 0; var fields = std.ArrayList([]u8).empty; errdefer { @@ -558,3 +560,81 @@ test "quoted field containing multi-char delimiter is preserved" { try std.testing.expectEqualStrings("a||b", r[0]); try std.testing.expectEqualStrings("c", r[1]); } + +test "multi-char delimiter: empty first field" { + const input = "||b||c\n"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 3), r.len); + try std.testing.expectEqualStrings("", r[0]); + try std.testing.expectEqualStrings("b", r[1]); + try std.testing.expectEqualStrings("c", r[2]); +} + +test "multi-char delimiter: empty last field, no trailing newline" { + const input = "a||b||"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 3), r.len); + try std.testing.expectEqualStrings("a", r[0]); + try std.testing.expectEqualStrings("b", r[1]); + try std.testing.expectEqualStrings("", r[2]); +} + +test "multi-char delimiter: only delimiter produces two empty fields" { + const input = "||\n"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 2), r.len); + try std.testing.expectEqualStrings("", r[0]); + try std.testing.expectEqualStrings("", r[1]); +} + +test "multi-char delimiter: EOF without trailing newline" { + const input = "a||b"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 2), r.len); + try std.testing.expectEqualStrings("a", r[0]); + try std.testing.expectEqualStrings("b", r[1]); + + try std.testing.expectEqual(@as(?[][]u8, null), try csv.nextRecord()); +} + +test "multi-char delimiter: partial delimiter at EOF treated as field content" { + // '|' alone is not delimiter '||'; at EOF it becomes literal field content. + const input = "a|"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 1), r.len); + try std.testing.expectEqualStrings("a|", r[0]); +} + +test "multi-char delimiter: greedy left-to-right matching" { + // Delimiter "||" in "a|||b": greedy match finds "||" at position 1, + // leaving "|b" as the second field. + const input = "a|||b\n"; + var input_reader: std.Io.Reader = .fixed(input); + var csv = csvReaderWithDelimiter(std.testing.allocator, &input_reader, "||"); + + const r = (try csv.nextRecord()).?; + defer csv.freeRecord(r); + try std.testing.expectEqual(@as(usize, 2), r.len); + try std.testing.expectEqualStrings("a", r[0]); + try std.testing.expectEqualStrings("|b", r[1]); +}