From c5bbeaa63ee460215c77342d89ad22c595ef0c91 Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Fri, 8 May 2026 12:03:20 +0200 Subject: [PATCH 1/6] feat: introduce format.zig with InputFormat, OutputFormat, and OutputWriter Extracts InputFormat and OutputFormat enums (with parse() methods) and the writeField CSV helper from main.zig into a new src/format.zig module. Introduces OutputWriter, a stateful struct that dispatches output formatting across all five formats (csv, tsv, json, ndjson, xml), eliminating the format-switch inside execQuery. Closes no issue yet; part of #145. --- src/format.zig | 276 +++++++++++++++++++++++++++++++++++++++++++++++++ src/main.zig | 213 +++++--------------------------------- 2 files changed, 303 insertions(+), 186 deletions(-) create mode 100644 src/format.zig diff --git a/src/format.zig b/src/format.zig new file mode 100644 index 0000000..fdbed0a --- /dev/null +++ b/src/format.zig @@ -0,0 +1,276 @@ +//! Format abstraction — input/output format types and the OutputWriter. +//! +//! This module owns: +//! InputFormat — supported input formats, with parse() +//! OutputFormat — supported output formats, with parse() +//! LoadOpts — common options forwarded to input-format loaders +//! WriteOpts — options forwarded to OutputWriter +//! OutputWriter — stateful writer that dispatches on OutputFormat +//! writeField — RFC 4180 CSV field writer (used by OutputWriter and --sample mode) + +const std = @import("std"); +const c = @import("c"); +const json_mod = @import("json.zig"); +const xml_mod = @import("xml.zig"); + +// ─── Input format ────────────────────────────────────── + +/// Supported input formats. +pub const InputFormat = enum { + csv, + tsv, + json, + ndjson, + xml, + + /// Parse a format name string. + /// Returns error.InvalidInputFormat when the value is unrecognised. 
+ pub fn parse(s: []const u8) error{InvalidInputFormat}!InputFormat { + if (std.mem.eql(u8, s, "csv")) return .csv; + if (std.mem.eql(u8, s, "tsv")) return .tsv; + if (std.mem.eql(u8, s, "json")) return .json; + if (std.mem.eql(u8, s, "ndjson")) return .ndjson; + if (std.mem.eql(u8, s, "xml")) return .xml; + return error.InvalidInputFormat; + } +}; + +// ─── Output format ───────────────────────────────────── + +/// Supported output formats. +pub const OutputFormat = enum { + csv, + tsv, + json, + ndjson, + xml, + + /// Parse a format name string. + /// Returns error.InvalidOutputFormat when the value is unrecognised. + pub fn parse(s: []const u8) error{InvalidOutputFormat}!OutputFormat { + if (std.mem.eql(u8, s, "csv")) return .csv; + if (std.mem.eql(u8, s, "tsv")) return .tsv; + if (std.mem.eql(u8, s, "json")) return .json; + if (std.mem.eql(u8, s, "ndjson")) return .ndjson; + if (std.mem.eql(u8, s, "xml")) return .xml; + return error.InvalidOutputFormat; + } +}; + +// ─── Load options ─────────────────────────────────────── + +/// Options forwarded to input-format loaders. +pub const LoadOpts = struct { + /// Abort if more than this many data rows are read; null = unlimited. + max_rows: ?usize = null, + /// CSV/TSV field delimiter (1–8 bytes). + delimiter: []const u8 = ",", + /// Infer INTEGER/REAL column types from the first 100 rows (CSV/TSV only). + type_inference: bool = true, + /// Root element to navigate to for XML input; null = actual document root. + xml_root: ?[]const u8 = null, + /// Row tag filter for XML input; null = any direct child element. + xml_row: ?[]const u8 = null, +}; + +// ─── Write options ────────────────────────────────────── + +/// Options forwarded to OutputWriter. +pub const WriteOpts = struct { + /// Emit column names as the first row (CSV/TSV output only). + header: bool = false, + /// Root element name for XML output. + xml_root: []const u8 = "results", + /// Row element name for XML output. 
+ xml_row: []const u8 = "row", +}; + +// ─── Output writer ────────────────────────────────────── + +/// Stateful writer that formats SQLite result rows in any supported output format. +/// +/// Usage: +/// var w = OutputWriter.init(format, opts); +/// defer w.deinit(allocator); +/// try w.begin(allocator, stmt, col_count, writer); +/// while (sqlite3_step(stmt) == SQLITE_ROW) try w.writeRow(stmt, writer); +/// try w.end(writer); +pub const OutputWriter = struct { + format: OutputFormat, + opts: WriteOpts, + /// Set to false after the first writeRow call; controls JSON comma placement. + first_row: bool, + /// Slice of column-name pointers borrowed from SQLite (valid until stmt is finalized). + /// Allocated in begin(); freed in deinit(). + col_names: []const [*:0]const u8, + col_count: c_int, + + /// Create a new OutputWriter. Call begin() before the first writeRow(). + pub fn init(format: OutputFormat, opts: WriteOpts) OutputWriter { + return .{ + .format = format, + .opts = opts, + .first_row = true, + .col_names = &.{}, + .col_count = 0, + }; + } + + /// Release any memory allocated during begin(). + /// Safe to call even when begin() was never called. + pub fn deinit(self: *OutputWriter, allocator: std.mem.Allocator) void { + if (self.col_names.len > 0) { + allocator.free(self.col_names); + } + self.* = undefined; + } + + /// Write any format preamble and collect column metadata. + /// + /// JSON: writes '[' + /// XML: writes the XML declaration and opening root element + /// CSV/TSV: writes an optional header row (when opts.header = true) + /// + /// Pre: stmt is a valid prepared statement; col_count = sqlite3_column_count(stmt) + pub fn begin( + self: *OutputWriter, + allocator: std.mem.Allocator, + stmt: *c.sqlite3_stmt, + col_count: c_int, + writer: *std.Io.Writer, + ) !void { + self.col_count = col_count; + + // Collect column-name pointers for formats that need them per row. 
+ switch (self.format) { + .json, .ndjson, .xml => { + const names = try allocator.alloc([*:0]const u8, @intCast(col_count)); + var i: c_int = 0; + while (i < col_count) : (i += 1) { + names[@intCast(i)] = c.sqlite3_column_name(stmt, i); + } + self.col_names = names; + }, + .csv, .tsv => { + if (self.opts.header and col_count > 0) + try csvPrintHeaderRow(stmt, col_count, writer, self.csvDelimiter()); + }, + } + + // Write format-specific preamble. + switch (self.format) { + .json => try writer.writeByte('['), + .xml => try xml_mod.writeXmlHeader(writer, self.opts.xml_root), + else => {}, + } + } + + /// Write the current SQLITE_ROW to writer. + /// + /// Pre: sqlite3_step(stmt) just returned SQLITE_ROW; begin() has been called + pub fn writeRow( + self: *OutputWriter, + stmt: *c.sqlite3_stmt, + writer: *std.Io.Writer, + ) !void { + switch (self.format) { + .json => { + try json_mod.printJsonRow(stmt, self.col_count, self.col_names, writer, self.first_row); + self.first_row = false; + }, + .ndjson => try json_mod.printNdjsonRow(stmt, self.col_count, self.col_names, writer), + .csv, .tsv => try csvPrintRow(stmt, self.col_count, writer, self.csvDelimiter()), + .xml => try xml_mod.writeXmlRow( + stmt, + self.col_count, + self.col_names, + writer, + self.opts.xml_row, + ), + } + } + + /// Write any format epilogue. + /// + /// JSON: writes ']\n' + /// XML: writes the closing root element + pub fn end(self: *OutputWriter, writer: *std.Io.Writer) !void { + switch (self.format) { + .json => try writer.writeAll("]\n"), + .xml => try xml_mod.writeXmlFooter(writer, self.opts.xml_root), + else => {}, + } + } + + fn csvDelimiter(self: OutputWriter) []const u8 { + return if (self.format == .tsv) "\t" else ","; + } +}; + +// ── CSV output helpers ───────────────────────────────────────────────────────── + +/// Write a single CSV/TSV field with RFC 4180 quoting when necessary. 
+/// +/// Pre: value is a valid UTF-8 slice; delimiter is the field separator string +/// Post: if value contains delimiter, '"', '\n', or '\r', it is enclosed in +/// double-quotes with internal double-quotes doubled; otherwise written verbatim +pub fn writeField(writer: *std.Io.Writer, value: []const u8, delimiter: []const u8) !void { + const needs_quoting = std.mem.indexOf(u8, value, delimiter) != null or + std.mem.indexOfAny(u8, value, "\"\n\r") != null; + if (needs_quoting) { + try writer.writeByte('"'); + for (value) |ch| { + if (ch == '"') try writer.writeByte('"'); + try writer.writeByte(ch); + } + try writer.writeByte('"'); + } else { + try writer.writeAll(value); + } +} + +/// Write one delimited output row from the current SQLITE_ROW. +fn csvPrintRow( + stmt: *c.sqlite3_stmt, + col_count: c_int, + writer: *std.Io.Writer, + delimiter: []const u8, +) !void { + // Loop invariant I: columns 0..i-1 have been written, separated by delimiter + // Bounding function: col_count - i + var i: c_int = 0; + while (i < col_count) : (i += 1) { + if (i > 0) try writer.writeAll(delimiter); + if (c.sqlite3_column_type(stmt, i) == c.SQLITE_NULL) { + try writer.writeAll("NULL"); + } else { + const ptr = c.sqlite3_column_text(stmt, i); + if (ptr != null) { + try writeField(writer, std.mem.span(@as([*:0]const u8, @ptrCast(ptr))), delimiter); + } else { + try writer.writeAll("NULL"); + } + } + } + try writer.writeByte('\n'); +} + +/// Write a header row with column names from the prepared statement. 
+fn csvPrintHeaderRow( + stmt: *c.sqlite3_stmt, + col_count: c_int, + writer: *std.Io.Writer, + delimiter: []const u8, +) !void { + // Loop invariant I: columns 0..i-1 names have been written, separated by delimiter + // Bounding function: col_count - i + var i: c_int = 0; + while (i < col_count) : (i += 1) { + if (i > 0) try writer.writeAll(delimiter); + const name_ptr = c.sqlite3_column_name(stmt, i); + if (name_ptr != null) { + try writeField(writer, std.mem.span(@as([*:0]const u8, @ptrCast(name_ptr))), delimiter); + } + } + try writer.writeByte('\n'); +} diff --git a/src/main.zig b/src/main.zig index 086d36e..cf6aba2 100644 --- a/src/main.zig +++ b/src/main.zig @@ -3,6 +3,7 @@ const c = @import("c"); const csv = @import("csv.zig"); const json = @import("json.zig"); const xml = @import("xml.zig"); +const format = @import("format.zig"); const build_options = @import("build_options"); const VERSION: []const u8 = build_options.version; @@ -72,11 +73,11 @@ const ExitCode = enum(u8) { sql_error = 3, }; -/// Supported input formats. -const InputFormat = enum { csv, tsv, json, ndjson, xml }; +/// Supported input formats (canonical definition lives in format.zig). +const InputFormat = format.InputFormat; -/// Supported output formats. -const OutputFormat = enum { csv, tsv, json, ndjson, xml }; +/// Supported output formats (canonical definition lives in format.zig). +const OutputFormat = format.OutputFormat; /// Parsed command-line arguments. 
const ParsedArgs = struct { @@ -239,32 +240,6 @@ fn parseDelimiter(value: []const u8) SqlPipeError![]const u8 { return value; } -/// parseInputFormat(s) → InputFormat -/// Pre: s is the format string provided by the user -/// Post: result is the matching InputFormat -/// error.InvalidInputFormat when s is not "csv", "tsv", "json", or "ndjson" -fn parseInputFormat(s: []const u8) SqlPipeError!InputFormat { - if (std.mem.eql(u8, s, "csv")) return .csv; - if (std.mem.eql(u8, s, "tsv")) return .tsv; - if (std.mem.eql(u8, s, "json")) return .json; - if (std.mem.eql(u8, s, "ndjson")) return .ndjson; - if (std.mem.eql(u8, s, "xml")) return .xml; - return error.InvalidInputFormat; -} - -/// parseOutputFormat(s) → OutputFormat -/// Pre: s is the format string provided by the user -/// Post: result is the matching OutputFormat -/// error.InvalidOutputFormat when s is not "csv", "tsv", "json", or "ndjson" -fn parseOutputFormat(s: []const u8) SqlPipeError!OutputFormat { - if (std.mem.eql(u8, s, "csv")) return .csv; - if (std.mem.eql(u8, s, "tsv")) return .tsv; - if (std.mem.eql(u8, s, "json")) return .json; - if (std.mem.eql(u8, s, "ndjson")) return .ndjson; - if (std.mem.eql(u8, s, "xml")) return .xml; - return error.InvalidOutputFormat; -} - /// isValidXmlName(s) → bool /// /// Returns true iff s is a valid XML Name: @@ -350,19 +325,19 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { } else if (std.mem.eql(u8, arg, "-I") or std.mem.eql(u8, arg, "--input-format")) { i += 1; if (i >= args.len) return error.InvalidInputFormat; - input_format = try parseInputFormat(args[i]); + input_format = InputFormat.parse(args[i]) catch return error.InvalidInputFormat; } else if (std.mem.startsWith(u8, arg, "--input-format=")) { - input_format = try parseInputFormat(arg["--input-format=".len..]); + input_format = InputFormat.parse(arg["--input-format=".len..]) catch return error.InvalidInputFormat; } else if (std.mem.startsWith(u8, arg, "-I=")) { - input_format = try 
parseInputFormat(arg["-I=".len..]); + input_format = InputFormat.parse(arg["-I=".len..]) catch return error.InvalidInputFormat; } else if (std.mem.eql(u8, arg, "-O") or std.mem.eql(u8, arg, "--output-format")) { i += 1; if (i >= args.len) return error.InvalidOutputFormat; - output_format = try parseOutputFormat(args[i]); + output_format = OutputFormat.parse(args[i]) catch return error.InvalidOutputFormat; } else if (std.mem.startsWith(u8, arg, "--output-format=")) { - output_format = try parseOutputFormat(arg["--output-format=".len..]); + output_format = OutputFormat.parse(arg["--output-format=".len..]) catch return error.InvalidOutputFormat; } else if (std.mem.startsWith(u8, arg, "-O=")) { - output_format = try parseOutputFormat(arg["-O=".len..]); + output_format = OutputFormat.parse(arg["-O=".len..]) catch return error.InvalidOutputFormat; } else if (std.mem.eql(u8, arg, "--max-rows")) { i += 1; if (i >= args.len) return error.InvalidMaxRows; @@ -862,94 +837,12 @@ fn insertRowTyped( if (c.sqlite3_step(stmt) != c.SQLITE_DONE) return error.StepFailed; } -/// printRow(stmt, col_count, writer, delimiter) → !void -/// Pre: sqlite3_step returned SQLITE_ROW for stmt -/// col_count = sqlite3_column_count(stmt) > 0 -/// delimiter is the field separator string (e.g. 
"," or "\t") -/// Post: one delimited line written to writer with col_count values; -/// NULL cells rendered as the literal string "NULL" -fn printRow( - stmt: *c.sqlite3_stmt, - col_count: c_int, - writer: *std.Io.Writer, - delimiter: []const u8, -) !void { - // Loop invariant I: columns 0..i-1 have been written, separated by delimiter - // Bounding function: col_count - i - var i: c_int = 0; - while (i < col_count) : (i += 1) { - if (i > 0) try writer.writeAll(delimiter); - if (c.sqlite3_column_type(stmt, i) == c.SQLITE_NULL) { - try writer.writeAll("NULL"); - } else { - const ptr = c.sqlite3_column_text(stmt, i); - if (ptr != null) { - try writeField(writer, std.mem.span(@as([*:0]const u8, @ptrCast(ptr))), delimiter); - } else { - try writer.writeAll("NULL"); - } - } - } - try writer.writeByte('\n'); -} - -/// writeField(writer, value, delimiter) → !void -/// Pre: writer is a valid writer, value is a valid UTF-8 slice -/// delimiter is the field separator string (e.g. "," or "\t" or "||") -/// Post: value is written to writer as a single delimited field: -/// if value contains the delimiter string, double-quote, or newline, it is -/// enclosed in double-quotes with internal quotes escaped as "" (RFC 4180); -/// otherwise it is written verbatim -fn writeField(writer: *std.Io.Writer, value: []const u8, delimiter: []const u8) !void { - const needs_quoting = std.mem.indexOf(u8, value, delimiter) != null or - std.mem.indexOfAny(u8, value, "\"\n\r") != null; - if (needs_quoting) { - try writer.writeByte('"'); - for (value) |ch| { - if (ch == '"') try writer.writeByte('"'); - try writer.writeByte(ch); - } - try writer.writeByte('"'); - } else { - try writer.writeAll(value); - } -} - -/// printHeaderRow(stmt, col_count, writer, delimiter) → !void -/// Pre: stmt is a prepared statement, col_count > 0 -/// delimiter is the field separator string (e.g. 
"," or "\t") -/// Post: one delimited line with col_count column names written to writer; -/// names are obtained from sqlite3_column_name (alias or original); -/// fields are RFC 4180 quoted when they contain special characters -fn printHeaderRow( - stmt: *c.sqlite3_stmt, - col_count: c_int, - writer: *std.Io.Writer, - delimiter: []const u8, -) !void { - // Loop invariant I: columns 0..i-1 names have been written, separated by delimiter - // Bounding function: col_count - i - var i: c_int = 0; - while (i < col_count) : (i += 1) { - if (i > 0) try writer.writeAll(delimiter); - const name_ptr = c.sqlite3_column_name(stmt, i); - if (name_ptr != null) { - const name = std.mem.span(@as([*:0]const u8, @ptrCast(name_ptr))); - try writeField(writer, name, delimiter); - } - } - try writer.writeByte('\n'); -} - /// execQuery(db, query, allocator, writer, header, output_format) → !void /// Pre: db is open with table `t` populated /// query is a valid SQL string (not null-terminated) /// allocator is valid /// when output_format = .json or .ndjson, header must not be set (caller's responsibility) -/// Post: if output_format = .json, results are written as a JSON array of objects -/// if output_format = .ndjson, results are written as one JSON object per line -/// if output_format = .csv or .tsv, results are written as delimited text; -/// when header = true, column names are written as the first row +/// Post: results are written to writer in the requested output format /// error.PrepareQueryFailed when sqlite3_prepare_v2 returns non-SQLITE_OK /// propagates any writer I/O error fn execQuery( @@ -961,7 +854,7 @@ fn execQuery( output_format: OutputFormat, xml_root: []const u8, xml_row: []const u8, -) (SqlPipeError || std.mem.Allocator.Error || std.Io.Writer.Error)!void { +) (SqlPipeError || std.mem.Allocator.Error || error{WriteFailed})!void { const query_z = try allocator.dupeZ(u8, query); defer allocator.free(query_z); @@ -972,72 +865,20 @@ fn execQuery( const col_count = 
c.sqlite3_column_count(stmt); - switch (output_format) { - .json => { - // Collect column names before stepping (sqlite3_column_name is valid before step) - var col_names = try allocator.alloc([*:0]const u8, @intCast(col_count)); - defer allocator.free(col_names); - var ci: c_int = 0; - while (ci < col_count) : (ci += 1) { - col_names[@intCast(ci)] = c.sqlite3_column_name(stmt, ci); - } - - try writer.writeByte('['); - var first = true; - // Loop invariant I: all SQLITE_ROW results returned so far have been printed as JSON objects - // Bounding function: number of remaining rows in the result set (finite) - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - try json.printJsonRow(stmt.?, col_count, col_names, writer, first); - first = false; - } - try writer.writeAll("]\n"); - }, - .ndjson => { - // Collect column names before stepping - var col_names = try allocator.alloc([*:0]const u8, @intCast(col_count)); - defer allocator.free(col_names); - var ci: c_int = 0; - while (ci < col_count) : (ci += 1) { - col_names[@intCast(ci)] = c.sqlite3_column_name(stmt, ci); - } - // Loop invariant I: all SQLITE_ROW results returned so far have been printed as NDJSON lines - // Bounding function: number of remaining rows in the result set (finite) - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - try json.printNdjsonRow(stmt.?, col_count, col_names, writer); - } - }, - .csv, .tsv => { - const out_delim: []const u8 = if (output_format == .tsv) "\t" else ","; - - // When header is requested, print column names before data rows - if (header and col_count > 0) { - try printHeaderRow(stmt.?, col_count, writer, out_delim); - } - - // Loop invariant I: all SQLITE_ROW results returned so far have been printed - // Bounding function: number of remaining rows in the result set (finite) - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - try printRow(stmt.?, col_count, writer, out_delim); - } - }, - .xml => { - // Collect column names before stepping - var col_names = try 
allocator.alloc([*:0]const u8, @intCast(col_count)); - defer allocator.free(col_names); - var ci: c_int = 0; - while (ci < col_count) : (ci += 1) { - col_names[@intCast(ci)] = c.sqlite3_column_name(stmt, ci); - } + var out_writer = format.OutputWriter.init(output_format, .{ + .header = header, + .xml_root = xml_root, + .xml_row = xml_row, + }); + defer out_writer.deinit(allocator); - try xml.writeXmlHeader(writer, xml_root); - // Loop invariant I: all SQLITE_ROW results returned so far have been written as XML rows - // Bounding function: number of remaining rows in the result set (finite) - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - try xml.writeXmlRow(stmt.?, col_count, col_names, writer, xml_row); - } - try xml.writeXmlFooter(writer, xml_root); - }, + try out_writer.begin(allocator, stmt.?, col_count, writer); + // Loop invariant I: all SQLITE_ROW results returned so far have been written + // Bounding function: number of remaining rows in the result set (finite) + while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { + try out_writer.writeRow(stmt.?, writer); } + try out_writer.end(writer); } // ─── SQL error context helpers ──────────────────────── @@ -1972,7 +1813,7 @@ fn runSample( for (cols, 0..) 
|col, i| { if (i > 0) stdout_writer.writeAll(col_delim) catch fatal("failed to write header", stderr_writer, .csv_error, .{}); - writeField(stdout_writer, col, col_delim) catch + format.writeField(stdout_writer, col, col_delim) catch fatal("failed to write header", stderr_writer, .csv_error, .{}); } stdout_writer.writeByte('\n') catch @@ -1990,7 +1831,7 @@ fn runSample( if (col_idx > 0) stdout_writer.writeAll(col_delim) catch fatal("failed to write field separator", stderr_writer, .csv_error, .{}); const val: []const u8 = if (col_idx < row.len) row[col_idx] else ""; - writeField(stdout_writer, val, col_delim) catch + format.writeField(stdout_writer, val, col_delim) catch fatal("failed to write field", stderr_writer, .csv_error, .{}); } stdout_writer.writeByte('\n') catch From ecfd50065e3bfdedb8459186bc1d41b92fcf627f Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Fri, 8 May 2026 12:48:05 +0200 Subject: [PATCH 2/6] refactor: extract CLI argument types and parseArgs into args.zig --- src/args.zig | 439 +++++++++++++++++++++++++++++++++++++++++++++ src/main.zig | 496 +++------------------------------------------------ 2 files changed, 461 insertions(+), 474 deletions(-) create mode 100644 src/args.zig diff --git a/src/args.zig b/src/args.zig new file mode 100644 index 0000000..c8e2c47 --- /dev/null +++ b/src/args.zig @@ -0,0 +1,439 @@ +//! CLI argument types and parser for sql-pipe. 
+ +const std = @import("std"); +const format = @import("format.zig"); + +const InputFormat = format.InputFormat; +const OutputFormat = format.OutputFormat; + +pub const SqlPipeError = error{ + MissingQuery, + InvalidDelimiter, + IncompatibleFlags, + SilentVerboseConflict, + ColumnsWithQuery, + ValidateWithQuery, + ValidateWithColumns, + OutputWithValidate, + InvalidMaxRows, + InvalidInputFormat, + InvalidOutputFormat, + MissingXmlFlagValue, + InvalidXmlName, + OpenDbFailed, + EmptyInput, + EmptyColumnName, + NoColumns, + CreateTableFailed, + BeginTransactionFailed, + PrepareInsertFailed, + BindFailed, + StepFailed, + PrepareQueryFailed, + InvalidOutputPath, + OutputWithColumns, + SampleWithQuery, + SampleWithJson, + SampleWithColumns, + SampleWithValidate, + SampleWithOutput, + InvalidSampleCount, +}; + +pub const ParsedArgs = struct { + /// SQL query to execute against table `t`. + query: []const u8, + /// Infer column types from the first 100 buffered rows when true. + type_inference: bool, + /// CSV field delimiter — 1 to 8 bytes (default: ","). + delimiter: []const u8, + /// Emit column names as first output row when true (CSV output only). + header: bool, + /// Input format (default: csv). + input_format: InputFormat, + /// Output format (default: csv). + output_format: OutputFormat, + /// Abort with exit 1 when more than this many data rows are read; null = unlimited. + max_rows: ?usize, + /// Print "Loaded rows" to stderr after all rows are inserted when true. + /// When false, the message is still shown automatically when stderr is a TTY. + verbose: bool, + /// Suppress "Loaded rows" unconditionally. + silent: bool, + /// Write results to this file path instead of stdout; null = write to stdout. + output: ?[]const u8, + /// Root element name for XML output (default: "results"). + xml_root: []const u8, + /// Row element name for XML output (default: "row"). 
+ xml_row: []const u8, + /// Root element to navigate to for XML input; null = use actual document root. + xml_root_input: ?[]const u8, + /// Row tag filter for XML input; null = accept any direct child element as a row. + xml_row_input: ?[]const u8, +}; + +pub const ColumnsArgs = struct { + /// CSV field delimiter — 1 to 8 bytes (default: ","). + delimiter: []const u8, + /// Show inferred type alongside name when true. + verbose: bool, + /// Input format (default: csv). + input_format: InputFormat, + /// Root element to navigate to for XML input; null = use actual document root. + xml_root_input: ?[]const u8, + /// Row tag filter for XML input; null = accept any direct child element as a row. + xml_row_input: ?[]const u8, +}; + +pub const ValidateArgs = struct { + /// CSV field delimiter — 1 to 8 bytes (default: ","). + delimiter: []const u8, + /// Infer column types from the first 100 buffered rows when true. + type_inference: bool, + /// Input format (default: csv). + input_format: InputFormat, + /// Root element to navigate to for XML input; null = use actual document root. + xml_root_input: ?[]const u8, + /// Row tag filter for XML input; null = accept any direct child element as a row. + xml_row_input: ?[]const u8, +}; + +pub const SampleArgs = struct { + /// CSV field delimiter — 1 to 8 bytes (default: ","). + delimiter: []const u8, + /// Input format (default: csv). + input_format: InputFormat, + /// Number of sample rows to print (default: 10). + n: usize, + /// Infer column types from buffered rows when true; show all TEXT when false. + type_inference: bool, +}; + +pub const ArgsResult = union(enum) { + /// Normal execution: run the query. + parsed: ParsedArgs, + /// User requested --help / -h. + help, + /// User requested --version / -V. + version, + /// User requested --columns: list column names and exit. + columns: ColumnsArgs, + /// User requested --validate: parse input and print summary. 
+ validate: ValidateArgs, + /// User requested --sample: print schema + first n rows and exit. + sample: SampleArgs, +}; + +pub fn printUsage(writer: *std.Io.Writer) !void { + try writer.writeAll( + \\Usage: sql-pipe [OPTIONS] + \\ + \\Reads input from stdin, loads it into an in-memory SQLite table `t`, + \\runs , and prints results to stdout. + \\ + \\Options: + \\ -d, --delimiter Input field delimiter for CSV: 1–8 chars (default: ,) + \\ --tsv Alias for --delimiter '\t' + \\ -I, --input-format Input format: csv (default), tsv, json, ndjson, xml + \\ -O, --output-format Output format: csv (default), tsv, json, ndjson, xml + \\ --json Alias for --output-format json + \\ --no-type-inference Treat all columns as TEXT (CSV input only) + \\ -H, --header Print column names as the first output row (CSV/TSV output only) + \\ --max-rows Stop if more than data rows are read (exit 1) + \\ -v, --verbose Force row count to stderr (shown automatically on TTY) + \\ With --columns: show inferred type per column + \\ -s, --silent Suppress row count output unconditionally + \\ Cannot be combined with -v/--verbose + \\ --validate Parse the entire input and print a summary to stdout + \\ (OK: rows, columns ( , ...)) + \\ Exit 0 on success, exit 2 on parse error. No query required. + \\ Compatible with --delimiter, --tsv, --no-type-inference, -I. + \\ --columns List column names from input header (one per line) and exit + \\ Combine with -v/--verbose to include inferred types + \\ Cannot be combined with --output or a query argument + \\ --sample [] Print schema to stderr and first rows to stdout (default: 10) + \\ Schema lists column names and inferred types, prefixed with # + \\ Implies --header. Compatible with --delimiter and --tsv. + \\ Incompatible with --json and with a query argument. 
+ \\ --output Write results to file instead of stdout + \\ --xml-root Root element name for XML I/O (default: results) + \\ --xml-row Row element name for XML I/O (default: row) + \\ -h, --help Show this help message and exit + \\ -V, --version Show version and exit + \\ + \\Exit codes: + \\ 0 Success + \\ 1 Usage error (missing query, bad arguments) + \\ 2 Input parse error + \\ 3 SQL error + \\ + \\Examples: + \\ echo 'name,age\nAlice,30' | sql-pipe 'SELECT * FROM t' + \\ cat data.tsv | sql-pipe --tsv 'SELECT * FROM t' + \\ cat data.psv | sql-pipe -d '|' 'SELECT * FROM t' + \\ cat data.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region' + \\ cat data.csv | sql-pipe --output-format json 'SELECT * FROM t' + \\ cat data.json | sql-pipe --input-format json 'SELECT * FROM t' + \\ cat data.ndjson | sql-pipe -I ndjson -O ndjson 'SELECT name FROM t WHERE age > 18' + \\ cat data.csv | sql-pipe --sample 5 + \\ + ); +} + +pub fn parseDelimiter(value: []const u8) SqlPipeError![]const u8 { + if (std.mem.eql(u8, value, "\\t")) return "\t"; + if (value.len == 0) return error.InvalidDelimiter; + if (value.len > 8) return error.InvalidDelimiter; + return value; +} + +pub fn isValidXmlName(s: []const u8) bool { + if (s.len == 0) return false; + switch (s[0]) { + 'a'...'z', 'A'...'Z', '_', ':' => {}, + else => return false, + } + for (s[1..]) |ch| { + switch (ch) { + 'a'...'z', 'A'...'Z', '0'...'9', '-', '.', '_', ':' => {}, + else => return false, + } + } + return true; +} + +pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { + var query: ?[]const u8 = null; + var type_inference = true; + var delimiter: []const u8 = ","; + var header = false; + var input_format: InputFormat = .csv; + var output_format: OutputFormat = .csv; + + var max_rows: ?usize = null; + var verbose = false; + var silent = false; + var list_columns = false; + var validate = false; + var output: ?[]const u8 = null; + var xml_root: []const u8 = "results"; + var xml_row: []const 
u8 = "row"; + var xml_root_input: ?[]const u8 = null; + var xml_row_input: ?[]const u8 = null; + var sample_mode = false; + var sample_n: usize = 10; + + // Loop invariant I: all args[1..i] have been processed; + // query holds the first non-flag argument seen, or null; + // type_inference reflects the presence of --no-type-inference; + // delimiter reflects -d/--delimiter/--tsv if present; + // header reflects the presence of --header/-H; + // output_format reflects the last --output-format/--json flag seen; + // input_format reflects the last --input-format flag seen; + // max_rows reflects the presence of --max-rows + // Bounding function: args.len - i + var i: usize = 1; + while (i < args.len) : (i += 1) { + const arg = args[i]; + if (std.mem.eql(u8, arg, "--help") or std.mem.eql(u8, arg, "-h")) { + return .help; + } else if (std.mem.eql(u8, arg, "--version") or std.mem.eql(u8, arg, "-V")) { + return .version; + } else if (std.mem.eql(u8, arg, "--tsv")) { + delimiter = "\t"; + } else if (std.mem.eql(u8, arg, "-d") or std.mem.eql(u8, arg, "--delimiter")) { + i += 1; + if (i >= args.len) return error.InvalidDelimiter; + delimiter = try parseDelimiter(args[i]); + } else if (std.mem.startsWith(u8, arg, "--delimiter=")) { + delimiter = try parseDelimiter(arg["--delimiter=".len..]); + } else if (std.mem.startsWith(u8, arg, "-d=")) { + delimiter = try parseDelimiter(arg["-d=".len..]); + } else if (std.mem.eql(u8, arg, "--no-type-inference")) { + type_inference = false; + } else if (std.mem.eql(u8, arg, "--header") or std.mem.eql(u8, arg, "-H")) { + header = true; + } else if (std.mem.eql(u8, arg, "--json")) { + output_format = .json; + } else if (std.mem.eql(u8, arg, "-I") or std.mem.eql(u8, arg, "--input-format")) { + i += 1; + if (i >= args.len) return error.InvalidInputFormat; + input_format = InputFormat.parse(args[i]) catch return error.InvalidInputFormat; + } else if (std.mem.startsWith(u8, arg, "--input-format=")) { + input_format = 
InputFormat.parse(arg["--input-format=".len..]) catch return error.InvalidInputFormat; + } else if (std.mem.startsWith(u8, arg, "-I=")) { + input_format = InputFormat.parse(arg["-I=".len..]) catch return error.InvalidInputFormat; + } else if (std.mem.eql(u8, arg, "-O") or std.mem.eql(u8, arg, "--output-format")) { + i += 1; + if (i >= args.len) return error.InvalidOutputFormat; + output_format = OutputFormat.parse(args[i]) catch return error.InvalidOutputFormat; + } else if (std.mem.startsWith(u8, arg, "--output-format=")) { + output_format = OutputFormat.parse(arg["--output-format=".len..]) catch return error.InvalidOutputFormat; + } else if (std.mem.startsWith(u8, arg, "-O=")) { + output_format = OutputFormat.parse(arg["-O=".len..]) catch return error.InvalidOutputFormat; + } else if (std.mem.eql(u8, arg, "--max-rows")) { + i += 1; + if (i >= args.len) return error.InvalidMaxRows; + max_rows = std.fmt.parseUnsigned(usize, args[i], 10) catch return error.InvalidMaxRows; + if (max_rows.? == 0) return error.InvalidMaxRows; + } else if (std.mem.startsWith(u8, arg, "--max-rows=")) { + max_rows = std.fmt.parseUnsigned(usize, arg["--max-rows=".len..], 10) catch return error.InvalidMaxRows; + if (max_rows.? 
== 0) return error.InvalidMaxRows; + } else if (std.mem.eql(u8, arg, "--verbose") or std.mem.eql(u8, arg, "-v")) { + verbose = true; + } else if (std.mem.eql(u8, arg, "--silent") or std.mem.eql(u8, arg, "-s")) { + silent = true; + } else if (std.mem.eql(u8, arg, "--columns")) { + list_columns = true; + } else if (std.mem.eql(u8, arg, "--validate")) { + validate = true; + } else if (std.mem.eql(u8, arg, "--sample")) { + sample_mode = true; + // Peek at next arg: if it is a positive integer, consume it as the sample count + if (i + 1 < args.len) { + const next = args[i + 1]; + if (next.len > 0 and next[0] != '-') { + if (std.fmt.parseUnsigned(usize, next, 10)) |n| { + if (n == 0) return error.InvalidSampleCount; + sample_n = n; + i += 1; + } else |_| { + // Not a number — keep default (10) + } + } + } + } else if (std.mem.startsWith(u8, arg, "--sample=")) { + const val = arg["--sample=".len..]; + const n = std.fmt.parseUnsigned(usize, val, 10) catch return error.InvalidSampleCount; + if (n == 0) return error.InvalidSampleCount; + sample_n = n; + sample_mode = true; + } else if (std.mem.eql(u8, arg, "--output")) { + i += 1; + if (i >= args.len) return error.InvalidOutputPath; + const trimmed = std.mem.trim(u8, args[i], " \t"); + if (trimmed.len == 0) return error.InvalidOutputPath; + output = trimmed; + } else if (std.mem.startsWith(u8, arg, "--output=")) { + const trimmed = std.mem.trim(u8, arg["--output=".len..], " \t"); + if (trimmed.len == 0) return error.InvalidOutputPath; + output = trimmed; + } else if (std.mem.eql(u8, arg, "--xml-root")) { + i += 1; + if (i >= args.len) return error.MissingXmlFlagValue; + xml_root = args[i]; + xml_root_input = args[i]; + } else if (std.mem.startsWith(u8, arg, "--xml-root=")) { + xml_root = arg["--xml-root=".len..]; + xml_root_input = arg["--xml-root=".len..]; + } else if (std.mem.eql(u8, arg, "--xml-row")) { + i += 1; + if (i >= args.len) return error.MissingXmlFlagValue; + xml_row = args[i]; + xml_row_input = args[i]; + } 
else if (std.mem.startsWith(u8, arg, "--xml-row=")) { + xml_row = arg["--xml-row=".len..]; + xml_row_input = arg["--xml-row=".len..]; + } else { + if (query == null) query = arg; + } + } + + // Non-CSV/TSV output format is mutually exclusive with --header + if (output_format != .csv and output_format != .tsv and header) + return error.IncompatibleFlags; + + // --output is mutually exclusive with --columns (--columns always writes to stdout) + if (output != null and list_columns) + return error.OutputWithColumns; + + // --output is mutually exclusive with --validate (--validate always writes to stdout) + if (output != null and validate) + return error.OutputWithValidate; + + // --output is mutually exclusive with --sample (--sample always writes to stdout) + if (output != null and sample_mode) + return error.SampleWithOutput; + + // --validate is mutually exclusive with --columns + if (validate and list_columns) + return error.ValidateWithColumns; + + // --columns is mutually exclusive with a query argument + if (list_columns and query != null) + return error.ColumnsWithQuery; + + // --validate is mutually exclusive with a query argument + if (validate and query != null) + return error.ValidateWithQuery; + + // --sample is mutually exclusive with a query argument + if (sample_mode and query != null) + return error.SampleWithQuery; + + // --sample is mutually exclusive with --json / json output format + if (sample_mode and (output_format == .json or output_format == .ndjson)) + return error.SampleWithJson; + + // --sample is mutually exclusive with --columns + if (sample_mode and list_columns) + return error.SampleWithColumns; + + // --sample is mutually exclusive with --validate + if (sample_mode and validate) + return error.SampleWithValidate; + + // --silent and --verbose are mutually exclusive + if (silent and verbose) + return error.SilentVerboseConflict; + + // --xml-root and --xml-row must be valid XML element names + if (!isValidXmlName(xml_root) or 
!isValidXmlName(xml_row)) + return error.InvalidXmlName; + + // --columns mode: list headers and exit + if (list_columns) + return .{ .columns = ColumnsArgs{ + .delimiter = delimiter, + .verbose = verbose, + .input_format = input_format, + .xml_root_input = xml_root_input, + .xml_row_input = xml_row_input, + } }; + + // --validate mode: parse CSV and print summary + if (validate) + return .{ .validate = ValidateArgs{ + .delimiter = delimiter, + .type_inference = type_inference, + .input_format = input_format, + .xml_root_input = xml_root_input, + .xml_row_input = xml_row_input, + } }; + + // --sample mode: print schema + first n rows and exit + if (sample_mode) + return .{ .sample = SampleArgs{ + .delimiter = delimiter, + .input_format = input_format, + .n = sample_n, + .type_inference = type_inference, + } }; + + return .{ .parsed = ParsedArgs{ + .query = query orelse return error.MissingQuery, + .type_inference = type_inference, + .delimiter = delimiter, + .header = header, + .input_format = input_format, + .output_format = output_format, + .max_rows = max_rows, + .verbose = verbose, + .silent = silent, + .output = output, + .xml_root = xml_root, + .xml_row = xml_row, + .xml_root_input = xml_root_input, + .xml_row_input = xml_row_input, + } }; +} diff --git a/src/main.zig b/src/main.zig index cf6aba2..839b32f 100644 --- a/src/main.zig +++ b/src/main.zig @@ -5,6 +5,7 @@ const json = @import("json.zig"); const xml = @import("xml.zig"); const format = @import("format.zig"); const build_options = @import("build_options"); +const args_mod = @import("args.zig"); const VERSION: []const u8 = build_options.version; @@ -14,41 +15,14 @@ const sqlite_static: c.sqlite3_destructor_type = null; /// SQLITE_TRANSIENT sentinel: tells sqlite3_bind_text to copy the string /// immediately (safe for short-lived source buffers, e.g. JSON arena data). 
-// ─── Error types ───────────────────────────────────── - -const SqlPipeError = error{ - MissingQuery, - InvalidDelimiter, - IncompatibleFlags, - SilentVerboseConflict, - ColumnsWithQuery, - ValidateWithQuery, - ValidateWithColumns, - OutputWithValidate, - InvalidMaxRows, - InvalidInputFormat, - InvalidOutputFormat, - MissingXmlFlagValue, - InvalidXmlName, - OpenDbFailed, - EmptyInput, - EmptyColumnName, - NoColumns, - CreateTableFailed, - BeginTransactionFailed, - PrepareInsertFailed, - BindFailed, - StepFailed, - PrepareQueryFailed, - InvalidOutputPath, - OutputWithColumns, - SampleWithQuery, - SampleWithJson, - SampleWithColumns, - SampleWithValidate, - SampleWithOutput, - InvalidSampleCount, -}; +const SqlPipeError = args_mod.SqlPipeError; +const ParsedArgs = args_mod.ParsedArgs; +const ColumnsArgs = args_mod.ColumnsArgs; +const ValidateArgs = args_mod.ValidateArgs; +const SampleArgs = args_mod.SampleArgs; +const ArgsResult = args_mod.ArgsResult; +const parseArgs = args_mod.parseArgs; +const printUsage = args_mod.printUsage; // ─── Column type inference ──────────────────────────── @@ -79,432 +53,6 @@ const InputFormat = format.InputFormat; /// Supported output formats (canonical definition lives in format.zig). const OutputFormat = format.OutputFormat; -/// Parsed command-line arguments. -const ParsedArgs = struct { - /// SQL query to execute against table `t`. - query: []const u8, - /// Infer column types from the first 100 buffered rows when true. - type_inference: bool, - /// CSV field delimiter — 1 to 8 bytes (default: ","). - delimiter: []const u8, - /// Emit column names as first output row when true (CSV output only). - header: bool, - /// Input format (default: csv). - input_format: InputFormat, - /// Output format (default: csv). - output_format: OutputFormat, - /// Abort with exit 1 when more than this many data rows are read; null = unlimited. - max_rows: ?usize, - /// Print "Loaded rows" to stderr after all rows are inserted when true. 
- /// When false, the message is still shown automatically when stderr is a TTY. - verbose: bool, - /// Suppress "Loaded rows" unconditionally. - silent: bool, - /// Write results to this file path instead of stdout; null = write to stdout. - output: ?[]const u8, - /// Root element name for XML output (default: "results"). - xml_root: []const u8, - /// Row element name for XML output (default: "row"). - xml_row: []const u8, - /// Root element to navigate to for XML input; null = use actual document root. - xml_root_input: ?[]const u8, - /// Row tag filter for XML input; null = accept any direct child element as a row. - xml_row_input: ?[]const u8, -}; - -/// Arguments for `--columns` mode. -const ColumnsArgs = struct { - /// CSV field delimiter — 1 to 8 bytes (default: ","). - delimiter: []const u8, - /// Show inferred type alongside name when true. - verbose: bool, - /// Input format (default: csv). - input_format: InputFormat, - /// Root element to navigate to for XML input; null = use actual document root. - xml_root_input: ?[]const u8, - /// Row tag filter for XML input; null = accept any direct child element as a row. - xml_row_input: ?[]const u8, -}; - -/// Arguments for `--validate` mode. -const ValidateArgs = struct { - /// CSV field delimiter — 1 to 8 bytes (default: ","). - delimiter: []const u8, - /// Infer column types from the first 100 buffered rows when true. - type_inference: bool, - /// Input format (default: csv). - input_format: InputFormat, - /// Root element to navigate to for XML input; null = use actual document root. - xml_root_input: ?[]const u8, - /// Row tag filter for XML input; null = accept any direct child element as a row. - xml_row_input: ?[]const u8, -}; - -/// Arguments for `--sample` mode. -const SampleArgs = struct { - /// CSV field delimiter — 1 to 8 bytes (default: ","). - delimiter: []const u8, - /// Input format (default: csv). - input_format: InputFormat, - /// Number of sample rows to print (default: 10). 
- n: usize, - /// Infer column types from buffered rows when true; show all TEXT when false. - type_inference: bool, -}; - -/// Result of argument parsing — either parsed arguments or a special action. -const ArgsResult = union(enum) { - /// Normal execution: run the query. - parsed: ParsedArgs, - /// User requested --help / -h. - help, - /// User requested --version / -V. - version, - /// User requested --columns: list column names and exit. - columns: ColumnsArgs, - /// User requested --validate: parse input and print summary. - validate: ValidateArgs, - /// User requested --sample: print schema + first n rows and exit. - sample: SampleArgs, -}; - -// ─── Extracted functions ────────────────────────────── - -/// printUsage(writer) → void -/// Pre: writer is a valid stderr writer -/// Post: usage text has been written to writer -fn printUsage(writer: *std.Io.Writer) !void { - try writer.writeAll( - \\Usage: sql-pipe [OPTIONS] - \\ - \\Reads input from stdin, loads it into an in-memory SQLite table `t`, - \\runs , and prints results to stdout. - \\ - \\Options: - \\ -d, --delimiter Input field delimiter for CSV: 1–8 chars (default: ,) - \\ --tsv Alias for --delimiter '\t' - \\ -I, --input-format Input format: csv (default), tsv, json, ndjson, xml - \\ -O, --output-format Output format: csv (default), tsv, json, ndjson, xml - \\ --json Alias for --output-format json - \\ --no-type-inference Treat all columns as TEXT (CSV input only) - \\ -H, --header Print column names as the first output row (CSV/TSV output only) - \\ --max-rows Stop if more than data rows are read (exit 1) - \\ -v, --verbose Force row count to stderr (shown automatically on TTY) - \\ With --columns: show inferred type per column - \\ -s, --silent Suppress row count output unconditionally - \\ Cannot be combined with -v/--verbose - \\ --validate Parse the entire input and print a summary to stdout - \\ (OK: rows, columns ( , ...)) - \\ Exit 0 on success, exit 2 on parse error. No query required. 
- \\ Compatible with --delimiter, --tsv, --no-type-inference, -I. - \\ --columns List column names from input header (one per line) and exit - \\ Combine with -v/--verbose to include inferred types - \\ Cannot be combined with --output or a query argument - \\ --sample [] Print schema to stderr and first rows to stdout (default: 10) - \\ Schema lists column names and inferred types, prefixed with # - \\ Implies --header. Compatible with --delimiter and --tsv. - \\ Incompatible with --json and with a query argument. - \\ --output Write results to file instead of stdout - \\ --xml-root Root element name for XML I/O (default: results) - \\ --xml-row Row element name for XML I/O (default: row) - \\ -h, --help Show this help message and exit - \\ -V, --version Show version and exit - \\ - \\Exit codes: - \\ 0 Success - \\ 1 Usage error (missing query, bad arguments) - \\ 2 Input parse error - \\ 3 SQL error - \\ - \\Examples: - \\ echo 'name,age\nAlice,30' | sql-pipe 'SELECT * FROM t' - \\ cat data.tsv | sql-pipe --tsv 'SELECT * FROM t' - \\ cat data.psv | sql-pipe -d '|' 'SELECT * FROM t' - \\ cat data.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region' - \\ cat data.csv | sql-pipe --output-format json 'SELECT * FROM t' - \\ cat data.json | sql-pipe --input-format json 'SELECT * FROM t' - \\ cat data.ndjson | sql-pipe -I ndjson -O ndjson 'SELECT name FROM t WHERE age > 18' - \\ cat data.csv | sql-pipe --sample 5 - \\ - ); -} - -/// parseDelimiter(value) → []const u8 -/// Pre: value is the delimiter token provided by the user -/// Post: result is a 1–8 byte delimiter string, or "\t" when value = "\\t" -/// error.InvalidDelimiter when value is empty or longer than 8 bytes -fn parseDelimiter(value: []const u8) SqlPipeError![]const u8 { - if (std.mem.eql(u8, value, "\\t")) return "\t"; - if (value.len == 0) return error.InvalidDelimiter; - if (value.len > 8) return error.InvalidDelimiter; - return value; -} - -/// isValidXmlName(s) → bool -/// -/// Returns 
true iff s is a valid XML Name: -/// NameStartChar: letter, '_', ':' -/// NameChar: NameStartChar | digit | '-' | '.' -fn isValidXmlName(s: []const u8) bool { - if (s.len == 0) return false; - switch (s[0]) { - 'a'...'z', 'A'...'Z', '_', ':' => {}, - else => return false, - } - for (s[1..]) |ch| { - switch (ch) { - 'a'...'z', 'A'...'Z', '0'...'9', '-', '.', '_', ':' => {}, - else => return false, - } - } - return true; -} - -/// parseArgs(args) → ArgsResult -/// Pre: args is the full process argument slice; args[0] is the program name -/// Post: result.parsed.query is the first non-flag argument -/// result.parsed.type_inference = false when "--no-type-inference" is present -/// result.parsed.output_format = .json when "--json" or "--output-format json" is present -/// result = .help when --help or -h is present -/// result = .version when --version or -V is present -/// error.MissingQuery when no non-flag argument is found -/// error.IncompatibleFlags when a non-CSV/TSV output format is combined with --header -fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { - var query: ?[]const u8 = null; - var type_inference = true; - var delimiter: []const u8 = ","; - var header = false; - var input_format: InputFormat = .csv; - var output_format: OutputFormat = .csv; - - var max_rows: ?usize = null; - var verbose = false; - var silent = false; - var list_columns = false; - var validate = false; - var output: ?[]const u8 = null; - var xml_root: []const u8 = "results"; - var xml_row: []const u8 = "row"; - var xml_root_input: ?[]const u8 = null; - var xml_row_input: ?[]const u8 = null; - var sample_mode = false; - var sample_n: usize = 10; - - // Loop invariant I: all args[1..i] have been processed; - // query holds the first non-flag argument seen, or null; - // type_inference reflects the presence of --no-type-inference; - // delimiter reflects -d/--delimiter/--tsv if present; - // header reflects the presence of --header/-H; - // output_format reflects the 
last --output-format/--json flag seen; - // input_format reflects the last --input-format flag seen; - // max_rows reflects the presence of --max-rows - // Bounding function: args.len - i - var i: usize = 1; - while (i < args.len) : (i += 1) { - const arg = args[i]; - if (std.mem.eql(u8, arg, "--help") or std.mem.eql(u8, arg, "-h")) { - return .help; - } else if (std.mem.eql(u8, arg, "--version") or std.mem.eql(u8, arg, "-V")) { - return .version; - } else if (std.mem.eql(u8, arg, "--tsv")) { - delimiter = "\t"; - } else if (std.mem.eql(u8, arg, "-d") or std.mem.eql(u8, arg, "--delimiter")) { - i += 1; - if (i >= args.len) return error.InvalidDelimiter; - delimiter = try parseDelimiter(args[i]); - } else if (std.mem.startsWith(u8, arg, "--delimiter=")) { - delimiter = try parseDelimiter(arg["--delimiter=".len..]); - } else if (std.mem.startsWith(u8, arg, "-d=")) { - delimiter = try parseDelimiter(arg["-d=".len..]); - } else if (std.mem.eql(u8, arg, "--no-type-inference")) { - type_inference = false; - } else if (std.mem.eql(u8, arg, "--header") or std.mem.eql(u8, arg, "-H")) { - header = true; - } else if (std.mem.eql(u8, arg, "--json")) { - output_format = .json; - } else if (std.mem.eql(u8, arg, "-I") or std.mem.eql(u8, arg, "--input-format")) { - i += 1; - if (i >= args.len) return error.InvalidInputFormat; - input_format = InputFormat.parse(args[i]) catch return error.InvalidInputFormat; - } else if (std.mem.startsWith(u8, arg, "--input-format=")) { - input_format = InputFormat.parse(arg["--input-format=".len..]) catch return error.InvalidInputFormat; - } else if (std.mem.startsWith(u8, arg, "-I=")) { - input_format = InputFormat.parse(arg["-I=".len..]) catch return error.InvalidInputFormat; - } else if (std.mem.eql(u8, arg, "-O") or std.mem.eql(u8, arg, "--output-format")) { - i += 1; - if (i >= args.len) return error.InvalidOutputFormat; - output_format = OutputFormat.parse(args[i]) catch return error.InvalidOutputFormat; - } else if (std.mem.startsWith(u8, 
arg, "--output-format=")) { - output_format = OutputFormat.parse(arg["--output-format=".len..]) catch return error.InvalidOutputFormat; - } else if (std.mem.startsWith(u8, arg, "-O=")) { - output_format = OutputFormat.parse(arg["-O=".len..]) catch return error.InvalidOutputFormat; - } else if (std.mem.eql(u8, arg, "--max-rows")) { - i += 1; - if (i >= args.len) return error.InvalidMaxRows; - max_rows = std.fmt.parseUnsigned(usize, args[i], 10) catch return error.InvalidMaxRows; - if (max_rows.? == 0) return error.InvalidMaxRows; - } else if (std.mem.startsWith(u8, arg, "--max-rows=")) { - max_rows = std.fmt.parseUnsigned(usize, arg["--max-rows=".len..], 10) catch return error.InvalidMaxRows; - if (max_rows.? == 0) return error.InvalidMaxRows; - } else if (std.mem.eql(u8, arg, "--verbose") or std.mem.eql(u8, arg, "-v")) { - verbose = true; - } else if (std.mem.eql(u8, arg, "--silent") or std.mem.eql(u8, arg, "-s")) { - silent = true; - } else if (std.mem.eql(u8, arg, "--columns")) { - list_columns = true; - } else if (std.mem.eql(u8, arg, "--validate")) { - validate = true; - } else if (std.mem.eql(u8, arg, "--sample")) { - sample_mode = true; - // Peek at next arg: if it is a positive integer, consume it as the sample count - if (i + 1 < args.len) { - const next = args[i + 1]; - if (next.len > 0 and next[0] != '-') { - if (std.fmt.parseUnsigned(usize, next, 10)) |n| { - if (n == 0) return error.InvalidSampleCount; - sample_n = n; - i += 1; - } else |_| { - // Not a number — keep default (10) - } - } - } - } else if (std.mem.startsWith(u8, arg, "--sample=")) { - const val = arg["--sample=".len..]; - const n = std.fmt.parseUnsigned(usize, val, 10) catch return error.InvalidSampleCount; - if (n == 0) return error.InvalidSampleCount; - sample_n = n; - sample_mode = true; - } else if (std.mem.eql(u8, arg, "--output")) { - i += 1; - if (i >= args.len) return error.InvalidOutputPath; - const trimmed = std.mem.trim(u8, args[i], " \t"); - if (trimmed.len == 0) return 
error.InvalidOutputPath; - output = trimmed; - } else if (std.mem.startsWith(u8, arg, "--output=")) { - const trimmed = std.mem.trim(u8, arg["--output=".len..], " \t"); - if (trimmed.len == 0) return error.InvalidOutputPath; - output = trimmed; - } else if (std.mem.eql(u8, arg, "--xml-root")) { - i += 1; - if (i >= args.len) return error.MissingXmlFlagValue; - xml_root = args[i]; - xml_root_input = args[i]; - } else if (std.mem.startsWith(u8, arg, "--xml-root=")) { - xml_root = arg["--xml-root=".len..]; - xml_root_input = arg["--xml-root=".len..]; - } else if (std.mem.eql(u8, arg, "--xml-row")) { - i += 1; - if (i >= args.len) return error.MissingXmlFlagValue; - xml_row = args[i]; - xml_row_input = args[i]; - } else if (std.mem.startsWith(u8, arg, "--xml-row=")) { - xml_row = arg["--xml-row=".len..]; - xml_row_input = arg["--xml-row=".len..]; - } else { - if (query == null) query = arg; - } - } - - // Non-CSV/TSV output format is mutually exclusive with --header - if (output_format != .csv and output_format != .tsv and header) - return error.IncompatibleFlags; - - // --output is mutually exclusive with --columns (--columns always writes to stdout) - if (output != null and list_columns) - return error.OutputWithColumns; - - // --output is mutually exclusive with --validate (--validate always writes to stdout) - if (output != null and validate) - return error.OutputWithValidate; - - // --output is mutually exclusive with --sample (--sample always writes to stdout) - if (output != null and sample_mode) - return error.SampleWithOutput; - - // --validate is mutually exclusive with --columns - if (validate and list_columns) - return error.ValidateWithColumns; - - // --columns is mutually exclusive with a query argument - if (list_columns and query != null) - return error.ColumnsWithQuery; - - // --validate is mutually exclusive with a query argument - if (validate and query != null) - return error.ValidateWithQuery; - - // --sample is mutually exclusive with a query 
argument - if (sample_mode and query != null) - return error.SampleWithQuery; - - // --sample is mutually exclusive with --json / json output format - if (sample_mode and (output_format == .json or output_format == .ndjson)) - return error.SampleWithJson; - - // --sample is mutually exclusive with --columns - if (sample_mode and list_columns) - return error.SampleWithColumns; - - // --sample is mutually exclusive with --validate - if (sample_mode and validate) - return error.SampleWithValidate; - - // --silent and --verbose are mutually exclusive - if (silent and verbose) - return error.SilentVerboseConflict; - - // --xml-root and --xml-row must be valid XML element names - if (!isValidXmlName(xml_root) or !isValidXmlName(xml_row)) - return error.InvalidXmlName; - - // --columns mode: list headers and exit - if (list_columns) - return .{ .columns = ColumnsArgs{ - .delimiter = delimiter, - .verbose = verbose, - .input_format = input_format, - .xml_root_input = xml_root_input, - .xml_row_input = xml_row_input, - } }; - - // --validate mode: parse CSV and print summary - if (validate) - return .{ .validate = ValidateArgs{ - .delimiter = delimiter, - .type_inference = type_inference, - .input_format = input_format, - .xml_root_input = xml_root_input, - .xml_row_input = xml_row_input, - } }; - - // --sample mode: print schema + first n rows and exit - if (sample_mode) - return .{ .sample = SampleArgs{ - .delimiter = delimiter, - .input_format = input_format, - .n = sample_n, - .type_inference = type_inference, - } }; - - return .{ .parsed = ParsedArgs{ - .query = query orelse return error.MissingQuery, - .type_inference = type_inference, - .delimiter = delimiter, - .header = header, - .input_format = input_format, - .output_format = output_format, - .max_rows = max_rows, - .verbose = verbose, - .silent = silent, - .output = output, - .xml_root = xml_root, - .xml_row = xml_row, - .xml_root_input = xml_root_input, - .xml_row_input = xml_row_input, - } }; -} - /// openDb() 
→ *sqlite3 /// Pre: — /// Post: result is an open, empty in-memory SQLite database handle @@ -1210,15 +758,15 @@ fn loadCsvInput( return rows_inserted; } -/// runColumns(args, allocator, io, stderr_writer, stdout_writer) → void +/// runColumns(allocator, io, args, stderr_writer, stdout_writer) → void /// Pre: args is valid; allocator and writers are valid /// Post: column names from the input header (CSV/JSON/NDJSON) are written to stdout, /// one per line; when args.verbose is true each line has format " " /// (CSV only — JSON/NDJSON always show TEXT); exits 0 on success, 2 on parse error fn runColumns( - args: ColumnsArgs, allocator: std.mem.Allocator, io: std.Io, + args: ColumnsArgs, stderr_writer: *std.Io.Writer, stdout_writer: *std.Io.Writer, ) void { @@ -1394,15 +942,15 @@ fn runColumns( } } -/// runValidate(args, allocator, io, stderr_writer, stdout_writer) → void +/// runValidate(allocator, io, args, stderr_writer, stdout_writer) → void /// Pre: args is valid; allocator and writers are valid /// Post: the entire input has been parsed (CSV, TSV, JSON, or NDJSON); /// on success prints "OK: rows, columns ( , ...)" to stdout. /// On parse error, prints the error message to stderr and exits 2. fn runValidate( - args: ValidateArgs, allocator: std.mem.Allocator, io: std.Io, + args: ValidateArgs, stderr_writer: *std.Io.Writer, stdout_writer: *std.Io.Writer, ) void { @@ -1694,16 +1242,16 @@ fn runValidate( } } -/// runSample(args, allocator, io, stderr_writer, stdout_writer) → void +/// runSample(allocator, io, args, stderr_writer, stdout_writer) → void /// Pre: args is valid; allocator and writers are valid; input_format is csv or tsv /// Post: a schema comment block is written to stderr (column names + inferred types, /// or all TEXT if args.type_inference is false, each line prefixed with "#") and /// a header row + first args.n data rows are written to stdout as delimited text. /// Exits 2 on parse error, 1 on stdout write error. No query required. 
fn runSample( - args: SampleArgs, allocator: std.mem.Allocator, io: std.Io, + args: SampleArgs, stderr_writer: *std.Io.Writer, stdout_writer: *std.Io.Writer, ) void { @@ -1841,16 +1389,16 @@ fn runSample( } } -/// run(parsed, allocator, io, stderr_writer, stdout_writer) → void +/// run(allocator, io, parsed, stderr_writer, stdout_writer) → void /// Pre: parsed contains a valid query; allocator and writers are valid /// Post: input from stdin has been loaded (dispatched on parsed.input_format), /// query executed, results written to stdout in parsed.output_format /// On error, an "error: ..." message is written to stderr and process /// exits with the appropriate ExitCode (1, 2, or 3) fn run( - parsed: ParsedArgs, allocator: std.mem.Allocator, io: std.Io, + parsed: ParsedArgs, stderr_writer: *std.Io.Writer, stdout_writer: *std.Io.Writer, ) void { @@ -2093,7 +1641,7 @@ pub fn main(init: std.process.Init.Minimal) void { std.process.exit(@intFromEnum(ExitCode.success)); }, .columns => |col_args| { - runColumns(col_args, allocator, io.io(), stderr_writer, stdout_writer); + runColumns(allocator, io.io(), col_args, stderr_writer, stdout_writer); stdout_file_writer.flush() catch |err| { std.log.err("failed to flush stdout: {}", .{err}); }; @@ -2102,7 +1650,7 @@ pub fn main(init: std.process.Init.Minimal) void { }; }, .validate => |val_args| { - runValidate(val_args, allocator, io.io(), stderr_writer, stdout_writer); + runValidate(allocator, io.io(), val_args, stderr_writer, stdout_writer); stdout_file_writer.flush() catch |err| { std.log.err("failed to flush stdout: {}", .{err}); }; @@ -2111,7 +1659,7 @@ pub fn main(init: std.process.Init.Minimal) void { }; }, .sample => |sample_args| { - runSample(sample_args, allocator, io.io(), stderr_writer, stdout_writer); + runSample(allocator, io.io(), sample_args, stderr_writer, stdout_writer); stdout_file_writer.flush() catch |err| { std.log.err("failed to flush stdout: {}", .{err}); }; @@ -2131,12 +1679,12 @@ pub fn main(init: 
std.process.Init.Minimal) void { defer std.Io.File.close(output_file, io.io()); var output_buf: [4096]u8 = undefined; var output_file_writer = std.Io.File.writer(output_file, io.io(), &output_buf); - run(parsed, allocator, io.io(), stderr_writer, &output_file_writer.interface); + run(allocator, io.io(), parsed, stderr_writer, &output_file_writer.interface); output_file_writer.flush() catch |err| { std.log.err("failed to flush output file: {}", .{err}); }; } else { - run(parsed, allocator, io.io(), stderr_writer, stdout_writer); + run(allocator, io.io(), parsed, stderr_writer, stdout_writer); stdout_file_writer.flush() catch |err| { std.log.err("failed to flush stdout: {}", .{err}); }; From 311ce1195e7e7482fcd82cd0817aa485ea5c08e6 Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Fri, 8 May 2026 12:59:09 +0200 Subject: [PATCH 3/6] refactor: consolidate SQLite helpers and ColumnType into sqlite.zig --- src/main.zig | 228 +++---------------------------------------------- src/sqlite.zig | 169 ++++++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+), 217 deletions(-) diff --git a/src/main.zig b/src/main.zig index 839b32f..9834a21 100644 --- a/src/main.zig +++ b/src/main.zig @@ -6,15 +6,12 @@ const xml = @import("xml.zig"); const format = @import("format.zig"); const build_options = @import("build_options"); const args_mod = @import("args.zig"); +const sqlite_mod = @import("sqlite.zig"); +const ColumnType = sqlite_mod.ColumnType; +const sqlite_static = sqlite_mod.sqlite_static; const VERSION: []const u8 = build_options.version; -/// SQLITE_STATIC sentinel: tells sqlite3_bind_text that the string is -/// caller-managed and SQLite must not attempt to free it. -const sqlite_static: c.sqlite3_destructor_type = null; - -/// SQLITE_TRANSIENT sentinel: tells sqlite3_bind_text to copy the string -/// immediately (safe for short-lived source buffers, e.g. JSON arena data). 
const SqlPipeError = args_mod.SqlPipeError; const ParsedArgs = args_mod.ParsedArgs; const ColumnsArgs = args_mod.ColumnsArgs; @@ -26,9 +23,6 @@ const printUsage = args_mod.printUsage; // ─── Column type inference ──────────────────────────── -/// Inferred SQLite affinity for a CSV column. -const ColumnType = enum { TEXT, INTEGER, REAL }; - /// Number of rows buffered from stdin to infer column types. const inference_buffer_size: usize = 100; @@ -53,16 +47,6 @@ const InputFormat = format.InputFormat; /// Supported output formats (canonical definition lives in format.zig). const OutputFormat = format.OutputFormat; -/// openDb() → *sqlite3 -/// Pre: — -/// Post: result is an open, empty in-memory SQLite database handle -/// error.OpenDbFailed when sqlite3_open returns non-SQLITE_OK -fn openDb() SqlPipeError!*c.sqlite3 { - var db: ?*c.sqlite3 = null; - if (c.sqlite3_open(":memory:", &db) != c.SQLITE_OK) return error.OpenDbFailed; - return db.?; -} - /// stripQuotes(raw) → []const u8 /// Pre: raw is a valid UTF-8 slice /// Post: if raw = '"' ++ inner ++ '"' => result = inner @@ -234,80 +218,6 @@ fn parseHeader( return cols.toOwnedSlice(allocator); } -/// createTable(db, cols, types, allocator) → void -/// Pre: db is an open SQLite handle -/// cols.len > 0 -/// types.len = cols.len -/// allocator is valid -/// Post: table `t` exists in db with cols.len columns named by cols; -/// each column's SQL type reflects its ColumnType value -/// (INTEGER / REAL / TEXT with correct SQLite affinity) -/// column identifiers are double-quote escaped per SQL syntax -/// error.CreateTableFailed when sqlite3_exec returns non-SQLITE_OK -fn createTable( - allocator: std.mem.Allocator, - db: *c.sqlite3, - cols: []const []const u8, - types: []const ColumnType, -) (SqlPipeError || std.mem.Allocator.Error)!void { - var sql: std.ArrayList(u8) = .empty; - defer sql.deinit(allocator); - - try sql.appendSlice(allocator, "CREATE TABLE t ("); - // Loop invariant I: sql = "CREATE TABLE t (" ++ 
columns[0..i] joined by ", " - // Bounding function: cols.len - i - for (cols, 0..) |col, i| { - if (i > 0) try sql.appendSlice(allocator, ", "); - try sql.append(allocator, '"'); - // Escape embedded double-quotes by doubling them (SQL identifier rule) - for (col) |ch| { - if (ch == '"') try sql.append(allocator, '"'); - try sql.append(allocator, ch); - } - try sql.append(allocator, '"'); - try sql.appendSlice(allocator, switch (types[i]) { - .INTEGER => " INTEGER", - .REAL => " REAL", - .TEXT => " TEXT", - }); - } - try sql.appendSlice(allocator, ")"); - try sql.append(allocator, 0); // null-terminate for the C API - - var errmsg: [*c]u8 = null; - if (c.sqlite3_exec(db, sql.items.ptr, null, null, &errmsg) != c.SQLITE_OK) { - if (errmsg != null) c.sqlite3_free(errmsg); - return error.CreateTableFailed; - } -} - -/// prepareInsert(db, n, allocator) → *sqlite3_stmt -/// Pre: db is open, table `t` exists with n TEXT columns, n > 0 -/// allocator is valid -/// Post: result is a prepared `INSERT INTO t VALUES (?,…,?)` with n parameters -/// error.PrepareInsertFailed when sqlite3_prepare_v2 returns non-SQLITE_OK -fn prepareInsert( - allocator: std.mem.Allocator, - db: *c.sqlite3, - n: usize, -) (SqlPipeError || std.mem.Allocator.Error)!*c.sqlite3_stmt { - var sql: std.ArrayList(u8) = .empty; - defer sql.deinit(allocator); - - try sql.appendSlice(allocator, "INSERT INTO t VALUES ("); - for (0..n) |i| { - if (i > 0) try sql.append(allocator, ','); - try sql.append(allocator, '?'); - } - try sql.appendSlice(allocator, ")"); - try sql.append(allocator, 0); - - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(db, sql.items.ptr, -1, &stmt, null) != c.SQLITE_OK) - return error.PrepareInsertFailed; - return stmt.?; -} - /// insertRowTyped(stmt, db, row, types, param_count) → void /// Pre: stmt is a prepared INSERT with param_count parameters, freshly reset /// row is a non-empty CSV record (slice of field slices) @@ -429,102 +339,6 @@ fn execQuery( try 
out_writer.end(writer); } -// ─── SQL error context helpers ──────────────────────── - -/// Compute the Levenshtein edit distance between two strings. -/// Uses two-row DP over at most max_len characters per string. -fn levenshteinDistance(a: []const u8, b: []const u8) usize { - const max_len = 128; - var prev: [max_len + 1]usize = undefined; - var curr: [max_len + 1]usize = undefined; - const a_len = @min(a.len, max_len); - const b_len = @min(b.len, max_len); - - for (0..b_len + 1) |j| prev[j] = j; - for (0..a_len) |i| { - curr[0] = i + 1; - for (0..b_len) |j| { - const cost: usize = if (a[i] == b[j]) 0 else 1; - curr[j + 1] = @min(curr[j] + 1, @min(prev[j + 1] + 1, prev[j] + cost)); - } - @memcpy(prev[0 .. b_len + 1], curr[0 .. b_len + 1]); - } - return prev[b_len]; -} - -/// Return column names of table `t` via PRAGMA table_info. -/// Caller owns the returned slice; free each element and the slice with allocator. -/// Returns empty slice on PRAGMA failure. -fn getTableColumns(allocator: std.mem.Allocator, db: *c.sqlite3) ![][]const u8 { - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(db, "PRAGMA table_info(t)", -1, &stmt, null) != c.SQLITE_OK) - return &.{}; - defer _ = c.sqlite3_finalize(stmt); - - var cols = std.ArrayList([]const u8).empty; - errdefer { - for (cols.items) |col| allocator.free(col); - cols.deinit(allocator); - } - - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - // PRAGMA table_info columns: cid(0), name(1), type(2), notnull(3), dflt_value(4), pk(5) - const ptr = c.sqlite3_column_text(stmt, 1); - if (ptr == null) continue; - const name = std.mem.span(@as([*:0]const u8, @ptrCast(ptr))); - const owned = try allocator.dupe(u8, name); - errdefer allocator.free(owned); - try cols.append(allocator, owned); - } - - return cols.toOwnedSlice(allocator); -} - -/// Print column context to writer after a SQL error. -/// Prints " table \"t\" has columns: ..." and optionally " hint: did you mean \"\"?" 
-/// when the error message matches "no such column: " and a column exists within edit distance 2. -/// Silently returns on any failure (PRAGMA unavailable, OOM, writer error). -fn printSqlErrorContext( - allocator: std.mem.Allocator, - db: *c.sqlite3, - errmsg: []const u8, - writer: *std.Io.Writer, -) void { - const columns = getTableColumns(allocator, db) catch return; - defer { - for (columns) |col| allocator.free(col); - allocator.free(columns); - } - if (columns.len == 0) return; - - writer.writeAll(" table \"t\" has columns: ") catch return; - for (columns, 0..) |col, i| { - if (i > 0) writer.writeAll(", ") catch return; - writer.writeAll(col) catch return; - } - writer.writeByte('\n') catch return; - - // Suggest the closest column when the error is "no such column: " - const no_such_col = "no such column: "; - if (std.mem.find(u8, errmsg, no_such_col)) |start| { - const missing = errmsg[start + no_such_col.len ..]; - var best_col: ?[]const u8 = null; - var best_dist: usize = std.math.maxInt(usize); - for (columns) |col| { - const dist = levenshteinDistance(missing, col); - if (dist < best_dist) { - best_dist = dist; - best_col = col; - } - } - if (best_dist <= 2) { - if (best_col) |col| { - writer.print(" hint: did you mean \"{s}\"?\n", .{col}) catch return; - } - } - } -} - // ─── Entry point ────────────────────────────────────── /// fmtThousands(buf, n) → []const u8 @@ -585,23 +399,6 @@ fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, args: std.process.exit(@intFromEnum(code)); } -/// Print SQL error message with column context then exit with sql_error code. 
-/// Pre: errmsg is the SQLite error string; db has table `t` (or PRAGMA silently fails) -/// Post: stderr has "error: \n" + optional column list + optional hint; process exits 3 -fn fatalSqlWithContext( - allocator: std.mem.Allocator, - db: *c.sqlite3, - errmsg: []const u8, - writer: *std.Io.Writer, -) noreturn { - writer.print("error: {s}\n", .{errmsg}) catch |err| { - std.log.err("failed to write error message: {}", .{err}); - }; - printSqlErrorContext(allocator, db, errmsg, writer); - writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.sql_error)); -} - /// loadCsvInput loads all CSV rows from stdin into db table `t`. /// Pre: db is an open in-memory SQLite handle with no tables yet /// parsed.delimiter is valid; allocator and writers are valid @@ -682,19 +479,17 @@ fn loadCsvInput( // ─── Phase 2: create table and insert rows ──────────────────────────────── - createTable(allocator, db, cols, types) catch - fatal("{s}", stderr_writer, .sql_error, .{std.mem.span(c.sqlite3_errmsg(db))}); + sqlite_mod.createTable(allocator, db, cols, types, stderr_writer); { var errmsg: [*c]u8 = null; if (c.sqlite3_exec(db, "BEGIN TRANSACTION", null, null, &errmsg) != c.SQLITE_OK) { const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db)); - fatalSqlWithContext(allocator, db, msg, stderr_writer); + sqlite_mod.fatalSqlWithContext(allocator, db, msg, stderr_writer); } } - const stmt = prepareInsert(allocator, db, num_cols) catch - fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); + const stmt = sqlite_mod.prepareInsertStmt(allocator, db, num_cols, stderr_writer); defer _ = c.sqlite3_finalize(stmt); const is_tty = std.Io.File.isTty(std.Io.File.stderr(), io) catch false; @@ -708,7 +503,7 @@ fn loadCsvInput( fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } insertRowTyped(stmt, db, row, types, @intCast(num_cols)) 
catch - fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); + sqlite_mod.fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); if (is_tty and rows_inserted % progress_interval == 0) printProgress(stderr_writer, rows_inserted, parsed.max_rows); } @@ -740,7 +535,7 @@ fn loadCsvInput( fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } insertRowTyped(stmt, db, record, types, @intCast(num_cols)) catch - fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); + sqlite_mod.fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); if (is_tty and rows_inserted % progress_interval == 0) printProgress(stderr_writer, rows_inserted, parsed.max_rows); } @@ -750,7 +545,7 @@ fn loadCsvInput( const rc = c.sqlite3_exec(db, "COMMIT", null, null, &errmsg); if (rc != c.SQLITE_OK) { const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db)); - fatalSqlWithContext(allocator, db, msg, stderr_writer); + sqlite_mod.fatalSqlWithContext(allocator, db, msg, stderr_writer); } if (errmsg != null) c.sqlite3_free(errmsg); } @@ -1404,8 +1199,7 @@ fn run( ) void { const query = parsed.query; - const db = openDb() catch - fatal("failed to open in-memory database", stderr_writer, .sql_error, .{}); + const db = sqlite_mod.openDb(stderr_writer); defer _ = c.sqlite3_close(db); const start_ts = std.Io.Timestamp.now(io, .awake); @@ -1457,7 +1251,7 @@ fn run( execQuery(allocator, db, query, stdout_writer, parsed.header, parsed.output_format, parsed.xml_root, parsed.xml_row) catch { stdout_writer.flush() catch |err| std.log.err("failed to flush output before fatal: {}", .{err}); - fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); + sqlite_mod.fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); }; } diff --git a/src/sqlite.zig b/src/sqlite.zig 
/// Inferred SQLite affinity for a CSV column.
pub const ColumnType = enum { TEXT, INTEGER, REAL };

/// openDb(writer) → *sqlite3
/// Pre: —
/// Post: result is an open, empty in-memory SQLite database handle
/// On failure: writes a fatal message to writer and exits the process with exit_sql.
pub fn openDb(writer: *std.Io.Writer) *c.sqlite3 {
    var db: ?*c.sqlite3 = null;
    if (c.sqlite3_open(":memory:", &db) != c.SQLITE_OK)
        fatal("failed to open in-memory database", writer, exit_sql, .{});
    return db.?;
}

/// createTable(allocator, db, cols, types, writer) → void
/// Pre: db is an open SQLite handle
///      cols.len > 0
///      types.len = cols.len
///      allocator is valid
/// Post: table `t` exists in db with cols.len columns named by cols;
///       each column's SQL type reflects its ColumnType value
///       (INTEGER / REAL / TEXT with correct SQLite affinity)
///       column identifiers are double-quote escaped per SQL syntax
/// On failure: writes a fatal message to writer and exits the process.
/// NOTE(review): out-of-memory here exits with exit_parse (2), not exit_sql (3) —
/// this matches the loader's OOM convention, but confirm the exit-code policy.
pub fn createTable(
    allocator: std.mem.Allocator,
    db: *c.sqlite3,
    cols: []const []const u8,
    types: []const ColumnType,
    writer: *std.Io.Writer,
) void {
    var sql: std.ArrayList(u8) = .empty;
    defer sql.deinit(allocator);

    sql.appendSlice(allocator, "CREATE TABLE t (") catch fatal("out of memory", writer, exit_parse, .{});
    // Loop invariant I: sql = "CREATE TABLE t (" ++ columns[0..i] joined by ", "
    // Bounding function: cols.len - i
    for (cols, 0..) |col, i| {
        if (i > 0) sql.appendSlice(allocator, ", ") catch fatal("out of memory", writer, exit_parse, .{});
        sql.append(allocator, '"') catch fatal("out of memory", writer, exit_parse, .{});
        // Escape embedded double-quotes by doubling them (SQL identifier rule)
        for (col) |ch| {
            if (ch == '"') sql.append(allocator, '"') catch fatal("out of memory", writer, exit_parse, .{});
            sql.append(allocator, ch) catch fatal("out of memory", writer, exit_parse, .{});
        }
        sql.append(allocator, '"') catch fatal("out of memory", writer, exit_parse, .{});
        sql.appendSlice(allocator, switch (types[i]) {
            .INTEGER => " INTEGER",
            .REAL => " REAL",
            .TEXT => " TEXT",
        }) catch fatal("out of memory", writer, exit_parse, .{});
    }
    sql.appendSlice(allocator, ")") catch fatal("out of memory", writer, exit_parse, .{});
    sql.append(allocator, 0) catch fatal("out of memory", writer, exit_parse, .{}); // null-terminate for the C API

    var errmsg: [*c]u8 = null;
    if (c.sqlite3_exec(db, sql.items.ptr, null, null, &errmsg) != c.SQLITE_OK) {
        const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db));
        if (errmsg != null) c.sqlite3_free(errmsg);
        fatal("{s}", writer, exit_sql, .{msg});
    }
}

/// Compute the Levenshtein edit distance between two strings.
/// Uses two-row DP over at most max_len characters per string.
/// Note: inputs longer than 128 bytes are compared on their first 128 bytes only,
/// so the result is a lower-bound approximation for very long identifiers.
pub fn levenshteinDistance(a: []const u8, b: []const u8) usize {
    const max_len = 128;
    var prev: [max_len + 1]usize = undefined;
    var curr: [max_len + 1]usize = undefined;
    const a_len = @min(a.len, max_len);
    const b_len = @min(b.len, max_len);

    for (0..b_len + 1) |j| prev[j] = j;
    for (0..a_len) |i| {
        curr[0] = i + 1;
        for (0..b_len) |j| {
            const cost: usize = if (a[i] == b[j]) 0 else 1;
            curr[j + 1] = @min(curr[j] + 1, @min(prev[j + 1] + 1, prev[j] + cost));
        }
        @memcpy(prev[0 .. b_len + 1], curr[0 .. b_len + 1]);
    }
    return prev[b_len];
}

/// Return column names of table `t` via PRAGMA table_info.
/// Caller owns the returned slice; free each element and the slice with allocator.
/// Returns empty slice on PRAGMA failure.
pub fn getTableColumns(allocator: std.mem.Allocator, db: *c.sqlite3) ![][]const u8 {
    var stmt: ?*c.sqlite3_stmt = null;
    if (c.sqlite3_prepare_v2(db, "PRAGMA table_info(t)", -1, &stmt, null) != c.SQLITE_OK)
        return &.{};
    defer _ = c.sqlite3_finalize(stmt);

    var cols = std.ArrayList([]const u8).empty;
    errdefer {
        for (cols.items) |col| allocator.free(col);
        cols.deinit(allocator);
    }

    while (c.sqlite3_step(stmt) == c.SQLITE_ROW) {
        // PRAGMA table_info columns: cid(0), name(1), type(2), notnull(3), dflt_value(4), pk(5)
        const ptr = c.sqlite3_column_text(stmt, 1);
        if (ptr == null) continue;
        const name = std.mem.span(@as([*:0]const u8, @ptrCast(ptr)));
        const owned = try allocator.dupe(u8, name);
        errdefer allocator.free(owned);
        try cols.append(allocator, owned);
    }

    return cols.toOwnedSlice(allocator);
}

/// Print column context to writer after a SQL error.
/// Prints "  table \"t\" has columns: ..." and optionally "  hint: did you mean \"...\"?"
/// when the error message matches "no such column: " and a column exists within edit distance 2.
/// Silently returns on any failure (PRAGMA unavailable, OOM, writer error).
pub fn printSqlErrorContext(
    allocator: std.mem.Allocator,
    db: *c.sqlite3,
    errmsg: []const u8,
    writer: *std.Io.Writer,
) void {
    const columns = getTableColumns(allocator, db) catch return;
    defer {
        for (columns) |col| allocator.free(col);
        allocator.free(columns);
    }
    if (columns.len == 0) return;

    writer.writeAll("  table \"t\" has columns: ") catch return;
    for (columns, 0..) |col, i| {
        if (i > 0) writer.writeAll(", ") catch return;
        writer.writeAll(col) catch return;
    }
    writer.writeByte('\n') catch return;

    // Suggest the closest column when the error is "no such column: "
    // NOTE(review): std.mem.find — confirm this helper exists in the targeted Zig
    // std (earlier releases expose this as std.mem.indexOf); the pre-refactor
    // main.zig used the same call, so it presumably builds on this toolchain.
    const no_such_col = "no such column: ";
    if (std.mem.find(u8, errmsg, no_such_col)) |start| {
        const missing = errmsg[start + no_such_col.len ..];
        var best_col: ?[]const u8 = null;
        var best_dist: usize = std.math.maxInt(usize);
        for (columns) |col| {
            const dist = levenshteinDistance(missing, col);
            if (dist < best_dist) {
                best_dist = dist;
                best_col = col;
            }
        }
        if (best_dist <= 2) {
            if (best_col) |col| {
                writer.print("  hint: did you mean \"{s}\"?\n", .{col}) catch return;
            }
        }
    }
}

/// Print SQL error message with column context then exit with sql_error code.
/// Pre: errmsg is the SQLite error string; db has table `t` (or PRAGMA silently fails)
/// Post: stderr has "error: ...\n" + optional column list + optional hint; process exits 3
pub fn fatalSqlWithContext(
    allocator: std.mem.Allocator,
    db: *c.sqlite3,
    errmsg: []const u8,
    writer: *std.Io.Writer,
) noreturn {
    writer.print("error: {s}\n", .{errmsg}) catch |err| {
        std.log.err("failed to write error message: {}", .{err});
    };
    printSqlErrorContext(allocator, db, errmsg, writer);
    writer.flush() catch |err| std.log.err("failed to flush: {}", .{err});
    std.process.exit(exit_sql);
}
//! CSV loader — type inference, header parsing, and loading CSV/TSV into SQLite.

const std = @import("std");
const c = @import("c");
const csv_mod = @import("csv.zig");
const sqlite_mod = @import("sqlite.zig");
const args_mod = @import("args.zig");

const ColumnType = sqlite_mod.ColumnType;
const sqlite_static = sqlite_mod.sqlite_static;

const fatal = sqlite_mod.fatal;
const fatalSqlWithContext = sqlite_mod.fatalSqlWithContext;

/// Number of rows buffered from stdin to infer column types.
pub const inference_buffer_size: usize = 100;

/// Number of rows between progress indicator updates.
pub const progress_interval: usize = 10_000;

/// stripQuotes(raw) → []const u8
/// Pre: raw is a valid UTF-8 slice
/// Post: if raw = '"' ++ inner ++ '"' => result = inner
///       otherwise => result = raw
/// Note: RFC 4180 quoted-field unescaping is handled by csv.zig; this function
/// provides an explicit, single-location implementation for any residual
/// direct string handling that bypasses the CSV parser.
/// NOTE(review): no caller within this module as extracted — confirm it is still
/// referenced after the refactor, or remove it.
fn stripQuotes(raw: []const u8) []const u8 {
    if (raw.len >= 2 and raw[0] == '"' and raw[raw.len - 1] == '"')
        return raw[1 .. raw.len - 1];
    return raw;
}

/// isInteger(val) → bool
/// Pre: val is a valid UTF-8 slice
/// Post: result = val matches [+-]?[0-9]+ (non-empty, only digits after optional sign)
pub fn isInteger(val: []const u8) bool {
    if (val.len == 0) return false;
    var i: usize = 0;
    if (val[0] == '+' or val[0] == '-') i = 1;
    if (i >= val.len) return false; // sign only → not an integer
    // Loop invariant I: val[0..i] is a valid integer prefix (sign + digits)
    // Bounding function: val.len - i
    while (i < val.len) : (i += 1) {
        if (val[i] < '0' or val[i] > '9') return false;
    }
    return true;
}

/// isReal(val) → bool
/// Pre: val is a valid UTF-8 slice
/// Post: result = val is parseable as a 64-bit floating-point number
/// Note: returns true for integers too; callers should check isInteger first
/// for finer classification.
pub fn isReal(val: []const u8) bool {
    if (val.len == 0) return false;
    _ = std.fmt.parseFloat(f64, val) catch return false;
    return true;
}

/// inferTypes(buffer, num_cols, allocator) → []ColumnType
/// Pre: buffer is a slice of rows (each row is a slice of field strings)
///      num_cols > 0; allocator is valid
/// Post: result.len = num_cols
///       result[j] = INTEGER ⟺ all non-empty values in column j are integers
///       result[j] = REAL    ⟺ all non-empty values are numeric but at least one
///                             is not a plain integer
///       result[j] = TEXT    ⟺ at least one non-empty value is non-numeric,
///                             OR no non-empty values exist
/// Caller owns the returned slice; free it with allocator.
pub fn inferTypes(
    allocator: std.mem.Allocator,
    buffer: []const [][]u8,
    num_cols: usize,
) std.mem.Allocator.Error![]ColumnType {
    const types = try allocator.alloc(ColumnType, num_cols);
    errdefer allocator.free(types);

    const can_be_integer = try allocator.alloc(bool, num_cols);
    defer allocator.free(can_be_integer);
    const can_be_real = try allocator.alloc(bool, num_cols);
    defer allocator.free(can_be_real);
    const has_data = try allocator.alloc(bool, num_cols);
    defer allocator.free(has_data);

    // Initialise: optimistically assume every column can be INTEGER
    for (0..num_cols) |j| {
        can_be_integer[j] = true;
        can_be_real[j] = true;
        has_data[j] = false;
    }

    // Loop invariant I: for each j in 0..num_cols,
    //   can_be_integer[j] = true ⟺ all non-empty values in column j seen so far are integers
    //   can_be_real[j] = true ⟺ all non-empty values in column j seen so far are numeric
    //   has_data[j] = true ⟺ at least one non-empty value has been seen in column j
    // Bounding function: buffer.len - row_idx
    for (buffer) |row| {
        for (row, 0..) |val, j| {
            if (j >= num_cols) break; // extra fields beyond the header are ignored here
            if (val.len == 0) continue; // NULL/empty → skip, does not affect inference
            has_data[j] = true;
            if (!can_be_real[j]) continue; // already TEXT, no need to re-check
            if (!isReal(val)) {
                can_be_real[j] = false;
                can_be_integer[j] = false;
            } else if (!isInteger(val)) {
                can_be_integer[j] = false;
            }
        }
    }

    // Determine final type per column
    // Post: types[j] reflects can_be_integer[j] / can_be_real[j] / has_data[j]
    for (0..num_cols) |j| {
        if (has_data[j] and can_be_integer[j]) {
            types[j] = .INTEGER;
        } else if (has_data[j] and can_be_real[j]) {
            types[j] = .REAL;
        } else {
            types[j] = .TEXT;
        }
    }

    return types;
}

/// parseHeader(record, allocator, stderr_writer) → [][]const u8
/// Pre: record is a non-null CSV record (slice of owned UTF-8 field slices)
///      allocator is valid
///      stderr_writer is a valid writer (warnings are best-effort; write errors ignored)
/// Post: result is a non-empty slice of trimmed column names (leading/trailing
///       ASCII whitespace removed); UTF-8 BOM stripped from the first field
///       duplicate names are suffixed (_2, _3, …) and a warning is written to
///       stderr for each rename: `warning: duplicate column "..." renamed to "..."`
///       error.EmptyColumnName when any trimmed name is empty
///       error.NoColumns when record is empty
/// NOTE(review): `seen` only counts base names, so a generated suffix (e.g. "a_2")
/// can still collide with a column literally named "a_2" — confirm acceptable.
/// Note: on BOM strip, record[0] is freed and replaced with a new allocation; the
/// caller's later freeRecord(record) then frees the replacement, which is safe.
pub fn parseHeader(
    allocator: std.mem.Allocator,
    record: [][]u8,
    stderr_writer: *std.Io.Writer,
) (args_mod.SqlPipeError || std.mem.Allocator.Error)![][]const u8 {
    if (record.len == 0) return error.NoColumns;

    // Strip UTF-8 BOM (\xEF\xBB\xBF) from first field if present
    const bom = "\xEF\xBB\xBF";
    if (std.mem.startsWith(u8, record[0], bom)) {
        const without_bom = try allocator.dupe(u8, record[0][bom.len..]);
        allocator.free(record[0]);
        record[0] = without_bom;
    }

    var cols: std.ArrayList([]const u8) = .empty;
    errdefer {
        for (cols.items) |col| allocator.free(col);
        cols.deinit(allocator);
    }

    // seen: maps a column name to the number of times it has appeared so far.
    // Pre: seen is empty
    // Post: seen[name] = count of occurrences in record[0..i]
    var seen = std.StringHashMap(usize).init(allocator);
    defer seen.deinit();

    // Loop invariant I: cols contains trimmed, non-empty (possibly suffixed) names for record[0..i]
    //                   seen maps each base name to its occurrence count up to i
    //                   all items in cols are heap-allocated (owned by allocator)
    // Bounding function: record.len - i (natural, decreasing, lower-bounded by 0)
    for (record) |field| {
        const base = std.mem.trim(u8, field, " \t\r");
        if (base.len == 0) return error.EmptyColumnName;

        const count = (seen.get(base) orelse 0) + 1;
        try seen.put(base, count);

        const col: []const u8 = if (count == 1)
            try allocator.dupe(u8, base)
        else blk: {
            const renamed = try std.fmt.allocPrint(allocator, "{s}_{d}", .{ base, count });
            // Best-effort warning to stderr; write errors are silently ignored
            stderr_writer.print("warning: duplicate column \"{s}\" renamed to \"{s}\"\n", .{ base, renamed }) catch |err| {
                std.log.err("failed to write warning: {}", .{err});
            };
            break :blk renamed;
        };

        try cols.append(allocator, col);
    }

    return cols.toOwnedSlice(allocator);
}

/// insertRowTyped(stmt, db, row, types, param_count) → void
/// Pre: stmt is a prepared INSERT with param_count parameters, freshly reset
///      row is a non-empty CSV record (slice of field slices)
///      types.len = param_count (or shorter → remaining treated as TEXT)
///      db is the database that owns stmt (used for error reporting by caller)
/// Post: each field is bound to its parameter using the appropriate SQLite bind
///       function according to types[j]:
///         INTEGER → sqlite3_bind_int64 (fallback: TEXT on parse failure)
///         REAL    → sqlite3_bind_double (fallback: TEXT on parse failure)
///         TEXT    → sqlite3_bind_text
///         empty / missing values → sqlite3_bind_null
///       sqlite3_step returned SQLITE_DONE
///       error.BindFailed / error.StepFailed on SQLite errors
pub fn insertRowTyped(
    stmt: *c.sqlite3_stmt,
    db: *c.sqlite3,
    row: [][]u8,
    types: []const ColumnType,
    param_count: c_int,
) args_mod.SqlPipeError!void {
    // db is kept in the signature so callers can report errors against the
    // owning connection; it is intentionally unused inside this function.
    _ = db;

    _ = c.sqlite3_reset(stmt);
    _ = c.sqlite3_clear_bindings(stmt);

    var col_idx: c_int = 1;

    // Loop invariant I: row[0..col_idx-1] are bound to params 1..col_idx-1
    //                   using the appropriate SQLite bind function for each column type.
    // Bounding function: row.len + 1 - col_idx (decreasing toward 0)
    for (row) |val| {
        if (col_idx > param_count) break; // extra fields beyond the header are dropped
        const j: usize = @intCast(col_idx - 1);
        const col_type: ColumnType = if (j < types.len) types[j] else .TEXT;

        if (val.len == 0) {
            // Empty / NULL value → bind as SQL NULL regardless of column type
            if (c.sqlite3_bind_null(stmt, col_idx) != c.SQLITE_OK)
                return error.BindFailed;
        } else switch (col_type) {
            .INTEGER => {
                if (std.fmt.parseInt(i64, val, 10)) |n| {
                    if (c.sqlite3_bind_int64(stmt, col_idx, n) != c.SQLITE_OK)
                        return error.BindFailed;
                } else |_| {
                    // Parse failure: fall back to text binding
                    if (c.sqlite3_bind_text(stmt, col_idx, val.ptr, @intCast(val.len), sqlite_static) != c.SQLITE_OK)
                        return error.BindFailed;
                }
            },
            .REAL => {
                if (std.fmt.parseFloat(f64, val)) |f| {
                    if (c.sqlite3_bind_double(stmt, col_idx, f) != c.SQLITE_OK)
                        return error.BindFailed;
                } else |_| {
                    if (c.sqlite3_bind_text(stmt, col_idx, val.ptr, @intCast(val.len), sqlite_static) != c.SQLITE_OK)
                        return error.BindFailed;
                }
            },
            .TEXT => {
                if (c.sqlite3_bind_text(stmt, col_idx, val.ptr, @intCast(val.len), sqlite_static) != c.SQLITE_OK)
                    return error.BindFailed;
            },
        }
        col_idx += 1;
    }

    // Bind NULL for any trailing columns the row is short of
    // Loop invariant: params 1..col_idx-1 are bound; col_idx..param_count become NULL
    while (col_idx <= param_count) : (col_idx += 1) {
        if (c.sqlite3_bind_null(stmt, col_idx) != c.SQLITE_OK)
            return error.BindFailed;
    }

    if (c.sqlite3_step(stmt) != c.SQLITE_DONE) return error.StepFailed;
}

/// fmtThousands(buf, n) → []const u8
/// Pre: buf.len >= 26 (accommodates any usize value with thousands separators)
/// Post: n is formatted as a decimal string with ',' separating each group of
///       three digits from the right (e.g. 42317 → "42,317", 1000 → "1,000")
pub fn fmtThousands(buf: []u8, n: usize) []const u8 {
    var tmp: [32]u8 = undefined; // 20 digits max (u64) + safety margin
    const digits = std.fmt.bufPrint(&tmp, "{d}", .{n}) catch unreachable;
    const len = digits.len;
    const first_group = len % 3; // digits in the leading group (0 means groups of 3 from start)
    var out_len: usize = 0;
    // Loop invariant I: buf[0..out_len] = formatted prefix of digits[0..i]
    //                   commas inserted before every third digit counted from the right
    // Bounding function: len - i
    for (digits, 0..) |ch, i| {
        if ((i > 0 and i == first_group) or
            (i > first_group and (i - first_group) % 3 == 0))
        {
            buf[out_len] = ',';
            out_len += 1;
        }
        buf[out_len] = ch;
        out_len += 1;
    }
    return buf[0..out_len];
}

/// printProgress(writer, n, max_rows) → void
/// Pre: writer is stderr; n > 0
/// Post: "Loading... <n> rows\r" (or "Loading... <n> / <max> rows\r" when max_rows is set)
///       written to writer with carriage return for in-place update; flushed immediately
pub fn printProgress(writer: *std.Io.Writer, n: usize, max_rows: ?usize) void {
    var count_buf: [32]u8 = undefined;
    const count_str = fmtThousands(&count_buf, n);
    if (max_rows) |limit| {
        var limit_buf: [32]u8 = undefined;
        const limit_str = fmtThousands(&limit_buf, limit);
        writer.print("Loading... {s} / {s} rows\r", .{ count_str, limit_str }) catch |err| {
            std.log.err("failed to write progress: {}", .{err});
        };
    } else {
        writer.print("Loading... {s} rows\r", .{count_str}) catch |err| {
            std.log.err("failed to write progress: {}", .{err});
        };
    }
    writer.flush() catch |err| std.log.err("failed to flush progress: {}", .{err});
}

/// loadCsvInput loads all CSV rows from stdin into db table `t`.
/// Pre: db is an open in-memory SQLite handle with no tables yet
///      parsed.delimiter is valid; allocator and writers are valid
/// Post: table `t` exists in db with columns inferred from the CSV header;
///       all CSV rows have been inserted; transaction has been committed
///       returns rows_inserted (data rows only, header not counted)
///       on error: writes message to stderr_writer and exits with appropriate code
pub fn loadCsvInput(
    allocator: std.mem.Allocator,
    io: std.Io,
    db: *c.sqlite3,
    parsed: args_mod.ParsedArgs,
    stderr_writer: *std.Io.Writer,
) usize {
    var stdin_buf: [4096]u8 = undefined;
    var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf);
    var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, parsed.delimiter);

    const header_record = csv_reader.nextRecord() catch |err| switch (err) {
        error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, sqlite_mod.exit_parse, .{}),
        else => fatal("row 1: failed to parse CSV header", stderr_writer, sqlite_mod.exit_parse, .{}),
    } orelse fatal("empty input (no header row)", stderr_writer, sqlite_mod.exit_parse, .{});
    defer csv_reader.freeRecord(header_record);

    const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) {
        error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, sqlite_mod.exit_parse, .{}),
        error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, sqlite_mod.exit_parse, .{}),
        else => fatal("row 1: failed to parse header", stderr_writer, sqlite_mod.exit_parse, .{}),
    };
    defer {
        for (cols) |col| allocator.free(col);
        allocator.free(cols);
    }

    const num_cols = cols.len;
    var csv_row_count: usize = 1; // 1 = header already read

    // ─── Phase 1: determine column types ─────────────────────────────────────
    var row_buffer: std.ArrayList([][]u8) = .empty;
    defer {
        for (row_buffer.items) |row| csv_reader.freeRecord(row);
        row_buffer.deinit(allocator);
    }

    const types: []ColumnType = if (parsed.type_inference) blk: {
        // Buffer up to inference_buffer_size non-empty rows, then infer types.
        while (row_buffer.items.len < inference_buffer_size) {
            const rec = csv_reader.nextRecord() catch |err| switch (err) {
                error.UnterminatedQuotedField => fatal(
                    "row {d}: unterminated quoted field",
                    stderr_writer,
                    sqlite_mod.exit_parse,
                    .{csv_row_count + 1},
                ),
                else => fatal(
                    "row {d}: failed to parse CSV",
                    stderr_writer,
                    sqlite_mod.exit_parse,
                    .{csv_row_count + 1},
                ),
            } orelse break;
            csv_row_count += 1;
            if (rec.len == 0) {
                csv_reader.freeRecord(rec);
                continue;
            }
            row_buffer.append(allocator, rec) catch
                fatal("out of memory while buffering rows", stderr_writer, sqlite_mod.exit_parse, .{});
        }
        break :blk inferTypes(allocator, row_buffer.items, num_cols) catch
            fatal("out of memory during type inference", stderr_writer, sqlite_mod.exit_parse, .{});
    } else blk: {
        // --no-type-inference: every column is TEXT
        const t = allocator.alloc(ColumnType, num_cols) catch
            fatal("out of memory", stderr_writer, sqlite_mod.exit_parse, .{});
        @memset(t, .TEXT);
        break :blk t;
    };
    defer allocator.free(types);

    // ─── Phase 2: create table and insert rows ────────────────────────────────

    sqlite_mod.createTable(allocator, db, cols, types, stderr_writer);

    {
        var errmsg: [*c]u8 = null;
        if (c.sqlite3_exec(db, "BEGIN TRANSACTION", null, null, &errmsg) != c.SQLITE_OK) {
            const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db));
            fatalSqlWithContext(allocator, db, msg, stderr_writer);
        }
    }

    const stmt = sqlite_mod.prepareInsertStmt(allocator, db, num_cols, stderr_writer);
    defer _ = c.sqlite3_finalize(stmt);

    const is_tty = std.Io.File.isTty(std.Io.File.stderr(), io) catch false;
    var rows_inserted: usize = 0;

    // Insert buffered rows
    for (row_buffer.items) |row| {
        rows_inserted += 1;
        if (parsed.max_rows) |limit| {
            if (rows_inserted > limit)
                fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, sqlite_mod.exit_usage, .{limit});
        }
        insertRowTyped(stmt, db, row, types, @intCast(num_cols)) catch
            fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer);
        if (is_tty and rows_inserted % progress_interval == 0)
            printProgress(stderr_writer, rows_inserted, parsed.max_rows);
    }

    // Stream remaining rows from stdin
    while (true) {
        const record = csv_reader.nextRecord() catch |err| switch (err) {
            error.UnterminatedQuotedField => fatal(
                "row {d}: unterminated quoted field",
                stderr_writer,
                sqlite_mod.exit_parse,
                .{csv_row_count + 1},
            ),
            else => fatal(
                "row {d}: failed to parse CSV",
                stderr_writer,
                sqlite_mod.exit_parse,
                .{csv_row_count + 1},
            ),
        } orelse break;
        csv_row_count += 1;
        defer csv_reader.freeRecord(record);

        if (record.len == 0) continue;

        rows_inserted += 1;
        if (parsed.max_rows) |limit| {
            if (rows_inserted > limit)
                fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, sqlite_mod.exit_usage, .{limit});
        }
        insertRowTyped(stmt, db, record, types, @intCast(num_cols)) catch
            fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer);
        if (is_tty and rows_inserted % progress_interval == 0)
            printProgress(stderr_writer, rows_inserted, parsed.max_rows);
    }

    {
        var errmsg: [*c]u8 = null;
        const rc = c.sqlite3_exec(db, "COMMIT", null, null, &errmsg);
        if (rc != c.SQLITE_OK) {
            const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db));
            fatalSqlWithContext(allocator, db, msg, stderr_writer);
        }
        if (errmsg != null) c.sqlite3_free(errmsg);
    }

    return rows_inserted;
}
loader = @import("loader.zig"); + const ColumnType = sqlite_mod.ColumnType; -const sqlite_static = sqlite_mod.sqlite_static; const VERSION: []const u8 = build_options.version; @@ -21,13 +22,14 @@ const ArgsResult = args_mod.ArgsResult; const parseArgs = args_mod.parseArgs; const printUsage = args_mod.printUsage; -// ─── Column type inference ──────────────────────────── - -/// Number of rows buffered from stdin to infer column types. -const inference_buffer_size: usize = 100; - -/// Number of rows between progress indicator updates. -const progress_interval: usize = 10_000; +const inferTypes = loader.inferTypes; +const parseHeader = loader.parseHeader; +const insertRowTyped = loader.insertRowTyped; +const fmtThousands = loader.fmtThousands; +const printProgress = loader.printProgress; +const loadCsvInput = loader.loadCsvInput; +const inference_buffer_size = loader.inference_buffer_size; +const progress_interval = loader.progress_interval; /// Structured exit codes for scripting. /// 0 = success @@ -47,254 +49,6 @@ const InputFormat = format.InputFormat; /// Supported output formats (canonical definition lives in format.zig). const OutputFormat = format.OutputFormat; -/// stripQuotes(raw) → []const u8 -/// Pre: raw is a valid UTF-8 slice -/// Post: if raw = '"' ++ inner ++ '"' => result = inner -/// otherwise => result = raw -/// Note: RFC 4180 quoted-field unescaping is handled by csv.zig; this function -/// provides an explicit, single-location implementation for any residual -/// direct string handling that bypasses the CSV parser. -fn stripQuotes(raw: []const u8) []const u8 { - if (raw.len >= 2 and raw[0] == '"' and raw[raw.len - 1] == '"') - return raw[1 .. 
raw.len - 1]; - return raw; -} - -/// isInteger(val) → bool -/// Pre: val is a valid UTF-8 slice -/// Post: result = val matches [+-]?[0-9]+ (non-empty, only digits after optional sign) -fn isInteger(val: []const u8) bool { - if (val.len == 0) return false; - var i: usize = 0; - if (val[0] == '+' or val[0] == '-') i = 1; - if (i >= val.len) return false; // sign only → not an integer - // Loop invariant I: val[0..i] is a valid integer prefix (sign + digits) - // Bounding function: val.len - i - while (i < val.len) : (i += 1) { - if (val[i] < '0' or val[i] > '9') return false; - } - return true; -} - -/// isReal(val) → bool -/// Pre: val is a valid UTF-8 slice -/// Post: result = val is parseable as a 64-bit floating-point number -/// Note: returns true for integers too; callers should check isInteger first -/// for finer classification. -fn isReal(val: []const u8) bool { - if (val.len == 0) return false; - _ = std.fmt.parseFloat(f64, val) catch return false; - return true; -} - -/// inferTypes(buffer, num_cols, allocator) → []ColumnType -/// Pre: buffer is a slice of rows (each row is a slice of field strings) -/// num_cols > 0; allocator is valid -/// Post: result.len = num_cols -/// result[j] = INTEGER ⟺ all non-empty values in column j are integers -/// result[j] = REAL ⟺ all non-empty values are numeric but at least one -/// is not a plain integer -/// result[j] = TEXT ⟺ at least one non-empty value is non-numeric, -/// OR no non-empty values exist -fn inferTypes( - allocator: std.mem.Allocator, - buffer: []const [][]u8, - num_cols: usize, -) std.mem.Allocator.Error![]ColumnType { - const types = try allocator.alloc(ColumnType, num_cols); - errdefer allocator.free(types); - - const can_be_integer = try allocator.alloc(bool, num_cols); - defer allocator.free(can_be_integer); - const can_be_real = try allocator.alloc(bool, num_cols); - defer allocator.free(can_be_real); - const has_data = try allocator.alloc(bool, num_cols); - defer allocator.free(has_data); - - 
// Initialise: optimistically assume every column can be INTEGER - for (0..num_cols) |j| { - can_be_integer[j] = true; - can_be_real[j] = true; - has_data[j] = false; - } - - // Loop invariant I: for each j in 0..num_cols, - // can_be_integer[j] = true ⟺ all non-empty values in column j seen so far are integers - // can_be_real[j] = true ⟺ all non-empty values in column j seen so far are numeric - // has_data[j] = true ⟺ at least one non-empty value has been seen in column j - // Bounding function: buffer.len - row_idx - for (buffer) |row| { - for (row, 0..) |val, j| { - if (j >= num_cols) break; - if (val.len == 0) continue; // NULL/empty → skip, does not affect inference - has_data[j] = true; - if (!can_be_real[j]) continue; // already TEXT, no need to re-check - if (!isReal(val)) { - can_be_real[j] = false; - can_be_integer[j] = false; - } else if (!isInteger(val)) { - can_be_integer[j] = false; - } - } - } - - // Determine final type per column - // Post: types[j] reflects can_be_integer[j] / can_be_real[j] / has_data[j] - for (0..num_cols) |j| { - if (has_data[j] and can_be_integer[j]) { - types[j] = .INTEGER; - } else if (has_data[j] and can_be_real[j]) { - types[j] = .REAL; - } else { - types[j] = .TEXT; - } - } - - return types; -} - -/// parseHeader(record, allocator, stderr_writer) → [][]const u8 -/// Pre: record is a non-null CSV record (slice of owned UTF-8 field slices) -/// allocator is valid -/// stderr_writer is a valid writer (warnings are best-effort; write errors ignored) -/// Post: result is a non-empty slice of trimmed column names (leading/trailing -/// ASCII whitespace removed); UTF-8 BOM stripped from the first field -/// duplicate names are suffixed (_2, _3, …) and a warning is written to -/// stderr for each rename: `warning: duplicate column "" renamed to ""` -/// error.EmptyColumnName when any trimmed name is empty -/// error.NoColumns when record is empty -fn parseHeader( - allocator: std.mem.Allocator, - record: [][]u8, - 
stderr_writer: *std.Io.Writer, -) (SqlPipeError || std.mem.Allocator.Error)![][]const u8 { - if (record.len == 0) return error.NoColumns; - - // Strip UTF-8 BOM (\xEF\xBB\xBF) from first field if present - const bom = "\xEF\xBB\xBF"; - if (std.mem.startsWith(u8, record[0], bom)) { - const without_bom = try allocator.dupe(u8, record[0][bom.len..]); - allocator.free(record[0]); - record[0] = without_bom; - } - - var cols: std.ArrayList([]const u8) = .empty; - errdefer { - for (cols.items) |col| allocator.free(col); - cols.deinit(allocator); - } - - // seen: maps a column name to the number of times it has appeared so far. - // Pre: seen is empty - // Post: seen[name] = count of occurrences in record[0..i] - var seen = std.StringHashMap(usize).init(allocator); - defer seen.deinit(); - - // Loop invariant I: cols contains trimmed, non-empty (possibly suffixed) names for record[0..i] - // seen maps each base name to its occurrence count up to i - // all items in cols are heap-allocated (owned by allocator) - // Bounding function: record.len - i (natural, decreasing, lower-bounded by 0) - for (record) |field| { - const base = std.mem.trim(u8, field, " \t\r"); - if (base.len == 0) return error.EmptyColumnName; - - const count = (seen.get(base) orelse 0) + 1; - try seen.put(base, count); - - const col: []const u8 = if (count == 1) - try allocator.dupe(u8, base) - else blk: { - const renamed = try std.fmt.allocPrint(allocator, "{s}_{d}", .{ base, count }); - // Best-effort warning to stderr; write errors are silently ignored - stderr_writer.print("warning: duplicate column \"{s}\" renamed to \"{s}\"\n", .{ base, renamed }) catch |err| { - std.log.err("failed to write warning: {}", .{err}); - }; - break :blk renamed; - }; - - try cols.append(allocator, col); - } - - return cols.toOwnedSlice(allocator); -} - -/// insertRowTyped(stmt, db, row, types, param_count) → void -/// Pre: stmt is a prepared INSERT with param_count parameters, freshly reset -/// row is a non-empty CSV 
record (slice of field slices) -/// types.len = param_count (or shorter → remaining treated as TEXT) -/// db is the database that owns stmt (used for error reporting by caller) -/// Post: each field is bound to its parameter using the appropriate SQLite bind -/// function according to types[j]: -/// INTEGER → sqlite3_bind_int64 (fallback: TEXT on parse failure) -/// REAL → sqlite3_bind_double (fallback: TEXT on parse failure) -/// TEXT → sqlite3_bind_text -/// empty / missing values → sqlite3_bind_null -/// sqlite3_step returned SQLITE_DONE -/// error.BindFailed / error.StepFailed on SQLite errors -fn insertRowTyped( - stmt: *c.sqlite3_stmt, - db: *c.sqlite3, - row: [][]u8, - types: []const ColumnType, - param_count: c_int, -) SqlPipeError!void { - _ = db; - - _ = c.sqlite3_reset(stmt); - _ = c.sqlite3_clear_bindings(stmt); - - var col_idx: c_int = 1; - - // Loop invariant I: row[0..col_idx-1] are bound to params 1..col_idx-1 - // using the appropriate SQLite bind function for each column type. 
- // Bounding function: row.len + 1 - col_idx (decreasing toward 0) - for (row) |val| { - if (col_idx > param_count) break; - const j: usize = @intCast(col_idx - 1); - const col_type: ColumnType = if (j < types.len) types[j] else .TEXT; - - if (val.len == 0) { - // Empty / NULL value → bind as SQL NULL regardless of column type - if (c.sqlite3_bind_null(stmt, col_idx) != c.SQLITE_OK) - return error.BindFailed; - } else switch (col_type) { - .INTEGER => { - if (std.fmt.parseInt(i64, val, 10)) |n| { - if (c.sqlite3_bind_int64(stmt, col_idx, n) != c.SQLITE_OK) - return error.BindFailed; - } else |_| { - // Parse failure: fall back to text binding - if (c.sqlite3_bind_text(stmt, col_idx, val.ptr, @intCast(val.len), sqlite_static) != c.SQLITE_OK) - return error.BindFailed; - } - }, - .REAL => { - if (std.fmt.parseFloat(f64, val)) |f| { - if (c.sqlite3_bind_double(stmt, col_idx, f) != c.SQLITE_OK) - return error.BindFailed; - } else |_| { - if (c.sqlite3_bind_text(stmt, col_idx, val.ptr, @intCast(val.len), sqlite_static) != c.SQLITE_OK) - return error.BindFailed; - } - }, - .TEXT => { - if (c.sqlite3_bind_text(stmt, col_idx, val.ptr, @intCast(val.len), sqlite_static) != c.SQLITE_OK) - return error.BindFailed; - }, - } - col_idx += 1; - } - - // Bind NULL for any trailing columns the row is short of - // Loop invariant: params 1..col_idx-1 are bound; col_idx..param_count become NULL - while (col_idx <= param_count) : (col_idx += 1) { - if (c.sqlite3_bind_null(stmt, col_idx) != c.SQLITE_OK) - return error.BindFailed; - } - - if (c.sqlite3_step(stmt) != c.SQLITE_DONE) return error.StepFailed; -} - /// execQuery(db, query, allocator, writer, header, output_format) → !void /// Pre: db is open with table `t` populated /// query is a valid SQL string (not null-terminated) @@ -339,55 +93,6 @@ fn execQuery( try out_writer.end(writer); } -// ─── Entry point ────────────────────────────────────── - -/// fmtThousands(buf, n) → []const u8 -/// Pre: buf.len >= 26 (accommodates any 
usize value with thousands separators) -/// Post: n is formatted as a decimal string with ',' separating each group of -/// three digits from the right (e.g. 42317 → "42,317", 1000 → "1,000") -fn fmtThousands(buf: []u8, n: usize) []const u8 { - var tmp: [32]u8 = undefined; // 20 digits max (u64) + safety margin - const digits = std.fmt.bufPrint(&tmp, "{d}", .{n}) catch unreachable; - const len = digits.len; - const first_group = len % 3; // digits in the leading group (0 means groups of 3 from start) - var out_len: usize = 0; - // Loop invariant I: buf[0..out_len] = formatted prefix of digits[0..i] - // commas inserted before every third digit counted from the right - // Bounding function: len - i - for (digits, 0..) |ch, i| { - if ((i > 0 and i == first_group) or - (i > first_group and (i - first_group) % 3 == 0)) - { - buf[out_len] = ','; - out_len += 1; - } - buf[out_len] = ch; - out_len += 1; - } - return buf[0..out_len]; -} - -/// printProgress(writer, n, max_rows) → void -/// Pre: writer is stderr; n > 0 -/// Post: "Loading... rows\r" (or "Loading... / rows\r" when max_rows is set) -/// written to writer with carriage return for in-place update; flushed immediately -fn printProgress(writer: *std.Io.Writer, n: usize, max_rows: ?usize) void { - var count_buf: [32]u8 = undefined; - const count_str = fmtThousands(&count_buf, n); - if (max_rows) |limit| { - var limit_buf: [32]u8 = undefined; - const limit_str = fmtThousands(&limit_buf, limit); - writer.print("Loading... {s} / {s} rows\r", .{ count_str, limit_str }) catch |err| { - std.log.err("failed to write progress: {}", .{err}); - }; - } else { - writer.print("Loading... 
{s} rows\r", .{count_str}) catch |err| { - std.log.err("failed to write progress: {}", .{err}); - }; - } - writer.flush() catch |err| std.log.err("failed to flush progress: {}", .{err}); -} - /// fatal(writer, code, comptime fmt, args) → noreturn /// Pre: writer is stderr, code is non-zero ExitCode /// Post: "error: \n" written to stderr, process exits with code @@ -399,160 +104,6 @@ fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, args: std.process.exit(@intFromEnum(code)); } -/// loadCsvInput loads all CSV rows from stdin into db table `t`. -/// Pre: db is an open in-memory SQLite handle with no tables yet -/// parsed.delimiter is valid; allocator and writers are valid -/// Post: table `t` exists in db with columns inferred from the CSV header; -/// all CSV rows have been inserted; transaction has been committed -/// returns rows_inserted (data rows only, header not counted) -/// on error: writes message to stderr_writer and exits with appropriate code -fn loadCsvInput( - allocator: std.mem.Allocator, - io: std.Io, - db: *c.sqlite3, - parsed: ParsedArgs, - stderr_writer: *std.Io.Writer, -) usize { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - var csv_reader = csv.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, parsed.delimiter); - - const header_record = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), - else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), - } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); - defer csv_reader.freeRecord(header_record); - - const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) { - error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), - error.NoColumns => 
fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), - else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), - }; - defer { - for (cols) |col| allocator.free(col); - allocator.free(cols); - } - - const num_cols = cols.len; - var csv_row_count: usize = 1; // 1 = header already read - - // ─── Phase 1: determine column types ───────────────────────────────────── - var row_buffer: std.ArrayList([][]u8) = .empty; - defer { - for (row_buffer.items) |row| csv_reader.freeRecord(row); - row_buffer.deinit(allocator); - } - - const types: []ColumnType = if (parsed.type_inference) blk: { - while (row_buffer.items.len < inference_buffer_size) { - const rec = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal( - "row {d}: unterminated quoted field", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - else => fatal( - "row {d}: failed to parse CSV", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - } orelse break; - csv_row_count += 1; - if (rec.len == 0) { - csv_reader.freeRecord(rec); - continue; - } - row_buffer.append(allocator, rec) catch - fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); - } - break :blk inferTypes(allocator, row_buffer.items, num_cols) catch - fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); - } else blk: { - const t = allocator.alloc(ColumnType, num_cols) catch - fatal("out of memory", stderr_writer, .csv_error, .{}); - @memset(t, .TEXT); - break :blk t; - }; - defer allocator.free(types); - - // ─── Phase 2: create table and insert rows ──────────────────────────────── - - sqlite_mod.createTable(allocator, db, cols, types, stderr_writer); - - { - var errmsg: [*c]u8 = null; - if (c.sqlite3_exec(db, "BEGIN TRANSACTION", null, null, &errmsg) != c.SQLITE_OK) { - const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db)); - 
sqlite_mod.fatalSqlWithContext(allocator, db, msg, stderr_writer); - } - } - - const stmt = sqlite_mod.prepareInsertStmt(allocator, db, num_cols, stderr_writer); - defer _ = c.sqlite3_finalize(stmt); - - const is_tty = std.Io.File.isTty(std.Io.File.stderr(), io) catch false; - var rows_inserted: usize = 0; - - // Insert buffered rows - for (row_buffer.items) |row| { - rows_inserted += 1; - if (parsed.max_rows) |limit| { - if (rows_inserted > limit) - fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); - } - insertRowTyped(stmt, db, row, types, @intCast(num_cols)) catch - sqlite_mod.fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); - if (is_tty and rows_inserted % progress_interval == 0) - printProgress(stderr_writer, rows_inserted, parsed.max_rows); - } - - // Stream remaining rows from stdin - while (true) { - const record = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal( - "row {d}: unterminated quoted field", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - else => fatal( - "row {d}: failed to parse CSV", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - } orelse break; - csv_row_count += 1; - defer csv_reader.freeRecord(record); - - if (record.len == 0) continue; - - rows_inserted += 1; - if (parsed.max_rows) |limit| { - if (rows_inserted > limit) - fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); - } - insertRowTyped(stmt, db, record, types, @intCast(num_cols)) catch - sqlite_mod.fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); - if (is_tty and rows_inserted % progress_interval == 0) - printProgress(stderr_writer, rows_inserted, parsed.max_rows); - } - - { - var errmsg: [*c]u8 = null; - const rc = c.sqlite3_exec(db, "COMMIT", null, null, &errmsg); - if (rc != c.SQLITE_OK) { - const msg = if (errmsg != null) std.mem.span(errmsg) else 
std.mem.span(c.sqlite3_errmsg(db)); - sqlite_mod.fatalSqlWithContext(allocator, db, msg, stderr_writer); - } - if (errmsg != null) c.sqlite3_free(errmsg); - } - - return rows_inserted; -} - /// runColumns(allocator, io, args, stderr_writer, stdout_writer) → void /// Pre: args is valid; allocator and writers are valid /// Post: column names from the input header (CSV/JSON/NDJSON) are written to stdout, From f8dc1c33ba1992d560c36572a8bea1d1ef6a290b Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Fri, 8 May 2026 13:25:05 +0200 Subject: [PATCH 5/6] refactor: extract runColumns, runValidate, runSample into src/modes/ --- src/main.zig | 653 +---------------------------------------- src/modes/columns.zig | 204 +++++++++++++ src/modes/sample.zig | 167 +++++++++++ src/modes/validate.zig | 323 ++++++++++++++++++++ 4 files changed, 701 insertions(+), 646 deletions(-) create mode 100644 src/modes/columns.zig create mode 100644 src/modes/sample.zig create mode 100644 src/modes/validate.zig diff --git a/src/main.zig b/src/main.zig index 78d1bd4..2bfd761 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,6 +1,5 @@ const std = @import("std"); const c = @import("c"); -const csv = @import("csv.zig"); const json = @import("json.zig"); const xml = @import("xml.zig"); const format = @import("format.zig"); @@ -9,26 +8,19 @@ const args_mod = @import("args.zig"); const sqlite_mod = @import("sqlite.zig"); const loader = @import("loader.zig"); -const ColumnType = sqlite_mod.ColumnType; +const columns_mode = @import("modes/columns.zig"); +const validate_mode = @import("modes/validate.zig"); +const sample_mode = @import("modes/sample.zig"); const VERSION: []const u8 = build_options.version; const SqlPipeError = args_mod.SqlPipeError; const ParsedArgs = args_mod.ParsedArgs; -const ColumnsArgs = args_mod.ColumnsArgs; -const ValidateArgs = args_mod.ValidateArgs; -const SampleArgs = args_mod.SampleArgs; -const ArgsResult = args_mod.ArgsResult; const parseArgs = args_mod.parseArgs; const 
printUsage = args_mod.printUsage; -const inferTypes = loader.inferTypes; -const parseHeader = loader.parseHeader; -const insertRowTyped = loader.insertRowTyped; -const fmtThousands = loader.fmtThousands; -const printProgress = loader.printProgress; const loadCsvInput = loader.loadCsvInput; -const inference_buffer_size = loader.inference_buffer_size; +const fmtThousands = loader.fmtThousands; const progress_interval = loader.progress_interval; /// Structured exit codes for scripting. @@ -104,637 +96,6 @@ fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, args: std.process.exit(@intFromEnum(code)); } -/// runColumns(allocator, io, args, stderr_writer, stdout_writer) → void -/// Pre: args is valid; allocator and writers are valid -/// Post: column names from the input header (CSV/JSON/NDJSON) are written to stdout, -/// one per line; when args.verbose is true each line has format " " -/// (CSV only — JSON/NDJSON always show TEXT); exits 0 on success, 2 on parse error -fn runColumns( - allocator: std.mem.Allocator, - io: std.Io, - args: ColumnsArgs, - stderr_writer: *std.Io.Writer, - stdout_writer: *std.Io.Writer, -) void { - switch (args.input_format) { - .csv, .tsv => { - const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - var csv_reader = csv.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); - - const header_record = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), - else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), - } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); - defer csv_reader.freeRecord(header_record); - - const cols = parseHeader(allocator, header_record, stderr_writer) catch 
|err| switch (err) { - error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), - error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), - else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), - }; - defer { - for (cols) |col| allocator.free(col); - allocator.free(cols); - } - - if (args.verbose) { - var row_buffer: std.ArrayList([][]u8) = .empty; - defer { - for (row_buffer.items) |row| csv_reader.freeRecord(row); - row_buffer.deinit(allocator); - } - var data_row: usize = 1; - while (row_buffer.items.len < inference_buffer_size) { - data_row += 1; - const rec = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal( - "row {d}: unterminated quoted field", - stderr_writer, - .csv_error, - .{data_row}, - ), - else => fatal("row {d}: failed to parse CSV", stderr_writer, .csv_error, .{data_row}), - } orelse break; - if (rec.len == 0) { - csv_reader.freeRecord(rec); - continue; - } - row_buffer.append(allocator, rec) catch - fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); - } - const types = inferTypes(allocator, row_buffer.items, cols.len) catch - fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); - defer allocator.free(types); - for (cols, types) |col, t| { - stdout_writer.print("{s} {s}\n", .{ col, @tagName(t) }) catch |err| { - std.log.err("failed to write output: {}", .{err}); - }; - } - } else { - for (cols) |col| { - stdout_writer.print("{s}\n", .{col}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - }; - } - } - }, - .json => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - - var buf: std.ArrayList(u8) = .empty; - defer buf.deinit(allocator); - while (true) { - const byte = stdin_file_reader.interface.takeByte() catch |err| switch (err) { - error.EndOfStream => 
break, - error.ReadFailed => fatal("failed to read JSON input", stderr_writer, .csv_error, .{}), - }; - buf.append(allocator, byte) catch fatal("out of memory reading JSON", stderr_writer, .csv_error, .{}); - } - if (buf.items.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); - - var parsed = std.json.parseFromSlice(std.json.Value, allocator, buf.items, .{}) catch - fatal("failed to parse JSON input", stderr_writer, .csv_error, .{}); - defer parsed.deinit(); - - const array = switch (parsed.value) { - .array => |a| a, - else => fatal("JSON input must be an array of objects", stderr_writer, .csv_error, .{}), - }; - if (array.items.len == 0) fatal("empty JSON array: cannot determine column names", stderr_writer, .csv_error, .{}); - - const first_obj = switch (array.items[0]) { - .object => |o| o, - else => fatal("JSON array elements must be objects", stderr_writer, .csv_error, .{}), - }; - - var ki = first_obj.iterator(); - while (ki.next()) |entry| { - if (args.verbose) { - stdout_writer.print("{s} TEXT\n", .{entry.key_ptr.*}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - }; - } else { - stdout_writer.print("{s}\n", .{entry.key_ptr.*}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - }; - } - } - }, - .ndjson => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - - // Read until we find a non-empty line - var line_num: usize = 0; - while (true) { - line_num += 1; - const line = json.readLine(allocator, &stdin_file_reader.interface) catch |err| switch (err) { - error.OutOfMemory => fatal("out of memory reading NDJSON", stderr_writer, .csv_error, .{}), - error.ReadFailed => fatal("line {d}: failed to read NDJSON", stderr_writer, .csv_error, .{line_num}), - } orelse fatal("empty NDJSON input", stderr_writer, .csv_error, .{}); - defer allocator.free(line); - - const trimmed = std.mem.trim(u8, line, " \t\r"); - if (trimmed.len == 0) { 
line_num -= 1; continue; } - - var parsed = std.json.parseFromSlice(std.json.Value, allocator, trimmed, .{}) catch - fatal("line 1: failed to parse NDJSON", stderr_writer, .csv_error, .{}); - defer parsed.deinit(); - - const obj = switch (parsed.value) { - .object => |o| o, - else => fatal("line 1: NDJSON element must be a JSON object", stderr_writer, .csv_error, .{}), - }; - - var ki = obj.iterator(); - while (ki.next()) |entry| { - if (args.verbose) { - stdout_writer.print("{s} TEXT\n", .{entry.key_ptr.*}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - }; - } else { - stdout_writer.print("{s}\n", .{entry.key_ptr.*}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - }; - } - } - break; - } - }, - .xml => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - - const names = xml.getXmlColumnNames(allocator, &stdin_file_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); - defer { - for (names) |name| allocator.free(name); - allocator.free(names); - } - for (names) |name| { - if (args.verbose) { - stdout_writer.print("{s} TEXT\n", .{name}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - }; - } else { - stdout_writer.print("{s}\n", .{name}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - }; - } - } - }, - } -} - -/// runValidate(allocator, io, args, stderr_writer, stdout_writer) → void -/// Pre: args is valid; allocator and writers are valid -/// Post: the entire input has been parsed (CSV, TSV, JSON, or NDJSON); -/// on success prints "OK: rows, columns ( , ...)" to stdout. -/// On parse error, prints the error message to stderr and exits 2. 
-fn runValidate( - allocator: std.mem.Allocator, - io: std.Io, - args: ValidateArgs, - stderr_writer: *std.Io.Writer, - stdout_writer: *std.Io.Writer, -) void { - switch (args.input_format) { - .csv, .tsv => { - const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - var csv_reader = csv.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); - - const header_record = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), - else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), - } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); - defer csv_reader.freeRecord(header_record); - - const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) { - error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), - error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), - else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), - }; - defer { - for (cols) |col| allocator.free(col); - allocator.free(cols); - } - - const num_cols = cols.len; - var csv_row_count: usize = 1; // header already read - var data_row_count: usize = 0; - - var row_buffer: std.ArrayList([][]u8) = .empty; - defer { - for (row_buffer.items) |row| csv_reader.freeRecord(row); - row_buffer.deinit(allocator); - } - - // Buffer up to inference_buffer_size rows for type inference - while (row_buffer.items.len < inference_buffer_size) { - const rec = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal( - "row {d}: unterminated quoted field", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - else => 
fatal( - "row {d}: failed to parse CSV", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - } orelse break; - csv_row_count += 1; - if (rec.len == 0) { - csv_reader.freeRecord(rec); - continue; - } - data_row_count += 1; - row_buffer.append(allocator, rec) catch - fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); - } - - const types: []ColumnType = if (args.type_inference) blk: { - break :blk inferTypes(allocator, row_buffer.items, num_cols) catch - fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); - } else blk: { - const t = allocator.alloc(ColumnType, num_cols) catch - fatal("out of memory", stderr_writer, .csv_error, .{}); - @memset(t, .TEXT); - break :blk t; - }; - defer allocator.free(types); - - // Stream remaining rows and count them - while (true) { - const record = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal( - "row {d}: unterminated quoted field", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - else => fatal( - "row {d}: failed to parse CSV", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - } orelse break; - csv_row_count += 1; - defer csv_reader.freeRecord(record); - if (record.len == 0) continue; - data_row_count += 1; - } - - var count_buf: [32]u8 = undefined; - const count_str = fmtThousands(&count_buf, data_row_count); - - stdout_writer.print("OK: {s} rows, {d} columns (", .{ count_str, num_cols }) catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - - for (cols, types, 0..) 
|col, t, i| { - if (i > 0) { - stdout_writer.writeAll(", ") catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - } - stdout_writer.print("{s} {s}", .{ col, @tagName(t) }) catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - } - stdout_writer.writeAll(")\n") catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - }, - .json => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - - var buf: std.ArrayList(u8) = .empty; - defer buf.deinit(allocator); - while (true) { - const byte = stdin_file_reader.interface.takeByte() catch |err| switch (err) { - error.EndOfStream => break, - error.ReadFailed => fatal("failed to read JSON input", stderr_writer, .csv_error, .{}), - }; - buf.append(allocator, byte) catch fatal("out of memory reading JSON", stderr_writer, .csv_error, .{}); - } - if (buf.items.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); - - var parsed = std.json.parseFromSlice(std.json.Value, allocator, buf.items, .{}) catch - fatal("failed to parse JSON input", stderr_writer, .csv_error, .{}); - defer parsed.deinit(); - - const array = switch (parsed.value) { - .array => |a| a, - else => fatal("JSON input must be an array of objects", stderr_writer, .csv_error, .{}), - }; - if (array.items.len == 0) fatal("empty JSON array: cannot determine column names", stderr_writer, .csv_error, .{}); - - const first_obj = switch (array.items[0]) { - .object => |o| o, - else => fatal("JSON array elements must be objects", stderr_writer, .csv_error, .{}), - }; - - var num_cols: usize = 0; - var ki = first_obj.iterator(); - while (ki.next()) |_| num_cols += 1; - - var count_buf: [32]u8 = undefined; - const count_str = fmtThousands(&count_buf, array.items.len); - stdout_writer.print("OK: 
{s} rows, {d} columns (", .{ count_str, num_cols }) catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - ki = first_obj.iterator(); - var col_i: usize = 0; - while (ki.next()) |entry| : (col_i += 1) { - if (col_i > 0) stdout_writer.writeAll(", ") catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - stdout_writer.print("{s} TEXT", .{entry.key_ptr.*}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - } - stdout_writer.writeAll(")\n") catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - }, - .ndjson => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - - var line_num: usize = 0; - var row_count: usize = 0; - var cols_owned: ?[][]u8 = null; - defer if (cols_owned) |cs| { - for (cs) |col| allocator.free(col); - allocator.free(cs); - }; - - while (true) { - line_num += 1; - const line = json.readLine(allocator, &stdin_file_reader.interface) catch |err| switch (err) { - error.OutOfMemory => fatal("out of memory reading NDJSON", stderr_writer, .csv_error, .{}), - error.ReadFailed => fatal("line {d}: failed to read NDJSON", stderr_writer, .csv_error, .{line_num}), - } orelse break; - defer allocator.free(line); - - const trimmed = std.mem.trim(u8, line, " \t\r"); - if (trimmed.len == 0) { - line_num -= 1; - continue; - } - - var parsed_line = std.json.parseFromSlice(std.json.Value, allocator, trimmed, .{}) catch - fatal("line {d}: failed to parse NDJSON", stderr_writer, .csv_error, .{line_num}); - defer parsed_line.deinit(); - - const obj = switch (parsed_line.value) { - .object => |o| o, - else => fatal("line {d}: NDJSON element must be a JSON object", stderr_writer, .csv_error, .{line_num}), - }; - - if (cols_owned 
== null) { - var col_list: std.ArrayList([]u8) = .empty; - errdefer { - for (col_list.items) |col| allocator.free(col); - col_list.deinit(allocator); - } - var ki = obj.iterator(); - while (ki.next()) |entry| { - const owned_key = allocator.dupe(u8, entry.key_ptr.*) catch - fatal("out of memory building column list", stderr_writer, .csv_error, .{}); - col_list.append(allocator, owned_key) catch - fatal("out of memory building column list", stderr_writer, .csv_error, .{}); - } - if (col_list.items.len == 0) - fatal("line 1: first NDJSON object has no keys", stderr_writer, .csv_error, .{}); - cols_owned = col_list.toOwnedSlice(allocator) catch - fatal("out of memory", stderr_writer, .csv_error, .{}); - } - row_count += 1; - } - - if (cols_owned == null) fatal("empty NDJSON input", stderr_writer, .csv_error, .{}); - - const cols = cols_owned.?; - var count_buf: [32]u8 = undefined; - const count_str = fmtThousands(&count_buf, row_count); - stdout_writer.print("OK: {s} rows, {d} columns (", .{ count_str, cols.len }) catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - for (cols, 0..) 
|col, i| { - if (i > 0) stdout_writer.writeAll(", ") catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - stdout_writer.print("{s} TEXT", .{col}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - } - stdout_writer.writeAll(")\n") catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - }, - .xml => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - - const summary = xml.summarizeXml(allocator, &stdin_file_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); - defer { - for (summary.col_names) |name| allocator.free(name); - allocator.free(summary.col_names); - } - - var count_buf: [32]u8 = undefined; - const count_str = fmtThousands(&count_buf, summary.row_count); - stdout_writer.print("OK: {s} rows, {d} columns (", .{ count_str, summary.col_names.len }) catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - for (summary.col_names, 0..) 
|name, i| { - if (i > 0) stdout_writer.writeAll(", ") catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - stdout_writer.print("{s} TEXT", .{name}) catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - } - stdout_writer.writeAll(")\n") catch |err| { - std.log.err("failed to write output: {}", .{err}); - std.process.exit(@intFromEnum(ExitCode.usage)); - }; - }, - } -} - -/// runSample(allocator, io, args, stderr_writer, stdout_writer) → void -/// Pre: args is valid; allocator and writers are valid; input_format is csv or tsv -/// Post: a schema comment block is written to stderr (column names + inferred types, -/// or all TEXT if args.type_inference is false, each line prefixed with "#") and -/// a header row + first args.n data rows are written to stdout as delimited text. -/// Exits 2 on parse error, 1 on stdout write error. No query required. -fn runSample( - allocator: std.mem.Allocator, - io: std.Io, - args: SampleArgs, - stderr_writer: *std.Io.Writer, - stdout_writer: *std.Io.Writer, -) void { - switch (args.input_format) { - .json, .ndjson, .xml => fatal( - "--sample is only supported with CSV and TSV input", - stderr_writer, - .usage, - .{}, - ), - .csv, .tsv => { - const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - var csv_reader = csv.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); - - const header_record = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), - else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), - } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); - 
defer csv_reader.freeRecord(header_record); - - const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) { - error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), - error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), - else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), - }; - defer { - for (cols) |col| allocator.free(col); - allocator.free(cols); - } - - // Buffer max(inference_buffer_size, n) rows for type inference - const buf_size = @max(inference_buffer_size, args.n); - var row_buffer: std.ArrayList([][]u8) = .empty; - defer { - for (row_buffer.items) |row| csv_reader.freeRecord(row); - row_buffer.deinit(allocator); - } - - var csv_row_count: usize = 1; - // Loop invariant I: row_buffer contains all non-empty data rows read so far (up to buf_size) - // Bounding function: buf_size - row_buffer.items.len - while (row_buffer.items.len < buf_size) { - const rec = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal( - "row {d}: unterminated quoted field", - stderr_writer, - .csv_error, - .{csv_row_count + 1}, - ), - else => fatal("row {d}: failed to parse CSV", stderr_writer, .csv_error, .{csv_row_count + 1}), - } orelse break; - csv_row_count += 1; - if (rec.len == 0) { - csv_reader.freeRecord(rec); - continue; - } - row_buffer.append(allocator, rec) catch - fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); - } - - const types: []ColumnType = if (args.type_inference) blk: { - break :blk inferTypes(allocator, row_buffer.items, cols.len) catch - fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); - } else blk: { - const t = allocator.alloc(ColumnType, cols.len) catch - fatal("out of memory", stderr_writer, .csv_error, .{}); - @memset(t, .TEXT); - break :blk t; - }; - defer allocator.free(types); - - // ─── Print schema block 
to stderr ───────────────────────────────────── - // Compute max column name width for aligned output - var max_col_width: usize = 0; - for (cols) |col| max_col_width = @max(max_col_width, col.len); - - stderr_writer.print("# Schema ({d} columns):\n", .{cols.len}) catch |err| { - std.log.err("failed to write schema: {}", .{err}); - }; - // Loop invariant I: cols[0..i] have been printed with aligned type annotation - // Bounding function: cols.len - i - for (cols, types) |col, t| { - stderr_writer.writeAll("# ") catch |err| { - std.log.err("failed to write schema: {}", .{err}); - }; - stderr_writer.writeAll(col) catch |err| { - std.log.err("failed to write schema: {}", .{err}); - }; - // Pad to max_col_width + 2 spaces before the type - var p: usize = col.len; - while (p < max_col_width + 2) : (p += 1) { - stderr_writer.writeByte(' ') catch |err| { - std.log.err("failed to write schema: {}", .{err}); - }; - } - stderr_writer.print("{s}\n", .{@tagName(t)}) catch |err| { - std.log.err("failed to write schema: {}", .{err}); - }; - } - stderr_writer.flush() catch |err| std.log.err("failed to flush stderr: {}", .{err}); - - // ─── Print header row to stdout ──────────────────────────────────────── - // Loop invariant I: cols[0..i] names have been written, separated by col_delim - // Bounding function: cols.len - i - for (cols, 0..) 
|col, i| { - if (i > 0) stdout_writer.writeAll(col_delim) catch - fatal("failed to write header", stderr_writer, .csv_error, .{}); - format.writeField(stdout_writer, col, col_delim) catch - fatal("failed to write header", stderr_writer, .csv_error, .{}); - } - stdout_writer.writeByte('\n') catch - fatal("failed to write header newline", stderr_writer, .csv_error, .{}); - - // ─── Print first n data rows to stdout ──────────────────────────────── - const rows_to_print = @min(args.n, row_buffer.items.len); - // Loop invariant I: row_buffer[0..r] have been printed as delimited rows - // Bounding function: rows_to_print - r - for (row_buffer.items[0..rows_to_print]) |row| { - var col_idx: usize = 0; - // Loop invariant I: cols[0..col_idx] fields have been written for this row - // Bounding function: cols.len - col_idx - while (col_idx < cols.len) : (col_idx += 1) { - if (col_idx > 0) stdout_writer.writeAll(col_delim) catch - fatal("failed to write field separator", stderr_writer, .csv_error, .{}); - const val: []const u8 = if (col_idx < row.len) row[col_idx] else ""; - format.writeField(stdout_writer, val, col_delim) catch - fatal("failed to write field", stderr_writer, .csv_error, .{}); - } - stdout_writer.writeByte('\n') catch - fatal("failed to write row newline", stderr_writer, .csv_error, .{}); - } - }, - } -} - /// run(allocator, io, parsed, stderr_writer, stdout_writer) → void /// Pre: parsed contains a valid query; allocator and writers are valid /// Post: input from stdin has been loaded (dispatched on parsed.input_format), @@ -986,7 +347,7 @@ pub fn main(init: std.process.Init.Minimal) void { std.process.exit(@intFromEnum(ExitCode.success)); }, .columns => |col_args| { - runColumns(allocator, io.io(), col_args, stderr_writer, stdout_writer); + columns_mode.runColumns(allocator, io.io(), col_args, stderr_writer, stdout_writer); stdout_file_writer.flush() catch |err| { std.log.err("failed to flush stdout: {}", .{err}); }; @@ -995,7 +356,7 @@ pub fn main(init: 
std.process.Init.Minimal) void { }; }, .validate => |val_args| { - runValidate(allocator, io.io(), val_args, stderr_writer, stdout_writer); + validate_mode.runValidate(allocator, io.io(), val_args, stderr_writer, stdout_writer); stdout_file_writer.flush() catch |err| { std.log.err("failed to flush stdout: {}", .{err}); }; @@ -1004,7 +365,7 @@ pub fn main(init: std.process.Init.Minimal) void { }; }, .sample => |sample_args| { - runSample(allocator, io.io(), sample_args, stderr_writer, stdout_writer); + sample_mode.runSample(allocator, io.io(), sample_args, stderr_writer, stdout_writer); stdout_file_writer.flush() catch |err| { std.log.err("failed to flush stdout: {}", .{err}); }; diff --git a/src/modes/columns.zig b/src/modes/columns.zig new file mode 100644 index 0000000..2d3e199 --- /dev/null +++ b/src/modes/columns.zig @@ -0,0 +1,204 @@ +const std = @import("std"); +const csv_mod = @import("../csv.zig"); +const json_mod = @import("../json.zig"); +const xml_mod = @import("../xml.zig"); +const loader = @import("../loader.zig"); +const args_mod = @import("../args.zig"); + +const inferTypes = loader.inferTypes; +const parseHeader = loader.parseHeader; +const inference_buffer_size = loader.inference_buffer_size; + +const ExitCode = enum(u8) { + success = 0, + usage = 1, + csv_error = 2, + sql_error = 3, +}; + +fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, f_args: anytype) noreturn { + writer.print("error: " ++ fmt ++ "\n", f_args) catch |err| { + std.log.err("failed to write error message: {}", .{err}); + }; + writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); + std.process.exit(@intFromEnum(code)); +} + +pub fn runColumns( + allocator: std.mem.Allocator, + io: std.Io, + args: args_mod.ColumnsArgs, + stderr_writer: *std.Io.Writer, + stdout_writer: *std.Io.Writer, +) void { + switch (args.input_format) { + .csv, .tsv => { + const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; + var 
stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); + + const header_record = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), + else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), + } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); + defer csv_reader.freeRecord(header_record); + + const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) { + error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), + error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), + else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), + }; + defer { + for (cols) |col| allocator.free(col); + allocator.free(cols); + } + + if (args.verbose) { + var row_buffer: std.ArrayList([][]u8) = .empty; + defer { + for (row_buffer.items) |row| csv_reader.freeRecord(row); + row_buffer.deinit(allocator); + } + var data_row: usize = 1; + while (row_buffer.items.len < inference_buffer_size) { + data_row += 1; + const rec = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal( + "row {d}: unterminated quoted field", + stderr_writer, + .csv_error, + .{data_row}, + ), + else => fatal("row {d}: failed to parse CSV", stderr_writer, .csv_error, .{data_row}), + } orelse break; + if (rec.len == 0) { + csv_reader.freeRecord(rec); + continue; + } + row_buffer.append(allocator, rec) catch + fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); + } + const types = inferTypes(allocator, row_buffer.items, cols.len) catch + fatal("out of memory during type inference", stderr_writer, 
.csv_error, .{}); + defer allocator.free(types); + for (cols, types) |col, t| { + stdout_writer.print("{s} {s}\n", .{ col, @tagName(t) }) catch |err| { + std.log.err("failed to write output: {}", .{err}); + }; + } + } else { + for (cols) |col| { + stdout_writer.print("{s}\n", .{col}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + }; + } + } + }, + .json => { + var stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + + var buf: std.ArrayList(u8) = .empty; + defer buf.deinit(allocator); + while (true) { + const byte = stdin_file_reader.interface.takeByte() catch |err| switch (err) { + error.EndOfStream => break, + error.ReadFailed => fatal("failed to read JSON input", stderr_writer, .csv_error, .{}), + }; + buf.append(allocator, byte) catch fatal("out of memory reading JSON", stderr_writer, .csv_error, .{}); + } + if (buf.items.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); + + var parsed = std.json.parseFromSlice(std.json.Value, allocator, buf.items, .{}) catch + fatal("failed to parse JSON input", stderr_writer, .csv_error, .{}); + defer parsed.deinit(); + + const array = switch (parsed.value) { + .array => |a| a, + else => fatal("JSON input must be an array of objects", stderr_writer, .csv_error, .{}), + }; + if (array.items.len == 0) fatal("empty JSON array: cannot determine column names", stderr_writer, .csv_error, .{}); + + const first_obj = switch (array.items[0]) { + .object => |o| o, + else => fatal("JSON array elements must be objects", stderr_writer, .csv_error, .{}), + }; + + var ki = first_obj.iterator(); + while (ki.next()) |entry| { + if (args.verbose) { + stdout_writer.print("{s} TEXT\n", .{entry.key_ptr.*}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + }; + } else { + stdout_writer.print("{s}\n", .{entry.key_ptr.*}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + }; + } + } + }, + .ndjson => { + var 
stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + + // Read until we find a non-empty line + var line_num: usize = 0; + while (true) { + line_num += 1; + const line = json_mod.readLine(allocator, &stdin_file_reader.interface) catch |err| switch (err) { + error.OutOfMemory => fatal("out of memory reading NDJSON", stderr_writer, .csv_error, .{}), + error.ReadFailed => fatal("line {d}: failed to read NDJSON", stderr_writer, .csv_error, .{line_num}), + } orelse fatal("empty NDJSON input", stderr_writer, .csv_error, .{}); + defer allocator.free(line); + + const trimmed = std.mem.trim(u8, line, " \t\r"); + if (trimmed.len == 0) { line_num -= 1; continue; } + + var parsed = std.json.parseFromSlice(std.json.Value, allocator, trimmed, .{}) catch + fatal("line 1: failed to parse NDJSON", stderr_writer, .csv_error, .{}); + defer parsed.deinit(); + + const obj = switch (parsed.value) { + .object => |o| o, + else => fatal("line 1: NDJSON element must be a JSON object", stderr_writer, .csv_error, .{}), + }; + + var ki = obj.iterator(); + while (ki.next()) |entry| { + if (args.verbose) { + stdout_writer.print("{s} TEXT\n", .{entry.key_ptr.*}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + }; + } else { + stdout_writer.print("{s}\n", .{entry.key_ptr.*}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + }; + } + } + break; + } + }, + .xml => { + var stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + + const names = xml_mod.getXmlColumnNames(allocator, &stdin_file_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); + defer { + for (names) |name| allocator.free(name); + allocator.free(names); + } + for (names) |name| { + if (args.verbose) { + stdout_writer.print("{s} TEXT\n", .{name}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + }; + } else { + 
stdout_writer.print("{s}\n", .{name}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + }; + } + } + }, + } +} diff --git a/src/modes/sample.zig b/src/modes/sample.zig new file mode 100644 index 0000000..084a270 --- /dev/null +++ b/src/modes/sample.zig @@ -0,0 +1,167 @@ +const std = @import("std"); +const csv_mod = @import("../csv.zig"); +const sqlite_mod = @import("../sqlite.zig"); +const loader = @import("../loader.zig"); +const args_mod = @import("../args.zig"); +const format = @import("../format.zig"); + +const ColumnType = sqlite_mod.ColumnType; +const inferTypes = loader.inferTypes; +const parseHeader = loader.parseHeader; +const inference_buffer_size = loader.inference_buffer_size; + +const ExitCode = enum(u8) { + success = 0, + usage = 1, + csv_error = 2, + sql_error = 3, +}; + +fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, f_args: anytype) noreturn { + writer.print("error: " ++ fmt ++ "\n", f_args) catch |err| { + std.log.err("failed to write error message: {}", .{err}); + }; + writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); + std.process.exit(@intFromEnum(code)); +} + +pub fn runSample( + allocator: std.mem.Allocator, + io: std.Io, + args: args_mod.SampleArgs, + stderr_writer: *std.Io.Writer, + stdout_writer: *std.Io.Writer, +) void { + switch (args.input_format) { + .json, .ndjson, .xml => fatal( + "--sample is only supported with CSV and TSV input", + stderr_writer, + .usage, + .{}, + ), + .csv, .tsv => { + const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; + var stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); + + const header_record = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, 
.csv_error, .{}), + else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), + } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); + defer csv_reader.freeRecord(header_record); + + const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) { + error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), + error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), + else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), + }; + defer { + for (cols) |col| allocator.free(col); + allocator.free(cols); + } + + // Buffer max(inference_buffer_size, n) rows for type inference + const buf_size = @max(inference_buffer_size, args.n); + var row_buffer: std.ArrayList([][]u8) = .empty; + defer { + for (row_buffer.items) |row| csv_reader.freeRecord(row); + row_buffer.deinit(allocator); + } + + var csv_row_count: usize = 1; + // Loop invariant I: row_buffer contains all non-empty data rows read so far (up to buf_size) + // Bounding function: buf_size - row_buffer.items.len + while (row_buffer.items.len < buf_size) { + const rec = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal( + "row {d}: unterminated quoted field", + stderr_writer, + .csv_error, + .{csv_row_count + 1}, + ), + else => fatal("row {d}: failed to parse CSV", stderr_writer, .csv_error, .{csv_row_count + 1}), + } orelse break; + csv_row_count += 1; + if (rec.len == 0) { + csv_reader.freeRecord(rec); + continue; + } + row_buffer.append(allocator, rec) catch + fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); + } + + const types: []ColumnType = if (args.type_inference) blk: { + break :blk inferTypes(allocator, row_buffer.items, cols.len) catch + fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); + } else blk: { + const t = 
allocator.alloc(ColumnType, cols.len) catch + fatal("out of memory", stderr_writer, .csv_error, .{}); + @memset(t, .TEXT); + break :blk t; + }; + defer allocator.free(types); + + // ─── Print schema block to stderr ───────────────────────────────────── + // Compute max column name width for aligned output + var max_col_width: usize = 0; + for (cols) |col| max_col_width = @max(max_col_width, col.len); + + stderr_writer.print("# Schema ({d} columns):\n", .{cols.len}) catch |err| { + std.log.err("failed to write schema: {}", .{err}); + }; + // Loop invariant I: cols[0..i] have been printed with aligned type annotation + // Bounding function: cols.len - i + for (cols, types) |col, t| { + stderr_writer.writeAll("# ") catch |err| { + std.log.err("failed to write schema: {}", .{err}); + }; + stderr_writer.writeAll(col) catch |err| { + std.log.err("failed to write schema: {}", .{err}); + }; + // Pad to max_col_width + 2 spaces before the type + var p: usize = col.len; + while (p < max_col_width + 2) : (p += 1) { + stderr_writer.writeByte(' ') catch |err| { + std.log.err("failed to write schema: {}", .{err}); + }; + } + stderr_writer.print("{s}\n", .{@tagName(t)}) catch |err| { + std.log.err("failed to write schema: {}", .{err}); + }; + } + stderr_writer.flush() catch |err| std.log.err("failed to flush stderr: {}", .{err}); + + // ─── Print header row to stdout ──────────────────────────────────────── + // Loop invariant I: cols[0..i] names have been written, separated by col_delim + // Bounding function: cols.len - i + for (cols, 0..) 
|col, i| { + if (i > 0) stdout_writer.writeAll(col_delim) catch + fatal("failed to write header", stderr_writer, .csv_error, .{}); + format.writeField(stdout_writer, col, col_delim) catch + fatal("failed to write header", stderr_writer, .csv_error, .{}); + } + stdout_writer.writeByte('\n') catch + fatal("failed to write header newline", stderr_writer, .csv_error, .{}); + + // ─── Print first n data rows to stdout ──────────────────────────────── + const rows_to_print = @min(args.n, row_buffer.items.len); + // Loop invariant I: row_buffer[0..r] have been printed as delimited rows + // Bounding function: rows_to_print - r + for (row_buffer.items[0..rows_to_print]) |row| { + var col_idx: usize = 0; + // Loop invariant I: cols[0..col_idx] fields have been written for this row + // Bounding function: cols.len - col_idx + while (col_idx < cols.len) : (col_idx += 1) { + if (col_idx > 0) stdout_writer.writeAll(col_delim) catch + fatal("failed to write field separator", stderr_writer, .csv_error, .{}); + const val: []const u8 = if (col_idx < row.len) row[col_idx] else ""; + format.writeField(stdout_writer, val, col_delim) catch + fatal("failed to write field", stderr_writer, .csv_error, .{}); + } + stdout_writer.writeByte('\n') catch + fatal("failed to write row newline", stderr_writer, .csv_error, .{}); + } + }, + } +} diff --git a/src/modes/validate.zig b/src/modes/validate.zig new file mode 100644 index 0000000..6926328 --- /dev/null +++ b/src/modes/validate.zig @@ -0,0 +1,323 @@ +const std = @import("std"); +const csv_mod = @import("../csv.zig"); +const json_mod = @import("../json.zig"); +const xml_mod = @import("../xml.zig"); +const sqlite_mod = @import("../sqlite.zig"); +const loader = @import("../loader.zig"); +const args_mod = @import("../args.zig"); + +const ColumnType = sqlite_mod.ColumnType; +const inferTypes = loader.inferTypes; +const parseHeader = loader.parseHeader; +const fmtThousands = loader.fmtThousands; +const inference_buffer_size = 
loader.inference_buffer_size; + +const ExitCode = enum(u8) { + success = 0, + usage = 1, + csv_error = 2, + sql_error = 3, +}; + +fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, f_args: anytype) noreturn { + writer.print("error: " ++ fmt ++ "\n", f_args) catch |err| { + std.log.err("failed to write error message: {}", .{err}); + }; + writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); + std.process.exit(@intFromEnum(code)); +} + +pub fn runValidate( + allocator: std.mem.Allocator, + io: std.Io, + args: args_mod.ValidateArgs, + stderr_writer: *std.Io.Writer, + stdout_writer: *std.Io.Writer, +) void { + switch (args.input_format) { + .csv, .tsv => { + const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; + var stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); + + const header_record = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), + else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), + } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); + defer csv_reader.freeRecord(header_record); + + const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) { + error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), + error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), + else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), + }; + defer { + for (cols) |col| allocator.free(col); + allocator.free(cols); + } + + const num_cols = cols.len; + var csv_row_count: usize = 1; // header already read + var data_row_count: usize = 0; + 
+ var row_buffer: std.ArrayList([][]u8) = .empty; + defer { + for (row_buffer.items) |row| csv_reader.freeRecord(row); + row_buffer.deinit(allocator); + } + + // Buffer up to inference_buffer_size rows for type inference + while (row_buffer.items.len < inference_buffer_size) { + const rec = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal( + "row {d}: unterminated quoted field", + stderr_writer, + .csv_error, + .{csv_row_count + 1}, + ), + else => fatal( + "row {d}: failed to parse CSV", + stderr_writer, + .csv_error, + .{csv_row_count + 1}, + ), + } orelse break; + csv_row_count += 1; + if (rec.len == 0) { + csv_reader.freeRecord(rec); + continue; + } + data_row_count += 1; + row_buffer.append(allocator, rec) catch + fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); + } + + const types: []ColumnType = if (args.type_inference) blk: { + break :blk inferTypes(allocator, row_buffer.items, num_cols) catch + fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); + } else blk: { + const t = allocator.alloc(ColumnType, num_cols) catch + fatal("out of memory", stderr_writer, .csv_error, .{}); + @memset(t, .TEXT); + break :blk t; + }; + defer allocator.free(types); + + // Stream remaining rows and count them + while (true) { + const record = csv_reader.nextRecord() catch |err| switch (err) { + error.UnterminatedQuotedField => fatal( + "row {d}: unterminated quoted field", + stderr_writer, + .csv_error, + .{csv_row_count + 1}, + ), + else => fatal( + "row {d}: failed to parse CSV", + stderr_writer, + .csv_error, + .{csv_row_count + 1}, + ), + } orelse break; + csv_row_count += 1; + defer csv_reader.freeRecord(record); + if (record.len == 0) continue; + data_row_count += 1; + } + + var count_buf: [32]u8 = undefined; + const count_str = fmtThousands(&count_buf, data_row_count); + + stdout_writer.print("OK: {s} rows, {d} columns (", .{ count_str, num_cols }) catch |err| { + 
std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + + for (cols, types, 0..) |col, t, i| { + if (i > 0) { + stdout_writer.writeAll(", ") catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + } + stdout_writer.print("{s} {s}", .{ col, @tagName(t) }) catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + } + stdout_writer.writeAll(")\n") catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + }, + .json => { + var stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + + var buf: std.ArrayList(u8) = .empty; + defer buf.deinit(allocator); + while (true) { + const byte = stdin_file_reader.interface.takeByte() catch |err| switch (err) { + error.EndOfStream => break, + error.ReadFailed => fatal("failed to read JSON input", stderr_writer, .csv_error, .{}), + }; + buf.append(allocator, byte) catch fatal("out of memory reading JSON", stderr_writer, .csv_error, .{}); + } + if (buf.items.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); + + var parsed = std.json.parseFromSlice(std.json.Value, allocator, buf.items, .{}) catch + fatal("failed to parse JSON input", stderr_writer, .csv_error, .{}); + defer parsed.deinit(); + + const array = switch (parsed.value) { + .array => |a| a, + else => fatal("JSON input must be an array of objects", stderr_writer, .csv_error, .{}), + }; + if (array.items.len == 0) fatal("empty JSON array: cannot determine column names", stderr_writer, .csv_error, .{}); + + const first_obj = switch (array.items[0]) { + .object => |o| o, + else => fatal("JSON array elements must be objects", stderr_writer, .csv_error, .{}), + }; + + var num_cols: usize = 0; + var ki = first_obj.iterator(); + while (ki.next()) |_| num_cols += 
1; + + var count_buf: [32]u8 = undefined; + const count_str = fmtThousands(&count_buf, array.items.len); + stdout_writer.print("OK: {s} rows, {d} columns (", .{ count_str, num_cols }) catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + ki = first_obj.iterator(); + var col_i: usize = 0; + while (ki.next()) |entry| : (col_i += 1) { + if (col_i > 0) stdout_writer.writeAll(", ") catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + stdout_writer.print("{s} TEXT", .{entry.key_ptr.*}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + } + stdout_writer.writeAll(")\n") catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + }, + .ndjson => { + var stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + + var line_num: usize = 0; + var row_count: usize = 0; + var cols_owned: ?[][]u8 = null; + defer if (cols_owned) |cs| { + for (cs) |col| allocator.free(col); + allocator.free(cs); + }; + + while (true) { + line_num += 1; + const line = json_mod.readLine(allocator, &stdin_file_reader.interface) catch |err| switch (err) { + error.OutOfMemory => fatal("out of memory reading NDJSON", stderr_writer, .csv_error, .{}), + error.ReadFailed => fatal("line {d}: failed to read NDJSON", stderr_writer, .csv_error, .{line_num}), + } orelse break; + defer allocator.free(line); + + const trimmed = std.mem.trim(u8, line, " \t\r"); + if (trimmed.len == 0) { + line_num -= 1; + continue; + } + + var parsed_line = std.json.parseFromSlice(std.json.Value, allocator, trimmed, .{}) catch + fatal("line {d}: failed to parse NDJSON", stderr_writer, .csv_error, .{line_num}); + defer parsed_line.deinit(); + + const obj = switch (parsed_line.value) { + .object => |o| 
o, + else => fatal("line {d}: NDJSON element must be a JSON object", stderr_writer, .csv_error, .{line_num}), + }; + + if (cols_owned == null) { + var col_list: std.ArrayList([]u8) = .empty; + errdefer { + for (col_list.items) |col| allocator.free(col); + col_list.deinit(allocator); + } + var ki = obj.iterator(); + while (ki.next()) |entry| { + const owned_key = allocator.dupe(u8, entry.key_ptr.*) catch + fatal("out of memory building column list", stderr_writer, .csv_error, .{}); + col_list.append(allocator, owned_key) catch + fatal("out of memory building column list", stderr_writer, .csv_error, .{}); + } + if (col_list.items.len == 0) + fatal("line 1: first NDJSON object has no keys", stderr_writer, .csv_error, .{}); + cols_owned = col_list.toOwnedSlice(allocator) catch + fatal("out of memory", stderr_writer, .csv_error, .{}); + } + row_count += 1; + } + + if (cols_owned == null) fatal("empty NDJSON input", stderr_writer, .csv_error, .{}); + + const cols = cols_owned.?; + var count_buf: [32]u8 = undefined; + const count_str = fmtThousands(&count_buf, row_count); + stdout_writer.print("OK: {s} rows, {d} columns (", .{ count_str, cols.len }) catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + for (cols, 0..) 
|col, i| { + if (i > 0) stdout_writer.writeAll(", ") catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + stdout_writer.print("{s} TEXT", .{col}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + } + stdout_writer.writeAll(")\n") catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + }, + .xml => { + var stdin_buf: [4096]u8 = undefined; + var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + + const summary = xml_mod.summarizeXml(allocator, &stdin_file_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); + defer { + for (summary.col_names) |name| allocator.free(name); + allocator.free(summary.col_names); + } + + var count_buf: [32]u8 = undefined; + const count_str = fmtThousands(&count_buf, summary.row_count); + stdout_writer.print("OK: {s} rows, {d} columns (", .{ count_str, summary.col_names.len }) catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + for (summary.col_names, 0..) |name, i| { + if (i > 0) stdout_writer.writeAll(", ") catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + stdout_writer.print("{s} TEXT", .{name}) catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + } + stdout_writer.writeAll(")\n") catch |err| { + std.log.err("failed to write output: {}", .{err}); + std.process.exit(@intFromEnum(ExitCode.usage)); + }; + }, + } +} From e07fd9e7a5fbb8c367ab98f7955a6f2c5dabab99 Mon Sep 17 00:00:00 2001 From: "Victor M. 
Varela" Date: Fri, 8 May 2026 14:37:29 +0200 Subject: [PATCH 6/6] =?UTF-8?q?fix:=20address=20code=20review=20=E2=80=94?= =?UTF-8?q?=20consolidate=20ExitCode,=20remove=20dead=20code,=20guard=20XM?= =?UTF-8?q?L=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/args.zig | 20 ++++++++++-- src/format.zig | 6 +++- src/json.zig | 50 +++++++++++++++--------------- src/loader.zig | 60 +++++++++++------------------------- src/main.zig | 13 +------- src/modes/columns.zig | 7 +---- src/modes/sample.zig | 7 +---- src/modes/validate.zig | 7 +---- src/sqlite.zig | 70 ++++++++++++++++++++---------------------- src/xml.zig | 70 ++++++++++++++++++++---------------------- 10 files changed, 136 insertions(+), 174 deletions(-) diff --git a/src/args.zig b/src/args.zig index c8e2c47..9238e60 100644 --- a/src/args.zig +++ b/src/args.zig @@ -6,6 +6,18 @@ const format = @import("format.zig"); const InputFormat = format.InputFormat; const OutputFormat = format.OutputFormat; +/// Structured exit codes for scripting. 
+/// 0 = success +/// 1 = usage error (missing query, bad flag) +/// 2 = CSV/parse error +/// 3 = SQL error +pub const ExitCode = enum(u8) { + success = 0, + usage = 1, + csv_error = 2, + sql_error = 3, +}; + pub const SqlPipeError = error{ MissingQuery, InvalidDelimiter, @@ -387,9 +399,11 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { if (silent and verbose) return error.SilentVerboseConflict; - // --xml-root and --xml-row must be valid XML element names - if (!isValidXmlName(xml_root) or !isValidXmlName(xml_row)) - return error.InvalidXmlName; + // --xml-root and --xml-row must be valid XML element names (only validated in XML mode) + if (input_format == .xml or output_format == .xml) { + if (!isValidXmlName(xml_root) or !isValidXmlName(xml_row)) + return error.InvalidXmlName; + } // --columns mode: list headers and exit if (list_columns) diff --git a/src/format.zig b/src/format.zig index fdbed0a..448b1b8 100644 --- a/src/format.zig +++ b/src/format.zig @@ -104,6 +104,8 @@ pub const OutputWriter = struct { /// Allocated in begin(); freed in deinit(). col_names: []const [*:0]const u8, col_count: c_int, + /// True when col_names was heap-allocated in begin(); false when begin() was never called. + col_names_allocated: bool, /// Create a new OutputWriter. Call begin() before the first writeRow(). pub fn init(format: OutputFormat, opts: WriteOpts) OutputWriter { @@ -113,13 +115,14 @@ pub const OutputWriter = struct { .first_row = true, .col_names = &.{}, .col_count = 0, + .col_names_allocated = false, }; } /// Release any memory allocated during begin(). /// Safe to call even when begin() was never called. 
pub fn deinit(self: *OutputWriter, allocator: std.mem.Allocator) void { - if (self.col_names.len > 0) { + if (self.col_names_allocated) { allocator.free(self.col_names); } self.* = undefined; @@ -150,6 +153,7 @@ pub const OutputWriter = struct { names[@intCast(i)] = c.sqlite3_column_name(stmt, i); } self.col_names = names; + self.col_names_allocated = true; }, .csv, .tsv => { if (self.opts.header and col_count > 0) diff --git a/src/json.zig b/src/json.zig index 77e9311..0732ea2 100644 --- a/src/json.zig +++ b/src/json.zig @@ -29,9 +29,7 @@ const prepareInsertStmt = sqlite_helpers.prepareInsertStmt; const beginTransaction = sqlite_helpers.beginTransaction; const commitTransaction = sqlite_helpers.commitTransaction; const fatal = sqlite_helpers.fatal; -const exit_usage = sqlite_helpers.exit_usage; -const exit_parse = sqlite_helpers.exit_parse; -const exit_sql = sqlite_helpers.exit_sql; +const ExitCode = sqlite_helpers.ExitCode; const sqlite_static = sqlite_helpers.sqlite_static; // ─── Shared helpers ─────────────────────────────────── @@ -189,28 +187,28 @@ pub fn loadJsonArray( while (true) { const byte = reader.takeByte() catch |err| switch (err) { error.EndOfStream => break, - error.ReadFailed => fatal("failed to read JSON input", stderr_writer, exit_parse, .{}), + error.ReadFailed => fatal("failed to read JSON input", stderr_writer, .csv_error, .{}), }; - buf.append(allocator, byte) catch fatal("out of memory reading JSON input", stderr_writer, exit_parse, .{}); + buf.append(allocator, byte) catch fatal("out of memory reading JSON input", stderr_writer, .csv_error, .{}); } - if (buf.items.len == 0) fatal("empty input", stderr_writer, exit_parse, .{}); + if (buf.items.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); var parsed = std.json.parseFromSlice(std.json.Value, allocator, buf.items, .{}) catch - fatal("failed to parse JSON input", stderr_writer, exit_parse, .{}); + fatal("failed to parse JSON input", stderr_writer, .csv_error, .{}); defer 
parsed.deinit(); const array = switch (parsed.value) { .array => |a| a, - else => fatal("JSON input must be an array of objects", stderr_writer, exit_parse, .{}), + else => fatal("JSON input must be an array of objects", stderr_writer, .csv_error, .{}), }; - if (array.items.len == 0) fatal("empty JSON array: cannot determine column names", stderr_writer, exit_parse, .{}); + if (array.items.len == 0) fatal("empty JSON array: cannot determine column names", stderr_writer, .csv_error, .{}); // Extract column names from the first object's keys (insertion order) const first_obj = switch (array.items[0]) { .object => |o| o, - else => fatal("JSON array elements must be objects", stderr_writer, exit_parse, .{}), + else => fatal("JSON array elements must be objects", stderr_writer, .csv_error, .{}), }; var cols: std.ArrayList([]const u8) = .empty; @@ -218,9 +216,9 @@ pub fn loadJsonArray( var key_iter = first_obj.iterator(); while (key_iter.next()) |entry| { cols.append(allocator, entry.key_ptr.*) catch - fatal("out of memory building column list", stderr_writer, exit_parse, .{}); + fatal("out of memory building column list", stderr_writer, .csv_error, .{}); } - if (cols.items.len == 0) fatal("first JSON object has no keys", stderr_writer, exit_parse, .{}); + if (cols.items.len == 0) fatal("first JSON object has no keys", stderr_writer, .csv_error, .{}); // Create all-TEXT table (column names are owned by parsed arena — valid until parsed.deinit()) createAllTextTable(allocator, db, cols.items, stderr_writer); @@ -235,15 +233,15 @@ pub fn loadJsonArray( for (array.items) |item| { const obj = switch (item) { .object => |o| o, - else => fatal("JSON array element is not an object", stderr_writer, exit_parse, .{}), + else => fatal("JSON array element is not an object", stderr_writer, .csv_error, .{}), }; rows_inserted += 1; if (max_rows) |limit| { if (rows_inserted > limit) - fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, exit_usage, .{limit}); + fatal("input 
exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } insertRowFromJson(allocator, stmt, cols.items, obj) catch - fatal("{s}", stderr_writer, exit_sql, .{std.mem.span(c.sqlite3_errmsg(db))}); + fatal("{s}", stderr_writer, .sql_error, .{std.mem.span(c.sqlite3_errmsg(db))}); } commitTransaction(db, stderr_writer); @@ -283,8 +281,8 @@ pub fn loadNdjsonInput( while (true) { line_num += 1; const line = readLine(allocator, reader) catch |err| switch (err) { - error.OutOfMemory => fatal("out of memory reading NDJSON", stderr_writer, exit_parse, .{}), - error.ReadFailed => fatal("line {d}: failed to read NDJSON input", stderr_writer, exit_parse, .{line_num}), + error.OutOfMemory => fatal("out of memory reading NDJSON", stderr_writer, .csv_error, .{}), + error.ReadFailed => fatal("line {d}: failed to read NDJSON input", stderr_writer, .csv_error, .{line_num}), } orelse break; defer allocator.free(line); @@ -295,12 +293,12 @@ pub fn loadNdjsonInput( } var parsed_line = std.json.parseFromSlice(std.json.Value, allocator, trimmed, .{}) catch - fatal("line {d}: failed to parse NDJSON", stderr_writer, exit_parse, .{line_num}); + fatal("line {d}: failed to parse NDJSON", stderr_writer, .csv_error, .{line_num}); defer parsed_line.deinit(); const obj = switch (parsed_line.value) { .object => |o| o, - else => fatal("line {d}: NDJSON element must be a JSON object", stderr_writer, exit_parse, .{line_num}), + else => fatal("line {d}: NDJSON element must be a JSON object", stderr_writer, .csv_error, .{line_num}), }; if (cols_owned == null) { @@ -313,15 +311,15 @@ pub fn loadNdjsonInput( var ki = obj.iterator(); while (ki.next()) |entry| { const owned_key = allocator.dupe(u8, entry.key_ptr.*) catch - fatal("out of memory building column list", stderr_writer, exit_parse, .{}); + fatal("out of memory building column list", stderr_writer, .csv_error, .{}); col_list.append(allocator, owned_key) catch - fatal("out of memory building column list", stderr_writer, exit_parse, 
.{}); + fatal("out of memory building column list", stderr_writer, .csv_error, .{}); } if (col_list.items.len == 0) - fatal("line 1: first NDJSON object has no keys", stderr_writer, exit_parse, .{}); + fatal("line 1: first NDJSON object has no keys", stderr_writer, .csv_error, .{}); cols_owned = col_list.toOwnedSlice(allocator) catch - fatal("out of memory", stderr_writer, exit_parse, .{}); + fatal("out of memory", stderr_writer, .csv_error, .{}); const cols_const: []const []const u8 = @ptrCast(cols_owned.?); createAllTextTable(allocator, db, cols_const, stderr_writer); @@ -334,16 +332,16 @@ pub fn loadNdjsonInput( rows_inserted += 1; if (max_rows) |limit| { if (rows_inserted > limit) - fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, exit_usage, .{limit}); + fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } const cols_const: []const []const u8 = @ptrCast(cols_owned.?); insertRowFromJson(allocator, insert_stmt.?, cols_const, obj) catch - fatal("line {d}: {s}", stderr_writer, exit_sql, .{ line_num, std.mem.span(c.sqlite3_errmsg(db)) }); + fatal("line {d}: {s}", stderr_writer, .sql_error, .{ line_num, std.mem.span(c.sqlite3_errmsg(db)) }); } if (cols_owned == null) - fatal("empty NDJSON input", stderr_writer, exit_parse, .{}); + fatal("empty NDJSON input", stderr_writer, .csv_error, .{}); if (in_transaction) commitTransaction(db, stderr_writer); return rows_inserted; diff --git a/src/loader.zig b/src/loader.zig index 77d0906..55e2b8f 100644 --- a/src/loader.zig +++ b/src/loader.zig @@ -18,19 +18,6 @@ pub const inference_buffer_size: usize = 100; /// Number of rows between progress indicator updates. 
pub const progress_interval: usize = 10_000; -/// stripQuotes(raw) → []const u8 -/// Pre: raw is a valid UTF-8 slice -/// Post: if raw = '"' ++ inner ++ '"' => result = inner -/// otherwise => result = raw -/// Note: RFC 4180 quoted-field unescaping is handled by csv.zig; this function -/// provides an explicit, single-location implementation for any residual -/// direct string handling that bypasses the CSV parser. -fn stripQuotes(raw: []const u8) []const u8 { - if (raw.len >= 2 and raw[0] == '"' and raw[raw.len - 1] == '"') - return raw[1 .. raw.len - 1]; - return raw; -} - /// isInteger(val) → bool /// Pre: val is a valid UTF-8 slice /// Post: result = val matches [+-]?[0-9]+ (non-empty, only digits after optional sign) @@ -189,28 +176,17 @@ pub fn parseHeader( return cols.toOwnedSlice(allocator); } -/// insertRowTyped(stmt, db, row, types, param_count) → void +/// insertRowTyped(stmt, row, types, param_count) → void /// Pre: stmt is a prepared INSERT with param_count parameters, freshly reset /// row is a non-empty CSV record (slice of field slices) /// types.len = param_count (or shorter → remaining treated as TEXT) -/// db is the database that owns stmt (used for error reporting by caller) /// Post: each field is bound to its parameter using the appropriate SQLite bind -/// function according to types[j]: -/// INTEGER → sqlite3_bind_int64 (fallback: TEXT on parse failure) -/// REAL → sqlite3_bind_double (fallback: TEXT on parse failure) -/// TEXT → sqlite3_bind_text -/// empty / missing values → sqlite3_bind_null -/// sqlite3_step returned SQLITE_DONE -/// error.BindFailed / error.StepFailed on SQLite errors pub fn insertRowTyped( stmt: *c.sqlite3_stmt, - db: *c.sqlite3, row: [][]u8, types: []const ColumnType, param_count: c_int, ) args_mod.SqlPipeError!void { - _ = db; - _ = c.sqlite3_reset(stmt); _ = c.sqlite3_clear_bindings(stmt); @@ -332,15 +308,15 @@ pub fn loadCsvInput( var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, 
&stdin_file_reader.interface, parsed.delimiter); const header_record = csv_reader.nextRecord() catch |err| switch (err) { - error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, sqlite_mod.exit_parse, .{}), - else => fatal("row 1: failed to parse CSV header", stderr_writer, sqlite_mod.exit_parse, .{}), - } orelse fatal("empty input (no header row)", stderr_writer, sqlite_mod.exit_parse, .{}); + error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), + else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), + } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); defer csv_reader.freeRecord(header_record); const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) { - error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, sqlite_mod.exit_parse, .{}), - error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, sqlite_mod.exit_parse, .{}), - else => fatal("row 1: failed to parse header", stderr_writer, sqlite_mod.exit_parse, .{}), + error.EmptyColumnName => fatal("row 1: empty column name in header", stderr_writer, .csv_error, .{}), + error.NoColumns => fatal("row 1: no columns found in header", stderr_writer, .csv_error, .{}), + else => fatal("row 1: failed to parse header", stderr_writer, .csv_error, .{}), }; defer { for (cols) |col| allocator.free(col); @@ -363,13 +339,13 @@ pub fn loadCsvInput( error.UnterminatedQuotedField => fatal( "row {d}: unterminated quoted field", stderr_writer, - sqlite_mod.exit_parse, + .csv_error, .{csv_row_count + 1}, ), else => fatal( "row {d}: failed to parse CSV", stderr_writer, - sqlite_mod.exit_parse, + .csv_error, .{csv_row_count + 1}, ), } orelse break; @@ -379,13 +355,13 @@ pub fn loadCsvInput( continue; } row_buffer.append(allocator, rec) catch - fatal("out of memory while buffering rows", stderr_writer, 
sqlite_mod.exit_parse, .{}); + fatal("out of memory while buffering rows", stderr_writer, .csv_error, .{}); } break :blk inferTypes(allocator, row_buffer.items, num_cols) catch - fatal("out of memory during type inference", stderr_writer, sqlite_mod.exit_parse, .{}); + fatal("out of memory during type inference", stderr_writer, .csv_error, .{}); } else blk: { const t = allocator.alloc(ColumnType, num_cols) catch - fatal("out of memory", stderr_writer, sqlite_mod.exit_parse, .{}); + fatal("out of memory", stderr_writer, .csv_error, .{}); @memset(t, .TEXT); break :blk t; }; @@ -414,9 +390,9 @@ pub fn loadCsvInput( rows_inserted += 1; if (parsed.max_rows) |limit| { if (rows_inserted > limit) - fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, sqlite_mod.exit_usage, .{limit}); + fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } - insertRowTyped(stmt, db, row, types, @intCast(num_cols)) catch + insertRowTyped(stmt, row, types, @intCast(num_cols)) catch fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); if (is_tty and rows_inserted % progress_interval == 0) printProgress(stderr_writer, rows_inserted, parsed.max_rows); @@ -428,13 +404,13 @@ pub fn loadCsvInput( error.UnterminatedQuotedField => fatal( "row {d}: unterminated quoted field", stderr_writer, - sqlite_mod.exit_parse, + .csv_error, .{csv_row_count + 1}, ), else => fatal( "row {d}: failed to parse CSV", stderr_writer, - sqlite_mod.exit_parse, + .csv_error, .{csv_row_count + 1}, ), } orelse break; @@ -446,9 +422,9 @@ pub fn loadCsvInput( rows_inserted += 1; if (parsed.max_rows) |limit| { if (rows_inserted > limit) - fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, sqlite_mod.exit_usage, .{limit}); + fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } - insertRowTyped(stmt, db, record, types, @intCast(num_cols)) catch + insertRowTyped(stmt, record, types, @intCast(num_cols)) 
catch fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); if (is_tty and rows_inserted % progress_interval == 0) printProgress(stderr_writer, rows_inserted, parsed.max_rows); diff --git a/src/main.zig b/src/main.zig index 2bfd761..1c13623 100644 --- a/src/main.zig +++ b/src/main.zig @@ -16,6 +16,7 @@ const VERSION: []const u8 = build_options.version; const SqlPipeError = args_mod.SqlPipeError; const ParsedArgs = args_mod.ParsedArgs; +const ExitCode = args_mod.ExitCode; const parseArgs = args_mod.parseArgs; const printUsage = args_mod.printUsage; @@ -23,18 +24,6 @@ const loadCsvInput = loader.loadCsvInput; const fmtThousands = loader.fmtThousands; const progress_interval = loader.progress_interval; -/// Structured exit codes for scripting. -/// 0 = success -/// 1 = usage error (missing query, bad flag) -/// 2 = CSV parse error -/// 3 = SQL error (sqlite3 error) -const ExitCode = enum(u8) { - success = 0, - usage = 1, - csv_error = 2, - sql_error = 3, -}; - /// Supported input formats (canonical definition lives in format.zig). 
const InputFormat = format.InputFormat; diff --git a/src/modes/columns.zig b/src/modes/columns.zig index 2d3e199..88825f4 100644 --- a/src/modes/columns.zig +++ b/src/modes/columns.zig @@ -9,12 +9,7 @@ const inferTypes = loader.inferTypes; const parseHeader = loader.parseHeader; const inference_buffer_size = loader.inference_buffer_size; -const ExitCode = enum(u8) { - success = 0, - usage = 1, - csv_error = 2, - sql_error = 3, -}; +const ExitCode = args_mod.ExitCode; fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, f_args: anytype) noreturn { writer.print("error: " ++ fmt ++ "\n", f_args) catch |err| { diff --git a/src/modes/sample.zig b/src/modes/sample.zig index 084a270..61b46f0 100644 --- a/src/modes/sample.zig +++ b/src/modes/sample.zig @@ -10,12 +10,7 @@ const inferTypes = loader.inferTypes; const parseHeader = loader.parseHeader; const inference_buffer_size = loader.inference_buffer_size; -const ExitCode = enum(u8) { - success = 0, - usage = 1, - csv_error = 2, - sql_error = 3, -}; +const ExitCode = args_mod.ExitCode; fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, f_args: anytype) noreturn { writer.print("error: " ++ fmt ++ "\n", f_args) catch |err| { diff --git a/src/modes/validate.zig b/src/modes/validate.zig index 6926328..627346f 100644 --- a/src/modes/validate.zig +++ b/src/modes/validate.zig @@ -12,12 +12,7 @@ const parseHeader = loader.parseHeader; const fmtThousands = loader.fmtThousands; const inference_buffer_size = loader.inference_buffer_size; -const ExitCode = enum(u8) { - success = 0, - usage = 1, - csv_error = 2, - sql_error = 3, -}; +const ExitCode = args_mod.ExitCode; fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, f_args: anytype) noreturn { writer.print("error: " ++ fmt ++ "\n", f_args) catch |err| { diff --git a/src/sqlite.zig b/src/sqlite.zig index e1ef096..ddf320b 100644 --- a/src/sqlite.zig +++ b/src/sqlite.zig @@ -2,6 +2,9 @@ const std = @import("std"); 
const c = @import("c"); +const args_mod = @import("args.zig"); + +pub const ExitCode = args_mod.ExitCode; /// SQLITE_STATIC: caller manages string lifetime; SQLite must not free it. pub const sqlite_static: c.sqlite3_destructor_type = null; @@ -9,18 +12,13 @@ pub const sqlite_static: c.sqlite3_destructor_type = null; /// Inferred SQLite affinity for a CSV column. pub const ColumnType = enum { TEXT, INTEGER, REAL }; -// Shared exit codes (same values as in each format module) -pub const exit_usage: u8 = 1; -pub const exit_parse: u8 = 2; -pub const exit_sql: u8 = 3; - /// fatal(fmt, writer, code, args) → noreturn /// /// Writes an error message to writer and exits with the given code. -pub fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: u8, args: anytype) noreturn { +pub fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, args: anytype) noreturn { writer.print("error: " ++ fmt ++ "\n", args) catch |err| std.log.err("failed to write error: {}", .{err}); writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); - std.process.exit(code); + std.process.exit(@intFromEnum(code)); } /// Create table `t` with all-TEXT columns. Column names are double-quote–escaped @@ -34,24 +32,24 @@ pub fn createAllTextTable( var sql: std.ArrayList(u8) = .empty; defer sql.deinit(allocator); - sql.appendSlice(allocator, "CREATE TABLE t (") catch fatal("out of memory", writer, exit_parse, .{}); + sql.appendSlice(allocator, "CREATE TABLE t (") catch fatal("out of memory", writer, .csv_error, .{}); for (cols, 0..) 
|col, i| { - if (i > 0) sql.appendSlice(allocator, ", ") catch fatal("out of memory", writer, exit_parse, .{}); - sql.append(allocator, '"') catch fatal("out of memory", writer, exit_parse, .{}); + if (i > 0) sql.appendSlice(allocator, ", ") catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); for (col) |ch| { - if (ch == '"') sql.append(allocator, '"') catch fatal("out of memory", writer, exit_parse, .{}); - sql.append(allocator, ch) catch fatal("out of memory", writer, exit_parse, .{}); + if (ch == '"') sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, ch) catch fatal("out of memory", writer, .csv_error, .{}); } - sql.appendSlice(allocator, "\" TEXT") catch fatal("out of memory", writer, exit_parse, .{}); + sql.appendSlice(allocator, "\" TEXT") catch fatal("out of memory", writer, .csv_error, .{}); } - sql.appendSlice(allocator, ")") catch fatal("out of memory", writer, exit_parse, .{}); - sql.append(allocator, 0) catch fatal("out of memory", writer, exit_parse, .{}); + sql.appendSlice(allocator, ")") catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, 0) catch fatal("out of memory", writer, .csv_error, .{}); var errmsg: [*c]u8 = null; if (c.sqlite3_exec(db, sql.items.ptr, null, null, &errmsg) != c.SQLITE_OK) { const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db)); if (errmsg != null) c.sqlite3_free(errmsg); - fatal("{s}", writer, exit_sql, .{msg}); + fatal("{s}", writer, .sql_error, .{msg}); } } @@ -65,17 +63,17 @@ pub fn prepareInsertStmt( var sql: std.ArrayList(u8) = .empty; defer sql.deinit(allocator); - sql.appendSlice(allocator, "INSERT INTO t VALUES (") catch fatal("out of memory", writer, exit_parse, .{}); + sql.appendSlice(allocator, "INSERT INTO t VALUES (") catch fatal("out of memory", writer, .csv_error, .{}); for (0..n) |i| { - if (i > 0) 
sql.append(allocator, ',') catch fatal("out of memory", writer, exit_parse, .{}); - sql.append(allocator, '?') catch fatal("out of memory", writer, exit_parse, .{}); + if (i > 0) sql.append(allocator, ',') catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, '?') catch fatal("out of memory", writer, .csv_error, .{}); } - sql.appendSlice(allocator, ")") catch fatal("out of memory", writer, exit_parse, .{}); - sql.append(allocator, 0) catch fatal("out of memory", writer, exit_parse, .{}); + sql.appendSlice(allocator, ")") catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, 0) catch fatal("out of memory", writer, .csv_error, .{}); var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(db, sql.items.ptr, -1, &stmt, null) != c.SQLITE_OK) - fatal("{s}", writer, exit_sql, .{std.mem.span(c.sqlite3_errmsg(db))}); + fatal("{s}", writer, .sql_error, .{std.mem.span(c.sqlite3_errmsg(db))}); return stmt.?; } @@ -84,7 +82,7 @@ pub fn beginTransaction(db: *c.sqlite3, writer: *std.Io.Writer) void { if (c.sqlite3_exec(db, "BEGIN TRANSACTION", null, null, &errmsg) != c.SQLITE_OK) { const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db)); if (errmsg != null) c.sqlite3_free(errmsg); - fatal("{s}", writer, exit_sql, .{msg}); + fatal("{s}", writer, .sql_error, .{msg}); } } @@ -93,7 +91,7 @@ pub fn commitTransaction(db: *c.sqlite3, writer: *std.Io.Writer) void { if (c.sqlite3_exec(db, "COMMIT", null, null, &errmsg) != c.SQLITE_OK) { const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db)); if (errmsg != null) c.sqlite3_free(errmsg); - fatal("{s}", writer, exit_sql, .{msg}); + fatal("{s}", writer, .sql_error, .{msg}); } } @@ -103,7 +101,7 @@ pub fn commitTransaction(db: *c.sqlite3, writer: *std.Io.Writer) void { pub fn openDb(writer: *std.Io.Writer) *c.sqlite3 { var db: ?*c.sqlite3 = null; if (c.sqlite3_open(":memory:", &db) != c.SQLITE_OK) - fatal("failed to 
open in-memory database", writer, exit_sql, .{}); + fatal("failed to open in-memory database", writer, .sql_error, .{}); return db.?; } @@ -126,29 +124,29 @@ pub fn createTable( var sql: std.ArrayList(u8) = .empty; defer sql.deinit(allocator); - sql.appendSlice(allocator, "CREATE TABLE t (") catch fatal("out of memory", writer, exit_parse, .{}); + sql.appendSlice(allocator, "CREATE TABLE t (") catch fatal("out of memory", writer, .csv_error, .{}); for (cols, 0..) |col, i| { - if (i > 0) sql.appendSlice(allocator, ", ") catch fatal("out of memory", writer, exit_parse, .{}); - sql.append(allocator, '"') catch fatal("out of memory", writer, exit_parse, .{}); + if (i > 0) sql.appendSlice(allocator, ", ") catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); for (col) |ch| { - if (ch == '"') sql.append(allocator, '"') catch fatal("out of memory", writer, exit_parse, .{}); - sql.append(allocator, ch) catch fatal("out of memory", writer, exit_parse, .{}); + if (ch == '"') sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, ch) catch fatal("out of memory", writer, .csv_error, .{}); } - sql.append(allocator, '"') catch fatal("out of memory", writer, exit_parse, .{}); + sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); sql.appendSlice(allocator, switch (types[i]) { .INTEGER => " INTEGER", .REAL => " REAL", .TEXT => " TEXT", - }) catch fatal("out of memory", writer, exit_parse, .{}); + }) catch fatal("out of memory", writer, .csv_error, .{}); } - sql.appendSlice(allocator, ")") catch fatal("out of memory", writer, exit_parse, .{}); - sql.append(allocator, 0) catch fatal("out of memory", writer, exit_parse, .{}); + sql.appendSlice(allocator, ")") catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, 0) catch fatal("out of memory", writer, .csv_error, .{}); var errmsg: [*c]u8 = null; 
if (c.sqlite3_exec(db, sql.items.ptr, null, null, &errmsg) != c.SQLITE_OK) { const msg = if (errmsg != null) std.mem.span(errmsg) else std.mem.span(c.sqlite3_errmsg(db)); if (errmsg != null) c.sqlite3_free(errmsg); - fatal("{s}", writer, exit_sql, .{msg}); + fatal("{s}", writer, .sql_error, .{msg}); } } @@ -260,5 +258,5 @@ pub fn fatalSqlWithContext( }; printSqlErrorContext(allocator, db, errmsg, writer); writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); - std.process.exit(exit_sql); + std.process.exit(@intFromEnum(ExitCode.sql_error)); } diff --git a/src/xml.zig b/src/xml.zig index 95e99ad..525b5f6 100644 --- a/src/xml.zig +++ b/src/xml.zig @@ -37,9 +37,7 @@ const prepareInsertStmt = sqlite_helpers.prepareInsertStmt; const beginTransaction = sqlite_helpers.beginTransaction; const commitTransaction = sqlite_helpers.commitTransaction; const fatal = sqlite_helpers.fatal; -const exit_usage = sqlite_helpers.exit_usage; -const exit_parse = sqlite_helpers.exit_parse; -const exit_sql = sqlite_helpers.exit_sql; +const ExitCode = sqlite_helpers.ExitCode; const sqlite_static = sqlite_helpers.sqlite_static; // ─── XML escaping ───────────────────────────────────── @@ -279,7 +277,7 @@ pub const XmlParser = struct { err_writer.print("error: xml: line {d}, col {d}: ", .{ self.line, self.col }) catch |err| std.log.err("failed to write error: {}", .{err}); err_writer.print(fmt ++ "\n", args) catch |err| std.log.err("failed to write error: {}", .{err}); err_writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); - std.process.exit(exit_parse); + std.process.exit(@intFromEnum(ExitCode.csv_error)); } // ─── Skip helpers ──────────────────────────────────── @@ -701,11 +699,11 @@ pub fn getXmlColumnNames( while (true) { const byte = reader.takeByte() catch |err| switch (err) { error.EndOfStream => break, - error.ReadFailed => fatal("failed to read XML input", stderr_writer, exit_parse, .{}), + error.ReadFailed => fatal("failed to read XML input", 
stderr_writer, .csv_error, .{}), }; - buf.append(allocator, byte) catch fatal("out of memory reading XML", stderr_writer, exit_parse, .{}); + buf.append(allocator, byte) catch fatal("out of memory reading XML", stderr_writer, .csv_error, .{}); } - if (buf.items.len == 0) fatal("empty input", stderr_writer, exit_parse, .{}); + if (buf.items.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); var p = XmlParser.init(buf.items); p.skipPrologue(stderr_writer); @@ -715,12 +713,12 @@ pub fn getXmlColumnNames( } else p.readRootOpen(stderr_writer); const cols = p.nextRow(allocator, root_name, xml_row, stderr_writer) catch - fatal("out of memory parsing XML", stderr_writer, exit_parse, .{}); + fatal("out of memory parsing XML", stderr_writer, .csv_error, .{}); if (cols == null) { if (xml_row) |row_tag| - fatal("XML document has no '{s}' elements (check --xml-row value)", stderr_writer, exit_parse, .{row_tag}) + fatal("XML document has no '{s}' elements (check --xml-row value)", stderr_writer, .csv_error, .{row_tag}) else - fatal("XML document has no row elements", stderr_writer, exit_parse, .{}); + fatal("XML document has no row elements", stderr_writer, .csv_error, .{}); } defer { for (cols.?) |col| if (col.value) |v| allocator.free(v); @@ -730,10 +728,10 @@ pub fn getXmlColumnNames( var names: std.ArrayList([]const u8) = .empty; for (cols.?) |col| { const owned = allocator.dupe(u8, col.name) catch - fatal("out of memory", stderr_writer, exit_parse, .{}); - names.append(allocator, owned) catch fatal("out of memory", stderr_writer, exit_parse, .{}); + fatal("out of memory", stderr_writer, .csv_error, .{}); + names.append(allocator, owned) catch fatal("out of memory", stderr_writer, .csv_error, .{}); } - return names.toOwnedSlice(allocator) catch fatal("out of memory", stderr_writer, exit_parse, .{}); + return names.toOwnedSlice(allocator) catch fatal("out of memory", stderr_writer, .csv_error, .{}); } /// XmlSummary — result of summarizeXml. 
@@ -765,11 +763,11 @@ pub fn summarizeXml( while (true) { const byte = reader.takeByte() catch |err| switch (err) { error.EndOfStream => break, - error.ReadFailed => fatal("failed to read XML input", stderr_writer, exit_parse, .{}), + error.ReadFailed => fatal("failed to read XML input", stderr_writer, .csv_error, .{}), }; - buf.append(allocator, byte) catch fatal("out of memory reading XML", stderr_writer, exit_parse, .{}); + buf.append(allocator, byte) catch fatal("out of memory reading XML", stderr_writer, .csv_error, .{}); } - if (buf.items.len == 0) fatal("empty input", stderr_writer, exit_parse, .{}); + if (buf.items.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); var p = XmlParser.init(buf.items); p.skipPrologue(stderr_writer); @@ -785,7 +783,7 @@ pub fn summarizeXml( // Bounding function: rows remaining in the XML document (finite) while (true) { const cols = p.nextRow(allocator, root_name, xml_row, stderr_writer) catch - fatal("out of memory parsing XML", stderr_writer, exit_parse, .{}); + fatal("out of memory parsing XML", stderr_writer, .csv_error, .{}); if (cols == null) break; defer { for (cols.?) |col| if (col.value) |v| allocator.free(v); @@ -796,19 +794,19 @@ pub fn summarizeXml( var names: std.ArrayList([]const u8) = .empty; for (cols.?) 
|col| { const owned = allocator.dupe(u8, col.name) catch - fatal("out of memory", stderr_writer, exit_parse, .{}); - names.append(allocator, owned) catch fatal("out of memory", stderr_writer, exit_parse, .{}); + fatal("out of memory", stderr_writer, .csv_error, .{}); + names.append(allocator, owned) catch fatal("out of memory", stderr_writer, .csv_error, .{}); } col_names = names.toOwnedSlice(allocator) catch - fatal("out of memory", stderr_writer, exit_parse, .{}); + fatal("out of memory", stderr_writer, .csv_error, .{}); } } if (col_names == null) { if (xml_row) |row_tag| - fatal("XML document has no '{s}' elements (check --xml-row value)", stderr_writer, exit_parse, .{row_tag}) + fatal("XML document has no '{s}' elements (check --xml-row value)", stderr_writer, .csv_error, .{row_tag}) else - fatal("XML document has no row elements", stderr_writer, exit_parse, .{}); + fatal("XML document has no row elements", stderr_writer, .csv_error, .{}); } return .{ .row_count = row_count, .col_names = col_names.? 
}; } @@ -838,11 +836,11 @@ pub fn loadXmlInput( while (true) { const byte = reader.takeByte() catch |err| switch (err) { error.EndOfStream => break, - error.ReadFailed => fatal("failed to read XML input", stderr_writer, exit_parse, .{}), + error.ReadFailed => fatal("failed to read XML input", stderr_writer, .csv_error, .{}), }; - buf.append(allocator, byte) catch fatal("out of memory reading XML input", stderr_writer, exit_parse, .{}); + buf.append(allocator, byte) catch fatal("out of memory reading XML input", stderr_writer, .csv_error, .{}); } - if (buf.items.len == 0) fatal("empty input", stderr_writer, exit_parse, .{}); + if (buf.items.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); var p = XmlParser.init(buf.items); p.skipPrologue(stderr_writer); @@ -872,7 +870,7 @@ pub fn loadXmlInput( // Bounding function: row elements remaining in the document (finite) while (true) { const cols = p.nextRow(allocator, root_name, xml_row, stderr_writer) catch - fatal("out of memory parsing XML", stderr_writer, exit_parse, .{}); + fatal("out of memory parsing XML", stderr_writer, .csv_error, .{}); if (cols == null) break; defer { @@ -883,7 +881,7 @@ pub fn loadXmlInput( rows_inserted += 1; if (max_rows) |limit| { if (rows_inserted > limit) - fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, exit_usage, .{limit}); + fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } if (col_names == null) { @@ -891,12 +889,12 @@ pub fn loadXmlInput( var names: std.ArrayList([]const u8) = .empty; for (cols.?) 
|col| { const owned = allocator.dupe(u8, col.name) catch - fatal("out of memory", stderr_writer, exit_parse, .{}); - names.append(allocator, owned) catch fatal("out of memory", stderr_writer, exit_parse, .{}); + fatal("out of memory", stderr_writer, .csv_error, .{}); + names.append(allocator, owned) catch fatal("out of memory", stderr_writer, .csv_error, .{}); } if (names.items.len == 0) - fatal("first XML row element has no column children", stderr_writer, exit_parse, .{}); - col_names = names.toOwnedSlice(allocator) catch fatal("out of memory", stderr_writer, exit_parse, .{}); + fatal("first XML row element has no column children", stderr_writer, .csv_error, .{}); + col_names = names.toOwnedSlice(allocator) catch fatal("out of memory", stderr_writer, .csv_error, .{}); createAllTextTable(allocator, db, col_names.?, stderr_writer); beginTransaction(db, stderr_writer); @@ -922,22 +920,22 @@ pub fn loadXmlInput( }; if (value) |v| { if (c.sqlite3_bind_text(stmt, param_idx, v.ptr, @intCast(v.len), sqlite_static) != c.SQLITE_OK) - fatal("{s}", stderr_writer, exit_sql, .{std.mem.span(c.sqlite3_errmsg(db))}); + fatal("{s}", stderr_writer, .sql_error, .{std.mem.span(c.sqlite3_errmsg(db))}); } else { if (c.sqlite3_bind_null(stmt, param_idx) != c.SQLITE_OK) - fatal("{s}", stderr_writer, exit_sql, .{std.mem.span(c.sqlite3_errmsg(db))}); + fatal("{s}", stderr_writer, .sql_error, .{std.mem.span(c.sqlite3_errmsg(db))}); } } if (c.sqlite3_step(stmt) != c.SQLITE_DONE) - fatal("{s}", stderr_writer, exit_sql, .{std.mem.span(c.sqlite3_errmsg(db))}); + fatal("{s}", stderr_writer, .sql_error, .{std.mem.span(c.sqlite3_errmsg(db))}); } if (col_names == null) { if (xml_row) |row_tag| - fatal("XML document has no '{s}' elements (check --xml-row value)", stderr_writer, exit_parse, .{row_tag}) + fatal("XML document has no '{s}' elements (check --xml-row value)", stderr_writer, .csv_error, .{row_tag}) else - fatal("XML document has no row elements", stderr_writer, exit_parse, .{}); + 
fatal("XML document has no row elements", stderr_writer, .csv_error, .{}); } if (in_transaction) commitTransaction(db, stderr_writer); return rows_inserted;