From 926d21eeebfc1ba914abcc58fb64f5894edb4a5e Mon Sep 17 00:00:00 2001 From: "Victor M. Varela" Date: Fri, 8 May 2026 08:51:09 +0200 Subject: [PATCH 1/2] fix: make --xml-root and --xml-row work for XML input parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously these flags only controlled XML output element names. The parser always used the actual document root and accepted any element as a row, making it impossible to query nested structures like RSS feeds (rss > channel > item). Changes: - Add XmlParser.skipElementBody() to skip a complete element tree - Add XmlParser.navigateToRoot() to descend into a named container - Add row_tag_filter to XmlParser.nextRow() to skip non-matching elements - Update loadXmlInput, getXmlColumnNames, summarizeXml to accept optional xml_root and xml_row parameters (null = legacy behaviour) - Thread xml_root_input / xml_row_input from CLI args through ParsedArgs, ColumnsArgs, and ValidateArgs to all call sites - Fix test_xml_no_rows to use a 'results' root element (consistent with the default root name) - Add integration tests 114 and 115 for nested navigation Closes #139 --- build.zig | 31 +++++- src/main.zig | 30 +++++- src/xml.zig | 292 ++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 273 insertions(+), 80 deletions(-) diff --git a/build.zig b/build.zig index 1ac00a5..9cc3997 100644 --- a/build.zig +++ b/build.zig @@ -1066,9 +1066,11 @@ pub fn build(b: *std.Build) void { test_xml_validate.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_xml_validate.step); - // Integration test 104: --xml-root and --xml-row customize element names + // Integration test 104: --xml-root and --xml-row customize element names for output, + // and navigate nested XML for input const test_xml_custom_elements = b.addSystemCommand(&.{ "bash", "-c", + // Output: custom element names appear in the XML \\result=$(printf 'name,age\nAlice,30\n' \ \\ | ./zig-out/bin/sql-pipe -O xml --xml-root data --xml-row record 'SELECT * FROM 
t') \\echo "$result" | grep -q '' && echo "$result" | grep -q '' && echo "$result" | grep -q '' @@ -1108,7 +1110,7 @@ pub fn build(b: *std.Build) void { // Integration test 108: Root sin rows → error con "no row elements" const test_xml_no_rows = b.addSystemCommand(&.{ "bash", "-c", - \\msg=$(printf '' | ./zig-out/bin/sql-pipe -I xml 'SELECT 1' 2>&1; echo "EXIT:$?") + \\msg=$(printf '' | ./zig-out/bin/sql-pipe -I xml 'SELECT 1' 2>&1; echo "EXIT:$?") \\echo "$msg" | grep -q 'no row elements' && echo "$msg" | grep -qv 'EXIT:0' }); test_xml_no_rows.step.dependOn(b.getInstallStep()); @@ -1162,6 +1164,31 @@ pub fn build(b: *std.Build) void { test_xml_float_as_int.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_xml_float_as_int.step); + // Integration test 114: --xml-root navigates nested XML for input (RSS-like structure) + const test_xml_nested_navigation = b.addSystemCommand(&.{ + "bash", "-c", + // Feed with feed > channel > item structure; --xml-root channel --xml-row item + // selects only item elements from inside channel, skipping title etc. 
+ \\doc='<feed><channel><title>My FeedAlice30Bob25' + \\result=$(printf '%s' "$doc" \ + \\ | ./zig-out/bin/sql-pipe -I xml --xml-root channel --xml-row item \ + \\ 'SELECT name || ":" || age FROM t ORDER BY name') + \\[ "$result" = "$(printf 'Alice:30\nBob:25')" ] + }); + test_xml_nested_navigation.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_xml_nested_navigation.step); + + // Integration test 115: --xml-root / --xml-row with --validate counts only matching rows + const test_xml_nested_validate = b.addSystemCommand(&.{ + "bash", "-c", + \\doc='T12' + \\result=$(printf '%s' "$doc" \ + \\ | ./zig-out/bin/sql-pipe -I xml --xml-root channel --xml-row item --validate) + \\echo "$result" | grep -q 'OK: 2 rows' + }); + test_xml_nested_validate.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_xml_nested_validate.step); + // Unit tests for the RFC 4180 CSV parser (src/csv.zig) const unit_tests = b.addTest(.{ .root_module = b.createModule(.{ diff --git a/src/main.zig b/src/main.zig index 9f2e132..086d36e 100644 --- a/src/main.zig +++ b/src/main.zig @@ -105,6 +105,10 @@ const ParsedArgs = struct { xml_root: []const u8, /// Row element name for XML output (default: "row"). xml_row: []const u8, + /// Root element to navigate to for XML input; null = use actual document root. + xml_root_input: ?[]const u8, + /// Row tag filter for XML input; null = accept any direct child element as a row. + xml_row_input: ?[]const u8, }; /// Arguments for `--columns` mode. @@ -115,6 +119,10 @@ const ColumnsArgs = struct { verbose: bool, /// Input format (default: csv). input_format: InputFormat, + /// Root element to navigate to for XML input; null = use actual document root. + xml_root_input: ?[]const u8, + /// Row tag filter for XML input; null = accept any direct child element as a row. + xml_row_input: ?[]const u8, }; /// Arguments for `--validate` mode. @@ -125,6 +133,10 @@ const ValidateArgs = struct { type_inference: bool, /// Input format (default: csv). 
input_format: InputFormat, + /// Root element to navigate to for XML input; null = use actual document root. + xml_root_input: ?[]const u8, + /// Row tag filter for XML input; null = accept any direct child element as a row. + xml_row_input: ?[]const u8, }; /// Arguments for `--sample` mode. @@ -298,6 +310,8 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { var output: ?[]const u8 = null; var xml_root: []const u8 = "results"; var xml_row: []const u8 = "row"; + var xml_root_input: ?[]const u8 = null; + var xml_row_input: ?[]const u8 = null; var sample_mode = false; var sample_n: usize = 10; @@ -400,14 +414,18 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { i += 1; if (i >= args.len) return error.MissingXmlFlagValue; xml_root = args[i]; + xml_root_input = args[i]; } else if (std.mem.startsWith(u8, arg, "--xml-root=")) { xml_root = arg["--xml-root=".len..]; + xml_root_input = arg["--xml-root=".len..]; } else if (std.mem.eql(u8, arg, "--xml-row")) { i += 1; if (i >= args.len) return error.MissingXmlFlagValue; xml_row = args[i]; + xml_row_input = args[i]; } else if (std.mem.startsWith(u8, arg, "--xml-row=")) { xml_row = arg["--xml-row=".len..]; + xml_row_input = arg["--xml-row=".len..]; } else { if (query == null) query = arg; } @@ -471,6 +489,8 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { .delimiter = delimiter, .verbose = verbose, .input_format = input_format, + .xml_root_input = xml_root_input, + .xml_row_input = xml_row_input, } }; // --validate mode: parse CSV and print summary @@ -479,6 +499,8 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { .delimiter = delimiter, .type_inference = type_inference, .input_format = input_format, + .xml_root_input = xml_root_input, + .xml_row_input = xml_row_input, } }; // --sample mode: print schema + first n rows and exit @@ -503,6 +525,8 @@ fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { .output = output, .xml_root = 
xml_root, .xml_row = xml_row, + .xml_root_input = xml_root_input, + .xml_row_input = xml_row_input, } }; } @@ -1509,7 +1533,7 @@ fn runColumns( var stdin_buf: [4096]u8 = undefined; var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - const names = xml.getXmlColumnNames(allocator, &stdin_file_reader.interface, stderr_writer); + const names = xml.getXmlColumnNames(allocator, &stdin_file_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); defer { for (names) |name| allocator.free(name); allocator.free(names); @@ -1799,7 +1823,7 @@ fn runValidate( var stdin_buf: [4096]u8 = undefined; var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - const summary = xml.summarizeXml(allocator, &stdin_file_reader.interface, stderr_writer); + const summary = xml.summarizeXml(allocator, &stdin_file_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); defer { for (summary.col_names) |name| allocator.free(name); allocator.free(summary.col_names); @@ -2019,7 +2043,7 @@ fn run( .xml => blk: { var stdin_buf: [4096]u8 = undefined; var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - break :blk xml.loadXmlInput(allocator, &stdin_reader.interface, db, parsed.max_rows, stderr_writer); + break :blk xml.loadXmlInput(allocator, &stdin_reader.interface, db, parsed.xml_root_input, parsed.xml_row_input, parsed.max_rows, stderr_writer); }, }; diff --git a/src/xml.zig b/src/xml.zig index 399e2f2..e570d16 100644 --- a/src/xml.zig +++ b/src/xml.zig @@ -223,7 +223,7 @@ pub fn writeXmlFooter(writer: *std.Io.Writer, root_name: []const u8) !void { /// var p = XmlParser.init(data); /// p.skipPrologue(err_writer); /// const root = p.readRootOpen(err_writer); -/// while (try p.nextRow(allocator, root, err_writer)) |cols| { +/// while (try p.nextRow(allocator, root, null, err_writer)) |cols| { /// defer { for (cols) |col| { if (col.value) |v| allocator.free(v); } allocator.free(cols); 
} /// // use cols[i].name and cols[i].value /// } @@ -449,6 +449,59 @@ pub const XmlParser = struct { self.fatalAt("unexpected end of input: unclosed element '{s}'", err_writer, .{elem_name}); } + // ─── Element skip ──────────────────────────────────── + + /// Skip the body and closing tag of an element. + /// + /// Pre: positioned just after the element's opening tag '>' + /// Post: positioned just after the element's closing tag + /// properly handles nested elements, comments, CDATA, and PIs + fn skipElementBody(self: *XmlParser, tag: []const u8, err_writer: *std.Io.Writer) void { + // depth counts unclosed nested elements inside the one we are skipping + var depth: usize = 0; + // Loop invariant: depth = number of open nested elements not yet closed + // Bounding function: self.data.len - self.pos (finite input) + while (self.pos < self.data.len) { + if (self.peek().? != '<') { + self.advance(); + continue; + } + if (self.startsWith("