Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions src/Processors/Formats/Impl/Parquet/Reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1334,11 +1334,20 @@ void Reader::decodePrimitiveColumn(ColumnChunk & column, const PrimitiveColumnIn
/// use_filter_in_decoder path reads ALL pages sequentially, so it would crash trying to access
/// those reset handles. Only use this optimization when reading the whole column chunk
/// sequentially (no offset index, i.e. data_pages is empty).
///
/// Also disabled for nullable columns (need_null_map): the filter-in-decoder path processes
/// ALL rows (passing + non-passing) through processDefLevelsForInnermostColumn, so the
/// null_map ends up with entries for all rows rather than just filtered rows. Additionally,
/// the decoder applies the filter at consecutive encoded-value indices, but with nulls the
/// encoded values don't correspond 1:1 to rows (null rows have no encoded values), causing
/// the filter to be applied at wrong positions. The standard row-range iteration path
/// correctly handles both issues by only reading rows in passing filter ranges.
const bool use_filter_in_decoder = (column_info.levels.back().rep == 0) &&
!row_subgroup.filter.filter.empty() &&
column.page.initialized &&
!column.page.is_dictionary_encoded &&
column.data_pages.empty();
column.data_pages.empty() &&
!column.need_null_map;
const size_t subgroup_end_row_idx = row_subgroup.start_row_idx + row_subgroup.filter.rows_total;

if (use_filter_in_decoder)
Expand Down Expand Up @@ -1417,7 +1426,9 @@ void Reader::decodePrimitiveColumn(ColumnChunk & column, const PrimitiveColumnIn
if (subchunk.null_map && !column_info.output_nullable && !options.format.null_as_default)
{
const auto & null_map = assert_cast<const ColumnUInt8 &>(*subchunk.null_map).getData();
if (memchr(null_map.data(), 0, null_map.size()) != nullptr)
/// null_map uses standard ClickHouse convention: 1 = NULL, 0 = NOT NULL.
/// Check if any values are null — those can't be inserted into a non-Nullable column.
if (memchr(null_map.data(), 1, null_map.size()) != nullptr)
throw Exception(ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN, "Cannot convert NULL value to non-Nullable type for column {}", column_info.name);
subchunk.null_map = nullptr;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
--- null_as_default=1 non-nullable ---
0
3
6
--- nullable output ---
0 \N
3 \N
6 \N
--- non-nullable null_as_default=0 should error ---
CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN
--- non-nullable null_as_default=0 no nulls in file ---
1 1
2 2
4 4
76 changes: 76 additions & 0 deletions tests/queries/0_stateless/04099_parquet_nullable_null_check_fix.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env bash
# Tags: no-fasttest

# Regression test for a LOGICAL_ERROR crash in the Parquet V3 native reader:
# "Unexpected number of rows in column subchunk 0 1"
#
# The bug was an inverted memchr check in Reader.cpp that searched for byte 0
# (non-null marker) instead of byte 1 (null marker) in the null_map. When ALL
# values in a filtered subchunk were NULL and the output column was non-nullable
# with null_as_default disabled, the check incorrectly cleared the null_map
# instead of throwing CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN. This left the
# column with 0 rows while rows_pass was 1, causing the assertion failure.
#
# Requires V3 native reader (input_format_parquet_use_native_reader_v3 = 1)
# because the V3 reader applies filter pushdown during reading. The legacy
# Arrow-based reader reads ALL rows first then casts, giving different behavior.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

DATA_FILE="${CLICKHOUSE_TEST_UNIQUE_NAME}.parquet"
DATA_FILE_NO_NULLS="${CLICKHOUSE_TEST_UNIQUE_NAME}_no_nulls.parquet"

# Remove the Parquet files created in the server's user_files directory so the
# test leaves no artifacts behind, even if a query in the middle fails.
# NOTE(review): assumes shell_config.sh exports CLICKHOUSE_USER_FILES (the
# directory that relative paths in the file() table function resolve to) —
# confirm against the current shell_config.sh.
cleanup()
{
    rm -f "${CLICKHOUSE_USER_FILES:?}/${DATA_FILE}" "${CLICKHOUSE_USER_FILES:?}/${DATA_FILE_NO_NULLS}"
}
trap cleanup EXIT

# Create a Parquet file with a Nullable(String) column containing NULLs.
# Every row where id % 3 == 0 has val = NULL.
$CLICKHOUSE_CLIENT -q "
INSERT INTO FUNCTION file('$DATA_FILE', Parquet)
SELECT number AS id, if(number % 3 = 0, NULL, toString(number)) AS val
FROM numbers(20)
SETTINGS output_format_parquet_use_custom_encoder = 0
"

# Create a second Parquet file with a Nullable(String) column but NO NULLs.
$CLICKHOUSE_CLIENT -q "
INSERT INTO FUNCTION file('$DATA_FILE_NO_NULLS', Parquet)
SELECT number AS id, toString(number) AS val
FROM numbers(20)
SETTINGS output_format_parquet_use_custom_encoder = 0
"

# 1) Reading with null_as_default=1 (default) — NULLs become empty strings.
echo "--- null_as_default=1 non-nullable ---"
$CLICKHOUSE_CLIENT -q "
SELECT id, val FROM file('$DATA_FILE', Parquet, 'id UInt64, val String')
WHERE id IN (0, 3, 6) ORDER BY id
SETTINGS input_format_null_as_default = 1, input_format_parquet_use_native_reader_v3 = 1
"

# 2) Reading nullable output — NULLs preserved.
echo "--- nullable output ---"
$CLICKHOUSE_CLIENT -q "
SELECT id, val FROM file('$DATA_FILE', Parquet, 'id UInt64, val Nullable(String)')
WHERE id IN (0, 3, 6) ORDER BY id
SETTINGS input_format_parquet_use_native_reader_v3 = 1
"

# 3) Non-nullable with null_as_default=0 and a filter that hits only NULL rows:
#    should throw CANNOT_INSERT_NULL, NOT crash with LOGICAL_ERROR.
#    Use head -1 because CI's --send_logs_level=warning can cause the error code
#    to appear in both a server log line and the exception message.
echo "--- non-nullable null_as_default=0 should error ---"
$CLICKHOUSE_CLIENT -q "
SELECT id, val FROM file('$DATA_FILE', Parquet, 'id UInt64, val String')
WHERE id = 0
SETTINGS input_format_null_as_default = 0, input_format_parquet_use_native_reader_v3 = 1
" 2>&1 | grep -o 'CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN' | head -1

# 4) Non-nullable with null_as_default=0 reading from a file with NO nulls — should succeed.
#    (Tests that the fix doesn't break the non-null case.)
echo "--- non-nullable null_as_default=0 no nulls in file ---"
$CLICKHOUSE_CLIENT -q "
SELECT id, val FROM file('$DATA_FILE_NO_NULLS', Parquet, 'id UInt64, val String')
WHERE id IN (1, 2, 4) ORDER BY id
SETTINGS input_format_null_as_default = 0, input_format_parquet_use_native_reader_v3 = 1
"
Loading