diff --git a/src/Processors/Formats/Impl/Parquet/Reader.cpp b/src/Processors/Formats/Impl/Parquet/Reader.cpp index b0ec0df2e7c1..fca3403fc839 100644 --- a/src/Processors/Formats/Impl/Parquet/Reader.cpp +++ b/src/Processors/Formats/Impl/Parquet/Reader.cpp @@ -1334,11 +1334,20 @@ void Reader::decodePrimitiveColumn(ColumnChunk & column, const PrimitiveColumnIn /// use_filter_in_decoder path reads ALL pages sequentially, so it would crash trying to access /// those reset handles. Only use this optimization when reading the whole column chunk /// sequentially (no offset index, i.e. data_pages is empty). + /// + /// Also disabled for nullable columns (need_null_map): the filter-in-decoder path processes + /// ALL rows (passing + non-passing) through processDefLevelsForInnermostColumn, so the + /// null_map ends up with entries for all rows rather than just filtered rows. Additionally, + /// the decoder applies the filter at consecutive encoded-value indices, but with nulls the + /// encoded values don't correspond 1:1 to rows (null rows have no encoded values), causing + /// the filter to be applied at wrong positions. The standard row-range iteration path + /// correctly handles both issues by only reading rows in passing filter ranges. const bool use_filter_in_decoder = (column_info.levels.back().rep == 0) && !row_subgroup.filter.filter.empty() && column.page.initialized && !column.page.is_dictionary_encoded && - column.data_pages.empty(); + column.data_pages.empty() && + !column.need_null_map; const size_t subgroup_end_row_idx = row_subgroup.start_row_idx + row_subgroup.filter.rows_total; if (use_filter_in_decoder) @@ -1417,7 +1426,9 @@ void Reader::decodePrimitiveColumn(ColumnChunk & column, const PrimitiveColumnIn if (subchunk.null_map && !column_info.output_nullable && !options.format.null_as_default) { const auto & null_map = assert_cast(*subchunk.null_map).getData(); - if (memchr(null_map.data(), 0, null_map.size()) != nullptr) + /// null_map uses standard ClickHouse convention: 1 = NULL, 0 = NOT NULL. + /// Check if any values are null — those can't be inserted into a non-Nullable column. + if (memchr(null_map.data(), 1, null_map.size()) != nullptr) throw Exception(ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN, "Cannot convert NULL value to non-Nullable type for column {}", column_info.name); subchunk.null_map = nullptr; } diff --git a/tests/queries/0_stateless/04099_parquet_nullable_null_check_fix.reference b/tests/queries/0_stateless/04099_parquet_nullable_null_check_fix.reference new file mode 100644 index 000000000000..1aa22c26ee05 --- /dev/null +++ b/tests/queries/0_stateless/04099_parquet_nullable_null_check_fix.reference @@ -0,0 +1,14 @@ +--- null_as_default=1 non-nullable --- +0 +3 +6 +--- nullable output --- +0 \N +3 \N +6 \N +--- non-nullable null_as_default=0 should error --- +CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN +--- non-nullable null_as_default=0 no nulls in file --- +1 1 +2 2 +4 4 diff --git a/tests/queries/0_stateless/04099_parquet_nullable_null_check_fix.sh b/tests/queries/0_stateless/04099_parquet_nullable_null_check_fix.sh new file mode 100755 index 000000000000..166dd9caee00 --- /dev/null +++ b/tests/queries/0_stateless/04099_parquet_nullable_null_check_fix.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +# Regression test for a LOGICAL_ERROR crash in the Parquet V3 native reader: +# "Unexpected number of rows in column subchunk 0 1" +# +# The bug was an inverted memchr check in Reader.cpp that searched for byte 0 +# (non-null marker) instead of byte 1 (null marker) in the null_map. When ALL +# values in a filtered subchunk were NULL and the output column was non-nullable +# with null_as_default disabled, the check incorrectly cleared the null_map +# instead of throwing CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN. This left the +# column with 0 rows while rows_pass was 1, causing the assertion failure. +# +# Requires V3 native reader (input_format_parquet_use_native_reader_v3 = 1) +# because the V3 reader applies filter pushdown during reading. The legacy +# Arrow-based reader reads ALL rows first then casts, giving different behavior. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DATA_FILE="${CLICKHOUSE_TEST_UNIQUE_NAME}.parquet" +DATA_FILE_NO_NULLS="${CLICKHOUSE_TEST_UNIQUE_NAME}_no_nulls.parquet" + +# Create a Parquet file with a Nullable(String) column containing NULLs. +# Every row where id % 3 == 0 has val = NULL. +$CLICKHOUSE_CLIENT -q " + INSERT INTO FUNCTION file('$DATA_FILE', Parquet) + SELECT number AS id, if(number % 3 = 0, NULL, toString(number)) AS val + FROM numbers(20) + SETTINGS output_format_parquet_use_custom_encoder = 0 +" + +# Create a second Parquet file with a Nullable(String) column but NO NULLs. +$CLICKHOUSE_CLIENT -q " + INSERT INTO FUNCTION file('$DATA_FILE_NO_NULLS', Parquet) + SELECT number AS id, toString(number) AS val + FROM numbers(20) + SETTINGS output_format_parquet_use_custom_encoder = 0 +" + +# 1) Reading with null_as_default=1 (default) — NULLs become empty strings. +echo "--- null_as_default=1 non-nullable ---" +$CLICKHOUSE_CLIENT -q " + SELECT id, val FROM file('$DATA_FILE', Parquet, 'id UInt64, val String') + WHERE id IN (0, 3, 6) ORDER BY id + SETTINGS input_format_null_as_default = 1, input_format_parquet_use_native_reader_v3 = 1 +" + +# 2) Reading nullable output — NULLs preserved. +echo "--- nullable output ---" +$CLICKHOUSE_CLIENT -q " + SELECT id, val FROM file('$DATA_FILE', Parquet, 'id UInt64, val Nullable(String)') + WHERE id IN (0, 3, 6) ORDER BY id + SETTINGS input_format_parquet_use_native_reader_v3 = 1 +" + +# 3) Non-nullable with null_as_default=0 and a filter that hits only NULL rows: +# should throw CANNOT_INSERT_NULL, NOT crash with LOGICAL_ERROR. +# Use head -1 because CI's --send_logs_level=warning can cause the error code +# to appear in both a server log line and the exception message. +echo "--- non-nullable null_as_default=0 should error ---" +$CLICKHOUSE_CLIENT -q " + SELECT id, val FROM file('$DATA_FILE', Parquet, 'id UInt64, val String') + WHERE id = 0 + SETTINGS input_format_null_as_default = 0, input_format_parquet_use_native_reader_v3 = 1 +" 2>&1 | grep -o 'CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN' | head -1 + +# 4) Non-nullable with null_as_default=0 reading from a file with NO nulls — should succeed. +# (Tests that the fix doesn't break the non-null case.) +echo "--- non-nullable null_as_default=0 no nulls in file ---" +$CLICKHOUSE_CLIENT -q " + SELECT id, val FROM file('$DATA_FILE_NO_NULLS', Parquet, 'id UInt64, val String') + WHERE id IN (1, 2, 4) ORDER BY id + SETTINGS input_format_null_as_default = 0, input_format_parquet_use_native_reader_v3 = 1 +"