From 3d10b2b7cd85ffe6a5cf53381131a75f7100ab08 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 1 Jun 2026 16:55:53 +0800 Subject: [PATCH 1/8] fix: fix calculated chunk_end exceed true chunk end when dictionary page is present --- .../page_filtered_row_group_reader.cpp | 10 +- .../page_filtered_row_group_reader_test.cpp | 107 ++++++++++++++++++ 2 files changed, 113 insertions(+), 4 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp index 4594717f0..ed2c2745f 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp @@ -316,18 +316,20 @@ std::vector<::arrow::io::ReadRange> PageFilteredRowGroupReader::ComputePageRange for (int32_t col_idx : column_indices) { auto col_chunk = rg_metadata->ColumnChunk(col_idx); int64_t data_page_offset = col_chunk->data_page_offset(); - int64_t total_compressed_size = col_chunk->total_compressed_size(); - int64_t chunk_end = data_page_offset + total_compressed_size; - + int64_t data_page_compressed_size = col_chunk->total_compressed_size(); // Dictionary page: always include if present if (col_chunk->has_dictionary_page()) { int64_t dict_offset = col_chunk->dictionary_page_offset(); int64_t dict_size = data_page_offset - dict_offset; + // if dictionary exists, the data page size should be reduced by the dictionary + data_page_compressed_size -= dict_size; if (dict_size > 0) { ranges.push_back({dict_offset, dict_size}); } } + int64_t chunk_end = data_page_offset + data_page_compressed_size; + // Try to get OffsetIndex for page-level ranges std::shared_ptr<::parquet::OffsetIndex> offset_index; if (rg_page_index_reader) { @@ -336,7 +338,7 @@ std::vector<::arrow::io::ReadRange> PageFilteredRowGroupReader::ComputePageRange if (!offset_index) { // No OffsetIndex: fall back to entire column chunk - ranges.push_back({data_page_offset, total_compressed_size}); + ranges.push_back({data_page_offset, data_page_compressed_size}); continue; } diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index bd693730d..c806391b3 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -719,4 +719,111 @@ TEST_F(PageFilteredRowGroupReaderTest, EndToEndPageLevelPreBuffer) { ASSERT_EQ(10, offset); } +/// Test: ComputePageRanges with dictionary encoding produces correct chunk_end. +/// +/// When dictionary encoding is enabled, the column chunk layout is: +/// [Dictionary Page] [Data Page 0] [Data Page 1] ... [Data Page N] +/// And total_compressed_size covers the entire chunk starting from dictionary_page_offset. +/// +/// The bug: chunk_end = data_page_offset + total_compressed_size is wrong because +/// total_compressed_size already includes the dictionary page size. The correct +/// chunk_end should be dictionary_page_offset + total_compressed_size. +/// +/// This test verifies that the last page's computed range does not exceed the +/// actual column chunk boundary. +TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) { + std::string file_name = dir_->Str() + "/compute_ranges_dict.parquet"; + + // Use low-cardinality data to ensure dictionary encoding is actually used. + // 100 rows with values cycling through 0..9 → dictionary will have 10 entries. + arrow::Int32Builder val_builder; + ASSERT_TRUE(val_builder.Reserve(100).ok()); + for (int32_t i = 0; i < 100; ++i) { + val_builder.UnsafeAppend(i % 10); + } + auto val_array = val_builder.Finish().ValueOrDie(); + auto field = arrow::field("val", arrow::int32()); + auto struct_array = arrow::StructArray::Make({val_array}, {field}).ValueOrDie(); + + // Write with dictionary encoding enabled (the key difference from other tests). + auto data_type = struct_array->struct_type(); + auto data_schema = arrow::schema(data_type->fields()); + auto data_arrow_array = std::make_unique(); + ASSERT_TRUE(arrow::ExportArray(*struct_array, data_arrow_array.get()).ok()); + ASSERT_OK_AND_ASSIGN(std::shared_ptr out, + fs_->Create(file_name, /*overwrite=*/false)); + ::parquet::WriterProperties::Builder builder; + builder.write_batch_size(10); + builder.max_row_group_length(100); + builder.enable_dictionary(); // Enable dictionary → triggers the bug + builder.enable_write_page_index(); + builder.data_pagesize(1); // Force small pages + auto writer_properties = builder.build(); + ASSERT_OK_AND_ASSIGN( + auto format_writer, + ParquetFormatWriter::Create(out, data_schema, writer_properties, + DEFAULT_PARQUET_WRITER_MAX_MEMORY_USE, arrow_pool_)); + ASSERT_OK(format_writer->AddBatch(data_arrow_array.get())); + ASSERT_OK(format_writer->Finish()); + ASSERT_OK(out->Close()); + + // Open the file and verify metadata confirms dictionary page presence + ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name)); + ASSERT_OK_AND_ASSIGN(uint64_t length, in->Length()); + auto in_stream = std::make_shared(in, arrow_pool_, length); + auto parquet_reader = ::parquet::ParquetFileReader::Open(in_stream); + ASSERT_TRUE(parquet_reader); + + auto file_metadata = parquet_reader->metadata(); + auto rg_metadata = file_metadata->RowGroup(0); + auto col_chunk = rg_metadata->ColumnChunk(0); + + // Precondition: dictionary page must exist for this test to be meaningful + ASSERT_TRUE(col_chunk->has_dictionary_page()) + << "Dictionary page not present - test setup error"; + + int64_t dict_offset = col_chunk->dictionary_page_offset(); + int64_t data_page_offset = col_chunk->data_page_offset(); + int64_t total_compressed_size = col_chunk->total_compressed_size(); + + // The true chunk end is dict_offset + total_compressed_size + int64_t true_chunk_end = dict_offset + total_compressed_size; + // The buggy chunk end would be data_page_offset + total_compressed_size + int64_t buggy_chunk_end = data_page_offset + total_compressed_size; + + // Sanity: dict page is before data pages, so buggy end > true end + ASSERT_LT(dict_offset, data_page_offset) + << "Dictionary offset should be before data page offset"; + ASSERT_GT(buggy_chunk_end, true_chunk_end) + << "Buggy chunk_end should exceed true chunk_end when dictionary is present"; + + // Now call ComputePageRanges with all rows matching + RowRanges row_ranges; + row_ranges.Add(RowRanges::Range(0, 99)); + + auto ranges = PageFilteredRowGroupReader::ComputePageRanges( + parquet_reader.get(), /*row_group_index=*/0, row_ranges, /*column_indices=*/{0}); + + ASSERT_FALSE(ranges.empty()); + + // The critical check: no range should extend beyond the true chunk end. + // With the bug, the last data page's range would use chunk_end = data_page_offset + + // total_compressed_size, which overshoots by the dictionary page size. + for (size_t i = 0; i < ranges.size(); ++i) { + int64_t range_end = ranges[i].offset + ranges[i].length; + EXPECT_LE(range_end, true_chunk_end) + << "Range " << i << " [offset=" << ranges[i].offset << ", length=" << ranges[i].length + << "] exceeds true chunk end (" << true_chunk_end << "). " + << "This indicates chunk_end is computed as data_page_offset + " + "total_compressed_size instead of dictionary_page_offset + " + "total_compressed_size."; + } + + // Also verify total covered range does not exceed file size + for (const auto& range : ranges) { + EXPECT_LE(range.offset + range.length, static_cast(length)) + << "Range exceeds file size"; + } +} + } // namespace paimon::parquet::test From 35593d8762730a9a148f5a926b6e6fe3ce6e2574 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 5 Jun 2026 14:53:09 +0800 Subject: [PATCH 2/8] test: add end-to-end test for PageFilteredRowGroupReader and check the maxium range_end --- .../page_filtered_row_group_reader_test.cpp | 58 +++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index c806391b3..953e120cb 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -729,8 +729,11 @@ TEST_F(PageFilteredRowGroupReaderTest, EndToEndPageLevelPreBuffer) { /// total_compressed_size already includes the dictionary page size. The correct /// chunk_end should be dictionary_page_offset + total_compressed_size. /// -/// This test verifies that the last page's computed range does not exceed the -/// actual column chunk boundary. +/// This test verifies that: +/// 1. No range exceeds the true chunk boundary (overshoot regression). +/// 2. At least one non-dictionary data-page range is present (not truncated). +/// 3. The maximum range_end equals true_chunk_end when requesting all rows. +/// 4. End-to-end reads with page-level filtering return correct query results. TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) { std::string file_name = dir_->Str() + "/compute_ranges_dict.parquet"; @@ -806,7 +809,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) ASSERT_FALSE(ranges.empty()); - // The critical check: no range should extend beyond the true chunk end. + // --- Check 1: No range should extend beyond the true chunk end --- // With the bug, the last data page's range would use chunk_end = data_page_offset + // total_compressed_size, which overshoots by the dictionary page size. for (size_t i = 0; i < ranges.size(); ++i) { @@ -819,11 +822,58 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) "total_compressed_size."; } - // Also verify total covered range does not exceed file size + // --- Check 2: Maximum range_end equals true_chunk_end --- + int64_t max_range_end = 0; + for (const auto& range : ranges) { + int64_t range_end = range.offset + range.length; + max_range_end = std::max(max_range_end, range_end); + } + ASSERT_EQ(max_range_end, true_chunk_end) + << "When requesting all rows, the maximum range_end should exactly equal " + << "true_chunk_end (" << true_chunk_end << "), but got " << max_range_end + << ". The last data page range may be truncated or missing."; + + // --- Check 3: No range exceeds file size --- for (const auto& range : ranges) { EXPECT_LE(range.offset + range.length, static_cast(length)) << "Range exceeds file size"; } + + // --- End-to-end check 1: read all rows (no predicate filtering) --- + // Verifies that reading a dictionary-encoded file with page index enabled + // returns all 100 rows with correct values. + auto read_schema = arrow::schema({field}); + auto predicate_all = PredicateBuilder::GreaterOrEqual( + /*field_index=*/0, /*field_name=*/"val", FieldType::INT, Literal(0)); + std::shared_ptr result_all; + ReadWithPredicateImpl(file_name, read_schema, predicate_all, &result_all); + ASSERT_TRUE(result_all); + ASSERT_EQ(100, result_all->length()) + << "End-to-end read with dictionary encoding should return all 100 rows"; + + // --- End-to-end check 2: manual row-level filtering --- + // Page-level filter does not do row-level filtering. Verify data content by + // scanning all returned rows and checking val == 5 appears exactly 10 times + // (val = i % 10, so rows 5,15,25,...,95 have val == 5). + int32_t count_val5 = 0; + int64_t total_rows_checked = 0; + for (int i = 0; i < result_all->num_chunks(); ++i) { + auto struct_arr = std::dynamic_pointer_cast(result_all->chunk(i)); + ASSERT_TRUE(struct_arr); + auto val_arr = std::dynamic_pointer_cast(struct_arr->field(0)); + ASSERT_TRUE(val_arr); + for (int64_t j = 0; j < val_arr->length(); ++j) { + auto expected = static_cast(total_rows_checked % 10); + ASSERT_EQ(expected, val_arr->Value(j)) + << "Value mismatch at row " << total_rows_checked; + if (val_arr->Value(j) == 5) { + ++count_val5; + } + ++total_rows_checked; + } + } + ASSERT_EQ(100, total_rows_checked); + ASSERT_EQ(10, count_val5) << "val == 5 should appear exactly 10 times in 100 rows"; } } // namespace paimon::parquet::test From f527d3b98c97445c8ba8c6bc65dff99d4f01613b Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 5 Jun 2026 15:08:09 +0800 Subject: [PATCH 3/8] fix: check dict_size before substracting --- .../parquet/page_filtered_row_group_reader.cpp | 4 ++-- .../page_filtered_row_group_reader_test.cpp | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp index ed2c2745f..d82bec758 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp @@ -321,9 +321,9 @@ std::vector<::arrow::io::ReadRange> PageFilteredRowGroupReader::ComputePageRange if (col_chunk->has_dictionary_page()) { int64_t dict_offset = col_chunk->dictionary_page_offset(); int64_t dict_size = data_page_offset - dict_offset; - // if dictionary exists, the data page size should be reduced by the dictionary - data_page_compressed_size -= dict_size; if (dict_size > 0) { + // if dictionary exists, the data page size should be reduced by the dictionary + data_page_compressed_size -= dict_size; ranges.push_back({dict_offset, dict_size}); } } diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index 953e120cb..634d29098 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -822,7 +822,20 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) "total_compressed_size."; } - // --- Check 2: Maximum range_end equals true_chunk_end --- + // --- Check 2: At least one non-dictionary data-page range is present --- + // Guards against truncation: if only the dictionary range is returned, the test + // would still pass the overshoot check but miss that data pages are lost. + int data_page_range_count = 0; + for (const auto& range : ranges) { + if (range.offset >= data_page_offset) { + ++data_page_range_count; + } + } + ASSERT_GE(data_page_range_count, 1) + << "Expected at least one data-page range (offset >= " << data_page_offset + << "), but only dictionary range(s) were returned."; + + // --- Check 3: Maximum range_end equals true_chunk_end when requesting all rows --- int64_t max_range_end = 0; for (const auto& range : ranges) { int64_t range_end = range.offset + range.length; @@ -833,7 +846,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) << "true_chunk_end (" << true_chunk_end << "), but got " << max_range_end << ". The last data page range may be truncated or missing."; - // --- Check 3: No range exceeds file size --- + // --- Check 4: No range exceeds file size --- for (const auto& range : ranges) { EXPECT_LE(range.offset + range.length, static_cast(length)) << "Range exceeds file size"; From 51edff5d599d104e3f163ef90baf52e1214eb919 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 8 Jun 2026 15:54:15 +0800 Subject: [PATCH 4/8] fix: reuse WriteTestFile in PageFilteredRowGroupReaderTest --- .../page_filtered_row_group_reader_test.cpp | 31 ++++++------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index 634d29098..7a743f2b9 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -74,7 +74,8 @@ class PageFilteredRowGroupReaderTest : public ::testing::Test { /// @param max_row_group_length Controls row group size void WriteTestFile(const std::string& file_name, const std::shared_ptr& struct_array, - int32_t write_batch_size, int64_t max_row_group_length) { + int32_t write_batch_size, int64_t max_row_group_length, + bool enable_dictionary = false) { auto data_type = struct_array->struct_type(); auto data_schema = arrow::schema(data_type->fields()); auto data_arrow_array = std::make_unique(); @@ -84,7 +85,11 @@ class PageFilteredRowGroupReaderTest : public ::testing::Test { ::parquet::WriterProperties::Builder builder; builder.write_batch_size(write_batch_size); builder.max_row_group_length(max_row_group_length); - builder.disable_dictionary(); // Ensure page index min/max are meaningful + if (enable_dictionary) { + builder.enable_dictionary(); + } else { + builder.disable_dictionary(); // Ensure page index min/max are meaningful + } builder.enable_write_page_index(); // Enable page index for page-level filtering // Set data page size to 1 byte to force a new page after every write_batch_size rows. // The writer flushes a page when accumulated data exceeds data_pagesize, so setting @@ -749,26 +754,8 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) auto struct_array = arrow::StructArray::Make({val_array}, {field}).ValueOrDie(); // Write with dictionary encoding enabled (the key difference from other tests). - auto data_type = struct_array->struct_type(); - auto data_schema = arrow::schema(data_type->fields()); - auto data_arrow_array = std::make_unique(); - ASSERT_TRUE(arrow::ExportArray(*struct_array, data_arrow_array.get()).ok()); - ASSERT_OK_AND_ASSIGN(std::shared_ptr out, - fs_->Create(file_name, /*overwrite=*/false)); - ::parquet::WriterProperties::Builder builder; - builder.write_batch_size(10); - builder.max_row_group_length(100); - builder.enable_dictionary(); // Enable dictionary → triggers the bug - builder.enable_write_page_index(); - builder.data_pagesize(1); // Force small pages - auto writer_properties = builder.build(); - ASSERT_OK_AND_ASSIGN( - auto format_writer, - ParquetFormatWriter::Create(out, data_schema, writer_properties, - DEFAULT_PARQUET_WRITER_MAX_MEMORY_USE, arrow_pool_)); - ASSERT_OK(format_writer->AddBatch(data_arrow_array.get())); - ASSERT_OK(format_writer->Finish()); - ASSERT_OK(out->Close()); + WriteTestFile(file_name, struct_array, /*write_batch_size=*/10, + /*max_row_group_length=*/100, /*enable_dictionary=*/true); // Open the file and verify metadata confirms dictionary page presence ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_name)); From e1a6cf90ab7a5f555825fd73bf5af805ee931607 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 9 Jun 2026 10:09:48 +0800 Subject: [PATCH 5/8] style: use StructArray::Equals to compare results, and ASSERT_* to check results. --- .../page_filtered_row_group_reader_test.cpp | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index 7a743f2b9..74b2d20a3 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -835,7 +835,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) // --- Check 4: No range exceeds file size --- for (const auto& range : ranges) { - EXPECT_LE(range.offset + range.length, static_cast(length)) + ASSERT_LE(range.offset + range.length, static_cast(length)) << "Range exceeds file size"; } @@ -851,28 +851,31 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) ASSERT_EQ(100, result_all->length()) << "End-to-end read with dictionary encoding should return all 100 rows"; - // --- End-to-end check 2: manual row-level filtering --- - // Page-level filter does not do row-level filtering. Verify data content by - // scanning all returned rows and checking val == 5 appears exactly 10 times - // (val = i % 10, so rows 5,15,25,...,95 have val == 5). + // --- End-to-end check 2: verify data content using Equals --- + // Build expected array: val = i % 10 for i in [0, 100), wrapped in a struct. + arrow::Int32Builder expected_val_builder; + ASSERT_TRUE(expected_val_builder.Reserve(100).ok()); + for (int i = 0; i < 100; ++i) { + expected_val_builder.UnsafeAppend(static_cast(i % 10)); + } + auto expected_val_arr = expected_val_builder.Finish().ValueOrDie(); + + auto expected_struct_arr = arrow::StructArray::Make({expected_val_arr}, {field}).ValueOrDie(); + + // Concatenate all chunks and compare with Equals + auto actual_struct_arr = arrow::Concatenate(result_all->chunks()).ValueOrDie(); + ASSERT_TRUE(actual_struct_arr->Equals(*expected_struct_arr)) + << "Struct array content mismatch: actual values differ from expected (val = i % 10)"; + + // val == 5 appears exactly 10 times (rows 5,15,25,...,95) + auto actual_val_arr = std::dynamic_pointer_cast( + std::dynamic_pointer_cast(actual_struct_arr)->field(0)); int32_t count_val5 = 0; - int64_t total_rows_checked = 0; - for (int i = 0; i < result_all->num_chunks(); ++i) { - auto struct_arr = std::dynamic_pointer_cast(result_all->chunk(i)); - ASSERT_TRUE(struct_arr); - auto val_arr = std::dynamic_pointer_cast(struct_arr->field(0)); - ASSERT_TRUE(val_arr); - for (int64_t j = 0; j < val_arr->length(); ++j) { - auto expected = static_cast(total_rows_checked % 10); - ASSERT_EQ(expected, val_arr->Value(j)) - << "Value mismatch at row " << total_rows_checked; - if (val_arr->Value(j) == 5) { - ++count_val5; - } - ++total_rows_checked; + for (int64_t i = 0; i < actual_val_arr->length(); ++i) { + if (actual_val_arr->Value(i) == 5) { + ++count_val5; } } - ASSERT_EQ(100, total_rows_checked); ASSERT_EQ(10, count_val5) << "val == 5 should appear exactly 10 times in 100 rows"; } From 19f3f01aa9c84cb9a64ac0870544564eef7af021 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 9 Jun 2026 10:12:15 +0800 Subject: [PATCH 6/8] style: use ASSERT_* to check results --- .../format/parquet/page_filtered_row_group_reader_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index 74b2d20a3..04379c213 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -801,7 +801,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) // total_compressed_size, which overshoots by the dictionary page size. for (size_t i = 0; i < ranges.size(); ++i) { int64_t range_end = ranges[i].offset + ranges[i].length; - EXPECT_LE(range_end, true_chunk_end) + ASSERT_LE(range_end, true_chunk_end) << "Range " << i << " [offset=" << ranges[i].offset << ", length=" << ranges[i].length << "] exceeds true chunk end (" << true_chunk_end << "). " << "This indicates chunk_end is computed as data_page_offset + " From c6f25b988821e652fbb71b8b61fbf1f99421e2f2 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 9 Jun 2026 14:13:06 +0800 Subject: [PATCH 7/8] test: add a partially matched case --- .../page_filtered_row_group_reader_test.cpp | 80 ++++++++----------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index 04379c213..6e9fb3a44 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -753,8 +753,9 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) auto field = arrow::field("val", arrow::int32()); auto struct_array = arrow::StructArray::Make({val_array}, {field}).ValueOrDie(); - // Write with dictionary encoding enabled (the key difference from other tests). - WriteTestFile(file_name, struct_array, /*write_batch_size=*/10, + // Write with dictionary encoding enabled and 1 row per page. + // Each page has min==max==val for that row, enabling precise page-level skipping. + WriteTestFile(file_name, struct_array, /*write_batch_size=*/1, /*max_row_group_length=*/100, /*enable_dictionary=*/true); // Open the file and verify metadata confirms dictionary page presence @@ -769,8 +770,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) auto col_chunk = rg_metadata->ColumnChunk(0); // Precondition: dictionary page must exist for this test to be meaningful - ASSERT_TRUE(col_chunk->has_dictionary_page()) - << "Dictionary page not present - test setup error"; + ASSERT_TRUE(col_chunk->has_dictionary_page()); int64_t dict_offset = col_chunk->dictionary_page_offset(); int64_t data_page_offset = col_chunk->data_page_offset(); @@ -782,11 +782,8 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) int64_t buggy_chunk_end = data_page_offset + total_compressed_size; // Sanity: dict page is before data pages, so buggy end > true end - ASSERT_LT(dict_offset, data_page_offset) - << "Dictionary offset should be before data page offset"; - ASSERT_GT(buggy_chunk_end, true_chunk_end) - << "Buggy chunk_end should exceed true chunk_end when dictionary is present"; - + ASSERT_LT(dict_offset, data_page_offset); + ASSERT_GT(buggy_chunk_end, true_chunk_end); // Now call ComputePageRanges with all rows matching RowRanges row_ranges; row_ranges.Add(RowRanges::Range(0, 99)); @@ -801,12 +798,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) // total_compressed_size, which overshoots by the dictionary page size. for (size_t i = 0; i < ranges.size(); ++i) { int64_t range_end = ranges[i].offset + ranges[i].length; - ASSERT_LE(range_end, true_chunk_end) - << "Range " << i << " [offset=" << ranges[i].offset << ", length=" << ranges[i].length - << "] exceeds true chunk end (" << true_chunk_end << "). " - << "This indicates chunk_end is computed as data_page_offset + " - "total_compressed_size instead of dictionary_page_offset + " - "total_compressed_size."; + ASSERT_LE(range_end, true_chunk_end); } // --- Check 2: At least one non-dictionary data-page range is present --- @@ -818,9 +810,7 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) ++data_page_range_count; } } - ASSERT_GE(data_page_range_count, 1) - << "Expected at least one data-page range (offset >= " << data_page_offset - << "), but only dictionary range(s) were returned."; + ASSERT_GE(data_page_range_count, 1); // --- Check 3: Maximum range_end equals true_chunk_end when requesting all rows --- int64_t max_range_end = 0; @@ -828,15 +818,11 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) int64_t range_end = range.offset + range.length; max_range_end = std::max(max_range_end, range_end); } - ASSERT_EQ(max_range_end, true_chunk_end) - << "When requesting all rows, the maximum range_end should exactly equal " - << "true_chunk_end (" << true_chunk_end << "), but got " << max_range_end - << ". The last data page range may be truncated or missing."; + ASSERT_EQ(max_range_end, true_chunk_end); // --- Check 4: No range exceeds file size --- for (const auto& range : ranges) { - ASSERT_LE(range.offset + range.length, static_cast(length)) - << "Range exceeds file size"; + ASSERT_LE(range.offset + range.length, static_cast(length)); } // --- End-to-end check 1: read all rows (no predicate filtering) --- @@ -848,35 +834,35 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) std::shared_ptr result_all; ReadWithPredicateImpl(file_name, read_schema, predicate_all, &result_all); ASSERT_TRUE(result_all); - ASSERT_EQ(100, result_all->length()) - << "End-to-end read with dictionary encoding should return all 100 rows"; + ASSERT_EQ(100, result_all->length()); - // --- End-to-end check 2: verify data content using Equals --- + // --- End-to-end check 2: full range query with page level skipping --- // Build expected array: val = i % 10 for i in [0, 100), wrapped in a struct. - arrow::Int32Builder expected_val_builder; - ASSERT_TRUE(expected_val_builder.Reserve(100).ok()); - for (int i = 0; i < 100; ++i) { - expected_val_builder.UnsafeAppend(static_cast(i % 10)); - } - auto expected_val_arr = expected_val_builder.Finish().ValueOrDie(); - - auto expected_struct_arr = arrow::StructArray::Make({expected_val_arr}, {field}).ValueOrDie(); - // Concatenate all chunks and compare with Equals auto actual_struct_arr = arrow::Concatenate(result_all->chunks()).ValueOrDie(); - ASSERT_TRUE(actual_struct_arr->Equals(*expected_struct_arr)) - << "Struct array content mismatch: actual values differ from expected (val = i % 10)"; - - // val == 5 appears exactly 10 times (rows 5,15,25,...,95) - auto actual_val_arr = std::dynamic_pointer_cast( - std::dynamic_pointer_cast(actual_struct_arr)->field(0)); - int32_t count_val5 = 0; - for (int64_t i = 0; i < actual_val_arr->length(); ++i) { - if (actual_val_arr->Value(i) == 5) { - ++count_val5; + ASSERT_TRUE(actual_struct_arr->Equals(struct_array)); + + // --- End-to-end check 3: partial-row query with page-level skipping --- + // Predicate val >= 7 skips pages where val < 7, keeping only val in {7,8,9}. + // Out of 100 rows, 30 rows satisfy val >= 7 (3 per cycle × 10 cycles). + auto predicate_partial = PredicateBuilder::GreaterOrEqual( + /*field_index=*/0, /*field_name=*/"val", FieldType::INT, Literal(7)); + std::shared_ptr result_partial; + ReadWithPredicateImpl(file_name, read_schema, predicate_partial, &result_partial); + ASSERT_TRUE(result_partial); + + // Build expected StructArray and compare with Equals + arrow::Int32Builder expected_builder; + ASSERT_TRUE(expected_builder.Reserve(30).ok()); + for (int32_t i = 0; i < 100; ++i) { + if (i % 10 >= 7) { + expected_builder.UnsafeAppend(i % 10); } } - ASSERT_EQ(10, count_val5) << "val == 5 should appear exactly 10 times in 100 rows"; + auto expected_array = expected_builder.Finish().ValueOrDie(); + auto expected_struct = arrow::StructArray::Make({expected_array}, {field}).ValueOrDie(); + auto partial_concat = arrow::Concatenate(result_partial->chunks()).ValueOrDie(); + ASSERT_TRUE(partial_concat->Equals(expected_struct)); } } // namespace paimon::parquet::test From 8d598704342b6f013f72bb7a27ad97ff5a542429 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 9 Jun 2026 17:50:00 +0800 Subject: [PATCH 8/8] style: use range based loop --- .../format/parquet/page_filtered_row_group_reader_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp index 6e9fb3a44..477b2bf32 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader_test.cpp @@ -796,8 +796,8 @@ TEST_F(PageFilteredRowGroupReaderTest, ComputePageRangesWithDictionaryEncoding) // --- Check 1: No range should extend beyond the true chunk end --- // With the bug, the last data page's range would use chunk_end = data_page_offset + // total_compressed_size, which overshoots by the dictionary page size. - for (size_t i = 0; i < ranges.size(); ++i) { - int64_t range_end = ranges[i].offset + ranges[i].length; + for (auto & range : ranges) { + int64_t range_end = range.offset + range.length; ASSERT_LE(range_end, true_chunk_end); }