diff --git a/src/paimon/common/file_index/bloomfilter/bloom_filter_file_index.cpp b/src/paimon/common/file_index/bloomfilter/bloom_filter_file_index.cpp index 157361f32..06cf57343 100644 --- a/src/paimon/common/file_index/bloomfilter/bloom_filter_file_index.cpp +++ b/src/paimon/common/file_index/bloomfilter/bloom_filter_file_index.cpp @@ -78,9 +78,15 @@ BloomFilterFileIndexReader::BloomFilterFileIndexReader(const FastHash::HashFunct Result> BloomFilterFileIndexReader::VisitEqual( const Literal& literal) { + // This returns `Remain` to align with the current Java implementation in BF index, even though + // its predicate semantics are inconsistent here. In practice, equality tests in predicate + // evaluation always return false when the literal is null. See + // `null_false_leaf_binary_function.h`. + if (literal.IsNull()) { + return FileIndexResult::Remain(); + } int64_t hash = hash_function_(literal); - return literal.IsNull() || filter_.TestHash(hash) ? FileIndexResult::Remain() - : FileIndexResult::Skip(); + return filter_.TestHash(hash) ? FileIndexResult::Remain() : FileIndexResult::Skip(); } } // namespace paimon diff --git a/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.cpp b/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.cpp index 0bbd993ec..d71262cc2 100644 --- a/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.cpp +++ b/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.cpp @@ -17,7 +17,9 @@ #include "paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.h" #include +#include #include +#include #include "fmt/format.h" #include "paimon/common/file_index/bsi/bit_slice_index_roaring_bitmap.h" @@ -31,7 +33,17 @@ #include "paimon/io/data_input_stream.h" #include "paimon/memory/bytes.h" #include "paimon/utils/roaring_bitmap32.h" +namespace { +// Safe absolute value for int64_t that avoids undefined behavior when value == INT64_MIN. +// This mirrors Java's Math.abs() wrapping semantics. +inline int64_t SafeAbs(int64_t value) { + if (value == INT64_MIN) { + return INT64_MIN; + } + return value < 0 ? -value : value; +} +} // namespace namespace paimon { class MemoryPool; @@ -153,10 +165,14 @@ Result> BitSliceIndexBitmapFileIndexReader::Vis BitmapIndexResult::BitmapSupplier bitmap_supplier = [literal = literal, reader = shared_from_this()]() -> Result { PAIMON_ASSIGN_OR_RAISE(int64_t value, reader->value_mapper_(literal)); - if (value >= 0) { + if (value == INT64_MIN) { + // Everything is greater than INT64_MIN (writer cannot store it) + return RoaringBitmap32::Or(reader->positive_->IsNotNull(), + reader->negative_->IsNotNull()); + } else if (value >= 0) { return reader->positive_->GreaterThan(value); } else { - PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 b1, reader->negative_->LessThan(-value)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 b1, reader->negative_->LessThan(SafeAbs(value))); RoaringBitmap32 b2 = reader->positive_->IsNotNull(); b1 |= b2; return b1; @@ -170,10 +186,15 @@ Result> BitSliceIndexBitmapFileIndexReader::Vis BitmapIndexResult::BitmapSupplier bitmap_supplier = [literal = literal, reader = shared_from_this()]() -> Result { PAIMON_ASSIGN_OR_RAISE(int64_t value, reader->value_mapper_(literal)); - if (value >= 0) { + if (value == INT64_MIN) { + // All non-null rows satisfy x >= INT64_MIN + return RoaringBitmap32::Or(reader->positive_->IsNotNull(), + reader->negative_->IsNotNull()); + } else if (value >= 0) { return reader->positive_->GreaterOrEqual(value); } else { - PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 b1, reader->negative_->LessOrEqual(-value)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 b1, + reader->negative_->LessOrEqual(SafeAbs(value))); RoaringBitmap32 b2 = reader->positive_->IsNotNull(); b1 |= b2; return b1; @@ -187,8 +208,11 @@ Result> BitSliceIndexBitmapFileIndexReader::Vis BitmapIndexResult::BitmapSupplier bitmap_supplier = [literal = literal, reader = shared_from_this()]() -> Result { PAIMON_ASSIGN_OR_RAISE(int64_t value, reader->value_mapper_(literal)); - if (value < 0) { - return reader->negative_->GreaterThan(-value); + if (value == INT64_MIN) { + // Nothing is less than INT64_MIN + return RoaringBitmap32(); + } else if (value < 0) { + return reader->negative_->GreaterThan(SafeAbs(value)); } else { PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 b1, reader->positive_->LessThan(value)); RoaringBitmap32 b2 = reader->negative_->IsNotNull(); @@ -203,8 +227,11 @@ Result> BitSliceIndexBitmapFileIndexReader::Vis BitmapIndexResult::BitmapSupplier bitmap_supplier = [literal = literal, reader = shared_from_this()]() -> Result { PAIMON_ASSIGN_OR_RAISE(int64_t value, reader->value_mapper_(literal)); - if (value < 0) { - return reader->negative_->GreaterOrEqual(-value); + if (value == INT64_MIN) { + // Writer cannot store INT64_MIN, so no row can match + return RoaringBitmap32(); + } else if (value < 0) { + return reader->negative_->GreaterOrEqual(SafeAbs(value)); } else { PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 b1, reader->positive_->LessOrEqual(value)); RoaringBitmap32 b2 = reader->negative_->IsNotNull(); @@ -232,13 +259,17 @@ Result> BitSliceIndexBitmapFileIndexReader::Vis result_bitmaps.reserve(literals.size()); for (const auto& literal : literals) { PAIMON_ASSIGN_OR_RAISE(int64_t value, reader->value_mapper_(literal)); - RoaringBitmap32 equal; - if (value < 0) { - PAIMON_ASSIGN_OR_RAISE(equal, reader->negative_->Equal(-value)); + if (value == INT64_MIN) { + // Writer cannot store INT64_MIN, so no row can match it + result_bitmaps.emplace_back(); + } else if (value < 0) { + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 equal, + reader->negative_->Equal(SafeAbs(value))); + result_bitmaps.emplace_back(std::move(equal)); } else { - PAIMON_ASSIGN_OR_RAISE(equal, reader->positive_->Equal(value)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 equal, reader->positive_->Equal(value)); + result_bitmaps.emplace_back(std::move(equal)); } - result_bitmaps.emplace_back(std::move(equal)); } return RoaringBitmap32::FastUnion(result_bitmaps); }; @@ -255,13 +286,17 @@ Result> BitSliceIndexBitmapFileIndexReader::Vis result_bitmaps.reserve(literals.size()); for (const auto& literal : literals) { PAIMON_ASSIGN_OR_RAISE(int64_t value, reader->value_mapper_(literal)); - RoaringBitmap32 equal; - if (value < 0) { - PAIMON_ASSIGN_OR_RAISE(equal, reader->negative_->Equal(-value)); + if (value == INT64_MIN) { + // Writer cannot store INT64_MIN, so no row can match it + result_bitmaps.emplace_back(); + } else if (value < 0) { + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 equal, + reader->negative_->Equal(SafeAbs(value))); + result_bitmaps.emplace_back(std::move(equal)); } else { - PAIMON_ASSIGN_OR_RAISE(equal, reader->positive_->Equal(value)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 equal, reader->positive_->Equal(value)); + result_bitmaps.emplace_back(std::move(equal)); } - result_bitmaps.emplace_back(std::move(equal)); } auto in = RoaringBitmap32::FastUnion(result_bitmaps); ebm -= in; diff --git a/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index_test.cpp b/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index_test.cpp index cc873dbf6..e96b7b688 100644 --- a/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index_test.cpp +++ b/src/paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index_test.cpp @@ -16,6 +16,7 @@ #include "paimon/common/file_index/bsi/bit_slice_index_bitmap_file_index.h" +#include #include #include "gtest/gtest.h" @@ -385,4 +386,49 @@ TEST_F(BitSliceIndexBitmapIndexReaderTest, TestUnInvalidType) { "BitSliceIndexBitmapFileIndex only support TINYINT/SMALLINT/INT/BIGINT/DATE"); } +TEST_F(BitSliceIndexBitmapIndexReaderTest, TestReaderPredicatePruningWithInt64Min) { + // Reuse TestPrimitiveType's index bytes. + // data: null, 1, null, 2, -1 (non-null rows: {1, 3, 4}) + std::vector index_bytes = { + 1, 0, 0, 0, 5, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 58, + 48, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 16, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 2, 58, + 48, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 1, 0, 58, 48, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 16, 0, 0, 0, 3, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 58, 48, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 4, 0, 0, + 0, 0, 1, 58, 48, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 4, 0}; + auto input_stream = + std::make_shared(index_bytes.data(), index_bytes.size()); + BitSliceIndexBitmapFileIndex file_index({}); + ASSERT_OK_AND_ASSIGN( + auto reader, + file_index.CreateReader(CreateArrowSchema(arrow::int64()).get(), + /*start=*/0, /*length=*/index_bytes.size(), input_stream, pool_)); + + // x > INT64_MIN: all non-null rows (writer cannot store INT64_MIN) + CheckResult(reader->VisitGreaterThan(Literal(INT64_MIN)).value(), {1, 3, 4}); + + // x >= INT64_MIN: all non-null rows + CheckResult(reader->VisitGreaterOrEqual(Literal(INT64_MIN)).value(), {1, 3, 4}); + + // x < INT64_MIN: empty + CheckResult(reader->VisitLessThan(Literal(INT64_MIN)).value(), {}); + + // x <= INT64_MIN: empty (no row has INT64_MIN) + CheckResult(reader->VisitLessOrEqual(Literal(INT64_MIN)).value(), {}); + + // x == INT64_MIN: empty + CheckResult(reader->VisitEqual(Literal(INT64_MIN)).value(), {}); + + // x != INT64_MIN: all non-null rows + CheckResult(reader->VisitNotEqual(Literal(INT64_MIN)).value(), {1, 3, 4}); + + // x IN (INT64_MIN, -1): only -1 matches → row 4 + CheckResult(reader->VisitIn({Literal(INT64_MIN), Literal(static_cast(-1))}).value(), + {4}); + + // x NOT IN (INT64_MIN, -1): all non-null except row 4 + CheckResult(reader->VisitNotIn({Literal(INT64_MIN), Literal(static_cast(-1))}).value(), + {1, 3}); +} + } // namespace paimon::test