From cf44831f258a6b55362ebd4eed8416baef955314 Mon Sep 17 00:00:00 2001 From: duanyyyyyyy Date: Thu, 4 Jun 2026 11:30:54 +0800 Subject: [PATCH] [core] Remap predicate field index to projected read schema in InternalReadContext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Predicates are constructed against the latest table schema, so each LeafPredicate carries a field index that points to the column's position in the full table schema. When the query projects a subset of columns, the read_schema built inside InternalReadContext::Create lays those columns out at different positions; the leaf's field index no longer matches the column it names. The strict field-index validation that runs immediately afterwards then fails: Paimon TableRead::Create error: Invalid: field obs_index has field idx 0 in input schema, mismatch field idx 1 in predicate The downstream LeafPredicateImpl::Test paths use field_index_ directly to index into arrow arrays / internal rows built from read_schema, so even if validation were relaxed, leaving the mismatch in place would silently read the wrong column. Walk the predicate tree once at context construction and rebuild each LeafPredicate with the field index resolved through a table-schema position → read-schema position mapping keyed by the stable field id (not the field name). A leaf whose field is absent from the read schema is dropped: an AND keeps its remaining children, while an OR with any non-projectable child is dropped as a whole, so the resulting predicate may be null. CompoundPredicate is reconstructed only when any descendant actually changed, otherwise the original shared_ptr is reused. GetPredicate() now returns the remapped predicate; the original (with latest-schema indices) is still available via the wrapped ReadContext if ever needed. CreateWithSchema lays the context over a different read_schema (e.g. the minimal column set for COUNT(*)), so it also remaps from the original caller-supplied predicate against the new read_schema rather than copying the already-remapped one whose indices are aligned with the original read_schema. 
Repro: -- DE table with btree global index on obs_index CREATE TABLE t ( clip_id STRING, obs_index INT, time_offset_ms BIGINT, collected_date DATE ) PARTITIONED BY (collected_date, clip_id) TBLPROPERTIES ( 'data-evolution.enabled' = 'true', 'global-index.btree.index-column' = 'obs_index', 'bucket' = '-1' ); -- after INSERT + CALL paimon.sys.create_global_index(...) SELECT clip_id, obs_index, time_offset_ms FROM t WHERE collected_date = '2026-05-26' AND clip_id = 'clip_a' AND obs_index BETWEEN 0 AND 10; -- before: TableRead::Create error (field idx mismatch) -- after: returns matching rows Coverage: InternalReadContext gains six cases — TestPredicateFieldIdxRemappedWhenProjected, TestPredicateUnchangedWhenAligned, TestCompoundPredicateRemap, TestPredicateOnFieldOutsideProjectionDropped, TestAndChildOutsideProjectionDropped, TestOrChildOutsideProjectionDropsWholePredicate — covering the projected/aligned/compound paths through the remap and the leaf/AND/OR handling of fields outside the projection. Signed-off-by: duanyyyyyyy --- include/paimon/predicate/leaf_predicate.h | 5 + include/paimon/predicate/predicate_builder.h | 23 ++- include/paimon/read_context.h | 7 + .../common/predicate/predicate_builder.cpp | 2 - .../core/operation/internal_read_context.cpp | 148 +++++++++++++++-- .../core/operation/internal_read_context.h | 13 +- .../operation/internal_read_context_test.cpp | 149 ++++++++++++++++++ .../core/table/source/table_read_test.cpp | 12 -- 8 files changed, 326 insertions(+), 33 deletions(-) diff --git a/include/paimon/predicate/leaf_predicate.h b/include/paimon/predicate/leaf_predicate.h index 8924e52e2..be173a933 100644 --- a/include/paimon/predicate/leaf_predicate.h +++ b/include/paimon/predicate/leaf_predicate.h @@ -32,6 +32,11 @@ enum class FieldType; /// Leaf node of a `Predicate` tree. Compares a field with literals. class PAIMON_EXPORT LeafPredicate : virtual public Predicate { public: + /// The field's position in the schema this predicate is currently bound to. 
+ /// At construction the value reflects the schema the caller supplied to + /// `PredicateBuilder`; predicates obtained from `InternalReadContext::GetPredicate()` + /// have already been projected onto the read schema (see + /// `InternalReadContext::GetPredicate()` for the projection semantics). int32_t FieldIndex() const { return field_index_; } diff --git a/include/paimon/predicate/predicate_builder.h b/include/paimon/predicate/predicate_builder.h index 77e0ace12..78621a57c 100644 --- a/include/paimon/predicate/predicate_builder.h +++ b/include/paimon/predicate/predicate_builder.h @@ -32,6 +32,17 @@ enum class FieldType; /// /// PredicateBuilder provides static factory methods to create various types of predicates /// that can be used for filtering data in Paimon tables. +/// +/// The `field_index` parameter accepted by every factory method is the position of the +/// field in the schema the caller is working with — typically the latest table schema. +/// When the resulting predicate is later attached to an `InternalReadContext`, +/// `InternalReadContext::Create` projects each leaf onto the read schema (mirrors +/// paimon Java's `PredicateProjectionConverter`): leaf field indices are rewritten via +/// the table-schema → read-schema position mapping (keyed by the stable paimon field +/// id, so it survives column renames); leaves whose fields are not in the projection +/// are dropped, and an OR with any such child is dropped as a whole. Callers need not +/// track projection state themselves. `field_name` is informational (used for debug / +/// display) and does not participate in projection lookup. class PAIMON_EXPORT PredicateBuilder { public: PredicateBuilder() = delete; @@ -39,8 +50,10 @@ class PAIMON_EXPORT PredicateBuilder { /// Create an equality predicate (field == literal). /// - /// @param field_index The index of the field in read schema (0-based). - /// @param field_name The name of the field. 
+ /// @param field_index The position of the field in the schema the caller is working + /// with (0-based); projected onto the read schema by + /// `InternalReadContext::Create`. See class doc for details. + /// @param field_name The name of the field (informational; see class doc). /// @param field_type The data type of the field. /// @param literal The literal value to compare against. /// @return A shared pointer to the created Predicate object. @@ -99,8 +112,10 @@ class PAIMON_EXPORT PredicateBuilder { /// /// Tests whether the field value falls within the specified range (inclusive on both ends). /// - /// @param field_index The index of the field in read schema (0-based). - /// @param field_name The name of the field. + /// @param field_index The position of the field in the schema the caller is working + /// with (0-based); projected onto the read schema by + /// `InternalReadContext::Create`. See class doc for details. + /// @param field_name The name of the field (informational; see class doc). /// @param field_type The data type of the field. /// @param included_lower_bound The lower bound of the range (inclusive). /// @param included_upper_bound The upper bound of the range (inclusive). diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h index 4c4e15899..fd5334cea 100644 --- a/include/paimon/read_context.h +++ b/include/paimon/read_context.h @@ -206,6 +206,13 @@ class PAIMON_EXPORT ReadContextBuilder { /// It can significantly improve performance by reducing the amount of data /// that needs to be read and processed. /// + /// The caller should construct the predicate against the latest table schema. + /// `InternalReadContext::Create` projects each leaf onto the read schema + /// (mirroring paimon Java's `PredicateProjectionConverter`): leaf field indices + /// are rewritten to positions in the read schema; AND children on non-projected fields + /// are pruned, and an OR with any such child is dropped whole. 
The predicate therefore + /// does not need to be projection-aware. + /// /// @param predicate Shared pointer to the predicate for data filtering. /// @return Reference to this builder for method chaining. ReadContextBuilder& SetPredicate(const std::shared_ptr& predicate); diff --git a/src/paimon/common/predicate/predicate_builder.cpp b/src/paimon/common/predicate/predicate_builder.cpp index a8b798f6d..0c7b86abc 100644 --- a/src/paimon/common/predicate/predicate_builder.cpp +++ b/src/paimon/common/predicate/predicate_builder.cpp @@ -42,8 +42,6 @@ namespace paimon { enum class FieldType; -// TODO(xinyu.lxy): predicate field_index use index in read schema now, but java paimon use index -// in file schema std::shared_ptr PredicateBuilder::Equal(int32_t field_index, const std::string& field_name, const FieldType& field_type, diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp index ec74c24d6..af77641b9 100644 --- a/src/paimon/core/operation/internal_read_context.cpp +++ b/src/paimon/core/operation/internal_read_context.cpp @@ -16,12 +16,16 @@ #include "paimon/core/operation/internal_read_context.h" +#include #include +#include "paimon/common/predicate/compound_predicate_impl.h" +#include "paimon/common/predicate/leaf_predicate_impl.h" #include "paimon/common/predicate/predicate_validator.h" #include "paimon/common/table/special_fields.h" #include "paimon/common/types/data_field.h" #include "paimon/core/schema/arrow_schema_validator.h" +#include "paimon/predicate/function.h" #include "paimon/status.h" namespace arrow { @@ -29,6 +33,107 @@ class Schema; } // namespace arrow namespace paimon { +namespace { +// Build a map from a field's position in `table_schema` (the latest table schema, the +// index space upstream predicates are typically constructed against) to its position +// in `read_data_fields` (the projected read schema). 
Field identity is the stable +// field id, so the mapping survives column renames within the same schema. Read-only +// special fields (RowId / SequenceNumber / etc.) have no analogue in the table +// schema and are skipped — user-supplied predicates do not reference them. +std::map BuildLatestToReadIdxMapping( + const TableSchema& table_schema, const std::vector& read_data_fields) { + std::map id_to_latest_idx; + const auto& table_fields = table_schema.Fields(); + for (size_t latest_idx = 0; latest_idx < table_fields.size(); latest_idx++) { + id_to_latest_idx[table_fields[latest_idx].Id()] = static_cast(latest_idx); + } + std::map mapping; + for (size_t read_idx = 0; read_idx < read_data_fields.size(); read_idx++) { + auto iter = id_to_latest_idx.find(read_data_fields[read_idx].Id()); + if (iter != id_to_latest_idx.end()) { + mapping[iter->second] = static_cast(read_idx); + } + } + return mapping; +} + +// Project `predicate` onto the read schema, rewriting each leaf's field index via +// `latest_to_read_idx` (predicate's source index space → read schema position). +// +// Inclusive semantics, matching paimon Java's PredicateProjectionConverter: +// - Leaf whose field is not in the read schema: dropped (returns nullopt). +// - AND: drop non-projectable children, keep the rest. Safe because if `A AND B` +// holds for a row, A holds too; the projected predicate is a superset +// (necessary, not sufficient). +// - OR: every child must be projectable. If any child is dropped, drop the +// whole OR — otherwise a row that only satisfied the dropped branch would +// falsely pass. +// - Predicates without a field index (e.g. full-text / vector search) flow +// through unchanged. 
+Result>> ProjectPredicate( + const std::map& latest_to_read_idx, + const std::shared_ptr& predicate) { + if (auto leaf = std::dynamic_pointer_cast(predicate)) { + auto iter = latest_to_read_idx.find(leaf->FieldIndex()); + if (iter == latest_to_read_idx.end()) { + return std::optional>{}; + } + if (iter->second == leaf->FieldIndex()) { + return std::optional>{predicate}; + } + return std::optional>{ + std::static_pointer_cast(leaf->NewLeafPredicate(iter->second))}; + } + if (auto compound = std::dynamic_pointer_cast(predicate)) { + const bool is_and = compound->GetFunction().GetType() == Function::Type::AND; + std::vector> projected_children; + projected_children.reserve(compound->Children().size()); + bool any_changed = false; + for (const auto& child : compound->Children()) { + PAIMON_ASSIGN_OR_RAISE(std::optional> projected_child, + ProjectPredicate(latest_to_read_idx, child)); + if (!projected_child.has_value()) { + if (!is_and) { + return std::optional>{}; + } + any_changed = true; + continue; + } + if (projected_child.value() != child) { + any_changed = true; + } + projected_children.push_back(std::move(projected_child.value())); + } + if (projected_children.empty()) { + return std::optional>{}; + } + if (projected_children.size() == 1) { + return std::optional>{std::move(projected_children[0])}; + } + if (!any_changed) { + return std::optional>{predicate}; + } + return std::optional>{std::static_pointer_cast( + compound->NewCompoundPredicate(projected_children))}; + } + return std::optional>{predicate}; +} + +Result> ProjectAndValidatePredicate( + const arrow::Schema& read_schema, const std::map& latest_to_read_idx, + const std::shared_ptr& predicate) { + PAIMON_ASSIGN_OR_RAISE(std::optional> projected, + ProjectPredicate(latest_to_read_idx, predicate)); + if (!projected.has_value()) { + return std::shared_ptr{}; + } + PAIMON_RETURN_NOT_OK(PredicateValidator::ValidatePredicateWithSchema( + read_schema, projected.value(), /*validate_field_idx=*/true)); + 
PAIMON_RETURN_NOT_OK(PredicateValidator::ValidatePredicateWithLiterals(projected.value())); + return projected.value(); +} +} // namespace + Result> InternalReadContext::Create( const std::shared_ptr& context, const std::shared_ptr& table_schema, const std::map& options) { @@ -95,34 +200,51 @@ Result> InternalReadContext::Create( auto read_schema = DataField::ConvertDataFieldsToArrowSchema(read_data_fields); // validate read schema to avoid redundant fields PAIMON_RETURN_NOT_OK(ArrowSchemaValidator::ValidateSchemaWithFieldId(*read_schema)); - // validate predicate + // Project the upstream predicate onto `read_schema`. Predicates carry field indices + // pointing into the latest table schema; rewrite them to positions in `read_schema_` + // so downstream readers can apply them directly. + std::shared_ptr projected_predicate; if (context->GetPredicate()) { - PAIMON_RETURN_NOT_OK(PredicateValidator::ValidatePredicateWithSchema( - *read_schema, context->GetPredicate(), /*validate_field_idx=*/true)); - PAIMON_RETURN_NOT_OK( - PredicateValidator::ValidatePredicateWithLiterals(context->GetPredicate())); + auto latest_to_read_idx = BuildLatestToReadIdxMapping(*table_schema, read_data_fields); + PAIMON_ASSIGN_OR_RAISE( + projected_predicate, + ProjectAndValidatePredicate(*read_schema, latest_to_read_idx, context->GetPredicate())); } - return std::unique_ptr( - new InternalReadContext(context, table_schema, read_schema, core_options)); + return std::unique_ptr(new InternalReadContext( + context, table_schema, read_schema, core_options, std::move(projected_predicate))); } InternalReadContext::InternalReadContext(const std::shared_ptr& read_context, const std::shared_ptr& table_schema, const std::shared_ptr& read_schema, - const CoreOptions& options) + const CoreOptions& options, + std::shared_ptr projected_predicate) : read_context_(read_context), table_schema_(table_schema), read_schema_(read_schema), - options_(options) {} + options_(options), + 
projected_predicate_(std::move(projected_predicate)) {} Result> InternalReadContext::CreateWithSchema( const std::shared_ptr& original, const std::shared_ptr& new_read_schema) { - // Create a new InternalReadContext sharing all properties except read_schema. - // The new read_schema is the minimal column set for COUNT(*). - return std::shared_ptr(new InternalReadContext( - original->read_context_, original->table_schema_, new_read_schema, original->options_)); + // The wrapped read_context still holds the caller-supplied predicate (in the latest + // table schema's index space). Re-project it against `new_read_schema` directly, + // rather than re-projecting the already-projected predicate sitting on `original`. + std::shared_ptr projected_predicate; + if (original->read_context_->GetPredicate()) { + PAIMON_ASSIGN_OR_RAISE(std::vector new_read_data_fields, + DataField::ConvertArrowSchemaToDataFields(new_read_schema)); + auto latest_to_read_idx = + BuildLatestToReadIdxMapping(*original->table_schema_, new_read_data_fields); + PAIMON_ASSIGN_OR_RAISE(projected_predicate, ProjectAndValidatePredicate( + *new_read_schema, latest_to_read_idx, + original->read_context_->GetPredicate())); + } + return std::shared_ptr( + new InternalReadContext(original->read_context_, original->table_schema_, new_read_schema, + original->options_, std::move(projected_predicate))); } } // namespace paimon diff --git a/src/paimon/core/operation/internal_read_context.h b/src/paimon/core/operation/internal_read_context.h index 12b734a62..abe23b907 100644 --- a/src/paimon/core/operation/internal_read_context.h +++ b/src/paimon/core/operation/internal_read_context.h @@ -62,8 +62,16 @@ class InternalReadContext { const std::vector& GetPrimaryKeys() const { return table_schema_->PrimaryKeys(); } + // Returns the predicate projected onto `read_schema_`. 
Upstream constructs predicates + // against the latest table schema, so when the query projects a subset of columns + // the leaf field indices no longer match the projected schema. The projection is + // done once at context construction (mirrors paimon Java's + // `PredicateProjectionConverter`): leaf indices are rewritten via the table-schema + // → read-schema mapping, AND children whose fields are absent from the read schema + // are dropped (inclusive), and OR is dropped wholesale if any of its children is not + // projectable. May be nullptr if the entire predicate is non-projectable. const std::shared_ptr& GetPredicate() const { - return read_context_->GetPredicate(); + return projected_predicate_; } bool EnablePredicateFilter() const { return read_context_->EnablePredicateFilter(); @@ -110,12 +118,13 @@ class InternalReadContext { InternalReadContext(const std::shared_ptr& read_context, const std::shared_ptr& table_schema, const std::shared_ptr& read_schema, - const CoreOptions& options); + const CoreOptions& options, std::shared_ptr projected_predicate); std::shared_ptr read_context_; std::shared_ptr table_schema_; std::shared_ptr read_schema_; CoreOptions options_; + std::shared_ptr projected_predicate_; }; } // namespace paimon diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp index e48336b3f..c15480d5a 100644 --- a/src/paimon/core/operation/internal_read_context_test.cpp +++ b/src/paimon/core/operation/internal_read_context_test.cpp @@ -25,6 +25,9 @@ #include "paimon/core/schema/schema_manager.h" #include "paimon/defs.h" #include "paimon/fs/local/local_file_system.h" +#include "paimon/predicate/compound_predicate.h" +#include "paimon/predicate/leaf_predicate.h" +#include "paimon/predicate/predicate_builder.h" #include "paimon/status.h" #include "paimon/testing/utils/testharness.h" @@ -191,4 +194,150 @@ TEST(InternalReadContext, TestReadWithFieldIdsAndSpecialFields) { } } +// 
Upstream predicates are constructed against the latest table schema. When the query +// projects a subset of columns, the read schema built inside InternalReadContext::Create +// lays those columns out at different positions and the original field_index no longer +// matches. The remapping path in Create() must rewrite each leaf predicate so its +// field_index points to the column's position in the projected read schema. +TEST(InternalReadContext, TestPredicateFieldIdxRemappedWhenProjected) { + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + ReadContextBuilder context_builder(path); + // read_schema lays out (f3, f0) — f3 ends up at position 0, f0 at position 1, while + // the latest table schema has them at 3 and 0 respectively. + context_builder.SetReadSchema({"f3", "f0"}); + // Predicate was constructed against the latest schema, so f3 carries field_index 3. + auto predicate = + PredicateBuilder::Equal(/*field_index=*/3, "f3", FieldType::DOUBLE, Literal(1.5)); + context_builder.SetPredicate(predicate); + ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); + SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + ASSERT_OK_AND_ASSIGN(auto internal_context, + InternalReadContext::Create(std::move(read_context), table_schema, + table_schema->Options())); + auto leaf = std::dynamic_pointer_cast(internal_context->GetPredicate()); + ASSERT_NE(leaf, nullptr); + ASSERT_EQ(leaf->FieldName(), "f3"); + // f3 is the first column in the projected read schema. + ASSERT_EQ(leaf->FieldIndex(), 0); +} + +// When the predicate already aligns with the read schema, remapping should be a no-op +// and return the original shared_ptr without reconstructing the leaf. 
+TEST(InternalReadContext, TestPredicateUnchangedWhenAligned) { + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + ReadContextBuilder context_builder(path); + // No projection -> read schema matches the latest schema: f0(0), f1(1), f2(2), f3(3). + auto predicate = + PredicateBuilder::Equal(/*field_index=*/3, "f3", FieldType::DOUBLE, Literal(1.5)); + context_builder.SetPredicate(predicate); + ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); + SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + ASSERT_OK_AND_ASSIGN(auto internal_context, + InternalReadContext::Create(std::move(read_context), table_schema, + table_schema->Options())); + // shared_ptr equality: remap returned the input unchanged. + ASSERT_EQ(predicate.get(), internal_context->GetPredicate().get()); +} + +// CompoundPredicate is recursively remapped: every nested leaf must point to the +// column's position in the projected read schema. +TEST(InternalReadContext, TestCompoundPredicateRemap) { + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + ReadContextBuilder context_builder(path); + context_builder.SetReadSchema({"f3", "f0"}); + // AND(f3 == 1.5, f0 == "x") with field indices from the latest schema. 
+ auto left = PredicateBuilder::Equal(/*field_index=*/3, "f3", FieldType::DOUBLE, Literal(1.5)); + auto right = PredicateBuilder::Equal(/*field_index=*/0, "f0", FieldType::STRING, + Literal(FieldType::STRING, "x", 1)); + ASSERT_OK_AND_ASSIGN(auto compound, PredicateBuilder::And({left, right})); + context_builder.SetPredicate(compound); + ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); + SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + ASSERT_OK_AND_ASSIGN(auto internal_context, + InternalReadContext::Create(std::move(read_context), table_schema, + table_schema->Options())); + auto remapped_compound = + std::dynamic_pointer_cast(internal_context->GetPredicate()); + ASSERT_NE(remapped_compound, nullptr); + ASSERT_EQ(remapped_compound->Children().size(), 2); + auto remapped_left = std::dynamic_pointer_cast(remapped_compound->Children()[0]); + auto remapped_right = + std::dynamic_pointer_cast(remapped_compound->Children()[1]); + ASSERT_NE(remapped_left, nullptr); + ASSERT_NE(remapped_right, nullptr); + ASSERT_EQ(remapped_left->FieldName(), "f3"); + ASSERT_EQ(remapped_left->FieldIndex(), 0); + ASSERT_EQ(remapped_right->FieldName(), "f0"); + ASSERT_EQ(remapped_right->FieldIndex(), 1); +} + +// A leaf whose field is not in the projected read schema is dropped silently +// (inclusive projection — same as paimon Java's PredicateProjectionConverter). +// The caller's request still succeeds; the predicate is simply unavailable for +// pushdown at this read. 
+TEST(InternalReadContext, TestPredicateOnFieldOutsideProjectionDropped) { + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + ReadContextBuilder context_builder(path); + context_builder.SetReadSchema({"f3", "f0"}); + auto predicate = PredicateBuilder::Equal(/*field_index=*/1, "f1", FieldType::INT, Literal(7)); + context_builder.SetPredicate(predicate); + ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); + SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + ASSERT_OK_AND_ASSIGN(auto internal_context, + InternalReadContext::Create(std::move(read_context), table_schema, + table_schema->Options())); + ASSERT_EQ(internal_context->GetPredicate(), nullptr); +} + +// AND is inclusive: a child whose field is not in the read schema is dropped, the rest +// of the AND survives. Verifies the same semantic paimon Java applies in +// PredicateProjectionConverter (necessary, not sufficient — projection is a superset). +TEST(InternalReadContext, TestAndChildOutsideProjectionDropped) { + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + ReadContextBuilder context_builder(path); + context_builder.SetReadSchema({"f3", "f0"}); + // AND(f3 == 1.5, f1 == 7): f1 is outside the projection; only f3 should survive. 
+ auto projectable = + PredicateBuilder::Equal(/*field_index=*/3, "f3", FieldType::DOUBLE, Literal(1.5)); + auto dropped = PredicateBuilder::Equal(/*field_index=*/1, "f1", FieldType::INT, Literal(7)); + ASSERT_OK_AND_ASSIGN(auto compound, PredicateBuilder::And({projectable, dropped})); + context_builder.SetPredicate(compound); + ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); + SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + ASSERT_OK_AND_ASSIGN(auto internal_context, + InternalReadContext::Create(std::move(read_context), table_schema, + table_schema->Options())); + auto surviving_leaf = + std::dynamic_pointer_cast(internal_context->GetPredicate()); + ASSERT_NE(surviving_leaf, nullptr); + ASSERT_EQ(surviving_leaf->FieldName(), "f3"); + ASSERT_EQ(surviving_leaf->FieldIndex(), 0); +} + +// OR is strict: if any branch is not projectable, the whole OR is dropped, otherwise +// rows that only satisfied the dropped branch would falsely pass through the projected +// predicate. Verifies the strict half of paimon Java's PredicateProjectionConverter. 
+TEST(InternalReadContext, TestOrChildOutsideProjectionDropsWholePredicate) { + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + ReadContextBuilder context_builder(path); + context_builder.SetReadSchema({"f3", "f0"}); + auto projectable = + PredicateBuilder::Equal(/*field_index=*/3, "f3", FieldType::DOUBLE, Literal(1.5)); + auto dropped = PredicateBuilder::Equal(/*field_index=*/1, "f1", FieldType::INT, Literal(7)); + ASSERT_OK_AND_ASSIGN(auto compound, PredicateBuilder::Or({projectable, dropped})); + context_builder.SetPredicate(compound); + ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); + SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + ASSERT_OK_AND_ASSIGN(auto internal_context, + InternalReadContext::Create(std::move(read_context), table_schema, + table_schema->Options())); + ASSERT_EQ(internal_context->GetPredicate(), nullptr); +} + } // namespace paimon::test diff --git a/src/paimon/core/table/source/table_read_test.cpp b/src/paimon/core/table/source/table_read_test.cpp index 762e9362c..48a49038d 100644 --- a/src/paimon/core/table/source/table_read_test.cpp +++ b/src/paimon/core/table/source/table_read_test.cpp @@ -67,18 +67,6 @@ TEST(TableReadTest, TestReadWithInvalidContext) { ASSERT_NOK_WITH_MSG(TableRead::Create(std::move(read_context)), "schema type double mismatches predicate field type BIGINT"); } - { - // field idx in predicate mismatch in schema - auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f3", - FieldType::DOUBLE, Literal(15.0)); - ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0", "f1"}); - context_builder.SetPredicate(predicate); - ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); - ASSERT_NOK_WITH_MSG( - TableRead::Create(std::move(read_context)), - "field f3 has field idx 0 in input schema, mismatch field idx 2 
in predicate"); - } { // literal cannot be null auto predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3",