From 305e7241291434299dc611b415ee94c67520514b Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Wed, 7 Jan 2026 23:40:48 +0100 Subject: [PATCH 1/4] Add support for ALIAS columns in segments for engine=Hybrid --- src/Storages/StorageDistributed.cpp | 19 ++-- .../03645_hybrid_alias_columns.reference | 9 ++ .../03645_hybrid_alias_columns.sql | 97 +++++++++++++++++++ 3 files changed, 118 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/03645_hybrid_alias_columns.reference create mode 100644 tests/queries/0_stateless/03645_hybrid_alias_columns.sql diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index d518887b3f6d..57bd7e753075 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -2247,8 +2247,9 @@ void StorageDistributed::setCachedColumnsToCast(ColumnsDescription columns) if (!cached_columns_to_cast.empty() && log) { Names columns_with_types; - columns_with_types.reserve(cached_columns_to_cast.getAllPhysical().size()); - for (const auto & col : cached_columns_to_cast.getAllPhysical()) + const auto cached_columns = cached_columns_to_cast.getAllPhysical(); + columns_with_types.reserve(cached_columns.size()); + for (const auto & col : cached_columns) columns_with_types.emplace_back(col.name + " " + col.type->getName()); LOG_DEBUG(log, "Hybrid auto-cast will apply to: [{}]", fmt::join(columns_with_types, ", ")); } @@ -2420,12 +2421,15 @@ void registerStorageHybrid(StorageFactory & factory) if (columns_to_use.empty()) columns_to_use = first_segment_columns; + const auto physical_columns = columns_to_use.getAllPhysical(); + NameSet columns_to_cast_names; auto validate_segment_schema = [&](const ColumnsDescription & segment_columns, const String & segment_name) { - for (const auto & column : columns_to_use.getAllPhysical()) + for (const auto & column : physical_columns) { - auto found = segment_columns.tryGetPhysical(column.name); + // all columns defined as physical in hybrid should exists in segments (but can be aliases there) + auto found = segment_columns.tryGetColumn(GetColumnsOptions(GetColumnsOptions::AllPhysicalAndAliases), column.name); if (!found) { throw Exception( @@ -2434,6 +2438,7 @@ void registerStorageHybrid(StorageFactory & factory) segment_name, column.name); } + // if the type of the column is the segment differs - we need to add it to the list of columns which require casts if (!found->type->equals(*column.type)) columns_to_cast_names.emplace(column.name); } @@ -2467,8 +2472,6 @@ void registerStorageHybrid(StorageFactory & factory) "TableFunctionRemote did not return a StorageDistributed or StorageProxy, got: {}", actual_type); } - const auto physical_columns = columns_to_use.getAllPhysical(); - auto validate_predicate = [&](ASTPtr & predicate, size_t argument_index) { try @@ -2619,7 +2622,9 @@ void registerStorageHybrid(StorageFactory & factory) if (!columns_to_cast_names.empty()) { NamesAndTypesList cast_cols; - for (const auto & col : columns_to_use.getAllPhysical()) + + // 'physical' columns of Hybrid will be read from segments, and may need CASTS + for (const auto & col : physical_columns) { if (columns_to_cast_names.contains(col.name)) cast_cols.emplace_back(col.name, col.type); diff --git a/tests/queries/0_stateless/03645_hybrid_alias_columns.reference b/tests/queries/0_stateless/03645_hybrid_alias_columns.reference new file mode 100644 index 000000000000..d20eba96c358 --- /dev/null +++ b/tests/queries/0_stateless/03645_hybrid_alias_columns.reference @@ -0,0 +1,9 @@ +test1 +1 ['foo1','bar1_before'] foo1 +2 ['foo2','bar2_after'] foo2 +Insert into Hybrid with EPHEMERAL column +2 0A0B0C0D +Select from Hybrid with EPHEMERAL column +1 5A90B714 +2 0A0B0C0D +10 01020304 diff --git a/tests/queries/0_stateless/03645_hybrid_alias_columns.sql b/tests/queries/0_stateless/03645_hybrid_alias_columns.sql new file mode 100644 index 000000000000..ab9dde1eae0d --- /dev/null +++ b/tests/queries/0_stateless/03645_hybrid_alias_columns.sql @@ -0,0 +1,97 @@ +SET allow_experimental_hybrid_table = 1, + prefer_localhost_replica = 0; + +DROP TABLE IF EXISTS test_hybrid_alias_cast; +DROP TABLE IF EXISTS test_hybrid_alias_after; +DROP TABLE IF EXISTS test_hybrid_alias_before; + +CREATE TABLE test_hybrid_alias_after +( + a UInt32, + arr Array(String), + arr_1 ALIAS arr[1] +) +ENGINE = MergeTree() +ORDER BY (a, arr[1]) +SETTINGS index_granularity = 1; + +CREATE TABLE test_hybrid_alias_before +( + a UInt32, + arr Array(String), + arr_1 MATERIALIZED arr[1] +) +ENGINE = MergeTree() +ORDER BY (a, arr_1) +SETTINGS index_granularity = 1; + +INSERT INTO test_hybrid_alias_after VALUES (1, ['foo1', 'bar1_after']), (2, ['foo2', 'bar2_after']); +INSERT INTO test_hybrid_alias_before VALUES (1, ['foo1', 'bar1_before']), (2, ['foo2', 'bar2_before']); + +CREATE TABLE test_hybrid_alias_cast +( + a UInt32, + arr Array(String), + arr_1 String +) +ENGINE = Hybrid( + remote('127.0.0.1:9000', currentDatabase(), 'test_hybrid_alias_after'), + a >= 2, + remote('127.0.0.1:9000', currentDatabase(), 'test_hybrid_alias_before'), + a < 2 +); + +SELECT 'test1'; +SELECT * FROM test_hybrid_alias_cast WHERE arr_1 like 'foo%' ORDER BY a; + +DROP TABLE test_hybrid_alias_cast; +DROP TABLE test_hybrid_alias_after; +DROP TABLE test_hybrid_alias_before; + +DROP TABLE IF EXISTS test_hybrid_ephem; +DROP TABLE IF EXISTS test_hybrid_ephem_after; +DROP TABLE IF EXISTS test_hybrid_ephem_before; + +CREATE TABLE test_hybrid_ephem_after +( + id UInt64, + unhexed String EPHEMERAL, + hexed FixedString(4) DEFAULT unhex(unhexed) +) +ENGINE = MergeTree() +ORDER BY id; + +CREATE TABLE test_hybrid_ephem_before +( + id UInt64, + hexed FixedString(4) +) +ENGINE = MergeTree() +ORDER BY id; + +INSERT INTO test_hybrid_ephem_after (id, unhexed) VALUES (1, '5a90b714'); +INSERT INTO test_hybrid_ephem_before (id, hexed) VALUES (10, unhex('01020304')); + +CREATE TABLE test_hybrid_ephem +( + id UInt64, + unhexed String EPHEMERAL, + hexed FixedString(4) DEFAULT unhex(unhexed) +) +ENGINE = Hybrid( + remote('127.0.0.1:9000', currentDatabase(), 'test_hybrid_ephem_after'), + id < 10, + remote('127.0.0.1:9000', currentDatabase(), 'test_hybrid_ephem_before'), + id >= 10 +); + +SELECT 'Insert into Hybrid with EPHEMERAL column'; +INSERT INTO test_hybrid_ephem (id, unhexed) VALUES (2, '0a0b0c0d'); +SELECT id, hex(hexed) FROM test_hybrid_ephem_after WHERE id = 2; + +SELECT 'Select from Hybrid with EPHEMERAL column'; +SELECT id, hex(hexed) FROM test_hybrid_ephem ORDER BY id; + +DROP TABLE test_hybrid_ephem; +DROP TABLE test_hybrid_ephem_after; +DROP TABLE test_hybrid_ephem_before; From 3660bb8f1a0acacd547464508712f44581428070 Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Tue, 20 Jan 2026 10:00:11 +0100 Subject: [PATCH 2/4] Preserve ALIAS column order for distributed reads Store ALIAS expressions in insertion order to keep output column ordering stable in distributed queries, and track alias names separately for membership checks. --- src/Planner/PlannerJoinTree.cpp | 5 +++-- src/Planner/TableExpressionData.h | 22 ++++++++++++++++--- src/Storages/SelectQueryInfo.cpp | 3 +-- ...6_distributed_alias_column_order.reference | 2 ++ .../03726_distributed_alias_column_order.sql | 17 ++++++++++++++ 5 files changed, 42 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/03726_distributed_alias_column_order.reference create mode 100644 tests/queries/0_stateless/03726_distributed_alias_column_order.sql diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 568865c881cf..96cee39c4069 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -654,13 +654,14 @@ UInt64 mainQueryNodeBlockSizeByLimit(const SelectQueryInfo & select_query_info) } std::unique_ptr createComputeAliasColumnsStep( - std::unordered_map & alias_column_expressions, const SharedHeader & current_header) + AliasColumnExpressions & alias_column_expressions, const SharedHeader & current_header) { ActionsDAG merged_alias_columns_actions_dag(current_header->getColumnsWithTypeAndName()); ActionsDAG::NodeRawConstPtrs action_dag_outputs = merged_alias_columns_actions_dag.getInputs(); - for (auto & [column_name, alias_column_actions_dag] : alias_column_expressions) + for (auto & alias_column_expression : alias_column_expressions) { + auto & alias_column_actions_dag = alias_column_expression.second; const auto & current_outputs = alias_column_actions_dag.getOutputs(); action_dag_outputs.insert(action_dag_outputs.end(), current_outputs.begin(), current_outputs.end()); merged_alias_columns_actions_dag.mergeNodes(std::move(alias_column_actions_dag)); diff --git a/src/Planner/TableExpressionData.h b/src/Planner/TableExpressionData.h index 565250e8ed26..7c77c20afdfa 100644 --- a/src/Planner/TableExpressionData.h +++ b/src/Planner/TableExpressionData.h @@ -18,6 +18,8 @@ namespace ErrorCodes using ColumnIdentifier = std::string; using ColumnIdentifiers = std::vector; using ColumnIdentifierSet = std::unordered_set; +using AliasColumnExpression = std::pair; +using AliasColumnExpressions = std::vector; struct PrewhereInfo; using PrewhereInfoPtr = std::shared_ptr; @@ -77,7 +79,8 @@ class TableExpressionData /// Add alias column void addAliasColumn(const NameAndTypePair & column, const ColumnIdentifier & column_identifier, ActionsDAG actions_dag, bool is_selected_column = true) { - alias_column_expressions.emplace(column.name, std::move(actions_dag)); + alias_column_expressions.emplace_back(column.name, std::move(actions_dag)); + alias_column_names_set.emplace(column.name); addColumnImpl(column, column_identifier, is_selected_column); } @@ -96,11 +99,21 @@ class TableExpressionData } /// Get ALIAS columns names mapped to expressions - std::unordered_map & getAliasColumnExpressions() + AliasColumnExpressions & getAliasColumnExpressions() { return alias_column_expressions; } + const AliasColumnExpressions & getAliasColumnExpressions() const + { + return alias_column_expressions; + } + + bool hasAliasColumn(const std::string & column_name) const + { + return alias_column_names_set.contains(column_name); + } + /// Get column name to column map const ColumnNameToColumn & getColumnNameToColumn() const { @@ -279,7 +292,10 @@ class TableExpressionData NameSet selected_column_names_set; /// Expression to calculate ALIAS columns - std::unordered_map alias_column_expressions; + /// Keep alias name (String) + expression (ActionsDAG) pairs; vector preserves insertion order. + AliasColumnExpressions alias_column_expressions; + /// Fast membership checks for alias column names. + NameSet alias_column_names_set; /// Valid for table, table function, array join, query, union nodes ColumnNameToColumn column_name_to_column; diff --git a/src/Storages/SelectQueryInfo.cpp b/src/Storages/SelectQueryInfo.cpp index b08b511e2701..6965822427d6 100644 --- a/src/Storages/SelectQueryInfo.cpp +++ b/src/Storages/SelectQueryInfo.cpp @@ -25,12 +25,11 @@ std::unordered_map SelectQueryInfo::buildNod if (planner_context) { auto & table_expression_data = planner_context->getTableExpressionDataOrThrow(table_expression); - const auto & alias_column_expressions = table_expression_data.getAliasColumnExpressions(); for (const auto & [column_identifier, column_name] : table_expression_data.getColumnIdentifierToColumnName()) { /// ALIAS columns cannot be used in the filter expression without being calculated in ActionsDAG, /// so they should not be added to the input nodes. - if (alias_column_expressions.contains(column_name)) + if (table_expression_data.hasAliasColumn(column_name)) continue; const auto & column = table_expression_data.getColumnOrThrow(column_name); node_name_to_input_node_column.emplace(column_identifier, ColumnWithTypeAndName(column.type, column_name)); diff --git a/tests/queries/0_stateless/03726_distributed_alias_column_order.reference b/tests/queries/0_stateless/03726_distributed_alias_column_order.reference new file mode 100644 index 000000000000..0ec0640cf73d --- /dev/null +++ b/tests/queries/0_stateless/03726_distributed_alias_column_order.reference @@ -0,0 +1,2 @@ +e f +e f diff --git a/tests/queries/0_stateless/03726_distributed_alias_column_order.sql b/tests/queries/0_stateless/03726_distributed_alias_column_order.sql new file mode 100644 index 000000000000..31691861787c --- /dev/null +++ b/tests/queries/0_stateless/03726_distributed_alias_column_order.sql @@ -0,0 +1,17 @@ +DROP TABLE IF EXISTS test_local; + +CREATE TABLE test_local +( + `a` UInt64, + `e` String ALIAS 'e', + `f` String ALIAS 'f' +) +ENGINE = Memory; + +INSERT INTO test_local VALUES (1); + +SELECT e, f +FROM remote('127.0.0.{1,2}', currentDatabase(), test_local) +ORDER BY a; + +DROP TABLE IF EXISTS test_local; From 353d2715be957046f447be0a0b4fb5df0f80e83b Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Wed, 21 Jan 2026 01:39:01 +0100 Subject: [PATCH 3/4] rebind columns to replaced storage when rewriting segment predicates, to avoid alias issues --- src/Storages/StorageDistributed.cpp | 123 +++++++++++++++++- .../03645_hybrid_alias_columns.reference | 2 + .../03645_hybrid_alias_columns.sql | 60 +++++++++ 3 files changed, 183 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 57bd7e753075..09ab0fb13dba 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -869,6 +869,99 @@ class ReplaseAliasColumnsVisitor : public InDepthQueryTreeVisitor; + +ColumnNameToColumnNodeMap buildColumnNodesForTableExpression(const QueryTreeNodePtr & table_expression_node, const ContextPtr & context) +{ + const TableNode * table_node = table_expression_node->as(); + const TableFunctionNode * table_function_node = table_expression_node->as(); + if (!table_node && !table_function_node) + return {}; + + // Rebuild per-column nodes (including ALIAS expressions) for the replacement table expression. + const auto & storage_snapshot = table_node ? table_node->getStorageSnapshot() : table_function_node->getStorageSnapshot(); + auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); + if (storage_snapshot->storage.supportsSubcolumns()) + get_column_options.withSubcolumns(); + + auto column_names_and_types = storage_snapshot->getColumns(get_column_options); + const auto & columns_description = storage_snapshot->metadata->getColumns(); + + ColumnNameToColumnNodeMap column_name_to_node; + column_name_to_node.reserve(column_names_and_types.size()); + + for (const auto & column_name_and_type : column_names_and_types) + { + const auto & column_default = columns_description.getDefault(column_name_and_type.name); + if (column_default && column_default->kind == ColumnDefaultKind::Alias) + { + auto alias_expression = buildQueryTree(column_default->expression, context); + QueryAnalysisPass(table_expression_node).run(alias_expression, context); + if (!alias_expression->getResultType()->equals(*column_name_and_type.type)) + alias_expression = buildCastFunction(alias_expression, column_name_and_type.type, context, true); + + auto column_node = std::make_shared(column_name_and_type, std::move(alias_expression), table_expression_node); + column_name_to_node.emplace(column_name_and_type.name, std::move(column_node)); + } + else + { + auto column_node = std::make_shared(column_name_and_type, table_expression_node); + column_name_to_node.emplace(column_name_and_type.name, std::move(column_node)); + } + } + + return column_name_to_node; +} + +class ReplaceColumnNodesForTableExpressionVisitor : public InDepthQueryTreeVisitor +{ +public: + ReplaceColumnNodesForTableExpressionVisitor( + const QueryTreeNodePtr & from_, + const QueryTreeNodePtr & to_, + const ColumnNameToColumnNodeMap & column_name_to_node_) + : from(from_), to(to_), column_name_to_node(column_name_to_node_) + {} + + void visitImpl(QueryTreeNodePtr & node) + { + auto * column_node = node->as(); + if (!column_node) + return; + + auto column_source = column_node->getColumnSourceOrNull(); + if (!column_source) + return; + + if (column_source.get() != from.get()) + return; + + auto it = column_name_to_node.find(column_node->getColumnName()); + if (it != column_name_to_node.end()) + { + auto replacement = it->second->clone(); + replacement->setAlias(column_node->getAlias()); + node = std::move(replacement); + } + else + { + // Preserve the column name but rebind its source to the replacement table expression. + column_node->setColumnSource(to); + } + } + + static bool needChildVisit(const QueryTreeNodePtr &, const QueryTreeNodePtr & child_node) + { + auto child_node_type = child_node->getNodeType(); + return !(child_node_type == QueryTreeNodeType::QUERY || child_node_type == QueryTreeNodeType::UNION); + } + +private: + QueryTreeNodePtr from; + QueryTreeNodePtr to; + const ColumnNameToColumnNodeMap & column_name_to_node; +}; + class RewriteInToGlobalInVisitor : public InDepthQueryTreeVisitorWithContext { public: @@ -1019,13 +1112,12 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, replacement_table_expression->setAlias(query_info.table_expression->getAlias()); QueryTreeNodePtr filter; - if (additional_filter) { const auto & context = query_info.planner_context->getQueryContext(); filter = buildQueryTree(additional_filter->clone(), query_context); - + // Resolve now; alias expressions are normalized later for the merged query. QueryAnalysisPass(replacement_table_expression).run(filter, context); } @@ -1040,6 +1132,33 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, : std::move(filter); } + if (additional_filter) + { + auto replacement_columns = buildColumnNodesForTableExpression(replacement_table_expression, query_context); + + /** + * When Hybrid injects a segment predicate, the query tree may end up mixing + * two different column interpretations for the same name: + * - SELECT list columns are resolved against the Hybrid schema (physical columns). + * - WHERE predicate columns are resolved against the segment schema (ALIAS columns). + * + * If we later expand alias columns only in one place, the analyzer can see + * two different expressions with the same alias (e.g. `computed` as a column + * vs `value * 2 AS computed`), which triggers MULTIPLE_EXPRESSIONS_FOR_ALIAS. + * + * To prevent this, we rebuild ColumnNodes from the replacement table expression + * (including fully-resolved ALIAS expressions) and rewrite the whole query tree + * so all references to the replaced table share the same column source and + * the same alias semantics. This keeps SELECT and WHERE consistent before + * ReplaseAliasColumnsVisitor performs final alias expansion. + */ + ReplaceColumnNodesForTableExpressionVisitor replace_query_columns_visitor( + replacement_table_expression, + replacement_table_expression, + replacement_columns); + replace_query_columns_visitor.visit(query_tree_to_modify); + } + ReplaseAliasColumnsVisitor replase_alias_columns_visitor; replase_alias_columns_visitor.visit(query_tree_to_modify); diff --git a/tests/queries/0_stateless/03645_hybrid_alias_columns.reference b/tests/queries/0_stateless/03645_hybrid_alias_columns.reference index d20eba96c358..22bfe3702822 100644 --- a/tests/queries/0_stateless/03645_hybrid_alias_columns.reference +++ b/tests/queries/0_stateless/03645_hybrid_alias_columns.reference @@ -1,6 +1,8 @@ test1 1 ['foo1','bar1_before'] foo1 2 ['foo2','bar2_after'] foo2 +Hybrid segment predicates with alias columns +3 30 60 33 Insert into Hybrid with EPHEMERAL column 2 0A0B0C0D Select from Hybrid with EPHEMERAL column diff --git a/tests/queries/0_stateless/03645_hybrid_alias_columns.sql b/tests/queries/0_stateless/03645_hybrid_alias_columns.sql index ab9dde1eae0d..5f1eec5866b0 100644 --- a/tests/queries/0_stateless/03645_hybrid_alias_columns.sql +++ b/tests/queries/0_stateless/03645_hybrid_alias_columns.sql @@ -48,6 +48,66 @@ DROP TABLE test_hybrid_alias_cast; DROP TABLE test_hybrid_alias_after; DROP TABLE test_hybrid_alias_before; +DROP TABLE IF EXISTS test_hybrid_alias_predicate; +DROP TABLE IF EXISTS test_hybrid_alias_predicate_left; +DROP TABLE IF EXISTS test_hybrid_alias_predicate_right; + +CREATE TABLE test_hybrid_alias_predicate_left +( + id Int32, + value Int32, + date_col Date, + computed ALIAS value * 2, + sum_alias ALIAS id + value +) +ENGINE = MergeTree() +ORDER BY (date_col, id) +PARTITION BY toYYYYMM(date_col); + +CREATE TABLE test_hybrid_alias_predicate_right +( + id Int32, + value Int32, + date_col Date, + computed ALIAS value * 2, + sum_alias ALIAS id + value +) +ENGINE = MergeTree() +ORDER BY (date_col, id) +PARTITION BY toYYYYMM(date_col); + +INSERT INTO test_hybrid_alias_predicate_left (id, value, date_col) VALUES + (1, 10, '2025-01-15'), + (2, 20, '2025-01-16'), + (3, 30, '2025-01-17'); + +INSERT INTO test_hybrid_alias_predicate_right (id, value, date_col) VALUES + (4, 40, '2025-01-10'), + (5, 50, '2025-01-11'), + (6, 60, '2025-01-12'); + +CREATE TABLE test_hybrid_alias_predicate +( + id Int32, + value Int32, + date_col Date, + computed Int64, + sum_alias Int64 +) +ENGINE = Hybrid( + remote('127.0.0.1:9000', currentDatabase(), 'test_hybrid_alias_predicate_left'), + computed >= 60, + remote('127.0.0.1:9000', currentDatabase(), 'test_hybrid_alias_predicate_right'), + computed < 60 +); + +SELECT 'Hybrid segment predicates with alias columns'; +SELECT id, value, computed, sum_alias FROM test_hybrid_alias_predicate; + +DROP TABLE test_hybrid_alias_predicate; +DROP TABLE test_hybrid_alias_predicate_left; +DROP TABLE test_hybrid_alias_predicate_right; + DROP TABLE IF EXISTS test_hybrid_ephem; DROP TABLE IF EXISTS test_hybrid_ephem_after; DROP TABLE IF EXISTS test_hybrid_ephem_before; From d3b7738982824fa1c312151e0ec1c802b8db3c2f Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Tue, 20 Jan 2026 15:38:57 +0100 Subject: [PATCH 4/4] add __aliasMarker injection for ALIAS column with enable_alias_marker flag --- src/Core/Settings.cpp | 5 ++ src/Core/SettingsChangesHistory.cpp | 1 + src/Functions/identity.cpp | 5 ++ src/Functions/identity.h | 34 ++++++++++ src/Interpreters/ActionsVisitor.cpp | 3 + src/Planner/PlannerActionsVisitor.cpp | 65 ++++++++++++++++++- src/Planner/Utils.cpp | 52 ++++++++++++++- src/Storages/StorageDistributed.cpp | 50 ++++++++++++-- ...lias_marker_with_mergeable_state.reference | 15 +++++ ...03648_alias_marker_with_mergeable_state.sh | 48 ++++++++++++++ 10 files changed, 272 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/03648_alias_marker_with_mergeable_state.reference create mode 100755 tests/queries/0_stateless/03648_alias_marker_with_mergeable_state.sh diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 9977550b19de..63e34fe58072 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -2110,6 +2110,11 @@ DECLARE(BoolAuto, query_plan_join_swap_table, Field("auto"), R"( DECLARE(Bool, query_plan_join_shard_by_pk_ranges, false, R"( Apply sharding for JOIN if join keys contain a prefix of PRIMARY KEY for both tables. Supported for hash, parallel_hash and full_sorting_merge algorithms. Usually does not speed up queries but may lower memory consumption. )", 0) \ + \ + DECLARE(Bool, enable_alias_marker, true, R"( +Enable __aliasMarker injection for ALIAS column expressions when using the analyzer. +This stabilizes action node names across planner/analyzer stages without changing query semantics. +)", 0) \ \ DECLARE(UInt64, preferred_block_size_bytes, 1000000, R"( This setting adjusts the data block size for query processing and represents additional fine-tuning to the more rough 'max_block_size' setting. If the columns are large and with 'max_block_size' rows the block size is likely to be larger than the specified amount of bytes, its size will be lowered for better CPU cache locality. diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 3d57025ddd76..971937ca3f15 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -56,6 +56,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, {"hybrid_table_auto_cast_columns", true, true, "New setting to automatically cast Hybrid table columns when segments disagree on types. Default enabled."}, {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."}, + {"enable_alias_marker", true, true, "New setting."}, {"input_format_parquet_use_native_reader_v3", false, true, "Seems stable"}, {"input_format_parquet_verify_checksums", true, true, "New setting."}, {"output_format_parquet_write_checksums", false, true, "New setting."}, diff --git a/src/Functions/identity.cpp b/src/Functions/identity.cpp index 2541e715cb16..910859a4ac93 100644 --- a/src/Functions/identity.cpp +++ b/src/Functions/identity.cpp @@ -19,4 +19,9 @@ REGISTER_FUNCTION(ActionName) factory.registerFunction(); } +REGISTER_FUNCTION(AliasMarker) +{ + factory.registerFunction(); +} + } diff --git a/src/Functions/identity.h b/src/Functions/identity.h index e1ce253a46d9..77bb9e8ed52f 100644 --- a/src/Functions/identity.h +++ b/src/Functions/identity.h @@ -79,4 +79,38 @@ class FunctionActionName : public FunctionIdentityBase } }; +struct AliasMarkerName +{ + static constexpr auto name = "__aliasMarker"; +}; + +class FunctionAliasMarker : public IFunction +{ +public: + static constexpr auto name = AliasMarkerName::name; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 2; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + bool isSuitableForConstantFolding() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() != 2) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function __aliasMarker expects 2 arguments"); + + if (!WhichDataType(arguments[1]).isString()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function __aliasMarker is internal and should not be used directly"); + + return arguments.front(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + { + return arguments.front().column; + } +}; + } diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index cd9ee309b742..546264a0de4b 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -923,6 +923,9 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & if (node.name == "lambda") throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Unexpected lambda expression"); + if (node.name == "__aliasMarker") + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function __aliasMarker is internal and supported only with the analyzer"); + /// Function arrayJoin. if (node.name == "arrayJoin") { diff --git a/src/Planner/PlannerActionsVisitor.cpp b/src/Planner/PlannerActionsVisitor.cpp index 57482c45a714..7bc8426582f5 100644 --- a/src/Planner/PlannerActionsVisitor.cpp +++ b/src/Planner/PlannerActionsVisitor.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -178,7 +179,25 @@ class ActionNodeNameHelper case QueryTreeNodeType::FUNCTION: { const auto & function_node = node->as(); - if (function_node.getFunctionName() == "__actionName") + if (function_node.getFunctionName() == "__aliasMarker") + { + /// Perform sanity check, because user may call this function with unexpected arguments + const auto & function_argument_nodes = function_node.getArguments().getNodes(); + if (function_argument_nodes.size() == 2) + { + if (const auto * second_argument = function_argument_nodes.at(1)->as()) + { + if (isString(second_argument->getResultType())) + result = second_argument->getValue().safeGet(); + } + } + + /// Empty node name is not allowed and leads to logical errors + if (result.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function __aliasMarker is internal and should not be used directly"); + break; + } + else if (function_node.getFunctionName() == "__actionName") { /// Perform sanity check, because user may call this function with unexpected arguments const auto & function_argument_nodes = function_node.getArguments().getNodes(); @@ -601,6 +620,18 @@ class ActionsScopeNode return node; } + const ActionsDAG::Node * addAliasIfNecessary(const std::string & node_name, const ActionsDAG::Node * child) + { + auto it = node_name_to_node.find(node_name); + if (it != node_name_to_node.end()) + return it->second; + + const auto * node = &actions_dag.addAlias(*child, node_name); + node_name_to_node[node->result_name] = node; + + return node; + } + private: std::unordered_map node_name_to_node; ActionsDAG & actions_dag; @@ -1062,6 +1093,38 @@ PlannerActionsVisitorImpl::NodeNameAndNodeMinLevel PlannerActionsVisitorImpl::vi { const auto & function_node = node->as(); + if (function_node.getFunctionName() == "__aliasMarker") + { + const auto & function_arguments = function_node.getArguments().getNodes(); + if (function_arguments.size() != 2) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function __aliasMarker expects 2 arguments"); + + const auto * alias_id_node = function_arguments.at(1)->as(); + if (!alias_id_node || !isString(alias_id_node->getResultType())) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function __aliasMarker is internal and should not be used directly"); + + const auto & alias_id = alias_id_node->getValue().safeGet(); + if (alias_id.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function __aliasMarker is internal and should not be used directly"); + + auto [child_name, levels] = visitImpl(function_arguments.at(0)); + if (alias_id == child_name) + return {child_name, levels}; + + size_t level = levels.max(); + const auto * child_node = actions_stack[level].getNodeOrThrow(child_name); + actions_stack[level].addAliasIfNecessary(alias_id, child_node); + + size_t actions_stack_size = actions_stack.size(); + for (size_t i = level + 1; i < actions_stack_size; ++i) + { + auto & actions_stack_node = actions_stack[i]; + actions_stack_node.addInputColumnIfNecessary(alias_id, function_node.getResultType()); + } + + return {alias_id, levels}; + } + if (function_node.getFunctionName() == "indexHint") return visitIndexHintFunction(node); if (function_node.getFunctionName() == "exists") diff --git a/src/Planner/Utils.cpp b/src/Planner/Utils.cpp index f9e1d0bbf4d9..08db59bfdb44 100644 --- a/src/Planner/Utils.cpp +++ b/src/Planner/Utils.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -191,13 +192,62 @@ ASTPtr queryNodeToSelectQuery(const QueryTreeNodePtr & query_node, bool set_subq return result_ast; } +namespace +{ +class NormalizeAliasMarkerVisitor : public InDepthQueryTreeVisitor +{ +public: + void visitImpl(QueryTreeNodePtr & node) + { + auto * function_node = node->as(); + if (!function_node || function_node->getFunctionName() != "__aliasMarker") + return; + + auto & arguments = function_node->getArguments().getNodes(); + if (arguments.size() != 2) + return; + + while (true) + { + auto * inner_function = arguments.front()->as(); + if (!inner_function || inner_function->getFunctionName() != "__aliasMarker") + break; + + auto & inner_arguments = inner_function->getArguments().getNodes(); + if (inner_arguments.size() != 2) + break; + + arguments.front() = inner_arguments.front(); + } + } + + bool needChildVisit(QueryTreeNodePtr & parent, QueryTreeNodePtr & child) + { + auto * parent_function = parent->as(); + if (parent_function && parent_function->getFunctionName() == "__aliasMarker") + return false; + + auto child_node_type = child->getNodeType(); + return !(child_node_type == QueryTreeNodeType::QUERY || child_node_type == QueryTreeNodeType::UNION); + } +}; + +void normalizeAliasMarkersInQueryTree(QueryTreeNodePtr & node) +{ + NormalizeAliasMarkerVisitor visitor; + visitor.visit(node); +} +} + ASTPtr queryNodeToDistributedSelectQuery(const QueryTreeNodePtr & query_node) { /// Remove CTEs information from distributed queries. /// Now, if cte_name is set for subquery node, AST -> String serialization will only print cte name. /// But CTE is defined only for top-level query part, so may not be sent. /// Removing cte_name forces subquery to be always printed. - auto ast = queryNodeToSelectQuery(query_node, /*set_subquery_cte_name=*/false); + auto query_node_to_convert = query_node->clone(); + normalizeAliasMarkersInQueryTree(query_node_to_convert); + auto ast = queryNodeToSelectQuery(query_node_to_convert, /*set_subquery_cte_name=*/false); return ast; } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 09ab0fb13dba..d230002be591 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -52,6 +52,7 @@ #include #include +#include #include #include #include @@ -205,6 +206,7 @@ namespace Setting extern const SettingsBool prefer_global_in_and_join; extern const SettingsBool enable_global_with_statement; extern const SettingsBool allow_experimental_hybrid_table; + extern const SettingsBool enable_alias_marker; } namespace DistributedSetting @@ -844,7 +846,7 @@ namespace class ReplaseAliasColumnsVisitor : public InDepthQueryTreeVisitor { - static QueryTreeNodePtr getColumnNodeAliasExpression(const QueryTreeNodePtr & node) + QueryTreeNodePtr getColumnNodeAliasExpression(const QueryTreeNodePtr & node) const { const auto * column_node = node->as(); if (!column_node || !column_node->hasExpression()) @@ -857,16 +859,56 @@ class ReplaseAliasColumnsVisitor : public InDepthQueryTreeVisitorgetExpression(); - column_expression->setAlias(column_node->getColumnName()); - return column_expression; + const auto & column_name = column_node->getColumnName(); + + if (!context->getSettingsRef()[Setting::enable_alias_marker]) + { + column_expression->setAlias(column_name); + return column_expression; + } + + String alias_id; + const auto & source_alias = column_source->getAlias(); + if (!source_alias.empty()) + alias_id = source_alias + "." + column_name; + else + alias_id = column_name; + + if (auto * function_node = column_expression->as(); + function_node && function_node->getFunctionName() == "__aliasMarker") + { + auto & arguments = function_node->getArguments().getNodes(); + if (arguments.size() == 2) + arguments[1] = std::make_shared(alias_id, std::make_shared()); + + column_expression->setAlias(column_name); + return column_expression; + } + + QueryTreeNodes arguments; + arguments.reserve(2); + arguments.emplace_back(std::move(column_expression)); + arguments.emplace_back(std::make_shared(alias_id, std::make_shared())); + + auto alias_marker_node = std::make_shared("__aliasMarker"); + alias_marker_node->getArguments().getNodes() = std::move(arguments); + alias_marker_node->setAlias(column_name); + resolveOrdinaryFunctionNodeByName(*alias_marker_node, "__aliasMarker", context); + + return alias_marker_node; } public: + explicit ReplaseAliasColumnsVisitor(ContextPtr context_) : context(std::move(context_)) {} + void visitImpl(QueryTreeNodePtr & node) { if (auto column_expression = getColumnNodeAliasExpression(node)) node = column_expression; } + +private: + ContextPtr context; }; using ColumnNameToColumnNodeMap = std::unordered_map; @@ -1159,7 +1201,7 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, replace_query_columns_visitor.visit(query_tree_to_modify); } - ReplaseAliasColumnsVisitor replase_alias_columns_visitor; + ReplaseAliasColumnsVisitor replase_alias_columns_visitor(query_context); replase_alias_columns_visitor.visit(query_tree_to_modify); const auto & settings = query_context->getSettingsRef(); diff --git a/tests/queries/0_stateless/03648_alias_marker_with_mergeable_state.reference b/tests/queries/0_stateless/03648_alias_marker_with_mergeable_state.reference new file mode 100644 index 000000000000..58bf6a7ec74b --- /dev/null +++ b/tests/queries/0_stateless/03648_alias_marker_with_mergeable_state.reference @@ -0,0 +1,15 @@ +---- stage: with_mergeable_state (analyzer=1, setting=enable_alias_marker=1) ---- +Header: sum(foo) AggregateFunction(sum, Int64) +---- stage: with_mergeable_state (analyzer=0) ---- +Expected error: Function __aliasMarker is internal and supported only with the analyzer +---- stage: complete (analyzer=1) ---- +Header: x Int64 +---- stage: fetch_columns (analyzer=1) ---- +Header: __table1.number UInt64 +---- stage: with_mergeable_state (analyzer=1) ---- +Header: sum(foo) AggregateFunction(sum, Int64) +---- stage: with_mergeable_state_after_aggregation (analyzer=1) ---- +Header: sum(foo) Int64 +---- stage: with_mergeable_state_after_aggregation_and_limit (analyzer=1) ---- +Header: intDiv(__table1.number, 10_UInt8) UInt64 + sum(foo) Int64 diff --git a/tests/queries/0_stateless/03648_alias_marker_with_mergeable_state.sh b/tests/queries/0_stateless/03648_alias_marker_with_mergeable_state.sh new file mode 100755 index 000000000000..deaaa6dec1f1 --- /dev/null +++ b/tests/queries/0_stateless/03648_alias_marker_with_mergeable_state.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +echo "---- stage: with_mergeable_state (analyzer=1, setting=enable_alias_marker=1) ----" +$CLICKHOUSE_CLIENT --enable_analyzer=1 --stage with_mergeable_state --multiquery 2>&1 <<'EOF' | sed -n '/^Header:/,/^ [^ ]/p' | sed '$d' +SET enable_alias_marker=1; +EXPLAIN header=1 +SELECT sum(__aliasMarker(number*2-3,'foo')) AS x +FROM numbers(10); +EOF + +echo "---- stage: with_mergeable_state (analyzer=0) ----" +alias_marker_error_output=$($CLICKHOUSE_CLIENT --enable_analyzer=0 --stage with_mergeable_state --send_logs_level=fatal --query \ + "EXPLAIN header=1 SELECT sum(__aliasMarker(number*2-3,'foo')) AS x FROM numbers(10)" 2>&1) +if grep -q "Function __aliasMarker is internal and supported only with the analyzer" <<<"${alias_marker_error_output}" +then + echo "Expected error: Function __aliasMarker is internal and supported only with the analyzer" +else + echo "${alias_marker_error_output}" +fi + +echo "---- stage: complete (analyzer=1) ----" +$CLICKHOUSE_CLIENT --enable_analyzer=1 --stage complete --query \ + "EXPLAIN header=1 SELECT sum(__aliasMarker(number*2-3,'foo')) AS x FROM numbers(10)" \ + 2>&1 | sed -n '/^Header:/,/^ [^ ]/p' | sed '$d' + +echo "---- stage: fetch_columns (analyzer=1) ----" +$CLICKHOUSE_CLIENT --enable_analyzer=1 --stage fetch_columns --query \ + "EXPLAIN header=1 SELECT sum(__aliasMarker(number*2-3,'foo')) AS x FROM numbers(10)" \ + 2>&1 | sed -n '/^Header:/,/^ [^ ]/p' | sed '$d' + +echo "---- stage: with_mergeable_state (analyzer=1) ----" +$CLICKHOUSE_CLIENT --enable_analyzer=1 --stage with_mergeable_state --query \ + "EXPLAIN header=1 SELECT sum(__aliasMarker(number*2-3,'foo')) AS x FROM numbers(10)" \ + 2>&1 | sed -n '/^Header:/,/^ [^ ]/p' | sed '$d' + +echo "---- stage: with_mergeable_state_after_aggregation (analyzer=1) ----" +$CLICKHOUSE_CLIENT --enable_analyzer=1 --stage with_mergeable_state_after_aggregation --query \ + "EXPLAIN header=1 SELECT sum(__aliasMarker(number*2-3,'foo')) AS x FROM numbers(10)" \ + 2>&1 | sed -n '/^Header:/,/^ [^ ]/p' | sed '$d' + +echo "---- stage: with_mergeable_state_after_aggregation_and_limit (analyzer=1) ----" +$CLICKHOUSE_CLIENT --enable_analyzer=1 --stage with_mergeable_state_after_aggregation_and_limit --query \ + "EXPLAIN header=1 SELECT sum(__aliasMarker(number*2-3,'foo')) AS x FROM numbers(10) GROUP BY intDiv(number,10) AS y ORDER BY y LIMIT 10" \ + 2>&1 | sed -n '/^Header:/,/^ [^ ]/p' | sed '$d'