From c9abb86fe92bc4f9c535782eaa8109f63288d94f Mon Sep 17 00:00:00 2001 From: CrownChu Date: Fri, 22 May 2026 09:51:22 +0800 Subject: [PATCH 01/24] [globalindex] Support multi-column GlobalIndex framework Extend the GlobalIndex SPI, build path, and query path to support one index builder handling multiple columns (e.g. Lucene indexing title + content + tags together). Key changes: - GlobalIndexerFactory/GlobalIndexer: add List<DataField> create overloads - GlobalIndexMultiColumnWriter: new interface for multi-column writes - GlobalIndexBuilderUtils: toIndexFileMetas/createIndexWriter accept List<DataField> - GlobalIndexScanner: route extraFieldIds to same reader group - VectorScanImpl/FullTextScanImpl: match against extraFieldIds - GenericIndexTopoBuilder (Flink): multi-column projection and writer dispatch - DefaultGlobalIndexBuilder/TopoBuilder (Spark): multi-column support - All single-column APIs preserved for backward compatibility --- .../GlobalIndexMultiColumnWriter.java | 34 +++++ .../paimon/globalindex/GlobalIndexer.java | 5 + .../globalindex/GlobalIndexerFactory.java | 6 + .../globalindex/GlobalIndexBuilderUtils.java | 53 ++++++- .../globalindex/GlobalIndexScanner.java | 82 +++++++++-- .../paimon/table/source/FullTextScanImpl.java | 12 +- .../paimon/table/source/VectorScanImpl.java | 30 +++- .../globalindex/GenericIndexTopoBuilder.java | 135 +++++++++++++----- .../GenericIndexTopoBuilderTest.java | 2 +- .../DefaultGlobalIndexBuilder.java | 64 ++++++--- .../DefaultGlobalIndexTopoBuilder.java | 24 +++- .../GlobalIndexTopologyBuilder.java | 21 +++ 12 files changed, 393 insertions(+), 75 deletions(-) create mode 100644 paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java new file mode 100644 index 000000000000..a6ded78d33fd --- /dev/null +++ 
b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.globalindex; + +import org.apache.paimon.data.InternalRow; + +import javax.annotation.Nullable; + +/** Index writer for global index that accepts multiple column values per row. */ +public interface GlobalIndexMultiColumnWriter extends GlobalIndexWriter { + + /** + * Write a projected row containing all indexed columns for one record. The row layout matches + * the fields order passed to {@link GlobalIndexerFactory#create(java.util.List, + * org.apache.paimon.options.Options)}. 
+ */ + void write(@Nullable InternalRow row); +} diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java index 74d223a60467..6c46415cfeee 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java @@ -41,4 +41,9 @@ static GlobalIndexer create(String type, DataField dataField, Options options) { GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(type); return globalIndexerFactory.create(dataField, options); } + + static GlobalIndexer create(String type, List fields, Options options) { + GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(type); + return globalIndexerFactory.create(fields, options); + } } diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java index 6eabb6d25360..e2497a6f82e3 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java @@ -22,10 +22,16 @@ import org.apache.paimon.options.Options; import org.apache.paimon.types.DataField; +import java.util.List; + /** File index factory to construct {@link FileIndexer}. 
*/ public interface GlobalIndexerFactory { String identifier(); GlobalIndexer create(DataField dataField, Options options); + + default GlobalIndexer create(List fields, Options options) { + return create(fields.get(0), options); + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 085423efa851..34badf3ed566 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -29,6 +29,8 @@ import org.apache.paimon.types.DataField; import org.apache.paimon.utils.Range; +import javax.annotation.Nullable; + import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -45,12 +47,54 @@ public static List toIndexFileMetas( String indexType, List entries) throws IOException { + return toIndexFileMetas( + fileIO, indexPathFactory, options, range, indexFieldId, null, indexType, entries); + } + + public static List toIndexFileMetas( + FileIO fileIO, + IndexPathFactory indexPathFactory, + CoreOptions options, + Range range, + List fields, + String indexType, + List entries) + throws IOException { + int indexFieldId = fields.get(0).id(); + int[] extraFieldIds = + fields.size() > 1 + ? 
fields.subList(1, fields.size()).stream() + .mapToInt(DataField::id) + .toArray() + : null; + return toIndexFileMetas( + fileIO, + indexPathFactory, + options, + range, + indexFieldId, + extraFieldIds, + indexType, + entries); + } + + private static List toIndexFileMetas( + FileIO fileIO, + IndexPathFactory indexPathFactory, + CoreOptions options, + Range range, + int indexFieldId, + @Nullable int[] extraFieldIds, + String indexType, + List entries) + throws IOException { List results = new ArrayList<>(); for (ResultEntry entry : entries) { String fileName = entry.fileName(); long fileSize = fileIO.getFileSize(indexPathFactory.toPath(fileName)); GlobalIndexMeta globalIndexMeta = - new GlobalIndexMeta(range.from, range.to, indexFieldId, null, entry.meta()); + new GlobalIndexMeta( + range.from, range.to, indexFieldId, extraFieldIds, entry.meta()); Path externalPathDir = options.globalIndexExternalPath(); String externalPathString = null; @@ -78,6 +122,13 @@ public static GlobalIndexWriter createIndexWriter( return globalIndexer.createWriter(createGlobalIndexFileReadWrite(table)); } + public static GlobalIndexWriter createIndexWriter( + FileStoreTable table, String indexType, List fields, Options options) + throws IOException { + GlobalIndexer globalIndexer = GlobalIndexer.create(indexType, fields, options); + return globalIndexer.createWriter(createGlobalIndexFileReadWrite(table)); + } + private static GlobalIndexFileReadWrite createGlobalIndexFileReadWrite(FileStoreTable table) { IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); return new GlobalIndexFileReadWrite(table.fileIO(), indexPathFactory); diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 975b28183331..04e16eed2d87 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ 
b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -20,6 +20,7 @@ import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.Path; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; @@ -37,6 +38,7 @@ import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -74,26 +76,66 @@ public GlobalIndexScanner( GlobalIndexReadThreadPool.getExecutorService(options.get(GLOBAL_INDEX_THREAD_NUM)); this.indexPathFactory = indexPathFactory; GlobalIndexFileReader indexFileReader = meta -> fileIO.newInputStream(meta.filePath()); + + // Single-column indexes: fieldId -> indexType -> range -> files Map>>> indexMetas = new HashMap<>(); + // Multi-column indexes: fieldIds -> indexType -> range -> files + Map, Map>>> multiColumnMetas = + new HashMap<>(); + // Reverse lookup: fieldId -> its multi-column group + Map> fieldToGroup = new HashMap<>(); + for (IndexFileMeta indexFile : indexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); - int fieldId = meta.indexFieldId(); String indexType = indexFile.indexType(); - indexMetas - .computeIfAbsent(fieldId, k -> new HashMap<>()) - .computeIfAbsent(indexType, k -> new HashMap<>()) - .computeIfAbsent( - new Range(meta.rowRangeStart(), meta.rowRangeEnd()), - k -> new ArrayList<>()) - .add(indexFile); + Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); + + if (meta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID + && meta.extraFieldIds() != null) { + // Multi-column index: all participating fields share the same IndexFileMeta, + // so looking up from any fieldId returns identical index files. 
+ List fieldIds = + Arrays.stream(meta.extraFieldIds()) + .boxed() + .collect(Collectors.toList()); + multiColumnMetas + .computeIfAbsent(fieldIds, k -> new HashMap<>()) + .computeIfAbsent(indexType, k -> new HashMap<>()) + .computeIfAbsent(range, k -> new ArrayList<>()) + .add(indexFile); + for (int id : fieldIds) { + fieldToGroup.put(id, fieldIds); + } + } else { + // Single-column index + int fieldId = meta.indexFieldId(); + indexMetas + .computeIfAbsent(fieldId, k -> new HashMap<>()) + .computeIfAbsent(indexType, k -> new HashMap<>()) + .computeIfAbsent(range, k -> new ArrayList<>()) + .add(indexFile); + } } IntFunction> readersFunction = - fieldId -> - createReaders( + fId -> { + List group = fieldToGroup.get(fId); + if (group != null) { + // Multi-column: resolve full field list + List fields = + group.stream() + .map(rowType::getField) + .collect(Collectors.toList()); + return createReaders( + indexFileReader, multiColumnMetas.get(group), fields); + } else { + // Single-column + return createReaders( indexFileReader, - indexMetas.get(fieldId), - rowType.getField(fieldId)); + indexMetas.get(fId), + Collections.singletonList(rowType.getField(fId))); + } + }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } @@ -127,7 +169,17 @@ public static Optional create( if (globalIndex == null) { return false; } - return filterFieldIds.contains(globalIndex.indexFieldId()); + if (filterFieldIds.contains(globalIndex.indexFieldId())) { + return true; + } + if (globalIndex.extraFieldIds() != null) { + for (int id : globalIndex.extraFieldIds()) { + if (filterFieldIds.contains(id)) { + return true; + } + } + } + return false; }; List indexFiles = @@ -145,7 +197,7 @@ public Optional scan(Predicate predicate) { private Collection createReaders( GlobalIndexFileReader indexFileReadWrite, Map>> indexMetas, - DataField dataField) { + List fields) { if (indexMetas == null) { return Collections.emptyList(); } @@ -155,7 +207,7 @@ private Collection 
createReaders( String indexType = entry.getKey(); Map> metas = entry.getValue(); GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(indexType); - GlobalIndexer globalIndexer = globalIndexerFactory.create(dataField, options); + GlobalIndexer globalIndexer = globalIndexerFactory.create(fields, options); List> futures = new ArrayList<>(metas.size()); for (Map.Entry> rangeMetas : metas.entrySet()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java index cc77d9121ad5..6230b31336a3 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java @@ -61,7 +61,17 @@ public Plan scan() { if (globalIndex == null) { return false; } - return textColumn.id() == globalIndex.indexFieldId(); + if (textColumn.id() == globalIndex.indexFieldId()) { + return true; + } + if (globalIndex.extraFieldIds() != null) { + for (int id : globalIndex.extraFieldIds()) { + if (textColumn.id() == id) { + return true; + } + } + } + return false; }; List allIndexFiles = diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java index d3db6dd13d37..1ff3f82852f6 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java @@ -82,7 +82,17 @@ public Plan scan() { return false; } int fieldId = globalIndex.indexFieldId(); - return vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId); + if (vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId)) { + return true; + } + if (globalIndex.extraFieldIds() != null) { + for (int id : globalIndex.extraFieldIds()) { + if (vectorColumn.id() == id || 
filterFieldIds.contains(id)) { + return true; + } + } + } + return false; }; List allIndexFiles = @@ -94,7 +104,7 @@ public Plan scan() { Map> vectorByRange = new HashMap<>(); for (IndexFileMeta indexFile : allIndexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); - if (meta.indexFieldId() == vectorColumn.id()) { + if (containsField(meta, vectorColumn.id())) { Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); vectorByRange.computeIfAbsent(range, k -> new ArrayList<>()).add(indexFile); } @@ -111,7 +121,7 @@ public Plan scan() { f -> { GlobalIndexMeta globalIndex = checkNotNull(f.globalIndexMeta()); - if (globalIndex.indexFieldId() == vectorColumn.id()) { + if (containsField(globalIndex, vectorColumn.id())) { return false; } return range.hasIntersection(globalIndex.rowRange()); @@ -122,4 +132,18 @@ public Plan scan() { return () -> splits; } + + private static boolean containsField(GlobalIndexMeta meta, int fieldId) { + if (meta.indexFieldId() == fieldId) { + return true; + } + if (meta.extraFieldIds() != null) { + for (int id : meta.extraFieldIds()) { + if (id == fieldId) { + return true; + } + } + } + return false; + } } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 5896503ce09d..5a0d852b12a7 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -29,7 +29,9 @@ import org.apache.paimon.flink.utils.BoundedOneInputOperator; import org.apache.paimon.flink.utils.JavaTypeInfo; import org.apache.paimon.flink.utils.StreamExecutionEnvironmentUtils; +import org.apache.paimon.globalindex.GlobalIndexMultiColumnWriter; import 
org.apache.paimon.globalindex.GlobalIndexSingletonWriter; +import org.apache.paimon.globalindex.GlobalIndexWriter; import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.io.DataFileMeta; @@ -103,7 +105,7 @@ public static void buildIndexAndExecute( buildIndexAndExecute( env, table, - indexColumn, + Collections.singletonList(indexColumn), indexType, partitionPredicate, userOptions, @@ -119,12 +121,31 @@ public static void buildIndexAndExecute( Options userOptions, long maxIndexedRowId) throws Exception { + buildIndexAndExecute( + env, + table, + Collections.singletonList(indexColumn), + indexType, + partitionPredicate, + userOptions, + maxIndexedRowId); + } + + public static void buildIndexAndExecute( + StreamExecutionEnvironment env, + FileStoreTable table, + List indexColumns, + String indexType, + PartitionPredicate partitionPredicate, + Options userOptions, + long maxIndexedRowId) + throws Exception { boolean hasIndexToBuild = buildIndex( env, () -> new GenericGlobalIndexBuilder(table), table, - indexColumn, + indexColumns, indexType, partitionPredicate, userOptions, @@ -149,13 +170,34 @@ public static boolean buildIndex( env, indexBuilderSupplier, table, - indexColumn, + Collections.singletonList(indexColumn), indexType, partitionPredicate, userOptions, NO_MAX_INDEXED_ROW_ID); } + public static boolean buildIndex( + StreamExecutionEnvironment env, + Supplier indexBuilderSupplier, + FileStoreTable table, + String indexColumn, + String indexType, + PartitionPredicate partitionPredicate, + Options userOptions, + long maxIndexedRowId) + throws Exception { + return buildIndex( + env, + indexBuilderSupplier, + table, + Collections.singletonList(indexColumn), + indexType, + partitionPredicate, + userOptions, + maxIndexedRowId); + } + /** * Builds a generic global index topology using a {@link GenericGlobalIndexBuilder} supplier. 
* @@ -166,7 +208,7 @@ public static boolean buildIndex( StreamExecutionEnvironment env, Supplier indexBuilderSupplier, FileStoreTable table, - String indexColumn, + List indexColumns, String indexType, PartitionPredicate partitionPredicate, Options userOptions, @@ -183,7 +225,7 @@ public static boolean buildIndex( return buildTopology( env, table, - indexColumn, + indexColumns, indexType, userOptions, entries, @@ -203,7 +245,7 @@ public static boolean buildIndex( private static boolean buildTopology( StreamExecutionEnvironment env, FileStoreTable table, - String indexColumn, + List indexColumns, String indexType, Options userOptions, List entries, @@ -212,24 +254,24 @@ private static boolean buildTopology( throws Exception { long totalRowCount = entries.stream().mapToLong(e -> e.file().rowCount()).sum(); LOG.info( - "Scanned {} files ({} rows) across {} partitions for {} index on column '{}'" + "Scanned {} files ({} rows) across {} partitions for {} index on columns '{}'" + (maxIndexedRowId >= 0 ? ", maxIndexedRowId={}." 
: "."), entries.size(), totalRowCount, entries.stream().map(ManifestEntry::partition).distinct().count(), indexType, - indexColumn, + indexColumns, maxIndexedRowId); long minNonIndexableRowId = - findMinNonIndexableRowId(table.schemaManager(), entries, indexColumn); + findMinNonIndexableRowId(table.schemaManager(), entries, indexColumns); entries = filterEntriesBefore(entries, minNonIndexableRowId); RowType rowType = table.rowType(); - DataField indexField = rowType.getField(indexColumn); - // Project indexColumn + _ROW_ID so we can read the actual row ID from data - List readColumns = new ArrayList<>(); - readColumns.add(indexColumn); + List indexFields = + indexColumns.stream().map(rowType::getField).collect(Collectors.toList()); + // Project indexColumns + _ROW_ID so we can read the actual row ID from data + List readColumns = new ArrayList<>(indexColumns); readColumns.add(SpecialFields.ROW_ID.name()); RowType projectedRowType = SpecialFields.rowTypeWithRowId(rowType).project(readColumns); @@ -277,7 +319,7 @@ private static boolean buildTopology( readBuilder, table, indexType, - indexField, + indexFields, projectedRowType, mergedOptions)) .setParallelism(parallelism); @@ -299,20 +341,22 @@ private static boolean buildTopology( } /** - * Find the minimum firstRowId among files whose schema does not contain the index column. Files - * at or beyond this rowId cannot be indexed because the column was added later via ALTER TABLE. + * Find the minimum firstRowId among files whose schema does not contain all index columns. + * Files at or beyond this rowId cannot be indexed because the column was added later via ALTER + * TABLE. 
* - * @return the boundary rowId, or {@link Long#MAX_VALUE} if all files contain the column + * @return the boundary rowId, or {@link Long#MAX_VALUE} if all files contain the columns */ static long findMinNonIndexableRowId( - SchemaManager schemaManager, List entries, String indexColumn) { - Map schemaContainsColumn = new HashMap<>(); + SchemaManager schemaManager, List entries, List indexColumns) { + Map schemaContainsColumns = new HashMap<>(); long minRowId = Long.MAX_VALUE; for (ManifestEntry entry : entries) { long sid = entry.file().schemaId(); boolean contains = - schemaContainsColumn.computeIfAbsent( - sid, id -> schemaManager.schema(id).fieldNames().contains(indexColumn)); + schemaContainsColumns.computeIfAbsent( + sid, + id -> schemaManager.schema(id).fieldNames().containsAll(indexColumns)); if (!contains && entry.file().firstRowId() != null) { minRowId = Math.min(minRowId, entry.file().nonNullFirstRowId()); } @@ -548,25 +592,26 @@ private static class BuildIndexOperator private final ReadBuilder readBuilder; private final FileStoreTable table; private final String indexType; - private final DataField indexField; + private final List indexFields; private final RowType projectedRowType; private final Options mergedOptions; private transient TableRead tableRead; - private transient InternalRow.FieldGetter indexFieldGetter; + private transient InternalRow.FieldGetter[] indexFieldGetters; private transient int rowIdFieldIndex; + private transient boolean multiColumn; BuildIndexOperator( ReadBuilder readBuilder, FileStoreTable table, String indexType, - DataField indexField, + List indexFields, RowType projectedRowType, Options mergedOptions) { this.readBuilder = readBuilder; this.table = table; this.indexType = indexType; - this.indexField = indexField; + this.indexFields = indexFields; this.projectedRowType = projectedRowType; this.mergedOptions = mergedOptions; } @@ -575,10 +620,15 @@ private static class BuildIndexOperator public void open() throws 
Exception { super.open(); this.tableRead = readBuilder.newRead(); - this.indexFieldGetter = - InternalRow.createFieldGetter( - indexField.type(), projectedRowType.getFieldIndex(indexField.name())); + this.indexFieldGetters = new InternalRow.FieldGetter[indexFields.size()]; + for (int i = 0; i < indexFields.size(); i++) { + DataField field = indexFields.get(i); + indexFieldGetters[i] = + InternalRow.createFieldGetter( + field.type(), projectedRowType.getFieldIndex(field.name())); + } this.rowIdFieldIndex = projectedRowType.getFieldIndex(SpecialFields.ROW_ID.name()); + this.multiColumn = indexFields.size() > 1; } @Override @@ -595,9 +645,8 @@ public void processElement(StreamRecord element) throws Exception { task.split.dataFiles().size()); long startTime = System.currentTimeMillis(); - GlobalIndexSingletonWriter indexWriter = - (GlobalIndexSingletonWriter) - createIndexWriter(table, indexType, indexField, mergedOptions); + GlobalIndexWriter indexWriter = + createIndexWriter(table, indexType, indexFields, mergedOptions); try { long rowsSeen = 0; @@ -626,8 +675,20 @@ public void processElement(StreamRecord element) throws Exception { } // Only write rows within this shard's range if (currentRowId >= task.shardRange.from) { - Object fieldData = indexFieldGetter.getFieldOrNull(row); - indexWriter.write(fieldData); + if (multiColumn) { + ((GlobalIndexMultiColumnWriter) indexWriter).write(row); + } else { + Object fieldData = indexFieldGetters[0].getFieldOrNull(row); + if (fieldData == null) { + LOG.info( + "Null value at rowId={}, stopping shard [{}, {}].", + currentRowId, + task.shardRange.from, + task.shardRange.to); + break; + } + ((GlobalIndexSingletonWriter) indexWriter).write(fieldData); + } rowsSeen++; } } @@ -664,7 +725,7 @@ public void processElement(StreamRecord element) throws Exception { table, partition, task.shardRange, - indexField, + indexFields, indexType, resultEntries); output.collect( @@ -688,7 +749,7 @@ private static CommitMessage flushIndex( 
FileStoreTable table, BinaryRow partition, Range rowRange, - DataField indexField, + List indexFields, String indexType, List resultEntries) throws IOException { @@ -698,14 +759,14 @@ private static CommitMessage flushIndex( table.store().pathFactory().globalIndexFileFactory(), table.coreOptions(), rowRange, - indexField.id(), + indexFields, indexType, resultEntries); return new CommitMessageImpl( partition, 0, null, indexIncrement(indexFileMetas), emptyIncrement()); } - private static void closeWriterQuietly(GlobalIndexSingletonWriter writer) { + private static void closeWriterQuietly(GlobalIndexWriter writer) { if (writer instanceof Closeable) { try { ((Closeable) writer).close(); diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java index 0de57077b295..fb1bd02f4408 100644 --- a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java @@ -475,7 +475,7 @@ void testAppendFilterOldFilesBeforeNewFiles() { GenericIndexTopoBuilder.filterEntriesBefore( entries, GenericIndexTopoBuilder.findMinNonIndexableRowId( - schemaManager, entries, "vec")); + schemaManager, entries, Collections.singletonList("vec"))); assertThat(result).hasSize(2); assertThat(result.get(0).file().nonNullFirstRowId()).isEqualTo(0L); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 1485d14fac1c..041ee9bf41b6 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ 
b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -20,7 +20,9 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.globalindex.GlobalIndexMultiColumnWriter; import org.apache.paimon.globalindex.GlobalIndexSingletonWriter; +import org.apache.paimon.globalindex.GlobalIndexWriter; import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.io.CompactIncrement; @@ -37,6 +39,7 @@ import java.io.IOException; import java.io.Serializable; +import java.util.Collections; import java.util.List; import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.createIndexWriter; @@ -50,7 +53,7 @@ public class DefaultGlobalIndexBuilder implements Serializable { private final FileStoreTable table; private final BinaryRow partition; private final RowType readType; - private final DataField indexField; + private final List indexFields; private final String indexType; private final Range rowRange; private final Options options; @@ -63,10 +66,28 @@ public DefaultGlobalIndexBuilder( String indexType, Range rowRange, Options options) { + this( + table, + partition, + readType, + Collections.singletonList(indexField), + indexType, + rowRange, + options); + } + + public DefaultGlobalIndexBuilder( + FileStoreTable table, + BinaryRow partition, + RowType readType, + List indexFields, + String indexType, + Range rowRange, + Options options) { this.table = table; this.partition = partition; this.readType = readType; - this.indexField = indexField; + this.indexFields = indexFields; this.indexType = indexType; this.rowRange = rowRange; this.options = options; @@ -89,7 +110,7 @@ public CommitMessage build(CloseableIterator data) throws IOExcepti table.store().pathFactory().globalIndexFileFactory(), table.coreOptions(), rowRange, - indexField.id(), + indexFields, indexType, resultEntries); DataIncrement 
dataIncrement = DataIncrement.indexIncrement(indexFileMetas); @@ -99,27 +120,38 @@ public CommitMessage build(CloseableIterator data) throws IOExcepti private List writePaimonRows( CloseableIterator rows, LongCounter rowCounter) throws IOException { - GlobalIndexSingletonWriter indexWriter = - (GlobalIndexSingletonWriter) - createIndexWriter(table, indexType, indexField, options); + GlobalIndexWriter indexWriter = createIndexWriter(table, indexType, indexFields, options); + boolean multiColumn = indexFields.size() > 1; try { - InternalRow.FieldGetter getter = - InternalRow.createFieldGetter( - indexField.type(), readType.getFieldIndex(indexField.name())); - rows.forEachRemaining( - row -> { - Object indexO = getter.getFieldOrNull(row); - indexWriter.write(indexO); - rowCounter.add(1); - }); + if (multiColumn) { + GlobalIndexMultiColumnWriter multiWriter = + (GlobalIndexMultiColumnWriter) indexWriter; + rows.forEachRemaining( + row -> { + multiWriter.write(row); + rowCounter.add(1); + }); + } else { + DataField indexField = indexFields.get(0); + GlobalIndexSingletonWriter singleWriter = (GlobalIndexSingletonWriter) indexWriter; + InternalRow.FieldGetter getter = + InternalRow.createFieldGetter( + indexField.type(), readType.getFieldIndex(indexField.name())); + rows.forEachRemaining( + row -> { + Object indexO = getter.getFieldOrNull(row); + singleWriter.write(indexO); + rowCounter.add(1); + }); + } return indexWriter.finish(); } finally { closeWriterQuietly(indexWriter); } } - private static void closeWriterQuietly(GlobalIndexSingletonWriter writer) { + private static void closeWriterQuietly(GlobalIndexWriter writer) { if (writer instanceof java.io.Closeable) { try { ((java.io.Closeable) writer).close(); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java index 
afd954c39a5d..437ad11737dc 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java @@ -77,6 +77,28 @@ public List buildIndex( DataField indexField, Options options) throws IOException { + return buildIndex( + spark, + relation, + partitionPredicate, + table, + indexType, + readType, + Collections.singletonList(indexField), + options); + } + + @Override + public List buildIndex( + SparkSession spark, + DataSourceV2Relation relation, + PartitionPredicate partitionPredicate, + FileStoreTable table, + String indexType, + RowType readType, + List indexFields, + Options options) + throws IOException { Options tableOptions = table.coreOptions().toConfiguration(); long rowsPerShard = tableOptions @@ -106,7 +128,7 @@ public List buildIndex( table, partition, readType, - indexField, + indexFields, indexType, indexedSplit.rowRanges().get(0), options); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java index 50c6ab34e153..aea421800410 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java @@ -46,4 +46,25 @@ List buildIndex( DataField indexField, Options options) throws IOException; + + default List buildIndex( + SparkSession spark, + DataSourceV2Relation relation, + PartitionPredicate partitionPredicate, + FileStoreTable table, + String indexType, + RowType readType, + List indexFields, + Options options) + throws IOException { + return buildIndex( + spark, + relation, + partitionPredicate, + table, + 
indexType, + readType, + indexFields.get(0), + options); + } } From 4f28a682a24231bbdef10d363c0b188c7377ad2a Mon Sep 17 00:00:00 2001 From: CrownChu Date: Mon, 25 May 2026 11:37:17 +0800 Subject: [PATCH 02/24] [globalindex] Support multi-column in CreateGlobalIndexProcedure Allow index_column parameter to accept comma-separated column names (e.g. "title,embedding") for both Flink and Spark procedures. Add List overload for GenericIndexTopoBuilder.buildIndexAndExecute. --- .../globalindex/GenericIndexTopoBuilder.java | 18 +++++++++++ .../procedure/CreateGlobalIndexProcedure.java | 27 ++++++++++------ .../procedure/CreateGlobalIndexProcedure.java | 32 ++++++++++++------- 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 5a0d852b12a7..df8e92f8d0cd 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -131,6 +131,24 @@ public static void buildIndexAndExecute( maxIndexedRowId); } + public static void buildIndexAndExecute( + StreamExecutionEnvironment env, + FileStoreTable table, + List indexColumns, + String indexType, + PartitionPredicate partitionPredicate, + Options userOptions) + throws Exception { + buildIndexAndExecute( + env, + table, + indexColumns, + indexType, + partitionPredicate, + userOptions, + NO_MAX_INDEXED_ROW_ID); + } + public static void buildIndexAndExecute( StreamExecutionEnvironment env, FileStoreTable table, diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java 
b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index ad62ad8f7654..ca6367e97148 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -32,8 +32,10 @@ import org.apache.flink.table.annotation.ProcedureHint; import org.apache.flink.table.procedure.ProcedureContext; +import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static org.apache.paimon.utils.ParameterUtils.getPartitions; import static org.apache.paimon.utils.Preconditions.checkArgument; @@ -85,11 +87,18 @@ public String[] call( tableId); RowType rowType = table.rowType(); - checkArgument( - rowType.containsField(indexColumn), - "Column '%s' does not exist in table '%s'.", - indexColumn, - tableId); + List indexColumns = + Arrays.stream(indexColumn.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + for (String col : indexColumns) { + checkArgument( + rowType.containsField(col), + "Column '%s' does not exist in table '%s'.", + col, + tableId); + } // Parse partition predicate PartitionPredicate partitionPredicate = parsePartitionPredicate(table, partitions); @@ -104,7 +113,7 @@ public String[] call( BTreeIndexTopoBuilder.buildIndexAndExecute( procedureContext.getExecutionEnvironment(), table, - indexColumn, + indexColumns.get(0), partitionPredicate, userOptions); return new String[] { @@ -114,7 +123,7 @@ public String[] call( GenericIndexTopoBuilder.buildIndexAndExecute( procedureContext.getExecutionEnvironment(), table, - indexColumn, + indexColumns, indexType, partitionPredicate, userOptions); @@ -122,8 +131,8 @@ public String[] call( } catch (Exception e) { throw new RuntimeException( String.format( - "Failed to create %s index for column '%s' on 
table '%s'.", - indexType, indexColumn, table.name()), + "Failed to create %s index for columns '%s' on table '%s'.", + indexType, indexColumns, table.name()), e); } return new String[] { diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index b447cdbd33f8..6b061e6a1577 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -43,12 +43,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Collections; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.UUID; +import java.util.stream.Collectors; import static org.apache.paimon.utils.Preconditions.checkArgument; import static org.apache.spark.sql.types.DataTypes.StringType; @@ -132,11 +133,18 @@ public InternalRow[] call(InternalRow args) { tableIdent); RowType rowType = table.rowType(); - checkArgument( - rowType.containsField(column), - "Column '%s' does not exist in table '%s'.", - column, - tableIdent); + List indexColumns = + Arrays.stream(column.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + for (String col : indexColumns) { + checkArgument( + rowType.containsField(col), + "Column '%s' does not exist in table '%s'.", + col, + tableIdent); + } DataSourceV2Relation relation = createRelation(tableIdent, sparkTable); PartitionPredicate partitionPredicate = SparkProcedureUtils.convertToPartitionPredicate( @@ -145,9 +153,11 @@ public InternalRow[] call(InternalRow args) { spark(), relation); - DataField indexField = rowType.getField(column); - RowType projectedRowType = - 
rowType.project(Collections.singletonList(column)); + List indexFields = + indexColumns.stream() + .map(rowType::getField) + .collect(Collectors.toList()); + RowType projectedRowType = rowType.project(indexColumns); RowType readRowType = SpecialFields.rowTypeWithRowId(projectedRowType); Options userOptions = createUserOptions(table, optionString); @@ -163,7 +173,7 @@ public InternalRow[] call(InternalRow args) { table, indexType, readRowType, - indexField, + indexFields, userOptions); try (TableCommitImpl commit = @@ -179,7 +189,7 @@ public InternalRow[] call(InternalRow args) { } catch (Exception e) { throw new RuntimeException( String.format( - "Failed to create %s index for column '%s' on table '%s'.", + "Failed to create %s index for columns '%s' on table '%s'.", indexType, column, tableIdent), e); } From 954754924c5df962687f2b2cfc774f588f051ab0 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 26 May 2026 02:00:19 +0800 Subject: [PATCH 03/24] [globalindex] Fix multi-column index metadata storage and resolveFields validation --- .../globalindex/GlobalIndexBuilderUtils.java | 18 +++++++++++------- .../paimon/globalindex/GlobalIndexScanner.java | 9 +++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 34badf3ed566..3931a53a9ecb 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -38,6 +38,8 @@ /** Utils for global index build. 
*/ public class GlobalIndexBuilderUtils { + public static final int MULTI_COLUMN_INDEX_FIELD_ID = -1; + public static List toIndexFileMetas( FileIO fileIO, IndexPathFactory indexPathFactory, @@ -60,13 +62,15 @@ public static List toIndexFileMetas( String indexType, List entries) throws IOException { - int indexFieldId = fields.get(0).id(); - int[] extraFieldIds = - fields.size() > 1 - ? fields.subList(1, fields.size()).stream() - .mapToInt(DataField::id) - .toArray() - : null; + int indexFieldId; + int[] extraFieldIds; + if (fields.size() > 1) { + indexFieldId = MULTI_COLUMN_INDEX_FIELD_ID; + extraFieldIds = fields.stream().mapToInt(DataField::id).toArray(); + } else { + indexFieldId = fields.get(0).id(); + extraFieldIds = null; + } return toIndexFileMetas( fileIO, indexPathFactory, diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 04e16eed2d87..cffcbb34646a 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -53,8 +53,10 @@ import java.util.stream.Collectors; import static org.apache.paimon.CoreOptions.GLOBAL_INDEX_THREAD_NUM; +import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; +import static org.apache.paimon.utils.Preconditions.checkArgument; import static org.apache.paimon.utils.Preconditions.checkNotNull; /** Scanner for shard-based global indexes. 
*/ @@ -98,6 +100,13 @@ public GlobalIndexScanner( Arrays.stream(meta.extraFieldIds()) .boxed() .collect(Collectors.toList()); + // Validate consistency: all files in the same group must have identical extraFieldIds + if (fieldToGroup.containsKey(fieldIds.get(0))) { + List existingGroup = fieldToGroup.get(fieldIds.get(0)); + checkArgument( + existingGroup.equals(fieldIds), + "Inconsistent extraFieldIds across index files."); + } multiColumnMetas .computeIfAbsent(fieldIds, k -> new HashMap<>()) .computeIfAbsent(indexType, k -> new HashMap<>()) From 7d488420b68f39ee2f09ee2202b86eeaab228a78 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 26 May 2026 17:04:15 +0800 Subject: [PATCH 04/24] [globalindex] Fix GenericIndexTopoBuilder multi-column null value error --- .../GlobalIndexBuilderUtilsTest.java | 146 ++++++++++++++++++ .../globalindex/GenericIndexTopoBuilder.java | 15 ++ 2 files changed, 161 insertions(+) create mode 100644 paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java diff --git a/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java b/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java new file mode 100644 index 000000000000..703c01c69633 --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.globalindex; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.index.IndexFileMeta; +import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.VarCharType; +import org.apache.paimon.utils.Range; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link GlobalIndexBuilderUtils}. 
*/ +class GlobalIndexBuilderUtilsTest { + + @TempDir java.nio.file.Path tempDir; + + private FileIO fileIO; + private IndexPathFactory indexPathFactory; + private CoreOptions coreOptions; + + @BeforeEach + void setUp() { + fileIO = new LocalFileIO(); + Path dir = new Path(tempDir.toString()); + indexPathFactory = + new IndexPathFactory() { + @Override + public Path toPath(String fileName) { + return new Path(dir, fileName); + } + + @Override + public Path newPath() { + return new Path(dir, UUID.randomUUID().toString()); + } + + @Override + public boolean isExternalPath() { + return false; + } + }; + coreOptions = new CoreOptions(new Options().toMap()); + } + + // Test: 2 columns (title + vec), indexFieldId=-1, all field ids stored in extraFieldIds + @Test + void testToIndexFileMetasMultiColumn() throws IOException { + DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); + DataField vecField = new DataField(2, "vec", new ArrayType(new FloatType())); + List fields = Arrays.asList(titleField, vecField); + + List entries = createDummyResultEntries(); + Range range = new Range(0, 99); + + List metas = + GlobalIndexBuilderUtils.toIndexFileMetas( + fileIO, indexPathFactory, coreOptions, range, fields, "test-type", entries); + + assertThat(metas).hasSize(1); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(-1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {1, 2}); + assertThat(metas.get(0).globalIndexMeta().rowRangeStart()).isEqualTo(0); + assertThat(metas.get(0).globalIndexMeta().rowRangeEnd()).isEqualTo(99); + } + + // Test: single column, extraFieldIds should be null (backward compatible with single-column + // path) + @Test + void testToIndexFileMetasSingleColumn() throws IOException { + DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); + List fields = Collections.singletonList(titleField); + + List entries = createDummyResultEntries(); + Range 
range = new Range(0, 49); + + List metas = + GlobalIndexBuilderUtils.toIndexFileMetas( + fileIO, indexPathFactory, coreOptions, range, fields, "test-type", entries); + + assertThat(metas).hasSize(1); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isNull(); + } + + // Test: 3 columns (title + vec + id), indexFieldId=-1, all field ids in extraFieldIds + @Test + void testToIndexFileMetasThreeColumns() throws IOException { + DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); + DataField vecField = new DataField(2, "vec", new ArrayType(new FloatType())); + DataField idField = new DataField(3, "id", new IntType()); + List fields = Arrays.asList(titleField, vecField, idField); + + List entries = createDummyResultEntries(); + Range range = new Range(0, 199); + + List metas = + GlobalIndexBuilderUtils.toIndexFileMetas( + fileIO, indexPathFactory, coreOptions, range, fields, "test-type", entries); + + assertThat(metas).hasSize(1); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(-1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {1, 2, 3}); + } + + private List createDummyResultEntries() throws IOException { + String fileName = "test-index-" + UUID.randomUUID(); + Path filePath = indexPathFactory.toPath(fileName); + fileIO.newOutputStream(filePath, false).close(); + return Collections.singletonList(new ResultEntry(fileName, 100, null)); + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index df8e92f8d0cd..2ac57502cc02 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ 
b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -694,6 +694,21 @@ public void processElement(StreamRecord element) throws Exception { // Only write rows within this shard's range if (currentRowId >= task.shardRange.from) { if (multiColumn) { + boolean hasNull = false; + for (InternalRow.FieldGetter getter : indexFieldGetters) { + if (getter.getFieldOrNull(row) == null) { + hasNull = true; + break; + } + } + if (hasNull) { + LOG.info( + "Null value in indexed columns at rowId={}, stopping shard [{}, {}].", + currentRowId, + task.shardRange.from, + task.shardRange.to); + break; + } ((GlobalIndexMultiColumnWriter) indexWriter).write(row); } else { Object fieldData = indexFieldGetters[0].getFieldOrNull(row); From b9299bd57301d05c1cd42dc6951471069fb2dcd9 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 26 May 2026 18:49:08 +0800 Subject: [PATCH 05/24] [globalindex] Extract findMinNonIndexableRowId and filterEntriesBefore into GlobalIndexBuilderUtils --- .../globalindex/GlobalIndexBuilderUtils.java | 44 +++++++++++++++++ .../globalindex/GenericIndexTopoBuilder.java | 49 +------------------ 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 3931a53a9ecb..8f256957b7da 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -24,7 +24,9 @@ import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.manifest.ManifestEntry; import org.apache.paimon.options.Options; +import org.apache.paimon.schema.SchemaManager; import org.apache.paimon.table.FileStoreTable; import 
org.apache.paimon.types.DataField; import org.apache.paimon.utils.Range; @@ -33,7 +35,9 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** Utils for global index build. */ public class GlobalIndexBuilderUtils { @@ -133,6 +137,46 @@ public static GlobalIndexWriter createIndexWriter( return globalIndexer.createWriter(createGlobalIndexFileReadWrite(table)); } + /** + * Find the minimum firstRowId among files whose schema does not contain all index columns. + * Files at or beyond this rowId cannot be indexed because the column was added later via ALTER + * TABLE. + * + * @return the boundary rowId, or {@link Long#MAX_VALUE} if all files contain the columns + */ + public static long findMinNonIndexableRowId( + SchemaManager schemaManager, List entries, List indexColumns) { + Map schemaContainsColumns = new HashMap<>(); + long minRowId = Long.MAX_VALUE; + for (ManifestEntry entry : entries) { + long sid = entry.file().schemaId(); + boolean contains = + schemaContainsColumns.computeIfAbsent( + sid, + id -> schemaManager.schema(id).fieldNames().containsAll(indexColumns)); + if (!contains && entry.file().firstRowId() != null) { + minRowId = Math.min(minRowId, entry.file().nonNullFirstRowId()); + } + } + return minRowId; + } + + /** Keep only entries whose firstRowId is strictly less than the given boundary. 
*/ + public static List filterEntriesBefore( + List entries, long boundaryRowId) { + if (boundaryRowId == Long.MAX_VALUE) { + return entries; + } + List result = new ArrayList<>(); + for (ManifestEntry entry : entries) { + if (entry.file().firstRowId() != null + && entry.file().nonNullFirstRowId() < boundaryRowId) { + result.add(entry); + } + } + return result; + } + private static GlobalIndexFileReadWrite createGlobalIndexFileReadWrite(FileStoreTable table) { IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); return new GlobalIndexFileReadWrite(table.fileIO(), indexPathFactory); diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 2ac57502cc02..035aacdce3f8 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -40,7 +40,6 @@ import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.reader.RecordReader; -import org.apache.paimon.schema.SchemaManager; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.table.SpecialFields; import org.apache.paimon.table.sink.BatchWriteBuilder; @@ -67,7 +66,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -76,6 +74,8 @@ import java.util.stream.Collectors; import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.createIndexWriter; +import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.filterEntriesBefore; +import static 
org.apache.paimon.globalindex.GlobalIndexBuilderUtils.findMinNonIndexableRowId; import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.toIndexFileMetas; import static org.apache.paimon.io.CompactIncrement.emptyIncrement; import static org.apache.paimon.io.DataIncrement.deleteIndexIncrement; @@ -358,51 +358,6 @@ private static boolean buildTopology( return true; } - /** - * Find the minimum firstRowId among files whose schema does not contain all index columns. - * Files at or beyond this rowId cannot be indexed because the column was added later via ALTER - * TABLE. - * - * @return the boundary rowId, or {@link Long#MAX_VALUE} if all files contain the columns - */ - static long findMinNonIndexableRowId( - SchemaManager schemaManager, List entries, List indexColumns) { - Map schemaContainsColumns = new HashMap<>(); - long minRowId = Long.MAX_VALUE; - for (ManifestEntry entry : entries) { - long sid = entry.file().schemaId(); - boolean contains = - schemaContainsColumns.computeIfAbsent( - sid, - id -> schemaManager.schema(id).fieldNames().containsAll(indexColumns)); - if (!contains && entry.file().firstRowId() != null) { - minRowId = Math.min(minRowId, entry.file().nonNullFirstRowId()); - } - } - return minRowId; - } - - /** Keep only entries whose firstRowId is strictly less than the given boundary. */ - static List filterEntriesBefore( - List entries, long boundaryRowId) { - if (boundaryRowId == Long.MAX_VALUE) { - return entries; - } - List result = new ArrayList<>(); - for (ManifestEntry entry : entries) { - if (entry.file().firstRowId() != null - && entry.file().nonNullFirstRowId() < boundaryRowId) { - result.add(entry); - } - } - LOG.info( - "Filtered {} files at or beyond rowId {}, {} files remain.", - entries.size() - result.size(), - boundaryRowId, - result.size()); - return result; - } - /** * Compute shard tasks for a full build (no rows to skip). 
* From 497d88b598142f54f2aa5eae64912989f8d0a555 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 26 May 2026 22:45:58 +0800 Subject: [PATCH 06/24] [globalindex] Fix test to reference GlobalIndexBuilderUtils after method extraction --- .../flink/globalindex/GenericIndexTopoBuilderTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java index fb1bd02f4408..c69b59ad6e3c 100644 --- a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java @@ -23,6 +23,7 @@ import org.apache.paimon.data.BinaryRowWriter; import org.apache.paimon.data.BinaryString; import org.apache.paimon.fs.Path; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.io.PojoDataFileMeta; import org.apache.paimon.manifest.FileKind; import org.apache.paimon.manifest.ManifestEntry; @@ -472,9 +473,9 @@ void testAppendFilterOldFilesBeforeNewFiles() { entries.add(createEntryWithSchemaId(BinaryRow.EMPTY_ROW, 200L, 100, 0L)); List result = - GenericIndexTopoBuilder.filterEntriesBefore( + GlobalIndexBuilderUtils.filterEntriesBefore( entries, - GenericIndexTopoBuilder.findMinNonIndexableRowId( + GlobalIndexBuilderUtils.findMinNonIndexableRowId( schemaManager, entries, Collections.singletonList("vec"))); assertThat(result).hasSize(2); From eeb4c84433592393063474efb83b8b5224bb9fc4 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 11:05:04 +0800 Subject: [PATCH 07/24] [globalindex] Fix multi-column writer projection, add BTree validation, and restore observability logs --- .../globalindex/GlobalIndexBuilderUtils.java | 31 
++++++++++++++++++- .../globalindex/GenericIndexTopoBuilder.java | 12 ++++++- .../procedure/CreateGlobalIndexProcedure.java | 6 ++++ .../DefaultGlobalIndexBuilder.java | 8 ++++- .../procedure/CreateGlobalIndexProcedure.java | 6 ++++ 5 files changed, 60 insertions(+), 3 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 8f256957b7da..497d50ece6e9 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -31,6 +31,9 @@ import org.apache.paimon.types.DataField; import org.apache.paimon.utils.Range; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import javax.annotation.Nullable; import java.io.IOException; @@ -42,6 +45,8 @@ /** Utils for global index build. */ public class GlobalIndexBuilderUtils { + private static final Logger LOG = LoggerFactory.getLogger(GlobalIndexBuilderUtils.class); + public static final int MULTI_COLUMN_INDEX_FIELD_ID = -1; public static List toIndexFileMetas( @@ -148,6 +153,7 @@ public static long findMinNonIndexableRowId( SchemaManager schemaManager, List entries, List indexColumns) { Map schemaContainsColumns = new HashMap<>(); long minRowId = Long.MAX_VALUE; + long minSchemaId = -1; for (ManifestEntry entry : entries) { long sid = entry.file().schemaId(); boolean contains = @@ -155,8 +161,26 @@ public static long findMinNonIndexableRowId( sid, id -> schemaManager.schema(id).fieldNames().containsAll(indexColumns)); if (!contains && entry.file().firstRowId() != null) { - minRowId = Math.min(minRowId, entry.file().nonNullFirstRowId()); + long rowId = entry.file().nonNullFirstRowId(); + if (rowId < minRowId) { + minRowId = rowId; + minSchemaId = sid; + } + } + } + if (minRowId != Long.MAX_VALUE) { + List schemaFields = 
schemaManager.schema(minSchemaId).fieldNames(); + List missingColumns = new ArrayList<>(); + for (String col : indexColumns) { + if (!schemaFields.contains(col)) { + missingColumns.add(col); + } } + LOG.info( + "Found non-indexable files: schemaId={} missing columns {}, boundaryRowId={}.", + minSchemaId, + missingColumns, + minRowId); } return minRowId; } @@ -174,6 +198,11 @@ public static List filterEntriesBefore( result.add(entry); } } + LOG.info( + "Filtered {} files to {} indexable files (boundaryRowId={}).", + entries.size(), + result.size(), + boundaryRowId); return result; } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 035aacdce3f8..99a551a9e4d9 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -51,6 +51,7 @@ import org.apache.paimon.types.DataField; import org.apache.paimon.types.RowType; import org.apache.paimon.utils.CloseableIterator; +import org.apache.paimon.utils.ProjectedRow; import org.apache.paimon.utils.Range; import org.apache.flink.streaming.api.datastream.DataStream; @@ -573,6 +574,7 @@ private static class BuildIndexOperator private transient InternalRow.FieldGetter[] indexFieldGetters; private transient int rowIdFieldIndex; private transient boolean multiColumn; + private transient ProjectedRow writerProjection; BuildIndexOperator( ReadBuilder readBuilder, @@ -602,6 +604,13 @@ public void open() throws Exception { } this.rowIdFieldIndex = projectedRowType.getFieldIndex(SpecialFields.ROW_ID.name()); this.multiColumn = indexFields.size() > 1; + if (multiColumn) { + int[] projection = new int[indexFields.size()]; + for (int i = 0; i < indexFields.size(); 
i++) { + projection[i] = projectedRowType.getFieldIndex(indexFields.get(i).name()); + } + this.writerProjection = ProjectedRow.from(projection); + } } @Override @@ -664,7 +673,8 @@ public void processElement(StreamRecord element) throws Exception { task.shardRange.to); break; } - ((GlobalIndexMultiColumnWriter) indexWriter).write(row); + ((GlobalIndexMultiColumnWriter) indexWriter) + .write(writerProjection.replaceRow(row)); } else { Object fieldData = indexFieldGetters[0].getFieldOrNull(row); if (fieldData == null) { diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index ca6367e97148..b6641bf6973e 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -108,6 +108,12 @@ public String[] call( // Build global index based on index type indexType = indexType.toLowerCase().trim(); + if ("btree".equals(indexType)) { + checkArgument( + indexColumns.size() == 1, + "BTree index only supports single column, got: %s", + indexColumns); + } try { if ("btree".equals(indexType)) { BTreeIndexTopoBuilder.buildIndexAndExecute( diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 041ee9bf41b6..48386d7ab8d3 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -35,6 +35,7 @@ import org.apache.paimon.types.RowType; import 
org.apache.paimon.utils.CloseableIterator; import org.apache.paimon.utils.LongCounter; +import org.apache.paimon.utils.ProjectedRow; import org.apache.paimon.utils.Range; import java.io.IOException; @@ -127,9 +128,14 @@ private List writePaimonRows( if (multiColumn) { GlobalIndexMultiColumnWriter multiWriter = (GlobalIndexMultiColumnWriter) indexWriter; + int[] projection = new int[indexFields.size()]; + for (int i = 0; i < indexFields.size(); i++) { + projection[i] = readType.getFieldIndex(indexFields.get(i).name()); + } + ProjectedRow projectedRow = ProjectedRow.from(projection); rows.forEachRemaining( row -> { - multiWriter.write(row); + multiWriter.write(projectedRow.replaceRow(row)); rowCounter.add(1); }); } else { diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index 6b061e6a1577..89d63d0472ac 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -145,6 +145,12 @@ public InternalRow[] call(InternalRow args) { col, tableIdent); } + if ("btree".equalsIgnoreCase(indexType)) { + checkArgument( + indexColumns.size() == 1, + "BTree index only supports single column, got: %s", + indexColumns); + } DataSourceV2Relation relation = createRelation(tableIdent, sparkTable); PartitionPredicate partitionPredicate = SparkProcedureUtils.convertToPartitionPredicate( From 136d14fd182a2ff6de0099a6a0768248b7a87af3 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 11:17:54 +0800 Subject: [PATCH 08/24] [globalindex] Fix MERGE INTO crash when table has multi-column global index (indexFieldId=-1) --- .../dataevolution/MergeIntoUpdateChecker.java | 37 ++++++++++++++++--- 
.../MergeIntoPaimonDataEvolutionTable.scala | 23 ++++++++++-- .../MergeIntoPaimonDataEvolutionTable.scala | 23 ++++++++++-- 3 files changed, 69 insertions(+), 14 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java index 8b1122382aae..aed46f0078e8 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java @@ -39,6 +39,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -46,6 +48,8 @@ import java.util.Set; import java.util.stream.Collectors; +import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; + /** * The checker for merge into update result. It will check each committable to see if some * global-indexed columns are updated. 
It will take some actions according to {@link @@ -100,10 +104,12 @@ private void checkUpdatedColumns() { GlobalIndexMeta globalIndexMeta = entry.indexFile().globalIndexMeta(); if (globalIndexMeta != null) { - String fieldName = - rowType.getField(globalIndexMeta.indexFieldId()) - .name(); - return updatedColumns.contains(fieldName) + Collection indexedNames = + getIndexedFieldNames(globalIndexMeta, rowType); + boolean overlaps = + indexedNames.stream() + .anyMatch(updatedColumns::contains); + return overlaps && affectedPartitions.contains(entry.partition()); } return false; @@ -116,8 +122,8 @@ private void checkUpdatedColumns() { case THROW_ERROR: Set conflictedColumns = affectedEntries.stream() - .map(file -> file.indexFile().globalIndexMeta().indexFieldId()) - .map(id -> rowType.getField(id).name()) + .map(file -> file.indexFile().globalIndexMeta()) + .flatMap(meta -> getIndexedFieldNames(meta, rowType).stream()) .collect(Collectors.toSet()); throw new RuntimeException( @@ -159,4 +165,23 @@ private void checkUpdatedColumns() { } } } + + private static Collection getIndexedFieldNames(GlobalIndexMeta meta, RowType rowType) { + int fieldId = meta.indexFieldId(); + if (fieldId == MULTI_COLUMN_INDEX_FIELD_ID) { + List names = new ArrayList<>(); + for (int id : meta.extraFieldIds()) { + names.add(rowType.getField(id).name()); + } + return names; + } + List names = new ArrayList<>(); + names.add(rowType.getField(fieldId).name()); + if (meta.extraFieldIds() != null) { + for (int id : meta.extraFieldIds()) { + names.add(rowType.getField(id).name()); + } + } + return names; + } } diff --git a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala index ad6f5b95011a..e07a77ae8774 100644 --- 
a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala +++ b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala @@ -21,6 +21,8 @@ package org.apache.paimon.spark.commands import org.apache.paimon.CoreOptions.GlobalIndexColumnUpdateAction import org.apache.paimon.data.BinaryRow import org.apache.paimon.format.blob.BlobFileFormat.isBlobFile +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID +import org.apache.paimon.index.GlobalIndexMeta import org.apache.paimon.io.{CompactIncrement, DataIncrement} import org.apache.paimon.manifest.IndexManifestEntry import org.apache.paimon.spark.SparkTable @@ -586,15 +588,29 @@ case class MergeIntoPaimonDataEvolutionTable( return updateCommit } + def getIndexedFieldNames( + meta: GlobalIndexMeta, + rt: org.apache.paimon.types.RowType): Seq[String] = { + if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID) { + meta.extraFieldIds().map(id => rt.getField(id).name()).toSeq + } else { + val names = ArrayBuffer(rt.getField(meta.indexFieldId()).name()) + if (meta.extraFieldIds() != null) { + meta.extraFieldIds().foreach(id => names += rt.getField(id).name()) + } + names.toSeq + } + } + val filter: org.apache.paimon.utils.Filter[IndexManifestEntry] = (entry: IndexManifestEntry) => { val globalIndexMeta = entry.indexFile().globalIndexMeta() if (globalIndexMeta == null) { false } else { - val fieldName = rowType.getField(globalIndexMeta.indexFieldId()).name() + val indexedNames = getIndexedFieldNames(globalIndexMeta, rowType) affectedParts.contains(entry.partition()) && updateColumns.exists( - _.name.equals(fieldName)) + col => indexedNames.contains(col.name)) } } @@ -611,8 +627,7 @@ case class MergeIntoPaimonDataEvolutionTable( case GlobalIndexColumnUpdateAction.THROW_ERROR => val updatedColNames = updateColumns.map(_.name) val conflicted = affectedIndexEntries - 
.map(_.indexFile().globalIndexMeta().indexFieldId()) - .map(id => rowType.getField(id).name()) + .flatMap(e => getIndexedFieldNames(e.indexFile().globalIndexMeta(), rowType)) .toSet throw new RuntimeException( s"""MergeInto: update columns contain globally indexed columns, not supported now. diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala index cd1b000a361f..38bf71c3310f 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala @@ -21,6 +21,8 @@ package org.apache.paimon.spark.commands import org.apache.paimon.CoreOptions.GlobalIndexColumnUpdateAction import org.apache.paimon.data.BinaryRow import org.apache.paimon.format.blob.BlobFileFormat.isBlobFile +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID +import org.apache.paimon.index.GlobalIndexMeta import org.apache.paimon.io.{CompactIncrement, DataIncrement} import org.apache.paimon.manifest.IndexManifestEntry import org.apache.paimon.spark.SparkTable @@ -594,9 +596,9 @@ case class MergeIntoPaimonDataEvolutionTable( if (globalIndexMeta == null) { false } else { - val fieldName = rowType.getField(globalIndexMeta.indexFieldId()).name() + val indexedNames = getIndexedFieldNames(globalIndexMeta, rowType) affectedParts.contains(entry.partition()) && updateColumns.exists( - _.name.equals(fieldName)) + col => indexedNames.contains(col.name)) } } @@ -613,8 +615,7 @@ case class MergeIntoPaimonDataEvolutionTable( case GlobalIndexColumnUpdateAction.THROW_ERROR => val updatedColNames = updateColumns.map(_.name) val conflicted = affectedIndexEntries - 
.map(_.indexFile().globalIndexMeta().indexFieldId()) - .map(id => rowType.getField(id).name()) + .flatMap(e => getIndexedFieldNames(e.indexFile().globalIndexMeta(), rowType)) .toSet throw new RuntimeException( s"""MergeInto: update columns contain globally indexed columns, not supported now. @@ -638,6 +639,20 @@ case class MergeIntoPaimonDataEvolutionTable( } } + private def getIndexedFieldNames( + meta: GlobalIndexMeta, + rowType: org.apache.paimon.types.RowType): Seq[String] = { + if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID) { + meta.extraFieldIds().map(id => rowType.getField(id).name()).toSeq + } else { + val names = ArrayBuffer(rowType.getField(meta.indexFieldId()).name()) + if (meta.extraFieldIds() != null) { + meta.extraFieldIds().foreach(id => names += rowType.getField(id).name()) + } + names.toSeq + } + } + private def findRelatedFirstRowIds( dataset: Dataset[Row], sparkSession: SparkSession, From 609dbeccc5f5570287a578aa01189e5feb7acb0e Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 11:50:00 +0800 Subject: [PATCH 09/24] [globalindex] Fix FullText/Vector read path mismatch and reject multi-column for unsupported index types --- .../globalindex/GlobalIndexerFactory.java | 6 +++++ .../paimon/table/source/FullTextReadImpl.java | 24 +++++++++++++++---- .../paimon/table/source/VectorReadImpl.java | 24 +++++++++++++++---- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java index e2497a6f82e3..cef643fa463f 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java @@ -32,6 +32,12 @@ public interface GlobalIndexerFactory { GlobalIndexer create(DataField dataField, Options options); default GlobalIndexer create(List fields, Options 
options) { + if (fields.size() > 1) { + throw new UnsupportedOperationException( + String.format( + "Index type '%s' does not support multi-column index, got columns: %s", + identifier(), fields)); + } return create(fields.get(0), options); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java index 66e509de8999..bf86cf9556ea 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java @@ -19,6 +19,7 @@ package org.apache.paimon.table.source; import org.apache.paimon.fs.FileIO; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReadThreadPool; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -34,6 +35,7 @@ import org.apache.paimon.predicate.FullTextSearch; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; import java.util.ArrayList; @@ -78,10 +80,24 @@ public GlobalIndexResult read(List splits) { return GlobalIndexResult.createEmpty(); } - String indexType = splits.get(0).fullTextIndexFiles().get(0).indexType(); - GlobalIndexer globalIndexer = - GlobalIndexerFactoryUtils.load(indexType) - .create(textColumn, table.coreOptions().toConfiguration()); + IndexFileMeta firstFile = splits.get(0).fullTextIndexFiles().get(0); + String indexType = firstFile.indexType(); + GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); + GlobalIndexer globalIndexer; + if (firstMeta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID) { + RowType rowType = table.rowType(); + List fields = new ArrayList<>(); + for (int id : firstMeta.extraFieldIds()) { + 
fields.add(rowType.getField(id)); + } + globalIndexer = + GlobalIndexerFactoryUtils.load(indexType) + .create(fields, table.coreOptions().toConfiguration()); + } else { + globalIndexer = + GlobalIndexerFactoryUtils.load(indexType) + .create(textColumn, table.coreOptions().toConfiguration()); + } IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); int parallelism = table.coreOptions().toConfiguration().get(GLOBAL_INDEX_THREAD_NUM); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index 2eae2d48779d..5a59830c1380 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -19,6 +19,7 @@ package org.apache.paimon.table.source; import org.apache.paimon.fs.FileIO; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReadThreadPool; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -36,6 +37,7 @@ import org.apache.paimon.predicate.VectorSearch; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; import org.apache.paimon.utils.RoaringNavigableMap64; @@ -87,10 +89,24 @@ public GlobalIndexResult read(List splits) { RoaringNavigableMap64 preFilter = preFilter(splits).orElse(null); - String indexType = splits.get(0).vectorIndexFiles().get(0).indexType(); - GlobalIndexer globalIndexer = - GlobalIndexerFactoryUtils.load(indexType) - .create(vectorColumn, table.coreOptions().toConfiguration()); + IndexFileMeta firstFile = splits.get(0).vectorIndexFiles().get(0); + String indexType = firstFile.indexType(); + GlobalIndexMeta firstMeta = 
checkNotNull(firstFile.globalIndexMeta()); + GlobalIndexer globalIndexer; + if (firstMeta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID) { + RowType rowType = table.rowType(); + List fields = new ArrayList<>(); + for (int id : firstMeta.extraFieldIds()) { + fields.add(rowType.getField(id)); + } + globalIndexer = + GlobalIndexerFactoryUtils.load(indexType) + .create(fields, table.coreOptions().toConfiguration()); + } else { + globalIndexer = + GlobalIndexerFactoryUtils.load(indexType) + .create(vectorColumn, table.coreOptions().toConfiguration()); + } IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); int parallelism = table.coreOptions().toConfiguration().get(GLOBAL_INDEX_THREAD_NUM); From e8e571a732b8bf34c16053bc12a06edd36186586 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 13:59:05 +0800 Subject: [PATCH 10/24] [globalindex] Add input validation, Spark schema filtering, null check, and multi-column guard --- .../procedure/CreateGlobalIndexProcedure.java | 1 + .../DefaultGlobalIndexBuilder.java | 35 +++++++++++++++---- .../DefaultGlobalIndexTopoBuilder.java | 9 +++++ .../GlobalIndexTopologyBuilder.java | 6 ++++ .../procedure/CreateGlobalIndexProcedure.java | 1 + 5 files changed, 46 insertions(+), 6 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index b6641bf6973e..1a6ba1d301f7 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -92,6 +92,7 @@ public String[] call( .map(String::trim) .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); + checkArgument(!indexColumns.isEmpty(), "At least 
one column required."); for (String col : indexColumns) { checkArgument( rowType.containsField(col), diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 48386d7ab8d3..a64045633c6b 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -38,6 +38,9 @@ import org.apache.paimon.utils.ProjectedRow; import org.apache.paimon.utils.Range; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.IOException; import java.io.Serializable; import java.util.Collections; @@ -49,6 +52,7 @@ /** Default global index builder. */ public class DefaultGlobalIndexBuilder implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(DefaultGlobalIndexBuilder.class); private static final long serialVersionUID = 1L; private final FileStoreTable table; @@ -129,15 +133,34 @@ private List writePaimonRows( GlobalIndexMultiColumnWriter multiWriter = (GlobalIndexMultiColumnWriter) indexWriter; int[] projection = new int[indexFields.size()]; + InternalRow.FieldGetter[] getters = new InternalRow.FieldGetter[indexFields.size()]; for (int i = 0; i < indexFields.size(); i++) { - projection[i] = readType.getFieldIndex(indexFields.get(i).name()); + DataField field = indexFields.get(i); + projection[i] = readType.getFieldIndex(field.name()); + getters[i] = + InternalRow.createFieldGetter( + field.type(), readType.getFieldIndex(field.name())); } ProjectedRow projectedRow = ProjectedRow.from(projection); - rows.forEachRemaining( - row -> { - multiWriter.write(projectedRow.replaceRow(row)); - rowCounter.add(1); - }); + while (rows.hasNext()) { + InternalRow row = rows.next(); + 
boolean hasNull = false; + for (InternalRow.FieldGetter getter : getters) { + if (getter.getFieldOrNull(row) == null) { + hasNull = true; + break; + } + } + if (hasNull) { + LOG.info( + "Null value in indexed columns, stopping shard [{}, {}].", + rowRange.from, + rowRange.to); + break; + } + multiWriter.write(projectedRow.replaceRow(row)); + rowCounter.add(1); + } } else { DataField indexField = indexFields.get(0); GlobalIndexSingletonWriter singleWriter = (GlobalIndexSingletonWriter) indexWriter; diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java index 437ad11737dc..ea2cda4a8b85 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java @@ -21,12 +21,14 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; import org.apache.paimon.fs.Path; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.IndexedSplit; import org.apache.paimon.io.DataFileMeta; import org.apache.paimon.manifest.ManifestEntry; import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.schema.SchemaManager; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.table.sink.CommitMessage; import org.apache.paimon.table.sink.CommitMessageSerializer; @@ -110,6 +112,13 @@ public List buildIndex( List entries = table.store().newScan().withPartitionFilter(partitionPredicate).plan().files(); + List indexColumns = + indexFields.stream().map(DataField::name).collect(Collectors.toList()); + SchemaManager 
schemaManager = new SchemaManager(table.fileIO(), table.location()); + long boundaryRowId = + GlobalIndexBuilderUtils.findMinNonIndexableRowId( + schemaManager, entries, indexColumns); + entries = GlobalIndexBuilderUtils.filterEntriesBefore(entries, boundaryRowId); // generate splits for each partition && shard Map> splits = split(table, entries, rowsPerShard); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java index aea421800410..3d751f4585ac 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java @@ -57,6 +57,12 @@ default List buildIndex( List indexFields, Options options) throws IOException { + if (indexFields.size() > 1) { + throw new UnsupportedOperationException( + String.format( + "Topology builder '%s' does not support multi-column index, got columns: %s", + identifier(), indexFields)); + } return buildIndex( spark, relation, diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index 89d63d0472ac..c5c5edf7489d 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -138,6 +138,7 @@ public InternalRow[] call(InternalRow args) { .map(String::trim) .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); + checkArgument(!indexColumns.isEmpty(), "At least one column required."); for (String col : 
indexColumns) { checkArgument( rowType.containsField(col), From 65a4b44b5faef2bed40cf5cab4bb89ac62e8f2f9 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 14:54:39 +0800 Subject: [PATCH 11/24] [globalindex] Reject duplicate index columns and document why column count is unlimited --- .../paimon/globalindex/GlobalIndexScanner.java | 17 ++++++----------- .../procedure/CreateGlobalIndexProcedure.java | 10 ++++++++++ .../procedure/CreateGlobalIndexProcedure.java | 11 +++++++++++ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index cffcbb34646a..68d3c76823e8 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -20,7 +20,6 @@ import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.Path; -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; @@ -92,15 +91,14 @@ public GlobalIndexScanner( String indexType = indexFile.indexType(); Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); - if (meta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID + if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID && meta.extraFieldIds() != null) { // Multi-column index: all participating fields share the same IndexFileMeta, // so looking up from any fieldId returns identical index files. 
List fieldIds = - Arrays.stream(meta.extraFieldIds()) - .boxed() - .collect(Collectors.toList()); - // Validate consistency: all files in the same group must have identical extraFieldIds + Arrays.stream(meta.extraFieldIds()).boxed().collect(Collectors.toList()); + // Validate consistency: all files in the same group must have identical + // extraFieldIds if (fieldToGroup.containsKey(fieldIds.get(0))) { List existingGroup = fieldToGroup.get(fieldIds.get(0)); checkArgument( @@ -132,11 +130,8 @@ public GlobalIndexScanner( if (group != null) { // Multi-column: resolve full field list List fields = - group.stream() - .map(rowType::getField) - .collect(Collectors.toList()); - return createReaders( - indexFileReader, multiColumnMetas.get(group), fields); + group.stream().map(rowType::getField).collect(Collectors.toList()); + return createReaders(indexFileReader, multiColumnMetas.get(group), fields); } else { // Single-column return createReaders( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index 1a6ba1d301f7..090ea02e8528 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -33,6 +33,7 @@ import org.apache.flink.table.procedure.ProcedureContext; import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -93,6 +94,15 @@ public String[] call( .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); checkArgument(!indexColumns.isEmpty(), "At least one column required."); + checkArgument( + indexColumns.size() == new HashSet<>(indexColumns).size(), + "Duplicate index columns are not allowed: %s", + indexColumns); + 
// No hard cap on the number of index columns: unlike row-store B-tree indexes + // (e.g. MySQL 16, PostgreSQL 32) whose limit comes from composing columns into a + // single key, the global index is built on per-type index frameworks. Whether + // multiple columns are supported, and any practical limit, is decided by each + // index type (single-column types reject multi-column via UnsupportedOperationException). for (String col : indexColumns) { checkArgument( rowType.containsField(col), diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index c5c5edf7489d..cc20c10af511 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -45,6 +45,7 @@ import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; @@ -139,6 +140,16 @@ public InternalRow[] call(InternalRow args) { .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); checkArgument(!indexColumns.isEmpty(), "At least one column required."); + checkArgument( + indexColumns.size() == new HashSet<>(indexColumns).size(), + "Duplicate index columns are not allowed: %s", + indexColumns); + // No hard cap on the number of index columns: unlike row-store B-tree + // indexes (e.g. MySQL 16, PostgreSQL 32) whose limit comes from composing + // columns into a single key, the global index is built on per-type index + // frameworks. Whether multiple columns are supported, and any practical + // limit, is decided by each index type (single-column types reject + // multi-column via UnsupportedOperationException). 
for (String col : indexColumns) { checkArgument( rowType.containsField(col), From 3500fe2cc772147141f7c4276aef22572bf7ee3b Mon Sep 17 00:00:00 2001 From: CrownChu Date: Fri, 29 May 2026 19:34:48 +0800 Subject: [PATCH 12/24] [globalindex] Address PR review: isMultiColumn helper, overlap detection, and display fix - Add GlobalIndexMeta.isMultiColumn() helper to replace scattered sentinel checks - Fix IndexManifestFileHandler overlap detection for multi-column indexes - Fix TableIndexesTable showing null for multi-column index field names - Replace all MULTI_COLUMN_INDEX_FIELD_ID == checks with isMultiColumn() --- .../globalindex/GlobalIndexScanner.java | 4 +-- .../apache/paimon/index/GlobalIndexMeta.java | 5 +++ .../manifest/IndexManifestFileHandler.java | 31 ++++++++++++++----- .../paimon/table/source/FullTextReadImpl.java | 3 +- .../paimon/table/source/VectorReadImpl.java | 3 +- .../table/system/TableIndexesTable.java | 17 ++++++++-- .../dataevolution/MergeIntoUpdateChecker.java | 4 +-- 7 files changed, 47 insertions(+), 20 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 68d3c76823e8..d31175666c71 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -52,7 +52,6 @@ import java.util.stream.Collectors; import static org.apache.paimon.CoreOptions.GLOBAL_INDEX_THREAD_NUM; -import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; import static org.apache.paimon.utils.Preconditions.checkArgument; @@ -91,8 +90,7 @@ public GlobalIndexScanner( String indexType = indexFile.indexType(); Range range = new 
Range(meta.rowRangeStart(), meta.rowRangeEnd()); - if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID - && meta.extraFieldIds() != null) { + if (meta.isMultiColumn() && meta.extraFieldIds() != null) { // Multi-column index: all participating fields share the same IndexFileMeta, // so looking up from any fieldId returns identical index files. List fieldIds = diff --git a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java index c468bbffb3aa..4bdb17c53817 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java @@ -18,6 +18,7 @@ package org.apache.paimon.index; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.DataField; import org.apache.paimon.types.DataTypes; @@ -78,6 +79,10 @@ public int indexFieldId() { return indexFieldId; } + public boolean isMultiColumn() { + return indexFieldId == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; + } + @Nullable public int[] extraFieldIds() { return extraFieldIds; diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java index 3621483197f7..87628290810c 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java @@ -28,6 +28,7 @@ import javax.annotation.Nullable; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -239,16 +240,32 @@ private void validateRetainedIndexFiles( for (IndexManifestEntry added : addedIndexFiles) { GlobalIndexMeta addedMeta = added.indexFile().globalIndexMeta(); - if (addedMeta == null - 
|| retainedMeta.indexFieldId() != addedMeta.indexFieldId() - || !Range.intersect( - retainedMeta.rowRangeStart(), - retainedMeta.rowRangeEnd(), - addedMeta.rowRangeStart(), - addedMeta.rowRangeEnd())) { + if (addedMeta == null) { continue; } + // Single-column: skip if different fieldId or no range overlap + if (!retainedMeta.isMultiColumn()) { + if (retainedMeta.indexFieldId() != addedMeta.indexFieldId() + || !Range.intersect( + retainedMeta.rowRangeStart(), + retainedMeta.rowRangeEnd(), + addedMeta.rowRangeStart(), + addedMeta.rowRangeEnd())) { + continue; + } + } else { + // Multi-column: skip if different column group or no range overlap + if (!Arrays.equals(retainedMeta.extraFieldIds(), addedMeta.extraFieldIds()) + || !Range.intersect( + retainedMeta.rowRangeStart(), + retainedMeta.rowRangeEnd(), + addedMeta.rowRangeStart(), + addedMeta.rowRangeEnd())) { + continue; + } + } + throw new IllegalStateException( String.format( "Trying to add global index file %s of type %s for index field %s" diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java index bf86cf9556ea..cfd161db6387 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java @@ -19,7 +19,6 @@ package org.apache.paimon.table.source; import org.apache.paimon.fs.FileIO; -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReadThreadPool; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -84,7 +83,7 @@ public GlobalIndexResult read(List splits) { String indexType = firstFile.indexType(); GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; - if (firstMeta.indexFieldId() == 
GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID) { + if (firstMeta.isMultiColumn()) { RowType rowType = table.rowType(); List fields = new ArrayList<>(); for (int id : firstMeta.extraFieldIds()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index 5a59830c1380..cefebe14c02b 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -19,7 +19,6 @@ package org.apache.paimon.table.source; import org.apache.paimon.fs.FileIO; -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReadThreadPool; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -93,7 +92,7 @@ public GlobalIndexResult read(List splits) { String indexType = firstFile.indexType(); GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; - if (firstMeta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID) { + if (firstMeta.isMultiColumn()) { RowType rowType = table.rowType(); List fields = new ArrayList<>(); for (int id : firstMeta.extraFieldIds()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java index 320257ce1057..f5e693d4a79d 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java @@ -234,9 +234,20 @@ private InternalRow toRow( GlobalIndexMeta globalMeta = indexManifestEntry.indexFile().globalIndexMeta(); String indexFieldName = null; if (globalMeta != null) { - try { - indexFieldName = 
logicalRowType.getField(globalMeta.indexFieldId()).name(); - } catch (RuntimeException ignored) { + if (globalMeta.isMultiColumn() && globalMeta.extraFieldIds() != null) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < globalMeta.extraFieldIds().length; i++) { + if (i > 0) { + sb.append(","); + } + sb.append(logicalRowType.getField(globalMeta.extraFieldIds()[i]).name()); + } + indexFieldName = sb.toString(); + } else { + try { + indexFieldName = logicalRowType.getField(globalMeta.indexFieldId()).name(); + } catch (RuntimeException ignored) { + } } } return GenericRow.of( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java index aed46f0078e8..b66e5d008307 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java @@ -48,8 +48,6 @@ import java.util.Set; import java.util.stream.Collectors; -import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; - /** * The checker for merge into update result. It will check each committable to see if some * global-indexed columns are updated. 
It will take some actions according to {@link @@ -168,7 +166,7 @@ private void checkUpdatedColumns() { private static Collection getIndexedFieldNames(GlobalIndexMeta meta, RowType rowType) { int fieldId = meta.indexFieldId(); - if (fieldId == MULTI_COLUMN_INDEX_FIELD_ID) { + if (meta.isMultiColumn()) { List names = new ArrayList<>(); for (int id : meta.extraFieldIds()) { names.add(rowType.getField(id).name()); From 0dc435a23542f1dd7200a6eadd26d7c5f5bd6c02 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Fri, 29 May 2026 20:04:44 +0800 Subject: [PATCH 13/24] [globalindex] Extract getIndexedFieldNames to GlobalIndexMeta and fix error message - Add GlobalIndexMeta.getIndexedFieldNames(RowType) to eliminate copy-pasted helper - Replace local getIndexedFieldNames in MergeIntoUpdateChecker (Flink) - Replace local getIndexedFieldNames in MergeIntoPaimonDataEvolutionTable (Spark common & 4.0) - Fix Spark CreateGlobalIndexProcedure error message to use indexColumns instead of column --- .../apache/paimon/index/GlobalIndexMeta.java | 19 +++++++++++++ .../dataevolution/MergeIntoUpdateChecker.java | 27 +++---------------- .../MergeIntoPaimonDataEvolutionTable.scala | 19 ++----------- .../procedure/CreateGlobalIndexProcedure.java | 2 +- .../MergeIntoPaimonDataEvolutionTable.scala | 19 ++----------- 5 files changed, 27 insertions(+), 59 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java index 4bdb17c53817..60aba56d21ae 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java @@ -28,7 +28,9 @@ import javax.annotation.Nullable; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; /** Schema for global index. 
*/ public class GlobalIndexMeta { @@ -92,4 +94,21 @@ public int[] extraFieldIds() { public byte[] indexMeta() { return indexMeta; } + + public List getIndexedFieldNames(RowType rowType) { + List names = new ArrayList<>(); + if (isMultiColumn()) { + for (int id : extraFieldIds) { + names.add(rowType.getField(id).name()); + } + } else { + names.add(rowType.getField(indexFieldId).name()); + if (extraFieldIds != null) { + for (int id : extraFieldIds) { + names.add(rowType.getField(id).name()); + } + } + } + return names; + } } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java index b66e5d008307..bdd0c0d49194 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java @@ -39,8 +39,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -102,8 +100,8 @@ private void checkUpdatedColumns() { GlobalIndexMeta globalIndexMeta = entry.indexFile().globalIndexMeta(); if (globalIndexMeta != null) { - Collection indexedNames = - getIndexedFieldNames(globalIndexMeta, rowType); + List indexedNames = + globalIndexMeta.getIndexedFieldNames(rowType); boolean overlaps = indexedNames.stream() .anyMatch(updatedColumns::contains); @@ -121,7 +119,7 @@ private void checkUpdatedColumns() { Set conflictedColumns = affectedEntries.stream() .map(file -> file.indexFile().globalIndexMeta()) - .flatMap(meta -> getIndexedFieldNames(meta, rowType).stream()) + .flatMap(meta -> meta.getIndexedFieldNames(rowType).stream()) .collect(Collectors.toSet()); throw new RuntimeException( @@ -163,23 +161,4 
@@ private void checkUpdatedColumns() { } } } - - private static Collection getIndexedFieldNames(GlobalIndexMeta meta, RowType rowType) { - int fieldId = meta.indexFieldId(); - if (meta.isMultiColumn()) { - List names = new ArrayList<>(); - for (int id : meta.extraFieldIds()) { - names.add(rowType.getField(id).name()); - } - return names; - } - List names = new ArrayList<>(); - names.add(rowType.getField(fieldId).name()); - if (meta.extraFieldIds() != null) { - for (int id : meta.extraFieldIds()) { - names.add(rowType.getField(id).name()); - } - } - return names; - } } diff --git a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala index e07a77ae8774..e5d7df44be74 100644 --- a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala +++ b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala @@ -21,7 +21,6 @@ package org.apache.paimon.spark.commands import org.apache.paimon.CoreOptions.GlobalIndexColumnUpdateAction import org.apache.paimon.data.BinaryRow import org.apache.paimon.format.blob.BlobFileFormat.isBlobFile -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID import org.apache.paimon.index.GlobalIndexMeta import org.apache.paimon.io.{CompactIncrement, DataIncrement} import org.apache.paimon.manifest.IndexManifestEntry @@ -588,27 +587,13 @@ case class MergeIntoPaimonDataEvolutionTable( return updateCommit } - def getIndexedFieldNames( - meta: GlobalIndexMeta, - rt: org.apache.paimon.types.RowType): Seq[String] = { - if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID) { - meta.extraFieldIds().map(id => rt.getField(id).name()).toSeq - } else { - val names = ArrayBuffer(rt.getField(meta.indexFieldId()).name()) - 
if (meta.extraFieldIds() != null) { - meta.extraFieldIds().foreach(id => names += rt.getField(id).name()) - } - names.toSeq - } - } - val filter: org.apache.paimon.utils.Filter[IndexManifestEntry] = (entry: IndexManifestEntry) => { val globalIndexMeta = entry.indexFile().globalIndexMeta() if (globalIndexMeta == null) { false } else { - val indexedNames = getIndexedFieldNames(globalIndexMeta, rowType) + val indexedNames = globalIndexMeta.getIndexedFieldNames(rowType).asScala affectedParts.contains(entry.partition()) && updateColumns.exists( col => indexedNames.contains(col.name)) } @@ -627,7 +612,7 @@ case class MergeIntoPaimonDataEvolutionTable( case GlobalIndexColumnUpdateAction.THROW_ERROR => val updatedColNames = updateColumns.map(_.name) val conflicted = affectedIndexEntries - .flatMap(e => getIndexedFieldNames(e.indexFile().globalIndexMeta(), rowType)) + .flatMap(e => e.indexFile().globalIndexMeta().getIndexedFieldNames(rowType).asScala) .toSet throw new RuntimeException( s"""MergeInto: update columns contain globally indexed columns, not supported now. 
diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index cc20c10af511..2596300765fb 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -208,7 +208,7 @@ public InternalRow[] call(InternalRow args) { throw new RuntimeException( String.format( "Failed to create %s index for columns '%s' on table '%s'.", - indexType, column, tableIdent), + indexType, indexColumns, tableIdent), e); } }); diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala index 38bf71c3310f..99990637c351 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala @@ -21,7 +21,6 @@ package org.apache.paimon.spark.commands import org.apache.paimon.CoreOptions.GlobalIndexColumnUpdateAction import org.apache.paimon.data.BinaryRow import org.apache.paimon.format.blob.BlobFileFormat.isBlobFile -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID import org.apache.paimon.index.GlobalIndexMeta import org.apache.paimon.io.{CompactIncrement, DataIncrement} import org.apache.paimon.manifest.IndexManifestEntry @@ -596,7 +595,7 @@ case class MergeIntoPaimonDataEvolutionTable( if (globalIndexMeta == null) { false } else { - val indexedNames = getIndexedFieldNames(globalIndexMeta, rowType) + val 
indexedNames = globalIndexMeta.getIndexedFieldNames(rowType).asScala affectedParts.contains(entry.partition()) && updateColumns.exists( col => indexedNames.contains(col.name)) } @@ -615,7 +614,7 @@ case class MergeIntoPaimonDataEvolutionTable( case GlobalIndexColumnUpdateAction.THROW_ERROR => val updatedColNames = updateColumns.map(_.name) val conflicted = affectedIndexEntries - .flatMap(e => getIndexedFieldNames(e.indexFile().globalIndexMeta(), rowType)) + .flatMap(e => e.indexFile().globalIndexMeta().getIndexedFieldNames(rowType).asScala) .toSet throw new RuntimeException( s"""MergeInto: update columns contain globally indexed columns, not supported now. @@ -639,20 +638,6 @@ case class MergeIntoPaimonDataEvolutionTable( } } - private def getIndexedFieldNames( - meta: GlobalIndexMeta, - rowType: org.apache.paimon.types.RowType): Seq[String] = { - if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID) { - meta.extraFieldIds().map(id => rowType.getField(id).name()).toSeq - } else { - val names = ArrayBuffer(rowType.getField(meta.indexFieldId()).name()) - if (meta.extraFieldIds() != null) { - meta.extraFieldIds().foreach(id => names += rowType.getField(id).name()) - } - names.toSeq - } - } - private def findRelatedFirstRowIds( dataset: Dataset[Row], sparkSession: SparkSession, From 7e0e57c867a08c1763975e1de30ea64ae7a397e8 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Mon, 1 Jun 2026 20:05:47 +0800 Subject: [PATCH 14/24] [globalindex] Fix compilation error: move indexColumns out of try block indexColumns was declared inside the try block but referenced in the catch block's error message, which is out of scope. Hoist the parsing before the try so the catch can access it. 
--- .../spark/procedure/CreateGlobalIndexProcedure.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index 2596300765fb..2cb5525d25c6 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -122,6 +122,11 @@ public InternalRow[] call(InternalRow args) { return modifySparkTable( tableIdent, sparkTable -> { + List<String> indexColumns = + Arrays.stream(column.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); try { org.apache.paimon.table.Table t = sparkTable.getTable(); checkArgument( @@ -134,11 +139,6 @@ public InternalRow[] call(InternalRow args) { tableIdent); RowType rowType = table.rowType(); - List<String> indexColumns = - Arrays.stream(column.split(",")) - .map(String::trim) - .filter(s -> !s.isEmpty()) - .collect(Collectors.toList()); checkArgument(!indexColumns.isEmpty(), "At least one column required."); checkArgument( indexColumns.size() == new HashSet<>(indexColumns).size(), From 438d69c323aa5ae3fdabdb3bd8b7b64ae1ddae0c Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 2 Jun 2026 10:52:35 +0800 Subject: [PATCH 15/24] [globalindex] Keep building through null values instead of ending the shard Breaking out of the shard loop on the first null indexed value dropped all later rows in the shard from the index and broke row-id alignment. Pass every row through the writer instead: a null field advances the logical row id without indexing a value, so later non-null rows are still indexed.
- Flink single-column: restore null pass-through (was a regression) - Flink/Spark multi-column: pass the projected row through; each index type decides how to handle null fields --- .../globalindex/GenericIndexTopoBuilder.java | 29 ++++--------------- .../DefaultGlobalIndexBuilder.java | 26 +++-------------- 2 files changed, 10 insertions(+), 45 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 99a551a9e4d9..4bba6fcc830b 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -658,33 +658,16 @@ public void processElement(StreamRecord element) throws Exception { // Only write rows within this shard's range if (currentRowId >= task.shardRange.from) { if (multiColumn) { - boolean hasNull = false; - for (InternalRow.FieldGetter getter : indexFieldGetters) { - if (getter.getFieldOrNull(row) == null) { - hasNull = true; - break; - } - } - if (hasNull) { - LOG.info( - "Null value in indexed columns at rowId={}, stopping shard [{}, {}].", - currentRowId, - task.shardRange.from, - task.shardRange.to); - break; - } + // Pass the row through, including null fields; each index type + // decides how to handle nulls. A null field advances the logical + // row id without indexing a value, so it must not end the shard: + // later non-null rows still need to be indexed and row-id alignment + // must be preserved. ((GlobalIndexMultiColumnWriter) indexWriter) .write(writerProjection.replaceRow(row)); } else { + // A null value advances the logical row id without indexing. 
Object fieldData = indexFieldGetters[0].getFieldOrNull(row); - if (fieldData == null) { - LOG.info( - "Null value at rowId={}, stopping shard [{}, {}].", - currentRowId, - task.shardRange.from, - task.shardRange.to); - break; - } ((GlobalIndexSingletonWriter) indexWriter).write(fieldData); } rowsSeen++; diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index a64045633c6b..713965b01a32 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -38,9 +38,6 @@ import org.apache.paimon.utils.ProjectedRow; import org.apache.paimon.utils.Range; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.IOException; import java.io.Serializable; import java.util.Collections; @@ -52,7 +49,6 @@ /** Default global index builder. 
*/ public class DefaultGlobalIndexBuilder implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(DefaultGlobalIndexBuilder.class); private static final long serialVersionUID = 1L; private final FileStoreTable table; @@ -133,31 +129,17 @@ private List writePaimonRows( GlobalIndexMultiColumnWriter multiWriter = (GlobalIndexMultiColumnWriter) indexWriter; int[] projection = new int[indexFields.size()]; - InternalRow.FieldGetter[] getters = new InternalRow.FieldGetter[indexFields.size()]; for (int i = 0; i < indexFields.size(); i++) { DataField field = indexFields.get(i); projection[i] = readType.getFieldIndex(field.name()); - getters[i] = - InternalRow.createFieldGetter( - field.type(), readType.getFieldIndex(field.name())); } ProjectedRow projectedRow = ProjectedRow.from(projection); while (rows.hasNext()) { InternalRow row = rows.next(); - boolean hasNull = false; - for (InternalRow.FieldGetter getter : getters) { - if (getter.getFieldOrNull(row) == null) { - hasNull = true; - break; - } - } - if (hasNull) { - LOG.info( - "Null value in indexed columns, stopping shard [{}, {}].", - rowRange.from, - rowRange.to); - break; - } + // Pass the row through, including null fields; each index type decides how to + // handle nulls. A null field advances the logical row id without indexing a + // value, so it must not end the shard: later non-null rows still need to be + // indexed and row-id alignment must be preserved. multiWriter.write(projectedRow.replaceRow(row)); rowCounter.add(1); } From cf4ae3bc42eda1e60206d7f57e09c6c44dc103e0 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 2 Jun 2026 10:52:42 +0800 Subject: [PATCH 16/24] [globalindex] Let a field participate in multiple multi-column index groups The scanner mapped each field id to a single multi-column group, so a field shared by several multi-column indexes (e.g. (a,b) and (a,c)) threw "Inconsistent extraFieldIds" or silently dropped readers. Model fieldId -> list of groups instead. 
For evaluation, every index covering a single field returns the same matching row ids, so pick one index rather than running them all: prefer the single-column index, otherwise fall back to one multi-column group. --- .../globalindex/GlobalIndexScanner.java | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index d31175666c71..adffa88152eb 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -54,7 +54,6 @@ import static org.apache.paimon.CoreOptions.GLOBAL_INDEX_THREAD_NUM; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; -import static org.apache.paimon.utils.Preconditions.checkArgument; import static org.apache.paimon.utils.Preconditions.checkNotNull; /** Scanner for shard-based global indexes. */ @@ -82,8 +81,9 @@ public GlobalIndexScanner( // Multi-column indexes: fieldIds -> indexType -> range -> files Map, Map>>> multiColumnMetas = new HashMap<>(); - // Reverse lookup: fieldId -> its multi-column group - Map> fieldToGroup = new HashMap<>(); + // Reverse lookup: fieldId -> all multi-column groups it participates in. A field can + // belong to several multi-column indexes (e.g. (a,b) and (a,c)) at the same time. 
+ Map>> fieldToGroups = new HashMap<>(); for (IndexFileMeta indexFile : indexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); @@ -91,25 +91,22 @@ public GlobalIndexScanner( Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); if (meta.isMultiColumn() && meta.extraFieldIds() != null) { - // Multi-column index: all participating fields share the same IndexFileMeta, - // so looking up from any fieldId returns identical index files. + // Multi-column index: all participating fields share the same IndexFileMeta. + // Multiple index files belonging to the same group are aggregated under the same + // multiColumnMetas key, and each participating field records this group. List fieldIds = Arrays.stream(meta.extraFieldIds()).boxed().collect(Collectors.toList()); - // Validate consistency: all files in the same group must have identical - // extraFieldIds - if (fieldToGroup.containsKey(fieldIds.get(0))) { - List existingGroup = fieldToGroup.get(fieldIds.get(0)); - checkArgument( - existingGroup.equals(fieldIds), - "Inconsistent extraFieldIds across index files."); - } multiColumnMetas .computeIfAbsent(fieldIds, k -> new HashMap<>()) .computeIfAbsent(indexType, k -> new HashMap<>()) .computeIfAbsent(range, k -> new ArrayList<>()) .add(indexFile); for (int id : fieldIds) { - fieldToGroup.put(id, fieldIds); + List> groups = + fieldToGroups.computeIfAbsent(id, k -> new ArrayList<>()); + if (!groups.contains(fieldIds)) { + groups.add(fieldIds); + } } } else { // Single-column index @@ -124,19 +121,28 @@ public GlobalIndexScanner( IntFunction> readersFunction = fId -> { - List group = fieldToGroup.get(fId); - if (group != null) { - // Multi-column: resolve full field list - List fields = - group.stream().map(rowType::getField).collect(Collectors.toList()); - return createReaders(indexFileReader, multiColumnMetas.get(group), fields); - } else { - // Single-column + // A filter on a single field can be served by any index covering that field, + 
// and every such index returns the same matching row ids. So pick ONE index + // instead of running them all: prefer the single-column index (purpose-built + // for this field and always able to serve the predicate); otherwise fall back + // to one of the multi-column groups this field participates in. + Map>> singleColumn = indexMetas.get(fId); + if (singleColumn != null) { return createReaders( indexFileReader, - indexMetas.get(fId), + singleColumn, Collections.singletonList(rowType.getField(fId))); } + List> groups = fieldToGroups.get(fId); + if (groups != null && !groups.isEmpty()) { + // No single-column index for this field: pick one of the multi-column + // groups it belongs to to accelerate the single-column filter. + List group = groups.get(0); + List fields = + group.stream().map(rowType::getField).collect(Collectors.toList()); + return createReaders(indexFileReader, multiColumnMetas.get(group), fields); + } + return Collections.emptyList(); }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } From 81a96e3036312dd9cd3f53acc67eb843e847f7c6 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 3 Jun 2026 02:14:34 +0800 Subject: [PATCH 17/24] [globalindex] Make indexFieldId the primary column for multi-column indexes Previously a multi-column index stored indexFieldId=-1 and put all field ids in extraFieldIds, treating columns as parallel. Switch to a primary-column model: indexFieldId is always the first (primary) column and extraFieldIds holds the remaining columns. A primary column can own at most one index. 
- GlobalIndexMeta: isMultiColumn() based on extraFieldIds; add getIndexedFieldIds() and getIndexedFields(); unify getIndexedFieldNames() - GlobalIndexBuilderUtils: drop MULTI_COLUMN_INDEX_FIELD_ID; first column becomes the primary, rest become extraFieldIds - GlobalIndexScanner: key indexes by primary field id; reject conflicting indexes that share a primary with different columns - IndexManifestFileHandler: reject added index files sharing a primary with an existing one over an overlapping row range - FullText/VectorReadImpl: resolve the full column list via getIndexedFields() - TableIndexesTable: show all indexed column names; log when names cannot resolve --- .../globalindex/GlobalIndexBuilderUtils.java | 26 +++--- .../globalindex/GlobalIndexScanner.java | 82 ++++++------------- .../apache/paimon/index/GlobalIndexMeta.java | 41 +++++++--- .../manifest/IndexManifestFileHandler.java | 29 ++----- .../paimon/table/source/FullTextReadImpl.java | 7 +- .../paimon/table/source/VectorReadImpl.java | 7 +- .../table/system/TableIndexesTable.java | 25 +++--- .../GlobalIndexBuilderUtilsTest.java | 13 +-- 8 files changed, 95 insertions(+), 135 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 497d50ece6e9..62b13833b393 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -47,8 +47,6 @@ public class GlobalIndexBuilderUtils { private static final Logger LOG = LoggerFactory.getLogger(GlobalIndexBuilderUtils.class); - public static final int MULTI_COLUMN_INDEX_FIELD_ID = -1; - public static List toIndexFileMetas( FileIO fileIO, IndexPathFactory indexPathFactory, @@ -62,6 +60,12 @@ public static List toIndexFileMetas( fileIO, indexPathFactory, options, range, indexFieldId, null, indexType, 
entries); } + /** + * Builds the index file metas. The first column in {@code fields} is treated as the primary + * index column (e.g. the first column in {@code CREATE ... INDEX ON (a, b, c)}) and is stored + * as {@code indexFieldId}; the remaining columns go into {@code extraFieldIds}. Callers must + * pass {@code fields} in the intended column order. + */ public static List toIndexFileMetas( FileIO fileIO, IndexPathFactory indexPathFactory, @@ -71,15 +75,15 @@ public static List toIndexFileMetas( String indexType, List entries) throws IOException { - int indexFieldId; - int[] extraFieldIds; - if (fields.size() > 1) { - indexFieldId = MULTI_COLUMN_INDEX_FIELD_ID; - extraFieldIds = fields.stream().mapToInt(DataField::id).toArray(); - } else { - indexFieldId = fields.get(0).id(); - extraFieldIds = null; - } + // The first column is the primary index column and is stored as indexFieldId; the + // remaining columns (if any) go into extraFieldIds. + int indexFieldId = fields.get(0).id(); + int[] extraFieldIds = + fields.size() > 1 + ? 
fields.subList(1, fields.size()).stream() + .mapToInt(DataField::id) + .toArray() + : null; return toIndexFileMetas( fileIO, indexPathFactory, diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index adffa88152eb..960a56d3b97f 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -37,7 +37,6 @@ import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -54,6 +53,7 @@ import static org.apache.paimon.CoreOptions.GLOBAL_INDEX_THREAD_NUM; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; +import static org.apache.paimon.utils.Preconditions.checkArgument; import static org.apache.paimon.utils.Preconditions.checkNotNull; /** Scanner for shard-based global indexes. */ @@ -75,74 +75,42 @@ public GlobalIndexScanner( GlobalIndexReadThreadPool.getExecutorService(options.get(GLOBAL_INDEX_THREAD_NUM)); this.indexPathFactory = indexPathFactory; GlobalIndexFileReader indexFileReader = meta -> fileIO.newInputStream(meta.filePath()); - - // Single-column indexes: fieldId -> indexType -> range -> files Map>>> indexMetas = new HashMap<>(); - // Multi-column indexes: fieldIds -> indexType -> range -> files - Map, Map>>> multiColumnMetas = - new HashMap<>(); - // Reverse lookup: fieldId -> all multi-column groups it participates in. A field can - // belong to several multi-column indexes (e.g. (a,b) and (a,c)) at the same time. 
- Map>> fieldToGroups = new HashMap<>(); - + Map> fieldIdToIndexFields = new HashMap<>(); for (IndexFileMeta indexFile : indexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); String indexType = indexFile.indexType(); Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); - - if (meta.isMultiColumn() && meta.extraFieldIds() != null) { - // Multi-column index: all participating fields share the same IndexFileMeta. - // Multiple index files belonging to the same group are aggregated under the same - // multiColumnMetas key, and each participating field records this group. - List fieldIds = - Arrays.stream(meta.extraFieldIds()).boxed().collect(Collectors.toList()); - multiColumnMetas - .computeIfAbsent(fieldIds, k -> new HashMap<>()) - .computeIfAbsent(indexType, k -> new HashMap<>()) - .computeIfAbsent(range, k -> new ArrayList<>()) - .add(indexFile); - for (int id : fieldIds) { - List> groups = - fieldToGroups.computeIfAbsent(id, k -> new ArrayList<>()); - if (!groups.contains(fieldIds)) { - groups.add(fieldIds); - } - } + int fieldId = meta.indexFieldId(); + List indexFields = meta.getIndexedFieldIds(); + List existing = fieldIdToIndexFields.get(fieldId); + if (existing == null) { + fieldIdToIndexFields.put(fieldId, indexFields); } else { - // Single-column index - int fieldId = meta.indexFieldId(); - indexMetas - .computeIfAbsent(fieldId, k -> new HashMap<>()) - .computeIfAbsent(indexType, k -> new HashMap<>()) - .computeIfAbsent(range, k -> new ArrayList<>()) - .add(indexFile); + checkArgument( + existing.equals(indexFields), + "Primary field %s owns multiple indexes with different columns %s and %s; " + + "a primary column can own at most one index.", + fieldId, + existing, + indexFields); } + indexMetas + .computeIfAbsent(fieldId, k -> new HashMap<>()) + .computeIfAbsent(indexType, k -> new HashMap<>()) + .computeIfAbsent(range, k -> new ArrayList<>()) + .add(indexFile); } IntFunction> readersFunction = fId -> { - // A 
filter on a single field can be served by any index covering that field, - // and every such index returns the same matching row ids. So pick ONE index - // instead of running them all: prefer the single-column index (purpose-built - // for this field and always able to serve the predicate); otherwise fall back - // to one of the multi-column groups this field participates in. - Map>> singleColumn = indexMetas.get(fId); - if (singleColumn != null) { - return createReaders( - indexFileReader, - singleColumn, - Collections.singletonList(rowType.getField(fId))); - } - List> groups = fieldToGroups.get(fId); - if (groups != null && !groups.isEmpty()) { - // No single-column index for this field: pick one of the multi-column - // groups it belongs to to accelerate the single-column filter. - List group = groups.get(0); - List fields = - group.stream().map(rowType::getField).collect(Collectors.toList()); - return createReaders(indexFileReader, multiColumnMetas.get(group), fields); + List group = fieldIdToIndexFields.get(fId); + if (group == null) { + return Collections.emptyList(); } - return Collections.emptyList(); + List fields = + group.stream().map(rowType::getField).collect(Collectors.toList()); + return createReaders(indexFileReader, indexMetas.get(fId), fields); }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } diff --git a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java index 60aba56d21ae..a987e994f9ea 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java @@ -18,7 +18,6 @@ package org.apache.paimon.index; -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.DataField; import org.apache.paimon.types.DataTypes; @@ -81,8 +80,13 @@ public int 
indexFieldId() { return indexFieldId; } + /** + * Whether this index covers more than one column. {@link #indexFieldId} is always the primary + * column; {@link #extraFieldIds} holds the remaining columns and is null/empty for a + * single-column index. + */ public boolean isMultiColumn() { - return indexFieldId == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; + return extraFieldIds != null && extraFieldIds.length > 0; } @Nullable @@ -95,20 +99,31 @@ public byte[] indexMeta() { return indexMeta; } - public List getIndexedFieldNames(RowType rowType) { - List names = new ArrayList<>(); - if (isMultiColumn()) { + /** All indexed field ids in order: the primary {@link #indexFieldId} followed by the rest. */ + public List getIndexedFieldIds() { + List ids = new ArrayList<>(); + ids.add(indexFieldId); + if (extraFieldIds != null) { for (int id : extraFieldIds) { - names.add(rowType.getField(id).name()); - } - } else { - names.add(rowType.getField(indexFieldId).name()); - if (extraFieldIds != null) { - for (int id : extraFieldIds) { - names.add(rowType.getField(id).name()); - } + ids.add(id); } } + return ids; + } + + public List getIndexedFields(RowType rowType) { + List fields = new ArrayList<>(); + for (int id : getIndexedFieldIds()) { + fields.add(rowType.getField(id)); + } + return fields; + } + + public List getIndexedFieldNames(RowType rowType) { + List names = new ArrayList<>(); + for (int id : getIndexedFieldIds()) { + names.add(rowType.getField(id).name()); + } return names; } } diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java index 87628290810c..3dbd01d3a645 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java @@ -28,7 +28,6 @@ import javax.annotation.Nullable; import java.util.ArrayList; 
-import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -243,27 +242,13 @@ private void validateRetainedIndexFiles( if (addedMeta == null) { continue; } - - // Single-column: skip if different fieldId or no range overlap - if (!retainedMeta.isMultiColumn()) { - if (retainedMeta.indexFieldId() != addedMeta.indexFieldId() - || !Range.intersect( - retainedMeta.rowRangeStart(), - retainedMeta.rowRangeEnd(), - addedMeta.rowRangeStart(), - addedMeta.rowRangeEnd())) { - continue; - } - } else { - // Multi-column: skip if different column group or no range overlap - if (!Arrays.equals(retainedMeta.extraFieldIds(), addedMeta.extraFieldIds()) - || !Range.intersect( - retainedMeta.rowRangeStart(), - retainedMeta.rowRangeEnd(), - addedMeta.rowRangeStart(), - addedMeta.rowRangeEnd())) { - continue; - } + if (retainedMeta.indexFieldId() != addedMeta.indexFieldId() + || !Range.intersect( + retainedMeta.rowRangeStart(), + retainedMeta.rowRangeEnd(), + addedMeta.rowRangeStart(), + addedMeta.rowRangeEnd())) { + continue; } throw new IllegalStateException( diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java index cfd161db6387..d2e5462921c4 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java @@ -34,7 +34,6 @@ import org.apache.paimon.predicate.FullTextSearch; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; -import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; import java.util.ArrayList; @@ -84,11 +83,7 @@ public GlobalIndexResult read(List splits) { GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; if (firstMeta.isMultiColumn()) { - RowType rowType = table.rowType(); - List 
fields = new ArrayList<>(); - for (int id : firstMeta.extraFieldIds()) { - fields.add(rowType.getField(id)); - } + List fields = firstMeta.getIndexedFields(table.rowType()); globalIndexer = GlobalIndexerFactoryUtils.load(indexType) .create(fields, table.coreOptions().toConfiguration()); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index cefebe14c02b..de0a7a278f83 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -36,7 +36,6 @@ import org.apache.paimon.predicate.VectorSearch; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; -import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; import org.apache.paimon.utils.RoaringNavigableMap64; @@ -93,11 +92,7 @@ public GlobalIndexResult read(List splits) { GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; if (firstMeta.isMultiColumn()) { - RowType rowType = table.rowType(); - List fields = new ArrayList<>(); - for (int id : firstMeta.extraFieldIds()) { - fields.add(rowType.getField(id)); - } + List fields = firstMeta.getIndexedFields(table.rowType()); globalIndexer = GlobalIndexerFactoryUtils.load(indexType) .create(fields, table.coreOptions().toConfiguration()); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java index f5e693d4a79d..9ad88e977b3d 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java @@ -234,20 +234,17 @@ private InternalRow toRow( GlobalIndexMeta globalMeta = indexManifestEntry.indexFile().globalIndexMeta(); 
String indexFieldName = null; if (globalMeta != null) { - if (globalMeta.isMultiColumn() && globalMeta.extraFieldIds() != null) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < globalMeta.extraFieldIds().length; i++) { - if (i > 0) { - sb.append(","); - } - sb.append(logicalRowType.getField(globalMeta.extraFieldIds()[i]).name()); - } - indexFieldName = sb.toString(); - } else { - try { - indexFieldName = logicalRowType.getField(globalMeta.indexFieldId()).name(); - } catch (RuntimeException ignored) { - } + try { + indexFieldName = + String.join(",", globalMeta.getIndexedFieldNames(logicalRowType)); + } catch (RuntimeException e) { + // Indexed columns may no longer exist in the current schema (e.g. dropped via + // ALTER TABLE); leave the name empty instead of failing the listing. + LOG.debug( + "Failed to resolve indexed field names for index file {} (primary field {}).", + indexManifestEntry.indexFile().fileName(), + globalMeta.indexFieldId(), + e); } } return GenericRow.of( diff --git a/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java b/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java index 703c01c69633..67852ae925ff 100644 --- a/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java @@ -77,7 +77,7 @@ public boolean isExternalPath() { coreOptions = new CoreOptions(new Options().toMap()); } - // Test: 2 columns (title + vec), indexFieldId=-1, all field ids stored in extraFieldIds + // Test: 2 columns (title + vec), primary column title is indexFieldId, rest in extraFieldIds @Test void testToIndexFileMetasMultiColumn() throws IOException { DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); @@ -92,8 +92,8 @@ void testToIndexFileMetasMultiColumn() throws IOException { fileIO, indexPathFactory, coreOptions, 
range, fields, "test-type", entries); assertThat(metas).hasSize(1); - assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(-1); - assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {1, 2}); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {2}); assertThat(metas.get(0).globalIndexMeta().rowRangeStart()).isEqualTo(0); assertThat(metas.get(0).globalIndexMeta().rowRangeEnd()).isEqualTo(99); } @@ -117,7 +117,8 @@ void testToIndexFileMetasSingleColumn() throws IOException { assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isNull(); } - // Test: 3 columns (title + vec + id), indexFieldId=-1, all field ids in extraFieldIds + // Test: 3 columns (title + vec + id), primary column title is indexFieldId, rest in + // extraFieldIds @Test void testToIndexFileMetasThreeColumns() throws IOException { DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); @@ -133,8 +134,8 @@ void testToIndexFileMetasThreeColumns() throws IOException { fileIO, indexPathFactory, coreOptions, range, fields, "test-type", entries); assertThat(metas).hasSize(1); - assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(-1); - assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {1, 2, 3}); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {2, 3}); } private List createDummyResultEntries() throws IOException { From d618873c612bf2093857468a48e93a065337774f Mon Sep 17 00:00:00 2001 From: CrownChu Date: Thu, 4 Jun 2026 01:25:08 +0800 Subject: [PATCH 18/24] [globalindex] Refine scanner routing, multi-column writer rowId, and overlap checks - GlobalIndexScanner: split single-/multi-column lookups (IndexMetaFileGroup), single-column index takes priority, fall back to the first 
multi-column index that has the field as an extra; reject a primary owning multiple indexes - GlobalIndexMultiColumnWriter.write now takes the shard-relative row id; the builders pass projected index columns plus that id - DefaultGlobalIndexBuilder (Spark): multi-column skips rows outside the shard range so the relative row id stays valid for boundary-spanning files - IndexManifestFileHandler: same-primary indexes with different columns always conflict, same columns only conflict on overlapping ranges - FullText/VectorScanImpl: match indexes by their primary column --- .../GlobalIndexMultiColumnWriter.java | 12 ++-- .../globalindex/GlobalIndexScanner.java | 69 +++++++++++++------ .../manifest/IndexManifestFileHandler.java | 19 ++--- .../paimon/table/source/FullTextScanImpl.java | 12 +--- .../paimon/table/source/VectorScanImpl.java | 30 ++------ .../globalindex/GenericIndexTopoBuilder.java | 9 +-- .../procedure/CreateGlobalIndexProcedure.java | 5 -- .../DefaultGlobalIndexBuilder.java | 12 ++-- 8 files changed, 82 insertions(+), 86 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java index a6ded78d33fd..58a847b64ca8 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java @@ -26,9 +26,13 @@ public interface GlobalIndexMultiColumnWriter extends GlobalIndexWriter { /** - * Write a projected row containing all indexed columns for one record. The row layout matches - * the fields order passed to {@link GlobalIndexerFactory#create(java.util.List, - * org.apache.paimon.options.Options)}. + * Write one record's indexed columns at the given relative row id. 
+ * + * @param rowId the record's row id relative to the current shard (0 to rowCnt - 1); a null row + * still advances the row id without indexing a value + * @param row a projected row containing only the indexed columns, whose layout matches the + * fields order passed to {@link GlobalIndexerFactory#create(java.util.List, + * org.apache.paimon.options.Options)} */ - void write(@Nullable InternalRow row); + void write(long rowId, @Nullable InternalRow row); } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 960a56d3b97f..f7264b1eb4dc 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -75,46 +75,73 @@ public GlobalIndexScanner( GlobalIndexReadThreadPool.getExecutorService(options.get(GLOBAL_INDEX_THREAD_NUM)); this.indexPathFactory = indexPathFactory; GlobalIndexFileReader indexFileReader = meta -> fileIO.newInputStream(meta.filePath()); - Map>>> indexMetas = new HashMap<>(); - Map> fieldIdToIndexFields = new HashMap<>(); + Map indexMetas = new HashMap<>(); + Map> extraIndexMetas = new HashMap<>(); for (IndexFileMeta indexFile : indexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); String indexType = indexFile.indexType(); Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); - int fieldId = meta.indexFieldId(); - List indexFields = meta.getIndexedFieldIds(); - List existing = fieldIdToIndexFields.get(fieldId); - if (existing == null) { - fieldIdToIndexFields.put(fieldId, indexFields); + int indexFieldId = meta.indexFieldId(); + List fieldIds = meta.getIndexedFieldIds(); + IndexMetaFileGroup group = indexMetas.get(indexFieldId); + if (group == null) { + group = new IndexMetaFileGroup(indexFieldId, fieldIds); + indexMetas.put(indexFieldId, group); + if 
(meta.extraFieldIds() != null) { + for (int extra : meta.extraFieldIds()) { + extraIndexMetas.computeIfAbsent(extra, k -> new ArrayList<>()).add(group); + } + } } else { checkArgument( - existing.equals(indexFields), + group.fieldIds.equals(fieldIds), "Primary field %s owns multiple indexes with different columns %s and %s; " + "a primary column can own at most one index.", - fieldId, - existing, - indexFields); + indexFieldId, + group.fieldIds, + fieldIds); } - indexMetas - .computeIfAbsent(fieldId, k -> new HashMap<>()) - .computeIfAbsent(indexType, k -> new HashMap<>()) - .computeIfAbsent(range, k -> new ArrayList<>()) - .add(indexFile); + group.addFile(indexType, range, indexFile); } IntFunction> readersFunction = fId -> { - List group = fieldIdToIndexFields.get(fId); + IndexMetaFileGroup group = indexMetas.get(fId); if (group == null) { - return Collections.emptyList(); + List extraGroups = extraIndexMetas.get(fId); + if (extraGroups == null || extraGroups.isEmpty()) { + return Collections.emptyList(); + } + group = extraGroups.get(0); } List fields = - group.stream().map(rowType::getField).collect(Collectors.toList()); - return createReaders(indexFileReader, indexMetas.get(fId), fields); + group.fieldIds.stream() + .map(rowType::getField) + .collect(Collectors.toList()); + return createReaders(indexFileReader, group.metas, fields); }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } + /** All index files of one global index (single- or multi-column), grouped for reading. 
*/ + private static class IndexMetaFileGroup { + + private final int indexFieldId; + private final List fieldIds; + private final Map>> metas = new HashMap<>(); + + IndexMetaFileGroup(int indexFieldId, List fieldIds) { + this.indexFieldId = indexFieldId; + this.fieldIds = fieldIds; + } + + void addFile(String indexType, Range range, IndexFileMeta indexFile) { + metas.computeIfAbsent(indexType, k -> new HashMap<>()) + .computeIfAbsent(range, k -> new ArrayList<>()) + .add(indexFile); + } + } + public static Optional create( FileStoreTable table, Collection indexFiles) { if (indexFiles.isEmpty()) { @@ -145,6 +172,8 @@ public static Optional create( if (globalIndex == null) { return false; } + // Collect indexes whose primary column is filtered, and also multi-column + // indexes that have a filtered column as an extra (used as a fallback). if (filterFieldIds.contains(globalIndex.indexFieldId())) { return true; } diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java index 3dbd01d3a645..f99278085550 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java @@ -28,6 +28,7 @@ import javax.annotation.Nullable; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -239,15 +240,15 @@ private void validateRetainedIndexFiles( for (IndexManifestEntry added : addedIndexFiles) { GlobalIndexMeta addedMeta = added.indexFile().globalIndexMeta(); - if (addedMeta == null) { - continue; - } - if (retainedMeta.indexFieldId() != addedMeta.indexFieldId() - || !Range.intersect( - retainedMeta.rowRangeStart(), - retainedMeta.rowRangeEnd(), - addedMeta.rowRangeStart(), - addedMeta.rowRangeEnd())) { + if (addedMeta == null + || retainedMeta.indexFieldId() != 
addedMeta.indexFieldId() + || (Arrays.equals( + retainedMeta.extraFieldIds(), addedMeta.extraFieldIds()) + && !Range.intersect( + retainedMeta.rowRangeStart(), + retainedMeta.rowRangeEnd(), + addedMeta.rowRangeStart(), + addedMeta.rowRangeEnd()))) { continue; } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java index 6230b31336a3..cc77d9121ad5 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java @@ -61,17 +61,7 @@ public Plan scan() { if (globalIndex == null) { return false; } - if (textColumn.id() == globalIndex.indexFieldId()) { - return true; - } - if (globalIndex.extraFieldIds() != null) { - for (int id : globalIndex.extraFieldIds()) { - if (textColumn.id() == id) { - return true; - } - } - } - return false; + return textColumn.id() == globalIndex.indexFieldId(); }; List allIndexFiles = diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java index 1ff3f82852f6..5098cc959129 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java @@ -82,17 +82,7 @@ public Plan scan() { return false; } int fieldId = globalIndex.indexFieldId(); - if (vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId)) { - return true; - } - if (globalIndex.extraFieldIds() != null) { - for (int id : globalIndex.extraFieldIds()) { - if (vectorColumn.id() == id || filterFieldIds.contains(id)) { - return true; - } - } - } - return false; + return vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId); }; List allIndexFiles = @@ -104,7 +94,7 @@ public Plan scan() { Map> vectorByRange = new HashMap<>(); for 
(IndexFileMeta indexFile : allIndexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); - if (containsField(meta, vectorColumn.id())) { + if (isPrimaryColumn(meta, vectorColumn.id())) { Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); vectorByRange.computeIfAbsent(range, k -> new ArrayList<>()).add(indexFile); } @@ -121,7 +111,7 @@ public Plan scan() { f -> { GlobalIndexMeta globalIndex = checkNotNull(f.globalIndexMeta()); - if (containsField(globalIndex, vectorColumn.id())) { + if (isPrimaryColumn(globalIndex, vectorColumn.id())) { return false; } return range.hasIntersection(globalIndex.rowRange()); @@ -133,17 +123,7 @@ public Plan scan() { return () -> splits; } - private static boolean containsField(GlobalIndexMeta meta, int fieldId) { - if (meta.indexFieldId() == fieldId) { - return true; - } - if (meta.extraFieldIds() != null) { - for (int id : meta.extraFieldIds()) { - if (id == fieldId) { - return true; - } - } - } - return false; + private static boolean isPrimaryColumn(GlobalIndexMeta meta, int fieldId) { + return meta.indexFieldId() == fieldId; } } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 4bba6fcc830b..c94bc2deda65 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -658,15 +658,10 @@ public void processElement(StreamRecord element) throws Exception { // Only write rows within this shard's range if (currentRowId >= task.shardRange.from) { if (multiColumn) { - // Pass the row through, including null fields; each index type - // decides how to handle nulls. 
A null field advances the logical - // row id without indexing a value, so it must not end the shard: - // later non-null rows still need to be indexed and row-id alignment - // must be preserved. + long rowId = currentRowId - task.shardRange.from; ((GlobalIndexMultiColumnWriter) indexWriter) - .write(writerProjection.replaceRow(row)); + .write(rowId, writerProjection.replaceRow(row)); } else { - // A null value advances the logical row id without indexing. Object fieldData = indexFieldGetters[0].getFieldOrNull(row); ((GlobalIndexSingletonWriter) indexWriter).write(fieldData); } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index 090ea02e8528..e09272ce846d 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -98,11 +98,6 @@ public String[] call( indexColumns.size() == new HashSet<>(indexColumns).size(), "Duplicate index columns are not allowed: %s", indexColumns); - // No hard cap on the number of index columns: unlike row-store B-tree indexes - // (e.g. MySQL 16, PostgreSQL 32) whose limit comes from composing columns into a - // single key, the global index is built on per-type index frameworks. Whether - // multiple columns are supported, and any practical limit, is decided by each - // index type (single-column types reject multi-column via UnsupportedOperationException). 
for (String col : indexColumns) { checkArgument( rowType.containsField(col), diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 713965b01a32..bccf4899652c 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -29,6 +29,7 @@ import org.apache.paimon.io.DataIncrement; import org.apache.paimon.options.Options; import org.apache.paimon.table.FileStoreTable; +import org.apache.paimon.table.SpecialFields; import org.apache.paimon.table.sink.CommitMessage; import org.apache.paimon.table.sink.CommitMessageImpl; import org.apache.paimon.types.DataField; @@ -134,13 +135,14 @@ private List writePaimonRows( projection[i] = readType.getFieldIndex(field.name()); } ProjectedRow projectedRow = ProjectedRow.from(projection); + int rowIdIndex = readType.getFieldIndex(SpecialFields.ROW_ID.name()); while (rows.hasNext()) { InternalRow row = rows.next(); - // Pass the row through, including null fields; each index type decides how to - // handle nulls. A null field advances the logical row id without indexing a - // value, so it must not end the shard: later non-null rows still need to be - // indexed and row-id alignment must be preserved. 
- multiWriter.write(projectedRow.replaceRow(row)); + long absRowId = row.getLong(rowIdIndex); + if (absRowId < rowRange.from || absRowId > rowRange.to) { + continue; + } + multiWriter.write(absRowId - rowRange.from, projectedRow.replaceRow(row)); rowCounter.add(1); } } else { From 77648433b9ac2aecf344c26e320bff15f88b63ed Mon Sep 17 00:00:00 2001 From: CrownChu Date: Thu, 4 Jun 2026 16:09:58 +0800 Subject: [PATCH 19/24] [globalindex] Reject unsupported multi-column index types at creation time Add GlobalIndexerFactory.supportsMultiColumn() (default false). CreateGlobalIndexProcedure (Spark and Flink) now checks it up front and fails fast with a clear message when a multi-column index is requested for a type whose factory does not support it, instead of failing later in the build job when create(List) throws. --- .../globalindex/GlobalIndexerFactory.java | 8 ++++++++ .../procedure/CreateGlobalIndexProcedure.java | 10 +++++++--- .../procedure/CreateGlobalIndexProcedure.java | 18 ++++++++++++------ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java index cef643fa463f..b028ba4470cb 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java @@ -31,6 +31,14 @@ public interface GlobalIndexerFactory { GlobalIndexer create(DataField dataField, Options options); + /** + * Whether this index type supports multi-column indexes. A factory that returns {@code true} + * must override {@link #create(List, Options)} to handle more than one column. 
+ */ + default boolean supportsMultiColumn() { + return false; + } + default GlobalIndexer create(List fields, Options options) { if (fields.size() > 1) { throw new UnsupportedOperationException( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index e09272ce846d..aefc506b970d 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -20,6 +20,7 @@ import org.apache.paimon.flink.btree.BTreeIndexTopoBuilder; import org.apache.paimon.flink.globalindex.GenericIndexTopoBuilder; +import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.predicate.Predicate; @@ -114,10 +115,13 @@ public String[] call( // Build global index based on index type indexType = indexType.toLowerCase().trim(); - if ("btree".equals(indexType)) { + if (indexColumns.size() > 1) { + // Whether multi-column is supported is decided by each index type's factory; fail fast + // up front instead of failing later in the build job. 
checkArgument( - indexColumns.size() == 1, - "BTree index only supports single column, got: %s", + GlobalIndexerFactoryUtils.load(indexType).supportsMultiColumn(), + "Index type '%s' does not support multi-column index, got columns: %s", + indexType, indexColumns); } try { diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index 2cb5525d25c6..86738e515697 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -18,6 +18,7 @@ package org.apache.paimon.spark.procedure; +import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.spark.globalindex.GlobalIndexTopologyBuilder; @@ -157,12 +158,6 @@ public InternalRow[] call(InternalRow args) { col, tableIdent); } - if ("btree".equalsIgnoreCase(indexType)) { - checkArgument( - indexColumns.size() == 1, - "BTree index only supports single column, got: %s", - indexColumns); - } DataSourceV2Relation relation = createRelation(tableIdent, sparkTable); PartitionPredicate partitionPredicate = SparkProcedureUtils.convertToPartitionPredicate( @@ -180,6 +175,17 @@ public InternalRow[] call(InternalRow args) { Options userOptions = createUserOptions(table, optionString); + if (indexColumns.size() > 1) { + // Whether multi-column is supported is decided by each index type's + // factory; fail fast up front instead of failing later in the build + // job. 
+ checkArgument( + GlobalIndexerFactoryUtils.load(indexType).supportsMultiColumn(), + "Index type '%s' does not support multi-column index, got columns: %s", + indexType, + indexColumns); + } + GlobalIndexTopologyBuilder topoBuilder = GlobalIndexTopologyBuilderUtils.createTopoBuilder(indexType); From 8ed8130865c27752ea899a5e1234fc7fc6c60778 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Mon, 8 Jun 2026 15:15:06 +0800 Subject: [PATCH 20/24] [globalindex] Union extra-column readers, fix vector scan filter, and align drop procedure with multi-column --- .../globalindex/GlobalIndexScanner.java | 31 +++++++----- .../paimon/table/source/VectorScanImpl.java | 13 ++++- .../procedure/DropGlobalIndexProcedure.java | 49 ++++++++++++------- .../procedure/DropGlobalIndexProcedure.java | 34 +++++++++---- 4 files changed, 87 insertions(+), 40 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index f7264b1eb4dc..de656dc42b34 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -107,18 +107,27 @@ public GlobalIndexScanner( IntFunction> readersFunction = fId -> { IndexMetaFileGroup group = indexMetas.get(fId); - if (group == null) { - List extraGroups = extraIndexMetas.get(fId); - if (extraGroups == null || extraGroups.isEmpty()) { - return Collections.emptyList(); - } - group = extraGroups.get(0); + if (group != null) { + List fields = + group.fieldIds.stream() + .map(rowType::getField) + .collect(Collectors.toList()); + return createReaders(indexFileReader, group.metas, fields); + } + List extraGroups = extraIndexMetas.get(fId); + if (extraGroups == null || extraGroups.isEmpty()) { + return Collections.emptyList(); + } + // Union readers from all groups that share this extra column + List allReaders = new ArrayList<>(); 
+ for (IndexMetaFileGroup g : extraGroups) { + List fields = + g.fieldIds.stream() + .map(rowType::getField) + .collect(Collectors.toList()); + allReaders.addAll(createReaders(indexFileReader, g.metas, fields)); } - List fields = - group.fieldIds.stream() - .map(rowType::getField) - .collect(Collectors.toList()); - return createReaders(indexFileReader, group.metas, fields); + return allReaders; }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java index 5098cc959129..b59363a3264f 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java @@ -82,7 +82,18 @@ public Plan scan() { return false; } int fieldId = globalIndex.indexFieldId(); - return vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId); + if (vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId)) { + return true; + } + int[] extras = globalIndex.extraFieldIds(); + if (extras != null) { + for (int extra : extras) { + if (filterFieldIds.contains(extra)) { + return true; + } + } + } + return false; }; List allIndexFiles = diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedure.java index a5ab0239c215..197e368088fd 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedure.java @@ -42,6 +42,7 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import 
java.util.UUID; @@ -82,13 +83,25 @@ public String[] call( FileStoreTable table = (FileStoreTable) table(tableId); - // Validate column exists + // Parse comma-separated columns (consistent with create procedure) RowType rowType = table.rowType(); - checkArgument( - rowType.containsField(indexColumn), - "Column '%s' does not exist in table '%s'.", - indexColumn, - tableId); + List indexColumns = + Arrays.stream(indexColumn.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + checkArgument(!indexColumns.isEmpty(), "At least one column required."); + for (String col : indexColumns) { + checkArgument( + rowType.containsField(col), + "Column '%s' does not exist in table '%s'.", + col, + tableId); + } + final List indexFieldIds = + indexColumns.stream() + .map(col -> rowType.getField(col).id()) + .collect(Collectors.toList()); // Parse partition predicate PartitionPredicate partitionPredicate = parsePartitionPredicate(table, partitions); @@ -96,9 +109,6 @@ public String[] call( // Normalize index type final String indexTypeLower = indexType.toLowerCase().trim(); - // Get column field ID for final reference in lambda - final int columnId = rowType.getField(indexColumn).id(); - // Get latest snapshot Snapshot snapshot = table.latestSnapshot() @@ -108,12 +118,15 @@ public String[] call( String.format( "Table '%s' has no snapshot.", tableId))); - // Create filter for index entries to delete + // Create filter for index entries to delete — match by primary column + full column set Filter filter = entry -> entry.indexFile().indexType().equals(indexTypeLower) && entry.indexFile().globalIndexMeta() != null - && entry.indexFile().globalIndexMeta().indexFieldId() == columnId + && entry.indexFile() + .globalIndexMeta() + .getIndexedFieldIds() + .equals(indexFieldIds) && (partitionPredicate == null || partitionPredicate.test(entry.partition())); @@ -122,15 +135,15 @@ public String[] call( table.store().newIndexFileHandler().scan(snapshot, 
filter); LOG.info( - "Found {} {} global index files to delete for column '{}' on table '{}'", + "Found {} {} global index files to delete for columns '{}' on table '{}'", waitToDelete.size(), indexTypeLower, - indexColumn, + indexColumns, table.name()); if (waitToDelete.isEmpty()) { return new String[] { - "No " + indexTypeLower + " global index found for column '" + indexColumn + "'" + "No " + indexTypeLower + " global index found for columns '" + indexColumns + "'" }; } @@ -165,10 +178,10 @@ public String[] call( } LOG.info( - "Successfully dropped {} {} global index files for column '{}' on table '{}'", + "Successfully dropped {} {} global index files for columns '{}' on table '{}'", waitToDelete.size(), indexTypeLower, - indexColumn, + indexColumns, table.name()); return new String[] { @@ -176,8 +189,8 @@ public String[] call( + waitToDelete.size() + " " + indexTypeLower - + " global index files for column '" - + indexColumn + + " global index files for columns '" + + indexColumns + "' on table '" + table.name() + "'" diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/DropGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/DropGlobalIndexProcedure.java index 74e4cc4aea50..478b22f407f2 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/DropGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/DropGlobalIndexProcedure.java @@ -46,6 +46,7 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Locale; import java.util.Map; @@ -107,6 +108,13 @@ public InternalRow[] call(InternalRow args) { LOG.info("Starting to drop index for table " + tableIdent + " WHERE: " + finalWhere); + List indexColumns = + Arrays.stream(column.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + 
.collect(Collectors.toList()); + checkArgument(!indexColumns.isEmpty(), "At least one column required."); + return modifyPaimonTable( tableIdent, t -> { @@ -117,11 +125,17 @@ public InternalRow[] call(InternalRow args) { FileStoreTable table = (FileStoreTable) t; RowType rowType = table.rowType(); - checkArgument( - rowType.containsField(column), - "Column '%s' does not exist in table '%s'.", - column, - tableIdent); + for (String col : indexColumns) { + checkArgument( + rowType.containsField(col), + "Column '%s' does not exist in table '%s'.", + col, + tableIdent); + } + List indexFieldIds = + indexColumns.stream() + .map(col -> rowType.getField(col).id()) + .collect(Collectors.toList()); DataSourceV2Relation relation = createRelation(tableIdent); PartitionPredicate partitionPredicate = SparkProcedureUtils.convertToPartitionPredicate( @@ -144,9 +158,9 @@ public InternalRow[] call(InternalRow args) { entry.indexFile().indexType().equals(indexType) && entry.indexFile().globalIndexMeta() != null && entry.indexFile() - .globalIndexMeta() - .indexFieldId() - == rowType.getField(column).id() + .globalIndexMeta() + .getIndexedFieldIds() + .equals(indexFieldIds) && (partitionPredicate == null || partitionPredicate.test( entry.partition())); @@ -192,8 +206,8 @@ public InternalRow[] call(InternalRow args) { } catch (Exception e) { throw new RuntimeException( String.format( - "Failed to drop %s index for column '%s' on table '%s'.", - indexType, column, tableIdent), + "Failed to drop %s index for columns '%s' on table '%s'.", + indexType, indexColumns, tableIdent), e); } }); From ea398f51f9b464d218cb22f18e1f23e620f07eaf Mon Sep 17 00:00:00 2001 From: CrownChu Date: Mon, 8 Jun 2026 16:09:04 +0800 Subject: [PATCH 21/24] [globalindex] Format drop procedure column message and fix ITCase assertion --- .../paimon/flink/procedure/DropGlobalIndexProcedure.java | 9 +++++---- .../flink/procedure/DropGlobalIndexProcedureITCase.java | 2 +- 
.../paimon/spark/procedure/DropGlobalIndexProcedure.java | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedure.java index 197e368088fd..92bde693ea6c 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedure.java @@ -102,6 +102,7 @@ public String[] call( indexColumns.stream() .map(col -> rowType.getField(col).id()) .collect(Collectors.toList()); + final String columnsDesc = String.join(",", indexColumns); // Parse partition predicate PartitionPredicate partitionPredicate = parsePartitionPredicate(table, partitions); @@ -138,12 +139,12 @@ public String[] call( "Found {} {} global index files to delete for columns '{}' on table '{}'", waitToDelete.size(), indexTypeLower, - indexColumns, + columnsDesc, table.name()); if (waitToDelete.isEmpty()) { return new String[] { - "No " + indexTypeLower + " global index found for columns '" + indexColumns + "'" + "No " + indexTypeLower + " global index found for columns '" + columnsDesc + "'" }; } @@ -181,7 +182,7 @@ public String[] call( "Successfully dropped {} {} global index files for columns '{}' on table '{}'", waitToDelete.size(), indexTypeLower, - indexColumns, + columnsDesc, table.name()); return new String[] { @@ -190,7 +191,7 @@ public String[] call( + " " + indexTypeLower + " global index files for columns '" - + indexColumns + + columnsDesc + "' on table '" + table.name() + "'" diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedureITCase.java 
b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedureITCase.java index 5659467d8aa9..a348b5af7eda 100644 --- a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedureITCase.java +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/procedure/DropGlobalIndexProcedureITCase.java @@ -299,6 +299,6 @@ public void testDropNonExistentIndex() throws Exception { assertThat(dropResult.get(0).getField(0)) .isInstanceOf(String.class) .asString() - .contains("No btree global index found for column 'name'"); + .contains("No btree global index found for columns 'name'"); } } diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/DropGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/DropGlobalIndexProcedure.java index 478b22f407f2..bd218eb68ddf 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/DropGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/DropGlobalIndexProcedure.java @@ -207,7 +207,7 @@ public InternalRow[] call(InternalRow args) { throw new RuntimeException( String.format( "Failed to drop %s index for columns '%s' on table '%s'.", - indexType, indexColumns, tableIdent), + indexType, String.join(",", indexColumns), tableIdent), e); } }); From a477bc05b16889f732d2a55fcc11c575af30896a Mon Sep 17 00:00:00 2001 From: CrownChu Date: Mon, 8 Jun 2026 19:31:39 +0800 Subject: [PATCH 22/24] [globalindex] Refactor multi-column factory create to explicit primary/extraFields signature --- .../paimon/globalindex/GlobalIndexer.java | 5 +++-- .../globalindex/GlobalIndexerFactory.java | 18 ++++++++++++------ .../globalindex/GlobalIndexBuilderUtils.java | 9 +++++++-- .../paimon/globalindex/GlobalIndexScanner.java | 4 +++- 
.../paimon/table/source/FullTextReadImpl.java | 5 ++++- .../paimon/table/source/VectorReadImpl.java | 5 ++++- .../globalindex/GenericIndexTopoBuilder.java | 7 ++++++- .../globalindex/DefaultGlobalIndexBuilder.java | 8 +++++++- 8 files changed, 46 insertions(+), 15 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java index 6c46415cfeee..18087466c6c5 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java @@ -42,8 +42,9 @@ static GlobalIndexer create(String type, DataField dataField, Options options) { return globalIndexerFactory.create(dataField, options); } - static GlobalIndexer create(String type, List fields, Options options) { + static GlobalIndexer create( + String type, DataField dataField, List extraFields, Options options) { GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(type); - return globalIndexerFactory.create(fields, options); + return globalIndexerFactory.create(dataField, extraFields, options); } } diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java index b028ba4470cb..6f4e66d1b78c 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java @@ -33,19 +33,25 @@ public interface GlobalIndexerFactory { /** * Whether this index type supports multi-column indexes. A factory that returns {@code true} - * must override {@link #create(List, Options)} to handle more than one column. + * must override {@link #create(DataField, List, Options)} to handle extra columns. 
*/ default boolean supportsMultiColumn() { return false; } - default GlobalIndexer create(List fields, Options options) { - if (fields.size() > 1) { + /** + * Creates an indexer over a primary column plus optional extra columns. {@code dataField} is + * the primary column; {@code extraFields} holds the remaining columns and is empty for a + * single-column index. + */ + default GlobalIndexer create( + DataField dataField, List extraFields, Options options) { + if (extraFields != null && !extraFields.isEmpty()) { throw new UnsupportedOperationException( String.format( - "Index type '%s' does not support multi-column index, got columns: %s", - identifier(), fields)); + "Index type '%s' does not support multi-column index, got extra columns: %s", + identifier(), extraFields)); } - return create(fields.get(0), options); + return create(dataField, options); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 62b13833b393..3514d8c338cb 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -140,9 +140,14 @@ public static GlobalIndexWriter createIndexWriter( } public static GlobalIndexWriter createIndexWriter( - FileStoreTable table, String indexType, List fields, Options options) + FileStoreTable table, + String indexType, + DataField dataField, + List extraFields, + Options options) throws IOException { - GlobalIndexer globalIndexer = GlobalIndexer.create(indexType, fields, options); + GlobalIndexer globalIndexer = + GlobalIndexer.create(indexType, dataField, extraFields, options); return globalIndexer.createWriter(createGlobalIndexFileReadWrite(table)); } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java 
b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index de656dc42b34..bcad0e4fb08d 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -221,7 +221,9 @@ private Collection createReaders( String indexType = entry.getKey(); Map> metas = entry.getValue(); GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(indexType); - GlobalIndexer globalIndexer = globalIndexerFactory.create(fields, options); + GlobalIndexer globalIndexer = + globalIndexerFactory.create( + fields.get(0), fields.subList(1, fields.size()), options); List> futures = new ArrayList<>(metas.size()); for (Map.Entry> rangeMetas : metas.entrySet()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java index d2e5462921c4..290bdbe0f4ed 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java @@ -86,7 +86,10 @@ public GlobalIndexResult read(List splits) { List fields = firstMeta.getIndexedFields(table.rowType()); globalIndexer = GlobalIndexerFactoryUtils.load(indexType) - .create(fields, table.coreOptions().toConfiguration()); + .create( + fields.get(0), + fields.subList(1, fields.size()), + table.coreOptions().toConfiguration()); } else { globalIndexer = GlobalIndexerFactoryUtils.load(indexType) diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index de0a7a278f83..5d890b092a32 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -95,7 +95,10 @@ public 
GlobalIndexResult read(List splits) { List fields = firstMeta.getIndexedFields(table.rowType()); globalIndexer = GlobalIndexerFactoryUtils.load(indexType) - .create(fields, table.coreOptions().toConfiguration()); + .create( + fields.get(0), + fields.subList(1, fields.size()), + table.coreOptions().toConfiguration()); } else { globalIndexer = GlobalIndexerFactoryUtils.load(indexType) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index c94bc2deda65..a26f264cb4c4 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -628,7 +628,12 @@ public void processElement(StreamRecord element) throws Exception { long startTime = System.currentTimeMillis(); GlobalIndexWriter indexWriter = - createIndexWriter(table, indexType, indexFields, mergedOptions); + createIndexWriter( + table, + indexType, + indexFields.get(0), + indexFields.subList(1, indexFields.size()), + mergedOptions); try { long rowsSeen = 0; diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index bccf4899652c..a2f34df0fed0 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -122,7 +122,13 @@ public CommitMessage build(CloseableIterator data) throws IOExcepti private List writePaimonRows( CloseableIterator rows, LongCounter rowCounter) throws IOException { 
- GlobalIndexWriter indexWriter = createIndexWriter(table, indexType, indexFields, options); + GlobalIndexWriter indexWriter = + createIndexWriter( + table, + indexType, + indexFields.get(0), + indexFields.subList(1, indexFields.size()), + options); boolean multiColumn = indexFields.size() > 1; try { From 485e86895baf1210375437f4721ef228fcc0bf04 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 10 Jun 2026 16:37:25 +0800 Subject: [PATCH 23/24] [globalindex] Thread indexField/extraFields through write path and read scan APIs Add GlobalIndexMeta.getIndexField/getExtraFields helpers and use them in the vector/full-text read paths and the scanner instead of merging then splitting. Propagate the primary/extra column distinction from the create procedures down through the Flink topology builder/operator and the Spark topology builder, so every indexer-creation interface carries (indexField, extraFields) explicitly. --- .../paimon/globalindex/GlobalIndexer.java | 8 +- .../globalindex/GlobalIndexerFactory.java | 8 +- .../globalindex/GlobalIndexBuilderUtils.java | 4 +- .../globalindex/GlobalIndexScanner.java | 38 ++++----- .../apache/paimon/index/GlobalIndexMeta.java | 16 ++++ .../paimon/table/source/FullTextReadImpl.java | 5 +- .../paimon/table/source/VectorReadImpl.java | 5 +- .../globalindex/GenericIndexTopoBuilder.java | 84 ++++++++++++------- .../procedure/CreateGlobalIndexProcedure.java | 3 +- .../DefaultGlobalIndexBuilder.java | 40 +++++---- .../DefaultGlobalIndexTopoBuilder.java | 12 ++- .../GlobalIndexTopologyBuilder.java | 11 +-- .../procedure/CreateGlobalIndexProcedure.java | 3 +- 13 files changed, 145 insertions(+), 92 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java index 18087466c6c5..5eadf0597f6e 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java +++ 
b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java @@ -37,14 +37,14 @@ GlobalIndexReader createReader( List files, ExecutorService executor); - static GlobalIndexer create(String type, DataField dataField, Options options) { + static GlobalIndexer create(String type, DataField indexField, Options options) { GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(type); - return globalIndexerFactory.create(dataField, options); + return globalIndexerFactory.create(indexField, options); } static GlobalIndexer create( - String type, DataField dataField, List extraFields, Options options) { + String type, DataField indexField, List extraFields, Options options) { GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(type); - return globalIndexerFactory.create(dataField, extraFields, options); + return globalIndexerFactory.create(indexField, extraFields, options); } } diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java index 6f4e66d1b78c..a5aa6e96098c 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java @@ -29,7 +29,7 @@ public interface GlobalIndexerFactory { String identifier(); - GlobalIndexer create(DataField dataField, Options options); + GlobalIndexer create(DataField indexField, Options options); /** * Whether this index type supports multi-column indexes. A factory that returns {@code true} @@ -40,18 +40,18 @@ default boolean supportsMultiColumn() { } /** - * Creates an indexer over a primary column plus optional extra columns. {@code dataField} is + * Creates an indexer over a primary column plus optional extra columns. 
{@code indexField} is * the primary column; {@code extraFields} holds the remaining columns and is empty for a * single-column index. */ default GlobalIndexer create( - DataField dataField, List extraFields, Options options) { + DataField indexField, List extraFields, Options options) { if (extraFields != null && !extraFields.isEmpty()) { throw new UnsupportedOperationException( String.format( "Index type '%s' does not support multi-column index, got extra columns: %s", identifier(), extraFields)); } - return create(dataField, options); + return create(indexField, options); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 3514d8c338cb..39f7fb2b0e2f 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -142,12 +142,12 @@ public static GlobalIndexWriter createIndexWriter( public static GlobalIndexWriter createIndexWriter( FileStoreTable table, String indexType, - DataField dataField, + DataField indexField, List extraFields, Options options) throws IOException { GlobalIndexer globalIndexer = - GlobalIndexer.create(indexType, dataField, extraFields, options); + GlobalIndexer.create(indexType, indexField, extraFields, options); return globalIndexer.createWriter(createGlobalIndexFileReadWrite(table)); } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index bcad0e4fb08d..13c4adb0533e 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -108,11 +108,7 @@ public GlobalIndexScanner( fId -> { IndexMetaFileGroup group = indexMetas.get(fId); if 
(group != null) { - List fields = - group.fieldIds.stream() - .map(rowType::getField) - .collect(Collectors.toList()); - return createReaders(indexFileReader, group.metas, fields); + return createReaders(indexFileReader, group, rowType); } List extraGroups = extraIndexMetas.get(fId); if (extraGroups == null || extraGroups.isEmpty()) { @@ -121,11 +117,7 @@ public GlobalIndexScanner( // Union readers from all groups that share this extra column List allReaders = new ArrayList<>(); for (IndexMetaFileGroup g : extraGroups) { - List fields = - g.fieldIds.stream() - .map(rowType::getField) - .collect(Collectors.toList()); - allReaders.addAll(createReaders(indexFileReader, g.metas, fields)); + allReaders.addAll(createReaders(indexFileReader, g, rowType)); } return allReaders; }; @@ -149,6 +141,18 @@ void addFile(String indexType, Range range, IndexFileMeta indexFile) { .computeIfAbsent(range, k -> new ArrayList<>()) .add(indexFile); } + + /** The primary index column. */ + DataField indexField(RowType rowType) { + return rowType.getField(indexFieldId); + } + + /** The extra columns beyond the primary one; empty for a single-column index. 
*/ + List extraFields(RowType rowType) { + return fieldIds.subList(1, fieldIds.size()).stream() + .map(rowType::getField) + .collect(Collectors.toList()); + } } public static Optional create( @@ -209,21 +213,17 @@ public Optional scan(Predicate predicate) { } private Collection createReaders( - GlobalIndexFileReader indexFileReadWrite, - Map>> indexMetas, - List fields) { - if (indexMetas == null) { - return Collections.emptyList(); - } + GlobalIndexFileReader indexFileReadWrite, IndexMetaFileGroup group, RowType rowType) { + DataField indexField = group.indexField(rowType); + List extraFields = group.extraFields(rowType); Set readers = new HashSet<>(); - for (Map.Entry>> entry : indexMetas.entrySet()) { + for (Map.Entry>> entry : group.metas.entrySet()) { String indexType = entry.getKey(); Map> metas = entry.getValue(); GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(indexType); GlobalIndexer globalIndexer = - globalIndexerFactory.create( - fields.get(0), fields.subList(1, fields.size()), options); + globalIndexerFactory.create(indexField, extraFields, options); List> futures = new ArrayList<>(metas.size()); for (Map.Entry> rangeMetas : metas.entrySet()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java index a987e994f9ea..2c9718f70dab 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java @@ -119,6 +119,22 @@ public List getIndexedFields(RowType rowType) { return fields; } + /** The primary index column. */ + public DataField getIndexField(RowType rowType) { + return rowType.getField(indexFieldId); + } + + /** The extra columns beyond the primary one; empty for a single-column index. 
*/ + public List getExtraFields(RowType rowType) { + List fields = new ArrayList<>(); + if (extraFieldIds != null) { + for (int id : extraFieldIds) { + fields.add(rowType.getField(id)); + } + } + return fields; + } + public List getIndexedFieldNames(RowType rowType) { List names = new ArrayList<>(); for (int id : getIndexedFieldIds()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java index 290bdbe0f4ed..19999bd824c6 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java @@ -83,12 +83,11 @@ public GlobalIndexResult read(List splits) { GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; if (firstMeta.isMultiColumn()) { - List fields = firstMeta.getIndexedFields(table.rowType()); globalIndexer = GlobalIndexerFactoryUtils.load(indexType) .create( - fields.get(0), - fields.subList(1, fields.size()), + firstMeta.getIndexField(table.rowType()), + firstMeta.getExtraFields(table.rowType()), table.coreOptions().toConfiguration()); } else { globalIndexer = diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index 5d890b092a32..a9cfe896baa2 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -92,12 +92,11 @@ public GlobalIndexResult read(List splits) { GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; if (firstMeta.isMultiColumn()) { - List fields = firstMeta.getIndexedFields(table.rowType()); globalIndexer = GlobalIndexerFactoryUtils.load(indexType) .create( - fields.get(0), - fields.subList(1, 
fields.size()), + firstMeta.getIndexField(table.rowType()), + firstMeta.getExtraFields(table.rowType()), table.coreOptions().toConfiguration()); } else { globalIndexer = diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index a26f264cb4c4..af256da8ec72 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -106,7 +106,8 @@ public static void buildIndexAndExecute( buildIndexAndExecute( env, table, - Collections.singletonList(indexColumn), + indexColumn, + Collections.emptyList(), indexType, partitionPredicate, userOptions, @@ -125,7 +126,8 @@ public static void buildIndexAndExecute( buildIndexAndExecute( env, table, - Collections.singletonList(indexColumn), + indexColumn, + Collections.emptyList(), indexType, partitionPredicate, userOptions, @@ -135,7 +137,8 @@ public static void buildIndexAndExecute( public static void buildIndexAndExecute( StreamExecutionEnvironment env, FileStoreTable table, - List indexColumns, + String indexColumn, + List extraColumns, String indexType, PartitionPredicate partitionPredicate, Options userOptions) @@ -143,7 +146,8 @@ public static void buildIndexAndExecute( buildIndexAndExecute( env, table, - indexColumns, + indexColumn, + extraColumns, indexType, partitionPredicate, userOptions, @@ -153,7 +157,8 @@ public static void buildIndexAndExecute( public static void buildIndexAndExecute( StreamExecutionEnvironment env, FileStoreTable table, - List indexColumns, + String indexColumn, + List extraColumns, String indexType, PartitionPredicate partitionPredicate, Options userOptions, @@ -164,7 +169,8 @@ public static void buildIndexAndExecute( env, () -> new 
GenericGlobalIndexBuilder(table), table, - indexColumns, + indexColumn, + extraColumns, indexType, partitionPredicate, userOptions, @@ -189,7 +195,8 @@ public static boolean buildIndex( env, indexBuilderSupplier, table, - Collections.singletonList(indexColumn), + indexColumn, + Collections.emptyList(), indexType, partitionPredicate, userOptions, @@ -210,7 +217,8 @@ public static boolean buildIndex( env, indexBuilderSupplier, table, - Collections.singletonList(indexColumn), + indexColumn, + Collections.emptyList(), indexType, partitionPredicate, userOptions, @@ -227,7 +235,8 @@ public static boolean buildIndex( StreamExecutionEnvironment env, Supplier indexBuilderSupplier, FileStoreTable table, - List indexColumns, + String indexColumn, + List extraColumns, String indexType, PartitionPredicate partitionPredicate, Options userOptions, @@ -244,7 +253,8 @@ public static boolean buildIndex( return buildTopology( env, table, - indexColumns, + indexColumn, + extraColumns, indexType, userOptions, entries, @@ -264,13 +274,19 @@ public static boolean buildIndex( private static boolean buildTopology( StreamExecutionEnvironment env, FileStoreTable table, - List indexColumns, + String indexColumn, + List extraColumns, String indexType, Options userOptions, List entries, List deletedIndexEntries, long maxIndexedRowId) throws Exception { + // The primary column followed by the extra columns, in index order. 
+ List indexColumns = new ArrayList<>(1 + extraColumns.size()); + indexColumns.add(indexColumn); + indexColumns.addAll(extraColumns); + long totalRowCount = entries.stream().mapToLong(e -> e.file().rowCount()).sum(); LOG.info( "Scanned {} files ({} rows) across {} partitions for {} index on columns '{}'" @@ -287,8 +303,9 @@ private static boolean buildTopology( entries = filterEntriesBefore(entries, minNonIndexableRowId); RowType rowType = table.rowType(); - List indexFields = - indexColumns.stream().map(rowType::getField).collect(Collectors.toList()); + DataField indexField = rowType.getField(indexColumn); + List extraFields = + extraColumns.stream().map(rowType::getField).collect(Collectors.toList()); // Project indexColumns + _ROW_ID so we can read the actual row ID from data List readColumns = new ArrayList<>(indexColumns); readColumns.add(SpecialFields.ROW_ID.name()); @@ -338,7 +355,8 @@ private static boolean buildTopology( readBuilder, table, indexType, - indexFields, + indexField, + extraFields, projectedRowType, mergedOptions)) .setParallelism(parallelism); @@ -566,11 +584,13 @@ private static class BuildIndexOperator private final ReadBuilder readBuilder; private final FileStoreTable table; private final String indexType; - private final List indexFields; + private final DataField indexField; + private final List extraFields; private final RowType projectedRowType; private final Options mergedOptions; private transient TableRead tableRead; + private transient List indexedFields; private transient InternalRow.FieldGetter[] indexFieldGetters; private transient int rowIdFieldIndex; private transient boolean multiColumn; @@ -580,13 +600,15 @@ private static class BuildIndexOperator ReadBuilder readBuilder, FileStoreTable table, String indexType, - List indexFields, + DataField indexField, + List extraFields, RowType projectedRowType, Options mergedOptions) { this.readBuilder = readBuilder; this.table = table; this.indexType = indexType; - this.indexFields = 
indexFields; + this.indexField = indexField; + this.extraFields = extraFields; this.projectedRowType = projectedRowType; this.mergedOptions = mergedOptions; } @@ -595,19 +617,24 @@ private static class BuildIndexOperator public void open() throws Exception { super.open(); this.tableRead = readBuilder.newRead(); - this.indexFieldGetters = new InternalRow.FieldGetter[indexFields.size()]; - for (int i = 0; i < indexFields.size(); i++) { - DataField field = indexFields.get(i); + // The primary column followed by the extra columns, in index order. Field getters and + // the writer projection both need the full ordered list. + this.indexedFields = new ArrayList<>(1 + extraFields.size()); + indexedFields.add(indexField); + indexedFields.addAll(extraFields); + this.indexFieldGetters = new InternalRow.FieldGetter[indexedFields.size()]; + for (int i = 0; i < indexedFields.size(); i++) { + DataField field = indexedFields.get(i); indexFieldGetters[i] = InternalRow.createFieldGetter( field.type(), projectedRowType.getFieldIndex(field.name())); } this.rowIdFieldIndex = projectedRowType.getFieldIndex(SpecialFields.ROW_ID.name()); - this.multiColumn = indexFields.size() > 1; + this.multiColumn = !extraFields.isEmpty(); if (multiColumn) { - int[] projection = new int[indexFields.size()]; - for (int i = 0; i < indexFields.size(); i++) { - projection[i] = projectedRowType.getFieldIndex(indexFields.get(i).name()); + int[] projection = new int[indexedFields.size()]; + for (int i = 0; i < indexedFields.size(); i++) { + projection[i] = projectedRowType.getFieldIndex(indexedFields.get(i).name()); } this.writerProjection = ProjectedRow.from(projection); } @@ -628,12 +655,7 @@ public void processElement(StreamRecord element) throws Exception { long startTime = System.currentTimeMillis(); GlobalIndexWriter indexWriter = - createIndexWriter( - table, - indexType, - indexFields.get(0), - indexFields.subList(1, indexFields.size()), - mergedOptions); + createIndexWriter(table, indexType, 
indexField, extraFields, mergedOptions); try { long rowsSeen = 0; @@ -706,7 +728,7 @@ public void processElement(StreamRecord element) throws Exception { table, partition, task.shardRange, - indexFields, + indexedFields, indexType, resultEntries); output.collect( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index aefc506b970d..8d15824918dd 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -139,7 +139,8 @@ public String[] call( GenericIndexTopoBuilder.buildIndexAndExecute( procedureContext.getExecutionEnvironment(), table, - indexColumns, + indexColumns.get(0), + indexColumns.subList(1, indexColumns.size()), indexType, partitionPredicate, userOptions); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index a2f34df0fed0..6105f0342811 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -41,6 +41,7 @@ import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -55,7 +56,8 @@ public class DefaultGlobalIndexBuilder implements Serializable { private final FileStoreTable table; private final BinaryRow partition; private final RowType readType; - private final List indexFields; + private final DataField indexField; + 
private final List extraFields; private final String indexType; private final Range rowRange; private final Options options; @@ -72,7 +74,8 @@ public DefaultGlobalIndexBuilder( table, partition, readType, - Collections.singletonList(indexField), + indexField, + Collections.emptyList(), indexType, rowRange, options); @@ -82,19 +85,29 @@ public DefaultGlobalIndexBuilder( FileStoreTable table, BinaryRow partition, RowType readType, - List indexFields, + DataField indexField, + List extraFields, String indexType, Range rowRange, Options options) { this.table = table; this.partition = partition; this.readType = readType; - this.indexFields = indexFields; + this.indexField = indexField; + this.extraFields = extraFields; this.indexType = indexType; this.rowRange = rowRange; this.options = options; } + /** The primary index column followed by the extra columns, in index order. */ + private List indexedFields() { + List fields = new ArrayList<>(1 + extraFields.size()); + fields.add(indexField); + fields.addAll(extraFields); + return fields; + } + public FileStoreTable table() { return table; } @@ -112,7 +125,7 @@ public CommitMessage build(CloseableIterator data) throws IOExcepti table.store().pathFactory().globalIndexFileFactory(), table.coreOptions(), rowRange, - indexFields, + indexedFields(), indexType, resultEntries); DataIncrement dataIncrement = DataIncrement.indexIncrement(indexFileMetas); @@ -123,21 +136,17 @@ public CommitMessage build(CloseableIterator data) throws IOExcepti private List writePaimonRows( CloseableIterator rows, LongCounter rowCounter) throws IOException { GlobalIndexWriter indexWriter = - createIndexWriter( - table, - indexType, - indexFields.get(0), - indexFields.subList(1, indexFields.size()), - options); - boolean multiColumn = indexFields.size() > 1; + createIndexWriter(table, indexType, indexField, extraFields, options); + boolean multiColumn = !extraFields.isEmpty(); try { if (multiColumn) { GlobalIndexMultiColumnWriter multiWriter = 
(GlobalIndexMultiColumnWriter) indexWriter; - int[] projection = new int[indexFields.size()]; - for (int i = 0; i < indexFields.size(); i++) { - DataField field = indexFields.get(i); + List indexedFields = indexedFields(); + int[] projection = new int[indexedFields.size()]; + for (int i = 0; i < indexedFields.size(); i++) { + DataField field = indexedFields.get(i); projection[i] = readType.getFieldIndex(field.name()); } ProjectedRow projectedRow = ProjectedRow.from(projection); @@ -152,7 +161,6 @@ private List writePaimonRows( rowCounter.add(1); } } else { - DataField indexField = indexFields.get(0); GlobalIndexSingletonWriter singleWriter = (GlobalIndexSingletonWriter) indexWriter; InternalRow.FieldGetter getter = InternalRow.createFieldGetter( diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java index ea2cda4a8b85..cf65ed937a95 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java @@ -86,7 +86,8 @@ public List buildIndex( table, indexType, readType, - Collections.singletonList(indexField), + indexField, + Collections.emptyList(), options); } @@ -98,7 +99,8 @@ public List buildIndex( FileStoreTable table, String indexType, RowType readType, - List indexFields, + DataField indexField, + List extraFields, Options options) throws IOException { Options tableOptions = table.coreOptions().toConfiguration(); @@ -112,6 +114,9 @@ public List buildIndex( List entries = table.store().newScan().withPartitionFilter(partitionPredicate).plan().files(); + List indexFields = new ArrayList<>(); + indexFields.add(indexField); + indexFields.addAll(extraFields); List indexColumns = 
indexFields.stream().map(DataField::name).collect(Collectors.toList()); SchemaManager schemaManager = new SchemaManager(table.fileIO(), table.location()); @@ -137,7 +142,8 @@ public List buildIndex( table, partition, readType, - indexFields, + indexField, + extraFields, indexType, indexedSplit.rowRanges().get(0), options); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java index 3d751f4585ac..d7a47cfdc9ed 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java @@ -54,14 +54,15 @@ default List buildIndex( FileStoreTable table, String indexType, RowType readType, - List indexFields, + DataField indexField, + List extraFields, Options options) throws IOException { - if (indexFields.size() > 1) { + if (extraFields != null && !extraFields.isEmpty()) { throw new UnsupportedOperationException( String.format( - "Topology builder '%s' does not support multi-column index, got columns: %s", - identifier(), indexFields)); + "Topology builder '%s' does not support multi-column index, got extra columns: %s", + identifier(), extraFields)); } return buildIndex( spark, @@ -70,7 +71,7 @@ default List buildIndex( table, indexType, readType, - indexFields.get(0), + indexField, options); } } diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index 86738e515697..bcb9c06cffb5 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ 
b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -197,7 +197,8 @@ public InternalRow[] call(InternalRow args) { table, indexType, readRowType, - indexFields, + indexFields.get(0), + indexFields.subList(1, indexFields.size()), userOptions); try (TableCommitImpl commit = From de3bd36baf0add6822ca8e240386d7f90049a5bc Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 10 Jun 2026 20:26:09 +0800 Subject: [PATCH 24/24] [globalindex] Fix non-serializable subList in Spark global index builder CreateGlobalIndexProcedure passes indexFields.subList(1, size) as extraFields, which is a non-serializable List#subList view. DefaultGlobalIndexBuilder holds it and is serialized to Spark executors via InstantiationUtil.serializeObject, so the job failed with NotSerializableException. Copy extraFields into a serializable ArrayList in the constructor (null-safe), independent of the caller. --- .../paimon/spark/globalindex/DefaultGlobalIndexBuilder.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 6105f0342811..ae87dc96a45a 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -94,7 +94,11 @@ public DefaultGlobalIndexBuilder( this.partition = partition; this.readType = readType; this.indexField = indexField; - this.extraFields = extraFields; + // Copy into a serializable ArrayList: callers may pass a List#subList view (e.g. + // indexFields.subList(1, ...)), which is not Serializable, and this builder is serialized + // and shipped to Spark executors. 
A null value means no extra columns. + this.extraFields = + extraFields == null ? Collections.emptyList() : new ArrayList<>(extraFields); this.indexType = indexType; this.rowRange = rowRange; this.options = options;