From d018a895794678c219bff4facd769d42970bede2 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Tue, 9 Jun 2026 09:06:43 +0800 Subject: [PATCH 01/11] [core] Add paimon-ivfpq module for IVF-PQ vector index integration Integrate apache/paimon-vector-index (pure Rust IVF-PQ) into Paimon's GlobalIndex SPI framework. Follows the paimon-tantivy two-level module pattern: paimon-ivfpq-jni for Java JNI bindings and NativeLoader, paimon-ivfpq-index for Paimon GlobalIndexer integration. Key features: - IVF-PQ vector index with identifier "ivfpq" - Native Roaring bitmap filter pushdown (byte[] format) - Direct stream I/O via JNI (no adapter classes needed) - Reservoir sampling for training with configurable sample ratio - Batched vector insertion for memory efficiency Co-Authored-By: Claude Opus 4.6 --- .github/workflows/utcase-ivfpq.yml | 74 +++ paimon-ivfpq/paimon-ivfpq-index/pom.xml | 200 ++++++++ .../paimon/ivfpq/index/IvfpqIndexMeta.java | 106 +++++ .../index/IvfpqVectorGlobalIndexReader.java | 336 ++++++++++++++ .../index/IvfpqVectorGlobalIndexWriter.java | 433 ++++++++++++++++++ .../ivfpq/index/IvfpqVectorGlobalIndexer.java | 56 +++ .../IvfpqVectorGlobalIndexerFactory.java | 40 ++ .../ivfpq/index/IvfpqVectorIndexOptions.java | 161 +++++++ .../paimon/ivfpq/index/IvfpqVectorMetric.java | 57 +++ ...he.paimon.globalindex.GlobalIndexerFactory | 1 + paimon-ivfpq/paimon-ivfpq-jni/pom.xml | 69 +++ .../paimon/index/ivfpq/IVFPQBatchResult.java | 102 +++++ .../paimon/index/ivfpq/IVFPQNative.java | 57 +++ .../paimon/index/ivfpq/IVFPQReader.java | 108 +++++ .../paimon/index/ivfpq/IVFPQResult.java | 61 +++ .../paimon/index/ivfpq/IVFPQWriter.java | 105 +++++ .../org/apache/paimon/index/ivfpq/Metric.java | 36 ++ .../paimon/index/ivfpq/NativeLoader.java | 83 ++++ paimon-ivfpq/pom.xml | 39 ++ pom.xml | 1 + 20 files changed, 2125 insertions(+) create mode 100644 .github/workflows/utcase-ivfpq.yml create mode 100644 paimon-ivfpq/paimon-ivfpq-index/pom.xml create mode 100644 
paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/pom.xml create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQBatchResult.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQNative.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQReader.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQResult.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQWriter.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java create mode 100644 paimon-ivfpq/pom.xml diff --git a/.github/workflows/utcase-ivfpq.yml b/.github/workflows/utcase-ivfpq.yml new file mode 100644 index 000000000000..fc63ae1cd6fd 
--- /dev/null +++ b/.github/workflows/utcase-ivfpq.yml @@ -0,0 +1,74 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +name: UTCase IVF-PQ + +on: + push: + paths: + - 'paimon-ivfpq/**' + pull_request: + paths: + - 'paimon-ivfpq/**' + +env: + JDK_VERSION: 8 + MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=30 -Dmaven.wagon.http.retryHandler.requestSentEnabled=true + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }} + cancel-in-progress: true + +jobs: + ivfpq_test: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up JDK ${{ env.JDK_VERSION }} + uses: actions/setup-java@v5 + with: + java-version: ${{ env.JDK_VERSION }} + distribution: 'temurin' + + - name: Install Rust toolchain + run: | + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + + - name: Clone and build paimon-vector-index native library + run: | + git clone --depth 1 
https://github.com/apache/paimon-vector-index.git /tmp/paimon-vector-index + cd /tmp/paimon-vector-index + cargo build --release -p paimon-vindex-jni + + - name: Copy native library to resources + run: | + RESOURCE_DIR=paimon-ivfpq/paimon-ivfpq-jni/src/main/resources/native/linux-amd64 + mkdir -p ${RESOURCE_DIR} + cp /tmp/paimon-vector-index/target/release/libpaimon_vindex_jni.so ${RESOURCE_DIR}/ + + - name: Build and test IVF-PQ modules + timeout-minutes: 30 + run: | + mvn -T 2C -B -ntp clean install -DskipTests + mvn -B -ntp verify -pl paimon-ivfpq/paimon-ivfpq-jni,paimon-ivfpq/paimon-ivfpq-index -Dcheckstyle.skip=true -Dspotless.check.skip=true + env: + MAVEN_OPTS: -Xmx4096m diff --git a/paimon-ivfpq/paimon-ivfpq-index/pom.xml b/paimon-ivfpq/paimon-ivfpq-index/pom.xml new file mode 100644 index 000000000000..a40f4feb60b8 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/pom.xml @@ -0,0 +1,200 @@ + + + + 4.0.0 + + + paimon-ivfpq + org.apache.paimon + 1.5-SNAPSHOT + + + paimon-ivfpq-index + Paimon : IVF-PQ Index + + + + org.apache.paimon + paimon-ivfpq-jni + ${project.version} + + + + org.apache.paimon + paimon-common + ${project.version} + provided + + + + org.apache.paimon + paimon-shade-jackson-2 + ${paimon.shade.jackson.version}-${paimon.shade.version} + + + + + org.junit.jupiter + junit-jupiter + ${junit5.version} + test + + + + org.apache.paimon + paimon-core + ${project.version} + test + + + + org.apache.paimon + paimon-core + ${project.version} + test-jar + test + + + + org.apache.paimon + paimon-common + ${project.version} + test-jar + test + + + + org.apache.paimon + paimon-format + ${project.version} + test + + + + org.apache.paimon + paimon-test-utils + ${project.version} + test + + + + org.apache.hadoop + hadoop-hdfs-client + ${hadoop.version} + test + + + org.apache.avro + avro + + + log4j + log4j + + + org.slf4j + slf4j-log4j12 + + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + test + + + org.apache.avro + avro + + + log4j + log4j 
+ + + org.slf4j + slf4j-log4j12 + + + jdk.tools + jdk.tools + + + com.google.protobuf + protobuf-java + + + + + + org.apache.hadoop + hadoop-mapreduce-client-core + ${hadoop.version} + test + + + org.apache.avro + avro + + + com.google.protobuf + protobuf-java + + + ch.qos.reload4j + reload4j + + + org.slf4j + slf4j-reload4j + + + log4j + log4j + + + org.slf4j + slf4j-log4j12 + + + jdk.tools + jdk.tools + + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + 1 + true + none + + + + + diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java new file mode 100644 index 000000000000..2b38723ee4b3 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.core.type.TypeReference; +import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.io.Serializable; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Metadata for an IVF-PQ vector index file. + * + *

Serialized as a flat JSON {@code Map} storing the index build parameters + * required for correct search-time behavior. + */ +public class IvfpqIndexMeta implements Serializable { + + private static final long serialVersionUID = 1L; + + private static final String KEY_DIMENSION = "dimension"; + private static final String KEY_METRIC = "metric"; + private static final String KEY_NLIST = "nlist"; + private static final String KEY_M = "m"; + private static final String KEY_USE_OPQ = "use_opq"; + private static final String KEY_NPROBE = "nprobe"; + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final TypeReference> MAP_TYPE_REF = + new TypeReference>() {}; + + private final Map params; + + public IvfpqIndexMeta(IvfpqVectorIndexOptions options) { + this.params = new LinkedHashMap<>(); + params.put(KEY_DIMENSION, String.valueOf(options.dimension())); + params.put(KEY_METRIC, options.metric().getConfigName()); + params.put(KEY_NLIST, String.valueOf(options.nlist())); + params.put(KEY_M, String.valueOf(options.m())); + params.put(KEY_USE_OPQ, String.valueOf(options.useOpq())); + params.put(KEY_NPROBE, String.valueOf(options.nprobe())); + } + + private IvfpqIndexMeta(Map params) { + this.params = new LinkedHashMap<>(params); + } + + public int dimension() { + return Integer.parseInt(params.get(KEY_DIMENSION)); + } + + public IvfpqVectorMetric metric() { + return IvfpqVectorMetric.fromConfigName(params.get(KEY_METRIC)); + } + + public int nlist() { + return Integer.parseInt(params.get(KEY_NLIST)); + } + + public int m() { + return Integer.parseInt(params.get(KEY_M)); + } + + public boolean useOpq() { + return Boolean.parseBoolean(params.get(KEY_USE_OPQ)); + } + + public int nprobe() { + String val = params.get(KEY_NPROBE); + return val != null ? 
Integer.parseInt(val) : 16; + } + + public byte[] serialize() throws IOException { + return OBJECT_MAPPER.writeValueAsBytes(params); + } + + public static IvfpqIndexMeta deserialize(byte[] data) throws IOException { + Map map = OBJECT_MAPPER.readValue(data, MAP_TYPE_REF); + if (!map.containsKey(KEY_DIMENSION)) { + throw new IOException("Missing required key in IVF-PQ index metadata: " + KEY_DIMENSION); + } + if (!map.containsKey(KEY_METRIC)) { + throw new IOException("Missing required key in IVF-PQ index metadata: " + KEY_METRIC); + } + return new IvfpqIndexMeta(map); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java new file mode 100644 index 000000000000..b8827e9996f7 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.globalindex.GlobalIndexIOMeta; +import org.apache.paimon.globalindex.GlobalIndexReader; +import org.apache.paimon.globalindex.GlobalIndexResult; +import org.apache.paimon.globalindex.ScoredGlobalIndexResult; +import org.apache.paimon.globalindex.io.GlobalIndexFileReader; +import org.apache.paimon.index.ivfpq.IVFPQReader; +import org.apache.paimon.index.ivfpq.IVFPQResult; +import org.apache.paimon.predicate.FieldRef; +import org.apache.paimon.predicate.VectorSearch; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.VectorType; +import org.apache.paimon.utils.IOUtils; +import org.apache.paimon.utils.RoaringNavigableMap64; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; + +import static org.apache.paimon.utils.Preconditions.checkArgument; + +/** + * Vector global index reader using IVF-PQ. + * + *

Each shard has exactly one IVF-PQ index file. The reader lazily opens the index and performs + * vector similarity search. The native Rust JNI layer calls {@code seek(long)} and {@code + * read(byte[], int, int)} directly on the {@link SeekableInputStream}, so no adapter is needed. + */ +public class IvfpqVectorGlobalIndexReader implements GlobalIndexReader { + + private final GlobalIndexIOMeta ioMeta; + private final GlobalIndexFileReader fileReader; + private final DataType fieldType; + private final IvfpqVectorIndexOptions options; + private final ExecutorService executor; + + private volatile IvfpqIndexMeta indexMeta; + private volatile IVFPQReader ivfpqReader; + private SeekableInputStream openStream; + + public IvfpqVectorGlobalIndexReader( + GlobalIndexFileReader fileReader, + List ioMetas, + DataType fieldType, + IvfpqVectorIndexOptions options, + ExecutorService executor) { + checkArgument(ioMetas.size() == 1, "Expected exactly one index file per shard"); + this.executor = executor; + this.fileReader = fileReader; + this.ioMeta = ioMetas.get(0); + this.fieldType = fieldType; + this.options = options; + } + + @Override + public CompletableFuture> visitVectorSearch( + VectorSearch vectorSearch) { + return CompletableFuture.supplyAsync( + () -> { + try { + ensureLoaded(); + return Optional.ofNullable(search(vectorSearch)); + } catch (IOException e) { + throw new RuntimeException( + String.format( + "Failed IVF-PQ search: field=%s, limit=%d", + vectorSearch.fieldName(), vectorSearch.limit()), + e); + } + }, + executor); + } + + private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOException { + validateSearchVector(vectorSearch.vector()); + float[] queryVector = vectorSearch.vector().clone(); + int limit = vectorSearch.limit(); + int nprobe = indexMeta.nprobe(); + IvfpqVectorMetric metric = indexMeta.metric(); + + RoaringNavigableMap64 includeRowIds = vectorSearch.includeRowIds(); + IVFPQResult result; + + if (includeRowIds != null) { + 
long cardinality = includeRowIds.getLongCardinality(); + if (cardinality == 0) { + return null; + } + byte[] filterBytes = includeRowIds.serialize(); + int effectiveK = (int) Math.min(limit, cardinality); + result = ivfpqReader.search(queryVector, effectiveK, nprobe, filterBytes); + } else { + result = ivfpqReader.search(queryVector, limit, nprobe); + } + + long[] ids = result.ids(); + float[] distances = result.distances(); + + if (ids.length == 0) { + return null; + } + + RoaringNavigableMap64 resultBitmap = new RoaringNavigableMap64(); + HashMap id2scores = new HashMap<>(ids.length); + + for (int i = 0; i < ids.length; i++) { + long rowId = ids[i]; + if (rowId < 0) { + continue; + } + float score = convertDistanceToScore(distances[i], metric); + resultBitmap.add(rowId); + id2scores.put(rowId, score); + } + + if (resultBitmap.isEmpty()) { + return null; + } + + return ScoredGlobalIndexResult.create( + resultBitmap, + rowId -> { + Float score = id2scores.get(rowId); + if (score == null) { + throw new IllegalArgumentException( + "No score found for rowId: " + + rowId + + ". 
Only rowIds present in results() are valid."); + } + return score; + }); + } + + private static float convertDistanceToScore(float distance, IvfpqVectorMetric metric) { + switch (metric) { + case L2: + return 1.0f / (1.0f + distance); + case COSINE: + return 1.0f - distance; + case INNER_PRODUCT: + return distance; + default: + throw new IllegalArgumentException("Unknown metric: " + metric); + } + } + + private void validateSearchVector(Object vector) { + if (!(vector instanceof float[])) { + throw new IllegalArgumentException( + "Expected float[] vector but got: " + vector.getClass()); + } + boolean validFieldType = false; + if (fieldType instanceof VectorType) { + validFieldType = ((VectorType) fieldType).getElementType() instanceof FloatType; + } else if (fieldType instanceof ArrayType) { + validFieldType = ((ArrayType) fieldType).getElementType() instanceof FloatType; + } + if (!validFieldType) { + throw new IllegalArgumentException( + "IVF-PQ requires VectorType or ArrayType, but field type is: " + + fieldType); + } + int queryDim = ((float[]) vector).length; + if (queryDim != indexMeta.dimension()) { + throw new IllegalArgumentException( + String.format( + "Query vector dimension mismatch: index expects %d, but got %d", + indexMeta.dimension(), queryDim)); + } + } + + private void ensureLoaded() throws IOException { + if (ivfpqReader == null) { + synchronized (this) { + if (ivfpqReader == null) { + indexMeta = IvfpqIndexMeta.deserialize(ioMeta.metadata()); + SeekableInputStream in = fileReader.getInputStream(ioMeta); + try { + ivfpqReader = new IVFPQReader(in); + openStream = in; + } catch (Exception e) { + IOUtils.closeQuietly(in); + throw e; + } + } + } + } + } + + @Override + public void close() throws IOException { + Throwable firstException = null; + + if (ivfpqReader != null) { + try { + ivfpqReader.close(); + } catch (Throwable t) { + firstException = t; + } + ivfpqReader = null; + } + + if (openStream != null) { + try { + openStream.close(); + } catch 
(Throwable t) { + if (firstException == null) { + firstException = t; + } else { + firstException.addSuppressed(t); + } + } + openStream = null; + } + + if (firstException != null) { + if (firstException instanceof IOException) { + throw (IOException) firstException; + } else if (firstException instanceof RuntimeException) { + throw (RuntimeException) firstException; + } else { + throw new RuntimeException( + "Failed to close IVF-PQ vector global index reader", firstException); + } + } + } + + // =================== unsupported ===================== + + @Override + public CompletableFuture> visitIsNotNull(FieldRef fieldRef) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitIsNull(FieldRef fieldRef) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitStartsWith( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitEndsWith( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitContains( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitLike( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitLessThan( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitGreaterOrEqual( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitNotEqual( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + 
public CompletableFuture> visitLessOrEqual( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitEqual( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitGreaterThan( + FieldRef fieldRef, Object literal) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitIn( + FieldRef fieldRef, List literals) { + return CompletableFuture.completedFuture(Optional.empty()); + } + + @Override + public CompletableFuture> visitNotIn( + FieldRef fieldRef, List literals) { + return CompletableFuture.completedFuture(Optional.empty()); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java new file mode 100644 index 000000000000..36e03ed39c77 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java @@ -0,0 +1,433 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalVector; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.globalindex.GlobalIndexSingletonWriter; +import org.apache.paimon.globalindex.ResultEntry; +import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; +import org.apache.paimon.index.ivfpq.IVFPQWriter; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.VectorType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.Collections; +import java.util.List; +import java.util.Random; + +/** + * Vector global index writer using IVF-PQ. + * + *

Vectors are spilled to a temporary file on disk as they arrive via {@link #write(Object)}, + * keeping Java heap usage constant (~8 MB buffer). During index build, vectors are read back for + * training (with optional reservoir sampling) and batch insertion. + * + *

Thread safety: This class is not thread-safe. + */ +public class IvfpqVectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Closeable { + + private static final String FILE_NAME_PREFIX = "ivfpq"; + + private static final Logger LOG = LoggerFactory.getLogger(IvfpqVectorGlobalIndexWriter.class); + + private static final int IO_BUFFER_SIZE = 8 * 1024 * 1024; + + private final GlobalIndexFileWriter fileWriter; + private final IvfpqVectorIndexOptions options; + private final int dim; + + private File tempVectorFile; + private FileChannel writeChannel; + private ByteBuffer writeBuf; + + private final int recordSizeInBytes; + private final float[] vectorBuf; + private long count; + private boolean closed; + + private long logicalRowId; + + public IvfpqVectorGlobalIndexWriter( + GlobalIndexFileWriter fileWriter, + DataType fieldType, + IvfpqVectorIndexOptions options) { + this.fileWriter = fileWriter; + this.options = options; + this.dim = options.dimension(); + this.count = 0; + this.closed = false; + this.recordSizeInBytes = checkedRecordSize(dim, IO_BUFFER_SIZE); + this.vectorBuf = new float[dim]; + + validateFieldType(fieldType); + + try { + this.tempVectorFile = File.createTempFile("ivfpq-vectors-", ".bin"); + this.tempVectorFile.deleteOnExit(); + @SuppressWarnings("resource") + RandomAccessFile raf = new RandomAccessFile(tempVectorFile, "rw"); + this.writeChannel = raf.getChannel(); + this.writeBuf = ByteBuffer.allocateDirect(IO_BUFFER_SIZE); + this.writeBuf.order(ByteOrder.nativeOrder()); + } catch (IOException e) { + throw new RuntimeException("Failed to create temp vector file", e); + } + } + + private void validateFieldType(DataType dataType) { + if (dataType instanceof VectorType) { + DataType elementType = ((VectorType) dataType).getElementType(); + if (!(elementType instanceof FloatType)) { + throw new IllegalArgumentException( + "IVF-PQ index requires float vector, but got: " + elementType); + } + return; + } + if (dataType instanceof ArrayType) { + 
DataType elementType = ((ArrayType) dataType).getElementType(); + if (!(elementType instanceof FloatType)) { + throw new IllegalArgumentException( + "IVF-PQ index requires float array, but got: " + elementType); + } + return; + } + throw new IllegalArgumentException( + "IVF-PQ index requires VectorType or ArrayType, but got: " + dataType); + } + + @Override + public void write(Object fieldData) { + if (fieldData == null) { + logicalRowId++; + return; + } + + float[] src = materializeAndValidate(fieldData); + + if (writeBuf.remaining() < recordSizeInBytes) { + flushWriteBuffer(); + } + writeBuf.putLong(logicalRowId); + for (int i = 0; i < dim; i++) { + writeBuf.putFloat(src[i]); + } + logicalRowId++; + count++; + } + + private float[] materializeAndValidate(Object fieldData) { + if (fieldData instanceof float[]) { + float[] vector = (float[]) fieldData; + checkDimension(vector.length); + for (int i = 0; i < dim; i++) { + checkFinite(vector[i], i); + } + return vector; + } else if (fieldData instanceof InternalVector) { + InternalVector vector = (InternalVector) fieldData; + checkDimension(vector.size()); + for (int i = 0; i < dim; i++) { + float v = vector.getFloat(i); + checkFinite(v, i); + vectorBuf[i] = v; + } + return vectorBuf; + } else if (fieldData instanceof InternalArray) { + InternalArray array = (InternalArray) fieldData; + checkDimension(array.size()); + for (int i = 0; i < dim; i++) { + if (array.isNullAt(i)) { + throw new IllegalArgumentException("Vector element at index " + i + " is null"); + } + float v = array.getFloat(i); + checkFinite(v, i); + vectorBuf[i] = v; + } + return vectorBuf; + } else { + throw new RuntimeException( + "Unsupported vector type: " + fieldData.getClass().getName()); + } + } + + private void flushWriteBuffer() { + try { + writeBuf.flip(); + while (writeBuf.hasRemaining()) { + writeChannel.write(writeBuf); + } + writeBuf.clear(); + } catch (IOException e) { + throw new RuntimeException("Failed to flush vector buffer to disk", 
e); + } + } + + @Override + public List finish() { + try { + if (count == 0) { + writeChannel.close(); + writeChannel = null; + writeBuf = null; + return Collections.emptyList(); + } + flushWriteBuffer(); + writeChannel.close(); + writeChannel = null; + writeBuf = null; + return Collections.singletonList(buildIndex()); + } catch (IOException e) { + throw new RuntimeException("Failed to write IVF-PQ vector global index", e); + } finally { + if (tempVectorFile != null) { + tempVectorFile.delete(); + tempVectorFile = null; + } + } + } + + private ResultEntry buildIndex() throws IOException { + int effectiveNlist = (int) Math.min(options.nlist(), count); + + LOG.info( + "IVF-PQ index build started: {} vectors, dim={}, nlist={}, m={}, metric={}", + count, + dim, + effectiveNlist, + options.m(), + options.metric()); + long buildStart = System.currentTimeMillis(); + + try (IVFPQWriter writer = + new IVFPQWriter( + dim, + effectiveNlist, + options.m(), + options.metric().toNativeMetric(), + options.useOpq())) { + + // Phase 1: Train + long phaseStart = System.currentTimeMillis(); + LOG.info( + "IVF-PQ train phase started (sample_ratio={})", options.trainSampleRatio()); + trainFromTempFile(writer); + LOG.info("IVF-PQ train phase done in {} ms", System.currentTimeMillis() - phaseStart); + + // Phase 2: Add all vectors in batches + phaseStart = System.currentTimeMillis(); + LOG.info("IVF-PQ add phase started"); + addVectorsFromTempFile(writer); + LOG.info("IVF-PQ add phase done in {} ms", System.currentTimeMillis() - phaseStart); + + // Phase 3: Write index + phaseStart = System.currentTimeMillis(); + LOG.info("IVF-PQ write phase started"); + String fileName = fileWriter.newFileName(FILE_NAME_PREFIX); + try (PositionOutputStream out = fileWriter.newOutputStream(fileName)) { + writer.writeIndex(out); + out.flush(); + } + LOG.info("IVF-PQ write phase done in {} ms", System.currentTimeMillis() - phaseStart); + + LOG.info( + "IVF-PQ index build completed in {} ms", + 
System.currentTimeMillis() - buildStart); + + IvfpqIndexMeta meta = new IvfpqIndexMeta(options); + return new ResultEntry(fileName, logicalRowId, meta.serialize()); + } + } + + private void trainFromTempFile(IVFPQWriter writer) throws IOException { + double sampleRatio = options.trainSampleRatio(); + int minTrainSize = (int) Math.min(count, Math.max(options.nlist() * 39L, 256)); + int sampleCount; + if (sampleRatio >= 1.0) { + sampleCount = (int) count; + } else { + sampleCount = Math.max((int) (count * sampleRatio), minTrainSize); + sampleCount = (int) Math.min(sampleCount, count); + } + + float[] trainData = new float[sampleCount * dim]; + + try (RandomAccessFile raf = new RandomAccessFile(tempVectorFile, "r"); + FileChannel channel = raf.getChannel()) { + ByteBuffer readBuf = ByteBuffer.allocateDirect(IO_BUFFER_SIZE); + readBuf.order(ByteOrder.nativeOrder()); + readBuf.limit(0); + + if (sampleCount == (int) count) { + // Read all vectors + for (int i = 0; i < sampleCount; i++) { + ensureAvailable(readBuf, channel, recordSizeInBytes); + readBuf.getLong(); // skip rowId + for (int d = 0; d < dim; d++) { + trainData[i * dim + d] = readBuf.getFloat(); + } + } + } else { + // Reservoir sampling + Random rng = new Random(42); + int collected = 0; + for (long i = 0; i < count; i++) { + ensureAvailable(readBuf, channel, recordSizeInBytes); + readBuf.getLong(); // skip rowId + if (collected < sampleCount) { + for (int d = 0; d < dim; d++) { + trainData[collected * dim + d] = readBuf.getFloat(); + } + collected++; + } else { + int j = rng.nextInt((int) (i + 1)); + if (j < sampleCount) { + for (int d = 0; d < dim; d++) { + trainData[j * dim + d] = readBuf.getFloat(); + } + } else { + readBuf.position(readBuf.position() + dim * Float.BYTES); + } + } + } + } + } + + writer.train(trainData, sampleCount); + } + + private void addVectorsFromTempFile(IVFPQWriter writer) throws IOException { + int batchSize = options.addBatchSize(); + long[] batchIds = new long[batchSize]; + 
float[] batchVectors = new float[batchSize * dim]; + + try (RandomAccessFile raf = new RandomAccessFile(tempVectorFile, "r"); + FileChannel channel = raf.getChannel()) { + ByteBuffer readBuf = ByteBuffer.allocateDirect(IO_BUFFER_SIZE); + readBuf.order(ByteOrder.nativeOrder()); + readBuf.limit(0); + + long remaining = count; + int lastLoggedPercent = -1; + + while (remaining > 0) { + int thisBatch = (int) Math.min(batchSize, remaining); + for (int i = 0; i < thisBatch; i++) { + ensureAvailable(readBuf, channel, recordSizeInBytes); + batchIds[i] = readBuf.getLong(); + for (int d = 0; d < dim; d++) { + batchVectors[i * dim + d] = readBuf.getFloat(); + } + } + writer.addVectors(batchIds, batchVectors, thisBatch); + remaining -= thisBatch; + + int percent = (int) ((count - remaining) * 100 / count); + if (percent / 10 > lastLoggedPercent / 10) { + LOG.info( + "IVF-PQ add progress: {}/{} vectors ({}%)", + count - remaining, + count, + percent); + lastLoggedPercent = percent; + } + } + } + } + + private static void ensureAvailable(ByteBuffer readBuf, FileChannel channel, int minBytes) + throws IOException { + int zeroReadCount = 0; + while (readBuf.remaining() < minBytes) { + readBuf.compact(); + int bytesRead = channel.read(readBuf); + readBuf.flip(); + if (bytesRead == -1) { + throw new IOException("Unexpected end of temp file"); + } + if (bytesRead == 0) { + if (++zeroReadCount > 100) { + throw new IOException( + "Unable to read from temp file: repeated zero-byte reads"); + } + } else { + zeroReadCount = 0; + } + } + } + + private void checkDimension(int actualDim) { + if (actualDim != dim) { + throw new IllegalArgumentException( + String.format( + "Vector dimension mismatch: expected %d, but got %d", dim, actualDim)); + } + } + + private void checkFinite(float value, int elementIndex) { + if (!Float.isFinite(value)) { + throw new IllegalArgumentException( + String.format( + "Vector element at rowId=%d, index=%d is %s", + logicalRowId, elementIndex, 
Float.toString(value))); + } + } + + private static int checkedRecordSize(int dim, int bufferCapacity) { + long recordSize = Long.BYTES + (long) dim * Float.BYTES; + if (recordSize > bufferCapacity || recordSize > Integer.MAX_VALUE) { + throw new IllegalStateException( + "Vector record size " + + recordSize + + " exceeds buffer capacity " + + bufferCapacity); + } + return (int) recordSize; + } + + @Override + public void close() { + if (!closed) { + closed = true; + try { + if (writeChannel != null) { + writeChannel.close(); + } + } catch (IOException ignored) { + } + writeBuf = null; + if (tempVectorFile != null) { + tempVectorFile.delete(); + tempVectorFile = null; + } + } + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java new file mode 100644 index 000000000000..9428631bde7f --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.globalindex.GlobalIndexIOMeta; +import org.apache.paimon.globalindex.GlobalIndexReader; +import org.apache.paimon.globalindex.GlobalIndexWriter; +import org.apache.paimon.globalindex.GlobalIndexer; +import org.apache.paimon.globalindex.io.GlobalIndexFileReader; +import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.DataType; + +import java.util.List; +import java.util.concurrent.ExecutorService; + +/** IVF-PQ vector global indexer. */ +public class IvfpqVectorGlobalIndexer implements GlobalIndexer { + + private final DataType fieldType; + private final IvfpqVectorIndexOptions options; + + public IvfpqVectorGlobalIndexer(DataType fieldType, Options options) { + this.fieldType = fieldType; + this.options = new IvfpqVectorIndexOptions(options); + } + + @Override + public GlobalIndexWriter createWriter(GlobalIndexFileWriter fileWriter) { + return new IvfpqVectorGlobalIndexWriter(fileWriter, fieldType, options); + } + + @Override + public GlobalIndexReader createReader( + GlobalIndexFileReader fileReader, + List files, + ExecutorService executor) { + return new IvfpqVectorGlobalIndexReader(fileReader, files, fieldType, options, executor); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java new file mode 100644 index 000000000000..21974d8e2880 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.globalindex.GlobalIndexer; +import org.apache.paimon.globalindex.GlobalIndexerFactory; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.DataField; + +/** Factory for creating IVF-PQ vector index. */ +public class IvfpqVectorGlobalIndexerFactory implements GlobalIndexerFactory { + + public static final String IDENTIFIER = "ivfpq"; + + @Override + public String identifier() { + return IDENTIFIER; + } + + @Override + public GlobalIndexer create(DataField field, Options options) { + return new IvfpqVectorGlobalIndexer(field.type(), options); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java new file mode 100644 index 000000000000..4663792ce25f --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.options.ConfigOption; +import org.apache.paimon.options.ConfigOptions; +import org.apache.paimon.options.Options; + +/** Options for the IVF-PQ vector index. */ +public class IvfpqVectorIndexOptions { + + public static final ConfigOption DIMENSION = + ConfigOptions.key("ivfpq.index.dimension") + .intType() + .defaultValue(128) + .withDescription("The dimension of the vector."); + + public static final ConfigOption DISTANCE_METRIC = + ConfigOptions.key("ivfpq.distance.metric") + .stringType() + .defaultValue("inner_product") + .withDescription( + "Distance metric for vector search (l2, cosine, inner_product)."); + + public static final ConfigOption NLIST = + ConfigOptions.key("ivfpq.nlist") + .intType() + .defaultValue(256) + .withDescription("Number of IVF partitions (Voronoi cells)."); + + public static final ConfigOption M = + ConfigOptions.key("ivfpq.m") + .intType() + .defaultValue(16) + .withDescription( + "Number of PQ sub-quantizers. 
Must divide the vector dimension."); + + public static final ConfigOption USE_OPQ = + ConfigOptions.key("ivfpq.use_opq") + .booleanType() + .defaultValue(false) + .withDescription( + "Whether to use OPQ (Optimized Product Quantization) rotation."); + + public static final ConfigOption NPROBE = + ConfigOptions.key("ivfpq.nprobe") + .intType() + .defaultValue(16) + .withDescription("Number of IVF partitions to probe during search."); + + public static final ConfigOption TRAIN_SAMPLE_RATIO = + ConfigOptions.key("ivfpq.train.sample_ratio") + .doubleType() + .defaultValue(1.0) + .withDescription( + "Ratio of vectors sampled for training (0.0-1.0]. " + + "1.0 means use all vectors for training."); + + public static final ConfigOption ADD_BATCH_SIZE = + ConfigOptions.key("ivfpq.add.batch_size") + .intType() + .defaultValue(10000) + .withDescription("Batch size for adding vectors after training."); + + private final int dimension; + private final IvfpqVectorMetric metric; + private final int nlist; + private final int m; + private final boolean useOpq; + private final int nprobe; + private final double trainSampleRatio; + private final int addBatchSize; + + public IvfpqVectorIndexOptions(Options options) { + this.dimension = validatePositive(options.get(DIMENSION), DIMENSION.key()); + this.metric = parseMetric(options.get(DISTANCE_METRIC)); + this.nlist = validatePositive(options.get(NLIST), NLIST.key()); + this.m = validatePositive(options.get(M), M.key()); + this.useOpq = options.get(USE_OPQ); + this.nprobe = validatePositive(options.get(NPROBE), NPROBE.key()); + this.trainSampleRatio = options.get(TRAIN_SAMPLE_RATIO); + this.addBatchSize = validatePositive(options.get(ADD_BATCH_SIZE), ADD_BATCH_SIZE.key()); + + if (dimension % m != 0) { + throw new IllegalArgumentException( + String.format( + "ivfpq.m (%d) must divide ivfpq.index.dimension (%d)", m, dimension)); + } + if (trainSampleRatio <= 0 || trainSampleRatio > 1.0) { + throw new IllegalArgumentException( + 
String.format( + "ivfpq.train.sample_ratio must be in (0, 1.0], but got %f", + trainSampleRatio)); + } + } + + public int dimension() { + return dimension; + } + + public IvfpqVectorMetric metric() { + return metric; + } + + public int nlist() { + return nlist; + } + + public int m() { + return m; + } + + public boolean useOpq() { + return useOpq; + } + + public int nprobe() { + return nprobe; + } + + public double trainSampleRatio() { + return trainSampleRatio; + } + + public int addBatchSize() { + return addBatchSize; + } + + private static IvfpqVectorMetric parseMetric(String value) { + try { + return IvfpqVectorMetric.fromConfigName(value); + } catch (IllegalArgumentException e) { + return IvfpqVectorMetric.fromString(value); + } + } + + private static int validatePositive(int value, String key) { + if (value <= 0) { + throw new IllegalArgumentException( + String.format( + "Invalid value for '%s': %d. Must be a positive integer.", key, value)); + } + return value; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java new file mode 100644 index 000000000000..95d612b432a3 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.index.ivfpq.Metric; + +/** Enumeration of supported IVF-PQ vector distance metrics. */ +public enum IvfpqVectorMetric { + L2("l2", Metric.L2), + COSINE("cosine", Metric.COSINE), + INNER_PRODUCT("inner_product", Metric.INNER_PRODUCT); + + private final String configName; + private final Metric nativeMetric; + + IvfpqVectorMetric(String configName, Metric nativeMetric) { + this.configName = configName; + this.nativeMetric = nativeMetric; + } + + public String getConfigName() { + return configName; + } + + public Metric toNativeMetric() { + return nativeMetric; + } + + public static IvfpqVectorMetric fromConfigName(String configName) { + for (IvfpqVectorMetric m : values()) { + if (m.configName.equals(configName)) { + return m; + } + } + throw new IllegalArgumentException("Unknown metric: " + configName); + } + + public static IvfpqVectorMetric fromString(String name) { + return valueOf(name.toUpperCase()); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory b/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory new file mode 100644 index 000000000000..5b8845427874 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory @@ -0,0 +1 @@ +org.apache.paimon.ivfpq.index.IvfpqVectorGlobalIndexerFactory diff --git a/paimon-ivfpq/paimon-ivfpq-jni/pom.xml 
b/paimon-ivfpq/paimon-ivfpq-jni/pom.xml new file mode 100644 index 000000000000..7411564074d2 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/pom.xml @@ -0,0 +1,69 @@ + + + + 4.0.0 + + + paimon-ivfpq + org.apache.paimon + 1.5-SNAPSHOT + + + paimon-ivfpq-jni + Paimon : IVF-PQ JNI + + + 1.8 + true + true + true + + + + + org.apache.paimon + paimon-shade-guava-30 + ${paimon.shade.guava.version}-${paimon.shade.version} + + + + org.junit.jupiter + junit-jupiter + ${junit5.version} + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + 1 + true + none + + + + + diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQBatchResult.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQBatchResult.java new file mode 100644 index 000000000000..0ef0067ec958 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQBatchResult.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.index.ivfpq; + +import java.util.Arrays; + +/** Result of a batch IVF-PQ search query. 
/**
 * Immutable result of a batched IVF-PQ search.
 *
 * <p>Ids and distances are stored flattened: the entries of query {@code q} occupy positions
 * {@code [q * topK, (q + 1) * topK)}. Arrays are copied on the way in and on the way out.
 */
public final class IVFPQBatchResult {

    private final long[] ids;
    private final float[] distances;
    private final int queryCount;
    private final int topK;

    /**
     * @throws NullPointerException if {@code ids} or {@code distances} is null
     * @throws IllegalArgumentException if a count is negative, {@code queryCount * topK}
     *     overflows an int, or either array's length differs from {@code queryCount * topK}
     */
    public IVFPQBatchResult(long[] ids, float[] distances, int queryCount, int topK) {
        if (ids == null) {
            throw new NullPointerException("ids");
        }
        if (distances == null) {
            throw new NullPointerException("distances");
        }
        if (queryCount < 0) {
            throw new IllegalArgumentException("queryCount must be >= 0");
        }
        if (topK < 0) {
            throw new IllegalArgumentException("topK must be >= 0");
        }
        final int flatLength = flattenedLength(queryCount, topK);
        requireLength("ids", ids.length, flatLength);
        requireLength("distances", distances.length, flatLength);

        this.queryCount = queryCount;
        this.topK = topK;
        this.ids = ids.clone();
        this.distances = distances.clone();
    }

    /** Number of queries in the batch. */
    public int queryCount() {
        return queryCount;
    }

    /** Number of result slots per query. */
    public int topK() {
        return topK;
    }

    /** Returns a copy of all ids, flattened query by query. */
    public long[] ids() {
        return ids.clone();
    }

    /** Returns a copy of all distances, flattened query by query. */
    public float[] distances() {
        return distances.clone();
    }

    /**
     * Returns a copy of the {@code topK} ids belonging to one query.
     *
     * @throws IndexOutOfBoundsException if {@code queryIndex} is outside {@code [0, queryCount)}
     */
    public long[] idsForQuery(int queryIndex) {
        checkQueryIndex(queryIndex);
        final int from = queryIndex * topK;
        return Arrays.copyOfRange(ids, from, from + topK);
    }

    /**
     * Returns a copy of the {@code topK} distances belonging to one query.
     *
     * @throws IndexOutOfBoundsException if {@code queryIndex} is outside {@code [0, queryCount)}
     */
    public float[] distancesForQuery(int queryIndex) {
        checkQueryIndex(queryIndex);
        final int from = queryIndex * topK;
        return Arrays.copyOfRange(distances, from, from + topK);
    }

    private void checkQueryIndex(int queryIndex) {
        final boolean inRange = queryIndex >= 0 && queryIndex < queryCount;
        if (!inRange) {
            throw new IndexOutOfBoundsException(
                    "queryIndex " + queryIndex + " out of range [0, " + queryCount + ')');
        }
    }

    /** Fails when an array does not hold exactly one slot per (query, rank) pair. */
    private static void requireLength(String what, int actual, int expected) {
        if (actual != expected) {
            throw new IllegalArgumentException(
                    what + " length " + actual + " != queryCount * topK " + expected);
        }
    }

    /** Multiplies in long arithmetic so an int overflow is detected instead of wrapping. */
    private static int flattenedLength(int queryCount, int topK) {
        final long product = (long) queryCount * (long) topK;
        if (product > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("queryCount * topK overflows int");
        }
        return (int) product;
    }
}
+ */ + +package org.apache.paimon.index.ivfpq; + +final class IVFPQNative { + + static { + NativeLoader.loadJni(); + } + + private IVFPQNative() {} + + static native long createWriter(int d, int nlist, int m, int metric, boolean useOpq); + + static native void train(long ptr, float[] data, int n); + + static native void addVectors(long ptr, long[] ids, float[] data, int n); + + static native void writeIndex(long ptr, Object streamOutput); + + static native void freeWriter(long ptr); + + static native long openReader(Object streamInput); + + static native IVFPQResult search(long ptr, float[] query, int k, int nprobe); + + static native IVFPQResult searchWithRoaringFilter( + long ptr, float[] query, int k, int nprobe, byte[] roaringFilter); + + static native int getDimension(long ptr); + + static native long getTotalVectors(long ptr); + + static native IVFPQBatchResult searchBatch( + long ptr, float[] queries, int queryCount, int k, int nprobe); + + static native IVFPQBatchResult searchBatchWithRoaringFilter( + long ptr, float[] queries, int queryCount, int k, int nprobe, byte[] roaringFilter); + + static native void freeReader(long ptr); +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQReader.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQReader.java new file mode 100644 index 000000000000..51d5cc094713 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQReader.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.index.ivfpq; + +/** Opens a serialized IVF-PQ index for search. */ +public final class IVFPQReader implements AutoCloseable { + + private long nativePtr; + + public IVFPQReader(Object input) { + if (input == null) { + throw new NullPointerException("input"); + } + this.nativePtr = IVFPQNative.openReader(input); + } + + public int dimension() { + return IVFPQNative.getDimension(requireOpen()); + } + + public long totalVectors() { + return IVFPQNative.getTotalVectors(requireOpen()); + } + + public IVFPQResult search(float[] query, int topK, int nprobe) { + if (query == null) { + throw new NullPointerException("query"); + } + validatePositive(topK, "topK"); + validatePositive(nprobe, "nprobe"); + return IVFPQNative.search(requireOpen(), query, topK, nprobe); + } + + public IVFPQResult search(float[] query, int topK, int nprobe, byte[] roaringFilter) { + if (query == null) { + throw new NullPointerException("query"); + } + if (roaringFilter == null) { + throw new NullPointerException("roaringFilter"); + } + validatePositive(topK, "topK"); + validatePositive(nprobe, "nprobe"); + return IVFPQNative.searchWithRoaringFilter(requireOpen(), query, topK, nprobe, roaringFilter); + } + + public IVFPQBatchResult searchBatch(float[] queries, int queryCount, int topK, int nprobe) { + if (queries == null) { + throw new NullPointerException("queries"); + } + validatePositive(queryCount, "queryCount"); + validatePositive(topK, "topK"); + validatePositive(nprobe, "nprobe"); + return IVFPQNative.searchBatch(requireOpen(), queries, 
queryCount, topK, nprobe); + } + + public IVFPQBatchResult searchBatch( + float[] queries, int queryCount, int topK, int nprobe, byte[] roaringFilter) { + if (queries == null) { + throw new NullPointerException("queries"); + } + if (roaringFilter == null) { + throw new NullPointerException("roaringFilter"); + } + validatePositive(queryCount, "queryCount"); + validatePositive(topK, "topK"); + validatePositive(nprobe, "nprobe"); + return IVFPQNative.searchBatchWithRoaringFilter( + requireOpen(), queries, queryCount, topK, nprobe, roaringFilter); + } + + @Override + public void close() { + long ptr = nativePtr; + nativePtr = 0L; + if (ptr != 0L) { + IVFPQNative.freeReader(ptr); + } + } + + private long requireOpen() { + if (nativePtr == 0L) { + throw new IllegalStateException("IVFPQReader is closed"); + } + return nativePtr; + } + + private static void validatePositive(int value, String name) { + if (value <= 0) { + throw new IllegalArgumentException(name + " must be > 0"); + } + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQResult.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQResult.java new file mode 100644 index 000000000000..3b6809b084c9 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQResult.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
/**
 * Immutable result of a single IVF-PQ search: parallel arrays of ids and distances.
 *
 * <p>Arrays are copied on construction and again by each accessor.
 */
public final class IVFPQResult {

    private final long[] ids;
    private final float[] distances;

    /**
     * @throws NullPointerException if either array is null
     * @throws IllegalArgumentException if the arrays differ in length
     */
    public IVFPQResult(long[] ids, float[] distances) {
        if (ids == null) {
            throw new NullPointerException("ids");
        }
        if (distances == null) {
            throw new NullPointerException("distances");
        }
        final int idCount = ids.length;
        final int distanceCount = distances.length;
        if (idCount != distanceCount) {
            throw new IllegalArgumentException(
                    "ids length " + idCount + " != distances length " + distanceCount);
        }
        this.ids = ids.clone();
        this.distances = distances.clone();
    }

    /** Number of hits. */
    public int size() {
        return ids.length;
    }

    /** Returns a copy of the ids. */
    public long[] ids() {
        return ids.clone();
    }

    /** Returns a copy of the distances. */
    public float[] distances() {
        return distances.clone();
    }

    @Override
    public String toString() {
        return String.format(
                "IVFPQResult{ids=%s, distances=%s}",
                Arrays.toString(ids), Arrays.toString(distances));
    }
}
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.index.ivfpq; + +/** Builds an IVF-PQ index via the native Rust library. */ +public final class IVFPQWriter implements AutoCloseable { + + private final int dimension; + private long nativePtr; + + public IVFPQWriter(int dimension, int nlist, int m, Metric metric, boolean useOpq) { + if (metric == null) { + throw new NullPointerException("metric"); + } + validatePositive(dimension, "dimension"); + validatePositive(nlist, "nlist"); + validatePositive(m, "m"); + if (dimension % m != 0) { + throw new IllegalArgumentException("dimension must be divisible by m"); + } + this.dimension = dimension; + this.nativePtr = IVFPQNative.createWriter(dimension, nlist, m, metric.code(), useOpq); + } + + public int dimension() { + return dimension; + } + + public void train(float[] data, int vectorCount) { + validateVectors(data, vectorCount); + IVFPQNative.train(requireOpen(), data, vectorCount); + } + + public void addVectors(long[] ids, float[] data, int vectorCount) { + if (ids == null) { + throw new NullPointerException("ids"); + } + validateVectors(data, vectorCount); + if (ids.length < vectorCount) { + throw new IllegalArgumentException( + "ids length " + ids.length + " < vectorCount " + vectorCount); + } + IVFPQNative.addVectors(requireOpen(), ids, data, vectorCount); + } + + public void writeIndex(Object output) { + if (output == null) { + throw new 
NullPointerException("output"); + } + IVFPQNative.writeIndex(requireOpen(), output); + } + + @Override + public void close() { + long ptr = nativePtr; + nativePtr = 0L; + if (ptr != 0L) { + IVFPQNative.freeWriter(ptr); + } + } + + private void validateVectors(float[] data, int vectorCount) { + if (data == null) { + throw new NullPointerException("data"); + } + validatePositive(vectorCount, "vectorCount"); + long expected = (long) vectorCount * (long) dimension; + if (expected > Integer.MAX_VALUE) { + throw new IllegalArgumentException("vectorCount * dimension overflows int"); + } + if (data.length < expected) { + throw new IllegalArgumentException( + "data length " + data.length + " < vectorCount * dimension " + expected); + } + } + + private long requireOpen() { + if (nativePtr == 0L) { + throw new IllegalStateException("IVFPQWriter is closed"); + } + return nativePtr; + } + + private static void validatePositive(int value, String name) { + if (value <= 0) { + throw new IllegalArgumentException(name + " must be > 0"); + } + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java new file mode 100644 index 000000000000..385e477863e1 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
/**
 * Distance metric for IVF-PQ vector search.
 *
 * <p>Each constant carries an explicit integer code, so the value does not depend on declaration
 * order.
 */
public enum Metric {
    L2(0),
    INNER_PRODUCT(1),
    COSINE(2);

    private final int nativeCode;

    Metric(int nativeCode) {
        this.nativeCode = nativeCode;
    }

    /** Returns the integer code of this metric. */
    public int code() {
        return this.nativeCode;
    }
}
+ */ + +package org.apache.paimon.index.ivfpq; + +import org.apache.paimon.shade.guava30.com.google.common.io.ByteStreams; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Locale; + +/** Utility class for loading the native IVF-PQ JNI library. */ +public final class NativeLoader { + private static boolean loaded = false; + + private NativeLoader() {} + + public static synchronized void loadJni() { + if (loaded) { + return; + } + + String osName = System.getProperty("os.name").toLowerCase(Locale.ROOT); + String osArch = System.getProperty("os.arch").toLowerCase(Locale.ROOT); + String libName = "libpaimon_vindex_jni"; + + String libExt; + String osShortName; + if (osName.contains("win")) { + osShortName = "win"; + libExt = ".dll"; + libName += libExt; + } else if (osName.contains("mac")) { + osShortName = "darwin"; + libExt = ".dylib"; + libName += libExt; + } else if (osName.contains("nix") || osName.contains("nux")) { + osShortName = "linux"; + libExt = ".so"; + libName += libExt; + } else { + throw new UnsupportedOperationException("Unsupported OS: " + osName); + } + + String libPath = "/native/" + osShortName + "-" + osArch + "/" + libName; + try (InputStream in = NativeLoader.class.getResourceAsStream(libPath)) { + if (in == null) { + throw new FileNotFoundException("Library not found: " + libPath); + } + File tempFile = File.createTempFile("libpaimon_vindex_jni", libExt); + tempFile.deleteOnExit(); + + try (OutputStream out = new FileOutputStream(tempFile)) { + ByteStreams.copy(in, out); + } + libName = tempFile.getAbsolutePath(); + } catch (IOException e) { + throw new RuntimeException("Failed to load library: " + e.getMessage(), e); + } + + System.load(libName); + loaded = true; + } +} diff --git a/paimon-ivfpq/pom.xml b/paimon-ivfpq/pom.xml new file mode 100644 index 000000000000..8e1ec5bc8779 --- /dev/null 
+++ b/paimon-ivfpq/pom.xml @@ -0,0 +1,39 @@ + + + + 4.0.0 + + + paimon-parent + org.apache.paimon + 1.5-SNAPSHOT + + + paimon-ivfpq + Paimon : IVF-PQ : + pom + + + paimon-ivfpq-jni + paimon-ivfpq-index + + diff --git a/pom.xml b/pom.xml index d2c02ae4d60c..05e0fa264d4a 100644 --- a/pom.xml +++ b/pom.xml @@ -76,6 +76,7 @@ under the License. paimon-vortex paimon-mosaic paimon-tantivy + paimon-ivfpq From 92d4b64710374134b440ae6c9169b2e3d59dd9d4 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Tue, 9 Jun 2026 10:29:44 +0800 Subject: [PATCH 02/11] Fix comments --- .../paimon/ivfpq/index/IvfpqIndexMeta.java | 3 +- .../index/IvfpqVectorGlobalIndexWriter.java | 11 +- ...he.paimon.globalindex.GlobalIndexerFactory | 15 + .../index/IvfpqVectorGlobalIndexTest.java | 401 ++++++++++++++++++ .../IvfpqVectorGlobalIndexerFactoryTest.java | 40 ++ .../index/IvfpqVectorIndexOptionsTest.java | 110 +++++ 6 files changed, 571 insertions(+), 9 deletions(-) create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java index 2b38723ee4b3..2f1f0dc7f731 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java @@ -96,7 +96,8 @@ public byte[] serialize() throws IOException { public static IvfpqIndexMeta deserialize(byte[] data) throws IOException { Map map = OBJECT_MAPPER.readValue(data, MAP_TYPE_REF); if 
(!map.containsKey(KEY_DIMENSION)) { - throw new IOException("Missing required key in IVF-PQ index metadata: " + KEY_DIMENSION); + throw new IOException( + "Missing required key in IVF-PQ index metadata: " + KEY_DIMENSION); } if (!map.containsKey(KEY_METRIC)) { throw new IOException("Missing required key in IVF-PQ index metadata: " + KEY_METRIC); diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java index 36e03ed39c77..6bec7f4dee46 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java @@ -77,9 +77,7 @@ public class IvfpqVectorGlobalIndexWriter implements GlobalIndexSingletonWriter, private long logicalRowId; public IvfpqVectorGlobalIndexWriter( - GlobalIndexFileWriter fileWriter, - DataType fieldType, - IvfpqVectorIndexOptions options) { + GlobalIndexFileWriter fileWriter, DataType fieldType, IvfpqVectorIndexOptions options) { this.fileWriter = fileWriter; this.options = options; this.dim = options.dimension(); @@ -237,8 +235,7 @@ private ResultEntry buildIndex() throws IOException { // Phase 1: Train long phaseStart = System.currentTimeMillis(); - LOG.info( - "IVF-PQ train phase started (sample_ratio={})", options.trainSampleRatio()); + LOG.info("IVF-PQ train phase started (sample_ratio={})", options.trainSampleRatio()); trainFromTempFile(writer); LOG.info("IVF-PQ train phase done in {} ms", System.currentTimeMillis() - phaseStart); @@ -354,9 +351,7 @@ private void addVectorsFromTempFile(IVFPQWriter writer) throws IOException { if (percent / 10 > lastLoggedPercent / 10) { LOG.info( "IVF-PQ add progress: {}/{} vectors ({}%)", - count - remaining, - count, - percent); + count - remaining, count, percent); 
lastLoggedPercent = percent; } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory b/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory index 5b8845427874..e9be2ab97789 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory @@ -1 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + org.apache.paimon.ivfpq.index.IvfpqVectorGlobalIndexerFactory diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java new file mode 100644 index 000000000000..3364fbc9f657 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java @@ -0,0 +1,401 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.globalindex.GlobalIndexIOMeta; +import org.apache.paimon.globalindex.ResultEntry; +import org.apache.paimon.globalindex.ScoredGlobalIndexResult; +import org.apache.paimon.globalindex.io.GlobalIndexFileReader; +import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; +import org.apache.paimon.index.ivfpq.NativeLoader; +import org.apache.paimon.options.Options; +import org.apache.paimon.predicate.VectorSearch; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.VectorType; +import org.apache.paimon.utils.RoaringNavigableMap64; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.UUID; +import 
java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Tests for {@link IvfpqVectorGlobalIndexWriter} and {@link IvfpqVectorGlobalIndexReader}. */ +public class IvfpqVectorGlobalIndexTest { + + @TempDir java.nio.file.Path tempDir; + + private FileIO fileIO; + private Path indexPath; + private DataType vectorType; + private final String fieldName = "vec"; + private ExecutorService executor; + + private static boolean isNativeAvailable() { + try { + NativeLoader.loadJni(); + return true; + } catch (Throwable t) { + return false; + } + } + + @BeforeEach + public void setup() { + fileIO = new LocalFileIO(); + indexPath = new Path(tempDir.toString()); + vectorType = new ArrayType(new FloatType()); + executor = Executors.newCachedThreadPool(); + } + + @AfterEach + public void cleanup() throws IOException { + if (executor != null) { + executor.shutdownNow(); + } + if (fileIO != null) { + fileIO.delete(indexPath, true); + } + } + + // =================== Tests that do NOT need native library ===================== + + @Test + public void testDimensionMismatch() { + Options options = createDefaultOptions(64); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + IvfpqVectorGlobalIndexWriter writer = + new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + + float[] wrongDimVector = new float[32]; + assertThatThrownBy(() -> writer.write(wrongDimVector)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("dimension mismatch"); + } + + @Test + public void testVectorTypeRejectsNonFloatElement() { + DataType intVecType = new VectorType(2, new IntType()); + Options options = createDefaultOptions(2); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + 
GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + + assertThatThrownBy( + () -> + new IvfpqVectorGlobalIndexWriter( + fileWriter, intVecType, indexOptions)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("float"); + } + + @Test + public void testNanInVectorRejected() { + Options options = createDefaultOptions(2); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + IvfpqVectorGlobalIndexWriter writer = + new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + + assertThatThrownBy(() -> writer.write(new float[] {1.0f, Float.NaN})) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("rowId=0") + .hasMessageContaining("index=1") + .hasMessageContaining("NaN"); + } + + @Test + public void testInfinityInVectorRejected() { + Options options = createDefaultOptions(2); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + IvfpqVectorGlobalIndexWriter writer = + new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + + writer.write(null); // row 0 - null, advances logicalRowId + assertThatThrownBy(() -> writer.write(new float[] {Float.POSITIVE_INFINITY, 0.0f})) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("rowId=1") + .hasMessageContaining("index=0") + .hasMessageContaining("Infinity"); + } + + @Test + public void testAllNullReturnsEmpty() { + Options options = createDefaultOptions(2); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + IvfpqVectorGlobalIndexWriter writer = + new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + + writer.write(null); + writer.write(null); + writer.write(null); + + List results = writer.finish(); + 
assertThat(results).isEmpty(); + } + + @Test + public void testMetaSerializationRoundTrip() throws IOException { + Options options = createDefaultOptions(32); + options.setString("ivfpq.distance.metric", "cosine"); + options.setInteger("ivfpq.nlist", 64); + options.setInteger("ivfpq.m", 8); + options.setBoolean("ivfpq.use_opq", true); + options.setInteger("ivfpq.nprobe", 24); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + + IvfpqIndexMeta meta = new IvfpqIndexMeta(indexOptions); + byte[] serialized = meta.serialize(); + IvfpqIndexMeta deserialized = IvfpqIndexMeta.deserialize(serialized); + + assertThat(deserialized.dimension()).isEqualTo(32); + assertThat(deserialized.metric()).isEqualTo(IvfpqVectorMetric.COSINE); + assertThat(deserialized.nlist()).isEqualTo(64); + assertThat(deserialized.m()).isEqualTo(8); + assertThat(deserialized.useOpq()).isTrue(); + assertThat(deserialized.nprobe()).isEqualTo(24); + } + + // =================== Tests that NEED native library ===================== + + @Test + public void testFloatVectorEndToEnd() throws IOException { + Assumptions.assumeTrue(isNativeAvailable(), "IVF-PQ native library not available"); + + int dimension = 2; + Options options = createDefaultOptions(dimension); + options.setInteger("ivfpq.nlist", 2); + options.setInteger("ivfpq.m", 1); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + + float[][] vectors = + new float[][] { + new float[] {1.0f, 0.0f}, + new float[] {0.95f, 0.1f}, + new float[] {0.1f, 0.95f}, + new float[] {0.98f, 0.05f}, + new float[] {0.0f, 1.0f}, + new float[] {0.05f, 0.98f} + }; + + GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + IvfpqVectorGlobalIndexWriter writer = + new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + Arrays.stream(vectors).forEach(writer::write); + List results = writer.finish(); + List metas = toIOMetas(results, indexPath); + + GlobalIndexFileReader fileReader = 
createFileReader(indexPath); + try (IvfpqVectorGlobalIndexReader reader = + new IvfpqVectorGlobalIndexReader( + fileReader, metas, vectorType, indexOptions, executor)) { + VectorSearch vectorSearch = new VectorSearch(vectors[0], 3, fieldName); + ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); + assertThat(result.results().getLongCardinality()).isEqualTo(3); + assertThat(result.results().contains(0L)).isTrue(); + float score = result.scoreGetter().score(0L); + assertThat(score).isNotNaN(); + } + } + + @Test + public void testSearchWithRoaringFilter() throws IOException { + Assumptions.assumeTrue(isNativeAvailable(), "IVF-PQ native library not available"); + + int dimension = 2; + Options options = createDefaultOptions(dimension); + options.setInteger("ivfpq.nlist", 2); + options.setInteger("ivfpq.m", 1); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + + float[][] vectors = + new float[][] { + new float[] {1.0f, 0.0f}, + new float[] {0.95f, 0.1f}, + new float[] {0.9f, 0.2f}, + new float[] {-1.0f, 0.0f}, + new float[] {-0.95f, 0.1f}, + new float[] {-0.9f, 0.2f} + }; + + GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + IvfpqVectorGlobalIndexWriter writer = + new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + Arrays.stream(vectors).forEach(writer::write); + List results = writer.finish(); + List metas = toIOMetas(results, indexPath); + + GlobalIndexFileReader fileReader = createFileReader(indexPath); + try (IvfpqVectorGlobalIndexReader reader = + new IvfpqVectorGlobalIndexReader( + fileReader, metas, vectorType, indexOptions, executor)) { + + // Filter to rows {1, 4} only + RoaringNavigableMap64 filter = new RoaringNavigableMap64(); + filter.add(1L); + filter.add(4L); + VectorSearch search = + new VectorSearch(vectors[0], 6, fieldName).withIncludeRowIds(filter); + ScoredGlobalIndexResult result = reader.visitVectorSearch(search).join().get(); + 
assertThat(result.results().contains(1L)).isTrue(); + assertThat(result.results().contains(4L)).isTrue(); + assertThat(result.results().getLongCardinality()).isEqualTo(2); + } + } + + @Test + public void testNullVectorSkipWithCorrectIds() throws IOException { + Assumptions.assumeTrue(isNativeAvailable(), "IVF-PQ native library not available"); + + int dimension = 2; + Options options = createDefaultOptions(dimension); + options.setInteger("ivfpq.nlist", 2); + options.setInteger("ivfpq.m", 1); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + + float[][] vectors = + new float[][] { + new float[] {1.0f, 0.0f}, + new float[] {0.1f, 0.95f}, + new float[] {0.0f, 1.0f} + }; + + GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + IvfpqVectorGlobalIndexWriter writer = + new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + + writer.write(vectors[0]); // row 0 + writer.write(null); // row 1 - null + writer.write(vectors[1]); // row 2 + writer.write(null); // row 3 - null + writer.write(null); // row 4 - null + writer.write(vectors[2]); // row 5 + + List results = writer.finish(); + assertThat(results).hasSize(1); + assertThat(results.get(0).rowCount()).isEqualTo(6); + + List metas = toIOMetas(results, indexPath); + GlobalIndexFileReader fileReader = createFileReader(indexPath); + try (IvfpqVectorGlobalIndexReader reader = + new IvfpqVectorGlobalIndexReader( + fileReader, metas, vectorType, indexOptions, executor)) { + VectorSearch vectorSearch = new VectorSearch(vectors[0], 3, fieldName); + ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); + assertThat(result.results().getLongCardinality()).isEqualTo(3); + assertThat(result.results().contains(0L)).isTrue(); + assertThat(result.results().contains(2L)).isTrue(); + assertThat(result.results().contains(5L)).isTrue(); + assertThat(result.results().contains(1L)).isFalse(); + assertThat(result.results().contains(3L)).isFalse(); + 
assertThat(result.results().contains(4L)).isFalse(); + } + } + + @Test + public void testViaIndexer() throws IOException { + Assumptions.assumeTrue(isNativeAvailable(), "IVF-PQ native library not available"); + + int dimension = 2; + Options options = createDefaultOptions(dimension); + options.setInteger("ivfpq.nlist", 2); + options.setInteger("ivfpq.m", 1); + + float[][] vectors = + new float[][] { + new float[] {1.0f, 0.0f}, + new float[] {0.0f, 1.0f}, + new float[] {0.7f, 0.7f} + }; + + IvfpqVectorGlobalIndexer indexer = new IvfpqVectorGlobalIndexer(vectorType, options); + + GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); + IvfpqVectorGlobalIndexWriter writer = + (IvfpqVectorGlobalIndexWriter) indexer.createWriter(fileWriter); + Arrays.stream(vectors).forEach(writer::write); + List results = writer.finish(); + List metas = toIOMetas(results, indexPath); + + GlobalIndexFileReader fileReader = createFileReader(indexPath); + try (IvfpqVectorGlobalIndexReader reader = + (IvfpqVectorGlobalIndexReader) indexer.createReader(fileReader, metas, executor)) { + VectorSearch vectorSearch = new VectorSearch(vectors[0], 2, fieldName); + ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); + assertThat(result.results().getLongCardinality()).isEqualTo(2); + assertThat(result.results().contains(0L)).isTrue(); + } + } + + // =================== Helpers ===================== + + private Options createDefaultOptions(int dimension) { + Options options = new Options(); + options.setInteger("ivfpq.index.dimension", dimension); + options.setString("ivfpq.distance.metric", "l2"); + return options; + } + + private GlobalIndexFileWriter createFileWriter(Path path) { + return new GlobalIndexFileWriter() { + @Override + public String newFileName(String prefix) { + return prefix + "-" + UUID.randomUUID(); + } + + @Override + public PositionOutputStream newOutputStream(String fileName) throws IOException { + return fileIO.newOutputStream(new 
Path(path, fileName), false); + } + }; + } + + private GlobalIndexFileReader createFileReader(Path path) { + return meta -> fileIO.newInputStream(new Path(path, meta.filePath())); + } + + private List toIOMetas(List results, Path path) + throws IOException { + assertThat(results).hasSize(1); + ResultEntry result = results.get(0); + Path filePath = new Path(path, result.fileName()); + return Collections.singletonList( + new GlobalIndexIOMeta(filePath, fileIO.getFileSize(filePath), result.meta())); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java new file mode 100644 index 000000000000..4e43af620c8c --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for IVF-PQ global indexer factory SPI registration. */ +public class IvfpqVectorGlobalIndexerFactoryTest { + + @Test + public void testIdentifier() { + assertThat(new IvfpqVectorGlobalIndexerFactory().identifier()).isEqualTo("ivfpq"); + } + + @Test + public void testLoadByIdentifier() { + assertThat(GlobalIndexerFactoryUtils.load("ivfpq")) + .isExactlyInstanceOf(IvfpqVectorGlobalIndexerFactory.class); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java new file mode 100644 index 000000000000..574c8e4dba36 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.ivfpq.index; + +import org.apache.paimon.options.Options; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Tests for {@link IvfpqVectorIndexOptions}. */ +public class IvfpqVectorIndexOptionsTest { + + @Test + public void testDefaults() { + Options options = new Options(); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + assertThat(indexOptions.dimension()).isEqualTo(128); + assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.INNER_PRODUCT); + assertThat(indexOptions.nlist()).isEqualTo(256); + assertThat(indexOptions.m()).isEqualTo(16); + assertThat(indexOptions.useOpq()).isFalse(); + assertThat(indexOptions.nprobe()).isEqualTo(16); + assertThat(indexOptions.trainSampleRatio()).isEqualTo(1.0); + assertThat(indexOptions.addBatchSize()).isEqualTo(10000); + } + + @Test + public void testCustomOptions() { + Options options = new Options(); + options.setInteger("ivfpq.index.dimension", 64); + options.setString("ivfpq.distance.metric", "l2"); + options.setInteger("ivfpq.nlist", 128); + options.setInteger("ivfpq.m", 8); + options.setBoolean("ivfpq.use_opq", true); + options.setInteger("ivfpq.nprobe", 32); + options.setDouble("ivfpq.train.sample_ratio", 0.5); + options.setInteger("ivfpq.add.batch_size", 5000); + + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + assertThat(indexOptions.dimension()).isEqualTo(64); + assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.L2); + assertThat(indexOptions.nlist()).isEqualTo(128); + assertThat(indexOptions.m()).isEqualTo(8); + assertThat(indexOptions.useOpq()).isTrue(); + assertThat(indexOptions.nprobe()).isEqualTo(32); + assertThat(indexOptions.trainSampleRatio()).isEqualTo(0.5); + assertThat(indexOptions.addBatchSize()).isEqualTo(5000); + } + + @Test + public void testMDivisibilityValidation() { 
+ Options options = new Options(); + options.setInteger("ivfpq.index.dimension", 10); + options.setInteger("ivfpq.m", 3); + assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("must divide"); + } + + @Test + public void testInvalidSampleRatio() { + Options options = new Options(); + options.setDouble("ivfpq.train.sample_ratio", 0.0); + assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("sample_ratio"); + + Options options2 = new Options(); + options2.setDouble("ivfpq.train.sample_ratio", 1.5); + assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("sample_ratio"); + } + + @Test + public void testMetricParsing() { + for (String metric : new String[] {"l2", "cosine", "inner_product"}) { + Options options = new Options(); + options.setString("ivfpq.distance.metric", metric); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + assertThat(indexOptions.metric().getConfigName()).isEqualTo(metric); + } + } + + @Test + public void testMetricParsingUpperCase() { + Options options = new Options(); + options.setString("ivfpq.distance.metric", "L2"); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.L2); + } +} From 8e6dc10eeeaa77f3bc58b14cda01f3813237eb93 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 10 Jun 2026 17:05:17 +0800 Subject: [PATCH 03/11] [ivfpq] Integrate unified vector index API --- paimon-ivfpq/paimon-ivfpq-index/pom.xml | 2 +- .../IvfFlatVectorGlobalIndexerFactory.java | 30 +++ ...IvfHnswFlatVectorGlobalIndexerFactory.java | 30 +++ .../IvfHnswSqVectorGlobalIndexerFactory.java | 30 +++ ...PqAlgorithmVectorGlobalIndexerFactory.java | 30 +++ .../paimon/ivfpq/index/IvfpqIndexMeta.java | 53 
++++- .../index/IvfpqVectorGlobalIndexReader.java | 80 +++++-- .../index/IvfpqVectorGlobalIndexWriter.java | 70 +++--- .../ivfpq/index/IvfpqVectorGlobalIndexer.java | 8 +- .../IvfpqVectorGlobalIndexerFactory.java | 6 +- .../ivfpq/index/IvfpqVectorIndexOptions.java | 203 ++++++++++++++++-- .../paimon/ivfpq/index/IvfpqVectorMetric.java | 2 +- ...he.paimon.globalindex.GlobalIndexerFactory | 4 + .../index/IvfpqVectorGlobalIndexTest.java | 71 ++++-- .../IvfpqVectorGlobalIndexerFactoryTest.java | 19 +- .../index/IvfpqVectorIndexOptionsTest.java | 70 ++++-- paimon-ivfpq/paimon-ivfpq-jni/pom.xml | 2 +- .../apache/paimon/index/ivfpq/HnswConfig.java | 54 +++++ .../paimon/index/ivfpq/IVFPQNative.java | 57 ----- .../paimon/index/ivfpq/IVFPQReader.java | 108 ---------- .../paimon/index/ivfpq/IVFPQResult.java | 61 ------ .../paimon/index/ivfpq/IVFPQWriter.java | 105 --------- .../apache/paimon/index/ivfpq/IndexType.java | 44 ++++ .../paimon/index/ivfpq/IvfFlatConfig.java | 25 +++ .../paimon/index/ivfpq/IvfHnswFlatConfig.java | 35 +++ .../paimon/index/ivfpq/IvfHnswSqConfig.java | 35 +++ .../paimon/index/ivfpq/IvfPqConfig.java | 47 ++++ .../org/apache/paimon/index/ivfpq/Metric.java | 2 +- .../paimon/index/ivfpq/NativeLoader.java | 2 +- .../paimon/index/ivfpq/VectorIndexConfig.java | 94 ++++++++ .../paimon/index/ivfpq/VectorIndexInput.java | 23 ++ .../index/ivfpq/VectorIndexMetadata.java | 85 ++++++++ .../paimon/index/ivfpq/VectorIndexNative.java | 69 ++++++ .../paimon/index/ivfpq/VectorIndexReader.java | 165 ++++++++++++++ .../paimon/index/ivfpq/VectorIndexWriter.java | 115 ++++++++++ ...sult.java => VectorSearchBatchResult.java} | 38 ++-- .../index/ivfpq/VectorSearchResult.java | 62 ++++++ paimon-ivfpq/pom.xml | 2 +- 38 files changed, 1465 insertions(+), 473 deletions(-) create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfFlatVectorGlobalIndexerFactory.java create mode 100644 
paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswFlatVectorGlobalIndexerFactory.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswSqVectorGlobalIndexerFactory.java create mode 100644 paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java delete mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQNative.java delete mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQReader.java delete mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQResult.java delete mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQWriter.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java create mode 100644 
paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java rename paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/{IVFPQBatchResult.java => VectorSearchBatchResult.java} (71%) create mode 100644 paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java diff --git a/paimon-ivfpq/paimon-ivfpq-index/pom.xml b/paimon-ivfpq/paimon-ivfpq-index/pom.xml index a40f4feb60b8..4f9ca0381493 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/pom.xml +++ b/paimon-ivfpq/paimon-ivfpq-index/pom.xml @@ -29,7 +29,7 @@ under the License. paimon-ivfpq-index - Paimon : IVF-PQ Index + Paimon : Vector Index diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfFlatVectorGlobalIndexerFactory.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfFlatVectorGlobalIndexerFactory.java new file mode 100644 index 000000000000..24c25e58bc8a --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfFlatVectorGlobalIndexerFactory.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +/** Factory for the {@code ivf-flat} vector index identifier. */ +public class IvfFlatVectorGlobalIndexerFactory extends IvfpqVectorGlobalIndexerFactory { + + public static final String IDENTIFIER = "ivf-flat"; + + @Override + public String identifier() { + return IDENTIFIER; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswFlatVectorGlobalIndexerFactory.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswFlatVectorGlobalIndexerFactory.java new file mode 100644 index 000000000000..bf4fc55369b9 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswFlatVectorGlobalIndexerFactory.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +/** Factory for the {@code ivf-hnsw-flat} vector index identifier. 
*/ +public class IvfHnswFlatVectorGlobalIndexerFactory extends IvfpqVectorGlobalIndexerFactory { + + public static final String IDENTIFIER = "ivf-hnsw-flat"; + + @Override + public String identifier() { + return IDENTIFIER; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswSqVectorGlobalIndexerFactory.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswSqVectorGlobalIndexerFactory.java new file mode 100644 index 000000000000..646e068998cc --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswSqVectorGlobalIndexerFactory.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +/** Factory for the {@code ivf-hnsw-sq} vector index identifier. 
*/ +public class IvfHnswSqVectorGlobalIndexerFactory extends IvfpqVectorGlobalIndexerFactory { + + public static final String IDENTIFIER = "ivf-hnsw-sq"; + + @Override + public String identifier() { + return IDENTIFIER; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java new file mode 100644 index 000000000000..53c876279bc1 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.ivfpq.index; + +/** Factory for the {@code ivf-pq} vector index identifier. 
*/ +public class IvfPqAlgorithmVectorGlobalIndexerFactory extends IvfpqVectorGlobalIndexerFactory { + + public static final String IDENTIFIER = "ivf-pq"; + + @Override + public String identifier() { + return IDENTIFIER; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java index 2f1f0dc7f731..60e0aee4488a 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java @@ -18,6 +18,9 @@ package org.apache.paimon.ivfpq.index; +import org.apache.paimon.index.ivfpq.HnswConfig; +import org.apache.paimon.index.ivfpq.IndexType; + import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.core.type.TypeReference; import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.databind.ObjectMapper; @@ -27,7 +30,7 @@ import java.util.Map; /** - * Metadata for an IVF-PQ vector index file. + * Metadata for a vector index file. * *

Serialized as a flat JSON {@code Map} storing the index build parameters * required for correct search-time behavior. @@ -36,12 +39,17 @@ public class IvfpqIndexMeta implements Serializable { private static final long serialVersionUID = 1L; + private static final String KEY_INDEX_TYPE = "index_type"; private static final String KEY_DIMENSION = "dimension"; private static final String KEY_METRIC = "metric"; private static final String KEY_NLIST = "nlist"; private static final String KEY_M = "m"; private static final String KEY_USE_OPQ = "use_opq"; + private static final String KEY_HNSW_M = "hnsw_m"; + private static final String KEY_HNSW_EF_CONSTRUCTION = "hnsw_ef_construction"; + private static final String KEY_HNSW_MAX_LEVEL = "hnsw_max_level"; private static final String KEY_NPROBE = "nprobe"; + private static final String KEY_EF_SEARCH = "ef_search"; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -52,18 +60,32 @@ public class IvfpqIndexMeta implements Serializable { public IvfpqIndexMeta(IvfpqVectorIndexOptions options) { this.params = new LinkedHashMap<>(); + params.put(KEY_INDEX_TYPE, IvfpqVectorIndexOptions.toIdentifier(options.indexType())); params.put(KEY_DIMENSION, String.valueOf(options.dimension())); params.put(KEY_METRIC, options.metric().getConfigName()); params.put(KEY_NLIST, String.valueOf(options.nlist())); params.put(KEY_M, String.valueOf(options.m())); params.put(KEY_USE_OPQ, String.valueOf(options.useOpq())); + params.put(KEY_HNSW_M, String.valueOf(options.hnswConfig().m())); + params.put(KEY_HNSW_EF_CONSTRUCTION, String.valueOf(options.hnswConfig().efConstruction())); + params.put(KEY_HNSW_MAX_LEVEL, String.valueOf(options.hnswConfig().maxLevel())); params.put(KEY_NPROBE, String.valueOf(options.nprobe())); + params.put(KEY_EF_SEARCH, String.valueOf(options.efSearch())); } private IvfpqIndexMeta(Map params) { this.params = new LinkedHashMap<>(params); } + public IndexType indexType() { + String value = 
params.get(KEY_INDEX_TYPE); + if (value == null) { + throw new IllegalArgumentException( + "Missing required key in vector index metadata: " + KEY_INDEX_TYPE); + } + return IvfpqVectorIndexOptions.parseIndexType(value); + } + public int dimension() { return Integer.parseInt(params.get(KEY_DIMENSION)); } @@ -77,16 +99,26 @@ public int nlist() { } public int m() { - return Integer.parseInt(params.get(KEY_M)); + return intValue(KEY_M, 0); } public boolean useOpq() { return Boolean.parseBoolean(params.get(KEY_USE_OPQ)); } + public HnswConfig hnswConfig() { + return new HnswConfig( + intValue(KEY_HNSW_M, HnswConfig.DEFAULT.m()), + intValue(KEY_HNSW_EF_CONSTRUCTION, HnswConfig.DEFAULT.efConstruction()), + intValue(KEY_HNSW_MAX_LEVEL, HnswConfig.DEFAULT.maxLevel())); + } + public int nprobe() { - String val = params.get(KEY_NPROBE); - return val != null ? Integer.parseInt(val) : 16; + return intValue(KEY_NPROBE, 16); + } + + public int efSearch() { + return intValue(KEY_EF_SEARCH, 0); } public byte[] serialize() throws IOException { @@ -97,11 +129,20 @@ public static IvfpqIndexMeta deserialize(byte[] data) throws IOException { Map map = OBJECT_MAPPER.readValue(data, MAP_TYPE_REF); if (!map.containsKey(KEY_DIMENSION)) { throw new IOException( - "Missing required key in IVF-PQ index metadata: " + KEY_DIMENSION); + "Missing required key in vector index metadata: " + KEY_DIMENSION); + } + if (!map.containsKey(KEY_INDEX_TYPE)) { + throw new IOException( + "Missing required key in vector index metadata: " + KEY_INDEX_TYPE); } if (!map.containsKey(KEY_METRIC)) { - throw new IOException("Missing required key in IVF-PQ index metadata: " + KEY_METRIC); + throw new IOException("Missing required key in vector index metadata: " + KEY_METRIC); } return new IvfpqIndexMeta(map); } + + private int intValue(String key, int defaultValue) { + String val = params.get(key); + return val == null ? 
defaultValue : Integer.parseInt(val); + } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java index b8827e9996f7..5b740006171f 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java @@ -24,8 +24,9 @@ import org.apache.paimon.globalindex.GlobalIndexResult; import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; -import org.apache.paimon.index.ivfpq.IVFPQReader; -import org.apache.paimon.index.ivfpq.IVFPQResult; +import org.apache.paimon.index.ivfpq.VectorIndexInput; +import org.apache.paimon.index.ivfpq.VectorIndexReader; +import org.apache.paimon.index.ivfpq.VectorSearchResult; import org.apache.paimon.predicate.FieldRef; import org.apache.paimon.predicate.VectorSearch; import org.apache.paimon.types.ArrayType; @@ -45,11 +46,10 @@ import static org.apache.paimon.utils.Preconditions.checkArgument; /** - * Vector global index reader using IVF-PQ. + * Vector global index reader using paimon-vector-index. * - *

Each shard has exactly one IVF-PQ index file. The reader lazily opens the index and performs - * vector similarity search. The native Rust JNI layer calls {@code seek(long)} and {@code - * read(byte[], int, int)} directly on the {@link SeekableInputStream}, so no adapter is needed. + *

Each shard has exactly one vector index file. The reader lazily opens the index and performs + * vector similarity search. */ public class IvfpqVectorGlobalIndexReader implements GlobalIndexReader { @@ -60,7 +60,7 @@ public class IvfpqVectorGlobalIndexReader implements GlobalIndexReader { private final ExecutorService executor; private volatile IvfpqIndexMeta indexMeta; - private volatile IVFPQReader ivfpqReader; + private volatile VectorIndexReader vectorReader; private SeekableInputStream openStream; public IvfpqVectorGlobalIndexReader( @@ -88,7 +88,7 @@ public CompletableFuture> visitVectorSearch( } catch (IOException e) { throw new RuntimeException( String.format( - "Failed IVF-PQ search: field=%s, limit=%d", + "Failed vector index search: field=%s, limit=%d", vectorSearch.fieldName(), vectorSearch.limit()), e); } @@ -104,7 +104,7 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep IvfpqVectorMetric metric = indexMeta.metric(); RoaringNavigableMap64 includeRowIds = vectorSearch.includeRowIds(); - IVFPQResult result; + VectorSearchResult result; if (includeRowIds != null) { long cardinality = includeRowIds.getLongCardinality(); @@ -113,9 +113,11 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep } byte[] filterBytes = includeRowIds.serialize(); int effectiveK = (int) Math.min(limit, cardinality); - result = ivfpqReader.search(queryVector, effectiveK, nprobe, filterBytes); + result = + vectorReader.search( + queryVector, effectiveK, nprobe, indexMeta.efSearch(), filterBytes); } else { - result = ivfpqReader.search(queryVector, limit, nprobe); + result = vectorReader.search(queryVector, limit, nprobe, indexMeta.efSearch()); } long[] ids = result.ids(); @@ -182,7 +184,7 @@ private void validateSearchVector(Object vector) { } if (!validFieldType) { throw new IllegalArgumentException( - "IVF-PQ requires VectorType or ArrayType, but field type is: " + "Vector index requires VectorType or ArrayType, but 
field type is: " + fieldType); } int queryDim = ((float[]) vector).length; @@ -195,13 +197,14 @@ private void validateSearchVector(Object vector) { } private void ensureLoaded() throws IOException { - if (ivfpqReader == null) { + if (vectorReader == null) { synchronized (this) { - if (ivfpqReader == null) { + if (vectorReader == null) { indexMeta = IvfpqIndexMeta.deserialize(ioMeta.metadata()); SeekableInputStream in = fileReader.getInputStream(ioMeta); try { - ivfpqReader = new IVFPQReader(in); + vectorReader = + new VectorIndexReader(new SeekableStreamVectorIndexInput(in)); openStream = in; } catch (Exception e) { IOUtils.closeQuietly(in); @@ -216,13 +219,13 @@ private void ensureLoaded() throws IOException { public void close() throws IOException { Throwable firstException = null; - if (ivfpqReader != null) { + if (vectorReader != null) { try { - ivfpqReader.close(); + vectorReader.close(); } catch (Throwable t) { firstException = t; } - ivfpqReader = null; + vectorReader = null; } if (openStream != null) { @@ -245,7 +248,46 @@ public void close() throws IOException { throw (RuntimeException) firstException; } else { throw new RuntimeException( - "Failed to close IVF-PQ vector global index reader", firstException); + "Failed to close vector global index reader", firstException); + } + } + } + + private static class SeekableStreamVectorIndexInput implements VectorIndexInput { + + private final SeekableInputStream input; + + private SeekableStreamVectorIndexInput(SeekableInputStream input) { + this.input = input; + } + + @Override + public synchronized void pread(long[] positions, byte[][] buffers) { + if (positions.length != buffers.length) { + throw new IllegalArgumentException( + "positions length " + + positions.length + + " != buffers length " + + buffers.length); + } + try { + for (int i = 0; i < positions.length; i++) { + input.seek(positions[i]); + readFully(input, buffers[i]); + } + } catch (IOException e) { + throw new RuntimeException("Failed to read 
vector index", e); + } + } + + private static void readFully(SeekableInputStream input, byte[] buffer) throws IOException { + int offset = 0; + while (offset < buffer.length) { + int read = input.read(buffer, offset, buffer.length - offset); + if (read < 0) { + throw new IOException("Unexpected end of vector index file"); + } + offset += read; } } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java index 6bec7f4dee46..57b57a8f30c4 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java @@ -24,7 +24,7 @@ import org.apache.paimon.globalindex.GlobalIndexSingletonWriter; import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; -import org.apache.paimon.index.ivfpq.IVFPQWriter; +import org.apache.paimon.index.ivfpq.VectorIndexWriter; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.DataType; import org.apache.paimon.types.FloatType; @@ -45,7 +45,7 @@ import java.util.Random; /** - * Vector global index writer using IVF-PQ. + * Vector global index writer using paimon-vector-index. * *

Vectors are spilled to a temporary file on disk as they arrive via {@link #write(Object)}, * keeping Java heap usage constant (~8 MB buffer). During index build, vectors are read back for @@ -55,7 +55,7 @@ */ public class IvfpqVectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Closeable { - private static final String FILE_NAME_PREFIX = "ivfpq"; + private static final String FILE_NAME_PREFIX = "vector"; private static final Logger LOG = LoggerFactory.getLogger(IvfpqVectorGlobalIndexWriter.class); @@ -89,7 +89,7 @@ public IvfpqVectorGlobalIndexWriter( validateFieldType(fieldType); try { - this.tempVectorFile = File.createTempFile("ivfpq-vectors-", ".bin"); + this.tempVectorFile = File.createTempFile("paimon-vector-index-vectors-", ".bin"); this.tempVectorFile.deleteOnExit(); @SuppressWarnings("resource") RandomAccessFile raf = new RandomAccessFile(tempVectorFile, "rw"); @@ -106,7 +106,7 @@ private void validateFieldType(DataType dataType) { DataType elementType = ((VectorType) dataType).getElementType(); if (!(elementType instanceof FloatType)) { throw new IllegalArgumentException( - "IVF-PQ index requires float vector, but got: " + elementType); + "Vector index requires float vector, but got: " + elementType); } return; } @@ -114,12 +114,12 @@ private void validateFieldType(DataType dataType) { DataType elementType = ((ArrayType) dataType).getElementType(); if (!(elementType instanceof FloatType)) { throw new IllegalArgumentException( - "IVF-PQ index requires float array, but got: " + elementType); + "Vector index requires float array, but got: " + elementType); } return; } throw new IllegalArgumentException( - "IVF-PQ index requires VectorType or ArrayType, but got: " + dataType); + "Vector index requires VectorType or ArrayType, but got: " + dataType); } @Override @@ -204,7 +204,7 @@ public List finish() { writeBuf = null; return Collections.singletonList(buildIndex()); } catch (IOException e) { - throw new RuntimeException("Failed to write IVF-PQ 
vector global index", e); + throw new RuntimeException("Failed to write vector global index", e); } finally { if (tempVectorFile != null) { tempVectorFile.delete(); @@ -217,46 +217,54 @@ private ResultEntry buildIndex() throws IOException { int effectiveNlist = (int) Math.min(options.nlist(), count); LOG.info( - "IVF-PQ index build started: {} vectors, dim={}, nlist={}, m={}, metric={}", + "{} vector index build started: {} vectors, dim={}, nlist={}, metric={}", + options.logName(), count, dim, effectiveNlist, - options.m(), options.metric()); long buildStart = System.currentTimeMillis(); - try (IVFPQWriter writer = - new IVFPQWriter( - dim, - effectiveNlist, - options.m(), - options.metric().toNativeMetric(), - options.useOpq())) { + try (VectorIndexWriter writer = + new VectorIndexWriter(options.toVectorIndexConfig(effectiveNlist))) { // Phase 1: Train long phaseStart = System.currentTimeMillis(); - LOG.info("IVF-PQ train phase started (sample_ratio={})", options.trainSampleRatio()); + LOG.info( + "{} train phase started (sample_ratio={})", + options.logName(), + options.trainSampleRatio()); trainFromTempFile(writer); - LOG.info("IVF-PQ train phase done in {} ms", System.currentTimeMillis() - phaseStart); + LOG.info( + "{} train phase done in {} ms", + options.logName(), + System.currentTimeMillis() - phaseStart); // Phase 2: Add all vectors in batches phaseStart = System.currentTimeMillis(); - LOG.info("IVF-PQ add phase started"); + LOG.info("{} add phase started", options.logName()); addVectorsFromTempFile(writer); - LOG.info("IVF-PQ add phase done in {} ms", System.currentTimeMillis() - phaseStart); + LOG.info( + "{} add phase done in {} ms", + options.logName(), + System.currentTimeMillis() - phaseStart); // Phase 3: Write index phaseStart = System.currentTimeMillis(); - LOG.info("IVF-PQ write phase started"); - String fileName = fileWriter.newFileName(FILE_NAME_PREFIX); + LOG.info("{} write phase started", options.logName()); + String fileName = 
fileWriter.newFileName(fileNamePrefix()); try (PositionOutputStream out = fileWriter.newOutputStream(fileName)) { writer.writeIndex(out); out.flush(); } - LOG.info("IVF-PQ write phase done in {} ms", System.currentTimeMillis() - phaseStart); + LOG.info( + "{} write phase done in {} ms", + options.logName(), + System.currentTimeMillis() - phaseStart); LOG.info( - "IVF-PQ index build completed in {} ms", + "{} vector index build completed in {} ms", + options.logName(), System.currentTimeMillis() - buildStart); IvfpqIndexMeta meta = new IvfpqIndexMeta(options); @@ -264,7 +272,11 @@ private ResultEntry buildIndex() throws IOException { } } - private void trainFromTempFile(IVFPQWriter writer) throws IOException { + private String fileNamePrefix() { + return FILE_NAME_PREFIX + "-" + options.logName(); + } + + private void trainFromTempFile(VectorIndexWriter writer) throws IOException { double sampleRatio = options.trainSampleRatio(); int minTrainSize = (int) Math.min(count, Math.max(options.nlist() * 39L, 256)); int sampleCount; @@ -321,7 +333,7 @@ private void trainFromTempFile(IVFPQWriter writer) throws IOException { writer.train(trainData, sampleCount); } - private void addVectorsFromTempFile(IVFPQWriter writer) throws IOException { + private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException { int batchSize = options.addBatchSize(); long[] batchIds = new long[batchSize]; float[] batchVectors = new float[batchSize * dim]; @@ -350,8 +362,8 @@ private void addVectorsFromTempFile(IVFPQWriter writer) throws IOException { int percent = (int) ((count - remaining) * 100 / count); if (percent / 10 > lastLoggedPercent / 10) { LOG.info( - "IVF-PQ add progress: {}/{} vectors ({}%)", - count - remaining, count, percent); + "{} add progress: {}/{} vectors ({}%)", + options.logName(), count - remaining, count, percent); lastLoggedPercent = percent; } } diff --git 
a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java index 9428631bde7f..b7574886024d 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java @@ -30,15 +30,19 @@ import java.util.List; import java.util.concurrent.ExecutorService; -/** IVF-PQ vector global indexer. */ +/** Vector global indexer backed by paimon-vector-index. */ public class IvfpqVectorGlobalIndexer implements GlobalIndexer { private final DataType fieldType; private final IvfpqVectorIndexOptions options; public IvfpqVectorGlobalIndexer(DataType fieldType, Options options) { + this(fieldType, options, IvfpqVectorGlobalIndexerFactory.IDENTIFIER); + } + + public IvfpqVectorGlobalIndexer(DataType fieldType, Options options, String identifier) { this.fieldType = fieldType; - this.options = new IvfpqVectorIndexOptions(options); + this.options = new IvfpqVectorIndexOptions(options, identifier); } @Override diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java index 21974d8e2880..fd6a0aafb353 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java @@ -23,10 +23,10 @@ import org.apache.paimon.options.Options; import org.apache.paimon.types.DataField; -/** Factory for creating IVF-PQ vector index. */ +/** Factory for creating vector indexes backed by paimon-vector-index. 
*/ public class IvfpqVectorGlobalIndexerFactory implements GlobalIndexerFactory { - public static final String IDENTIFIER = "ivfpq"; + public static final String IDENTIFIER = "vector"; @Override public String identifier() { @@ -35,6 +35,6 @@ public String identifier() { @Override public GlobalIndexer create(DataField field, Options options) { - return new IvfpqVectorGlobalIndexer(field.type(), options); + return new IvfpqVectorGlobalIndexer(field.type(), options, identifier()); } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java index 4663792ce25f..6fb115c20c2a 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java @@ -18,54 +18,97 @@ package org.apache.paimon.ivfpq.index; +import org.apache.paimon.index.ivfpq.HnswConfig; +import org.apache.paimon.index.ivfpq.IndexType; +import org.apache.paimon.index.ivfpq.VectorIndexConfig; import org.apache.paimon.options.ConfigOption; import org.apache.paimon.options.ConfigOptions; import org.apache.paimon.options.Options; -/** Options for the IVF-PQ vector index. */ +import java.util.Locale; + +/** Options for the Paimon vector index backed by paimon-vector-index. 
*/ public class IvfpqVectorIndexOptions { + public static final String DEFAULT_IDENTIFIER = "vector"; + public static final String IVF_FLAT_IDENTIFIER = "ivf-flat"; + public static final String IVF_PQ_IDENTIFIER = "ivf-pq"; + public static final String IVF_HNSW_FLAT_IDENTIFIER = "ivf-hnsw-flat"; + public static final String IVF_HNSW_SQ_IDENTIFIER = "ivf-hnsw-sq"; + + public static final ConfigOption INDEX_TYPE = + ConfigOptions.key("vector.index.type") + .stringType() + .defaultValue(IVF_PQ_IDENTIFIER) + .withDescription( + "Vector index algorithm (ivf-flat, ivf-pq, ivf-hnsw-flat, ivf-hnsw-sq)."); + public static final ConfigOption DIMENSION = - ConfigOptions.key("ivfpq.index.dimension") + ConfigOptions.key("vector.index.dimension") .intType() .defaultValue(128) .withDescription("The dimension of the vector."); public static final ConfigOption DISTANCE_METRIC = - ConfigOptions.key("ivfpq.distance.metric") + ConfigOptions.key("vector.distance.metric") .stringType() .defaultValue("inner_product") .withDescription( "Distance metric for vector search (l2, cosine, inner_product)."); public static final ConfigOption NLIST = - ConfigOptions.key("ivfpq.nlist") + ConfigOptions.key("vector.nlist") .intType() .defaultValue(256) .withDescription("Number of IVF partitions (Voronoi cells)."); public static final ConfigOption M = - ConfigOptions.key("ivfpq.m") + ConfigOptions.key("vector.pq.m") .intType() .defaultValue(16) .withDescription( "Number of PQ sub-quantizers. 
Must divide the vector dimension."); public static final ConfigOption USE_OPQ = - ConfigOptions.key("ivfpq.use_opq") + ConfigOptions.key("vector.pq.use-opq") .booleanType() .defaultValue(false) .withDescription( "Whether to use OPQ (Optimized Product Quantization) rotation."); + public static final ConfigOption HNSW_M = + ConfigOptions.key("vector.hnsw.m") + .intType() + .defaultValue(HnswConfig.DEFAULT.m()) + .withDescription("Maximum number of HNSW neighbors per node."); + + public static final ConfigOption HNSW_EF_CONSTRUCTION = + ConfigOptions.key("vector.hnsw.ef-construction") + .intType() + .defaultValue(HnswConfig.DEFAULT.efConstruction()) + .withDescription("HNSW efConstruction value used during index build."); + + public static final ConfigOption HNSW_MAX_LEVEL = + ConfigOptions.key("vector.hnsw.max-level") + .intType() + .defaultValue(HnswConfig.DEFAULT.maxLevel()) + .withDescription("Maximum HNSW graph level."); + public static final ConfigOption NPROBE = - ConfigOptions.key("ivfpq.nprobe") + ConfigOptions.key("vector.nprobe") .intType() .defaultValue(16) .withDescription("Number of IVF partitions to probe during search."); + public static final ConfigOption EF_SEARCH = + ConfigOptions.key("vector.hnsw.ef-search") + .intType() + .defaultValue(0) + .withDescription( + "HNSW efSearch value used during search. 
0 uses the native default."); + public static final ConfigOption TRAIN_SAMPLE_RATIO = - ConfigOptions.key("ivfpq.train.sample_ratio") + ConfigOptions.key("vector.train.sample-ratio") .doubleType() .defaultValue(1.0) .withDescription( @@ -73,43 +116,70 @@ public class IvfpqVectorIndexOptions { + "1.0 means use all vectors for training."); public static final ConfigOption ADD_BATCH_SIZE = - ConfigOptions.key("ivfpq.add.batch_size") + ConfigOptions.key("vector.add.batch-size") .intType() .defaultValue(10000) .withDescription("Batch size for adding vectors after training."); + private final String identifier; + private final IndexType indexType; private final int dimension; private final IvfpqVectorMetric metric; private final int nlist; private final int m; private final boolean useOpq; + private final HnswConfig hnswConfig; private final int nprobe; + private final int efSearch; private final double trainSampleRatio; private final int addBatchSize; public IvfpqVectorIndexOptions(Options options) { - this.dimension = validatePositive(options.get(DIMENSION), DIMENSION.key()); + this(options, DEFAULT_IDENTIFIER); + } + + public IvfpqVectorIndexOptions(Options options, String identifier) { + this.identifier = normalizeIdentifier(identifier); + this.indexType = resolveIndexType(options, this.identifier); + this.dimension = validatePositive(options.get(DIMENSION), optionKey(DIMENSION)); this.metric = parseMetric(options.get(DISTANCE_METRIC)); - this.nlist = validatePositive(options.get(NLIST), NLIST.key()); - this.m = validatePositive(options.get(M), M.key()); + this.nlist = validatePositive(options.get(NLIST), optionKey(NLIST)); + this.m = validatePositive(options.get(M), optionKey(M)); this.useOpq = options.get(USE_OPQ); - this.nprobe = validatePositive(options.get(NPROBE), NPROBE.key()); + this.hnswConfig = + new HnswConfig( + validatePositive(options.get(HNSW_M), optionKey(HNSW_M)), + validatePositive( + options.get(HNSW_EF_CONSTRUCTION), 
optionKey(HNSW_EF_CONSTRUCTION)), + validatePositive(options.get(HNSW_MAX_LEVEL), optionKey(HNSW_MAX_LEVEL))); + this.nprobe = validatePositive(options.get(NPROBE), optionKey(NPROBE)); + this.efSearch = validateNonNegative(options.get(EF_SEARCH), optionKey(EF_SEARCH)); this.trainSampleRatio = options.get(TRAIN_SAMPLE_RATIO); - this.addBatchSize = validatePositive(options.get(ADD_BATCH_SIZE), ADD_BATCH_SIZE.key()); + this.addBatchSize = + validatePositive(options.get(ADD_BATCH_SIZE), optionKey(ADD_BATCH_SIZE)); - if (dimension % m != 0) { + if (indexType == IndexType.IVF_PQ && dimension % m != 0) { throw new IllegalArgumentException( String.format( - "ivfpq.m (%d) must divide ivfpq.index.dimension (%d)", m, dimension)); + "%s (%d) must divide %s (%d)", + optionKey(M), m, optionKey(DIMENSION), dimension)); } if (trainSampleRatio <= 0 || trainSampleRatio > 1.0) { throw new IllegalArgumentException( String.format( - "ivfpq.train.sample_ratio must be in (0, 1.0], but got %f", - trainSampleRatio)); + "%s must be in (0, 1.0], but got %f", + optionKey(TRAIN_SAMPLE_RATIO), trainSampleRatio)); } } + public String identifier() { + return identifier; + } + + public IndexType indexType() { + return indexType; + } + public int dimension() { return dimension; } @@ -130,10 +200,18 @@ public boolean useOpq() { return useOpq; } + public HnswConfig hnswConfig() { + return hnswConfig; + } + public int nprobe() { return nprobe; } + public int efSearch() { + return efSearch; + } + public double trainSampleRatio() { return trainSampleRatio; } @@ -142,6 +220,29 @@ public int addBatchSize() { return addBatchSize; } + public VectorIndexConfig toVectorIndexConfig(int effectiveNlist) { + switch (indexType) { + case IVF_FLAT: + return VectorIndexConfig.ivfFlat( + dimension, effectiveNlist, metric.toNativeMetric()); + case IVF_PQ: + return VectorIndexConfig.ivfPq( + dimension, effectiveNlist, m, metric.toNativeMetric(), useOpq); + case IVF_HNSW_FLAT: + return VectorIndexConfig.ivfHnswFlat( + 
dimension, effectiveNlist, metric.toNativeMetric(), hnswConfig); + case IVF_HNSW_SQ: + return VectorIndexConfig.ivfHnswSq( + dimension, effectiveNlist, metric.toNativeMetric(), hnswConfig); + default: + throw new IllegalArgumentException("Unsupported vector index type: " + indexType); + } + } + + public String logName() { + return toIdentifier(indexType); + } + private static IvfpqVectorMetric parseMetric(String value) { try { return IvfpqVectorMetric.fromConfigName(value); @@ -158,4 +259,70 @@ private static int validatePositive(int value, String key) { } return value; } + + private static int validateNonNegative(int value, String key) { + if (value < 0) { + throw new IllegalArgumentException( + String.format( + "Invalid value for '%s': %d. Must be a non-negative integer.", + key, value)); + } + return value; + } + + private static IndexType resolveIndexType(Options options, String identifier) { + if (DEFAULT_IDENTIFIER.equals(identifier)) { + return parseIndexType(options.get(INDEX_TYPE)); + } + + IndexType identifierType = parseIndexType(identifier); + if (options.contains(INDEX_TYPE)) { + IndexType configuredType = parseIndexType(options.get(INDEX_TYPE)); + if (configuredType != identifierType) { + throw new IllegalArgumentException( + String.format( + "Conflicting vector index type: identifier is '%s' but %s is '%s'", + identifier, INDEX_TYPE.key(), options.get(INDEX_TYPE))); + } + } + return identifierType; + } + + public static IndexType parseIndexType(String value) { + String normalized = normalizeIdentifier(value); + if (IVF_PQ_IDENTIFIER.equals(normalized)) { + return IndexType.IVF_PQ; + } else if (IVF_FLAT_IDENTIFIER.equals(normalized)) { + return IndexType.IVF_FLAT; + } else if (IVF_HNSW_FLAT_IDENTIFIER.equals(normalized)) { + return IndexType.IVF_HNSW_FLAT; + } else if (IVF_HNSW_SQ_IDENTIFIER.equals(normalized)) { + return IndexType.IVF_HNSW_SQ; + } + throw new IllegalArgumentException("Unknown vector index type: " + value); + } + + public static 
String toIdentifier(IndexType indexType) { + switch (indexType) { + case IVF_FLAT: + return IVF_FLAT_IDENTIFIER; + case IVF_PQ: + return IVF_PQ_IDENTIFIER; + case IVF_HNSW_FLAT: + return IVF_HNSW_FLAT_IDENTIFIER; + case IVF_HNSW_SQ: + return IVF_HNSW_SQ_IDENTIFIER; + default: + throw new IllegalArgumentException("Unsupported vector index type: " + indexType); + } + } + + private static String normalizeIdentifier(String identifier) { + String value = identifier == null ? DEFAULT_IDENTIFIER : identifier; + return value.trim().toLowerCase(Locale.ROOT).replace('_', '-'); + } + + private static String optionKey(ConfigOption option) { + return option.key(); + } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java index 95d612b432a3..636c01c3cbe4 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java @@ -20,7 +20,7 @@ import org.apache.paimon.index.ivfpq.Metric; -/** Enumeration of supported IVF-PQ vector distance metrics. */ +/** Enumeration of supported vector distance metrics. */ public enum IvfpqVectorMetric { L2("l2", Metric.L2), COSINE("cosine", Metric.COSINE), diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory b/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory index e9be2ab97789..7a9e3c63e4ed 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory +++ b/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory @@ -14,3 +14,7 @@ # limitations under the License. 
org.apache.paimon.ivfpq.index.IvfpqVectorGlobalIndexerFactory +org.apache.paimon.ivfpq.index.IvfFlatVectorGlobalIndexerFactory +org.apache.paimon.ivfpq.index.IvfPqAlgorithmVectorGlobalIndexerFactory +org.apache.paimon.ivfpq.index.IvfHnswFlatVectorGlobalIndexerFactory +org.apache.paimon.ivfpq.index.IvfHnswSqVectorGlobalIndexerFactory diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java index 3364fbc9f657..507241bf1eaa 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java @@ -27,6 +27,7 @@ import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; +import org.apache.paimon.index.ivfpq.IndexType; import org.apache.paimon.index.ivfpq.NativeLoader; import org.apache.paimon.options.Options; import org.apache.paimon.predicate.VectorSearch; @@ -112,6 +113,7 @@ public void testDimensionMismatch() { public void testVectorTypeRejectsNonFloatElement() { DataType intVecType = new VectorType(2, new IntType()); Options options = createDefaultOptions(2); + options.setInteger("vector.pq.m", 1); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); @@ -126,6 +128,7 @@ public void testVectorTypeRejectsNonFloatElement() { @Test public void testNanInVectorRejected() { Options options = createDefaultOptions(2); + options.setInteger("vector.pq.m", 1); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); IvfpqVectorGlobalIndexWriter 
writer = @@ -141,6 +144,7 @@ public void testNanInVectorRejected() { @Test public void testInfinityInVectorRejected() { Options options = createDefaultOptions(2); + options.setInteger("vector.pq.m", 1); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); IvfpqVectorGlobalIndexWriter writer = @@ -157,6 +161,7 @@ public void testInfinityInVectorRejected() { @Test public void testAllNullReturnsEmpty() { Options options = createDefaultOptions(2); + options.setInteger("vector.pq.m", 1); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); IvfpqVectorGlobalIndexWriter writer = @@ -172,12 +177,13 @@ public void testAllNullReturnsEmpty() { @Test public void testMetaSerializationRoundTrip() throws IOException { - Options options = createDefaultOptions(32); - options.setString("ivfpq.distance.metric", "cosine"); - options.setInteger("ivfpq.nlist", 64); - options.setInteger("ivfpq.m", 8); - options.setBoolean("ivfpq.use_opq", true); - options.setInteger("ivfpq.nprobe", 24); + Options options = new Options(); + options.setInteger("vector.index.dimension", 32); + options.setString("vector.distance.metric", "cosine"); + options.setInteger("vector.nlist", 64); + options.setInteger("vector.pq.m", 8); + options.setString("vector.pq.use-opq", "true"); + options.setInteger("vector.nprobe", 24); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); IvfpqIndexMeta meta = new IvfpqIndexMeta(indexOptions); @@ -185,6 +191,7 @@ public void testMetaSerializationRoundTrip() throws IOException { IvfpqIndexMeta deserialized = IvfpqIndexMeta.deserialize(serialized); assertThat(deserialized.dimension()).isEqualTo(32); + assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_PQ); assertThat(deserialized.metric()).isEqualTo(IvfpqVectorMetric.COSINE); 
assertThat(deserialized.nlist()).isEqualTo(64); assertThat(deserialized.m()).isEqualTo(8); @@ -192,16 +199,40 @@ public void testMetaSerializationRoundTrip() throws IOException { assertThat(deserialized.nprobe()).isEqualTo(24); } + @Test + public void testMetaSerializationRoundTripForHnsw() throws IOException { + Options options = new Options(); + options.setString("vector.index.type", "ivf-hnsw-flat"); + options.setInteger("vector.index.dimension", 16); + options.setString("vector.distance.metric", "l2"); + options.setInteger("vector.nlist", 8); + options.setInteger("vector.hnsw.m", 12); + options.setInteger("vector.hnsw.ef-construction", 64); + options.setInteger("vector.hnsw.max-level", 5); + options.setInteger("vector.hnsw.ef-search", 80); + IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + + IvfpqIndexMeta deserialized = + IvfpqIndexMeta.deserialize(new IvfpqIndexMeta(indexOptions).serialize()); + + assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_HNSW_FLAT); + assertThat(deserialized.dimension()).isEqualTo(16); + assertThat(deserialized.hnswConfig().m()).isEqualTo(12); + assertThat(deserialized.hnswConfig().efConstruction()).isEqualTo(64); + assertThat(deserialized.hnswConfig().maxLevel()).isEqualTo(5); + assertThat(deserialized.efSearch()).isEqualTo(80); + } + // =================== Tests that NEED native library ===================== @Test public void testFloatVectorEndToEnd() throws IOException { - Assumptions.assumeTrue(isNativeAvailable(), "IVF-PQ native library not available"); + Assumptions.assumeTrue(isNativeAvailable(), "Vector index native library not available"); int dimension = 2; Options options = createDefaultOptions(dimension); - options.setInteger("ivfpq.nlist", 2); - options.setInteger("ivfpq.m", 1); + options.setInteger("vector.nlist", 2); + options.setInteger("vector.pq.m", 1); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); float[][] vectors = @@ -236,12 +267,12 @@ 
public void testFloatVectorEndToEnd() throws IOException { @Test public void testSearchWithRoaringFilter() throws IOException { - Assumptions.assumeTrue(isNativeAvailable(), "IVF-PQ native library not available"); + Assumptions.assumeTrue(isNativeAvailable(), "Vector index native library not available"); int dimension = 2; Options options = createDefaultOptions(dimension); - options.setInteger("ivfpq.nlist", 2); - options.setInteger("ivfpq.m", 1); + options.setInteger("vector.nlist", 2); + options.setInteger("vector.pq.m", 1); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); float[][] vectors = @@ -281,12 +312,12 @@ public void testSearchWithRoaringFilter() throws IOException { @Test public void testNullVectorSkipWithCorrectIds() throws IOException { - Assumptions.assumeTrue(isNativeAvailable(), "IVF-PQ native library not available"); + Assumptions.assumeTrue(isNativeAvailable(), "Vector index native library not available"); int dimension = 2; Options options = createDefaultOptions(dimension); - options.setInteger("ivfpq.nlist", 2); - options.setInteger("ivfpq.m", 1); + options.setInteger("vector.nlist", 2); + options.setInteger("vector.pq.m", 1); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); float[][] vectors = @@ -330,12 +361,12 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { @Test public void testViaIndexer() throws IOException { - Assumptions.assumeTrue(isNativeAvailable(), "IVF-PQ native library not available"); + Assumptions.assumeTrue(isNativeAvailable(), "Vector index native library not available"); int dimension = 2; Options options = createDefaultOptions(dimension); - options.setInteger("ivfpq.nlist", 2); - options.setInteger("ivfpq.m", 1); + options.setInteger("vector.nlist", 2); + options.setInteger("vector.pq.m", 1); float[][] vectors = new float[][] { @@ -367,8 +398,8 @@ public void testViaIndexer() throws IOException { private Options createDefaultOptions(int dimension) 
{ Options options = new Options(); - options.setInteger("ivfpq.index.dimension", dimension); - options.setString("ivfpq.distance.metric", "l2"); + options.setInteger("vector.index.dimension", dimension); + options.setString("vector.distance.metric", "l2"); return options; } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java index 4e43af620c8c..6010e68256d8 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java @@ -24,17 +24,30 @@ import static org.assertj.core.api.Assertions.assertThat; -/** Tests for IVF-PQ global indexer factory SPI registration. */ +/** Tests for vector global indexer factory SPI registration. 
*/ public class IvfpqVectorGlobalIndexerFactoryTest { @Test public void testIdentifier() { - assertThat(new IvfpqVectorGlobalIndexerFactory().identifier()).isEqualTo("ivfpq"); + assertThat(new IvfpqVectorGlobalIndexerFactory().identifier()).isEqualTo("vector"); + assertThat(new IvfFlatVectorGlobalIndexerFactory().identifier()).isEqualTo("ivf-flat"); + assertThat(new IvfPqAlgorithmVectorGlobalIndexerFactory().identifier()).isEqualTo("ivf-pq"); + assertThat(new IvfHnswFlatVectorGlobalIndexerFactory().identifier()) + .isEqualTo("ivf-hnsw-flat"); + assertThat(new IvfHnswSqVectorGlobalIndexerFactory().identifier()).isEqualTo("ivf-hnsw-sq"); } @Test public void testLoadByIdentifier() { - assertThat(GlobalIndexerFactoryUtils.load("ivfpq")) + assertThat(GlobalIndexerFactoryUtils.load("vector")) .isExactlyInstanceOf(IvfpqVectorGlobalIndexerFactory.class); + assertThat(GlobalIndexerFactoryUtils.load("ivf-flat")) + .isExactlyInstanceOf(IvfFlatVectorGlobalIndexerFactory.class); + assertThat(GlobalIndexerFactoryUtils.load("ivf-pq")) + .isExactlyInstanceOf(IvfPqAlgorithmVectorGlobalIndexerFactory.class); + assertThat(GlobalIndexerFactoryUtils.load("ivf-hnsw-flat")) + .isExactlyInstanceOf(IvfHnswFlatVectorGlobalIndexerFactory.class); + assertThat(GlobalIndexerFactoryUtils.load("ivf-hnsw-sq")) + .isExactlyInstanceOf(IvfHnswSqVectorGlobalIndexerFactory.class); } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java index 574c8e4dba36..f06882381a50 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java +++ b/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java @@ -18,6 +18,7 @@ package org.apache.paimon.ivfpq.index; +import org.apache.paimon.index.ivfpq.IndexType; import 
org.apache.paimon.options.Options; import org.junit.jupiter.api.Test; @@ -33,11 +34,16 @@ public void testDefaults() { Options options = new Options(); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); assertThat(indexOptions.dimension()).isEqualTo(128); + assertThat(indexOptions.indexType()).isEqualTo(IndexType.IVF_PQ); assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.INNER_PRODUCT); assertThat(indexOptions.nlist()).isEqualTo(256); assertThat(indexOptions.m()).isEqualTo(16); assertThat(indexOptions.useOpq()).isFalse(); + assertThat(indexOptions.hnswConfig().m()).isEqualTo(20); + assertThat(indexOptions.hnswConfig().efConstruction()).isEqualTo(150); + assertThat(indexOptions.hnswConfig().maxLevel()).isEqualTo(7); assertThat(indexOptions.nprobe()).isEqualTo(16); + assertThat(indexOptions.efSearch()).isEqualTo(0); assertThat(indexOptions.trainSampleRatio()).isEqualTo(1.0); assertThat(indexOptions.addBatchSize()).isEqualTo(10000); } @@ -45,31 +51,63 @@ public void testDefaults() { @Test public void testCustomOptions() { Options options = new Options(); - options.setInteger("ivfpq.index.dimension", 64); - options.setString("ivfpq.distance.metric", "l2"); - options.setInteger("ivfpq.nlist", 128); - options.setInteger("ivfpq.m", 8); - options.setBoolean("ivfpq.use_opq", true); - options.setInteger("ivfpq.nprobe", 32); - options.setDouble("ivfpq.train.sample_ratio", 0.5); - options.setInteger("ivfpq.add.batch_size", 5000); + options.setString("vector.index.type", "ivf-hnsw-sq"); + options.setInteger("vector.index.dimension", 64); + options.setString("vector.distance.metric", "l2"); + options.setInteger("vector.nlist", 128); + options.setInteger("vector.pq.m", 8); + options.setString("vector.pq.use-opq", "true"); + options.setInteger("vector.hnsw.m", 12); + options.setInteger("vector.hnsw.ef-construction", 64); + options.setInteger("vector.hnsw.max-level", 5); + options.setInteger("vector.nprobe", 32); + 
options.setInteger("vector.hnsw.ef-search", 96); + options.setString("vector.train.sample-ratio", "0.5"); + options.setInteger("vector.add.batch-size", 5000); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); assertThat(indexOptions.dimension()).isEqualTo(64); + assertThat(indexOptions.indexType()).isEqualTo(IndexType.IVF_HNSW_SQ); assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.L2); assertThat(indexOptions.nlist()).isEqualTo(128); assertThat(indexOptions.m()).isEqualTo(8); assertThat(indexOptions.useOpq()).isTrue(); + assertThat(indexOptions.hnswConfig().m()).isEqualTo(12); + assertThat(indexOptions.hnswConfig().efConstruction()).isEqualTo(64); + assertThat(indexOptions.hnswConfig().maxLevel()).isEqualTo(5); assertThat(indexOptions.nprobe()).isEqualTo(32); + assertThat(indexOptions.efSearch()).isEqualTo(96); assertThat(indexOptions.trainSampleRatio()).isEqualTo(0.5); assertThat(indexOptions.addBatchSize()).isEqualTo(5000); } + @Test + public void testIdentifierSelectsIndexType() { + assertThat(new IvfpqVectorIndexOptions(new Options(), "ivf-flat").indexType()) + .isEqualTo(IndexType.IVF_FLAT); + assertThat(new IvfpqVectorIndexOptions(new Options(), "ivf-pq").indexType()) + .isEqualTo(IndexType.IVF_PQ); + assertThat(new IvfpqVectorIndexOptions(new Options(), "ivf-hnsw-flat").indexType()) + .isEqualTo(IndexType.IVF_HNSW_FLAT); + assertThat(new IvfpqVectorIndexOptions(new Options(), "ivf-hnsw-sq").indexType()) + .isEqualTo(IndexType.IVF_HNSW_SQ); + } + + @Test + public void testIdentifierRejectsConflictingIndexType() { + Options options = new Options(); + options.setString("vector.index.type", "ivf-pq"); + + assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options, "ivf-flat")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Conflicting vector index type"); + } + @Test public void testMDivisibilityValidation() { Options options = new Options(); - options.setInteger("ivfpq.index.dimension", 10); - 
options.setInteger("ivfpq.m", 3); + options.setInteger("vector.index.dimension", 10); + options.setInteger("vector.pq.m", 3); assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("must divide"); @@ -78,23 +116,23 @@ public void testMDivisibilityValidation() { @Test public void testInvalidSampleRatio() { Options options = new Options(); - options.setDouble("ivfpq.train.sample_ratio", 0.0); + options.setString("vector.train.sample-ratio", "0.0"); assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options)) .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("sample_ratio"); + .hasMessageContaining("vector.train.sample-ratio"); Options options2 = new Options(); - options2.setDouble("ivfpq.train.sample_ratio", 1.5); + options2.setString("vector.train.sample-ratio", "1.5"); assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options2)) .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("sample_ratio"); + .hasMessageContaining("vector.train.sample-ratio"); } @Test public void testMetricParsing() { for (String metric : new String[] {"l2", "cosine", "inner_product"}) { Options options = new Options(); - options.setString("ivfpq.distance.metric", metric); + options.setString("vector.distance.metric", metric); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); assertThat(indexOptions.metric().getConfigName()).isEqualTo(metric); } @@ -103,7 +141,7 @@ public void testMetricParsing() { @Test public void testMetricParsingUpperCase() { Options options = new Options(); - options.setString("ivfpq.distance.metric", "L2"); + options.setString("vector.distance.metric", "L2"); IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.L2); } diff --git a/paimon-ivfpq/paimon-ivfpq-jni/pom.xml b/paimon-ivfpq/paimon-ivfpq-jni/pom.xml index 7411564074d2..4587ad0403ca 
100644 --- a/paimon-ivfpq/paimon-ivfpq-jni/pom.xml +++ b/paimon-ivfpq/paimon-ivfpq-jni/pom.xml @@ -29,7 +29,7 @@ under the License. paimon-ivfpq-jni - Paimon : IVF-PQ JNI + Paimon : Vector Index JNI 1.8 diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java new file mode 100644 index 000000000000..eb0bb1313f53 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.paimon.index.ivfpq; + +public final class HnswConfig { + + public static final HnswConfig DEFAULT = new HnswConfig(20, 150, 7); + + private final int m; + private final int efConstruction; + private final int maxLevel; + + public HnswConfig(int m, int efConstruction, int maxLevel) { + validatePositive(m, "m"); + validatePositive(efConstruction, "efConstruction"); + validatePositive(maxLevel, "maxLevel"); + this.m = m; + this.efConstruction = efConstruction; + this.maxLevel = maxLevel; + } + + public int m() { + return m; + } + + public int efConstruction() { + return efConstruction; + } + + public int maxLevel() { + return maxLevel; + } + + private static void validatePositive(int value, String name) { + if (value <= 0) { + throw new IllegalArgumentException(name + " must be > 0"); + } + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQNative.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQNative.java deleted file mode 100644 index 5fee60c2c1a0..000000000000 --- a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQNative.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.index.ivfpq; - -final class IVFPQNative { - - static { - NativeLoader.loadJni(); - } - - private IVFPQNative() {} - - static native long createWriter(int d, int nlist, int m, int metric, boolean useOpq); - - static native void train(long ptr, float[] data, int n); - - static native void addVectors(long ptr, long[] ids, float[] data, int n); - - static native void writeIndex(long ptr, Object streamOutput); - - static native void freeWriter(long ptr); - - static native long openReader(Object streamInput); - - static native IVFPQResult search(long ptr, float[] query, int k, int nprobe); - - static native IVFPQResult searchWithRoaringFilter( - long ptr, float[] query, int k, int nprobe, byte[] roaringFilter); - - static native int getDimension(long ptr); - - static native long getTotalVectors(long ptr); - - static native IVFPQBatchResult searchBatch( - long ptr, float[] queries, int queryCount, int k, int nprobe); - - static native IVFPQBatchResult searchBatchWithRoaringFilter( - long ptr, float[] queries, int queryCount, int k, int nprobe, byte[] roaringFilter); - - static native void freeReader(long ptr); -} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQReader.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQReader.java deleted file mode 100644 index 51d5cc094713..000000000000 --- a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQReader.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.index.ivfpq; - -/** Opens a serialized IVF-PQ index for search. */ -public final class IVFPQReader implements AutoCloseable { - - private long nativePtr; - - public IVFPQReader(Object input) { - if (input == null) { - throw new NullPointerException("input"); - } - this.nativePtr = IVFPQNative.openReader(input); - } - - public int dimension() { - return IVFPQNative.getDimension(requireOpen()); - } - - public long totalVectors() { - return IVFPQNative.getTotalVectors(requireOpen()); - } - - public IVFPQResult search(float[] query, int topK, int nprobe) { - if (query == null) { - throw new NullPointerException("query"); - } - validatePositive(topK, "topK"); - validatePositive(nprobe, "nprobe"); - return IVFPQNative.search(requireOpen(), query, topK, nprobe); - } - - public IVFPQResult search(float[] query, int topK, int nprobe, byte[] roaringFilter) { - if (query == null) { - throw new NullPointerException("query"); - } - if (roaringFilter == null) { - throw new NullPointerException("roaringFilter"); - } - validatePositive(topK, "topK"); - validatePositive(nprobe, "nprobe"); - return IVFPQNative.searchWithRoaringFilter(requireOpen(), query, topK, nprobe, roaringFilter); - } - - public IVFPQBatchResult searchBatch(float[] queries, int queryCount, int topK, int nprobe) { - if (queries == null) { - throw new NullPointerException("queries"); - } - 
validatePositive(queryCount, "queryCount"); - validatePositive(topK, "topK"); - validatePositive(nprobe, "nprobe"); - return IVFPQNative.searchBatch(requireOpen(), queries, queryCount, topK, nprobe); - } - - public IVFPQBatchResult searchBatch( - float[] queries, int queryCount, int topK, int nprobe, byte[] roaringFilter) { - if (queries == null) { - throw new NullPointerException("queries"); - } - if (roaringFilter == null) { - throw new NullPointerException("roaringFilter"); - } - validatePositive(queryCount, "queryCount"); - validatePositive(topK, "topK"); - validatePositive(nprobe, "nprobe"); - return IVFPQNative.searchBatchWithRoaringFilter( - requireOpen(), queries, queryCount, topK, nprobe, roaringFilter); - } - - @Override - public void close() { - long ptr = nativePtr; - nativePtr = 0L; - if (ptr != 0L) { - IVFPQNative.freeReader(ptr); - } - } - - private long requireOpen() { - if (nativePtr == 0L) { - throw new IllegalStateException("IVFPQReader is closed"); - } - return nativePtr; - } - - private static void validatePositive(int value, String name) { - if (value <= 0) { - throw new IllegalArgumentException(name + " must be > 0"); - } - } -} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQResult.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQResult.java deleted file mode 100644 index 3b6809b084c9..000000000000 --- a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQResult.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.index.ivfpq; - -import java.util.Arrays; - -/** Result of a single IVF-PQ search query. */ -public final class IVFPQResult { - - private final long[] ids; - private final float[] distances; - - public IVFPQResult(long[] ids, float[] distances) { - if (ids == null) { - throw new NullPointerException("ids"); - } - if (distances == null) { - throw new NullPointerException("distances"); - } - if (ids.length != distances.length) { - throw new IllegalArgumentException( - "ids length " + ids.length + " != distances length " + distances.length); - } - this.ids = ids.clone(); - this.distances = distances.clone(); - } - - public int size() { - return ids.length; - } - - public long[] ids() { - return ids.clone(); - } - - public float[] distances() { - return distances.clone(); - } - - @Override - public String toString() { - return "IVFPQResult{ids=" + Arrays.toString(ids) - + ", distances=" + Arrays.toString(distances) + '}'; - } -} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQWriter.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQWriter.java deleted file mode 100644 index ec4113175890..000000000000 --- a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQWriter.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.index.ivfpq; - -/** Builds an IVF-PQ index via the native Rust library. */ -public final class IVFPQWriter implements AutoCloseable { - - private final int dimension; - private long nativePtr; - - public IVFPQWriter(int dimension, int nlist, int m, Metric metric, boolean useOpq) { - if (metric == null) { - throw new NullPointerException("metric"); - } - validatePositive(dimension, "dimension"); - validatePositive(nlist, "nlist"); - validatePositive(m, "m"); - if (dimension % m != 0) { - throw new IllegalArgumentException("dimension must be divisible by m"); - } - this.dimension = dimension; - this.nativePtr = IVFPQNative.createWriter(dimension, nlist, m, metric.code(), useOpq); - } - - public int dimension() { - return dimension; - } - - public void train(float[] data, int vectorCount) { - validateVectors(data, vectorCount); - IVFPQNative.train(requireOpen(), data, vectorCount); - } - - public void addVectors(long[] ids, float[] data, int vectorCount) { - if (ids == null) { - throw new NullPointerException("ids"); - } - validateVectors(data, vectorCount); - if (ids.length < vectorCount) { - throw new IllegalArgumentException( - "ids length " + ids.length + " < vectorCount " + vectorCount); - } - IVFPQNative.addVectors(requireOpen(), ids, data, 
vectorCount); - } - - public void writeIndex(Object output) { - if (output == null) { - throw new NullPointerException("output"); - } - IVFPQNative.writeIndex(requireOpen(), output); - } - - @Override - public void close() { - long ptr = nativePtr; - nativePtr = 0L; - if (ptr != 0L) { - IVFPQNative.freeWriter(ptr); - } - } - - private void validateVectors(float[] data, int vectorCount) { - if (data == null) { - throw new NullPointerException("data"); - } - validatePositive(vectorCount, "vectorCount"); - long expected = (long) vectorCount * (long) dimension; - if (expected > Integer.MAX_VALUE) { - throw new IllegalArgumentException("vectorCount * dimension overflows int"); - } - if (data.length < expected) { - throw new IllegalArgumentException( - "data length " + data.length + " < vectorCount * dimension " + expected); - } - } - - private long requireOpen() { - if (nativePtr == 0L) { - throw new IllegalStateException("IVFPQWriter is closed"); - } - return nativePtr; - } - - private static void validatePositive(int value, String name) { - if (value <= 0) { - throw new IllegalArgumentException(name + " must be > 0"); - } - } -} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java new file mode 100644 index 000000000000..b61a38794e34 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.paimon.index.ivfpq; + +public enum IndexType { + IVF_FLAT(0), + IVF_PQ(1), + IVF_HNSW_FLAT(2), + IVF_HNSW_SQ(3); + + private final int code; + + IndexType(int code) { + this.code = code; + } + + public int code() { + return code; + } + + static IndexType fromCode(int code) { + for (IndexType type : values()) { + if (type.code == code) { + return type; + } + } + throw new IllegalArgumentException("unknown index type code: " + code); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java new file mode 100644 index 000000000000..285afc181032 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.paimon.index.ivfpq; + +public final class IvfFlatConfig extends VectorIndexConfig { + + public IvfFlatConfig(int dimension, int nlist, Metric metric) { + super(IndexType.IVF_FLAT, dimension, nlist, metric); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java new file mode 100644 index 000000000000..3b17586fbb20 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.paimon.index.ivfpq; + +public final class IvfHnswFlatConfig extends VectorIndexConfig { + + private final HnswConfig hnsw; + + public IvfHnswFlatConfig(int dimension, int nlist, Metric metric, HnswConfig hnsw) { + super(IndexType.IVF_HNSW_FLAT, dimension, nlist, metric); + if (hnsw == null) { + throw new NullPointerException("hnsw"); + } + this.hnsw = hnsw; + } + + public HnswConfig hnsw() { + return hnsw; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java new file mode 100644 index 000000000000..80fe09a3d835 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.paimon.index.ivfpq; + +public final class IvfHnswSqConfig extends VectorIndexConfig { + + private final HnswConfig hnsw; + + public IvfHnswSqConfig(int dimension, int nlist, Metric metric, HnswConfig hnsw) { + super(IndexType.IVF_HNSW_SQ, dimension, nlist, metric); + if (hnsw == null) { + throw new NullPointerException("hnsw"); + } + this.hnsw = hnsw; + } + + public HnswConfig hnsw() { + return hnsw; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java new file mode 100644 index 000000000000..751f560fd208 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.paimon.index.ivfpq; + +public final class IvfPqConfig extends VectorIndexConfig { + + private final int m; + private final boolean useOpq; + + public IvfPqConfig(int dimension, int nlist, int m, Metric metric, boolean useOpq) { + super(IndexType.IVF_PQ, dimension, nlist, metric); + validatePositive(m, "m"); + if (dimension % m != 0) { + throw new IllegalArgumentException("dimension must be divisible by m"); + } + this.m = m; + this.useOpq = useOpq; + } + + public int m() { + return m; + } + + public boolean useOpq() { + return useOpq; + } + + @Override + int pqM() { + return m; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java index 385e477863e1..f31327bb3175 100644 --- a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java @@ -18,7 +18,7 @@ package org.apache.paimon.index.ivfpq; -/** Distance metric for IVF-PQ vector search. */ +/** Distance metric for vector search. */ public enum Metric { L2(0), INNER_PRODUCT(1), diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java index 0e38a050a14b..242880096b0b 100644 --- a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java @@ -28,7 +28,7 @@ import java.io.OutputStream; import java.util.Locale; -/** Utility class for loading the native IVF-PQ JNI library. */ +/** Utility class for loading the native vector index JNI library. 
*/ public final class NativeLoader { private static boolean loaded = false; diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java new file mode 100644 index 000000000000..0152492f398d --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.paimon.index.ivfpq; + +public abstract class VectorIndexConfig { + + private final IndexType indexType; + private final int dimension; + private final int nlist; + private final Metric metric; + + VectorIndexConfig(IndexType indexType, int dimension, int nlist, Metric metric) { + if (indexType == null) { + throw new NullPointerException("indexType"); + } + if (metric == null) { + throw new NullPointerException("metric"); + } + validatePositive(dimension, "dimension"); + validatePositive(nlist, "nlist"); + this.indexType = indexType; + this.dimension = dimension; + this.nlist = nlist; + this.metric = metric; + } + + public static VectorIndexConfig ivfFlat(int dimension, int nlist, Metric metric) { + return new IvfFlatConfig(dimension, nlist, metric); + } + + public static VectorIndexConfig ivfPq( + int dimension, int nlist, int m, Metric metric, boolean useOpq) { + return new IvfPqConfig(dimension, nlist, m, metric, useOpq); + } + + public static VectorIndexConfig ivfHnswFlat( + int dimension, int nlist, Metric metric, HnswConfig hnsw) { + return new IvfHnswFlatConfig(dimension, nlist, metric, hnsw); + } + + public static VectorIndexConfig ivfHnswSq( + int dimension, int nlist, Metric metric, HnswConfig hnsw) { + return new IvfHnswSqConfig(dimension, nlist, metric, hnsw); + } + + public IndexType indexType() { + return indexType; + } + + public int dimension() { + return dimension; + } + + public int nlist() { + return nlist; + } + + public Metric metric() { + return metric; + } + + int pqM() { + return 0; + } + + boolean useOpq() { + return false; + } + + HnswConfig hnsw() { + return HnswConfig.DEFAULT; + } + + static void validatePositive(int value, String name) { + if (value <= 0) { + throw new IllegalArgumentException(name + " must be > 0"); + } + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java 
b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java new file mode 100644 index 000000000000..451c88463980 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java @@ -0,0 +1,23 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.paimon.index.ivfpq; + +public interface VectorIndexInput { + + void pread(long[] positions, byte[][] buffers); +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java new file mode 100644 index 000000000000..931a5e4af471 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.paimon.index.ivfpq; + +public final class VectorIndexMetadata { + + private final IndexType indexType; + private final int dimension; + private final int nlist; + private final Metric metric; + private final long totalVectors; + private final int pqM; + private final HnswConfig hnsw; + + public VectorIndexMetadata( + int indexType, + int dimension, + int nlist, + int metric, + long totalVectors, + int pqM, + int hnswM, + int efConstruction, + int maxLevel) { + this.indexType = IndexType.fromCode(indexType); + this.dimension = dimension; + this.nlist = nlist; + this.metric = metricFromCode(metric); + this.totalVectors = totalVectors; + this.pqM = pqM; + this.hnsw = hnswM > 0 ? 
new HnswConfig(hnswM, efConstruction, maxLevel) : null; + } + + public IndexType indexType() { + return indexType; + } + + public int dimension() { + return dimension; + } + + public int nlist() { + return nlist; + } + + public Metric metric() { + return metric; + } + + public long totalVectors() { + return totalVectors; + } + + public int pqM() { + return pqM; + } + + public HnswConfig hnsw() { + return hnsw; + } + + private static Metric metricFromCode(int code) { + for (Metric metric : Metric.values()) { + if (metric.code() == code) { + return metric; + } + } + throw new IllegalArgumentException("unknown metric code: " + code); + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java new file mode 100644 index 000000000000..d4c75e65ea38 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.paimon.index.ivfpq; + +final class VectorIndexNative { + + static { + NativeLoader.loadJni(); + } + + private VectorIndexNative() {} + + static native long createWriter( + int indexType, + int dimension, + int nlist, + int pqM, + int metric, + boolean useOpq, + int hnswM, + int efConstruction, + int maxLevel); + + static native void train(long ptr, float[] data, int n); + + static native void addVectors(long ptr, long[] ids, float[] data, int n); + + static native void writeIndex(long ptr, Object streamOutput); + + static native void freeWriter(long ptr); + + static native long openReader(Object streamInput); + + static native VectorIndexMetadata metadata(long ptr); + + static native VectorSearchResult search(long ptr, float[] query, int k, int nprobe, int efSearch); + + static native VectorSearchResult searchWithRoaringFilter( + long ptr, float[] query, int k, int nprobe, int efSearch, byte[] roaringFilter); + + static native VectorSearchBatchResult searchBatch( + long ptr, float[] queries, int queryCount, int k, int nprobe, int efSearch); + + static native VectorSearchBatchResult searchBatchWithRoaringFilter( + long ptr, + float[] queries, + int queryCount, + int k, + int nprobe, + int efSearch, + byte[] roaringFilter); + + static native void freeReader(long ptr); +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java new file mode 100644 index 000000000000..d0da1f42a8dc --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.paimon.index.ivfpq; + +public final class VectorIndexReader implements AutoCloseable { + + private long nativePtr; + private VectorIndexMetadata metadata; + + public VectorIndexReader(VectorIndexInput input) { + if (input == null) { + throw new NullPointerException("input"); + } + this.nativePtr = VectorIndexNative.openReader(input); + } + + private VectorIndexReader(long nativePtr) { + this.nativePtr = nativePtr; + } + + static VectorIndexReader fromNativePointerForTesting(long nativePtr) { + return new VectorIndexReader(nativePtr); + } + + public VectorIndexMetadata metadata() { + if (metadata == null) { + metadata = VectorIndexNative.metadata(requireOpen()); + } + return metadata; + } + + public IndexType indexType() { + return metadata().indexType(); + } + + public int dimension() { + return metadata().dimension(); + } + + public long totalVectors() { + return metadata().totalVectors(); + } + + public VectorSearchResult search(float[] query, int topK, int nprobe) { + return search(query, topK, nprobe, 0); + } + + public VectorSearchResult search(float[] query, int topK, int nprobe, int efSearch) { + validateQuery(query); + validateSearchParams(topK, nprobe, efSearch); + return VectorIndexNative.search(requireOpen(), query, topK, nprobe, efSearch); + } + + public VectorSearchResult search(float[] query, int topK, int nprobe, byte[] roaringFilter) { + return 
search(query, topK, nprobe, 0, roaringFilter); + } + + public VectorSearchResult search( + float[] query, int topK, int nprobe, int efSearch, byte[] roaringFilter) { + validateQuery(query); + if (roaringFilter == null) { + throw new NullPointerException("roaringFilter"); + } + validateSearchParams(topK, nprobe, efSearch); + return VectorIndexNative.searchWithRoaringFilter( + requireOpen(), query, topK, nprobe, efSearch, roaringFilter); + } + + public VectorSearchBatchResult searchBatch( + float[] queries, int queryCount, int topK, int nprobe) { + return searchBatch(queries, queryCount, topK, nprobe, 0); + } + + public VectorSearchBatchResult searchBatch( + float[] queries, int queryCount, int topK, int nprobe, int efSearch) { + validateQueries(queries, queryCount); + validateSearchParams(topK, nprobe, efSearch); + return VectorIndexNative.searchBatch(requireOpen(), queries, queryCount, topK, nprobe, efSearch); + } + + public VectorSearchBatchResult searchBatch( + float[] queries, int queryCount, int topK, int nprobe, byte[] roaringFilter) { + return searchBatch(queries, queryCount, topK, nprobe, 0, roaringFilter); + } + + public VectorSearchBatchResult searchBatch( + float[] queries, + int queryCount, + int topK, + int nprobe, + int efSearch, + byte[] roaringFilter) { + validateQueries(queries, queryCount); + if (roaringFilter == null) { + throw new NullPointerException("roaringFilter"); + } + validateSearchParams(topK, nprobe, efSearch); + return VectorIndexNative.searchBatchWithRoaringFilter( + requireOpen(), queries, queryCount, topK, nprobe, efSearch, roaringFilter); + } + + @Override + public void close() { + long ptr = nativePtr; + nativePtr = 0L; + if (ptr != 0L) { + VectorIndexNative.freeReader(ptr); + } + } + + private void validateQuery(float[] query) { + if (query == null) { + throw new NullPointerException("query"); + } + if (query.length != dimension()) { + throw new IllegalArgumentException( + "query length " + query.length + " != index dimension " + 
dimension()); + } + } + + private void validateQueries(float[] queries, int queryCount) { + if (queries == null) { + throw new NullPointerException("queries"); + } + VectorIndexConfig.validatePositive(queryCount, "queryCount"); + long expected = (long) queryCount * (long) dimension(); + if (expected > Integer.MAX_VALUE) { + throw new IllegalArgumentException("queryCount * dimension overflows int"); + } + if (queries.length != expected) { + throw new IllegalArgumentException( + "queries length " + queries.length + " != queryCount * dimension " + expected); + } + } + + private static void validateSearchParams(int topK, int nprobe, int efSearch) { + VectorIndexConfig.validatePositive(topK, "topK"); + VectorIndexConfig.validatePositive(nprobe, "nprobe"); + if (efSearch < 0) { + throw new IllegalArgumentException("efSearch must be >= 0"); + } + } + + private long requireOpen() { + if (nativePtr == 0L) { + throw new IllegalStateException("VectorIndexReader is closed"); + } + return nativePtr; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java new file mode 100644 index 000000000000..f62d19513ac6 --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.paimon.index.ivfpq; + +public final class VectorIndexWriter implements AutoCloseable { + + private final VectorIndexConfig config; + private long nativePtr; + + public VectorIndexWriter(VectorIndexConfig config) { + if (config == null) { + throw new NullPointerException("config"); + } + this.config = config; + HnswConfig hnsw = config.hnsw(); + this.nativePtr = + VectorIndexNative.createWriter( + config.indexType().code(), + config.dimension(), + config.nlist(), + config.pqM(), + config.metric().code(), + config.useOpq(), + hnsw.m(), + hnsw.efConstruction(), + hnsw.maxLevel()); + } + + private VectorIndexWriter(long nativePtr, VectorIndexConfig config) { + this.nativePtr = nativePtr; + this.config = config; + } + + static VectorIndexWriter fromNativePointerForTesting(long nativePtr, VectorIndexConfig config) { + return new VectorIndexWriter(nativePtr, config); + } + + public VectorIndexConfig config() { + return config; + } + + public int dimension() { + return config.dimension(); + } + + public void train(float[] data, int vectorCount) { + validateVectors(data, vectorCount); + VectorIndexNative.train(requireOpen(), data, vectorCount); + } + + public void addVectors(long[] ids, float[] data, int vectorCount) { + if (ids == null) { + throw new NullPointerException("ids"); + } + validateVectors(data, vectorCount); + if (ids.length < vectorCount) { + throw new IllegalArgumentException( + "ids length " + ids.length + " < vectorCount " + vectorCount); + } + VectorIndexNative.addVectors(requireOpen(), ids, data, vectorCount); + } + 
+ public void writeIndex(Object output) { + if (output == null) { + throw new NullPointerException("output"); + } + VectorIndexNative.writeIndex(requireOpen(), output); + } + + @Override + public void close() { + long ptr = nativePtr; + nativePtr = 0L; + if (ptr != 0L) { + VectorIndexNative.freeWriter(ptr); + } + } + + private void validateVectors(float[] data, int vectorCount) { + if (data == null) { + throw new NullPointerException("data"); + } + VectorIndexConfig.validatePositive(vectorCount, "vectorCount"); + long expected = (long) vectorCount * (long) config.dimension(); + if (expected > Integer.MAX_VALUE) { + throw new IllegalArgumentException("vectorCount * dimension overflows int"); + } + if (data.length < expected) { + throw new IllegalArgumentException( + "data length " + data.length + " < vectorCount * dimension " + expected); + } + } + + private long requireOpen() { + if (nativePtr == 0L) { + throw new IllegalStateException("VectorIndexWriter is closed"); + } + return nativePtr; + } +} diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQBatchResult.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java similarity index 71% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQBatchResult.java rename to paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java index 0ef0067ec958..05ff0ad6a13f 100644 --- a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IVFPQBatchResult.java +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java @@ -1,34 +1,32 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. package org.apache.paimon.index.ivfpq; import java.util.Arrays; -/** Result of a batch IVF-PQ search query. 
*/ -public final class IVFPQBatchResult { +public final class VectorSearchBatchResult { private final long[] ids; private final float[] distances; private final int queryCount; private final int topK; - public IVFPQBatchResult(long[] ids, float[] distances, int queryCount, int topK) { + public VectorSearchBatchResult(long[] ids, float[] distances, int queryCount, int topK) { if (ids == null) { throw new NullPointerException("ids"); } diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java new file mode 100644 index 000000000000..b0a1e559046d --- /dev/null +++ b/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.paimon.index.ivfpq; + +import java.util.Arrays; + +public final class VectorSearchResult { + + private final long[] ids; + private final float[] distances; + + public VectorSearchResult(long[] ids, float[] distances) { + if (ids == null) { + throw new NullPointerException("ids"); + } + if (distances == null) { + throw new NullPointerException("distances"); + } + if (ids.length != distances.length) { + throw new IllegalArgumentException( + "ids length " + ids.length + " != distances length " + distances.length); + } + this.ids = ids.clone(); + this.distances = distances.clone(); + } + + public int size() { + return ids.length; + } + + public long[] ids() { + return ids.clone(); + } + + public float[] distances() { + return distances.clone(); + } + + @Override + public String toString() { + return "VectorSearchResult{ids=" + + Arrays.toString(ids) + + ", distances=" + + Arrays.toString(distances) + + '}'; + } +} diff --git a/paimon-ivfpq/pom.xml b/paimon-ivfpq/pom.xml index 8e1ec5bc8779..92272343ac45 100644 --- a/paimon-ivfpq/pom.xml +++ b/paimon-ivfpq/pom.xml @@ -29,7 +29,7 @@ under the License. 
paimon-ivfpq - Paimon : IVF-PQ : + Paimon : Vector Index : pom From e5dcd09dac9704c113cf5d5b4099c50337ce06c4 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 10 Jun 2026 17:21:45 +0800 Subject: [PATCH 04/11] [vector] Rename vector index module --- ...case-ivfpq.yml => utcase-vector-index.yml} | 14 +-- .../paimon-vector-index}/pom.xml | 6 +- .../IvfFlatVectorGlobalIndexerFactory.java | 4 +- ...IvfHnswFlatVectorGlobalIndexerFactory.java | 4 +- .../IvfHnswSqVectorGlobalIndexerFactory.java | 4 +- ...PqAlgorithmVectorGlobalIndexerFactory.java | 4 +- .../vector/index/VectorGlobalIndexReader.java | 18 ++-- .../vector/index/VectorGlobalIndexWriter.java | 14 +-- .../vector/index/VectorGlobalIndexer.java | 18 ++-- .../index/VectorGlobalIndexerFactory.java | 6 +- .../paimon/vector/index/VectorIndexMeta.java | 20 ++--- .../vector/index/VectorIndexOptions.java | 18 ++-- .../paimon/vector/index/VectorMetric.java | 12 +-- ...he.paimon.globalindex.GlobalIndexerFactory | 10 +-- .../vector/index/VectorGlobalIndexTest.java | 90 +++++++++---------- .../index/VectorGlobalIndexerFactoryTest.java | 8 +- .../vector/index/VectorIndexOptionsTest.java | 36 ++++---- .../paimon-vector-jni}/pom.xml | 4 +- .../apache/paimon/index/ivfpq/HnswConfig.java | 0 .../apache/paimon/index/ivfpq/IndexType.java | 0 .../paimon/index/ivfpq/IvfFlatConfig.java | 0 .../paimon/index/ivfpq/IvfHnswFlatConfig.java | 0 .../paimon/index/ivfpq/IvfHnswSqConfig.java | 0 .../paimon/index/ivfpq/IvfPqConfig.java | 0 .../org/apache/paimon/index/ivfpq/Metric.java | 0 .../paimon/index/ivfpq/NativeLoader.java | 0 .../paimon/index/ivfpq/VectorIndexConfig.java | 0 .../paimon/index/ivfpq/VectorIndexInput.java | 0 .../index/ivfpq/VectorIndexMetadata.java | 0 .../paimon/index/ivfpq/VectorIndexNative.java | 0 .../paimon/index/ivfpq/VectorIndexReader.java | 0 .../paimon/index/ivfpq/VectorIndexWriter.java | 0 .../index/ivfpq/VectorSearchBatchResult.java | 0 .../index/ivfpq/VectorSearchResult.java | 0 {paimon-ivfpq => 
paimon-vector}/pom.xml | 8 +- pom.xml | 2 +- 36 files changed, 148 insertions(+), 152 deletions(-) rename .github/workflows/{utcase-ivfpq.yml => utcase-vector-index.yml} (86%) rename {paimon-ivfpq/paimon-ivfpq-index => paimon-vector/paimon-vector-index}/pom.xml (97%) rename {paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector}/index/IvfFlatVectorGlobalIndexerFactory.java (88%) rename {paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector}/index/IvfHnswFlatVectorGlobalIndexerFactory.java (88%) rename {paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector}/index/IvfHnswSqVectorGlobalIndexerFactory.java (88%) rename {paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector}/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java (88%) rename paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java (96%) rename paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java (97%) rename paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java (72%) rename paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java => 
paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java (86%) rename paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java (89%) rename paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java (96%) rename paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java => paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java (83%) rename {paimon-ivfpq/paimon-ivfpq-index => paimon-vector/paimon-vector-index}/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory (70%) rename paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java => paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java (82%) rename paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java => paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java (89%) rename paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java => paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java (79%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/pom.xml (96%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => 
paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/Metric.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java (100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java 
(100%) rename {paimon-ivfpq/paimon-ivfpq-jni => paimon-vector/paimon-vector-jni}/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java (100%) rename {paimon-ivfpq => paimon-vector}/pom.xml (88%) diff --git a/.github/workflows/utcase-ivfpq.yml b/.github/workflows/utcase-vector-index.yml similarity index 86% rename from .github/workflows/utcase-ivfpq.yml rename to .github/workflows/utcase-vector-index.yml index fc63ae1cd6fd..fa3da08567c3 100644 --- a/.github/workflows/utcase-ivfpq.yml +++ b/.github/workflows/utcase-vector-index.yml @@ -16,15 +16,15 @@ # limitations under the License. ################################################################################ -name: UTCase IVF-PQ +name: UTCase Vector Index on: push: paths: - - 'paimon-ivfpq/**' + - 'paimon-vector/**' pull_request: paths: - - 'paimon-ivfpq/**' + - 'paimon-vector/**' env: JDK_VERSION: 8 @@ -35,7 +35,7 @@ concurrency: cancel-in-progress: true jobs: - ivfpq_test: + vector_index_test: runs-on: ubuntu-latest steps: @@ -61,14 +61,14 @@ jobs: - name: Copy native library to resources run: | - RESOURCE_DIR=paimon-ivfpq/paimon-ivfpq-jni/src/main/resources/native/linux-amd64 + RESOURCE_DIR=paimon-vector/paimon-vector-jni/src/main/resources/native/linux-amd64 mkdir -p ${RESOURCE_DIR} cp /tmp/paimon-vector-index/target/release/libpaimon_vindex_jni.so ${RESOURCE_DIR}/ - - name: Build and test IVF-PQ modules + - name: Build and test vector index modules timeout-minutes: 30 run: | mvn -T 2C -B -ntp clean install -DskipTests - mvn -B -ntp verify -pl paimon-ivfpq/paimon-ivfpq-jni,paimon-ivfpq/paimon-ivfpq-index -Dcheckstyle.skip=true -Dspotless.check.skip=true + mvn -B -ntp verify -pl paimon-vector/paimon-vector-jni,paimon-vector/paimon-vector-index -Dcheckstyle.skip=true -Dspotless.check.skip=true env: MAVEN_OPTS: -Xmx4096m diff --git a/paimon-ivfpq/paimon-ivfpq-index/pom.xml b/paimon-vector/paimon-vector-index/pom.xml similarity index 97% rename from paimon-ivfpq/paimon-ivfpq-index/pom.xml rename to 
paimon-vector/paimon-vector-index/pom.xml index 4f9ca0381493..6744de7c9e4a 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/pom.xml +++ b/paimon-vector/paimon-vector-index/pom.xml @@ -23,18 +23,18 @@ under the License. 4.0.0 - paimon-ivfpq + paimon-vector org.apache.paimon 1.5-SNAPSHOT - paimon-ivfpq-index + paimon-vector-index Paimon : Vector Index org.apache.paimon - paimon-ivfpq-jni + paimon-vector-jni ${project.version} diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java similarity index 88% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfFlatVectorGlobalIndexerFactory.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java index 24c25e58bc8a..572c7cf4edb2 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java @@ -16,10 +16,10 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; /** Factory for the {@code ivf-flat} vector index identifier. 
*/ -public class IvfFlatVectorGlobalIndexerFactory extends IvfpqVectorGlobalIndexerFactory { +public class IvfFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { public static final String IDENTIFIER = "ivf-flat"; diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java similarity index 88% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswFlatVectorGlobalIndexerFactory.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java index bf4fc55369b9..159e7af6f1ba 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java @@ -16,10 +16,10 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; /** Factory for the {@code ivf-hnsw-flat} vector index identifier. 
*/ -public class IvfHnswFlatVectorGlobalIndexerFactory extends IvfpqVectorGlobalIndexerFactory { +public class IvfHnswFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { public static final String IDENTIFIER = "ivf-hnsw-flat"; diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswSqVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java similarity index 88% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswSqVectorGlobalIndexerFactory.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java index 646e068998cc..51c72cd8f39c 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfHnswSqVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java @@ -16,10 +16,10 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; /** Factory for the {@code ivf-hnsw-sq} vector index identifier. 
*/ -public class IvfHnswSqVectorGlobalIndexerFactory extends IvfpqVectorGlobalIndexerFactory { +public class IvfHnswSqVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { public static final String IDENTIFIER = "ivf-hnsw-sq"; diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java similarity index 88% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java index 53c876279bc1..f3932de46ed6 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java @@ -16,10 +16,10 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; /** Factory for the {@code ivf-pq} vector index identifier. 
*/ -public class IvfPqAlgorithmVectorGlobalIndexerFactory extends IvfpqVectorGlobalIndexerFactory { +public class IvfPqAlgorithmVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { public static final String IDENTIFIER = "ivf-pq"; diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java similarity index 96% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java index 5b740006171f..2b017888b564 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexReader.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.fs.SeekableInputStream; import org.apache.paimon.globalindex.GlobalIndexIOMeta; @@ -51,23 +51,23 @@ *

Each shard has exactly one vector index file. The reader lazily opens the index and performs * vector similarity search. */ -public class IvfpqVectorGlobalIndexReader implements GlobalIndexReader { +public class VectorGlobalIndexReader implements GlobalIndexReader { private final GlobalIndexIOMeta ioMeta; private final GlobalIndexFileReader fileReader; private final DataType fieldType; - private final IvfpqVectorIndexOptions options; + private final VectorIndexOptions options; private final ExecutorService executor; - private volatile IvfpqIndexMeta indexMeta; + private volatile VectorIndexMeta indexMeta; private volatile VectorIndexReader vectorReader; private SeekableInputStream openStream; - public IvfpqVectorGlobalIndexReader( + public VectorGlobalIndexReader( GlobalIndexFileReader fileReader, List ioMetas, DataType fieldType, - IvfpqVectorIndexOptions options, + VectorIndexOptions options, ExecutorService executor) { checkArgument(ioMetas.size() == 1, "Expected exactly one index file per shard"); this.executor = executor; @@ -101,7 +101,7 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep float[] queryVector = vectorSearch.vector().clone(); int limit = vectorSearch.limit(); int nprobe = indexMeta.nprobe(); - IvfpqVectorMetric metric = indexMeta.metric(); + VectorMetric metric = indexMeta.metric(); RoaringNavigableMap64 includeRowIds = vectorSearch.includeRowIds(); VectorSearchResult result; @@ -158,7 +158,7 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep }); } - private static float convertDistanceToScore(float distance, IvfpqVectorMetric metric) { + private static float convertDistanceToScore(float distance, VectorMetric metric) { switch (metric) { case L2: return 1.0f / (1.0f + distance); @@ -200,7 +200,7 @@ private void ensureLoaded() throws IOException { if (vectorReader == null) { synchronized (this) { if (vectorReader == null) { - indexMeta = 
IvfpqIndexMeta.deserialize(ioMeta.metadata()); + indexMeta = VectorIndexMeta.deserialize(ioMeta.metadata()); SeekableInputStream in = fileReader.getInputStream(ioMeta); try { vectorReader = diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java similarity index 97% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java index 57b57a8f30c4..b26015a9fd31 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexWriter.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalVector; @@ -53,16 +53,16 @@ * *

Thread safety: This class is not thread-safe. */ -public class IvfpqVectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Closeable { +public class VectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Closeable { private static final String FILE_NAME_PREFIX = "vector"; - private static final Logger LOG = LoggerFactory.getLogger(IvfpqVectorGlobalIndexWriter.class); + private static final Logger LOG = LoggerFactory.getLogger(VectorGlobalIndexWriter.class); private static final int IO_BUFFER_SIZE = 8 * 1024 * 1024; private final GlobalIndexFileWriter fileWriter; - private final IvfpqVectorIndexOptions options; + private final VectorIndexOptions options; private final int dim; private File tempVectorFile; @@ -76,8 +76,8 @@ public class IvfpqVectorGlobalIndexWriter implements GlobalIndexSingletonWriter, private long logicalRowId; - public IvfpqVectorGlobalIndexWriter( - GlobalIndexFileWriter fileWriter, DataType fieldType, IvfpqVectorIndexOptions options) { + public VectorGlobalIndexWriter( + GlobalIndexFileWriter fileWriter, DataType fieldType, VectorIndexOptions options) { this.fileWriter = fileWriter; this.options = options; this.dim = options.dimension(); @@ -267,7 +267,7 @@ private ResultEntry buildIndex() throws IOException { options.logName(), System.currentTimeMillis() - buildStart); - IvfpqIndexMeta meta = new IvfpqIndexMeta(options); + VectorIndexMeta meta = new VectorIndexMeta(options); return new ResultEntry(fileName, logicalRowId, meta.serialize()); } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java similarity index 72% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java index 
b7574886024d..193424846b25 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexer.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -31,23 +31,23 @@ import java.util.concurrent.ExecutorService; /** Vector global indexer backed by paimon-vector-index. */ -public class IvfpqVectorGlobalIndexer implements GlobalIndexer { +public class VectorGlobalIndexer implements GlobalIndexer { private final DataType fieldType; - private final IvfpqVectorIndexOptions options; + private final VectorIndexOptions options; - public IvfpqVectorGlobalIndexer(DataType fieldType, Options options) { - this(fieldType, options, IvfpqVectorGlobalIndexerFactory.IDENTIFIER); + public VectorGlobalIndexer(DataType fieldType, Options options) { + this(fieldType, options, VectorGlobalIndexerFactory.IDENTIFIER); } - public IvfpqVectorGlobalIndexer(DataType fieldType, Options options, String identifier) { + public VectorGlobalIndexer(DataType fieldType, Options options, String identifier) { this.fieldType = fieldType; - this.options = new IvfpqVectorIndexOptions(options, identifier); + this.options = new VectorIndexOptions(options, identifier); } @Override public GlobalIndexWriter createWriter(GlobalIndexFileWriter fileWriter) { - return new IvfpqVectorGlobalIndexWriter(fileWriter, fieldType, options); + return new VectorGlobalIndexWriter(fileWriter, fieldType, options); } @Override @@ -55,6 +55,6 @@ public GlobalIndexReader createReader( GlobalIndexFileReader fileReader, List files, ExecutorService executor) { - return new IvfpqVectorGlobalIndexReader(fileReader, files, fieldType, options, executor); + return new 
VectorGlobalIndexReader(fileReader, files, fieldType, options, executor); } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java similarity index 86% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java index fd6a0aafb353..c6554679aeb2 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.GlobalIndexerFactory; @@ -24,7 +24,7 @@ import org.apache.paimon.types.DataField; /** Factory for creating vector indexes backed by paimon-vector-index. 
*/ -public class IvfpqVectorGlobalIndexerFactory implements GlobalIndexerFactory { +public class VectorGlobalIndexerFactory implements GlobalIndexerFactory { public static final String IDENTIFIER = "vector"; @@ -35,6 +35,6 @@ public String identifier() { @Override public GlobalIndexer create(DataField field, Options options) { - return new IvfpqVectorGlobalIndexer(field.type(), options, identifier()); + return new VectorGlobalIndexer(field.type(), options, identifier()); } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java similarity index 89% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java index 60e0aee4488a..d47a582cfe6d 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqIndexMeta.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.index.ivfpq.HnswConfig; import org.apache.paimon.index.ivfpq.IndexType; @@ -35,7 +35,7 @@ *

Serialized as a flat JSON {@code Map} storing the index build parameters * required for correct search-time behavior. */ -public class IvfpqIndexMeta implements Serializable { +public class VectorIndexMeta implements Serializable { private static final long serialVersionUID = 1L; @@ -58,9 +58,9 @@ public class IvfpqIndexMeta implements Serializable { private final Map params; - public IvfpqIndexMeta(IvfpqVectorIndexOptions options) { + public VectorIndexMeta(VectorIndexOptions options) { this.params = new LinkedHashMap<>(); - params.put(KEY_INDEX_TYPE, IvfpqVectorIndexOptions.toIdentifier(options.indexType())); + params.put(KEY_INDEX_TYPE, VectorIndexOptions.toIdentifier(options.indexType())); params.put(KEY_DIMENSION, String.valueOf(options.dimension())); params.put(KEY_METRIC, options.metric().getConfigName()); params.put(KEY_NLIST, String.valueOf(options.nlist())); @@ -73,7 +73,7 @@ public IvfpqIndexMeta(IvfpqVectorIndexOptions options) { params.put(KEY_EF_SEARCH, String.valueOf(options.efSearch())); } - private IvfpqIndexMeta(Map params) { + private VectorIndexMeta(Map params) { this.params = new LinkedHashMap<>(params); } @@ -83,15 +83,15 @@ public IndexType indexType() { throw new IllegalArgumentException( "Missing required key in vector index metadata: " + KEY_INDEX_TYPE); } - return IvfpqVectorIndexOptions.parseIndexType(value); + return VectorIndexOptions.parseIndexType(value); } public int dimension() { return Integer.parseInt(params.get(KEY_DIMENSION)); } - public IvfpqVectorMetric metric() { - return IvfpqVectorMetric.fromConfigName(params.get(KEY_METRIC)); + public VectorMetric metric() { + return VectorMetric.fromConfigName(params.get(KEY_METRIC)); } public int nlist() { @@ -125,7 +125,7 @@ public byte[] serialize() throws IOException { return OBJECT_MAPPER.writeValueAsBytes(params); } - public static IvfpqIndexMeta deserialize(byte[] data) throws IOException { + public static VectorIndexMeta deserialize(byte[] data) throws IOException { Map map = 
OBJECT_MAPPER.readValue(data, MAP_TYPE_REF); if (!map.containsKey(KEY_DIMENSION)) { throw new IOException( @@ -138,7 +138,7 @@ public static IvfpqIndexMeta deserialize(byte[] data) throws IOException { if (!map.containsKey(KEY_METRIC)) { throw new IOException("Missing required key in vector index metadata: " + KEY_METRIC); } - return new IvfpqIndexMeta(map); + return new VectorIndexMeta(map); } private int intValue(String key, int defaultValue) { diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java similarity index 96% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java index 6fb115c20c2a..b3c12e894104 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptions.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.index.ivfpq.HnswConfig; import org.apache.paimon.index.ivfpq.IndexType; @@ -28,7 +28,7 @@ import java.util.Locale; /** Options for the Paimon vector index backed by paimon-vector-index. 
*/ -public class IvfpqVectorIndexOptions { +public class VectorIndexOptions { public static final String DEFAULT_IDENTIFIER = "vector"; public static final String IVF_FLAT_IDENTIFIER = "ivf-flat"; @@ -124,7 +124,7 @@ public class IvfpqVectorIndexOptions { private final String identifier; private final IndexType indexType; private final int dimension; - private final IvfpqVectorMetric metric; + private final VectorMetric metric; private final int nlist; private final int m; private final boolean useOpq; @@ -134,11 +134,11 @@ public class IvfpqVectorIndexOptions { private final double trainSampleRatio; private final int addBatchSize; - public IvfpqVectorIndexOptions(Options options) { + public VectorIndexOptions(Options options) { this(options, DEFAULT_IDENTIFIER); } - public IvfpqVectorIndexOptions(Options options, String identifier) { + public VectorIndexOptions(Options options, String identifier) { this.identifier = normalizeIdentifier(identifier); this.indexType = resolveIndexType(options, this.identifier); this.dimension = validatePositive(options.get(DIMENSION), optionKey(DIMENSION)); @@ -184,7 +184,7 @@ public int dimension() { return dimension; } - public IvfpqVectorMetric metric() { + public VectorMetric metric() { return metric; } @@ -243,11 +243,11 @@ public String logName() { return toIdentifier(indexType); } - private static IvfpqVectorMetric parseMetric(String value) { + private static VectorMetric parseMetric(String value) { try { - return IvfpqVectorMetric.fromConfigName(value); + return VectorMetric.fromConfigName(value); } catch (IllegalArgumentException e) { - return IvfpqVectorMetric.fromString(value); + return VectorMetric.fromString(value); } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java similarity index 83% rename from 
paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java rename to paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java index 636c01c3cbe4..e8642303406d 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/java/org/apache/paimon/ivfpq/index/IvfpqVectorMetric.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java @@ -16,12 +16,12 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.index.ivfpq.Metric; /** Enumeration of supported vector distance metrics. */ -public enum IvfpqVectorMetric { +public enum VectorMetric { L2("l2", Metric.L2), COSINE("cosine", Metric.COSINE), INNER_PRODUCT("inner_product", Metric.INNER_PRODUCT); @@ -29,7 +29,7 @@ public enum IvfpqVectorMetric { private final String configName; private final Metric nativeMetric; - IvfpqVectorMetric(String configName, Metric nativeMetric) { + VectorMetric(String configName, Metric nativeMetric) { this.configName = configName; this.nativeMetric = nativeMetric; } @@ -42,8 +42,8 @@ public Metric toNativeMetric() { return nativeMetric; } - public static IvfpqVectorMetric fromConfigName(String configName) { - for (IvfpqVectorMetric m : values()) { + public static VectorMetric fromConfigName(String configName) { + for (VectorMetric m : values()) { if (m.configName.equals(configName)) { return m; } @@ -51,7 +51,7 @@ public static IvfpqVectorMetric fromConfigName(String configName) { throw new IllegalArgumentException("Unknown metric: " + configName); } - public static IvfpqVectorMetric fromString(String name) { + public static VectorMetric fromString(String name) { return valueOf(name.toUpperCase()); } } diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory 
b/paimon-vector/paimon-vector-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory similarity index 70% rename from paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory rename to paimon-vector/paimon-vector-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory index 7a9e3c63e4ed..ffcfe9350e81 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory +++ b/paimon-vector/paimon-vector-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.paimon.ivfpq.index.IvfpqVectorGlobalIndexerFactory -org.apache.paimon.ivfpq.index.IvfFlatVectorGlobalIndexerFactory -org.apache.paimon.ivfpq.index.IvfPqAlgorithmVectorGlobalIndexerFactory -org.apache.paimon.ivfpq.index.IvfHnswFlatVectorGlobalIndexerFactory -org.apache.paimon.ivfpq.index.IvfHnswSqVectorGlobalIndexerFactory +org.apache.paimon.vector.index.VectorGlobalIndexerFactory +org.apache.paimon.vector.index.IvfFlatVectorGlobalIndexerFactory +org.apache.paimon.vector.index.IvfPqAlgorithmVectorGlobalIndexerFactory +org.apache.paimon.vector.index.IvfHnswFlatVectorGlobalIndexerFactory +org.apache.paimon.vector.index.IvfHnswSqVectorGlobalIndexerFactory diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java similarity index 82% rename from paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java rename to paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java index 
507241bf1eaa..7b524c26a759 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.Path; @@ -55,8 +55,8 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -/** Tests for {@link IvfpqVectorGlobalIndexWriter} and {@link IvfpqVectorGlobalIndexReader}. */ -public class IvfpqVectorGlobalIndexTest { +/** Tests for {@link VectorGlobalIndexWriter} and {@link VectorGlobalIndexReader}. */ +public class VectorGlobalIndexTest { @TempDir java.nio.file.Path tempDir; @@ -98,10 +98,10 @@ public void cleanup() throws IOException { @Test public void testDimensionMismatch() { Options options = createDefaultOptions(64); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - IvfpqVectorGlobalIndexWriter writer = - new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = + new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); float[] wrongDimVector = new float[32]; assertThatThrownBy(() -> writer.write(wrongDimVector)) @@ -114,13 +114,10 @@ public void testVectorTypeRejectsNonFloatElement() { DataType intVecType = new VectorType(2, new IntType()); Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); GlobalIndexFileWriter fileWriter = 
createFileWriter(indexPath); - assertThatThrownBy( - () -> - new IvfpqVectorGlobalIndexWriter( - fileWriter, intVecType, indexOptions)) + assertThatThrownBy(() -> new VectorGlobalIndexWriter(fileWriter, intVecType, indexOptions)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("float"); } @@ -129,10 +126,10 @@ public void testVectorTypeRejectsNonFloatElement() { public void testNanInVectorRejected() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - IvfpqVectorGlobalIndexWriter writer = - new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = + new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); assertThatThrownBy(() -> writer.write(new float[] {1.0f, Float.NaN})) .isInstanceOf(IllegalArgumentException.class) @@ -145,10 +142,10 @@ public void testNanInVectorRejected() { public void testInfinityInVectorRejected() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - IvfpqVectorGlobalIndexWriter writer = - new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = + new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); writer.write(null); // row 0 - null, advances logicalRowId assertThatThrownBy(() -> writer.write(new float[] {Float.POSITIVE_INFINITY, 0.0f})) @@ -162,10 +159,10 @@ public void testInfinityInVectorRejected() { public void testAllNullReturnsEmpty() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 
1); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - IvfpqVectorGlobalIndexWriter writer = - new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = + new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); writer.write(null); writer.write(null); @@ -184,15 +181,15 @@ public void testMetaSerializationRoundTrip() throws IOException { options.setInteger("vector.pq.m", 8); options.setString("vector.pq.use-opq", "true"); options.setInteger("vector.nprobe", 24); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); - IvfpqIndexMeta meta = new IvfpqIndexMeta(indexOptions); + VectorIndexMeta meta = new VectorIndexMeta(indexOptions); byte[] serialized = meta.serialize(); - IvfpqIndexMeta deserialized = IvfpqIndexMeta.deserialize(serialized); + VectorIndexMeta deserialized = VectorIndexMeta.deserialize(serialized); assertThat(deserialized.dimension()).isEqualTo(32); assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_PQ); - assertThat(deserialized.metric()).isEqualTo(IvfpqVectorMetric.COSINE); + assertThat(deserialized.metric()).isEqualTo(VectorMetric.COSINE); assertThat(deserialized.nlist()).isEqualTo(64); assertThat(deserialized.m()).isEqualTo(8); assertThat(deserialized.useOpq()).isTrue(); @@ -210,10 +207,10 @@ public void testMetaSerializationRoundTripForHnsw() throws IOException { options.setInteger("vector.hnsw.ef-construction", 64); options.setInteger("vector.hnsw.max-level", 5); options.setInteger("vector.hnsw.ef-search", 80); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); - IvfpqIndexMeta deserialized = - IvfpqIndexMeta.deserialize(new 
IvfpqIndexMeta(indexOptions).serialize()); + VectorIndexMeta deserialized = + VectorIndexMeta.deserialize(new VectorIndexMeta(indexOptions).serialize()); assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_HNSW_FLAT); assertThat(deserialized.dimension()).isEqualTo(16); @@ -233,7 +230,7 @@ public void testFloatVectorEndToEnd() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); float[][] vectors = new float[][] { @@ -246,15 +243,15 @@ public void testFloatVectorEndToEnd() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - IvfpqVectorGlobalIndexWriter writer = - new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = + new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); Arrays.stream(vectors).forEach(writer::write); List results = writer.finish(); List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (IvfpqVectorGlobalIndexReader reader = - new IvfpqVectorGlobalIndexReader( + try (VectorGlobalIndexReader reader = + new VectorGlobalIndexReader( fileReader, metas, vectorType, indexOptions, executor)) { VectorSearch vectorSearch = new VectorSearch(vectors[0], 3, fieldName); ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); @@ -273,7 +270,7 @@ public void testSearchWithRoaringFilter() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); float[][] vectors = new float[][] { @@ -286,15 
+283,15 @@ public void testSearchWithRoaringFilter() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - IvfpqVectorGlobalIndexWriter writer = - new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = + new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); Arrays.stream(vectors).forEach(writer::write); List results = writer.finish(); List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (IvfpqVectorGlobalIndexReader reader = - new IvfpqVectorGlobalIndexReader( + try (VectorGlobalIndexReader reader = + new VectorGlobalIndexReader( fileReader, metas, vectorType, indexOptions, executor)) { // Filter to rows {1, 4} only @@ -318,7 +315,7 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); float[][] vectors = new float[][] { @@ -328,8 +325,8 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - IvfpqVectorGlobalIndexWriter writer = - new IvfpqVectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = + new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); writer.write(vectors[0]); // row 0 writer.write(null); // row 1 - null @@ -344,8 +341,8 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (IvfpqVectorGlobalIndexReader reader = - new IvfpqVectorGlobalIndexReader( + try (VectorGlobalIndexReader reader = + new VectorGlobalIndexReader( fileReader, metas, 
vectorType, indexOptions, executor)) { VectorSearch vectorSearch = new VectorSearch(vectors[0], 3, fieldName); ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); @@ -375,18 +372,17 @@ public void testViaIndexer() throws IOException { new float[] {0.7f, 0.7f} }; - IvfpqVectorGlobalIndexer indexer = new IvfpqVectorGlobalIndexer(vectorType, options); + VectorGlobalIndexer indexer = new VectorGlobalIndexer(vectorType, options); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - IvfpqVectorGlobalIndexWriter writer = - (IvfpqVectorGlobalIndexWriter) indexer.createWriter(fileWriter); + VectorGlobalIndexWriter writer = (VectorGlobalIndexWriter) indexer.createWriter(fileWriter); Arrays.stream(vectors).forEach(writer::write); List results = writer.finish(); List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); - try (IvfpqVectorGlobalIndexReader reader = - (IvfpqVectorGlobalIndexReader) indexer.createReader(fileReader, metas, executor)) { + try (VectorGlobalIndexReader reader = + (VectorGlobalIndexReader) indexer.createReader(fileReader, metas, executor)) { VectorSearch vectorSearch = new VectorSearch(vectors[0], 2, fieldName); ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); assertThat(result.results().getLongCardinality()).isEqualTo(2); diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java similarity index 89% rename from paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java rename to paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java index 6010e68256d8..89c1ae3d10ec 100644 --- 
a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorGlobalIndexerFactoryTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; @@ -25,11 +25,11 @@ import static org.assertj.core.api.Assertions.assertThat; /** Tests for vector global indexer factory SPI registration. */ -public class IvfpqVectorGlobalIndexerFactoryTest { +public class VectorGlobalIndexerFactoryTest { @Test public void testIdentifier() { - assertThat(new IvfpqVectorGlobalIndexerFactory().identifier()).isEqualTo("vector"); + assertThat(new VectorGlobalIndexerFactory().identifier()).isEqualTo("vector"); assertThat(new IvfFlatVectorGlobalIndexerFactory().identifier()).isEqualTo("ivf-flat"); assertThat(new IvfPqAlgorithmVectorGlobalIndexerFactory().identifier()).isEqualTo("ivf-pq"); assertThat(new IvfHnswFlatVectorGlobalIndexerFactory().identifier()) @@ -40,7 +40,7 @@ public void testIdentifier() { @Test public void testLoadByIdentifier() { assertThat(GlobalIndexerFactoryUtils.load("vector")) - .isExactlyInstanceOf(IvfpqVectorGlobalIndexerFactory.class); + .isExactlyInstanceOf(VectorGlobalIndexerFactory.class); assertThat(GlobalIndexerFactoryUtils.load("ivf-flat")) .isExactlyInstanceOf(IvfFlatVectorGlobalIndexerFactory.class); assertThat(GlobalIndexerFactoryUtils.load("ivf-pq")) diff --git a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java similarity index 79% rename from paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java rename to 
paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java index f06882381a50..16a646dad755 100644 --- a/paimon-ivfpq/paimon-ivfpq-index/src/test/java/org/apache/paimon/ivfpq/index/IvfpqVectorIndexOptionsTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java @@ -16,7 +16,7 @@ * limitations under the License. */ -package org.apache.paimon.ivfpq.index; +package org.apache.paimon.vector.index; import org.apache.paimon.index.ivfpq.IndexType; import org.apache.paimon.options.Options; @@ -26,16 +26,16 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -/** Tests for {@link IvfpqVectorIndexOptions}. */ -public class IvfpqVectorIndexOptionsTest { +/** Tests for {@link VectorIndexOptions}. */ +public class VectorIndexOptionsTest { @Test public void testDefaults() { Options options = new Options(); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); assertThat(indexOptions.dimension()).isEqualTo(128); assertThat(indexOptions.indexType()).isEqualTo(IndexType.IVF_PQ); - assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.INNER_PRODUCT); + assertThat(indexOptions.metric()).isEqualTo(VectorMetric.INNER_PRODUCT); assertThat(indexOptions.nlist()).isEqualTo(256); assertThat(indexOptions.m()).isEqualTo(16); assertThat(indexOptions.useOpq()).isFalse(); @@ -65,10 +65,10 @@ public void testCustomOptions() { options.setString("vector.train.sample-ratio", "0.5"); options.setInteger("vector.add.batch-size", 5000); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); assertThat(indexOptions.dimension()).isEqualTo(64); assertThat(indexOptions.indexType()).isEqualTo(IndexType.IVF_HNSW_SQ); - 
assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.L2); + assertThat(indexOptions.metric()).isEqualTo(VectorMetric.L2); assertThat(indexOptions.nlist()).isEqualTo(128); assertThat(indexOptions.m()).isEqualTo(8); assertThat(indexOptions.useOpq()).isTrue(); @@ -83,13 +83,13 @@ public void testCustomOptions() { @Test public void testIdentifierSelectsIndexType() { - assertThat(new IvfpqVectorIndexOptions(new Options(), "ivf-flat").indexType()) + assertThat(new VectorIndexOptions(new Options(), "ivf-flat").indexType()) .isEqualTo(IndexType.IVF_FLAT); - assertThat(new IvfpqVectorIndexOptions(new Options(), "ivf-pq").indexType()) + assertThat(new VectorIndexOptions(new Options(), "ivf-pq").indexType()) .isEqualTo(IndexType.IVF_PQ); - assertThat(new IvfpqVectorIndexOptions(new Options(), "ivf-hnsw-flat").indexType()) + assertThat(new VectorIndexOptions(new Options(), "ivf-hnsw-flat").indexType()) .isEqualTo(IndexType.IVF_HNSW_FLAT); - assertThat(new IvfpqVectorIndexOptions(new Options(), "ivf-hnsw-sq").indexType()) + assertThat(new VectorIndexOptions(new Options(), "ivf-hnsw-sq").indexType()) .isEqualTo(IndexType.IVF_HNSW_SQ); } @@ -98,7 +98,7 @@ public void testIdentifierRejectsConflictingIndexType() { Options options = new Options(); options.setString("vector.index.type", "ivf-pq"); - assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options, "ivf-flat")) + assertThatThrownBy(() -> new VectorIndexOptions(options, "ivf-flat")) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("Conflicting vector index type"); } @@ -108,7 +108,7 @@ public void testMDivisibilityValidation() { Options options = new Options(); options.setInteger("vector.index.dimension", 10); options.setInteger("vector.pq.m", 3); - assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options)) + assertThatThrownBy(() -> new VectorIndexOptions(options)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("must divide"); } @@ -117,13 +117,13 @@ public void 
testMDivisibilityValidation() { public void testInvalidSampleRatio() { Options options = new Options(); options.setString("vector.train.sample-ratio", "0.0"); - assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options)) + assertThatThrownBy(() -> new VectorIndexOptions(options)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("vector.train.sample-ratio"); Options options2 = new Options(); options2.setString("vector.train.sample-ratio", "1.5"); - assertThatThrownBy(() -> new IvfpqVectorIndexOptions(options2)) + assertThatThrownBy(() -> new VectorIndexOptions(options2)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("vector.train.sample-ratio"); } @@ -133,7 +133,7 @@ public void testMetricParsing() { for (String metric : new String[] {"l2", "cosine", "inner_product"}) { Options options = new Options(); options.setString("vector.distance.metric", metric); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); assertThat(indexOptions.metric().getConfigName()).isEqualTo(metric); } } @@ -142,7 +142,7 @@ public void testMetricParsing() { public void testMetricParsingUpperCase() { Options options = new Options(); options.setString("vector.distance.metric", "L2"); - IvfpqVectorIndexOptions indexOptions = new IvfpqVectorIndexOptions(options); - assertThat(indexOptions.metric()).isEqualTo(IvfpqVectorMetric.L2); + VectorIndexOptions indexOptions = new VectorIndexOptions(options); + assertThat(indexOptions.metric()).isEqualTo(VectorMetric.L2); } } diff --git a/paimon-ivfpq/paimon-ivfpq-jni/pom.xml b/paimon-vector/paimon-vector-jni/pom.xml similarity index 96% rename from paimon-ivfpq/paimon-ivfpq-jni/pom.xml rename to paimon-vector/paimon-vector-jni/pom.xml index 4587ad0403ca..23d00d043a34 100644 --- a/paimon-ivfpq/paimon-ivfpq-jni/pom.xml +++ b/paimon-vector/paimon-vector-jni/pom.xml @@ -23,12 +23,12 @@ under the License. 
4.0.0 - paimon-ivfpq + paimon-vector org.apache.paimon 1.5-SNAPSHOT - paimon-ivfpq-jni + paimon-vector-jni Paimon : Vector Index JNI diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java 
b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java rename to 
paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java diff --git 
a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java diff --git a/paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java similarity index 100% rename from paimon-ivfpq/paimon-ivfpq-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java diff --git a/paimon-ivfpq/pom.xml b/paimon-vector/pom.xml similarity index 88% rename from paimon-ivfpq/pom.xml rename to paimon-vector/pom.xml index 92272343ac45..5cbb01a35a4e 100644 --- a/paimon-ivfpq/pom.xml +++ b/paimon-vector/pom.xml @@ -28,12 +28,12 @@ under the License. 1.5-SNAPSHOT - paimon-ivfpq - Paimon : Vector Index : + paimon-vector + Paimon : Vector Index pom - paimon-ivfpq-jni - paimon-ivfpq-index + paimon-vector-jni + paimon-vector-index diff --git a/pom.xml b/pom.xml index 05e0fa264d4a..a1a6a5e054f4 100644 --- a/pom.xml +++ b/pom.xml @@ -76,7 +76,7 @@ under the License. 
paimon-vortex paimon-mosaic paimon-tantivy - paimon-ivfpq + paimon-vector From 2eb5cf889e08389db417ac0864fa7f24739f8fcf Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 10 Jun 2026 21:13:03 +0800 Subject: [PATCH 05/11] [vector] Use factory index type --- .../IvfFlatVectorGlobalIndexerFactory.java | 7 +++ ...IvfHnswFlatVectorGlobalIndexerFactory.java | 7 +++ .../IvfHnswSqVectorGlobalIndexerFactory.java | 7 +++ ...PqAlgorithmVectorGlobalIndexerFactory.java | 7 +++ .../vector/index/VectorGlobalIndexer.java | 9 +-- .../index/VectorGlobalIndexerFactory.java | 12 ++-- .../vector/index/VectorIndexOptions.java | 56 +++---------------- ...he.paimon.globalindex.GlobalIndexerFactory | 1 - .../vector/index/VectorGlobalIndexTest.java | 24 ++++---- .../index/VectorGlobalIndexerFactoryTest.java | 16 +++++- .../vector/index/VectorIndexOptionsTest.java | 37 +++--------- 11 files changed, 74 insertions(+), 109 deletions(-) diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java index 572c7cf4edb2..8c7f8bd5d9b7 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java @@ -18,6 +18,8 @@ package org.apache.paimon.vector.index; +import org.apache.paimon.index.ivfpq.IndexType; + /** Factory for the {@code ivf-flat} vector index identifier. 
*/ public class IvfFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { @@ -27,4 +29,9 @@ public class IvfFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactor public String identifier() { return IDENTIFIER; } + + @Override + protected IndexType indexType() { + return IndexType.IVF_FLAT; + } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java index 159e7af6f1ba..181ef5be7735 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java @@ -18,6 +18,8 @@ package org.apache.paimon.vector.index; +import org.apache.paimon.index.ivfpq.IndexType; + /** Factory for the {@code ivf-hnsw-flat} vector index identifier. 
*/ public class IvfHnswFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { @@ -27,4 +29,9 @@ public class IvfHnswFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFa public String identifier() { return IDENTIFIER; } + + @Override + protected IndexType indexType() { + return IndexType.IVF_HNSW_FLAT; + } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java index 51c72cd8f39c..3e677d6bdacd 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java @@ -18,6 +18,8 @@ package org.apache.paimon.vector.index; +import org.apache.paimon.index.ivfpq.IndexType; + /** Factory for the {@code ivf-hnsw-sq} vector index identifier. 
*/ public class IvfHnswSqVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { @@ -27,4 +29,9 @@ public class IvfHnswSqVectorGlobalIndexerFactory extends VectorGlobalIndexerFact public String identifier() { return IDENTIFIER; } + + @Override + protected IndexType indexType() { + return IndexType.IVF_HNSW_SQ; + } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java index f3932de46ed6..649e609fff1b 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java @@ -18,6 +18,8 @@ package org.apache.paimon.vector.index; +import org.apache.paimon.index.ivfpq.IndexType; + /** Factory for the {@code ivf-pq} vector index identifier. 
*/ public class IvfPqAlgorithmVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { @@ -27,4 +29,9 @@ public class IvfPqAlgorithmVectorGlobalIndexerFactory extends VectorGlobalIndexe public String identifier() { return IDENTIFIER; } + + @Override + protected IndexType indexType() { + return IndexType.IVF_PQ; + } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java index 193424846b25..4fc10a8c0709 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java @@ -24,6 +24,7 @@ import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; +import org.apache.paimon.index.ivfpq.IndexType; import org.apache.paimon.options.Options; import org.apache.paimon.types.DataType; @@ -36,13 +37,9 @@ public class VectorGlobalIndexer implements GlobalIndexer { private final DataType fieldType; private final VectorIndexOptions options; - public VectorGlobalIndexer(DataType fieldType, Options options) { - this(fieldType, options, VectorGlobalIndexerFactory.IDENTIFIER); - } - - public VectorGlobalIndexer(DataType fieldType, Options options, String identifier) { + public VectorGlobalIndexer(DataType fieldType, Options options, IndexType indexType) { this.fieldType = fieldType; - this.options = new VectorIndexOptions(options, identifier); + this.options = new VectorIndexOptions(options, indexType); } @Override diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java 
b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java index c6554679aeb2..cdbea58a7cb1 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java @@ -20,21 +20,17 @@ import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.GlobalIndexerFactory; +import org.apache.paimon.index.ivfpq.IndexType; import org.apache.paimon.options.Options; import org.apache.paimon.types.DataField; /** Factory for creating vector indexes backed by paimon-vector-index. */ -public class VectorGlobalIndexerFactory implements GlobalIndexerFactory { +public abstract class VectorGlobalIndexerFactory implements GlobalIndexerFactory { - public static final String IDENTIFIER = "vector"; - - @Override - public String identifier() { - return IDENTIFIER; - } + protected abstract IndexType indexType(); @Override public GlobalIndexer create(DataField field, Options options) { - return new VectorGlobalIndexer(field.type(), options, identifier()); + return new VectorGlobalIndexer(field.type(), options, indexType()); } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java index b3c12e894104..93931a39ac66 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java @@ -25,24 +25,16 @@ import org.apache.paimon.options.ConfigOptions; import org.apache.paimon.options.Options; -import java.util.Locale; +import java.util.Objects; /** Options for the Paimon vector index backed by paimon-vector-index. 
*/ public class VectorIndexOptions { - public static final String DEFAULT_IDENTIFIER = "vector"; public static final String IVF_FLAT_IDENTIFIER = "ivf-flat"; public static final String IVF_PQ_IDENTIFIER = "ivf-pq"; public static final String IVF_HNSW_FLAT_IDENTIFIER = "ivf-hnsw-flat"; public static final String IVF_HNSW_SQ_IDENTIFIER = "ivf-hnsw-sq"; - public static final ConfigOption INDEX_TYPE = - ConfigOptions.key("vector.index.type") - .stringType() - .defaultValue(IVF_PQ_IDENTIFIER) - .withDescription( - "Vector index algorithm (ivf-flat, ivf-pq, ivf-hnsw-flat, ivf-hnsw-sq)."); - public static final ConfigOption DIMENSION = ConfigOptions.key("vector.index.dimension") .intType() @@ -121,7 +113,6 @@ public class VectorIndexOptions { .defaultValue(10000) .withDescription("Batch size for adding vectors after training."); - private final String identifier; private final IndexType indexType; private final int dimension; private final VectorMetric metric; @@ -134,13 +125,8 @@ public class VectorIndexOptions { private final double trainSampleRatio; private final int addBatchSize; - public VectorIndexOptions(Options options) { - this(options, DEFAULT_IDENTIFIER); - } - - public VectorIndexOptions(Options options, String identifier) { - this.identifier = normalizeIdentifier(identifier); - this.indexType = resolveIndexType(options, this.identifier); + public VectorIndexOptions(Options options, IndexType indexType) { + this.indexType = Objects.requireNonNull(indexType, "indexType must not be null"); this.dimension = validatePositive(options.get(DIMENSION), optionKey(DIMENSION)); this.metric = parseMetric(options.get(DISTANCE_METRIC)); this.nlist = validatePositive(options.get(NLIST), optionKey(NLIST)); @@ -172,10 +158,6 @@ public VectorIndexOptions(Options options, String identifier) { } } - public String identifier() { - return identifier; - } - public IndexType indexType() { return indexType; } @@ -270,33 +252,14 @@ private static int validateNonNegative(int value, 
String key) { return value; } - private static IndexType resolveIndexType(Options options, String identifier) { - if (DEFAULT_IDENTIFIER.equals(identifier)) { - return parseIndexType(options.get(INDEX_TYPE)); - } - - IndexType identifierType = parseIndexType(identifier); - if (options.contains(INDEX_TYPE)) { - IndexType configuredType = parseIndexType(options.get(INDEX_TYPE)); - if (configuredType != identifierType) { - throw new IllegalArgumentException( - String.format( - "Conflicting vector index type: identifier is '%s' but %s is '%s'", - identifier, INDEX_TYPE.key(), options.get(INDEX_TYPE))); - } - } - return identifierType; - } - public static IndexType parseIndexType(String value) { - String normalized = normalizeIdentifier(value); - if (IVF_PQ_IDENTIFIER.equals(normalized)) { + if (IVF_PQ_IDENTIFIER.equals(value)) { return IndexType.IVF_PQ; - } else if (IVF_FLAT_IDENTIFIER.equals(normalized)) { + } else if (IVF_FLAT_IDENTIFIER.equals(value)) { return IndexType.IVF_FLAT; - } else if (IVF_HNSW_FLAT_IDENTIFIER.equals(normalized)) { + } else if (IVF_HNSW_FLAT_IDENTIFIER.equals(value)) { return IndexType.IVF_HNSW_FLAT; - } else if (IVF_HNSW_SQ_IDENTIFIER.equals(normalized)) { + } else if (IVF_HNSW_SQ_IDENTIFIER.equals(value)) { return IndexType.IVF_HNSW_SQ; } throw new IllegalArgumentException("Unknown vector index type: " + value); @@ -317,11 +280,6 @@ public static String toIdentifier(IndexType indexType) { } } - private static String normalizeIdentifier(String identifier) { - String value = identifier == null ? 
DEFAULT_IDENTIFIER : identifier; - return value.trim().toLowerCase(Locale.ROOT).replace('_', '-'); - } - private static String optionKey(ConfigOption option) { return option.key(); } diff --git a/paimon-vector/paimon-vector-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory b/paimon-vector/paimon-vector-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory index ffcfe9350e81..a11570704a1f 100644 --- a/paimon-vector/paimon-vector-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory +++ b/paimon-vector/paimon-vector-index/src/main/resources/META-INF/services/org.apache.paimon.globalindex.GlobalIndexerFactory @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.paimon.vector.index.VectorGlobalIndexerFactory org.apache.paimon.vector.index.IvfFlatVectorGlobalIndexerFactory org.apache.paimon.vector.index.IvfPqAlgorithmVectorGlobalIndexerFactory org.apache.paimon.vector.index.IvfHnswFlatVectorGlobalIndexerFactory diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java index 7b524c26a759..ed712e8cd62b 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java @@ -98,7 +98,7 @@ public void cleanup() throws IOException { @Test public void testDimensionMismatch() { Options options = createDefaultOptions(64); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); 
VectorGlobalIndexWriter writer = new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); @@ -114,7 +114,7 @@ public void testVectorTypeRejectsNonFloatElement() { DataType intVecType = new VectorType(2, new IntType()); Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); assertThatThrownBy(() -> new VectorGlobalIndexWriter(fileWriter, intVecType, indexOptions)) @@ -126,7 +126,7 @@ public void testVectorTypeRejectsNonFloatElement() { public void testNanInVectorRejected() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); @@ -142,7 +142,7 @@ public void testNanInVectorRejected() { public void testInfinityInVectorRejected() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); @@ -159,7 +159,7 @@ public void testInfinityInVectorRejected() { public void testAllNullReturnsEmpty() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); 
GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); @@ -181,7 +181,7 @@ public void testMetaSerializationRoundTrip() throws IOException { options.setInteger("vector.pq.m", 8); options.setString("vector.pq.use-opq", "true"); options.setInteger("vector.nprobe", 24); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); VectorIndexMeta meta = new VectorIndexMeta(indexOptions); byte[] serialized = meta.serialize(); @@ -199,7 +199,6 @@ public void testMetaSerializationRoundTrip() throws IOException { @Test public void testMetaSerializationRoundTripForHnsw() throws IOException { Options options = new Options(); - options.setString("vector.index.type", "ivf-hnsw-flat"); options.setInteger("vector.index.dimension", 16); options.setString("vector.distance.metric", "l2"); options.setInteger("vector.nlist", 8); @@ -207,7 +206,7 @@ public void testMetaSerializationRoundTripForHnsw() throws IOException { options.setInteger("vector.hnsw.ef-construction", 64); options.setInteger("vector.hnsw.max-level", 5); options.setInteger("vector.hnsw.ef-search", 80); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_HNSW_FLAT); VectorIndexMeta deserialized = VectorIndexMeta.deserialize(new VectorIndexMeta(indexOptions).serialize()); @@ -230,7 +229,7 @@ public void testFloatVectorEndToEnd() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); float[][] vectors = new float[][] { @@ -270,7 +269,7 @@ public void 
testSearchWithRoaringFilter() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); float[][] vectors = new float[][] { @@ -315,7 +314,7 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); float[][] vectors = new float[][] { @@ -372,7 +371,8 @@ public void testViaIndexer() throws IOException { new float[] {0.7f, 0.7f} }; - VectorGlobalIndexer indexer = new VectorGlobalIndexer(vectorType, options); + VectorGlobalIndexer indexer = + new VectorGlobalIndexer(vectorType, options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = (VectorGlobalIndexWriter) indexer.createWriter(fileWriter); diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java index 89c1ae3d10ec..bdf7dbdc66e4 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java @@ -19,6 +19,7 @@ package org.apache.paimon.vector.index; import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; +import org.apache.paimon.index.ivfpq.IndexType; import org.junit.jupiter.api.Test; @@ -29,7 +30,6 @@ public class 
VectorGlobalIndexerFactoryTest { @Test public void testIdentifier() { - assertThat(new VectorGlobalIndexerFactory().identifier()).isEqualTo("vector"); assertThat(new IvfFlatVectorGlobalIndexerFactory().identifier()).isEqualTo("ivf-flat"); assertThat(new IvfPqAlgorithmVectorGlobalIndexerFactory().identifier()).isEqualTo("ivf-pq"); assertThat(new IvfHnswFlatVectorGlobalIndexerFactory().identifier()) @@ -39,8 +39,6 @@ public void testIdentifier() { @Test public void testLoadByIdentifier() { - assertThat(GlobalIndexerFactoryUtils.load("vector")) - .isExactlyInstanceOf(VectorGlobalIndexerFactory.class); assertThat(GlobalIndexerFactoryUtils.load("ivf-flat")) .isExactlyInstanceOf(IvfFlatVectorGlobalIndexerFactory.class); assertThat(GlobalIndexerFactoryUtils.load("ivf-pq")) @@ -50,4 +48,16 @@ public void testLoadByIdentifier() { assertThat(GlobalIndexerFactoryUtils.load("ivf-hnsw-sq")) .isExactlyInstanceOf(IvfHnswSqVectorGlobalIndexerFactory.class); } + + @Test + public void testFactoryIndexType() { + assertThat(new IvfFlatVectorGlobalIndexerFactory().indexType()) + .isEqualTo(IndexType.IVF_FLAT); + assertThat(new IvfPqAlgorithmVectorGlobalIndexerFactory().indexType()) + .isEqualTo(IndexType.IVF_PQ); + assertThat(new IvfHnswFlatVectorGlobalIndexerFactory().indexType()) + .isEqualTo(IndexType.IVF_HNSW_FLAT); + assertThat(new IvfHnswSqVectorGlobalIndexerFactory().indexType()) + .isEqualTo(IndexType.IVF_HNSW_SQ); + } } diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java index 16a646dad755..a33a4035656c 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java @@ -32,7 +32,7 @@ public class VectorIndexOptionsTest { @Test public 
void testDefaults() { Options options = new Options(); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); assertThat(indexOptions.dimension()).isEqualTo(128); assertThat(indexOptions.indexType()).isEqualTo(IndexType.IVF_PQ); assertThat(indexOptions.metric()).isEqualTo(VectorMetric.INNER_PRODUCT); @@ -51,7 +51,6 @@ public void testDefaults() { @Test public void testCustomOptions() { Options options = new Options(); - options.setString("vector.index.type", "ivf-hnsw-sq"); options.setInteger("vector.index.dimension", 64); options.setString("vector.distance.metric", "l2"); options.setInteger("vector.nlist", 128); @@ -65,7 +64,7 @@ public void testCustomOptions() { options.setString("vector.train.sample-ratio", "0.5"); options.setInteger("vector.add.batch-size", 5000); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_HNSW_SQ); assertThat(indexOptions.dimension()).isEqualTo(64); assertThat(indexOptions.indexType()).isEqualTo(IndexType.IVF_HNSW_SQ); assertThat(indexOptions.metric()).isEqualTo(VectorMetric.L2); @@ -81,34 +80,12 @@ public void testCustomOptions() { assertThat(indexOptions.addBatchSize()).isEqualTo(5000); } - @Test - public void testIdentifierSelectsIndexType() { - assertThat(new VectorIndexOptions(new Options(), "ivf-flat").indexType()) - .isEqualTo(IndexType.IVF_FLAT); - assertThat(new VectorIndexOptions(new Options(), "ivf-pq").indexType()) - .isEqualTo(IndexType.IVF_PQ); - assertThat(new VectorIndexOptions(new Options(), "ivf-hnsw-flat").indexType()) - .isEqualTo(IndexType.IVF_HNSW_FLAT); - assertThat(new VectorIndexOptions(new Options(), "ivf-hnsw-sq").indexType()) - .isEqualTo(IndexType.IVF_HNSW_SQ); - } - - @Test - public void testIdentifierRejectsConflictingIndexType() { - Options options = new Options(); - options.setString("vector.index.type", 
"ivf-pq"); - - assertThatThrownBy(() -> new VectorIndexOptions(options, "ivf-flat")) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Conflicting vector index type"); - } - @Test public void testMDivisibilityValidation() { Options options = new Options(); options.setInteger("vector.index.dimension", 10); options.setInteger("vector.pq.m", 3); - assertThatThrownBy(() -> new VectorIndexOptions(options)) + assertThatThrownBy(() -> new VectorIndexOptions(options, IndexType.IVF_PQ)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("must divide"); } @@ -117,13 +94,13 @@ public void testMDivisibilityValidation() { public void testInvalidSampleRatio() { Options options = new Options(); options.setString("vector.train.sample-ratio", "0.0"); - assertThatThrownBy(() -> new VectorIndexOptions(options)) + assertThatThrownBy(() -> new VectorIndexOptions(options, IndexType.IVF_PQ)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("vector.train.sample-ratio"); Options options2 = new Options(); options2.setString("vector.train.sample-ratio", "1.5"); - assertThatThrownBy(() -> new VectorIndexOptions(options2)) + assertThatThrownBy(() -> new VectorIndexOptions(options2, IndexType.IVF_PQ)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("vector.train.sample-ratio"); } @@ -133,7 +110,7 @@ public void testMetricParsing() { for (String metric : new String[] {"l2", "cosine", "inner_product"}) { Options options = new Options(); options.setString("vector.distance.metric", metric); - VectorIndexOptions indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); assertThat(indexOptions.metric().getConfigName()).isEqualTo(metric); } } @@ -142,7 +119,7 @@ public void testMetricParsing() { public void testMetricParsingUpperCase() { Options options = new Options(); options.setString("vector.distance.metric", "L2"); - VectorIndexOptions 
indexOptions = new VectorIndexOptions(options); + VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); assertThat(indexOptions.metric()).isEqualTo(VectorMetric.L2); } } From 50a82dabaa902d1b5e13a5a2cd3d37b426c6fb9f Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 10 Jun 2026 23:13:16 +0800 Subject: [PATCH 06/11] [vector] Align vector JNI options API --- .../vector/index/VectorGlobalIndexWriter.java | 2 +- .../paimon/vector/index/VectorIndexMeta.java | 22 ++-- .../vector/index/VectorIndexOptions.java | 87 +++++++++---- .../paimon/vector/index/VectorMetric.java | 20 +-- .../vector/index/VectorGlobalIndexTest.java | 6 +- .../vector/index/VectorIndexOptionsTest.java | 34 +++-- .../apache/paimon/index/ivfpq/HnswConfig.java | 54 -------- .../paimon/index/ivfpq/IvfFlatConfig.java | 25 ---- .../paimon/index/ivfpq/IvfHnswFlatConfig.java | 35 ----- .../paimon/index/ivfpq/IvfHnswSqConfig.java | 35 ----- .../paimon/index/ivfpq/IvfPqConfig.java | 47 ------- .../org/apache/paimon/index/ivfpq/Metric.java | 34 +++-- .../paimon/index/ivfpq/VectorIndexConfig.java | 94 -------------- .../index/ivfpq/VectorIndexMetadata.java | 20 ++- .../paimon/index/ivfpq/VectorIndexNative.java | 13 +- .../paimon/index/ivfpq/VectorIndexReader.java | 121 ++++++++++------- .../paimon/index/ivfpq/VectorIndexWriter.java | 122 ++++++++++-------- 17 files changed, 285 insertions(+), 486 deletions(-) delete mode 100644 paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java delete mode 100644 paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java delete mode 100644 paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java delete mode 100644 paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java delete mode 100644 paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java delete mode 
100644 paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java index b26015a9fd31..dacef17ca4e8 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java @@ -226,7 +226,7 @@ private ResultEntry buildIndex() throws IOException { long buildStart = System.currentTimeMillis(); try (VectorIndexWriter writer = - new VectorIndexWriter(options.toVectorIndexConfig(effectiveNlist))) { + new VectorIndexWriter(options.toNativeOptions(effectiveNlist))) { // Phase 1: Train long phaseStart = System.currentTimeMillis(); diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java index d47a582cfe6d..2c8e29f4cb23 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java @@ -18,7 +18,6 @@ package org.apache.paimon.vector.index; -import org.apache.paimon.index.ivfpq.HnswConfig; import org.apache.paimon.index.ivfpq.IndexType; import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.core.type.TypeReference; @@ -66,9 +65,9 @@ public VectorIndexMeta(VectorIndexOptions options) { params.put(KEY_NLIST, String.valueOf(options.nlist())); params.put(KEY_M, String.valueOf(options.m())); params.put(KEY_USE_OPQ, String.valueOf(options.useOpq())); - params.put(KEY_HNSW_M, String.valueOf(options.hnswConfig().m())); - 
params.put(KEY_HNSW_EF_CONSTRUCTION, String.valueOf(options.hnswConfig().efConstruction())); - params.put(KEY_HNSW_MAX_LEVEL, String.valueOf(options.hnswConfig().maxLevel())); + params.put(KEY_HNSW_M, String.valueOf(options.hnswM())); + params.put(KEY_HNSW_EF_CONSTRUCTION, String.valueOf(options.hnswEfConstruction())); + params.put(KEY_HNSW_MAX_LEVEL, String.valueOf(options.hnswMaxLevel())); params.put(KEY_NPROBE, String.valueOf(options.nprobe())); params.put(KEY_EF_SEARCH, String.valueOf(options.efSearch())); } @@ -106,11 +105,16 @@ public boolean useOpq() { return Boolean.parseBoolean(params.get(KEY_USE_OPQ)); } - public HnswConfig hnswConfig() { - return new HnswConfig( - intValue(KEY_HNSW_M, HnswConfig.DEFAULT.m()), - intValue(KEY_HNSW_EF_CONSTRUCTION, HnswConfig.DEFAULT.efConstruction()), - intValue(KEY_HNSW_MAX_LEVEL, HnswConfig.DEFAULT.maxLevel())); + public int hnswM() { + return intValue(KEY_HNSW_M, VectorIndexOptions.DEFAULT_HNSW_M); + } + + public int hnswEfConstruction() { + return intValue(KEY_HNSW_EF_CONSTRUCTION, VectorIndexOptions.DEFAULT_HNSW_EF_CONSTRUCTION); + } + + public int hnswMaxLevel() { + return intValue(KEY_HNSW_MAX_LEVEL, VectorIndexOptions.DEFAULT_HNSW_MAX_LEVEL); } public int nprobe() { diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java index 93931a39ac66..5f31a9481c14 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java @@ -18,13 +18,13 @@ package org.apache.paimon.vector.index; -import org.apache.paimon.index.ivfpq.HnswConfig; import org.apache.paimon.index.ivfpq.IndexType; -import org.apache.paimon.index.ivfpq.VectorIndexConfig; import org.apache.paimon.options.ConfigOption; import 
org.apache.paimon.options.ConfigOptions; import org.apache.paimon.options.Options; +import java.util.LinkedHashMap; +import java.util.Map; import java.util.Objects; /** Options for the Paimon vector index backed by paimon-vector-index. */ @@ -35,6 +35,10 @@ public class VectorIndexOptions { public static final String IVF_HNSW_FLAT_IDENTIFIER = "ivf-hnsw-flat"; public static final String IVF_HNSW_SQ_IDENTIFIER = "ivf-hnsw-sq"; + static final int DEFAULT_HNSW_M = 20; + static final int DEFAULT_HNSW_EF_CONSTRUCTION = 150; + static final int DEFAULT_HNSW_MAX_LEVEL = 7; + public static final ConfigOption DIMENSION = ConfigOptions.key("vector.index.dimension") .intType() @@ -71,19 +75,19 @@ public class VectorIndexOptions { public static final ConfigOption HNSW_M = ConfigOptions.key("vector.hnsw.m") .intType() - .defaultValue(HnswConfig.DEFAULT.m()) + .defaultValue(DEFAULT_HNSW_M) .withDescription("Maximum number of HNSW neighbors per node."); public static final ConfigOption HNSW_EF_CONSTRUCTION = ConfigOptions.key("vector.hnsw.ef-construction") .intType() - .defaultValue(HnswConfig.DEFAULT.efConstruction()) + .defaultValue(DEFAULT_HNSW_EF_CONSTRUCTION) .withDescription("HNSW efConstruction value used during index build."); public static final ConfigOption HNSW_MAX_LEVEL = ConfigOptions.key("vector.hnsw.max-level") .intType() - .defaultValue(HnswConfig.DEFAULT.maxLevel()) + .defaultValue(DEFAULT_HNSW_MAX_LEVEL) .withDescription("Maximum HNSW graph level."); public static final ConfigOption NPROBE = @@ -119,7 +123,9 @@ public class VectorIndexOptions { private final int nlist; private final int m; private final boolean useOpq; - private final HnswConfig hnswConfig; + private final int hnswM; + private final int hnswEfConstruction; + private final int hnswMaxLevel; private final int nprobe; private final int efSearch; private final double trainSampleRatio; @@ -132,12 +138,12 @@ public VectorIndexOptions(Options options, IndexType indexType) { this.nlist = 
validatePositive(options.get(NLIST), optionKey(NLIST)); this.m = validatePositive(options.get(M), optionKey(M)); this.useOpq = options.get(USE_OPQ); - this.hnswConfig = - new HnswConfig( - validatePositive(options.get(HNSW_M), optionKey(HNSW_M)), - validatePositive( - options.get(HNSW_EF_CONSTRUCTION), optionKey(HNSW_EF_CONSTRUCTION)), - validatePositive(options.get(HNSW_MAX_LEVEL), optionKey(HNSW_MAX_LEVEL))); + this.hnswM = validatePositive(options.get(HNSW_M), optionKey(HNSW_M)); + this.hnswEfConstruction = + validatePositive( + options.get(HNSW_EF_CONSTRUCTION), optionKey(HNSW_EF_CONSTRUCTION)); + this.hnswMaxLevel = + validatePositive(options.get(HNSW_MAX_LEVEL), optionKey(HNSW_MAX_LEVEL)); this.nprobe = validatePositive(options.get(NPROBE), optionKey(NPROBE)); this.efSearch = validateNonNegative(options.get(EF_SEARCH), optionKey(EF_SEARCH)); this.trainSampleRatio = options.get(TRAIN_SAMPLE_RATIO); @@ -182,8 +188,16 @@ public boolean useOpq() { return useOpq; } - public HnswConfig hnswConfig() { - return hnswConfig; + public int hnswM() { + return hnswM; + } + + public int hnswEfConstruction() { + return hnswEfConstruction; + } + + public int hnswMaxLevel() { + return hnswMaxLevel; } public int nprobe() { @@ -202,23 +216,29 @@ public int addBatchSize() { return addBatchSize; } - public VectorIndexConfig toVectorIndexConfig(int effectiveNlist) { + public Map toNativeOptions(int effectiveNlist) { + Map nativeOptions = new LinkedHashMap<>(); + nativeOptions.put("index.type", toNativeIndexType(indexType)); + nativeOptions.put("dimension", String.valueOf(dimension)); + nativeOptions.put("nlist", String.valueOf(effectiveNlist)); + nativeOptions.put("metric", metric.getConfigName()); switch (indexType) { case IVF_FLAT: - return VectorIndexConfig.ivfFlat( - dimension, effectiveNlist, metric.toNativeMetric()); + break; case IVF_PQ: - return VectorIndexConfig.ivfPq( - dimension, effectiveNlist, m, metric.toNativeMetric(), useOpq); + nativeOptions.put("pq.m", 
String.valueOf(m)); + nativeOptions.put("use-opq", String.valueOf(useOpq)); + break; case IVF_HNSW_FLAT: - return VectorIndexConfig.ivfHnswFlat( - dimension, effectiveNlist, metric.toNativeMetric(), hnswConfig); case IVF_HNSW_SQ: - return VectorIndexConfig.ivfHnswSq( - dimension, effectiveNlist, metric.toNativeMetric(), hnswConfig); + nativeOptions.put("hnsw.m", String.valueOf(hnswM)); + nativeOptions.put("hnsw.ef-construction", String.valueOf(hnswEfConstruction)); + nativeOptions.put("hnsw.max-level", String.valueOf(hnswMaxLevel)); + break; default: throw new IllegalArgumentException("Unsupported vector index type: " + indexType); } + return nativeOptions; } public String logName() { @@ -226,11 +246,7 @@ public String logName() { } private static VectorMetric parseMetric(String value) { - try { - return VectorMetric.fromConfigName(value); - } catch (IllegalArgumentException e) { - return VectorMetric.fromString(value); - } + return VectorMetric.fromConfigName(value); } private static int validatePositive(int value, String key) { @@ -280,6 +296,21 @@ public static String toIdentifier(IndexType indexType) { } } + private static String toNativeIndexType(IndexType indexType) { + switch (indexType) { + case IVF_FLAT: + return "ivf_flat"; + case IVF_PQ: + return "ivf_pq"; + case IVF_HNSW_FLAT: + return "ivf_hnsw_flat"; + case IVF_HNSW_SQ: + return "ivf_hnsw_sq"; + default: + throw new IllegalArgumentException("Unsupported vector index type: " + indexType); + } + } + private static String optionKey(ConfigOption option) { return option.key(); } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java index e8642303406d..91ed9a357dc8 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java +++ 
b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java @@ -18,30 +18,22 @@ package org.apache.paimon.vector.index; -import org.apache.paimon.index.ivfpq.Metric; - /** Enumeration of supported vector distance metrics. */ public enum VectorMetric { - L2("l2", Metric.L2), - COSINE("cosine", Metric.COSINE), - INNER_PRODUCT("inner_product", Metric.INNER_PRODUCT); + L2("l2"), + COSINE("cosine"), + INNER_PRODUCT("inner_product"); private final String configName; - private final Metric nativeMetric; - VectorMetric(String configName, Metric nativeMetric) { + VectorMetric(String configName) { this.configName = configName; - this.nativeMetric = nativeMetric; } public String getConfigName() { return configName; } - public Metric toNativeMetric() { - return nativeMetric; - } - public static VectorMetric fromConfigName(String configName) { for (VectorMetric m : values()) { if (m.configName.equals(configName)) { @@ -50,8 +42,4 @@ public static VectorMetric fromConfigName(String configName) { } throw new IllegalArgumentException("Unknown metric: " + configName); } - - public static VectorMetric fromString(String name) { - return valueOf(name.toUpperCase()); - } } diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java index ed712e8cd62b..9ea8ec2c38cf 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java @@ -213,9 +213,9 @@ public void testMetaSerializationRoundTripForHnsw() throws IOException { assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_HNSW_FLAT); assertThat(deserialized.dimension()).isEqualTo(16); - assertThat(deserialized.hnswConfig().m()).isEqualTo(12); - 
assertThat(deserialized.hnswConfig().efConstruction()).isEqualTo(64); - assertThat(deserialized.hnswConfig().maxLevel()).isEqualTo(5); + assertThat(deserialized.hnswM()).isEqualTo(12); + assertThat(deserialized.hnswEfConstruction()).isEqualTo(64); + assertThat(deserialized.hnswMaxLevel()).isEqualTo(5); assertThat(deserialized.efSearch()).isEqualTo(80); } diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java index a33a4035656c..523de9fa8ad2 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java @@ -39,13 +39,20 @@ public void testDefaults() { assertThat(indexOptions.nlist()).isEqualTo(256); assertThat(indexOptions.m()).isEqualTo(16); assertThat(indexOptions.useOpq()).isFalse(); - assertThat(indexOptions.hnswConfig().m()).isEqualTo(20); - assertThat(indexOptions.hnswConfig().efConstruction()).isEqualTo(150); - assertThat(indexOptions.hnswConfig().maxLevel()).isEqualTo(7); + assertThat(indexOptions.hnswM()).isEqualTo(20); + assertThat(indexOptions.hnswEfConstruction()).isEqualTo(150); + assertThat(indexOptions.hnswMaxLevel()).isEqualTo(7); assertThat(indexOptions.nprobe()).isEqualTo(16); assertThat(indexOptions.efSearch()).isEqualTo(0); assertThat(indexOptions.trainSampleRatio()).isEqualTo(1.0); assertThat(indexOptions.addBatchSize()).isEqualTo(10000); + assertThat(indexOptions.toNativeOptions(12)) + .containsEntry("index.type", "ivf_pq") + .containsEntry("dimension", "128") + .containsEntry("nlist", "12") + .containsEntry("metric", "inner_product") + .containsEntry("pq.m", "16") + .containsEntry("use-opq", "false"); } @Test @@ -71,13 +78,21 @@ public void testCustomOptions() { 
assertThat(indexOptions.nlist()).isEqualTo(128); assertThat(indexOptions.m()).isEqualTo(8); assertThat(indexOptions.useOpq()).isTrue(); - assertThat(indexOptions.hnswConfig().m()).isEqualTo(12); - assertThat(indexOptions.hnswConfig().efConstruction()).isEqualTo(64); - assertThat(indexOptions.hnswConfig().maxLevel()).isEqualTo(5); + assertThat(indexOptions.hnswM()).isEqualTo(12); + assertThat(indexOptions.hnswEfConstruction()).isEqualTo(64); + assertThat(indexOptions.hnswMaxLevel()).isEqualTo(5); assertThat(indexOptions.nprobe()).isEqualTo(32); assertThat(indexOptions.efSearch()).isEqualTo(96); assertThat(indexOptions.trainSampleRatio()).isEqualTo(0.5); assertThat(indexOptions.addBatchSize()).isEqualTo(5000); + assertThat(indexOptions.toNativeOptions(7)) + .containsEntry("index.type", "ivf_hnsw_sq") + .containsEntry("dimension", "64") + .containsEntry("nlist", "7") + .containsEntry("metric", "l2") + .containsEntry("hnsw.m", "12") + .containsEntry("hnsw.ef-construction", "64") + .containsEntry("hnsw.max-level", "5"); } @Test @@ -116,10 +131,11 @@ public void testMetricParsing() { } @Test - public void testMetricParsingUpperCase() { + public void testMetricParsingRejectsUpperCase() { Options options = new Options(); options.setString("vector.distance.metric", "L2"); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); - assertThat(indexOptions.metric()).isEqualTo(VectorMetric.L2); + assertThatThrownBy(() -> new VectorIndexOptions(options, IndexType.IVF_PQ)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Unknown metric"); } } diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java deleted file mode 100644 index eb0bb1313f53..000000000000 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/HnswConfig.java +++ /dev/null @@ -1,54 +0,0 @@ -// 
Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.paimon.index.ivfpq; - -public final class HnswConfig { - - public static final HnswConfig DEFAULT = new HnswConfig(20, 150, 7); - - private final int m; - private final int efConstruction; - private final int maxLevel; - - public HnswConfig(int m, int efConstruction, int maxLevel) { - validatePositive(m, "m"); - validatePositive(efConstruction, "efConstruction"); - validatePositive(maxLevel, "maxLevel"); - this.m = m; - this.efConstruction = efConstruction; - this.maxLevel = maxLevel; - } - - public int m() { - return m; - } - - public int efConstruction() { - return efConstruction; - } - - public int maxLevel() { - return maxLevel; - } - - private static void validatePositive(int value, String name) { - if (value <= 0) { - throw new IllegalArgumentException(name + " must be > 0"); - } - } -} diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java deleted file mode 100644 index 285afc181032..000000000000 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfFlatConfig.java 
+++ /dev/null @@ -1,25 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.paimon.index.ivfpq; - -public final class IvfFlatConfig extends VectorIndexConfig { - - public IvfFlatConfig(int dimension, int nlist, Metric metric) { - super(IndexType.IVF_FLAT, dimension, nlist, metric); - } -} diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java deleted file mode 100644 index 3b17586fbb20..000000000000 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswFlatConfig.java +++ /dev/null @@ -1,35 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.paimon.index.ivfpq; - -public final class IvfHnswFlatConfig extends VectorIndexConfig { - - private final HnswConfig hnsw; - - public IvfHnswFlatConfig(int dimension, int nlist, Metric metric, HnswConfig hnsw) { - super(IndexType.IVF_HNSW_FLAT, dimension, nlist, metric); - if (hnsw == null) { - throw new NullPointerException("hnsw"); - } - this.hnsw = hnsw; - } - - public HnswConfig hnsw() { - return hnsw; - } -} diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java deleted file mode 100644 index 80fe09a3d835..000000000000 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfHnswSqConfig.java +++ /dev/null @@ -1,35 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.paimon.index.ivfpq; - -public final class IvfHnswSqConfig extends VectorIndexConfig { - - private final HnswConfig hnsw; - - public IvfHnswSqConfig(int dimension, int nlist, Metric metric, HnswConfig hnsw) { - super(IndexType.IVF_HNSW_SQ, dimension, nlist, metric); - if (hnsw == null) { - throw new NullPointerException("hnsw"); - } - this.hnsw = hnsw; - } - - public HnswConfig hnsw() { - return hnsw; - } -} diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java deleted file mode 100644 index 751f560fd208..000000000000 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IvfPqConfig.java +++ /dev/null @@ -1,47 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package org.apache.paimon.index.ivfpq; - -public final class IvfPqConfig extends VectorIndexConfig { - - private final int m; - private final boolean useOpq; - - public IvfPqConfig(int dimension, int nlist, int m, Metric metric, boolean useOpq) { - super(IndexType.IVF_PQ, dimension, nlist, metric); - validatePositive(m, "m"); - if (dimension % m != 0) { - throw new IllegalArgumentException("dimension must be divisible by m"); - } - this.m = m; - this.useOpq = useOpq; - } - - public int m() { - return m; - } - - public boolean useOpq() { - return useOpq; - } - - @Override - int pqM() { - return m; - } -} diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java index f31327bb3175..c31dbc2e5f22 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java @@ -1,24 +1,22 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. package org.apache.paimon.index.ivfpq; -/** Distance metric for vector search. */ public enum Metric { L2(0), INNER_PRODUCT(1), diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java deleted file mode 100644 index 0152492f398d..000000000000 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexConfig.java +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.paimon.index.ivfpq; - -public abstract class VectorIndexConfig { - - private final IndexType indexType; - private final int dimension; - private final int nlist; - private final Metric metric; - - VectorIndexConfig(IndexType indexType, int dimension, int nlist, Metric metric) { - if (indexType == null) { - throw new NullPointerException("indexType"); - } - if (metric == null) { - throw new NullPointerException("metric"); - } - validatePositive(dimension, "dimension"); - validatePositive(nlist, "nlist"); - this.indexType = indexType; - this.dimension = dimension; - this.nlist = nlist; - this.metric = metric; - } - - public static VectorIndexConfig ivfFlat(int dimension, int nlist, Metric metric) { - return new IvfFlatConfig(dimension, nlist, metric); - } - - public static VectorIndexConfig ivfPq( - int dimension, int nlist, int m, Metric metric, boolean useOpq) { - return new IvfPqConfig(dimension, nlist, m, metric, useOpq); - } - - public static VectorIndexConfig ivfHnswFlat( - int dimension, int nlist, Metric metric, HnswConfig hnsw) { - return new IvfHnswFlatConfig(dimension, nlist, metric, hnsw); - } - - public static VectorIndexConfig ivfHnswSq( - int dimension, int nlist, Metric metric, HnswConfig hnsw) { - return new IvfHnswSqConfig(dimension, nlist, metric, hnsw); - } - - public IndexType indexType() { - return indexType; - } - - public int dimension() { - return dimension; - } - - public int nlist() { - return nlist; - } - - public Metric metric() { - return metric; - } - - int pqM() { - return 0; - } - - boolean useOpq() { - return false; - } - - HnswConfig hnsw() { - return HnswConfig.DEFAULT; - } - - static void validatePositive(int value, String name) { - if (value <= 0) { - throw new IllegalArgumentException(name + " must be > 0"); - } - } -} diff --git 
a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java index 931a5e4af471..ec52f115bcfe 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java @@ -25,7 +25,9 @@ public final class VectorIndexMetadata { private final Metric metric; private final long totalVectors; private final int pqM; - private final HnswConfig hnsw; + private final int hnswM; + private final int hnswEfConstruction; + private final int hnswMaxLevel; public VectorIndexMetadata( int indexType, @@ -43,7 +45,9 @@ public VectorIndexMetadata( this.metric = metricFromCode(metric); this.totalVectors = totalVectors; this.pqM = pqM; - this.hnsw = hnswM > 0 ? new HnswConfig(hnswM, efConstruction, maxLevel) : null; + this.hnswM = hnswM; + this.hnswEfConstruction = efConstruction; + this.hnswMaxLevel = maxLevel; } public IndexType indexType() { @@ -70,8 +74,16 @@ public int pqM() { return pqM; } - public HnswConfig hnsw() { - return hnsw; + public int hnswM() { + return hnswM; + } + + public int hnswEfConstruction() { + return hnswEfConstruction; + } + + public int hnswMaxLevel() { + return hnswMaxLevel; } private static Metric metricFromCode(int code) { diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java index d4c75e65ea38..3e49103d5e3d 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java @@ -25,16 +25,9 @@ final class VectorIndexNative { private VectorIndexNative() {} - static 
native long createWriter( - int indexType, - int dimension, - int nlist, - int pqM, - int metric, - boolean useOpq, - int hnswM, - int efConstruction, - int maxLevel); + static native long createWriter(String[] optionKeys, String[] optionValues); + + static native int writerDimension(long ptr); static native void train(long ptr, float[] data, int n); diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java index d0da1f42a8dc..b10fb525db26 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java @@ -19,7 +19,9 @@ public final class VectorIndexReader implements AutoCloseable { + private final Object nativeHandleLock = new Object(); private long nativePtr; + private Thread nativeHandleOwner; private VectorIndexMetadata metadata; public VectorIndexReader(VectorIndexInput input) { @@ -38,10 +40,18 @@ static VectorIndexReader fromNativePointerForTesting(long nativePtr) { } public VectorIndexMetadata metadata() { - if (metadata == null) { - metadata = VectorIndexNative.metadata(requireOpen()); + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + requireOpen(); + if (metadata == null) { + metadata = VectorIndexNative.metadata(nativePtr); + } + return metadata; + } finally { + exitNativeHandle(); + } } - return metadata; } public IndexType indexType() { @@ -62,8 +72,14 @@ public VectorSearchResult search(float[] query, int topK, int nprobe) { public VectorSearchResult search(float[] query, int topK, int nprobe, int efSearch) { validateQuery(query); - validateSearchParams(topK, nprobe, efSearch); - return VectorIndexNative.search(requireOpen(), query, topK, nprobe, efSearch); + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + 
return VectorIndexNative.search(requireOpen(), query, topK, nprobe, efSearch); + } finally { + exitNativeHandle(); + } + } } public VectorSearchResult search(float[] query, int topK, int nprobe, byte[] roaringFilter) { @@ -76,9 +92,15 @@ public VectorSearchResult search( if (roaringFilter == null) { throw new NullPointerException("roaringFilter"); } - validateSearchParams(topK, nprobe, efSearch); - return VectorIndexNative.searchWithRoaringFilter( - requireOpen(), query, topK, nprobe, efSearch, roaringFilter); + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + return VectorIndexNative.searchWithRoaringFilter( + requireOpen(), query, topK, nprobe, efSearch, roaringFilter); + } finally { + exitNativeHandle(); + } + } } public VectorSearchBatchResult searchBatch( @@ -88,9 +110,18 @@ public VectorSearchBatchResult searchBatch( public VectorSearchBatchResult searchBatch( float[] queries, int queryCount, int topK, int nprobe, int efSearch) { - validateQueries(queries, queryCount); - validateSearchParams(topK, nprobe, efSearch); - return VectorIndexNative.searchBatch(requireOpen(), queries, queryCount, topK, nprobe, efSearch); + if (queries == null) { + throw new NullPointerException("queries"); + } + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + return VectorIndexNative.searchBatch( + requireOpen(), queries, queryCount, topK, nprobe, efSearch); + } finally { + exitNativeHandle(); + } + } } public VectorSearchBatchResult searchBatch( @@ -105,21 +136,36 @@ public VectorSearchBatchResult searchBatch( int nprobe, int efSearch, byte[] roaringFilter) { - validateQueries(queries, queryCount); + if (queries == null) { + throw new NullPointerException("queries"); + } if (roaringFilter == null) { throw new NullPointerException("roaringFilter"); } - validateSearchParams(topK, nprobe, efSearch); - return VectorIndexNative.searchBatchWithRoaringFilter( - requireOpen(), queries, queryCount, topK, nprobe, efSearch, roaringFilter); + synchronized 
(nativeHandleLock) { + enterNativeHandle(); + try { + return VectorIndexNative.searchBatchWithRoaringFilter( + requireOpen(), queries, queryCount, topK, nprobe, efSearch, roaringFilter); + } finally { + exitNativeHandle(); + } + } } @Override public void close() { - long ptr = nativePtr; - nativePtr = 0L; - if (ptr != 0L) { - VectorIndexNative.freeReader(ptr); + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + long ptr = nativePtr; + nativePtr = 0L; + if (ptr != 0L) { + VectorIndexNative.freeReader(ptr); + } + } finally { + exitNativeHandle(); + } } } @@ -127,39 +173,24 @@ private void validateQuery(float[] query) { if (query == null) { throw new NullPointerException("query"); } - if (query.length != dimension()) { - throw new IllegalArgumentException( - "query length " + query.length + " != index dimension " + dimension()); - } } - private void validateQueries(float[] queries, int queryCount) { - if (queries == null) { - throw new NullPointerException("queries"); - } - VectorIndexConfig.validatePositive(queryCount, "queryCount"); - long expected = (long) queryCount * (long) dimension(); - if (expected > Integer.MAX_VALUE) { - throw new IllegalArgumentException("queryCount * dimension overflows int"); - } - if (queries.length != expected) { - throw new IllegalArgumentException( - "queries length " + queries.length + " != queryCount * dimension " + expected); + private long requireOpen() { + if (nativePtr == 0L) { + throw new IllegalStateException("VectorIndexReader is closed"); } + return nativePtr; } - private static void validateSearchParams(int topK, int nprobe, int efSearch) { - VectorIndexConfig.validatePositive(topK, "topK"); - VectorIndexConfig.validatePositive(nprobe, "nprobe"); - if (efSearch < 0) { - throw new IllegalArgumentException("efSearch must be >= 0"); + private void enterNativeHandle() { + Thread current = Thread.currentThread(); + if (nativeHandleOwner == current) { + throw new IllegalStateException("VectorIndexReader native 
handle is already in use"); } + nativeHandleOwner = current; } - private long requireOpen() { - if (nativePtr == 0L) { - throw new IllegalStateException("VectorIndexReader is closed"); - } - return nativePtr; + private void exitNativeHandle() { + nativeHandleOwner = null; } } diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java index f62d19513ac6..a82950d40126 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java @@ -17,92 +17,96 @@ package org.apache.paimon.index.ivfpq; +import java.util.Map; + public final class VectorIndexWriter implements AutoCloseable { - private final VectorIndexConfig config; + private final Object nativeHandleLock = new Object(); private long nativePtr; + private Thread nativeHandleOwner; - public VectorIndexWriter(VectorIndexConfig config) { - if (config == null) { - throw new NullPointerException("config"); + public VectorIndexWriter(Map options) { + String[] keys = new String[options.size()]; + String[] values = new String[options.size()]; + int index = 0; + for (Map.Entry entry : options.entrySet()) { + keys[index] = entry.getKey(); + values[index] = entry.getValue(); + index++; } - this.config = config; - HnswConfig hnsw = config.hnsw(); - this.nativePtr = - VectorIndexNative.createWriter( - config.indexType().code(), - config.dimension(), - config.nlist(), - config.pqM(), - config.metric().code(), - config.useOpq(), - hnsw.m(), - hnsw.efConstruction(), - hnsw.maxLevel()); + this.nativePtr = VectorIndexNative.createWriter(keys, values); } - private VectorIndexWriter(long nativePtr, VectorIndexConfig config) { + private VectorIndexWriter(long nativePtr) { this.nativePtr = nativePtr; - this.config = config; - } - - 
static VectorIndexWriter fromNativePointerForTesting(long nativePtr, VectorIndexConfig config) { - return new VectorIndexWriter(nativePtr, config); } - public VectorIndexConfig config() { - return config; + static VectorIndexWriter fromNativePointerForTesting(long nativePtr) { + return new VectorIndexWriter(nativePtr); } public int dimension() { - return config.dimension(); + return VectorIndexNative.writerDimension(requireOpen()); } public void train(float[] data, int vectorCount) { - validateVectors(data, vectorCount); - VectorIndexNative.train(requireOpen(), data, vectorCount); + if (data == null) { + throw new NullPointerException("data"); + } + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + VectorIndexNative.train(requireOpen(), data, vectorCount); + } finally { + exitNativeHandle(); + } + } } public void addVectors(long[] ids, float[] data, int vectorCount) { if (ids == null) { throw new NullPointerException("ids"); } - validateVectors(data, vectorCount); - if (ids.length < vectorCount) { - throw new IllegalArgumentException( - "ids length " + ids.length + " < vectorCount " + vectorCount); + if (data == null) { + throw new NullPointerException("data"); + } + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + VectorIndexNative.addVectors(requireOpen(), ids, data, vectorCount); + } finally { + exitNativeHandle(); + } } - VectorIndexNative.addVectors(requireOpen(), ids, data, vectorCount); } public void writeIndex(Object output) { if (output == null) { throw new NullPointerException("output"); } - VectorIndexNative.writeIndex(requireOpen(), output); + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + VectorIndexNative.writeIndex(requireOpen(), output); + } finally { + exitNativeHandle(); + } + } } @Override public void close() { - long ptr = nativePtr; - nativePtr = 0L; - if (ptr != 0L) { - VectorIndexNative.freeWriter(ptr); - } - } - - private void validateVectors(float[] data, int vectorCount) { - if (data 
== null) { - throw new NullPointerException("data"); - } - VectorIndexConfig.validatePositive(vectorCount, "vectorCount"); - long expected = (long) vectorCount * (long) config.dimension(); - if (expected > Integer.MAX_VALUE) { - throw new IllegalArgumentException("vectorCount * dimension overflows int"); - } - if (data.length < expected) { - throw new IllegalArgumentException( - "data length " + data.length + " < vectorCount * dimension " + expected); + synchronized (nativeHandleLock) { + enterNativeHandle(); + try { + long ptr = nativePtr; + nativePtr = 0L; + if (ptr != 0L) { + VectorIndexNative.freeWriter(ptr); + } + } finally { + exitNativeHandle(); + } } } @@ -112,4 +116,16 @@ private long requireOpen() { } return nativePtr; } + + private void enterNativeHandle() { + Thread current = Thread.currentThread(); + if (nativeHandleOwner == current) { + throw new IllegalStateException("VectorIndexWriter native handle is already in use"); + } + nativeHandleOwner = current; + } + + private void exitNativeHandle() { + nativeHandleOwner = null; + } } From a13e06410c5f340136242f513a565cad45e1f614 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 10 Jun 2026 23:34:58 +0800 Subject: [PATCH 07/11] [vector] Remove vector index options wrapper --- docs/docs/flink/procedures.md | 2 +- docs/docs/learn-paimon/scenario-guide.mdx | 14 +- docs/docs/multimodal-table/global-index.mdx | 42 ++- docs/docs/multimodal-table/index.mdx | 2 +- .../vector/index/VectorGlobalIndexReader.java | 23 +- .../vector/index/VectorGlobalIndexWriter.java | 217 ++++++++++-- .../vector/index/VectorGlobalIndexer.java | 16 +- .../index/VectorGlobalIndexerFactory.java | 2 +- .../paimon/vector/index/VectorIndexMeta.java | 64 ++-- .../vector/index/VectorIndexOptions.java | 317 ------------------ .../paimon/vector/index/VectorMetric.java | 45 --- .../vector/index/VectorGlobalIndexTest.java | 98 ++++-- .../vector/index/VectorIndexOptionsTest.java | 141 -------- 13 files changed, 344 insertions(+), 639 
deletions(-) delete mode 100644 paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java delete mode 100644 paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java delete mode 100644 paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java diff --git a/docs/docs/flink/procedures.md b/docs/docs/flink/procedures.md index 31601354f1a8..0dbf8f0c1bc7 100644 --- a/docs/docs/flink/procedures.md +++ b/docs/docs/flink/procedures.md @@ -1004,7 +1004,7 @@ All available procedures are listed below. To create a global index on a table for accelerating queries. Arguments:

  • table(required): the target table identifier.
  • index_column(required): the column name to build index on.
  • -
  • index_type(required): the type of global index, supported types include 'btree', 'lumina', 'tantivy-fulltext'.
  • +
  • index_type(required): the type of global index, supported types include 'btree', 'ivf-flat', 'ivf-pq', 'ivf-hnsw-flat', 'ivf-hnsw-sq', 'tantivy-fulltext'.
  • partitions(optional): partition filter for selective index creation.
  • options(optional): additional dynamic options for index creation.
  • diff --git a/docs/docs/learn-paimon/scenario-guide.mdx b/docs/docs/learn-paimon/scenario-guide.mdx index 341ceddae99e..4aae116bc079 100644 --- a/docs/docs/learn-paimon/scenario-guide.mdx +++ b/docs/docs/learn-paimon/scenario-guide.mdx @@ -44,7 +44,7 @@ configurations that are suited for different scenarios. | Queue-like ordered streaming | Append Table | `bucket = N, bucket-key = col` | | Large-scale OLAP with ad-hoc queries | Append Table | Incremental Clustering | | Store images / videos / documents | Append Table (Blob) | `__BLOB_FIELD` comment, Data Evolution enabled | -| AI vector search / RAG | Append Table (Vector) | `VECTOR` type, Global Index (DiskANN) | +| AI vector search / RAG | Append Table (Vector) | `VECTOR` type, Vector Global Index | | AI feature engineering & column evolution | Append Table | `data-evolution.enabled = true` | | Python AI pipeline (Ray / PyTorch) | Append Table | PyPaimon SDK | @@ -456,21 +456,19 @@ Schema schema = Schema.newBuilder() **Build the vector index and search:** ```sql --- Build DiskANN vector index +-- Build IVF-PQ vector index CALL sys.create_global_index( table => 'db.doc_embeddings', index_column => 'embedding', - index_type => 'lumina', - options => 'lumina.index.dimension=768' + index_type => 'ivf-pq', + options => 'vector.distance.metric=cosine,vector.nlist=256,vector.pq.m=16' ); -- Search for top-5 nearest neighbors SELECT * FROM vector_search('doc_embeddings', 'embedding', array(0.1f, 0.2f, ...), 5); ``` -The legacy index type `lumina-vector-ann` is still accepted for existing tables and SQL compatibility. - -**Why:** The [Global Index](../multimodal-table/global-index) with DiskANN provides high-performance ANN search. +**Why:** The [Global Index](../multimodal-table/global-index) with vector indexes provides high-performance ANN search. Vector data is stored in dedicated `.vector.lance` files optimized for dense vectors, while scalar columns stay in Parquet. 
You can also build a **BTree Index** on scalar columns for efficient filtering: @@ -664,7 +662,7 @@ Do you need upsert / update / delete? │ └── AI / Multimodal scenarios? → Enable Data Evolution ├── Store images / videos / docs? → Blob Table (__BLOB_FIELD comment) - ├── Vector search / RAG? → VECTOR type + Global Index (DiskANN) + ├── Vector search / RAG? → VECTOR type + Vector Global Index ├── Feature engineering? → Data Evolution (MERGE INTO partial columns) └── Python pipeline? → PyPaimon (Ray / PyTorch / Pandas) ``` diff --git a/docs/docs/multimodal-table/global-index.mdx b/docs/docs/multimodal-table/global-index.mdx index 8edcfe87e2ad..cc0d08e9333a 100644 --- a/docs/docs/multimodal-table/global-index.mdx +++ b/docs/docs/multimodal-table/global-index.mdx @@ -33,7 +33,7 @@ Global Index is a powerful indexing mechanism for Data Evolution (append) tables without full-table scans. Paimon supports multiple global index types: - **BTree Index**: A B-tree based index for scalar column lookups. Supports equality, IN, range predicates, and can be combined across multiple columns with AND/OR logic. -- **Vector Index**: An approximate nearest neighbor (ANN) index powered by DiskANN for vector similarity search. +- **Vector Index**: An approximate nearest neighbor (ANN) index powered by Paimon's vector index library for vector similarity search. - **Full-Text Index**: A full-text search index powered by Tantivy for text retrieval. Supports term matching and relevance scoring. Global indexes work on top of Data Evolution tables. To use global indexes, your table **must** have: @@ -87,23 +87,49 @@ SELECT * FROM my_table WHERE name IN ('a200', 'a300'); ## Vector Index -Vector Index provides approximate nearest neighbor (ANN) search based on the DiskANN algorithm. It is suitable for -vector similarity search scenarios such as recommendation systems, image retrieval, and RAG (Retrieval Augmented -Generation) applications. 
+Vector Index provides approximate nearest neighbor (ANN) search for vector similarity search scenarios such as +recommendation systems, image retrieval, and RAG (Retrieval Augmented Generation) applications. + +Supported vector index types: + +| Index Type | Description | +|---|---| +| `ivf-flat` | IVF index with flat vector storage. | +| `ivf-pq` | IVF index with product quantization. | +| `ivf-hnsw-flat` | IVF index with HNSW flat quantizer. | +| `ivf-hnsw-sq` | IVF index with HNSW scalar quantizer. | **Build Vector Index** ```sql --- Create Lumina vector index on 'embedding' column +-- Create IVF-PQ vector index on 'embedding' column CALL sys.create_global_index( table => 'db.my_table', index_column => 'embedding', - index_type => 'lumina', - options => 'lumina.index.dimension=128' + index_type => 'ivf-pq', + options => 'vector.distance.metric=cosine,vector.nlist=256,vector.pq.m=16' ); ``` -The legacy index type `lumina-vector-ann` is still accepted for existing tables and SQL compatibility. +For `ARRAY<FLOAT>` vector columns, specify the vector dimension with `vector.index.dimension`. +For `VECTOR` columns, Paimon uses the dimension from the column type. + +Supported vector index options: + +| Option | Default | Description | +|---|---|---| +| `vector.index.dimension` | `128` | Vector dimension for `ARRAY<FLOAT>` columns. Ignored for `VECTOR` columns. | +| `vector.distance.metric` | `inner_product` | Distance metric. Supported values: `l2`, `cosine`, `inner_product`. | +| `vector.nlist` | `256` | Number of IVF clusters used during index build. | +| `vector.pq.m` | `16` | Number of PQ sub-vectors for `ivf-pq`. The vector dimension must be divisible by this value. | +| `vector.pq.use-opq` | `false` | Whether to enable OPQ for `ivf-pq`. | +| `vector.hnsw.m` | `20` | HNSW graph out-degree for `ivf-hnsw-flat` and `ivf-hnsw-sq`. | +| `vector.hnsw.ef-construction` | `150` | HNSW construction search width for `ivf-hnsw-flat` and `ivf-hnsw-sq`. 
| +| `vector.hnsw.max-level` | `7` | Maximum HNSW level for `ivf-hnsw-flat` and `ivf-hnsw-sq`. | +| `vector.nprobe` | `16` | Number of IVF clusters to probe during search. | +| `vector.hnsw.ef-search` | `0` | HNSW search width during search. `0` uses the native library default. | +| `vector.train.sample-ratio` | `1.0` | Ratio of vectors sampled for index training. Must be in `(0, 1.0]`. | +| `vector.add.batch-size` | `10000` | Batch size used when adding vectors to the native index writer. | **Vector Search** diff --git a/docs/docs/multimodal-table/index.mdx b/docs/docs/multimodal-table/index.mdx index 4e0f42a38712..f0a4b2975082 100644 --- a/docs/docs/multimodal-table/index.mdx +++ b/docs/docs/multimodal-table/index.mdx @@ -37,7 +37,7 @@ Key capabilities: - **[Data Evolution](./data-evolution)**: Update partial columns without rewriting entire files, enabling efficient schema evolution. - **[Blob Storage](./blob)**: Store large binary objects (images, videos, audio) in dedicated `.blob` files with efficient column projection. - **[Vector Storage](./vector)**: Store and manage vector embeddings in dedicated Vortex-format files optimized for vector workloads. -- **[Global Index](./global-index)**: Build BTree, vector (DiskANN), and full-text (Tantivy) indexes for efficient lookups and similarity search. +- **[Global Index](./global-index)**: Build BTree, vector, and full-text (Tantivy) indexes for efficient lookups and similarity search. 
All multimodal features require the following table properties: diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java index 2b017888b564..76ebdbdc9179 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java @@ -56,7 +56,6 @@ public class VectorGlobalIndexReader implements GlobalIndexReader { private final GlobalIndexIOMeta ioMeta; private final GlobalIndexFileReader fileReader; private final DataType fieldType; - private final VectorIndexOptions options; private final ExecutorService executor; private volatile VectorIndexMeta indexMeta; @@ -67,14 +66,12 @@ public VectorGlobalIndexReader( GlobalIndexFileReader fileReader, List ioMetas, DataType fieldType, - VectorIndexOptions options, ExecutorService executor) { checkArgument(ioMetas.size() == 1, "Expected exactly one index file per shard"); this.executor = executor; this.fileReader = fileReader; this.ioMeta = ioMetas.get(0); this.fieldType = fieldType; - this.options = options; } @Override @@ -101,7 +98,7 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep float[] queryVector = vectorSearch.vector().clone(); int limit = vectorSearch.limit(); int nprobe = indexMeta.nprobe(); - VectorMetric metric = indexMeta.metric(); + String metric = indexMeta.metric(); RoaringNavigableMap64 includeRowIds = vectorSearch.includeRowIds(); VectorSearchResult result; @@ -158,17 +155,15 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep }); } - private static float convertDistanceToScore(float distance, VectorMetric metric) { - switch (metric) { - case L2: - return 1.0f / (1.0f + distance); - case COSINE: - return 1.0f - 
distance; - case INNER_PRODUCT: - return distance; - default: - throw new IllegalArgumentException("Unknown metric: " + metric); + private static float convertDistanceToScore(float distance, String metric) { + if ("l2".equals(metric)) { + return 1.0f / (1.0f + distance); + } else if ("cosine".equals(metric)) { + return 1.0f - distance; + } else if ("inner_product".equals(metric)) { + return distance; } + throw new IllegalArgumentException("Unknown metric: " + metric); } private void validateSearchVector(Object vector) { diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java index dacef17ca4e8..e7cf54a01c9b 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java @@ -24,7 +24,9 @@ import org.apache.paimon.globalindex.GlobalIndexSingletonWriter; import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; +import org.apache.paimon.index.ivfpq.IndexType; import org.apache.paimon.index.ivfpq.VectorIndexWriter; +import org.apache.paimon.options.Options; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.DataType; import org.apache.paimon.types.FloatType; @@ -41,7 +43,9 @@ import java.nio.ByteOrder; import java.nio.channels.FileChannel; import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Random; /** @@ -60,10 +64,35 @@ public class VectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Clos private static final Logger LOG = LoggerFactory.getLogger(VectorGlobalIndexWriter.class); private static final int IO_BUFFER_SIZE = 8 * 1024 * 1024; + private static final String 
OPTION_PREFIX = "vector."; + private static final int DEFAULT_DIMENSION = 128; + private static final String DEFAULT_METRIC = "inner_product"; + private static final int DEFAULT_NLIST = 256; + private static final int DEFAULT_PQ_M = 16; + private static final boolean DEFAULT_USE_OPQ = false; + private static final int DEFAULT_HNSW_M = 20; + private static final int DEFAULT_HNSW_EF_CONSTRUCTION = 150; + private static final int DEFAULT_HNSW_MAX_LEVEL = 7; + private static final int DEFAULT_NPROBE = 16; + private static final int DEFAULT_EF_SEARCH = 0; + private static final double DEFAULT_TRAIN_SAMPLE_RATIO = 1.0; + private static final int DEFAULT_ADD_BATCH_SIZE = 10000; private final GlobalIndexFileWriter fileWriter; - private final VectorIndexOptions options; + private final IndexType indexType; + private final String identifier; private final int dim; + private final String metric; + private final int nlist; + private final int pqM; + private final boolean useOpq; + private final int hnswM; + private final int hnswEfConstruction; + private final int hnswMaxLevel; + private final int nprobe; + private final int efSearch; + private final double trainSampleRatio; + private final int addBatchSize; private File tempVectorFile; private FileChannel writeChannel; @@ -77,16 +106,35 @@ public class VectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Clos private long logicalRowId; public VectorGlobalIndexWriter( - GlobalIndexFileWriter fileWriter, DataType fieldType, VectorIndexOptions options) { + GlobalIndexFileWriter fileWriter, + DataType fieldType, + Options options, + IndexType indexType, + String identifier) { this.fileWriter = fileWriter; - this.options = options; - this.dim = options.dimension(); + this.indexType = indexType; + this.identifier = identifier; + this.dim = dimension(fieldType, options); + this.metric = stringOption(options, "distance.metric", DEFAULT_METRIC); + this.nlist = positiveIntOption(options, "nlist", DEFAULT_NLIST); + this.pqM = 
positiveIntOption(options, "pq.m", DEFAULT_PQ_M); + this.useOpq = booleanOption(options, "pq.use-opq", DEFAULT_USE_OPQ); + this.hnswM = positiveIntOption(options, "hnsw.m", DEFAULT_HNSW_M); + this.hnswEfConstruction = + positiveIntOption(options, "hnsw.ef-construction", DEFAULT_HNSW_EF_CONSTRUCTION); + this.hnswMaxLevel = positiveIntOption(options, "hnsw.max-level", DEFAULT_HNSW_MAX_LEVEL); + this.nprobe = positiveIntOption(options, "nprobe", DEFAULT_NPROBE); + this.efSearch = nonNegativeIntOption(options, "hnsw.ef-search", DEFAULT_EF_SEARCH); + this.trainSampleRatio = + doubleOption(options, "train.sample-ratio", DEFAULT_TRAIN_SAMPLE_RATIO); + this.addBatchSize = positiveIntOption(options, "add.batch-size", DEFAULT_ADD_BATCH_SIZE); this.count = 0; this.closed = false; this.recordSizeInBytes = checkedRecordSize(dim, IO_BUFFER_SIZE); this.vectorBuf = new float[dim]; validateFieldType(fieldType); + validateOptions(); try { this.tempVectorFile = File.createTempFile("paimon-vector-index-vectors-", ".bin"); @@ -214,44 +262,40 @@ public List finish() { } private ResultEntry buildIndex() throws IOException { - int effectiveNlist = (int) Math.min(options.nlist(), count); + int effectiveNlist = (int) Math.min(nlist, count); LOG.info( "{} vector index build started: {} vectors, dim={}, nlist={}, metric={}", - options.logName(), + identifier, count, dim, effectiveNlist, - options.metric()); + metric); long buildStart = System.currentTimeMillis(); - try (VectorIndexWriter writer = - new VectorIndexWriter(options.toNativeOptions(effectiveNlist))) { + try (VectorIndexWriter writer = new VectorIndexWriter(nativeOptions(effectiveNlist))) { // Phase 1: Train long phaseStart = System.currentTimeMillis(); - LOG.info( - "{} train phase started (sample_ratio={})", - options.logName(), - options.trainSampleRatio()); + LOG.info("{} train phase started (sample_ratio={})", identifier, trainSampleRatio); trainFromTempFile(writer); LOG.info( "{} train phase done in {} ms", - 
options.logName(), + identifier, System.currentTimeMillis() - phaseStart); // Phase 2: Add all vectors in batches phaseStart = System.currentTimeMillis(); - LOG.info("{} add phase started", options.logName()); + LOG.info("{} add phase started", identifier); addVectorsFromTempFile(writer); LOG.info( "{} add phase done in {} ms", - options.logName(), + identifier, System.currentTimeMillis() - phaseStart); // Phase 3: Write index phaseStart = System.currentTimeMillis(); - LOG.info("{} write phase started", options.logName()); + LOG.info("{} write phase started", identifier); String fileName = fileWriter.newFileName(fileNamePrefix()); try (PositionOutputStream out = fileWriter.newOutputStream(fileName)) { writer.writeIndex(out); @@ -259,31 +303,30 @@ private ResultEntry buildIndex() throws IOException { } LOG.info( "{} write phase done in {} ms", - options.logName(), + identifier, System.currentTimeMillis() - phaseStart); LOG.info( "{} vector index build completed in {} ms", - options.logName(), + identifier, System.currentTimeMillis() - buildStart); - VectorIndexMeta meta = new VectorIndexMeta(options); + VectorIndexMeta meta = new VectorIndexMeta(metadata()); return new ResultEntry(fileName, logicalRowId, meta.serialize()); } } private String fileNamePrefix() { - return FILE_NAME_PREFIX + "-" + options.logName(); + return FILE_NAME_PREFIX + "-" + identifier; } private void trainFromTempFile(VectorIndexWriter writer) throws IOException { - double sampleRatio = options.trainSampleRatio(); - int minTrainSize = (int) Math.min(count, Math.max(options.nlist() * 39L, 256)); + int minTrainSize = (int) Math.min(count, Math.max(nlist * 39L, 256)); int sampleCount; - if (sampleRatio >= 1.0) { + if (trainSampleRatio >= 1.0) { sampleCount = (int) count; } else { - sampleCount = Math.max((int) (count * sampleRatio), minTrainSize); + sampleCount = Math.max((int) (count * trainSampleRatio), minTrainSize); sampleCount = (int) Math.min(sampleCount, count); } @@ -334,9 +377,8 @@ 
private void trainFromTempFile(VectorIndexWriter writer) throws IOException { } private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException { - int batchSize = options.addBatchSize(); - long[] batchIds = new long[batchSize]; - float[] batchVectors = new float[batchSize * dim]; + long[] batchIds = new long[addBatchSize]; + float[] batchVectors = new float[addBatchSize * dim]; try (RandomAccessFile raf = new RandomAccessFile(tempVectorFile, "r"); FileChannel channel = raf.getChannel()) { @@ -348,7 +390,7 @@ private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException int lastLoggedPercent = -1; while (remaining > 0) { - int thisBatch = (int) Math.min(batchSize, remaining); + int thisBatch = (int) Math.min(addBatchSize, remaining); for (int i = 0; i < thisBatch; i++) { ensureAvailable(readBuf, channel, recordSizeInBytes); batchIds[i] = readBuf.getLong(); @@ -363,13 +405,128 @@ private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException if (percent / 10 > lastLoggedPercent / 10) { LOG.info( "{} add progress: {}/{} vectors ({}%)", - options.logName(), count - remaining, count, percent); + identifier, count - remaining, count, percent); lastLoggedPercent = percent; } } } } + private Map nativeOptions(int effectiveNlist) { + Map nativeOptions = new LinkedHashMap<>(); + nativeOptions.put("index.type", nativeIndexType(indexType)); + nativeOptions.put("dimension", String.valueOf(dim)); + nativeOptions.put("nlist", String.valueOf(effectiveNlist)); + nativeOptions.put("metric", metric); + switch (indexType) { + case IVF_FLAT: + break; + case IVF_PQ: + nativeOptions.put("pq.m", String.valueOf(pqM)); + nativeOptions.put("use-opq", String.valueOf(useOpq)); + break; + case IVF_HNSW_FLAT: + case IVF_HNSW_SQ: + nativeOptions.put("hnsw.m", String.valueOf(hnswM)); + nativeOptions.put("hnsw.ef-construction", String.valueOf(hnswEfConstruction)); + nativeOptions.put("hnsw.max-level", String.valueOf(hnswMaxLevel)); + break; + 
default: + throw new IllegalArgumentException("Unsupported vector index type: " + indexType); + } + return nativeOptions; + } + + private Map metadata() { + Map metadata = new LinkedHashMap<>(); + metadata.put(VectorIndexMeta.KEY_INDEX_TYPE, identifier); + metadata.put(VectorIndexMeta.KEY_DIMENSION, String.valueOf(dim)); + metadata.put(VectorIndexMeta.KEY_METRIC, metric); + metadata.put(VectorIndexMeta.KEY_NLIST, String.valueOf(nlist)); + metadata.put(VectorIndexMeta.KEY_M, String.valueOf(pqM)); + metadata.put(VectorIndexMeta.KEY_USE_OPQ, String.valueOf(useOpq)); + metadata.put(VectorIndexMeta.KEY_HNSW_M, String.valueOf(hnswM)); + metadata.put(VectorIndexMeta.KEY_HNSW_EF_CONSTRUCTION, String.valueOf(hnswEfConstruction)); + metadata.put(VectorIndexMeta.KEY_HNSW_MAX_LEVEL, String.valueOf(hnswMaxLevel)); + metadata.put(VectorIndexMeta.KEY_NPROBE, String.valueOf(nprobe)); + metadata.put(VectorIndexMeta.KEY_EF_SEARCH, String.valueOf(efSearch)); + return metadata; + } + + private void validateOptions() { + if (indexType == IndexType.IVF_PQ && dim % pqM != 0) { + throw new IllegalArgumentException( + String.format("vector.pq.m (%d) must divide vector dimension (%d)", pqM, dim)); + } + if (trainSampleRatio <= 0 || trainSampleRatio > 1.0) { + throw new IllegalArgumentException( + String.format( + "vector.train.sample-ratio must be in (0, 1.0], but got %f", + trainSampleRatio)); + } + } + + private static int dimension(DataType fieldType, Options options) { + if (fieldType instanceof VectorType) { + return ((VectorType) fieldType).getLength(); + } + return positiveIntOption(options, "index.dimension", DEFAULT_DIMENSION); + } + + private static String stringOption(Options options, String key, String defaultValue) { + String value = options.get(OPTION_PREFIX + key); + return value == null ? 
defaultValue : value; + } + + private static int positiveIntOption(Options options, String key, int defaultValue) { + int value = options.getInteger(OPTION_PREFIX + key, defaultValue); + if (value <= 0) { + throw new IllegalArgumentException( + "Invalid value for 'vector." + + key + + "': " + + value + + ". Must be a positive integer."); + } + return value; + } + + private static int nonNegativeIntOption(Options options, String key, int defaultValue) { + int value = options.getInteger(OPTION_PREFIX + key, defaultValue); + if (value < 0) { + throw new IllegalArgumentException( + "Invalid value for 'vector." + + key + + "': " + + value + + ". Must be a non-negative integer."); + } + return value; + } + + private static boolean booleanOption(Options options, String key, boolean defaultValue) { + return options.getBoolean(OPTION_PREFIX + key, defaultValue); + } + + private static double doubleOption(Options options, String key, double defaultValue) { + return options.getDouble(OPTION_PREFIX + key, defaultValue); + } + + private static String nativeIndexType(IndexType indexType) { + switch (indexType) { + case IVF_FLAT: + return "ivf_flat"; + case IVF_PQ: + return "ivf_pq"; + case IVF_HNSW_FLAT: + return "ivf_hnsw_flat"; + case IVF_HNSW_SQ: + return "ivf_hnsw_sq"; + default: + throw new IllegalArgumentException("Unsupported vector index type: " + indexType); + } + } + private static void ensureAvailable(ByteBuffer readBuf, FileChannel channel, int minBytes) throws IOException { int zeroReadCount = 0; diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java index 4fc10a8c0709..608f528ceb1b 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java +++ 
b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java @@ -29,22 +29,28 @@ import org.apache.paimon.types.DataType; import java.util.List; +import java.util.Objects; import java.util.concurrent.ExecutorService; /** Vector global indexer backed by paimon-vector-index. */ public class VectorGlobalIndexer implements GlobalIndexer { private final DataType fieldType; - private final VectorIndexOptions options; + private final Options options; + private final IndexType indexType; + private final String identifier; - public VectorGlobalIndexer(DataType fieldType, Options options, IndexType indexType) { + public VectorGlobalIndexer( + DataType fieldType, Options options, IndexType indexType, String identifier) { this.fieldType = fieldType; - this.options = new VectorIndexOptions(options, indexType); + this.options = options; + this.indexType = Objects.requireNonNull(indexType, "indexType must not be null"); + this.identifier = Objects.requireNonNull(identifier, "identifier must not be null"); } @Override public GlobalIndexWriter createWriter(GlobalIndexFileWriter fileWriter) { - return new VectorGlobalIndexWriter(fileWriter, fieldType, options); + return new VectorGlobalIndexWriter(fileWriter, fieldType, options, indexType, identifier); } @Override @@ -52,6 +58,6 @@ public GlobalIndexReader createReader( GlobalIndexFileReader fileReader, List files, ExecutorService executor) { - return new VectorGlobalIndexReader(fileReader, files, fieldType, options, executor); + return new VectorGlobalIndexReader(fileReader, files, fieldType, executor); } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java index cdbea58a7cb1..5b9ff4540712 100644 --- 
a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java @@ -31,6 +31,6 @@ public abstract class VectorGlobalIndexerFactory implements GlobalIndexerFactory @Override public GlobalIndexer create(DataField field, Options options) { - return new VectorGlobalIndexer(field.type(), options, indexType()); + return new VectorGlobalIndexer(field.type(), options, indexType(), identifier()); } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java index 2c8e29f4cb23..63fbe826e74d 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java @@ -38,17 +38,17 @@ public class VectorIndexMeta implements Serializable { private static final long serialVersionUID = 1L; - private static final String KEY_INDEX_TYPE = "index_type"; - private static final String KEY_DIMENSION = "dimension"; - private static final String KEY_METRIC = "metric"; - private static final String KEY_NLIST = "nlist"; - private static final String KEY_M = "m"; - private static final String KEY_USE_OPQ = "use_opq"; - private static final String KEY_HNSW_M = "hnsw_m"; - private static final String KEY_HNSW_EF_CONSTRUCTION = "hnsw_ef_construction"; - private static final String KEY_HNSW_MAX_LEVEL = "hnsw_max_level"; - private static final String KEY_NPROBE = "nprobe"; - private static final String KEY_EF_SEARCH = "ef_search"; + static final String KEY_INDEX_TYPE = "index_type"; + static final String KEY_DIMENSION = "dimension"; + static final String KEY_METRIC = "metric"; + static final String KEY_NLIST = "nlist"; + static final 
String KEY_M = "m"; + static final String KEY_USE_OPQ = "use_opq"; + static final String KEY_HNSW_M = "hnsw_m"; + static final String KEY_HNSW_EF_CONSTRUCTION = "hnsw_ef_construction"; + static final String KEY_HNSW_MAX_LEVEL = "hnsw_max_level"; + static final String KEY_NPROBE = "nprobe"; + static final String KEY_EF_SEARCH = "ef_search"; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -57,22 +57,7 @@ public class VectorIndexMeta implements Serializable { private final Map params; - public VectorIndexMeta(VectorIndexOptions options) { - this.params = new LinkedHashMap<>(); - params.put(KEY_INDEX_TYPE, VectorIndexOptions.toIdentifier(options.indexType())); - params.put(KEY_DIMENSION, String.valueOf(options.dimension())); - params.put(KEY_METRIC, options.metric().getConfigName()); - params.put(KEY_NLIST, String.valueOf(options.nlist())); - params.put(KEY_M, String.valueOf(options.m())); - params.put(KEY_USE_OPQ, String.valueOf(options.useOpq())); - params.put(KEY_HNSW_M, String.valueOf(options.hnswM())); - params.put(KEY_HNSW_EF_CONSTRUCTION, String.valueOf(options.hnswEfConstruction())); - params.put(KEY_HNSW_MAX_LEVEL, String.valueOf(options.hnswMaxLevel())); - params.put(KEY_NPROBE, String.valueOf(options.nprobe())); - params.put(KEY_EF_SEARCH, String.valueOf(options.efSearch())); - } - - private VectorIndexMeta(Map params) { + VectorIndexMeta(Map params) { this.params = new LinkedHashMap<>(params); } @@ -82,15 +67,15 @@ public IndexType indexType() { throw new IllegalArgumentException( "Missing required key in vector index metadata: " + KEY_INDEX_TYPE); } - return VectorIndexOptions.parseIndexType(value); + return parseIndexType(value); } public int dimension() { return Integer.parseInt(params.get(KEY_DIMENSION)); } - public VectorMetric metric() { - return VectorMetric.fromConfigName(params.get(KEY_METRIC)); + public String metric() { + return params.get(KEY_METRIC); } public int nlist() { @@ -106,15 +91,15 @@ public boolean useOpq() { 
} public int hnswM() { - return intValue(KEY_HNSW_M, VectorIndexOptions.DEFAULT_HNSW_M); + return intValue(KEY_HNSW_M, 20); } public int hnswEfConstruction() { - return intValue(KEY_HNSW_EF_CONSTRUCTION, VectorIndexOptions.DEFAULT_HNSW_EF_CONSTRUCTION); + return intValue(KEY_HNSW_EF_CONSTRUCTION, 150); } public int hnswMaxLevel() { - return intValue(KEY_HNSW_MAX_LEVEL, VectorIndexOptions.DEFAULT_HNSW_MAX_LEVEL); + return intValue(KEY_HNSW_MAX_LEVEL, 7); } public int nprobe() { @@ -149,4 +134,17 @@ private int intValue(String key, int defaultValue) { String val = params.get(key); return val == null ? defaultValue : Integer.parseInt(val); } + + private static IndexType parseIndexType(String value) { + if (IvfPqAlgorithmVectorGlobalIndexerFactory.IDENTIFIER.equals(value)) { + return IndexType.IVF_PQ; + } else if (IvfFlatVectorGlobalIndexerFactory.IDENTIFIER.equals(value)) { + return IndexType.IVF_FLAT; + } else if (IvfHnswFlatVectorGlobalIndexerFactory.IDENTIFIER.equals(value)) { + return IndexType.IVF_HNSW_FLAT; + } else if (IvfHnswSqVectorGlobalIndexerFactory.IDENTIFIER.equals(value)) { + return IndexType.IVF_HNSW_SQ; + } + throw new IllegalArgumentException("Unknown vector index type: " + value); + } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java deleted file mode 100644 index 5f31a9481c14..000000000000 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexOptions.java +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.vector.index; - -import org.apache.paimon.index.ivfpq.IndexType; -import org.apache.paimon.options.ConfigOption; -import org.apache.paimon.options.ConfigOptions; -import org.apache.paimon.options.Options; - -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Objects; - -/** Options for the Paimon vector index backed by paimon-vector-index. */ -public class VectorIndexOptions { - - public static final String IVF_FLAT_IDENTIFIER = "ivf-flat"; - public static final String IVF_PQ_IDENTIFIER = "ivf-pq"; - public static final String IVF_HNSW_FLAT_IDENTIFIER = "ivf-hnsw-flat"; - public static final String IVF_HNSW_SQ_IDENTIFIER = "ivf-hnsw-sq"; - - static final int DEFAULT_HNSW_M = 20; - static final int DEFAULT_HNSW_EF_CONSTRUCTION = 150; - static final int DEFAULT_HNSW_MAX_LEVEL = 7; - - public static final ConfigOption DIMENSION = - ConfigOptions.key("vector.index.dimension") - .intType() - .defaultValue(128) - .withDescription("The dimension of the vector."); - - public static final ConfigOption DISTANCE_METRIC = - ConfigOptions.key("vector.distance.metric") - .stringType() - .defaultValue("inner_product") - .withDescription( - "Distance metric for vector search (l2, cosine, inner_product)."); - - public static final ConfigOption NLIST = - ConfigOptions.key("vector.nlist") - .intType() - .defaultValue(256) - .withDescription("Number of 
IVF partitions (Voronoi cells)."); - - public static final ConfigOption M = - ConfigOptions.key("vector.pq.m") - .intType() - .defaultValue(16) - .withDescription( - "Number of PQ sub-quantizers. Must divide the vector dimension."); - - public static final ConfigOption USE_OPQ = - ConfigOptions.key("vector.pq.use-opq") - .booleanType() - .defaultValue(false) - .withDescription( - "Whether to use OPQ (Optimized Product Quantization) rotation."); - - public static final ConfigOption HNSW_M = - ConfigOptions.key("vector.hnsw.m") - .intType() - .defaultValue(DEFAULT_HNSW_M) - .withDescription("Maximum number of HNSW neighbors per node."); - - public static final ConfigOption HNSW_EF_CONSTRUCTION = - ConfigOptions.key("vector.hnsw.ef-construction") - .intType() - .defaultValue(DEFAULT_HNSW_EF_CONSTRUCTION) - .withDescription("HNSW efConstruction value used during index build."); - - public static final ConfigOption HNSW_MAX_LEVEL = - ConfigOptions.key("vector.hnsw.max-level") - .intType() - .defaultValue(DEFAULT_HNSW_MAX_LEVEL) - .withDescription("Maximum HNSW graph level."); - - public static final ConfigOption NPROBE = - ConfigOptions.key("vector.nprobe") - .intType() - .defaultValue(16) - .withDescription("Number of IVF partitions to probe during search."); - - public static final ConfigOption EF_SEARCH = - ConfigOptions.key("vector.hnsw.ef-search") - .intType() - .defaultValue(0) - .withDescription( - "HNSW efSearch value used during search. 0 uses the native default."); - - public static final ConfigOption TRAIN_SAMPLE_RATIO = - ConfigOptions.key("vector.train.sample-ratio") - .doubleType() - .defaultValue(1.0) - .withDescription( - "Ratio of vectors sampled for training (0.0-1.0]. 
" - + "1.0 means use all vectors for training."); - - public static final ConfigOption ADD_BATCH_SIZE = - ConfigOptions.key("vector.add.batch-size") - .intType() - .defaultValue(10000) - .withDescription("Batch size for adding vectors after training."); - - private final IndexType indexType; - private final int dimension; - private final VectorMetric metric; - private final int nlist; - private final int m; - private final boolean useOpq; - private final int hnswM; - private final int hnswEfConstruction; - private final int hnswMaxLevel; - private final int nprobe; - private final int efSearch; - private final double trainSampleRatio; - private final int addBatchSize; - - public VectorIndexOptions(Options options, IndexType indexType) { - this.indexType = Objects.requireNonNull(indexType, "indexType must not be null"); - this.dimension = validatePositive(options.get(DIMENSION), optionKey(DIMENSION)); - this.metric = parseMetric(options.get(DISTANCE_METRIC)); - this.nlist = validatePositive(options.get(NLIST), optionKey(NLIST)); - this.m = validatePositive(options.get(M), optionKey(M)); - this.useOpq = options.get(USE_OPQ); - this.hnswM = validatePositive(options.get(HNSW_M), optionKey(HNSW_M)); - this.hnswEfConstruction = - validatePositive( - options.get(HNSW_EF_CONSTRUCTION), optionKey(HNSW_EF_CONSTRUCTION)); - this.hnswMaxLevel = - validatePositive(options.get(HNSW_MAX_LEVEL), optionKey(HNSW_MAX_LEVEL)); - this.nprobe = validatePositive(options.get(NPROBE), optionKey(NPROBE)); - this.efSearch = validateNonNegative(options.get(EF_SEARCH), optionKey(EF_SEARCH)); - this.trainSampleRatio = options.get(TRAIN_SAMPLE_RATIO); - this.addBatchSize = - validatePositive(options.get(ADD_BATCH_SIZE), optionKey(ADD_BATCH_SIZE)); - - if (indexType == IndexType.IVF_PQ && dimension % m != 0) { - throw new IllegalArgumentException( - String.format( - "%s (%d) must divide %s (%d)", - optionKey(M), m, optionKey(DIMENSION), dimension)); - } - if (trainSampleRatio <= 0 || 
trainSampleRatio > 1.0) { - throw new IllegalArgumentException( - String.format( - "%s must be in (0, 1.0], but got %f", - optionKey(TRAIN_SAMPLE_RATIO), trainSampleRatio)); - } - } - - public IndexType indexType() { - return indexType; - } - - public int dimension() { - return dimension; - } - - public VectorMetric metric() { - return metric; - } - - public int nlist() { - return nlist; - } - - public int m() { - return m; - } - - public boolean useOpq() { - return useOpq; - } - - public int hnswM() { - return hnswM; - } - - public int hnswEfConstruction() { - return hnswEfConstruction; - } - - public int hnswMaxLevel() { - return hnswMaxLevel; - } - - public int nprobe() { - return nprobe; - } - - public int efSearch() { - return efSearch; - } - - public double trainSampleRatio() { - return trainSampleRatio; - } - - public int addBatchSize() { - return addBatchSize; - } - - public Map toNativeOptions(int effectiveNlist) { - Map nativeOptions = new LinkedHashMap<>(); - nativeOptions.put("index.type", toNativeIndexType(indexType)); - nativeOptions.put("dimension", String.valueOf(dimension)); - nativeOptions.put("nlist", String.valueOf(effectiveNlist)); - nativeOptions.put("metric", metric.getConfigName()); - switch (indexType) { - case IVF_FLAT: - break; - case IVF_PQ: - nativeOptions.put("pq.m", String.valueOf(m)); - nativeOptions.put("use-opq", String.valueOf(useOpq)); - break; - case IVF_HNSW_FLAT: - case IVF_HNSW_SQ: - nativeOptions.put("hnsw.m", String.valueOf(hnswM)); - nativeOptions.put("hnsw.ef-construction", String.valueOf(hnswEfConstruction)); - nativeOptions.put("hnsw.max-level", String.valueOf(hnswMaxLevel)); - break; - default: - throw new IllegalArgumentException("Unsupported vector index type: " + indexType); - } - return nativeOptions; - } - - public String logName() { - return toIdentifier(indexType); - } - - private static VectorMetric parseMetric(String value) { - return VectorMetric.fromConfigName(value); - } - - private static int 
validatePositive(int value, String key) { - if (value <= 0) { - throw new IllegalArgumentException( - String.format( - "Invalid value for '%s': %d. Must be a positive integer.", key, value)); - } - return value; - } - - private static int validateNonNegative(int value, String key) { - if (value < 0) { - throw new IllegalArgumentException( - String.format( - "Invalid value for '%s': %d. Must be a non-negative integer.", - key, value)); - } - return value; - } - - public static IndexType parseIndexType(String value) { - if (IVF_PQ_IDENTIFIER.equals(value)) { - return IndexType.IVF_PQ; - } else if (IVF_FLAT_IDENTIFIER.equals(value)) { - return IndexType.IVF_FLAT; - } else if (IVF_HNSW_FLAT_IDENTIFIER.equals(value)) { - return IndexType.IVF_HNSW_FLAT; - } else if (IVF_HNSW_SQ_IDENTIFIER.equals(value)) { - return IndexType.IVF_HNSW_SQ; - } - throw new IllegalArgumentException("Unknown vector index type: " + value); - } - - public static String toIdentifier(IndexType indexType) { - switch (indexType) { - case IVF_FLAT: - return IVF_FLAT_IDENTIFIER; - case IVF_PQ: - return IVF_PQ_IDENTIFIER; - case IVF_HNSW_FLAT: - return IVF_HNSW_FLAT_IDENTIFIER; - case IVF_HNSW_SQ: - return IVF_HNSW_SQ_IDENTIFIER; - default: - throw new IllegalArgumentException("Unsupported vector index type: " + indexType); - } - } - - private static String toNativeIndexType(IndexType indexType) { - switch (indexType) { - case IVF_FLAT: - return "ivf_flat"; - case IVF_PQ: - return "ivf_pq"; - case IVF_HNSW_FLAT: - return "ivf_hnsw_flat"; - case IVF_HNSW_SQ: - return "ivf_hnsw_sq"; - default: - throw new IllegalArgumentException("Unsupported vector index type: " + indexType); - } - } - - private static String optionKey(ConfigOption option) { - return option.key(); - } -} diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java deleted file mode 100644 
index 91ed9a357dc8..000000000000 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorMetric.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.vector.index; - -/** Enumeration of supported vector distance metrics. 
*/ -public enum VectorMetric { - L2("l2"), - COSINE("cosine"), - INNER_PRODUCT("inner_product"); - - private final String configName; - - VectorMetric(String configName) { - this.configName = configName; - } - - public String getConfigName() { - return configName; - } - - public static VectorMetric fromConfigName(String configName) { - for (VectorMetric m : values()) { - if (m.configName.equals(configName)) { - return m; - } - } - throw new IllegalArgumentException("Unknown metric: " + configName); - } -} diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java index 9ea8ec2c38cf..04ad2e730e68 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java @@ -47,7 +47,9 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -60,6 +62,11 @@ public class VectorGlobalIndexTest { @TempDir java.nio.file.Path tempDir; + private static final String IVF_PQ_IDENTIFIER = + IvfPqAlgorithmVectorGlobalIndexerFactory.IDENTIFIER; + private static final String IVF_HNSW_FLAT_IDENTIFIER = + IvfHnswFlatVectorGlobalIndexerFactory.IDENTIFIER; + private FileIO fileIO; private Path indexPath; private DataType vectorType; @@ -98,10 +105,8 @@ public void cleanup() throws IOException { @Test public void testDimensionMismatch() { Options options = createDefaultOptions(64); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - 
VectorGlobalIndexWriter writer = - new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); float[] wrongDimVector = new float[32]; assertThatThrownBy(() -> writer.write(wrongDimVector)) @@ -114,10 +119,9 @@ public void testVectorTypeRejectsNonFloatElement() { DataType intVecType = new VectorType(2, new IntType()); Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - assertThatThrownBy(() -> new VectorGlobalIndexWriter(fileWriter, intVecType, indexOptions)) + assertThatThrownBy(() -> createIvfPqWriter(fileWriter, intVecType, options)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("float"); } @@ -126,10 +130,8 @@ public void testVectorTypeRejectsNonFloatElement() { public void testNanInVectorRejected() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = - new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); assertThatThrownBy(() -> writer.write(new float[] {1.0f, Float.NaN})) .isInstanceOf(IllegalArgumentException.class) @@ -142,10 +144,8 @@ public void testNanInVectorRejected() { public void testInfinityInVectorRejected() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = - new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + 
VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writer.write(null); // row 0 - null, advances logicalRowId assertThatThrownBy(() -> writer.write(new float[] {Float.POSITIVE_INFINITY, 0.0f})) @@ -159,10 +159,8 @@ public void testInfinityInVectorRejected() { public void testAllNullReturnsEmpty() { Options options = createDefaultOptions(2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = - new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writer.write(null); writer.write(null); @@ -181,15 +179,14 @@ public void testMetaSerializationRoundTrip() throws IOException { options.setInteger("vector.pq.m", 8); options.setString("vector.pq.use-opq", "true"); options.setInteger("vector.nprobe", 24); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); - VectorIndexMeta meta = new VectorIndexMeta(indexOptions); + VectorIndexMeta meta = new VectorIndexMeta(metaOptions(IVF_PQ_IDENTIFIER, options)); byte[] serialized = meta.serialize(); VectorIndexMeta deserialized = VectorIndexMeta.deserialize(serialized); assertThat(deserialized.dimension()).isEqualTo(32); assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_PQ); - assertThat(deserialized.metric()).isEqualTo(VectorMetric.COSINE); + assertThat(deserialized.metric()).isEqualTo("cosine"); assertThat(deserialized.nlist()).isEqualTo(64); assertThat(deserialized.m()).isEqualTo(8); assertThat(deserialized.useOpq()).isTrue(); @@ -206,10 +203,11 @@ public void testMetaSerializationRoundTripForHnsw() throws IOException { options.setInteger("vector.hnsw.ef-construction", 64); options.setInteger("vector.hnsw.max-level", 5); options.setInteger("vector.hnsw.ef-search", 80); - 
VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_HNSW_FLAT); VectorIndexMeta deserialized = - VectorIndexMeta.deserialize(new VectorIndexMeta(indexOptions).serialize()); + VectorIndexMeta.deserialize( + new VectorIndexMeta(metaOptions(IVF_HNSW_FLAT_IDENTIFIER, options)) + .serialize()); assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_HNSW_FLAT); assertThat(deserialized.dimension()).isEqualTo(16); @@ -229,7 +227,6 @@ public void testFloatVectorEndToEnd() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); float[][] vectors = new float[][] { @@ -242,16 +239,14 @@ public void testFloatVectorEndToEnd() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = - new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); Arrays.stream(vectors).forEach(writer::write); List results = writer.finish(); List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader( - fileReader, metas, vectorType, indexOptions, executor)) { + new VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { VectorSearch vectorSearch = new VectorSearch(vectors[0], 3, fieldName); ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); assertThat(result.results().getLongCardinality()).isEqualTo(3); @@ -269,7 +264,6 @@ public void testSearchWithRoaringFilter() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new 
VectorIndexOptions(options, IndexType.IVF_PQ); float[][] vectors = new float[][] { @@ -282,16 +276,14 @@ public void testSearchWithRoaringFilter() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = - new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); Arrays.stream(vectors).forEach(writer::write); List results = writer.finish(); List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader( - fileReader, metas, vectorType, indexOptions, executor)) { + new VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { // Filter to rows {1, 4} only RoaringNavigableMap64 filter = new RoaringNavigableMap64(); @@ -314,7 +306,6 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { Options options = createDefaultOptions(dimension); options.setInteger("vector.nlist", 2); options.setInteger("vector.pq.m", 1); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); float[][] vectors = new float[][] { @@ -324,8 +315,7 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { }; GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); - VectorGlobalIndexWriter writer = - new VectorGlobalIndexWriter(fileWriter, vectorType, indexOptions); + VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); writer.write(vectors[0]); // row 0 writer.write(null); // row 1 - null @@ -341,8 +331,7 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { List metas = toIOMetas(results, indexPath); GlobalIndexFileReader fileReader = createFileReader(indexPath); try (VectorGlobalIndexReader reader = - new VectorGlobalIndexReader( - fileReader, metas, vectorType, indexOptions, executor)) { + new 
VectorGlobalIndexReader(fileReader, metas, vectorType, executor)) { VectorSearch vectorSearch = new VectorSearch(vectors[0], 3, fieldName); ScoredGlobalIndexResult result = reader.visitVectorSearch(vectorSearch).join().get(); assertThat(result.results().getLongCardinality()).isEqualTo(3); @@ -372,7 +361,7 @@ public void testViaIndexer() throws IOException { }; VectorGlobalIndexer indexer = - new VectorGlobalIndexer(vectorType, options, IndexType.IVF_PQ); + new VectorGlobalIndexer(vectorType, options, IndexType.IVF_PQ, IVF_PQ_IDENTIFIER); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = (VectorGlobalIndexWriter) indexer.createWriter(fileWriter); @@ -392,6 +381,12 @@ public void testViaIndexer() throws IOException { // =================== Helpers ===================== + private VectorGlobalIndexWriter createIvfPqWriter( + GlobalIndexFileWriter fileWriter, DataType fieldType, Options options) { + return new VectorGlobalIndexWriter( + fileWriter, fieldType, options, IndexType.IVF_PQ, IVF_PQ_IDENTIFIER); + } + private Options createDefaultOptions(int dimension) { Options options = new Options(); options.setInteger("vector.index.dimension", dimension); @@ -399,6 +394,39 @@ private Options createDefaultOptions(int dimension) { return options; } + private Map metaOptions(String indexType, Options options) { + Map meta = new LinkedHashMap<>(); + meta.put(VectorIndexMeta.KEY_INDEX_TYPE, indexType); + meta.put( + VectorIndexMeta.KEY_DIMENSION, + String.valueOf(options.getInteger("vector.index.dimension", 128))); + meta.put( + VectorIndexMeta.KEY_METRIC, + options.getString("vector.distance.metric", "inner_product")); + meta.put( + VectorIndexMeta.KEY_NLIST, String.valueOf(options.getInteger("vector.nlist", 256))); + meta.put(VectorIndexMeta.KEY_M, String.valueOf(options.getInteger("vector.pq.m", 16))); + meta.put( + VectorIndexMeta.KEY_USE_OPQ, + String.valueOf(options.getBoolean("vector.pq.use-opq", false))); + meta.put( + 
VectorIndexMeta.KEY_HNSW_M, + String.valueOf(options.getInteger("vector.hnsw.m", 20))); + meta.put( + VectorIndexMeta.KEY_HNSW_EF_CONSTRUCTION, + String.valueOf(options.getInteger("vector.hnsw.ef-construction", 150))); + meta.put( + VectorIndexMeta.KEY_HNSW_MAX_LEVEL, + String.valueOf(options.getInteger("vector.hnsw.max-level", 7))); + meta.put( + VectorIndexMeta.KEY_NPROBE, + String.valueOf(options.getInteger("vector.nprobe", 16))); + meta.put( + VectorIndexMeta.KEY_EF_SEARCH, + String.valueOf(options.getInteger("vector.hnsw.ef-search", 0))); + return meta; + } + private GlobalIndexFileWriter createFileWriter(Path path) { return new GlobalIndexFileWriter() { @Override diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java deleted file mode 100644 index 523de9fa8ad2..000000000000 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorIndexOptionsTest.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.paimon.vector.index; - -import org.apache.paimon.index.ivfpq.IndexType; -import org.apache.paimon.options.Options; - -import org.junit.jupiter.api.Test; - -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -/** Tests for {@link VectorIndexOptions}. */ -public class VectorIndexOptionsTest { - - @Test - public void testDefaults() { - Options options = new Options(); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); - assertThat(indexOptions.dimension()).isEqualTo(128); - assertThat(indexOptions.indexType()).isEqualTo(IndexType.IVF_PQ); - assertThat(indexOptions.metric()).isEqualTo(VectorMetric.INNER_PRODUCT); - assertThat(indexOptions.nlist()).isEqualTo(256); - assertThat(indexOptions.m()).isEqualTo(16); - assertThat(indexOptions.useOpq()).isFalse(); - assertThat(indexOptions.hnswM()).isEqualTo(20); - assertThat(indexOptions.hnswEfConstruction()).isEqualTo(150); - assertThat(indexOptions.hnswMaxLevel()).isEqualTo(7); - assertThat(indexOptions.nprobe()).isEqualTo(16); - assertThat(indexOptions.efSearch()).isEqualTo(0); - assertThat(indexOptions.trainSampleRatio()).isEqualTo(1.0); - assertThat(indexOptions.addBatchSize()).isEqualTo(10000); - assertThat(indexOptions.toNativeOptions(12)) - .containsEntry("index.type", "ivf_pq") - .containsEntry("dimension", "128") - .containsEntry("nlist", "12") - .containsEntry("metric", "inner_product") - .containsEntry("pq.m", "16") - .containsEntry("use-opq", "false"); - } - - @Test - public void testCustomOptions() { - Options options = new Options(); - options.setInteger("vector.index.dimension", 64); - options.setString("vector.distance.metric", "l2"); - options.setInteger("vector.nlist", 128); - options.setInteger("vector.pq.m", 8); - options.setString("vector.pq.use-opq", "true"); - options.setInteger("vector.hnsw.m", 12); - options.setInteger("vector.hnsw.ef-construction", 64); - 
options.setInteger("vector.hnsw.max-level", 5); - options.setInteger("vector.nprobe", 32); - options.setInteger("vector.hnsw.ef-search", 96); - options.setString("vector.train.sample-ratio", "0.5"); - options.setInteger("vector.add.batch-size", 5000); - - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_HNSW_SQ); - assertThat(indexOptions.dimension()).isEqualTo(64); - assertThat(indexOptions.indexType()).isEqualTo(IndexType.IVF_HNSW_SQ); - assertThat(indexOptions.metric()).isEqualTo(VectorMetric.L2); - assertThat(indexOptions.nlist()).isEqualTo(128); - assertThat(indexOptions.m()).isEqualTo(8); - assertThat(indexOptions.useOpq()).isTrue(); - assertThat(indexOptions.hnswM()).isEqualTo(12); - assertThat(indexOptions.hnswEfConstruction()).isEqualTo(64); - assertThat(indexOptions.hnswMaxLevel()).isEqualTo(5); - assertThat(indexOptions.nprobe()).isEqualTo(32); - assertThat(indexOptions.efSearch()).isEqualTo(96); - assertThat(indexOptions.trainSampleRatio()).isEqualTo(0.5); - assertThat(indexOptions.addBatchSize()).isEqualTo(5000); - assertThat(indexOptions.toNativeOptions(7)) - .containsEntry("index.type", "ivf_hnsw_sq") - .containsEntry("dimension", "64") - .containsEntry("nlist", "7") - .containsEntry("metric", "l2") - .containsEntry("hnsw.m", "12") - .containsEntry("hnsw.ef-construction", "64") - .containsEntry("hnsw.max-level", "5"); - } - - @Test - public void testMDivisibilityValidation() { - Options options = new Options(); - options.setInteger("vector.index.dimension", 10); - options.setInteger("vector.pq.m", 3); - assertThatThrownBy(() -> new VectorIndexOptions(options, IndexType.IVF_PQ)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("must divide"); - } - - @Test - public void testInvalidSampleRatio() { - Options options = new Options(); - options.setString("vector.train.sample-ratio", "0.0"); - assertThatThrownBy(() -> new VectorIndexOptions(options, IndexType.IVF_PQ)) - 
.isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("vector.train.sample-ratio"); - - Options options2 = new Options(); - options2.setString("vector.train.sample-ratio", "1.5"); - assertThatThrownBy(() -> new VectorIndexOptions(options2, IndexType.IVF_PQ)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("vector.train.sample-ratio"); - } - - @Test - public void testMetricParsing() { - for (String metric : new String[] {"l2", "cosine", "inner_product"}) { - Options options = new Options(); - options.setString("vector.distance.metric", metric); - VectorIndexOptions indexOptions = new VectorIndexOptions(options, IndexType.IVF_PQ); - assertThat(indexOptions.metric().getConfigName()).isEqualTo(metric); - } - } - - @Test - public void testMetricParsingRejectsUpperCase() { - Options options = new Options(); - options.setString("vector.distance.metric", "L2"); - assertThatThrownBy(() -> new VectorIndexOptions(options, IndexType.IVF_PQ)) - .isInstanceOf(IllegalArgumentException.class) - .hasMessageContaining("Unknown metric"); - } -} From 460e202d4527e11c1828ddab68f27519f8b98552 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Wed, 10 Jun 2026 23:45:02 +0800 Subject: [PATCH 08/11] [vector] Minimize vector index metadata --- .../vector/index/VectorGlobalIndexReader.java | 28 ++++--- .../vector/index/VectorGlobalIndexWriter.java | 9 --- .../paimon/vector/index/VectorIndexMeta.java | 80 +------------------ .../vector/index/VectorGlobalIndexTest.java | 67 +++------------- 4 files changed, 28 insertions(+), 156 deletions(-) diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java index 76ebdbdc9179..3ab219833fd4 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java +++ 
b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java @@ -24,7 +24,9 @@ import org.apache.paimon.globalindex.GlobalIndexResult; import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; +import org.apache.paimon.index.ivfpq.Metric; import org.apache.paimon.index.ivfpq.VectorIndexInput; +import org.apache.paimon.index.ivfpq.VectorIndexMetadata; import org.apache.paimon.index.ivfpq.VectorIndexReader; import org.apache.paimon.index.ivfpq.VectorSearchResult; import org.apache.paimon.predicate.FieldRef; @@ -59,6 +61,7 @@ public class VectorGlobalIndexReader implements GlobalIndexReader { private final ExecutorService executor; private volatile VectorIndexMeta indexMeta; + private volatile VectorIndexMetadata nativeMeta; private volatile VectorIndexReader vectorReader; private SeekableInputStream openStream; @@ -98,7 +101,7 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep float[] queryVector = vectorSearch.vector().clone(); int limit = vectorSearch.limit(); int nprobe = indexMeta.nprobe(); - String metric = indexMeta.metric(); + Metric metric = nativeMeta.metric(); RoaringNavigableMap64 includeRowIds = vectorSearch.includeRowIds(); VectorSearchResult result; @@ -155,15 +158,17 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep }); } - private static float convertDistanceToScore(float distance, String metric) { - if ("l2".equals(metric)) { - return 1.0f / (1.0f + distance); - } else if ("cosine".equals(metric)) { - return 1.0f - distance; - } else if ("inner_product".equals(metric)) { - return distance; + private static float convertDistanceToScore(float distance, Metric metric) { + switch (metric) { + case L2: + return 1.0f / (1.0f + distance); + case COSINE: + return 1.0f - distance; + case INNER_PRODUCT: + return distance; + default: + throw new IllegalArgumentException("Unknown 
metric: " + metric); } - throw new IllegalArgumentException("Unknown metric: " + metric); } private void validateSearchVector(Object vector) { @@ -183,11 +188,11 @@ private void validateSearchVector(Object vector) { + fieldType); } int queryDim = ((float[]) vector).length; - if (queryDim != indexMeta.dimension()) { + if (queryDim != nativeMeta.dimension()) { throw new IllegalArgumentException( String.format( "Query vector dimension mismatch: index expects %d, but got %d", - indexMeta.dimension(), queryDim)); + nativeMeta.dimension(), queryDim)); } } @@ -200,6 +205,7 @@ private void ensureLoaded() throws IOException { try { vectorReader = new VectorIndexReader(new SeekableStreamVectorIndexInput(in)); + nativeMeta = vectorReader.metadata(); openStream = in; } catch (Exception e) { IOUtils.closeQuietly(in); diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java index e7cf54a01c9b..b8bcf79fb89f 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java @@ -439,15 +439,6 @@ private Map nativeOptions(int effectiveNlist) { private Map metadata() { Map metadata = new LinkedHashMap<>(); - metadata.put(VectorIndexMeta.KEY_INDEX_TYPE, identifier); - metadata.put(VectorIndexMeta.KEY_DIMENSION, String.valueOf(dim)); - metadata.put(VectorIndexMeta.KEY_METRIC, metric); - metadata.put(VectorIndexMeta.KEY_NLIST, String.valueOf(nlist)); - metadata.put(VectorIndexMeta.KEY_M, String.valueOf(pqM)); - metadata.put(VectorIndexMeta.KEY_USE_OPQ, String.valueOf(useOpq)); - metadata.put(VectorIndexMeta.KEY_HNSW_M, String.valueOf(hnswM)); - metadata.put(VectorIndexMeta.KEY_HNSW_EF_CONSTRUCTION, String.valueOf(hnswEfConstruction)); - 
metadata.put(VectorIndexMeta.KEY_HNSW_MAX_LEVEL, String.valueOf(hnswMaxLevel)); metadata.put(VectorIndexMeta.KEY_NPROBE, String.valueOf(nprobe)); metadata.put(VectorIndexMeta.KEY_EF_SEARCH, String.valueOf(efSearch)); return metadata; diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java index 63fbe826e74d..f494639dfe2b 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java @@ -18,8 +18,6 @@ package org.apache.paimon.vector.index; -import org.apache.paimon.index.ivfpq.IndexType; - import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.core.type.TypeReference; import org.apache.paimon.shade.jackson2.com.fasterxml.jackson.databind.ObjectMapper; @@ -31,22 +29,13 @@ /** * Metadata for a vector index file. * - *

<p>Serialized as a flat JSON {@code Map<String, String>} storing the index build parameters - * required for correct search-time behavior. + *

<p>Serialized as a flat JSON {@code Map<String, String>} storing Paimon search parameters that + * are not part of the native vector index file metadata. */ public class VectorIndexMeta implements Serializable { private static final long serialVersionUID = 1L; - static final String KEY_INDEX_TYPE = "index_type"; - static final String KEY_DIMENSION = "dimension"; - static final String KEY_METRIC = "metric"; - static final String KEY_NLIST = "nlist"; - static final String KEY_M = "m"; - static final String KEY_USE_OPQ = "use_opq"; - static final String KEY_HNSW_M = "hnsw_m"; - static final String KEY_HNSW_EF_CONSTRUCTION = "hnsw_ef_construction"; - static final String KEY_HNSW_MAX_LEVEL = "hnsw_max_level"; static final String KEY_NPROBE = "nprobe"; static final String KEY_EF_SEARCH = "ef_search"; @@ -61,47 +50,6 @@ public class VectorIndexMeta implements Serializable { this.params = new LinkedHashMap<>(params); } - public IndexType indexType() { - String value = params.get(KEY_INDEX_TYPE); - if (value == null) { - throw new IllegalArgumentException( - "Missing required key in vector index metadata: " + KEY_INDEX_TYPE); - } - return parseIndexType(value); - } - - public int dimension() { - return Integer.parseInt(params.get(KEY_DIMENSION)); - } - - public String metric() { - return params.get(KEY_METRIC); - } - - public int nlist() { - return Integer.parseInt(params.get(KEY_NLIST)); - } - - public int m() { - return intValue(KEY_M, 0); - } - - public boolean useOpq() { - return Boolean.parseBoolean(params.get(KEY_USE_OPQ)); - } - - public int hnswM() { - return intValue(KEY_HNSW_M, 20); - } - - public int hnswEfConstruction() { - return intValue(KEY_HNSW_EF_CONSTRUCTION, 150); - } - - public int hnswMaxLevel() { - return intValue(KEY_HNSW_MAX_LEVEL, 7); - } - public int nprobe() { return intValue(KEY_NPROBE, 16); } @@ -116,17 +64,6 @@ public byte[] serialize() throws IOException { public static VectorIndexMeta deserialize(byte[] data) throws IOException { Map map = 
OBJECT_MAPPER.readValue(data, MAP_TYPE_REF); - if (!map.containsKey(KEY_DIMENSION)) { - throw new IOException( - "Missing required key in vector index metadata: " + KEY_DIMENSION); - } - if (!map.containsKey(KEY_INDEX_TYPE)) { - throw new IOException( - "Missing required key in vector index metadata: " + KEY_INDEX_TYPE); - } - if (!map.containsKey(KEY_METRIC)) { - throw new IOException("Missing required key in vector index metadata: " + KEY_METRIC); - } return new VectorIndexMeta(map); } @@ -134,17 +71,4 @@ private int intValue(String key, int defaultValue) { String val = params.get(key); return val == null ? defaultValue : Integer.parseInt(val); } - - private static IndexType parseIndexType(String value) { - if (IvfPqAlgorithmVectorGlobalIndexerFactory.IDENTIFIER.equals(value)) { - return IndexType.IVF_PQ; - } else if (IvfFlatVectorGlobalIndexerFactory.IDENTIFIER.equals(value)) { - return IndexType.IVF_FLAT; - } else if (IvfHnswFlatVectorGlobalIndexerFactory.IDENTIFIER.equals(value)) { - return IndexType.IVF_HNSW_FLAT; - } else if (IvfHnswSqVectorGlobalIndexerFactory.IDENTIFIER.equals(value)) { - return IndexType.IVF_HNSW_SQ; - } - throw new IllegalArgumentException("Unknown vector index type: " + value); - } } diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java index 04ad2e730e68..b88b340f575a 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java @@ -64,9 +64,6 @@ public class VectorGlobalIndexTest { private static final String IVF_PQ_IDENTIFIER = IvfPqAlgorithmVectorGlobalIndexerFactory.IDENTIFIER; - private static final String IVF_HNSW_FLAT_IDENTIFIER = - IvfHnswFlatVectorGlobalIndexerFactory.IDENTIFIER; - private 
FileIO fileIO; private Path indexPath; private DataType vectorType; @@ -173,48 +170,24 @@ public void testAllNullReturnsEmpty() { @Test public void testMetaSerializationRoundTrip() throws IOException { Options options = new Options(); - options.setInteger("vector.index.dimension", 32); - options.setString("vector.distance.metric", "cosine"); - options.setInteger("vector.nlist", 64); - options.setInteger("vector.pq.m", 8); - options.setString("vector.pq.use-opq", "true"); options.setInteger("vector.nprobe", 24); + options.setInteger("vector.hnsw.ef-search", 80); - VectorIndexMeta meta = new VectorIndexMeta(metaOptions(IVF_PQ_IDENTIFIER, options)); + VectorIndexMeta meta = new VectorIndexMeta(metaOptions(options)); byte[] serialized = meta.serialize(); VectorIndexMeta deserialized = VectorIndexMeta.deserialize(serialized); - assertThat(deserialized.dimension()).isEqualTo(32); - assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_PQ); - assertThat(deserialized.metric()).isEqualTo("cosine"); - assertThat(deserialized.nlist()).isEqualTo(64); - assertThat(deserialized.m()).isEqualTo(8); - assertThat(deserialized.useOpq()).isTrue(); assertThat(deserialized.nprobe()).isEqualTo(24); + assertThat(deserialized.efSearch()).isEqualTo(80); } @Test - public void testMetaSerializationRoundTripForHnsw() throws IOException { - Options options = new Options(); - options.setInteger("vector.index.dimension", 16); - options.setString("vector.distance.metric", "l2"); - options.setInteger("vector.nlist", 8); - options.setInteger("vector.hnsw.m", 12); - options.setInteger("vector.hnsw.ef-construction", 64); - options.setInteger("vector.hnsw.max-level", 5); - options.setInteger("vector.hnsw.ef-search", 80); - + public void testMetaSerializationDefaults() throws IOException { VectorIndexMeta deserialized = - VectorIndexMeta.deserialize( - new VectorIndexMeta(metaOptions(IVF_HNSW_FLAT_IDENTIFIER, options)) - .serialize()); - - 
assertThat(deserialized.indexType()).isEqualTo(IndexType.IVF_HNSW_FLAT); - assertThat(deserialized.dimension()).isEqualTo(16); - assertThat(deserialized.hnswM()).isEqualTo(12); - assertThat(deserialized.hnswEfConstruction()).isEqualTo(64); - assertThat(deserialized.hnswMaxLevel()).isEqualTo(5); - assertThat(deserialized.efSearch()).isEqualTo(80); + VectorIndexMeta.deserialize(new VectorIndexMeta(new LinkedHashMap<>()).serialize()); + + assertThat(deserialized.nprobe()).isEqualTo(16); + assertThat(deserialized.efSearch()).isEqualTo(0); } // =================== Tests that NEED native library ===================== @@ -394,30 +367,8 @@ private Options createDefaultOptions(int dimension) { return options; } - private Map metaOptions(String indexType, Options options) { + private Map metaOptions(Options options) { Map meta = new LinkedHashMap<>(); - meta.put(VectorIndexMeta.KEY_INDEX_TYPE, indexType); - meta.put( - VectorIndexMeta.KEY_DIMENSION, - String.valueOf(options.getInteger("vector.index.dimension", 128))); - meta.put( - VectorIndexMeta.KEY_METRIC, - options.getString("vector.distance.metric", "inner_product")); - meta.put( - VectorIndexMeta.KEY_NLIST, String.valueOf(options.getInteger("vector.nlist", 256))); - meta.put(VectorIndexMeta.KEY_M, String.valueOf(options.getInteger("vector.pq.m", 16))); - meta.put( - VectorIndexMeta.KEY_USE_OPQ, - String.valueOf(options.getBoolean("vector.pq.use-opq", false))); - meta.put( - VectorIndexMeta.KEY_HNSW_M, - String.valueOf(options.getInteger("vector.hnsw.m", 20))); - meta.put( - VectorIndexMeta.KEY_HNSW_EF_CONSTRUCTION, - String.valueOf(options.getInteger("vector.hnsw.ef-construction", 150))); - meta.put( - VectorIndexMeta.KEY_HNSW_MAX_LEVEL, - String.valueOf(options.getInteger("vector.hnsw.max-level", 7))); meta.put( VectorIndexMeta.KEY_NPROBE, String.valueOf(options.getInteger("vector.nprobe", 16))); From 60373b9093497999759ef3c0bd3a81b36c2fe89b Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 11 Jun 2026 15:04:58 
+0800 Subject: [PATCH 09/11] [vector] Align JNI wrapper with vector index API --- .../IvfFlatVectorGlobalIndexerFactory.java | 6 +-- ...IvfHnswFlatVectorGlobalIndexerFactory.java | 6 +-- .../IvfHnswSqVectorGlobalIndexerFactory.java | 6 +-- ...PqAlgorithmVectorGlobalIndexerFactory.java | 6 +-- .../vector/index/VectorGlobalIndexReader.java | 29 ++++++------ .../vector/index/VectorGlobalIndexWriter.java | 26 +++-------- .../vector/index/VectorGlobalIndexer.java | 5 +-- .../index/VectorGlobalIndexerFactory.java | 3 +- .../paimon/vector/index/VectorIndexType.java | 37 ++++++++++++++++ .../vector/index/VectorGlobalIndexTest.java | 8 ++-- .../index/VectorGlobalIndexerFactoryTest.java | 9 ++-- .../apache/paimon/index/ivfpq/IndexType.java | 44 ------------------- .../org/apache/paimon/index/ivfpq/Metric.java | 34 -------------- .../index/{ivfpq => vector}/NativeLoader.java | 2 +- .../{ivfpq => vector}/VectorIndexInput.java | 2 +- .../VectorIndexMetadata.java | 33 +++++++------- .../{ivfpq => vector}/VectorIndexNative.java | 2 +- .../{ivfpq => vector}/VectorIndexReader.java | 4 +- .../{ivfpq => vector}/VectorIndexWriter.java | 2 +- .../VectorSearchBatchResult.java | 2 +- .../{ivfpq => vector}/VectorSearchResult.java | 2 +- 21 files changed, 97 insertions(+), 171 deletions(-) create mode 100644 paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexType.java delete mode 100644 paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java delete mode 100644 paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java rename paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/{ivfpq => vector}/NativeLoader.java (98%) rename paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/{ivfpq => vector}/VectorIndexInput.java (95%) rename paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/{ivfpq => vector}/VectorIndexMetadata.java (77%) rename 
paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/{ivfpq => vector}/VectorIndexNative.java (98%) rename paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/{ivfpq => vector}/VectorIndexReader.java (98%) rename paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/{ivfpq => vector}/VectorIndexWriter.java (99%) rename paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/{ivfpq => vector}/VectorSearchBatchResult.java (98%) rename paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/{ivfpq => vector}/VectorSearchResult.java (97%) diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java index 8c7f8bd5d9b7..95856b053542 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java @@ -18,8 +18,6 @@ package org.apache.paimon.vector.index; -import org.apache.paimon.index.ivfpq.IndexType; - /** Factory for the {@code ivf-flat} vector index identifier. 
*/ public class IvfFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { @@ -31,7 +29,7 @@ public String identifier() { } @Override - protected IndexType indexType() { - return IndexType.IVF_FLAT; + protected VectorIndexType indexType() { + return VectorIndexType.IVF_FLAT; } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java index 181ef5be7735..e9646b03f6af 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java @@ -18,8 +18,6 @@ package org.apache.paimon.vector.index; -import org.apache.paimon.index.ivfpq.IndexType; - /** Factory for the {@code ivf-hnsw-flat} vector index identifier. 
*/ public class IvfHnswFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { @@ -31,7 +29,7 @@ public String identifier() { } @Override - protected IndexType indexType() { - return IndexType.IVF_HNSW_FLAT; + protected VectorIndexType indexType() { + return VectorIndexType.IVF_HNSW_FLAT; } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java index 3e677d6bdacd..92b14501e76f 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java @@ -18,8 +18,6 @@ package org.apache.paimon.vector.index; -import org.apache.paimon.index.ivfpq.IndexType; - /** Factory for the {@code ivf-hnsw-sq} vector index identifier. 
*/ public class IvfHnswSqVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { @@ -31,7 +29,7 @@ public String identifier() { } @Override - protected IndexType indexType() { - return IndexType.IVF_HNSW_SQ; + protected VectorIndexType indexType() { + return VectorIndexType.IVF_HNSW_SQ; } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java index 649e609fff1b..c1b674e25804 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java @@ -18,8 +18,6 @@ package org.apache.paimon.vector.index; -import org.apache.paimon.index.ivfpq.IndexType; - /** Factory for the {@code ivf-pq} vector index identifier. 
*/ public class IvfPqAlgorithmVectorGlobalIndexerFactory extends VectorGlobalIndexerFactory { @@ -31,7 +29,7 @@ public String identifier() { } @Override - protected IndexType indexType() { - return IndexType.IVF_PQ; + protected VectorIndexType indexType() { + return VectorIndexType.IVF_PQ; } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java index 3ab219833fd4..3f5405f16383 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java @@ -24,11 +24,10 @@ import org.apache.paimon.globalindex.GlobalIndexResult; import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; -import org.apache.paimon.index.ivfpq.Metric; -import org.apache.paimon.index.ivfpq.VectorIndexInput; -import org.apache.paimon.index.ivfpq.VectorIndexMetadata; -import org.apache.paimon.index.ivfpq.VectorIndexReader; -import org.apache.paimon.index.ivfpq.VectorSearchResult; +import org.apache.paimon.index.vector.VectorIndexInput; +import org.apache.paimon.index.vector.VectorIndexMetadata; +import org.apache.paimon.index.vector.VectorIndexReader; +import org.apache.paimon.index.vector.VectorSearchResult; import org.apache.paimon.predicate.FieldRef; import org.apache.paimon.predicate.VectorSearch; import org.apache.paimon.types.ArrayType; @@ -101,7 +100,7 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep float[] queryVector = vectorSearch.vector().clone(); int limit = vectorSearch.limit(); int nprobe = indexMeta.nprobe(); - Metric metric = nativeMeta.metric(); + String metric = nativeMeta.metric(); RoaringNavigableMap64 includeRowIds = 
vectorSearch.includeRowIds(); VectorSearchResult result; @@ -158,17 +157,15 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep }); } - private static float convertDistanceToScore(float distance, Metric metric) { - switch (metric) { - case L2: - return 1.0f / (1.0f + distance); - case COSINE: - return 1.0f - distance; - case INNER_PRODUCT: - return distance; - default: - throw new IllegalArgumentException("Unknown metric: " + metric); + private static float convertDistanceToScore(float distance, String metric) { + if ("l2".equals(metric)) { + return 1.0f / (1.0f + distance); + } else if ("cosine".equals(metric)) { + return 1.0f - distance; + } else if ("inner_product".equals(metric)) { + return distance; } + throw new IllegalArgumentException("Unknown metric: " + metric); } private void validateSearchVector(Object vector) { diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java index b8bcf79fb89f..eec4c08a10ce 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java @@ -24,8 +24,7 @@ import org.apache.paimon.globalindex.GlobalIndexSingletonWriter; import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; -import org.apache.paimon.index.ivfpq.IndexType; -import org.apache.paimon.index.ivfpq.VectorIndexWriter; +import org.apache.paimon.index.vector.VectorIndexWriter; import org.apache.paimon.options.Options; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.DataType; @@ -79,7 +78,7 @@ public class VectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Clos private static final int DEFAULT_ADD_BATCH_SIZE 
= 10000; private final GlobalIndexFileWriter fileWriter; - private final IndexType indexType; + private final VectorIndexType indexType; private final String identifier; private final int dim; private final String metric; @@ -109,7 +108,7 @@ public VectorGlobalIndexWriter( GlobalIndexFileWriter fileWriter, DataType fieldType, Options options, - IndexType indexType, + VectorIndexType indexType, String identifier) { this.fileWriter = fileWriter; this.indexType = indexType; @@ -414,7 +413,7 @@ private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException private Map nativeOptions(int effectiveNlist) { Map nativeOptions = new LinkedHashMap<>(); - nativeOptions.put("index.type", nativeIndexType(indexType)); + nativeOptions.put("index.type", indexType.nativeName()); nativeOptions.put("dimension", String.valueOf(dim)); nativeOptions.put("nlist", String.valueOf(effectiveNlist)); nativeOptions.put("metric", metric); @@ -445,7 +444,7 @@ private Map metadata() { } private void validateOptions() { - if (indexType == IndexType.IVF_PQ && dim % pqM != 0) { + if (indexType == VectorIndexType.IVF_PQ && dim % pqM != 0) { throw new IllegalArgumentException( String.format("vector.pq.m (%d) must divide vector dimension (%d)", pqM, dim)); } @@ -503,21 +502,6 @@ private static double doubleOption(Options options, String key, double defaultVa return options.getDouble(OPTION_PREFIX + key, defaultValue); } - private static String nativeIndexType(IndexType indexType) { - switch (indexType) { - case IVF_FLAT: - return "ivf_flat"; - case IVF_PQ: - return "ivf_pq"; - case IVF_HNSW_FLAT: - return "ivf_hnsw_flat"; - case IVF_HNSW_SQ: - return "ivf_hnsw_sq"; - default: - throw new IllegalArgumentException("Unsupported vector index type: " + indexType); - } - } - private static void ensureAvailable(ByteBuffer readBuf, FileChannel channel, int minBytes) throws IOException { int zeroReadCount = 0; diff --git 
a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java index 608f528ceb1b..8e0664a3a428 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java @@ -24,7 +24,6 @@ import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; -import org.apache.paimon.index.ivfpq.IndexType; import org.apache.paimon.options.Options; import org.apache.paimon.types.DataType; @@ -37,11 +36,11 @@ public class VectorGlobalIndexer implements GlobalIndexer { private final DataType fieldType; private final Options options; - private final IndexType indexType; + private final VectorIndexType indexType; private final String identifier; public VectorGlobalIndexer( - DataType fieldType, Options options, IndexType indexType, String identifier) { + DataType fieldType, Options options, VectorIndexType indexType, String identifier) { this.fieldType = fieldType; this.options = options; this.indexType = Objects.requireNonNull(indexType, "indexType must not be null"); diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java index 5b9ff4540712..add52f855428 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java @@ -20,14 +20,13 @@ import org.apache.paimon.globalindex.GlobalIndexer; import 
org.apache.paimon.globalindex.GlobalIndexerFactory; -import org.apache.paimon.index.ivfpq.IndexType; import org.apache.paimon.options.Options; import org.apache.paimon.types.DataField; /** Factory for creating vector indexes backed by paimon-vector-index. */ public abstract class VectorGlobalIndexerFactory implements GlobalIndexerFactory { - protected abstract IndexType indexType(); + protected abstract VectorIndexType indexType(); @Override public GlobalIndexer create(DataField field, Options options) { diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexType.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexType.java new file mode 100644 index 000000000000..cc18fe2d873a --- /dev/null +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexType.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.vector.index; + +/** Vector index type used by the native paimon-vector-index writer. 
*/ +public enum VectorIndexType { + IVF_FLAT("ivf_flat"), + IVF_PQ("ivf_pq"), + IVF_HNSW_FLAT("ivf_hnsw_flat"), + IVF_HNSW_SQ("ivf_hnsw_sq"); + + private final String nativeName; + + VectorIndexType(String nativeName) { + this.nativeName = nativeName; + } + + public String nativeName() { + return nativeName; + } +} diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java index b88b340f575a..af722a7fc38b 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java @@ -27,8 +27,7 @@ import org.apache.paimon.globalindex.ScoredGlobalIndexResult; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; -import org.apache.paimon.index.ivfpq.IndexType; -import org.apache.paimon.index.ivfpq.NativeLoader; +import org.apache.paimon.index.vector.NativeLoader; import org.apache.paimon.options.Options; import org.apache.paimon.predicate.VectorSearch; import org.apache.paimon.types.ArrayType; @@ -334,7 +333,8 @@ public void testViaIndexer() throws IOException { }; VectorGlobalIndexer indexer = - new VectorGlobalIndexer(vectorType, options, IndexType.IVF_PQ, IVF_PQ_IDENTIFIER); + new VectorGlobalIndexer( + vectorType, options, VectorIndexType.IVF_PQ, IVF_PQ_IDENTIFIER); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = (VectorGlobalIndexWriter) indexer.createWriter(fileWriter); @@ -357,7 +357,7 @@ public void testViaIndexer() throws IOException { private VectorGlobalIndexWriter createIvfPqWriter( GlobalIndexFileWriter fileWriter, DataType fieldType, Options options) { return new VectorGlobalIndexWriter( - fileWriter, fieldType, 
options, IndexType.IVF_PQ, IVF_PQ_IDENTIFIER); + fileWriter, fieldType, options, VectorIndexType.IVF_PQ, IVF_PQ_IDENTIFIER); } private Options createDefaultOptions(int dimension) { diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java index bdf7dbdc66e4..4307c6f40795 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java @@ -19,7 +19,6 @@ package org.apache.paimon.vector.index; import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; -import org.apache.paimon.index.ivfpq.IndexType; import org.junit.jupiter.api.Test; @@ -52,12 +51,12 @@ public void testLoadByIdentifier() { @Test public void testFactoryIndexType() { assertThat(new IvfFlatVectorGlobalIndexerFactory().indexType()) - .isEqualTo(IndexType.IVF_FLAT); + .isEqualTo(VectorIndexType.IVF_FLAT); assertThat(new IvfPqAlgorithmVectorGlobalIndexerFactory().indexType()) - .isEqualTo(IndexType.IVF_PQ); + .isEqualTo(VectorIndexType.IVF_PQ); assertThat(new IvfHnswFlatVectorGlobalIndexerFactory().indexType()) - .isEqualTo(IndexType.IVF_HNSW_FLAT); + .isEqualTo(VectorIndexType.IVF_HNSW_FLAT); assertThat(new IvfHnswSqVectorGlobalIndexerFactory().indexType()) - .isEqualTo(IndexType.IVF_HNSW_SQ); + .isEqualTo(VectorIndexType.IVF_HNSW_SQ); } } diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java deleted file mode 100644 index b61a38794e34..000000000000 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/IndexType.java +++ /dev/null @@ -1,44 +0,0 @@ -// Licensed to the 
Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.paimon.index.ivfpq; - -public enum IndexType { - IVF_FLAT(0), - IVF_PQ(1), - IVF_HNSW_FLAT(2), - IVF_HNSW_SQ(3); - - private final int code; - - IndexType(int code) { - this.code = code; - } - - public int code() { - return code; - } - - static IndexType fromCode(int code) { - for (IndexType type : values()) { - if (type.code == code) { - return type; - } - } - throw new IllegalArgumentException("unknown index type code: " + code); - } -} diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java deleted file mode 100644 index c31dbc2e5f22..000000000000 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/Metric.java +++ /dev/null @@ -1,34 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.paimon.index.ivfpq; - -public enum Metric { - L2(0), - INNER_PRODUCT(1), - COSINE(2); - - private final int code; - - Metric(int code) { - this.code = code; - } - - public int code() { - return code; - } -} diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/NativeLoader.java similarity index 98% rename from paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/NativeLoader.java index 242880096b0b..e667bbcb8fc7 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/NativeLoader.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/NativeLoader.java @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -package org.apache.paimon.index.ivfpq; +package org.apache.paimon.index.vector; import org.apache.paimon.shade.guava30.com.google.common.io.ByteStreams; diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexInput.java similarity index 95% rename from paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexInput.java index 451c88463980..dca4430181c4 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexInput.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexInput.java @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -package org.apache.paimon.index.ivfpq; +package org.apache.paimon.index.vector; public interface VectorIndexInput { diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexMetadata.java similarity index 77% rename from paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexMetadata.java index ec52f115bcfe..4ffd89a4f3de 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexMetadata.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexMetadata.java @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-package org.apache.paimon.index.ivfpq; +package org.apache.paimon.index.vector; public final class VectorIndexMetadata { - private final IndexType indexType; + private final String indexType; private final int dimension; private final int nlist; - private final Metric metric; + private final String metric; private final long totalVectors; private final int pqM; private final int hnswM; @@ -30,19 +30,25 @@ public final class VectorIndexMetadata { private final int hnswMaxLevel; public VectorIndexMetadata( - int indexType, + String indexType, int dimension, int nlist, - int metric, + String metric, long totalVectors, int pqM, int hnswM, int efConstruction, int maxLevel) { - this.indexType = IndexType.fromCode(indexType); + if (indexType == null) { + throw new NullPointerException("indexType"); + } + if (metric == null) { + throw new NullPointerException("metric"); + } + this.indexType = indexType; this.dimension = dimension; this.nlist = nlist; - this.metric = metricFromCode(metric); + this.metric = metric; this.totalVectors = totalVectors; this.pqM = pqM; this.hnswM = hnswM; @@ -50,7 +56,7 @@ public VectorIndexMetadata( this.hnswMaxLevel = maxLevel; } - public IndexType indexType() { + public String indexType() { return indexType; } @@ -62,7 +68,7 @@ public int nlist() { return nlist; } - public Metric metric() { + public String metric() { return metric; } @@ -85,13 +91,4 @@ public int hnswEfConstruction() { public int hnswMaxLevel() { return hnswMaxLevel; } - - private static Metric metricFromCode(int code) { - for (Metric metric : Metric.values()) { - if (metric.code() == code) { - return metric; - } - } - throw new IllegalArgumentException("unknown metric code: " + code); - } } diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexNative.java similarity index 98% rename from 
paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexNative.java index 3e49103d5e3d..b6a5bbf0a75c 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexNative.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexNative.java @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -package org.apache.paimon.index.ivfpq; +package org.apache.paimon.index.vector; final class VectorIndexNative { diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexReader.java similarity index 98% rename from paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexReader.java index b10fb525db26..34eefc7dcada 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexReader.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexReader.java @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-package org.apache.paimon.index.ivfpq; +package org.apache.paimon.index.vector; public final class VectorIndexReader implements AutoCloseable { @@ -54,7 +54,7 @@ public VectorIndexMetadata metadata() { } } - public IndexType indexType() { + public String indexType() { return metadata().indexType(); } diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexWriter.java similarity index 99% rename from paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexWriter.java index a82950d40126..0dda2a3eba18 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorIndexWriter.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorIndexWriter.java @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-package org.apache.paimon.index.ivfpq; +package org.apache.paimon.index.vector; import java.util.Map; diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorSearchBatchResult.java similarity index 98% rename from paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorSearchBatchResult.java index 05ff0ad6a13f..12952e932e7f 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchBatchResult.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorSearchBatchResult.java @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -package org.apache.paimon.index.ivfpq; +package org.apache.paimon.index.vector; import java.util.Arrays; diff --git a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorSearchResult.java similarity index 97% rename from paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java rename to paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorSearchResult.java index b0a1e559046d..870aa49ce177 100644 --- a/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/ivfpq/VectorSearchResult.java +++ b/paimon-vector/paimon-vector-jni/src/main/java/org/apache/paimon/index/vector/VectorSearchResult.java @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-package org.apache.paimon.index.ivfpq; +package org.apache.paimon.index.vector; import java.util.Arrays; From 3adde86708c4aea80d950a341e46cd6ec1eed0a6 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 11 Jun 2026 18:24:44 +0800 Subject: [PATCH 10/11] Fix --- docs/docs/multimodal-table/global-index.mdx | 32 +-- .../apache/paimon/fs/VectoredReadUtils.java | 115 ++++++++- .../paimon/fs/VectoredReadUtilsTest.java | 109 +++++++++ .../IvfFlatVectorGlobalIndexerFactory.java | 5 - ...IvfHnswFlatVectorGlobalIndexerFactory.java | 5 - .../IvfHnswSqVectorGlobalIndexerFactory.java | 5 - ...PqAlgorithmVectorGlobalIndexerFactory.java | 5 - .../vector/index/VectorGlobalIndexReader.java | 114 +++++++-- .../vector/index/VectorGlobalIndexWriter.java | 219 ++---------------- .../vector/index/VectorGlobalIndexer.java | 13 +- .../index/VectorGlobalIndexerFactory.java | 70 +++++- .../paimon/vector/index/VectorIndexMeta.java | 33 +-- .../paimon/vector/index/VectorIndexType.java | 37 --- .../SeekableStreamVectorIndexInputTest.java | 151 ++++++++++++ .../vector/index/VectorGlobalIndexTest.java | 88 +++---- .../index/VectorGlobalIndexerFactoryTest.java | 71 +++++- 16 files changed, 699 insertions(+), 373 deletions(-) delete mode 100644 paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexType.java create mode 100644 paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/SeekableStreamVectorIndexInputTest.java diff --git a/docs/docs/multimodal-table/global-index.mdx b/docs/docs/multimodal-table/global-index.mdx index cc0d08e9333a..747ccbaa9ad0 100644 --- a/docs/docs/multimodal-table/global-index.mdx +++ b/docs/docs/multimodal-table/global-index.mdx @@ -107,32 +107,35 @@ CALL sys.create_global_index( table => 'db.my_table', index_column => 'embedding', index_type => 'ivf-pq', - options => 'vector.distance.metric=cosine,vector.nlist=256,vector.pq.m=16' + options => 'ivf-pq.distance.metric=cosine,ivf-pq.nlist=256,ivf-pq.pq.m=16' ); 
``` -For `ARRAY` vector columns, specify the vector dimension with `vector.index.dimension`. +For `ARRAY` vector columns, specify the vector dimension with `.dimension`. For `VECTOR` columns, Paimon uses the dimension from the column type. Supported vector index options: | Option | Default | Description | |---|---|---| -| `vector.index.dimension` | `128` | Vector dimension for `ARRAY` columns. Ignored for `VECTOR` columns. | -| `vector.distance.metric` | `inner_product` | Distance metric. Supported values: `l2`, `cosine`, `inner_product`. | -| `vector.nlist` | `256` | Number of IVF clusters used during index build. | -| `vector.pq.m` | `16` | Number of PQ sub-vectors for `ivf-pq`. The vector dimension must be divisible by this value. | -| `vector.pq.use-opq` | `false` | Whether to enable OPQ for `ivf-pq`. | -| `vector.hnsw.m` | `20` | HNSW graph out-degree for `ivf-hnsw-flat` and `ivf-hnsw-sq`. | -| `vector.hnsw.ef-construction` | `150` | HNSW construction search width for `ivf-hnsw-flat` and `ivf-hnsw-sq`. | -| `vector.hnsw.max-level` | `7` | Maximum HNSW level for `ivf-hnsw-flat` and `ivf-hnsw-sq`. | -| `vector.nprobe` | `16` | Number of IVF clusters to probe during search. | -| `vector.hnsw.ef-search` | `0` | HNSW search width during search. `0` uses the native library default. | -| `vector.train.sample-ratio` | `1.0` | Ratio of vectors sampled for index training. Must be in `(0, 1.0]`. | -| `vector.add.batch-size` | `10000` | Batch size used when adding vectors to the native index writer. | +| `.dimension` | `128` | Vector dimension for `ARRAY` columns. Ignored for `VECTOR` columns. | +| `.distance.metric` | `inner_product` | Distance metric. Supported values: `l2`, `cosine`, `inner_product`. | +| `.nlist` | `256` | Number of IVF clusters used during index build. | +| `.pq.m` | `16` | Number of PQ sub-vectors for `ivf-pq`. The vector dimension must be divisible by this value. | +| `.pq.use-opq` | `false` | Whether to enable OPQ for `ivf-pq`. 
| +| `.hnsw.m` | `20` | HNSW graph out-degree for `ivf-hnsw-flat` and `ivf-hnsw-sq`. | +| `.hnsw.ef-construction` | `150` | HNSW construction search width for `ivf-hnsw-flat` and `ivf-hnsw-sq`. | +| `.hnsw.max-level` | `7` | Maximum HNSW level for `ivf-hnsw-flat` and `ivf-hnsw-sq`. | **Vector Search** +Search-time options are passed with each vector search request: + +| Option | Default | Description | +|---|---|---| +| `ivf.nprobe` | `16` | Number of IVF clusters to probe during search. | +| `hnsw.ef_search` | `0` | HNSW search width during search. `0` uses the native library default. | + @@ -181,6 +184,7 @@ GlobalIndexResult result = table.newVectorSearchBuilder() .withVector(queryVector) .withLimit(5) .withVectorColumn("embedding") + .withOption("ivf.nprobe", "16") .executeLocal(); // Step 2: Read matching rows using the search result diff --git a/paimon-common/src/main/java/org/apache/paimon/fs/VectoredReadUtils.java b/paimon-common/src/main/java/org/apache/paimon/fs/VectoredReadUtils.java index bda768ef96f6..8f1551bff988 100644 --- a/paimon-common/src/main/java/org/apache/paimon/fs/VectoredReadUtils.java +++ b/paimon-common/src/main/java/org/apache/paimon/fs/VectoredReadUtils.java @@ -51,20 +51,33 @@ public static void readVectored(VectoredReadable readable, List ranges, ReadOptions options) + throws IOException { + if (ranges.isEmpty()) { + return; + } + requireNonNull(readable, "readable is null"); + requireNonNull(options, "options is null"); List sortRanges = validateAndSortRanges(ranges); List combinedRanges = - mergeSortedRanges(sortRanges, readable.minSeekForVectorReads()); + mergeSortedRanges(sortRanges, options.minSeekForVectorReads); - int parallelism = readable.parallelismForVectorReads(); + int parallelism = options.parallelismForVectorReads; - if (combinedRanges.size() == 1 && readable instanceof SeekableInputStream) { + if (options.sequentialReadFallback + && combinedRanges.size() == 1 + && readable instanceof SeekableInputStream) { 
fallbackToReadSequence((SeekableInputStream) readable, sortRanges); return; } BlockingExecutor executor = new BlockingExecutor(IO_THREAD_POOL, parallelism); - long batchSize = readable.batchSizeForVectorReads(); + long batchSize = options.batchSizeForVectorReads; for (CombinedRange combinedRange : combinedRanges) { if (combinedRange.underlying.size() == 1) { FileRange fileRange = combinedRange.underlying.get(0); @@ -76,12 +89,95 @@ public static void readVectored(VectoredReadable readable, List> futures = splitBatches.stream().map(FileRange::getData).collect(Collectors.toList()); CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])) - .thenAcceptAsync( - unused -> copyToFileRanges(combinedRange, futures), IO_THREAD_POOL); + .whenCompleteAsync( + (unused, throwable) -> { + if (throwable == null) { + try { + copyToFileRanges(combinedRange, futures); + } catch (Throwable t) { + completeFileRangesExceptionally(combinedRange, t); + } + } else { + completeFileRangesExceptionally(combinedRange, throwable); + } + }, + IO_THREAD_POOL); } } } + /** Options for vectored reads. 
*/ + public static class ReadOptions { + + private final int minSeekForVectorReads; + private final long batchSizeForVectorReads; + private final int parallelismForVectorReads; + private final boolean sequentialReadFallback; + + public static ReadOptions from(VectoredReadable readable) { + return new ReadOptions( + readable.minSeekForVectorReads(), + readable.batchSizeForVectorReads(), + readable.parallelismForVectorReads(), + true); + } + + public ReadOptions( + int minSeekForVectorReads, + long batchSizeForVectorReads, + int parallelismForVectorReads, + boolean sequentialReadFallback) { + checkArgument( + minSeekForVectorReads >= 0, + "minSeekForVectorReads must be non-negative: %s", + minSeekForVectorReads); + checkArgument( + batchSizeForVectorReads > 0, + "batchSizeForVectorReads must be positive: %s", + batchSizeForVectorReads); + checkArgument( + parallelismForVectorReads > 0, + "parallelismForVectorReads must be positive: %s", + parallelismForVectorReads); + this.minSeekForVectorReads = minSeekForVectorReads; + this.batchSizeForVectorReads = batchSizeForVectorReads; + this.parallelismForVectorReads = parallelismForVectorReads; + this.sequentialReadFallback = sequentialReadFallback; + } + + public ReadOptions withMinSeekForVectorReads(int minSeekForVectorReads) { + return new ReadOptions( + minSeekForVectorReads, + batchSizeForVectorReads, + parallelismForVectorReads, + sequentialReadFallback); + } + + public ReadOptions withBatchSizeForVectorReads(long batchSizeForVectorReads) { + return new ReadOptions( + minSeekForVectorReads, + batchSizeForVectorReads, + parallelismForVectorReads, + sequentialReadFallback); + } + + public ReadOptions withParallelismForVectorReads(int parallelismForVectorReads) { + return new ReadOptions( + minSeekForVectorReads, + batchSizeForVectorReads, + parallelismForVectorReads, + sequentialReadFallback); + } + + public ReadOptions withSequentialReadFallback(boolean sequentialReadFallback) { + return new ReadOptions( + 
minSeekForVectorReads, + batchSizeForVectorReads, + parallelismForVectorReads, + sequentialReadFallback); + } + } + private static void fallbackToReadSequence( SeekableInputStream in, List ranges) throws IOException { for (FileRange range : ranges) { @@ -126,6 +222,13 @@ private static void copyToFileRanges( } } + private static void completeFileRangesExceptionally( + CombinedRange combinedRange, Throwable throwable) { + for (FileRange fileRange : combinedRange.underlying) { + fileRange.getData().completeExceptionally(throwable); + } + } + private static void copyMultiBytesToBytes( List segments, int offset, byte[] bytes, int numBytes) { int remainSize = numBytes; diff --git a/paimon-common/src/test/java/org/apache/paimon/fs/VectoredReadUtilsTest.java b/paimon-common/src/test/java/org/apache/paimon/fs/VectoredReadUtilsTest.java index a3264b08e20c..1cd5476e63e4 100644 --- a/paimon-common/src/test/java/org/apache/paimon/fs/VectoredReadUtilsTest.java +++ b/paimon-common/src/test/java/org/apache/paimon/fs/VectoredReadUtilsTest.java @@ -25,9 +25,14 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; class VectoredReadUtilsTest { @@ -127,4 +132,108 @@ public void testRandom() throws Exception { } doTest(ranges); } + + @Test + public void testReadOptionsCanDisableSequentialReadFallback() throws Exception { + TestSeekableVectoredReadable readable = new TestSeekableVectoredReadable(2); + + List ranges = + Arrays.asList( + FileRange.createFileRange(0, 100), FileRange.createFileRange(150, 100)); + VectoredReadUtils.ReadOptions options = + new VectoredReadUtils.ReadOptions(1000, 100, 2, false); + + 
VectoredReadUtils.readVectored(readable, ranges, options); + assertThat(readable.readsStarted.await(5, TimeUnit.SECONDS)).isTrue(); + readable.finishReads.countDown(); + + for (FileRange range : ranges) { + assertThat(range.getData().get(5, TimeUnit.SECONDS)).hasSize(range.getLength()); + } + assertThat(readable.reads).hasValue(2); + assertThat(readable.sequentialReads).hasValue(0); + assertThat(readable.maxActiveReads).hasValue(2); + } + + @Test + public void testReadOptionsPropagateSplitReadFailure() throws Exception { + VectoredReadable readable = + new VectoredReadable() { + @Override + public int pread(long position, byte[] buffer, int offset, int length) + throws IOException { + throw new IOException("failed"); + } + }; + + List ranges = + Arrays.asList( + FileRange.createFileRange(0, 100), FileRange.createFileRange(150, 100)); + VectoredReadUtils.ReadOptions options = + new VectoredReadUtils.ReadOptions(1000, 100, 2, false); + + VectoredReadUtils.readVectored(readable, ranges, options); + + assertThatThrownBy(() -> ranges.get(0).getData().get(5, TimeUnit.SECONDS)) + .isInstanceOf(ExecutionException.class) + .hasMessageContaining("failed"); + assertThatThrownBy(() -> ranges.get(1).getData().get(5, TimeUnit.SECONDS)) + .isInstanceOf(ExecutionException.class) + .hasMessageContaining("failed"); + } + + private class TestSeekableVectoredReadable extends SeekableInputStream + implements VectoredReadable { + + private final CountDownLatch readsStarted; + private final CountDownLatch finishReads = new CountDownLatch(1); + private final AtomicInteger reads = new AtomicInteger(); + private final AtomicInteger sequentialReads = new AtomicInteger(); + private final AtomicInteger activeReads = new AtomicInteger(); + private final AtomicInteger maxActiveReads = new AtomicInteger(); + + private TestSeekableVectoredReadable(int expectedReads) { + this.readsStarted = new CountDownLatch(expectedReads); + } + + @Override + public void seek(long desired) {} + + @Override + 
public long getPos() { + return 0; + } + + @Override + public int read() throws IOException { + throw new IOException("Sequential read should not be used"); + } + + @Override + public int read(byte[] buffer, int offset, int length) throws IOException { + sequentialReads.incrementAndGet(); + throw new IOException("Sequential read should not be used"); + } + + @Override + public void close() {} + + @Override + public int pread(long position, byte[] buffer, int offset, int length) throws IOException { + int active = activeReads.incrementAndGet(); + maxActiveReads.accumulateAndGet(active, Math::max); + readsStarted.countDown(); + try { + finishReads.await(); + System.arraycopy(bytes, (int) position, buffer, offset, length); + reads.incrementAndGet(); + return length; + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException(e); + } finally { + activeReads.decrementAndGet(); + } + } + } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java index 95856b053542..572c7cf4edb2 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfFlatVectorGlobalIndexerFactory.java @@ -27,9 +27,4 @@ public class IvfFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFactor public String identifier() { return IDENTIFIER; } - - @Override - protected VectorIndexType indexType() { - return VectorIndexType.IVF_FLAT; - } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java index 
e9646b03f6af..159e7af6f1ba 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswFlatVectorGlobalIndexerFactory.java @@ -27,9 +27,4 @@ public class IvfHnswFlatVectorGlobalIndexerFactory extends VectorGlobalIndexerFa public String identifier() { return IDENTIFIER; } - - @Override - protected VectorIndexType indexType() { - return VectorIndexType.IVF_HNSW_FLAT; - } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java index 92b14501e76f..51c72cd8f39c 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfHnswSqVectorGlobalIndexerFactory.java @@ -27,9 +27,4 @@ public class IvfHnswSqVectorGlobalIndexerFactory extends VectorGlobalIndexerFact public String identifier() { return IDENTIFIER; } - - @Override - protected VectorIndexType indexType() { - return VectorIndexType.IVF_HNSW_SQ; - } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java index c1b674e25804..f3932de46ed6 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/IvfPqAlgorithmVectorGlobalIndexerFactory.java @@ -27,9 +27,4 @@ public class IvfPqAlgorithmVectorGlobalIndexerFactory extends 
VectorGlobalIndexe public String identifier() { return IDENTIFIER; } - - @Override - protected VectorIndexType indexType() { - return VectorIndexType.IVF_PQ; - } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java index 3f5405f16383..54f8532a2227 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexReader.java @@ -18,7 +18,10 @@ package org.apache.paimon.vector.index; +import org.apache.paimon.fs.FileRange; import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.fs.VectoredReadUtils; +import org.apache.paimon.fs.VectoredReadable; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReader; import org.apache.paimon.globalindex.GlobalIndexResult; @@ -38,8 +41,11 @@ import org.apache.paimon.utils.RoaringNavigableMap64; import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; @@ -54,12 +60,18 @@ */ public class VectorGlobalIndexReader implements GlobalIndexReader { + private static final String NPROBE_PARAMETER = "ivf.nprobe"; + private static final String EF_SEARCH_PARAMETER = "hnsw.ef_search"; + private static final int DEFAULT_NPROBE = 16; + private static final int DEFAULT_EF_SEARCH = 0; + private static final int VECTOR_INDEX_MIN_SEEK_FOR_VECTOR_READS = 16 * 1024; + private static final int VECTOR_INDEX_PARALLELISM_FOR_VECTOR_READS = 32; + private final GlobalIndexIOMeta ioMeta; private final GlobalIndexFileReader fileReader; private 
final DataType fieldType; private final ExecutorService executor; - private volatile VectorIndexMeta indexMeta; private volatile VectorIndexMetadata nativeMeta; private volatile VectorIndexReader vectorReader; private SeekableInputStream openStream; @@ -99,7 +111,8 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep validateSearchVector(vectorSearch.vector()); float[] queryVector = vectorSearch.vector().clone(); int limit = vectorSearch.limit(); - int nprobe = indexMeta.nprobe(); + int nprobe = nprobe(vectorSearch.options()); + int efSearch = efSearch(vectorSearch.options()); String metric = nativeMeta.metric(); RoaringNavigableMap64 includeRowIds = vectorSearch.includeRowIds(); @@ -112,11 +125,9 @@ private ScoredGlobalIndexResult search(VectorSearch vectorSearch) throws IOExcep } byte[] filterBytes = includeRowIds.serialize(); int effectiveK = (int) Math.min(limit, cardinality); - result = - vectorReader.search( - queryVector, effectiveK, nprobe, indexMeta.efSearch(), filterBytes); + result = vectorReader.search(queryVector, effectiveK, nprobe, efSearch, filterBytes); } else { - result = vectorReader.search(queryVector, limit, nprobe, indexMeta.efSearch()); + result = vectorReader.search(queryVector, limit, nprobe, efSearch); } long[] ids = result.ids(); @@ -168,6 +179,27 @@ private static float convertDistanceToScore(float distance, String metric) { throw new IllegalArgumentException("Unknown metric: " + metric); } + static int nprobe(Map parameters) { + return intParameter(parameters, NPROBE_PARAMETER, DEFAULT_NPROBE); + } + + static int efSearch(Map parameters) { + return intParameter(parameters, EF_SEARCH_PARAMETER, DEFAULT_EF_SEARCH); + } + + private static int intParameter(Map parameters, String key, int defaultValue) { + String value = parameters.get(key); + if (value == null) { + return defaultValue; + } + try { + return Integer.parseInt(value); + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + 
"Invalid value for '" + key + "': " + value + ". Must be an integer.", e); + } + } + private void validateSearchVector(Object vector) { if (!(vector instanceof float[])) { throw new IllegalArgumentException( @@ -197,7 +229,6 @@ private void ensureLoaded() throws IOException { if (vectorReader == null) { synchronized (this) { if (vectorReader == null) { - indexMeta = VectorIndexMeta.deserialize(ioMeta.metadata()); SeekableInputStream in = fileReader.getInputStream(ioMeta); try { vectorReader = @@ -251,16 +282,16 @@ public void close() throws IOException { } } - private static class SeekableStreamVectorIndexInput implements VectorIndexInput { + static class SeekableStreamVectorIndexInput implements VectorIndexInput { private final SeekableInputStream input; - private SeekableStreamVectorIndexInput(SeekableInputStream input) { + SeekableStreamVectorIndexInput(SeekableInputStream input) { this.input = input; } @Override - public synchronized void pread(long[] positions, byte[][] buffers) { + public void pread(long[] positions, byte[][] buffers) { if (positions.length != buffers.length) { throw new IllegalArgumentException( "positions length " @@ -269,15 +300,47 @@ public synchronized void pread(long[] positions, byte[][] buffers) { + buffers.length); } try { - for (int i = 0; i < positions.length; i++) { - input.seek(positions[i]); - readFully(input, buffers[i]); + if (input instanceof VectoredReadable + && areRangesNonOverlapping(positions, buffers)) { + preadVectored((VectoredReadable) input, positions, buffers); + } else { + synchronized (this) { + preadSequential(positions, buffers); + } } } catch (IOException e) { throw new RuntimeException("Failed to read vector index", e); } } + private void preadVectored(VectoredReadable readable, long[] positions, byte[][] buffers) + throws IOException { + List ranges = new ArrayList<>(positions.length); + for (int i = 0; i < positions.length; i++) { + ranges.add(FileRange.createFileRange(positions[i], buffers[i].length)); + } 
+ + VectoredReadUtils.ReadOptions options = + VectoredReadUtils.ReadOptions.from(readable) + .withMinSeekForVectorReads(VECTOR_INDEX_MIN_SEEK_FOR_VECTOR_READS) + .withParallelismForVectorReads( + VECTOR_INDEX_PARALLELISM_FOR_VECTOR_READS) + .withSequentialReadFallback(false); + VectoredReadUtils.readVectored(readable, ranges, options); + + for (int i = 0; i < ranges.size(); i++) { + byte[] bytes = ranges.get(i).getData().join(); + System.arraycopy(bytes, 0, buffers[i], 0, bytes.length); + } + } + + private void preadSequential(long[] positions, byte[][] buffers) throws IOException { + for (int i = 0; i < positions.length; i++) { + input.seek(positions[i]); + readFully(input, buffers[i]); + } + } + private static void readFully(SeekableInputStream input, byte[] buffer) throws IOException { int offset = 0; while (offset < buffer.length) { @@ -288,6 +351,31 @@ private static void readFully(SeekableInputStream input, byte[] buffer) throws I offset += read; } } + + private static boolean areRangesNonOverlapping(long[] positions, byte[][] buffers) { + if (positions.length < 2) { + return true; + } + + List indexes = new ArrayList<>(positions.length); + for (int i = 0; i < positions.length; i++) { + indexes.add(i); + } + indexes.sort(Comparator.comparingLong(index -> positions[index])); + + boolean hasPrevious = false; + long previousEnd = 0; + for (int index : indexes) { + long offset = positions[index]; + long end = offset + buffers[index].length; + if (end < offset || (hasPrevious && offset < previousEnd)) { + return false; + } + previousEnd = end; + hasPrevious = true; + } + return true; + } } // =================== unsupported ===================== diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java index eec4c08a10ce..29459d056828 100644 --- 
a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java @@ -25,7 +25,6 @@ import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; import org.apache.paimon.index.vector.VectorIndexWriter; -import org.apache.paimon.options.Options; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.DataType; import org.apache.paimon.types.FloatType; @@ -42,17 +41,15 @@ import java.nio.ByteOrder; import java.nio.channels.FileChannel; import java.util.Collections; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Random; /** * Vector global index writer using paimon-vector-index. * *

<p>Vectors are spilled to a temporary file on disk as they arrive via {@link #write(Object)}, * keeping Java heap usage constant (~8 MB buffer). During index build, vectors are read back for - * training (with optional reservoir sampling) and batch insertion. + * training and batch insertion. * *

<p>Thread safety: This class is not thread-safe. */ @@ -63,35 +60,12 @@ public class VectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Clos private static final Logger LOG = LoggerFactory.getLogger(VectorGlobalIndexWriter.class); private static final int IO_BUFFER_SIZE = 8 * 1024 * 1024; - private static final String OPTION_PREFIX = "vector."; - private static final int DEFAULT_DIMENSION = 128; - private static final String DEFAULT_METRIC = "inner_product"; - private static final int DEFAULT_NLIST = 256; - private static final int DEFAULT_PQ_M = 16; - private static final boolean DEFAULT_USE_OPQ = false; - private static final int DEFAULT_HNSW_M = 20; - private static final int DEFAULT_HNSW_EF_CONSTRUCTION = 150; - private static final int DEFAULT_HNSW_MAX_LEVEL = 7; - private static final int DEFAULT_NPROBE = 16; - private static final int DEFAULT_EF_SEARCH = 0; - private static final double DEFAULT_TRAIN_SAMPLE_RATIO = 1.0; - private static final int DEFAULT_ADD_BATCH_SIZE = 10000; + private static final int ADD_BATCH_SIZE = 10000; private final GlobalIndexFileWriter fileWriter; - private final VectorIndexType indexType; private final String identifier; + private final Map nativeOptions; private final int dim; - private final String metric; - private final int nlist; - private final int pqM; - private final boolean useOpq; - private final int hnswM; - private final int hnswEfConstruction; - private final int hnswMaxLevel; - private final int nprobe; - private final int efSearch; - private final double trainSampleRatio; - private final int addBatchSize; private File tempVectorFile; private FileChannel writeChannel; @@ -107,34 +81,18 @@ public class VectorGlobalIndexWriter implements GlobalIndexSingletonWriter, Clos public VectorGlobalIndexWriter( GlobalIndexFileWriter fileWriter, DataType fieldType, - Options options, - VectorIndexType indexType, + Map options, String identifier) { this.fileWriter = fileWriter; - this.indexType = indexType;
this.identifier = identifier; - this.dim = dimension(fieldType, options); - this.metric = stringOption(options, "distance.metric", DEFAULT_METRIC); - this.nlist = positiveIntOption(options, "nlist", DEFAULT_NLIST); - this.pqM = positiveIntOption(options, "pq.m", DEFAULT_PQ_M); - this.useOpq = booleanOption(options, "pq.use-opq", DEFAULT_USE_OPQ); - this.hnswM = positiveIntOption(options, "hnsw.m", DEFAULT_HNSW_M); - this.hnswEfConstruction = - positiveIntOption(options, "hnsw.ef-construction", DEFAULT_HNSW_EF_CONSTRUCTION); - this.hnswMaxLevel = positiveIntOption(options, "hnsw.max-level", DEFAULT_HNSW_MAX_LEVEL); - this.nprobe = positiveIntOption(options, "nprobe", DEFAULT_NPROBE); - this.efSearch = nonNegativeIntOption(options, "hnsw.ef-search", DEFAULT_EF_SEARCH); - this.trainSampleRatio = - doubleOption(options, "train.sample-ratio", DEFAULT_TRAIN_SAMPLE_RATIO); - this.addBatchSize = positiveIntOption(options, "add.batch-size", DEFAULT_ADD_BATCH_SIZE); + validateFieldType(fieldType); + this.nativeOptions = options; + this.dim = Integer.parseInt(options.get("dimension")); this.count = 0; this.closed = false; this.recordSizeInBytes = checkedRecordSize(dim, IO_BUFFER_SIZE); this.vectorBuf = new float[dim]; - validateFieldType(fieldType); - validateOptions(); - try { this.tempVectorFile = File.createTempFile("paimon-vector-index-vectors-", ".bin"); this.tempVectorFile.deleteOnExit(); @@ -261,22 +219,14 @@ public List finish() { } private ResultEntry buildIndex() throws IOException { - int effectiveNlist = (int) Math.min(nlist, count); - - LOG.info( - "{} vector index build started: {} vectors, dim={}, nlist={}, metric={}", - identifier, - count, - dim, - effectiveNlist, - metric); + LOG.info("{} vector index build started: {} vectors, dim={}", identifier, count, dim); long buildStart = System.currentTimeMillis(); - try (VectorIndexWriter writer = new VectorIndexWriter(nativeOptions(effectiveNlist))) { + try (VectorIndexWriter writer = new 
VectorIndexWriter(nativeOptions)) { // Phase 1: Train long phaseStart = System.currentTimeMillis(); - LOG.info("{} train phase started (sample_ratio={})", identifier, trainSampleRatio); + LOG.info("{} train phase started", identifier); trainFromTempFile(writer); LOG.info( "{} train phase done in {} ms", @@ -310,7 +260,7 @@ private ResultEntry buildIndex() throws IOException { identifier, System.currentTimeMillis() - buildStart); - VectorIndexMeta meta = new VectorIndexMeta(metadata()); + VectorIndexMeta meta = new VectorIndexMeta(); return new ResultEntry(fileName, logicalRowId, meta.serialize()); } } @@ -320,16 +270,8 @@ private String fileNamePrefix() { } private void trainFromTempFile(VectorIndexWriter writer) throws IOException { - int minTrainSize = (int) Math.min(count, Math.max(nlist * 39L, 256)); - int sampleCount; - if (trainSampleRatio >= 1.0) { - sampleCount = (int) count; - } else { - sampleCount = Math.max((int) (count * trainSampleRatio), minTrainSize); - sampleCount = (int) Math.min(sampleCount, count); - } - - float[] trainData = new float[sampleCount * dim]; + int trainCount = (int) count; + float[] trainData = new float[trainCount * dim]; try (RandomAccessFile raf = new RandomAccessFile(tempVectorFile, "r"); FileChannel channel = raf.getChannel()) { @@ -337,47 +279,21 @@ private void trainFromTempFile(VectorIndexWriter writer) throws IOException { readBuf.order(ByteOrder.nativeOrder()); readBuf.limit(0); - if (sampleCount == (int) count) { - // Read all vectors - for (int i = 0; i < sampleCount; i++) { - ensureAvailable(readBuf, channel, recordSizeInBytes); - readBuf.getLong(); // skip rowId - for (int d = 0; d < dim; d++) { - trainData[i * dim + d] = readBuf.getFloat(); - } - } - } else { - // Reservoir sampling - Random rng = new Random(42); - int collected = 0; - for (long i = 0; i < count; i++) { - ensureAvailable(readBuf, channel, recordSizeInBytes); - readBuf.getLong(); // skip rowId - if (collected < sampleCount) { - for (int d = 0; d < 
dim; d++) { - trainData[collected * dim + d] = readBuf.getFloat(); - } - collected++; - } else { - int j = rng.nextInt((int) (i + 1)); - if (j < sampleCount) { - for (int d = 0; d < dim; d++) { - trainData[j * dim + d] = readBuf.getFloat(); - } - } else { - readBuf.position(readBuf.position() + dim * Float.BYTES); - } - } + for (int i = 0; i < trainCount; i++) { + ensureAvailable(readBuf, channel, recordSizeInBytes); + readBuf.getLong(); // skip rowId + for (int d = 0; d < dim; d++) { + trainData[i * dim + d] = readBuf.getFloat(); } } } - writer.train(trainData, sampleCount); + writer.train(trainData, trainCount); } private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException { - long[] batchIds = new long[addBatchSize]; - float[] batchVectors = new float[addBatchSize * dim]; + long[] batchIds = new long[ADD_BATCH_SIZE]; + float[] batchVectors = new float[ADD_BATCH_SIZE * dim]; try (RandomAccessFile raf = new RandomAccessFile(tempVectorFile, "r"); FileChannel channel = raf.getChannel()) { @@ -389,7 +305,7 @@ private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException int lastLoggedPercent = -1; while (remaining > 0) { - int thisBatch = (int) Math.min(addBatchSize, remaining); + int thisBatch = (int) Math.min(ADD_BATCH_SIZE, remaining); for (int i = 0; i < thisBatch; i++) { ensureAvailable(readBuf, channel, recordSizeInBytes); batchIds[i] = readBuf.getLong(); @@ -411,97 +327,6 @@ private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException } } - private Map nativeOptions(int effectiveNlist) { - Map nativeOptions = new LinkedHashMap<>(); - nativeOptions.put("index.type", indexType.nativeName()); - nativeOptions.put("dimension", String.valueOf(dim)); - nativeOptions.put("nlist", String.valueOf(effectiveNlist)); - nativeOptions.put("metric", metric); - switch (indexType) { - case IVF_FLAT: - break; - case IVF_PQ: - nativeOptions.put("pq.m", String.valueOf(pqM)); - nativeOptions.put("use-opq", 
String.valueOf(useOpq)); - break; - case IVF_HNSW_FLAT: - case IVF_HNSW_SQ: - nativeOptions.put("hnsw.m", String.valueOf(hnswM)); - nativeOptions.put("hnsw.ef-construction", String.valueOf(hnswEfConstruction)); - nativeOptions.put("hnsw.max-level", String.valueOf(hnswMaxLevel)); - break; - default: - throw new IllegalArgumentException("Unsupported vector index type: " + indexType); - } - return nativeOptions; - } - - private Map metadata() { - Map metadata = new LinkedHashMap<>(); - metadata.put(VectorIndexMeta.KEY_NPROBE, String.valueOf(nprobe)); - metadata.put(VectorIndexMeta.KEY_EF_SEARCH, String.valueOf(efSearch)); - return metadata; - } - - private void validateOptions() { - if (indexType == VectorIndexType.IVF_PQ && dim % pqM != 0) { - throw new IllegalArgumentException( - String.format("vector.pq.m (%d) must divide vector dimension (%d)", pqM, dim)); - } - if (trainSampleRatio <= 0 || trainSampleRatio > 1.0) { - throw new IllegalArgumentException( - String.format( - "vector.train.sample-ratio must be in (0, 1.0], but got %f", - trainSampleRatio)); - } - } - - private static int dimension(DataType fieldType, Options options) { - if (fieldType instanceof VectorType) { - return ((VectorType) fieldType).getLength(); - } - return positiveIntOption(options, "index.dimension", DEFAULT_DIMENSION); - } - - private static String stringOption(Options options, String key, String defaultValue) { - String value = options.get(OPTION_PREFIX + key); - return value == null ? defaultValue : value; - } - - private static int positiveIntOption(Options options, String key, int defaultValue) { - int value = options.getInteger(OPTION_PREFIX + key, defaultValue); - if (value <= 0) { - throw new IllegalArgumentException( - "Invalid value for 'vector." - + key - + "': " - + value - + ". 
Must be a positive integer."); - } - return value; - } - - private static int nonNegativeIntOption(Options options, String key, int defaultValue) { - int value = options.getInteger(OPTION_PREFIX + key, defaultValue); - if (value < 0) { - throw new IllegalArgumentException( - "Invalid value for 'vector." - + key - + "': " - + value - + ". Must be a non-negative integer."); - } - return value; - } - - private static boolean booleanOption(Options options, String key, boolean defaultValue) { - return options.getBoolean(OPTION_PREFIX + key, defaultValue); - } - - private static double doubleOption(Options options, String key, double defaultValue) { - return options.getDouble(OPTION_PREFIX + key, defaultValue); - } - private static void ensureAvailable(ByteBuffer readBuf, FileChannel channel, int minBytes) throws IOException { int zeroReadCount = 0; diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java index 8e0664a3a428..a3f3bf51fb9a 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexer.java @@ -24,10 +24,10 @@ import org.apache.paimon.globalindex.GlobalIndexer; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.globalindex.io.GlobalIndexFileWriter; -import org.apache.paimon.options.Options; import org.apache.paimon.types.DataType; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.concurrent.ExecutorService; @@ -35,21 +35,18 @@ public class VectorGlobalIndexer implements GlobalIndexer { private final DataType fieldType; - private final Options options; - private final VectorIndexType indexType; + private final Map options; private final String identifier; - public 
VectorGlobalIndexer( - DataType fieldType, Options options, VectorIndexType indexType, String identifier) { + public VectorGlobalIndexer(DataType fieldType, Map options, String identifier) { this.fieldType = fieldType; - this.options = options; - this.indexType = Objects.requireNonNull(indexType, "indexType must not be null"); + this.options = Objects.requireNonNull(options, "options must not be null"); this.identifier = Objects.requireNonNull(identifier, "identifier must not be null"); } @Override public GlobalIndexWriter createWriter(GlobalIndexFileWriter fileWriter) { - return new VectorGlobalIndexWriter(fileWriter, fieldType, options, indexType, identifier); + return new VectorGlobalIndexWriter(fileWriter, fieldType, options, identifier); } @Override diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java index add52f855428..354c967b56ad 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactory.java @@ -22,14 +22,80 @@ import org.apache.paimon.globalindex.GlobalIndexerFactory; import org.apache.paimon.options.Options; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.VectorType; + +import java.util.LinkedHashMap; +import java.util.Map; /** Factory for creating vector indexes backed by paimon-vector-index. 
*/ public abstract class VectorGlobalIndexerFactory implements GlobalIndexerFactory { - protected abstract VectorIndexType indexType(); + private static final int DEFAULT_DIMENSION = 128; @Override public GlobalIndexer create(DataField field, Options options) { - return new VectorGlobalIndexer(field.type(), options, indexType(), identifier()); + String identifier = identifier(); + return new VectorGlobalIndexer( + field.type(), nativeOptions(field.type(), options, identifier), identifier); + } + + static Map nativeOptions( + DataType fieldType, Options tableOptions, String identifier) { + Map nativeOptions = new LinkedHashMap<>(); + String optionPrefix = identifier + "."; + for (Map.Entry entry : tableOptions.toMap().entrySet()) { + String optionKey = entry.getKey(); + if (optionKey.startsWith(optionPrefix)) { + String nativeKey = nativeOptionKey(optionKey.substring(optionPrefix.length())); + if (nativeKey != null) { + nativeOptions.put(nativeKey, entry.getValue()); + } + } + } + nativeOptions.put("index.type", identifier.replace('-', '_')); + nativeOptions.put( + "dimension", String.valueOf(dimension(fieldType, nativeOptions, identifier))); + return nativeOptions; + } + + private static String nativeOptionKey(String optionKey) { + switch (optionKey) { + case "index.dimension": + case "dimension": + return "dimension"; + case "distance.metric": + case "metric": + return "metric"; + case "nlist": + case "pq.m": + case "hnsw.m": + case "hnsw.ef-construction": + case "hnsw.max-level": + return optionKey; + case "pq.use-opq": + case "use-opq": + return "use-opq"; + default: + return null; + } + } + + private static int dimension( + DataType fieldType, Map nativeOptions, String identifier) { + if (fieldType instanceof VectorType) { + return ((VectorType) fieldType).getLength(); + } + String dimension = nativeOptions.get("dimension"); + int value = dimension == null ? 
DEFAULT_DIMENSION : Integer.parseInt(dimension); + if (value <= 0) { + throw new IllegalArgumentException( + "Invalid value for '" + + identifier + + ".dimension': " + + value + + ". Must be a positive integer."); + } + return value; } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java index f494639dfe2b..ea18e7efebc9 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexMeta.java @@ -23,52 +23,33 @@ import java.io.IOException; import java.io.Serializable; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; /** * Metadata for a vector index file. * - *

    Serialized as a flat JSON {@code Map} storing Paimon search parameters that - * are not part of the native vector index file metadata. + *

    Serialized as an empty JSON {@code Map}. Search-time parameters are passed + * through {@link org.apache.paimon.predicate.VectorSearch#options()}. */ public class VectorIndexMeta implements Serializable { private static final long serialVersionUID = 1L; - static final String KEY_NPROBE = "nprobe"; - static final String KEY_EF_SEARCH = "ef_search"; - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final TypeReference> MAP_TYPE_REF = new TypeReference>() {}; - private final Map params; - - VectorIndexMeta(Map params) { - this.params = new LinkedHashMap<>(params); - } - - public int nprobe() { - return intValue(KEY_NPROBE, 16); - } - - public int efSearch() { - return intValue(KEY_EF_SEARCH, 0); - } + VectorIndexMeta() {} public byte[] serialize() throws IOException { - return OBJECT_MAPPER.writeValueAsBytes(params); + return OBJECT_MAPPER.writeValueAsBytes(Collections.emptyMap()); } public static VectorIndexMeta deserialize(byte[] data) throws IOException { - Map map = OBJECT_MAPPER.readValue(data, MAP_TYPE_REF); - return new VectorIndexMeta(map); - } - - private int intValue(String key, int defaultValue) { - String val = params.get(key); - return val == null ? defaultValue : Integer.parseInt(val); + Map ignored = OBJECT_MAPPER.readValue(data, MAP_TYPE_REF); + return new VectorIndexMeta(); } } diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexType.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexType.java deleted file mode 100644 index cc18fe2d873a..000000000000 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorIndexType.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.paimon.vector.index; - -/** Vector index type used by the native paimon-vector-index writer. */ -public enum VectorIndexType { - IVF_FLAT("ivf_flat"), - IVF_PQ("ivf_pq"), - IVF_HNSW_FLAT("ivf_hnsw_flat"), - IVF_HNSW_SQ("ivf_hnsw_sq"); - - private final String nativeName; - - VectorIndexType(String nativeName) { - this.nativeName = nativeName; - } - - public String nativeName() { - return nativeName; - } -} diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/SeekableStreamVectorIndexInputTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/SeekableStreamVectorIndexInputTest.java new file mode 100644 index 000000000000..ddb29eab6e97 --- /dev/null +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/SeekableStreamVectorIndexInputTest.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.vector.index; + +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.fs.VectoredReadable; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link VectorGlobalIndexReader.SeekableStreamVectorIndexInput}. */ +public class SeekableStreamVectorIndexInputTest { + + @Test + public void testVectoredReadableInputUsesParallelPositionReads() throws Exception { + byte[] data = data(128 * 1024); + TestVectoredSeekableInputStream input = new TestVectoredSeekableInputStream(data, 2); + VectorGlobalIndexReader.SeekableStreamVectorIndexInput indexInput = + new VectorGlobalIndexReader.SeekableStreamVectorIndexInput(input); + + byte[][] buffers = new byte[][] {new byte[64], new byte[64]}; + indexInput.pread(new long[] {0, 32 * 1024}, buffers); + + assertThat(buffers[0]).isEqualTo(slice(data, 0, 64)); + assertThat(buffers[1]).isEqualTo(slice(data, 32 * 1024, 64)); + assertThat(input.positionReads).hasValue(2); + assertThat(input.sequentialReads).hasValue(0); + assertThat(input.maxActiveReads).hasValue(2); + } + + @Test + public void testFallbackToSequentialReadWhenRangesOverlap() { + byte[] data = data(1024); + TestVectoredSeekableInputStream input = new TestVectoredSeekableInputStream(data, 0); + VectorGlobalIndexReader.SeekableStreamVectorIndexInput indexInput = + new 
VectorGlobalIndexReader.SeekableStreamVectorIndexInput(input); + + byte[][] buffers = new byte[][] {new byte[64], new byte[64]}; + indexInput.pread(new long[] {0, 32}, buffers); + + assertThat(buffers[0]).isEqualTo(slice(data, 0, 64)); + assertThat(buffers[1]).isEqualTo(slice(data, 32, 64)); + assertThat(input.positionReads).hasValue(0); + assertThat(input.sequentialReads).hasValue(2); + } + + private static byte[] data(int length) { + byte[] data = new byte[length]; + for (int i = 0; i < length; i++) { + data[i] = (byte) i; + } + return data; + } + + private static byte[] slice(byte[] data, int offset, int length) { + byte[] expected = new byte[length]; + System.arraycopy(data, offset, expected, 0, length); + return expected; + } + + private static class TestVectoredSeekableInputStream extends SeekableInputStream + implements VectoredReadable { + + private final byte[] data; + private final CountDownLatch readsStarted; + private final CountDownLatch finishReads = new CountDownLatch(1); + private final AtomicInteger activeReads = new AtomicInteger(); + private final AtomicInteger positionReads = new AtomicInteger(); + private final AtomicInteger sequentialReads = new AtomicInteger(); + private final AtomicInteger maxActiveReads = new AtomicInteger(); + + private int position; + + private TestVectoredSeekableInputStream(byte[] data, int expectedPositionReads) { + this.data = data; + this.readsStarted = new CountDownLatch(expectedPositionReads); + if (expectedPositionReads == 0) { + finishReads.countDown(); + } + } + + @Override + public void seek(long desired) { + position = (int) desired; + } + + @Override + public long getPos() { + return position; + } + + @Override + public int read() { + return data[position++]; + } + + @Override + public int read(byte[] buffer, int offset, int length) { + System.arraycopy(data, position, buffer, offset, length); + position += length; + sequentialReads.incrementAndGet(); + return length; + } + + @Override + public void close() 
{} + + @Override + public int pread(long position, byte[] buffer, int offset, int length) throws IOException { + int active = activeReads.incrementAndGet(); + maxActiveReads.accumulateAndGet(active, Math::max); + readsStarted.countDown(); + try { + if (!readsStarted.await(5, TimeUnit.SECONDS)) { + throw new IOException("Timed out waiting for parallel vector index reads"); + } + finishReads.countDown(); + System.arraycopy(data, (int) position, buffer, offset, length); + positionReads.incrementAndGet(); + return length; + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException(e); + } finally { + activeReads.decrementAndGet(); + } + } + } +} diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java index af722a7fc38b..73f8a4b28f49 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexTest.java @@ -44,9 +44,10 @@ import org.junit.jupiter.api.io.TempDir; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Collections; -import java.util.LinkedHashMap; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.UUID; @@ -114,7 +115,7 @@ public void testDimensionMismatch() { public void testVectorTypeRejectsNonFloatElement() { DataType intVecType = new VectorType(2, new IntType()); Options options = createDefaultOptions(2); - options.setInteger("vector.pq.m", 1); + options.setInteger("ivf-pq.pq.m", 1); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); assertThatThrownBy(() -> createIvfPqWriter(fileWriter, intVecType, options)) @@ -125,7 +126,7 @@ public void testVectorTypeRejectsNonFloatElement() { 
@Test public void testNanInVectorRejected() { Options options = createDefaultOptions(2); - options.setInteger("vector.pq.m", 1); + options.setInteger("ivf-pq.pq.m", 1); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); @@ -139,7 +140,7 @@ public void testNanInVectorRejected() { @Test public void testInfinityInVectorRejected() { Options options = createDefaultOptions(2); - options.setInteger("vector.pq.m", 1); + options.setInteger("ivf-pq.pq.m", 1); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); @@ -154,7 +155,7 @@ public void testInfinityInVectorRejected() { @Test public void testAllNullReturnsEmpty() { Options options = createDefaultOptions(2); - options.setInteger("vector.pq.m", 1); + options.setInteger("ivf-pq.pq.m", 1); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = createIvfPqWriter(fileWriter, vectorType, options); @@ -167,26 +168,36 @@ public void testAllNullReturnsEmpty() { } @Test - public void testMetaSerializationRoundTrip() throws IOException { - Options options = new Options(); - options.setInteger("vector.nprobe", 24); - options.setInteger("vector.hnsw.ef-search", 80); - - VectorIndexMeta meta = new VectorIndexMeta(metaOptions(options)); + public void testMetaSerializationIsEmptyMap() throws IOException { + VectorIndexMeta meta = new VectorIndexMeta(); byte[] serialized = meta.serialize(); VectorIndexMeta deserialized = VectorIndexMeta.deserialize(serialized); - assertThat(deserialized.nprobe()).isEqualTo(24); - assertThat(deserialized.efSearch()).isEqualTo(80); + assertThat(new String(serialized, StandardCharsets.UTF_8)).isEqualTo("{}"); + assertThat(new String(deserialized.serialize(), StandardCharsets.UTF_8)).isEqualTo("{}"); } @Test - public void testMetaSerializationDefaults() throws IOException { - 
VectorIndexMeta deserialized = - VectorIndexMeta.deserialize(new VectorIndexMeta(new LinkedHashMap<>()).serialize()); + public void testVectorSearchParameterParsing() { + Map parameters = new HashMap<>(); + parameters.put("ivf.nprobe", "24"); + parameters.put("hnsw.ef_search", "80"); + parameters.put("ignored", "bad"); + + assertThat(VectorGlobalIndexReader.nprobe(parameters)).isEqualTo(24); + assertThat(VectorGlobalIndexReader.efSearch(parameters)).isEqualTo(80); + assertThat(VectorGlobalIndexReader.nprobe(Collections.emptyMap())).isEqualTo(16); + assertThat(VectorGlobalIndexReader.efSearch(Collections.emptyMap())).isEqualTo(0); + } - assertThat(deserialized.nprobe()).isEqualTo(16); - assertThat(deserialized.efSearch()).isEqualTo(0); + @Test + public void testVectorSearchParameterRangeValidationDelegatedToNative() { + assertThat(VectorGlobalIndexReader.nprobe(Collections.singletonMap("ivf.nprobe", "0"))) + .isEqualTo(0); + assertThat( + VectorGlobalIndexReader.efSearch( + Collections.singletonMap("hnsw.ef_search", "-1"))) + .isEqualTo(-1); } // =================== Tests that NEED native library ===================== @@ -197,8 +208,8 @@ public void testFloatVectorEndToEnd() throws IOException { int dimension = 2; Options options = createDefaultOptions(dimension); - options.setInteger("vector.nlist", 2); - options.setInteger("vector.pq.m", 1); + options.setInteger("ivf-pq.nlist", 2); + options.setInteger("ivf-pq.pq.m", 1); float[][] vectors = new float[][] { @@ -234,8 +245,8 @@ public void testSearchWithRoaringFilter() throws IOException { int dimension = 2; Options options = createDefaultOptions(dimension); - options.setInteger("vector.nlist", 2); - options.setInteger("vector.pq.m", 1); + options.setInteger("ivf-pq.nlist", 2); + options.setInteger("ivf-pq.pq.m", 1); float[][] vectors = new float[][] { @@ -276,8 +287,8 @@ public void testNullVectorSkipWithCorrectIds() throws IOException { int dimension = 2; Options options = createDefaultOptions(dimension); - 
options.setInteger("vector.nlist", 2); - options.setInteger("vector.pq.m", 1); + options.setInteger("ivf-pq.nlist", 2); + options.setInteger("ivf-pq.pq.m", 1); float[][] vectors = new float[][] { @@ -322,8 +333,8 @@ public void testViaIndexer() throws IOException { int dimension = 2; Options options = createDefaultOptions(dimension); - options.setInteger("vector.nlist", 2); - options.setInteger("vector.pq.m", 1); + options.setInteger("ivf-pq.nlist", 2); + options.setInteger("ivf-pq.pq.m", 1); float[][] vectors = new float[][] { @@ -334,7 +345,10 @@ public void testViaIndexer() throws IOException { VectorGlobalIndexer indexer = new VectorGlobalIndexer( - vectorType, options, VectorIndexType.IVF_PQ, IVF_PQ_IDENTIFIER); + vectorType, + VectorGlobalIndexerFactory.nativeOptions( + vectorType, options, IVF_PQ_IDENTIFIER), + IVF_PQ_IDENTIFIER); GlobalIndexFileWriter fileWriter = createFileWriter(indexPath); VectorGlobalIndexWriter writer = (VectorGlobalIndexWriter) indexer.createWriter(fileWriter); @@ -357,27 +371,19 @@ public void testViaIndexer() throws IOException { private VectorGlobalIndexWriter createIvfPqWriter( GlobalIndexFileWriter fileWriter, DataType fieldType, Options options) { return new VectorGlobalIndexWriter( - fileWriter, fieldType, options, VectorIndexType.IVF_PQ, IVF_PQ_IDENTIFIER); + fileWriter, + fieldType, + VectorGlobalIndexerFactory.nativeOptions(fieldType, options, IVF_PQ_IDENTIFIER), + IVF_PQ_IDENTIFIER); } private Options createDefaultOptions(int dimension) { Options options = new Options(); - options.setInteger("vector.index.dimension", dimension); - options.setString("vector.distance.metric", "l2"); + options.setInteger("ivf-pq.dimension", dimension); + options.setString("ivf-pq.metric", "l2"); return options; } - private Map metaOptions(Options options) { - Map meta = new LinkedHashMap<>(); - meta.put( - VectorIndexMeta.KEY_NPROBE, - String.valueOf(options.getInteger("vector.nprobe", 16))); - meta.put( - VectorIndexMeta.KEY_EF_SEARCH, - 
String.valueOf(options.getInteger("vector.hnsw.ef-search", 0))); - return meta; - } - private GlobalIndexFileWriter createFileWriter(Path path) { return new GlobalIndexFileWriter() { @Override diff --git a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java index 4307c6f40795..1caf082f2c4d 100644 --- a/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java +++ b/paimon-vector/paimon-vector-index/src/test/java/org/apache/paimon/vector/index/VectorGlobalIndexerFactoryTest.java @@ -19,10 +19,17 @@ package org.apache.paimon.vector.index; import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.VectorType; import org.junit.jupiter.api.Test; +import java.util.Map; + import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; /** Tests for vector global indexer factory SPI registration. 
*/ public class VectorGlobalIndexerFactoryTest { @@ -49,14 +56,60 @@ public void testLoadByIdentifier() { } @Test - public void testFactoryIndexType() { - assertThat(new IvfFlatVectorGlobalIndexerFactory().indexType()) - .isEqualTo(VectorIndexType.IVF_FLAT); - assertThat(new IvfPqAlgorithmVectorGlobalIndexerFactory().indexType()) - .isEqualTo(VectorIndexType.IVF_PQ); - assertThat(new IvfHnswFlatVectorGlobalIndexerFactory().indexType()) - .isEqualTo(VectorIndexType.IVF_HNSW_FLAT); - assertThat(new IvfHnswSqVectorGlobalIndexerFactory().indexType()) - .isEqualTo(VectorIndexType.IVF_HNSW_SQ); + public void testNativeOptionsOnlyUsesIdentifierPrefix() { + Options options = new Options(); + options.setString("bucket", "4"); + options.setString("vector.file.format", "vortex"); + options.setString("vector.nlist", "64"); + options.setString("ivf-flat.dimension", "32"); + options.setString("ivf-flat.distance.metric", "cosine"); + options.setString("ivf-flat.nlist", "128"); + options.setString("ivf-pq.nlist", "256"); + + Map nativeOptions = + VectorGlobalIndexerFactory.nativeOptions( + new ArrayType(new FloatType()), + options, + IvfFlatVectorGlobalIndexerFactory.IDENTIFIER); + + assertThat(nativeOptions) + .containsEntry("index.type", "ivf_flat") + .containsEntry("dimension", "32") + .containsEntry("metric", "cosine") + .containsEntry("nlist", "128") + .doesNotContainEntry("nlist", "64") + .doesNotContainEntry("nlist", "256") + .doesNotContainKey("bucket") + .doesNotContainKey("vector.file.format"); + } + + @Test + public void testNativeOptionsUsesVectorTypeDimension() { + Options options = new Options(); + options.setString("ivf-flat.dimension", "32"); + + Map nativeOptions = + VectorGlobalIndexerFactory.nativeOptions( + new VectorType(8, new FloatType()), + options, + IvfFlatVectorGlobalIndexerFactory.IDENTIFIER); + + assertThat(nativeOptions).containsEntry("dimension", "8"); + } + + @Test + public void testInvalidDimension() { + Options options = new Options(); + 
options.setString("ivf-flat.dimension", "0"); + + assertThatThrownBy( + () -> + VectorGlobalIndexerFactory.nativeOptions( + new ArrayType(new FloatType()), + options, + IvfFlatVectorGlobalIndexerFactory.IDENTIFIER)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("ivf-flat.dimension") + .hasMessageContaining("positive integer"); } } From fd03c65cedb32e289426782d86e3dcdf92dcff85 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Thu, 11 Jun 2026 19:41:38 +0800 Subject: [PATCH 11/11] [vector] Trim partial vector add batches --- .../paimon/vector/index/VectorGlobalIndexWriter.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java index 29459d056828..19582009581d 100644 --- a/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java +++ b/paimon-vector/paimon-vector-index/src/main/java/org/apache/paimon/vector/index/VectorGlobalIndexWriter.java @@ -40,6 +40,7 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.channels.FileChannel; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -313,7 +314,14 @@ private void addVectorsFromTempFile(VectorIndexWriter writer) throws IOException batchVectors[i * dim + d] = readBuf.getFloat(); } } - writer.addVectors(batchIds, batchVectors, thisBatch); + if (thisBatch == ADD_BATCH_SIZE) { + writer.addVectors(batchIds, batchVectors, thisBatch); + } else { + writer.addVectors( + Arrays.copyOf(batchIds, thisBatch), + Arrays.copyOf(batchVectors, thisBatch * dim), + thisBatch); + } remaining -= thisBatch; int percent = (int) ((count - remaining) * 100 / count);