diff --git a/.devcontainer/Dockerfile.template b/.devcontainer/Dockerfile.template index c28a0e1ae..9069085bb 100644 --- a/.devcontainer/Dockerfile.template +++ b/.devcontainer/Dockerfile.template @@ -17,12 +17,32 @@ # Adapted from Apache Iceberg C++ # https://github.com/apache/iceberg-cpp/blob/main/.devcontainer/Dockerfile.template - +# # This Dockerfile is used to build a development container for Paimon C++. -# It is based on the Ubuntu image and installs necessary dependencies. +# Base: Ubuntu 24.04. Rust toolchain is installed via Dev Container +# Feature `ghcr.io/devcontainers/features/rust:1` (see devcontainer.json), +# so it does NOT appear in this Dockerfile. FROM ubuntu:24.04 +# Switch apt to Aliyun mirror for faster downloads (covers both +# x86_64 archive.ubuntu.com and aarch64 ports.ubuntu.com paths). +# If you are outside mainland China or your network has its own internal +# mirror, edit or remove this block. +RUN sed -i \ + -e 's|http://archive.ubuntu.com/ubuntu|http://mirrors.aliyun.com/ubuntu|g' \ + -e 's|http://security.ubuntu.com/ubuntu|http://mirrors.aliyun.com/ubuntu|g' \ + -e 's|http://ports.ubuntu.com/ubuntu-ports|http://mirrors.aliyun.com/ubuntu-ports|g' \ + /etc/apt/sources.list.d/ubuntu.sources + +# Point rustup at USTC mirror so the Dev Container Feature +# `ghcr.io/devcontainers/features/rust:1` (and any later `rustup` calls) +# download the Rust toolchain from a China-friendly CDN instead of +# the default static.rust-lang.org. Set as ENV so it is inherited by +# every subsequent layer (including features installed after this image). 
+ENV RUSTUP_DIST_SERVER=https://mirrors.ustc.edu.cn/rust-static \ + RUSTUP_UPDATE_ROOT=https://mirrors.ustc.edu.cn/rust-static/rustup + # Install necessary packages RUN apt update && \ apt install -y \ @@ -48,6 +68,16 @@ RUN apt update && \ vim \ wget \ sudo \ + # ---- additions for tantivy-fts migration (Rust + Sanitizer + LLVM) ---- + clang \ + clang-format \ + clang-tidy \ + lld \ + llvm \ + libclang-rt-dev \ + gdb \ + lldb \ + valgrind \ && rm -rf /var/lib/apt/lists/* # Add a user for development diff --git a/.devcontainer/centos7/Dockerfile b/.devcontainer/centos7/Dockerfile new file mode 100644 index 000000000..c4fe3a5a0 --- /dev/null +++ b/.devcontainer/centos7/Dockerfile @@ -0,0 +1,229 @@ +# Copyright 2026-present Alibaba Inc. +# +# Licensed under the Apache License, Version 2.0. +# +# CentOS 7 cross-build verification image for paimon-cpp + tantivy-fts. +# +# Purpose: +# Prove the tantivy-fts stack builds on the OLDEST reasonable Linux target +# (glibc 2.17, EOL 2024-06-30). The default Ubuntu 24.04 dev container +# proves nothing about glibc compatibility; this image does. +# +# Build: +# docker build -t paimon-cpp-centos7:latest -f .devcontainer/centos7/Dockerfile . +# +# Run: +# docker run -d --name paimon-centos7 \ +# --privileged \ +# -v "$(pwd):/workspaces/paimon-cpp" \ +# paimon-cpp-centos7:latest sleep infinity +# docker exec -it paimon-centos7 bash -l +# +# Inside the container: +# scl enable devtoolset-11 rh-python38 -- bash # activate modern gcc + python +# source /opt/paimon-env.sh # PATH for rust, cmake +# cd /workspaces/paimon-cpp +# git lfs install --local && git lfs pull # critical: boost & friends are LFS +# ./scripts/tantivy_smoke.sh + +# ---------- Base ---------- +# CentOS 7 reached EOL 2024-06-30; its default mirrorlist.centos.org is down. +# The yum repos are therefore repointed at a CentOS vault mirror (see the +# CentOS-Base.repo block below) to avoid retired-mirror failures on `yum install`.
+# +# Base image: we pull from quay.io (CentOS community's canonical registry post +# Docker Hub deprecation). Override with CENTOS7_IMAGE build arg when behind a +# firewall that can't reach quay.io (e.g. registry.aliyuncs.com/library/centos:7). +ARG CENTOS7_IMAGE=quay.io/centos/centos:centos7 +FROM ${CENTOS7_IMAGE} + +# Repoint yum at aliyun's CentOS 7 vault mirror — vault.centos.org itself +# works but is slow/blocked from many CN networks; the aliyun mirror is a +# complete rsync and reliably fast. We overwrite CentOS-Base.repo rather +# than sed-patch it so the result is deterministic regardless of what the +# upstream image ships. fastestmirror plugin is disabled because its ping +# probes against the retired mirror list add ~60s to every `yum install`. +RUN echo -e '[base]\n\ +name=CentOS-7 - Base - aliyun vault\n\ +baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/os/$basearch/\n\ +gpgcheck=0\n\ +enabled=1\n\ +\n\ +[updates]\n\ +name=CentOS-7 - Updates - aliyun vault\n\ +baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/updates/$basearch/\n\ +gpgcheck=0\n\ +enabled=1\n\ +\n\ +[extras]\n\ +name=CentOS-7 - Extras - aliyun vault\n\ +baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/extras/$basearch/\n\ +gpgcheck=0\n\ +enabled=1\n\ +\n\ +[centosplus]\n\ +name=CentOS-7 - Plus - aliyun vault\n\ +baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/centosplus/$basearch/\n\ +gpgcheck=0\n\ +enabled=0\n' > /etc/yum.repos.d/CentOS-Base.repo \ + && rm -f /etc/yum.repos.d/CentOS-CR.repo \ + /etc/yum.repos.d/CentOS-Debuginfo.repo \ + /etc/yum.repos.d/CentOS-Media.repo \ + /etc/yum.repos.d/CentOS-Sources.repo \ + /etc/yum.repos.d/CentOS-Vault.repo \ + /etc/yum.repos.d/CentOS-fasttrack.repo \ + /etc/yum.repos.d/CentOS-x86_64-kernel.repo \ + && if [ -f /etc/yum/pluginconf.d/fastestmirror.conf ]; then \ + sed -i 's/^enabled=1/enabled=0/' /etc/yum/pluginconf.d/fastestmirror.conf; \ + fi \ + && yum clean all \ + && yum makecache + +# ---------- Base 
toolchain ---------- +# EPEL provides git-lfs, ninja-build, a newer python3 than the base 3.6. +# SCL (Software Collections) provides devtoolset-11 (gcc 11) and rh-python38 +# without overriding the system gcc/python. CentOS 7's default gcc 4.8 is +# too old for C++17/20 used by lucene++ and our tantivy wrapper. +# +# Same story as CentOS-Base.repo: both epel + SCL default to mirrorlist +# endpoints that are effectively dead; overwrite with aliyun URLs that we +# know respond. +RUN yum install -y epel-release centos-release-scl \ + && echo -e '[epel]\n\ +name=Extra Packages for Enterprise Linux 7 - aliyun\n\ +baseurl=https://mirrors.aliyun.com/epel/7/$basearch\n\ +gpgcheck=0\n\ +enabled=1\n' > /etc/yum.repos.d/epel.repo \ + && rm -f /etc/yum.repos.d/epel-testing.repo /etc/yum.repos.d/epel.repo.rpmnew \ + && rm -f /etc/yum.repos.d/CentOS-SCLo-*.repo \ + /etc/yum.repos.d/CentOS-SCLo-*.repo.rpmnew \ + && echo -e '[centos-sclo-rh]\n\ +name=CentOS-7 - SCLo rh - aliyun vault\n\ +baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/sclo/$basearch/rh/\n\ +gpgcheck=0\n\ +enabled=1\n\ +\n\ +[centos-sclo-sclo]\n\ +name=CentOS-7 - SCLo sclo - aliyun vault\n\ +baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/sclo/$basearch/sclo/\n\ +gpgcheck=0\n\ +enabled=1\n' > /etc/yum.repos.d/CentOS-SCLo-scl.repo \ + && yum clean all && yum makecache \ + && yum install -y \ + devtoolset-11-gcc \ + devtoolset-11-gcc-c++ \ + devtoolset-11-binutils \ + devtoolset-11-libasan-devel \ + devtoolset-11-libubsan-devel \ + rh-python38 \ + rh-python38-python-pip \ + git \ + git-lfs \ + ninja-build \ + make \ + patch \ + curl \ + wget \ + unzip \ + which \ + file \ + sudo \ + openssl-devel \ + zlib-devel \ + libffi-devel \ + bzip2-devel \ + xz-devel \ + perl-IPC-Cmd \ + && yum clean all + +# Enable the SCL collections for all subsequent shells (including RUN). 
+ENV BASH_ENV=/etc/profile.d/scl-enable.sh +SHELL ["/bin/bash", "-c"] +RUN printf '%s\n' \ + 'source scl_source enable devtoolset-11' \ + 'source scl_source enable rh-python38' \ + > /etc/profile.d/scl-enable.sh \ + && chmod +x /etc/profile.d/scl-enable.sh + +# ---------- CMake (must be >= 3.22 for Corrosion) ---------- +# CentOS 7's cmake package is 2.8.12; EPEL cmake3 is 3.17 — still too old. +# Install via pip in the rh-python38 SCL so we get a modern CMake without +# touching the system /usr/bin. Point pip at aliyun's pypi mirror: default +# pypi.org is 10-30s per request from CN, aliyun responds in <1s. +ENV PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ \ + PIP_TRUSTED_HOST=mirrors.aliyun.com +RUN source /etc/profile.d/scl-enable.sh \ + && python3 -m pip install --upgrade pip \ + && python3 -m pip install 'cmake==3.28.*' ninja + +# ---------- Rust toolchain ---------- +# Install rustup as root into /opt/rust so all users share the same toolchain. +# Use the USTC mirror to keep downloads fast in CN; the CI runner version of +# this is mirrored in ci/scripts/setup_rust.sh. +ENV RUSTUP_HOME=/opt/rust/rustup \ + CARGO_HOME=/opt/rust/cargo \ + RUSTUP_DIST_SERVER=https://mirrors.ustc.edu.cn/rust-static \ + RUSTUP_UPDATE_ROOT=https://mirrors.ustc.edu.cn/rust-static/rustup +# In-container network for Docker Desktop builds is unreliable through many +# CN mirrors (observed: curl 7.29 on CentOS 7 + rsproxy.cn HTTP/2 path ⇒ +# partial-read truncations; USTC ⇒ 5xx; rustup sh installer ⇒ 403 from +# legacy cipher). The most reliable fix is to sidestep the issue entirely: +# pre-download rustup-init on the host (where network is solid) and COPY it +# into the image. See .devcontainer/centos7/run.sh for the prefetch step. 
+COPY .devcontainer/centos7/rustup-init.bin /tmp/rustup-init +RUN chmod +x /tmp/rustup-init \ + && /tmp/rustup-init -y --default-toolchain stable --profile minimal --no-modify-path \ + && rm -f /tmp/rustup-init \ + && mkdir -p $CARGO_HOME \ + && echo -e '[source.crates-io]\n\ +replace-with = "rsproxy-sparse"\n\ +\n\ +[source.rsproxy]\n\ +registry = "https://rsproxy.cn/crates.io-index"\n\ +\n\ +[source.rsproxy-sparse]\n\ +registry = "sparse+https://rsproxy.cn/index/"\n\ +\n\ +[registries.rsproxy]\n\ +index = "https://rsproxy.cn/crates.io-index"\n\ +\n\ +[net]\n\ +git-fetch-with-cli = true\n' > $CARGO_HOME/config.toml \ + && $CARGO_HOME/bin/cargo install cbindgen --version 0.29.2 --locked \ + && chmod -R a+rwx /opt/rust \ + && $CARGO_HOME/bin/rustc --version \ + && $CARGO_HOME/bin/cargo --version \ + && $CARGO_HOME/bin/cbindgen --version + +# ---------- Environment file consumed by every shell ---------- +# Sets PATH for rust / cmake / cargo so `docker exec paimon-centos7 bash -l` +# and interactive sessions have the toolchain on $PATH. +RUN printf '%s\n' \ + 'export PATH=/opt/rust/cargo/bin:$PATH' \ + '# cmake + ninja live under the rh-python38 SCL; path prefix differs by arch.' \ + '# `command -v cmake` confirms which one is in use.' \ + > /opt/paimon-env.sh \ + && chmod +x /opt/paimon-env.sh \ + && printf '%s\n' 'source /opt/paimon-env.sh' >> /etc/profile.d/scl-enable.sh + +# ---------- Non-root user ---------- +# Build as `paimon` (uid 1000) so LFS objects under the mount stay owned by +# your host user, matching the main Ubuntu dev container. +RUN useradd -m -u 1000 -s /bin/bash paimon \ + && echo 'paimon ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/paimon + +USER paimon +WORKDIR /workspaces/paimon-cpp + +# Sanity check: `docker run --rm paimon-cpp-centos7:latest` (no arguments) prints the tool versions; any argument replaces this CMD.
+CMD ["bash", "-lc", "\ + echo '--- CentOS 7 cross-build image sanity check ---'; \ + cat /etc/centos-release; \ + echo '--- glibc ---'; ldd --version | head -1; \ + echo '--- gcc ---'; gcc --version | head -1; \ + echo '--- cmake ---'; cmake --version | head -1; \ + echo '--- ninja ---'; ninja --version; \ + echo '--- rust ---'; rustc --version; \ + echo '--- cargo ---'; cargo --version; \ + echo '--- cbindgen ---'; cbindgen --version; \ + echo 'Ready. Mount paimon-cpp at /workspaces/paimon-cpp and run ./scripts/tantivy_smoke.sh'"] diff --git a/.devcontainer/centos7/run.sh b/.devcontainer/centos7/run.sh new file mode 100755 index 000000000..54e6bfbde --- /dev/null +++ b/.devcontainer/centos7/run.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash +# +# Copyright 2026-present Alibaba Inc. +# +# Licensed under the Apache License, Version 2.0. +# +# One-shot helper to build + launch + smoke-test the CentOS 7 verification +# container. Run from the paimon-cpp repo root. +# +# Usage: +# ./.devcontainer/centos7/run.sh build # build image only +# ./.devcontainer/centos7/run.sh up # start container (detached) +# ./.devcontainer/centos7/run.sh shell # exec into it +# ./.devcontainer/centos7/run.sh smoke # run scripts/tantivy_smoke.sh inside +# ./.devcontainer/centos7/run.sh down # stop + remove + +set -euo pipefail + +IMAGE=paimon-cpp-centos7:latest +CONTAINER=paimon-centos7 + +here=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +repo=$(cd "${here}/../.." && pwd) + +cmd=${1:-help} + +case "${cmd}" in + build) + # Prefetch rustup-init on the host. In-container network from Docker + # Desktop builds is unreliable for CN mirrors (TLS/HTTP2 issues with + # old curl/wget on CentOS 7), but host curl works. The image copies + # this blob in. Override mirror with RUSTUP_INIT_URL=... if needed. + rustup_init="${here}/rustup-init.bin" + rustup_url="${RUSTUP_INIT_URL:-https://mirrors.ustc.edu.cn/rust-static/rustup/dist/x86_64-unknown-linux-gnu/rustup-init}" + if [ ! 
-s "${rustup_init}" ]; then + echo "==> Prefetching rustup-init from ${rustup_url}" + curl --proto '=https' --tlsv1.2 -sSfL --retry 5 --retry-delay 5 \ + -o "${rustup_init}" "${rustup_url}" + fi + # Override base image with CENTOS7_IMAGE=... if quay.io is unreachable. + # Common fallbacks you may need to docker-pull into local cache first: + # CENTOS7_IMAGE=quay.io/centos/centos:centos7 (default) + # CENTOS7_IMAGE=registry.aliyuncs.com/library/centos:7 + if [ -n "${CENTOS7_IMAGE:-}" ]; then + docker build -t "${IMAGE}" -f "${here}/Dockerfile" \ + --build-arg "CENTOS7_IMAGE=${CENTOS7_IMAGE}" "${repo}" + else + docker build -t "${IMAGE}" -f "${here}/Dockerfile" "${repo}" + fi + ;; + up) + docker rm -f "${CONTAINER}" 2>/dev/null || true + # Mount host SSH keys read-only (mirrors paimon-dev) so git clones of + # internal repos (e.g. aliorc_ep on gitlab.alibaba-inc.com) that go + # over SSH can authenticate with the host's key. Skip the mount if + # ~/.ssh doesn't exist so the script still works for external users. + ssh_mount=() + if [ -d "${HOME}/.ssh" ]; then + ssh_mount=(-v "${HOME}/.ssh:/home/paimon/.ssh:ro") + fi + docker run -d \ + --name "${CONTAINER}" \ + --privileged \ + -v "${repo}:/workspaces/paimon-cpp" \ + -v "paimon-centos7-cargo-registry:/opt/rust/cargo/registry" \ + -v "paimon-centos7-build:/workspaces/paimon-cpp/build-centos7" \ + "${ssh_mount[@]}" \ + "${IMAGE}" sleep infinity + # Named volumes mount as root-owned; `paimon` user (uid 1000) needs + # write access to build-centos7 and the cargo registry cache. + # Also set up the gitlab.alibaba-inc.com url rewrite so aliorc_ep + # (and any other ExternalProject pointing at internal gitlab via + # http://) picks up the mounted SSH key. 
+ docker exec --user root "${CONTAINER}" bash -c ' + chown -R paimon:paimon /workspaces/paimon-cpp/build-centos7 \ + /opt/rust/cargo/registry + ' + docker exec "${CONTAINER}" bash -c ' + git config --global url."git@gitlab.alibaba-inc.com:".insteadOf \ + "http://gitlab.alibaba-inc.com/" + ' + echo "Container started. \`${0} shell\` to enter." + ;; + shell) + docker exec -it "${CONTAINER}" bash -l + ;; + smoke) + # Ensure container is up first; no-op if already running. + if ! docker ps --format '{{.Names}}' | grep -qx "${CONTAINER}"; then + echo "Container ${CONTAINER} not running; starting it." + "$0" up + fi + # Two env vars pass through for Rosetta 2 (Apple Silicon) compat: + # MALLOC_CHECK_=0 disables glibc 2.17 extra malloc integrity checks + # that fire false positives under Rosetta's x86_64 emulation. + # ARROW_USER_SIMD_LEVEL=SSE4_2 keeps arrow runtime-dispatched kernels + # on SSE4.2 only (Rosetta does not support AVX2/BMI2/AVX-512). + # Both are no-ops on real x86_64 CentOS 7 hardware. + # Use a distinct build dir inside the container so it does not clash + # with the Ubuntu dev container's build/ dir on the same volume. + # Propagate PAIMON_ENABLE_ALIORC so `PAIMON_ENABLE_ALIORC=OFF` env + # on the host reaches the cmake inside the container. + docker exec \ + -e "PAIMON_ENABLE_ALIORC=${PAIMON_ENABLE_ALIORC:-ON}" \ + -e "MALLOC_CHECK_=0" \ + -e "ARROW_USER_SIMD_LEVEL=SSE4_2" \ + "${CONTAINER}" bash -lc ' + set -eux + cd /workspaces/paimon-cpp + git lfs install --local + git lfs pull + cmake -S . -B build-centos7 \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DPAIMON_BUILD_TESTS=ON \ + -DPAIMON_ENABLE_FSLIB=OFF \ + -DPAIMON_ENABLE_LUMINA=OFF \ + -DPAIMON_ENABLE_LANCE=OFF \ + -DPAIMON_ENABLE_JINDO=OFF \ + -DPAIMON_ENABLE_LUCENE=ON \ + -DPAIMON_ENABLE_ORC=ON \ + -DPAIMON_ENABLE_ALIORC="${PAIMON_ENABLE_ALIORC:-ON}" \ + -DPAIMON_ENABLE_AVRO=ON + # ALIORC clones from internal gitlab. 
`up` mounts $HOME/.ssh and + # configures the url.insteadOf rewrite, so by default ALIORC works + # for alibaba-inc users. External users without gitlab access can + # opt out with `PAIMON_ENABLE_ALIORC=OFF ./run.sh smoke`. + cmake --build build-centos7 -j "$(nproc)" + ctest --test-dir build-centos7 \ + -R "paimon-lucene-index-test|paimon-global-index-test|paimon-tantivy-.*-test" \ + --output-on-failure + ' + ;; + down) + docker rm -f "${CONTAINER}" 2>/dev/null || true + ;; + help|*) + sed -n "2,20p" "$0" + ;; +esac diff --git a/.devcontainer/devcontainer.json.template b/.devcontainer/devcontainer.json.template index 856a89dc3..992f62aba 100644 --- a/.devcontainer/devcontainer.json.template +++ b/.devcontainer/devcontainer.json.template @@ -20,6 +20,10 @@ // Adapted from Apache Iceberg C++ // https://github.com/apache/iceberg-cpp/blob/main/.devcontainer/devcontainer.json.template +// Default Paimon C++ Dev Container. +// On Apple Silicon hosts this runs as native aarch64 Linux (fast). +// For x86_64 verification, use the variant under .devcontainer/x86_64/. 
+ { "name": "Paimon CPP Dev Container", "build": { @@ -34,16 +38,36 @@ "seccomp=unconfined", "--privileged" ], + "features": { + "ghcr.io/devcontainers/features/rust:1": { + "version": "stable", + "profile": "default" + } + }, "mounts": [ - "source=${localEnv:HOME}/.ssh,target=/home/paimon/.ssh,type=bind,readonly" + "source=${localEnv:HOME}/.ssh,target=/home/paimon/.ssh,type=bind,readonly", + "source=paimon-cargo-registry,target=/home/paimon/.cargo/registry,type=volume", + "source=paimon-cargo-git,target=/home/paimon/.cargo/git,type=volume", + "source=paimon-rust-target,target=${containerWorkspaceFolder}/third_party/tantivy_ffi/target,type=volume", + "source=paimon-build,target=${containerWorkspaceFolder}/build,type=volume", + "source=paimon-ccache,target=/home/paimon/.ccache,type=volume" ], + "postCreateCommand": "sudo chown -R paimon:paimon ${containerWorkspaceFolder}/build ${containerWorkspaceFolder}/third_party/tantivy_ffi/target /home/paimon/.ccache /home/paimon/.cargo/registry /home/paimon/.cargo/git 2>/dev/null || true; cargo install cbindgen --locked || true; rustup component add rust-src rust-analyzer clippy rustfmt || true", "customizations": { "vscode": { "extensions": [ - "eamodio.gitlens" + "eamodio.gitlens", + "rust-lang.rust-analyzer", + "vadimcn.vscode-lldb", + "llvm-vs-code-extensions.vscode-clangd", + "ms-vscode.cmake-tools", + "twxs.cmake" ], "settings": { - "editor.formatOnSave": true + "editor.formatOnSave": true, + "rust-analyzer.linkedProjects": [ + "third_party/tantivy_ffi/Cargo.toml" + ] } } } diff --git a/.devcontainer/x86_64/devcontainer.json.template b/.devcontainer/x86_64/devcontainer.json.template new file mode 100644 index 000000000..baa400990 --- /dev/null +++ b/.devcontainer/x86_64/devcontainer.json.template @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// x86_64 variant of the Paimon CPP Dev Container. +// On Apple Silicon hosts this runs under QEMU emulation (5-10x slower). +// Use it ONLY for cross-architecture verification (Stage 11), not daily dev. +// +// Reuses the same Dockerfile as the default container; only the platform differs. +// +// Uses dedicated named volumes (suffix `-amd64`) so build/cargo cache do not +// collide with the native aarch64 container. 
+ +{ + "name": "Paimon CPP Dev Container (x86_64 via QEMU)", + "build": { + "dockerfile": "../Dockerfile", + "options": [ + "--platform=linux/amd64" + ] + }, + "runArgs": [ + "--platform=linux/amd64", + "--ulimit=core=-1", + "--cap-add=SYS_ADMIN", + "--cap-add=SYS_PTRACE", + "--cap-add=PERFMON", + "--security-opt", + "seccomp=unconfined", + "--privileged" + ], + "features": { + "ghcr.io/devcontainers/features/rust:1": { + "version": "stable", + "profile": "default" + } + }, + "mounts": [ + "source=${localEnv:HOME}/.ssh,target=/home/paimon/.ssh,type=bind,readonly", + "source=paimon-cargo-registry-amd64,target=/home/paimon/.cargo/registry,type=volume", + "source=paimon-cargo-git-amd64,target=/home/paimon/.cargo/git,type=volume", + "source=paimon-rust-target-amd64,target=${containerWorkspaceFolder}/third_party/tantivy_ffi/target,type=volume", + "source=paimon-build-amd64,target=${containerWorkspaceFolder}/build,type=volume", + "source=paimon-ccache-amd64,target=/home/paimon/.ccache,type=volume" + ], + "postCreateCommand": "sudo chown -R paimon:paimon ${containerWorkspaceFolder}/build ${containerWorkspaceFolder}/third_party/tantivy_ffi/target /home/paimon/.ccache /home/paimon/.cargo/registry /home/paimon/.cargo/git 2>/dev/null || true; cargo install cbindgen --locked || true; rustup component add rust-src rust-analyzer clippy rustfmt || true", + "customizations": { + "vscode": { + "extensions": [ + "eamodio.gitlens", + "rust-lang.rust-analyzer", + "vadimcn.vscode-lldb", + "llvm-vs-code-extensions.vscode-clangd", + "ms-vscode.cmake-tools" + ], + "settings": { + "editor.formatOnSave": true + } + } + } +} diff --git a/.github/workflows/build_release.yaml b/.github/workflows/build_release.yaml index 6e984bd19..152048cc9 100644 --- a/.github/workflows/build_release.yaml +++ b/.github/workflows/build_release.yaml @@ -44,6 +44,9 @@ jobs: uses: ./.github/actions/setup-ccache with: cache-key-prefix: ccache-clang-release + - name: Install Rust toolchain (tantivy-fts) + shell: 
bash + run: ci/scripts/setup_rust.sh - name: Build Paimon shell: bash env: @@ -67,6 +70,9 @@ jobs: uses: ./.github/actions/setup-ccache with: cache-key-prefix: ccache-gcc-release + - name: Install Rust toolchain (tantivy-fts) + shell: bash + run: ci/scripts/setup_rust.sh - name: Build Paimon shell: bash env: diff --git a/.github/workflows/clang_test.yaml b/.github/workflows/clang_test.yaml index dd11dd725..824a5d45d 100644 --- a/.github/workflows/clang_test.yaml +++ b/.github/workflows/clang_test.yaml @@ -45,6 +45,9 @@ jobs: uses: ./.github/actions/setup-ccache with: cache-key-prefix: ccache-clang-test + - name: Install Rust toolchain (tantivy-fts) + shell: bash + run: ci/scripts/setup_rust.sh - name: Build Paimon shell: bash env: diff --git a/.github/workflows/gcc_test.yaml b/.github/workflows/gcc_test.yaml index e97954608..af6e0ddbd 100644 --- a/.github/workflows/gcc_test.yaml +++ b/.github/workflows/gcc_test.yaml @@ -44,6 +44,9 @@ jobs: uses: ./.github/actions/setup-ccache with: cache-key-prefix: ccache-gcc-test + - name: Install Rust toolchain (tantivy-fts) + shell: bash + run: ci/scripts/setup_rust.sh - name: Build Paimon shell: bash env: diff --git a/.github/workflows/test_with_sanitizer.yaml b/.github/workflows/test_with_sanitizer.yaml index b2b90d97b..a083a773b 100644 --- a/.github/workflows/test_with_sanitizer.yaml +++ b/.github/workflows/test_with_sanitizer.yaml @@ -44,6 +44,9 @@ jobs: uses: ./.github/actions/setup-ccache with: cache-key-prefix: ccache-sanitizer + - name: Install Rust toolchain (tantivy-fts) + shell: bash + run: ci/scripts/setup_rust.sh - name: Build Paimon shell: bash env: diff --git a/.gitignore b/.gitignore index 57e007860..0626cbc0d 100644 --- a/.gitignore +++ b/.gitignore @@ -14,8 +14,7 @@ # Build directories build -build-release -build-debug +build-*/ output # IDE settings @@ -24,8 +23,20 @@ output .cache # Devcontainer configuration +# Track only *.template files (and subdirectory structure that contains them). 
.devcontainer/* !.devcontainer/*.template +!.devcontainer/x86_64/ +.devcontainer/x86_64/* +!.devcontainer/x86_64/*.template +# CentOS 7 cross-build image: track raw Dockerfile + helper script (not +# templated because the image is built from the repo root directly). +!.devcontainer/centos7/ +.devcontainer/centos7/* +!.devcontainer/centos7/Dockerfile +!.devcontainer/centos7/run.sh +# rustup-init.bin is a 20 MB prefetched binary — not source, don't commit. +.devcontainer/centos7/rustup-init.bin # Temporary and backup files *~ @@ -48,3 +59,6 @@ FlameGraph # Third party dependencies archives third_party/*.tar.gz + +# Rust / Cargo build artifacts +third_party/tantivy_ffi/target/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 154a38d97..b53964563 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.22) +# 3.22 is the minimum required by Corrosion-rs (Rust-C++ FFI integration, see +# third_party/tantivy_ffi). Ubuntu 24.04 ships CMake 3.28; CentOS 8+/RHEL 9+ ship +# 3.20+. For older distributions, see docs/dev/tantivy_fts_migration_plan.md. message(STATUS "Building using CMake version: ${CMAKE_VERSION}") # https://cmake.org/cmake/help/latest/policy/CMP0135.html @@ -55,6 +58,8 @@ option(PAIMON_ENABLE_LANCE "Whether to enable lance file format" OFF) option(PAIMON_ENABLE_JINDO "Whether to enable jindo file system" OFF) option(PAIMON_ENABLE_LUMINA "Whether to enable lumina vector index" OFF) option(PAIMON_ENABLE_LUCENE "Whether to enable lucene index" OFF) +option(PAIMON_ENABLE_TANTIVY + "Whether to enable tantivy-fulltext global index (Rust FFI, experimental)" ON) if(PAIMON_ENABLE_ORC) add_definitions(-DPAIMON_ENABLE_ORC) endif() @@ -87,6 +92,10 @@ if(PAIMON_ENABLE_LUCENE) add_definitions(-DPAIMON_ENABLE_LUCENE) endif() +if(PAIMON_ENABLE_TANTIVY) + add_definitions(-DPAIMON_ENABLE_TANTIVY) +endif() +
add_definitions(-DSNAPPY_CODEC_AVAILABLE) add_definitions(-DZSTD_CODEC_AVAILABLE) add_definitions(-DRAPIDJSON_HAS_STDSTRING) @@ -303,6 +312,21 @@ if(PAIMON_ENABLE_LUMINA) DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() +# ---- tantivy-fulltext Rust FFI via Corrosion-rs -------------------------------- +# See docs/dev/tantivy_fts_migration_plan.md Stage 1. +# +# Corrosion wraps the Cargo crate as a CMake target named `paimon_tantivy_ffi`. +# `corrosion_experimental_cbindgen` runs cbindgen from CMake and writes the +# header to a stable path; it also adds that path to the target's INTERFACE +# include dirs so C++ consumers pick it up via target_link_libraries. +if(PAIMON_ENABLE_TANTIVY) + include(CorrosionFetch) + corrosion_import_crate(MANIFEST_PATH third_party/tantivy_ffi/Cargo.toml CRATES + paimon_tantivy_ffi) + corrosion_experimental_cbindgen(TARGET paimon_tantivy_ffi HEADER_NAME + paimon_tantivy_ffi.h) +endif() + if(PAIMON_ENABLE_LUCENE) set(PAIMON_DICT_DEST "share/paimon/dict") @@ -491,6 +515,9 @@ add_subdirectory(src/paimon/format/avro) add_subdirectory(src/paimon/format/lance) add_subdirectory(src/paimon/global_index/lumina) add_subdirectory(src/paimon/global_index/lucene) +if(PAIMON_ENABLE_TANTIVY) + add_subdirectory(src/paimon/global_index/tantivy) +endif() add_subdirectory(src/paimon/testing/mock) add_subdirectory(src/paimon/testing/utils) add_subdirectory(test/inte) diff --git a/ci/scripts/build_paimon.sh b/ci/scripts/build_paimon.sh index f1d0423de..145711f2f 100755 --- a/ci/scripts/build_paimon.sh +++ b/ci/scripts/build_paimon.sh @@ -36,6 +36,7 @@ pushd ${build_dir} ENABLE_LUMINA="ON" ENABLE_LANCE="ON" +ENABLE_TANTIVY="ON" if [[ "${CC:-}" == *"gcc-8"* ]] || [[ "${CXX:-}" == *"g++-8"* ]]; then ENABLE_LUMINA="OFF" # Lumina is only supported on GCC 9 or higher. 
ENABLE_LANCE="OFF" @@ -43,6 +44,7 @@ if [[ "${CC:-}" == *"gcc-8"* ]] || [[ "${CXX:-}" == *"g++-8"* ]]; then # which requires a higher version of glibc, # but Ubuntu 22.04 and above no longer ships with gcc-8 by default. # Consider supporting Lance from source compilation in the future + ENABLE_TANTIVY="OFF" # tantivy-fts (Rust FFI) is not built on the gcc-8 image. fi CMAKE_ARGS=( @@ -53,6 +55,7 @@ CMAKE_ARGS=( "-DPAIMON_ENABLE_JINDO=ON" "-DPAIMON_ENABLE_LUMINA=${ENABLE_LUMINA}" "-DPAIMON_ENABLE_LUCENE=ON" + "-DPAIMON_ENABLE_TANTIVY=${ENABLE_TANTIVY}" ) if [[ "${enable_sanitizer}" == "true" ]]; then diff --git a/ci/scripts/setup_rust.sh b/ci/scripts/setup_rust.sh new file mode 100755 index 000000000..99b63ea05 --- /dev/null +++ b/ci/scripts/setup_rust.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# +# Copyright 2026-present Alibaba Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Install the Rust toolchain + cbindgen required to build the +# tantivy-fts FFI crate (third_party/tantivy_ffi) from CI. +# +# The dev container (see .devcontainer/) already has these preinstalled; +# this script is for the GitHub Actions runners. Called by the build_release, +# clang_test, gcc_test and test_with_sanitizer workflows (.github/workflows/) +# before ci/scripts/build_paimon.sh. +# +# Idempotent: a second invocation is a no-op when the tools already exist. + +set -eux + +RUSTUP_VERSION=${RUSTUP_VERSION:-1.29.0} +# 1.88.0 is the minimum required by transitive crates (e.g. time 0.3.47). +RUST_VERSION=${RUST_VERSION:-1.88.0} +CBINDGEN_VERSION=${CBINDGEN_VERSION:-0.29.2} + +# Install rustup + default toolchain if cargo isn't on PATH yet. +if !
command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --default-toolchain "${RUST_VERSION}" --profile minimal --no-modify-path +fi + +# Export for the remainder of the CI job. +export PATH="${HOME}/.cargo/bin:${PATH}" +echo "${HOME}/.cargo/bin" >> "${GITHUB_PATH:-/dev/null}" || true + +rustup toolchain install "${RUST_VERSION}" --profile minimal +rustup default "${RUST_VERSION}" +rustup component add rustfmt clippy + +# cbindgen is used by the crate's build.rs to emit the C header that the +# C++ side includes. Corrosion will also run cbindgen at CMake configure +# time; both paths need it available. +if ! command -v cbindgen >/dev/null 2>&1; then + cargo install cbindgen --version "${CBINDGEN_VERSION}" --locked +fi + +rustc --version +cargo --version +cbindgen --version diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake index d6d3b4a58..0b97943bf 100644 --- a/cmake_modules/BuildUtils.cmake +++ b/cmake_modules/BuildUtils.cmake @@ -94,6 +94,7 @@ function(add_paimon_lib LIB_NAME) endif() # Necessary to make static linking into other shared libraries work properly set_property(TARGET ${LIB_NAME}_objlib PROPERTY POSITION_INDEPENDENT_CODE 1) + target_link_libraries(${LIB_NAME}_objlib PUBLIC paimon_sanitizer_flags) if(ARG_DEPENDENCIES) # In static-only builds, some dependency names are still declared as # *_shared. Map them to *_static when the shared target is unavailable. @@ -181,8 +182,14 @@ function(add_paimon_lib LIB_NAME) PRIVATE -Wl,--exclude-libs,ALL -Wl,-Bsymbolic - -Wl,-z,defs -Wl,--gc-sections) + # -z defs (--no-undefined) rejects the __asan_*/__ubsan_* symbols that + # sanitizer-instrumented shared libraries legitimately leave undefined + # (they are resolved at load time from the executable's sanitizer + # runtime). Only enforce it for non-sanitizer builds. 
+ if(NOT PAIMON_USE_ASAN AND NOT PAIMON_USE_UBSAN) + target_link_options(${LIB_NAME}_shared PRIVATE -Wl,-z,defs) + endif() endif() install(TARGETS ${LIB_NAME}_shared ${INSTALL_IS_OPTIONAL} @@ -334,6 +341,10 @@ function(add_test_case REL_TEST_NAME) target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors) endif() target_compile_options(${TEST_NAME} PRIVATE -fno-access-control) + # Test sources initialize char arrays/vectors holding raw bytes with lists such as {1, -1, ...}; + # char is unsigned by default on aarch64, which triggers -Wnarrowing. Disable it for tests only, + # to avoid littering test sources with static_cast(-1); production code (src/paimon/...) keeps it. + target_compile_options(${TEST_NAME} PRIVATE -Wno-narrowing) add_test(${TEST_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh diff --git a/cmake_modules/CorrosionFetch.cmake b/cmake_modules/CorrosionFetch.cmake new file mode 100644 index 000000000..fff8fe655 --- /dev/null +++ b/cmake_modules/CorrosionFetch.cmake @@ -0,0 +1,75 @@ +# Copyright 2026-present Alibaba Inc. +# +# Licensed under the Apache License, Version 2.0. +# +# Pull Corrosion-rs via FetchContent so we can import Cargo crates as CMake +# targets. Used to bring in third_party/tantivy_ffi for the tantivy-fulltext +# global index (see docs/dev/tantivy_fts_migration_plan.md). +# +# Pinned to v0.5.2 by default (see PAIMON_CORROSION_TAG below). Requires CMake >= 3.22. + +include(FetchContent) + +# Corrosion does heavy cargo/rustc work at configure+build time; pin the tag for +# reproducibility and allow override via CMake cache variables for offline builds. +set(PAIMON_CORROSION_TAG + "v0.5.2" + CACHE STRING "Git tag of corrosion-rs to fetch; change only when upgrading. v0.5.1+ + is required for rustup >= 1.28 whose `rustup toolchain list --verbose` + output format broke v0.5.0's FindRust.cmake regex.") + +set(PAIMON_CORROSION_REPO + "https://github.com/corrosion-rs/corrosion.git" + CACHE STRING "Override to a private mirror for offline / firewalled builds.") + +# Help Corrosion find rustc/cargo when CMake is invoked without a login shell +# or when rustup is installed to a non-default location. 
We try, in order: +# 1. Existing Rust_COMPILER cache variable (user override) +# 2. $CARGO_HOME/bin/rustc (when env var set) +# 3. $HOME/.cargo/bin/rustc (rustup's default install) +# 4. Fallback: let Corrosion's FindRust.cmake try its own detection +function(_paimon_find_rustup_bin _var _name) + if(DEFINED ENV{CARGO_HOME} AND EXISTS "$ENV{CARGO_HOME}/bin/${_name}") + set(${_var} + "$ENV{CARGO_HOME}/bin/${_name}" + PARENT_SCOPE) + elseif(DEFINED ENV{HOME} AND EXISTS "$ENV{HOME}/.cargo/bin/${_name}") + set(${_var} + "$ENV{HOME}/.cargo/bin/${_name}" + PARENT_SCOPE) + endif() +endfunction() + +if(NOT DEFINED Rust_COMPILER OR Rust_COMPILER STREQUAL "") + _paimon_find_rustup_bin(_rustc_path rustc) + if(_rustc_path) + set(Rust_COMPILER + "${_rustc_path}" + CACHE FILEPATH "rustc") + endif() +endif() +if(NOT DEFINED Rust_CARGO OR Rust_CARGO STREQUAL "") + _paimon_find_rustup_bin(_cargo_path cargo) + if(_cargo_path) + set(Rust_CARGO + "${_cargo_path}" + CACHE FILEPATH "cargo") + endif() +endif() +# Corrosion reads `rustup which rustc` to resolve the real toolchain binary. +# If CMake is invoked from a non-login shell, $PATH may miss ~/.cargo/bin and +# `rustup` can't be found. Prepend rustup's bin dir so child processes see it. 
+if(DEFINED Rust_COMPILER) + get_filename_component(_rustup_bin_dir "${Rust_COMPILER}" DIRECTORY) + if(_rustup_bin_dir AND NOT "$ENV{PATH}" MATCHES "${_rustup_bin_dir}") + set(ENV{PATH} "${_rustup_bin_dir}:$ENV{PATH}") + endif() +endif() +message(STATUS "Corrosion: Rust_COMPILER=${Rust_COMPILER}") +message(STATUS "Corrosion: Rust_CARGO=${Rust_CARGO}") + +fetchcontent_declare(Corrosion + GIT_REPOSITORY "${PAIMON_CORROSION_REPO}" + GIT_TAG "${PAIMON_CORROSION_TAG}" + GIT_SHALLOW TRUE) +fetchcontent_makeavailable(Corrosion) diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index 271011a0d..428814aeb 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -744,6 +744,11 @@ macro(build_lucene) "-DBoost_INCLUDE_DIR=${BOOST_INCLUDE_DIR}" "-DBoost_LIBRARY_DIR=${BOOST_LIBRARY_DIR}" "-DBOOST_ROOT=${BOOST_INSTALL}" + # Force FindBoost module mode only; ignore system BoostConfig.cmake and + # system library paths so lucene_ep links against our vendored boost 1.66, + # not a system-installed newer version (e.g. 1.83) with ABI differences. + "-DBoost_NO_BOOST_CMAKE=ON" + "-DBoost_NO_SYSTEM_PATHS=ON" "-DBoost_CHRONO_FOUND=TRUE" "-DBoost_THREAD_FOUND=TRUE" "-DZLIB_INCLUDE_DIRS=${ZLIB_INCLUDE_DIR}" @@ -1879,5 +1884,9 @@ endif() if(PAIMON_ENABLE_LUCENE) build_boost() build_lucene() +endif() +# jieba (dict + headers) is needed by BOTH lucene-fts and the tantivy jieba +# tokenizer; build it whenever either backend is on, not only under lucene. +if(PAIMON_ENABLE_LUCENE OR PAIMON_ENABLE_TANTIVY) build_jieba() endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 34818475e..787bfb714 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.22) project(example) diff --git a/include/paimon/predicate/full_text_search.h b/include/paimon/predicate/full_text_search.h index 93425050c..bbf82eae1 100644 --- a/include/paimon/predicate/full_text_search.h +++ b/include/paimon/predicate/full_text_search.h @@ -55,13 +55,24 @@ struct PAIMON_EXPORT FullTextSearch { std::shared_ptr ReplacePreFilter( const std::optional& _pre_filter) const { - return std::make_shared(field_name, limit, query, search_type, _pre_filter); + auto replaced = + std::make_shared(field_name, limit, query, search_type, _pre_filter); + // `with_score` / `min_score` are not constructor args (they have in-class + // defaults), so carry them over explicitly — otherwise rewrapping the + // pre_filter (e.g. in OffsetGlobalIndexReader) would silently reset a + // scored / min_score query back to the unscored default. + replaced->with_score = with_score; + replaced->min_score = min_score; + return replaced; } /// Name of the field to search within (must be a full-text indexed field). std::string field_name; - /// Maximum number of documents to return. If set, limit ordered by top scores. Otherwise, no - /// score return. + /// Maximum number of documents to return. + /// + /// **v0.2 contract change**: `limit` is now purely a truncation switch — it is orthogonal + /// to `with_score`. Set `with_score = true` if you want BM25 scores in the result; setting + /// `limit >= 0` no longer implies scoring. std::optional limit; /// The query string to search for. The interpretation depends on search_type: /// @@ -85,5 +96,26 @@ struct PAIMON_EXPORT FullTextSearch { /// Only rows whose global row ID is present in `pre_filter` will be included during search. /// If not set, all rows will be included. std::optional pre_filter; + /// Whether to compute and return BM25 relevance scores. 
+ /// + /// The 4-path matrix: + /// - `with_score=false, limit=nullopt` → BitmapGlobalIndexResult (all matching rows, no score) + /// - `with_score=false, limit=N` → BitmapGlobalIndexResult (any N matches, unscored) + /// - `with_score=true, limit=nullopt` → BitmapScoredGlobalIndexResult (all matching rows + all scores) + /// - `with_score=true, limit=N` → BitmapScoredGlobalIndexResult (top-N by BM25 + scores) + /// + /// For plain `LIMIT N` without ORDER BY (the common case in SR's predicate + /// pushdown) set `with_score=false, limit=N` — the unscored fast path. If + /// you want top-N by relevance, use `with_score=true, limit=N` and drop the + /// scores in the caller if not needed. + /// + /// Default is `false` to avoid silent score computation overhead for callers that don't need + /// it. + bool with_score = false; + /// Minimum BM25 score threshold (exclusive). Results with score ≤ this value are excluded. + /// Only meaningful when scoring is active, i.e. `with_score = true`; `limit` alone does not enable scoring. + /// Applied before truncation so low-score documents never occupy limit slots. + /// Default is nullopt (no threshold filtering). 
+ std::optional min_score; }; } // namespace paimon diff --git a/scripts/tantivy_smoke.sh b/scripts/tantivy_smoke.sh new file mode 100755 index 000000000..4a9255716 --- /dev/null +++ b/scripts/tantivy_smoke.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# tantivy-fts 迁移期 smoke 测试脚本。 +# +# 用途: 在 Dev Container 内一键回归 lucene-fts + tantivy-fts 相关测试。 +# 设计哲学: 命令行越拼越长容易出错,封装成一个脚本各 Stage 持续维护。 +# +# 用法: +# ./scripts/tantivy_smoke.sh # default: release, no sanitizer +# ./scripts/tantivy_smoke.sh --asan # ASAN 构建 +# ./scripts/tantivy_smoke.sh --tsan # TSAN 构建 +# ./scripts/tantivy_smoke.sh --configure # 仅 cmake configure +# ./scripts/tantivy_smoke.sh --build # 仅 cmake build (跳过 configure) +# ./scripts/tantivy_smoke.sh --tests-only # 仅 ctest (假定已 build 过) +# +# 维护约定: +# - Stage 1+ 每加一个新 ctest target 就更新下面 TEST_REGEX +# - Stage 11 加 --with-asan / --with-tsan 完整路径 + +set -e + +CMAKE_BUILD_TYPE="Release" +USE_ASAN="OFF" +USE_TSAN="OFF" +BUILD_DIR_SUFFIX="" +DO_CONFIGURE=1 +DO_BUILD=1 +DO_TEST=1 + +# ctest 正则: 各 Stage 验收时只跑这批测试,不跑全量 ctest (~531s 太慢)。 +# 内容 = lucene-fts 对照基线 + 当前 Stage 及之前 Stage 新增的 tantivy-fts target。 +# 每个 Stage 完成时往这里追加 target。只有 Stage 11 才应跑全量 ctest。 +TEST_REGEX='paimon-lucene-index-test|paimon-global-index-test|paimon-tantivy-smoke-test|paimon-tantivy-ffi-test|paimon-tantivy-tokenizer-test|paimon-tantivy-writer-test|paimon-tantivy-reader-test|paimon-tantivy-filter-limit-test|paimon-tantivy-index-test|paimon-tantivy-lucene-coexist-test|paimon-tantivy-equivalence-test|paimon-tantivy-streaming-test|paimon-tantivy-java-compat-test' + +while [ $# -gt 0 ]; do + case "$1" in + --asan) USE_ASAN="ON"; CMAKE_BUILD_TYPE="Debug"; BUILD_DIR_SUFFIX="-asan" ;; + --tsan) USE_TSAN="ON"; CMAKE_BUILD_TYPE="Debug"; BUILD_DIR_SUFFIX="-tsan" ;; + --configure) DO_BUILD=0; DO_TEST=0 ;; + --build) DO_CONFIGURE=0; DO_TEST=0 ;; + --tests-only) DO_CONFIGURE=0; DO_BUILD=0 ;; + -h|--help) sed -n '2,20p' "$0"; exit 0 ;; + *) echo "Unknown option: $1"; exit 2 ;; + esac + shift +done + 
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +BUILD_DIR="${REPO_ROOT}/build${BUILD_DIR_SUFFIX}" + +cd "${REPO_ROOT}" + +if [ "${DO_CONFIGURE}" = "1" ]; then + echo "==> cmake configure (${BUILD_DIR})" + cmake -S . -B "${BUILD_DIR}" \ + -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" \ + -DPAIMON_BUILD_TESTS=ON \ + -DPAIMON_USE_ASAN="${USE_ASAN}" \ + -DPAIMON_USE_TSAN="${USE_TSAN}" \ + -DPAIMON_ENABLE_FSLIB=OFF \ + -DPAIMON_ENABLE_LUMINA=OFF \ + -DPAIMON_ENABLE_LANCE=OFF \ + -DPAIMON_ENABLE_JINDO=OFF \ + -DPAIMON_ENABLE_LUCENE=ON \ + -DPAIMON_ENABLE_ORC=ON \ + -DPAIMON_ENABLE_ALIORC=ON \ + -DPAIMON_ENABLE_AVRO=ON \ + -G Ninja +fi + +if [ "${DO_BUILD}" = "1" ]; then + echo "==> cmake build" + cmake --build "${BUILD_DIR}" -j +fi + +if [ "${DO_TEST}" = "1" ]; then + echo "==> ctest (${TEST_REGEX})" + ctest --test-dir "${BUILD_DIR}" -R "${TEST_REGEX}" --output-on-failure +fi + +echo "==> tantivy_smoke.sh DONE" diff --git a/src/paimon/common/data/binary_row_test.cpp b/src/paimon/common/data/binary_row_test.cpp index acfc259ce..34694c3a9 100644 --- a/src/paimon/common/data/binary_row_test.cpp +++ b/src/paimon/common/data/binary_row_test.cpp @@ -338,8 +338,9 @@ TEST_F(BinaryRowTest, TestBinary) { auto pool = GetDefaultPool(); BinaryRow row(2); BinaryRowWriter writer(&row, 0, pool.get()); - char chars1[3] = {1, -1, 5}; - char chars2[8] = {1, -1, 5, 5, 1, 5, 1, 5}; + // explicit cast to avoid -Wnarrowing on platforms where char is unsigned (e.g. 
aarch64) + char chars1[3] = {1, static_cast(-1), 5}; + char chars2[8] = {1, static_cast(-1), 5, 5, 1, 5, 1, 5}; std::string str1(chars1, 3); std::string str2(chars2, 8); Bytes bytes1(str1, pool.get()); diff --git a/src/paimon/common/global_index/offset_global_index_reader_test.cpp b/src/paimon/common/global_index/offset_global_index_reader_test.cpp index d4996bceb..0c4e4c1f5 100644 --- a/src/paimon/common/global_index/offset_global_index_reader_test.cpp +++ b/src/paimon/common/global_index/offset_global_index_reader_test.cpp @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "paimon/global_index/bitmap_global_index_result.h" #include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/predicate/full_text_search.h" #include "paimon/predicate/literal.h" #include "paimon/testing/utils/testharness.h" #include "paimon/utils/roaring_bitmap64.h" @@ -112,9 +113,14 @@ class FakeGlobalIndexReader : public GlobalIndexReader { Result> VisitFullTextSearch( const std::shared_ptr& full_text_search) override { + captured_fts = full_text_search; return MakeResult(default_result_); } + // Captures the (possibly pre_filter-rewritten) FullTextSearch the offset + // reader forwarded, so tests can assert field propagation. + std::shared_ptr captured_fts; + bool IsThreadSafe() const override { return true; } @@ -331,6 +337,37 @@ TEST_F(OffsetGlobalIndexReaderTest, TestVisitFullTextSearchWithOffset) { CheckResult(result, {10, 13, 15}); } +TEST_F(OffsetGlobalIndexReaderTest, TestVisitFullTextSearchPreservesScoreFlags) { + // Regression (review finding #2): rewriting the pre_filter global->local ids + // in the offset reader must NOT drop with_score / min_score. Before the fix, + // FullTextSearch::ReplacePreFilter rebuilt via the 5-arg ctor and silently + // reset both back to their defaults, turning a scored / min_score query + // unscored as soon as it crossed any offset shard. 
+ auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 3, 5}); + auto offset_reader = std::make_shared(fake_reader, 10); + + // pre_filter must be set so the offset reader takes the rewrite path. + auto fts = std::make_shared( + "f0", /*limit=*/7, "q", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({10l, 13l, 15l})); + fts->with_score = true; + fts->min_score = 1.5f; + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitFullTextSearch(fts)); + CheckResult(result, {10, 13, 15}); + + ASSERT_TRUE(fake_reader->captured_fts); + EXPECT_TRUE(fake_reader->captured_fts->with_score) + << "with_score must survive the pre_filter rewrite"; + ASSERT_TRUE(fake_reader->captured_fts->min_score.has_value()) + << "min_score must survive the pre_filter rewrite"; + EXPECT_FLOAT_EQ(fake_reader->captured_fts->min_score.value(), 1.5f); + // limit and the offset-rewritten local pre_filter should still be present. + EXPECT_EQ(fake_reader->captured_fts->limit, std::optional(7)); + ASSERT_TRUE(fake_reader->captured_fts->pre_filter.has_value()); +} + TEST_F(OffsetGlobalIndexReaderTest, TestVisitVectorSearchWithOffset) { auto fake_reader = std::make_shared(); fake_reader->SetVectorSearchResult({0, 2, 5}, {0.9f, 0.7f, 0.3f}); diff --git a/src/paimon/global_index/tantivy/CMakeLists.txt b/src/paimon/global_index/tantivy/CMakeLists.txt new file mode 100644 index 000000000..6039bdde5 --- /dev/null +++ b/src/paimon/global_index/tantivy/CMakeLists.txt @@ -0,0 +1,277 @@ +# Copyright 2026-present Alibaba Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# tantivy-fulltext global index (Rust FFI). See docs/dev/tantivy_fts_migration_plan.md. +# Stage 4 grows the support lib with the C++ writer wrapper + writer test. + +if(NOT PAIMON_ENABLE_TANTIVY) + return() +endif() + +set(PAIMON_TANTIVY_SUPPORT_SRCS + tantivy_ffi_log.cpp + tantivy_archive_layout.cpp + tantivy_stream_ctx.cpp + tantivy_global_index_writer.cpp + tantivy_global_index_reader.cpp + tantivy_global_index.cpp + tantivy_global_index_factory.cpp) + +add_paimon_lib(paimon_tantivy_support + SOURCES + ${PAIMON_TANTIVY_SUPPORT_SRCS} + DEPENDENCIES + paimon_shared + paimon_tantivy_ffi + STATIC_LINK_LIBS + paimon_tantivy_ffi + arrow + glog + fmt + SHARED_LINK_LIBS + paimon_shared + SHARED_LINK_FLAGS + ${PAIMON_VERSION_SCRIPT_FLAGS}) +# Corrosion's paimon_tantivy_ffi target carries INTERFACE_INCLUDE_DIRECTORIES +# (cbindgen-generated header path). The objlib in add_paimon_lib doesn't link +# against deps, so its compile step misses include dirs. Wire them explicitly. +target_link_libraries(paimon_tantivy_support_objlib PUBLIC paimon_tantivy_ffi) + +if(PAIMON_BUILD_TESTS) + add_paimon_test(tantivy_smoke_test + SOURCES + tantivy_smoke_test.cpp + STATIC_LINK_LIBS + paimon_tantivy_ffi + ${GTEST_LINK_TOOLCHAIN}) + + add_paimon_test(tantivy_ffi_test + SOURCES + tantivy_ffi_test.cpp + STATIC_LINK_LIBS + paimon_shared + "-Wl,--whole-archive" + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + + # Golden-sample tokenizer diff (cppjieba vs jieba-rs). 
Links against the + # lucene index module to reuse JiebaTokenizer::CutWithMode + Normalize, so it + # can only be built when lucene-fts is enabled (the C++ JiebaTokenizer lives + # in the lucene module). Guarded so the default LUCENE=OFF / TANTIVY=ON build + # doesn't try to link the non-existent paimon_lucene_index_static. + # Note: we mirror the lucene-fts test's link line (see lucene/CMakeLists.txt) + # rather than using the `jieba` imported target, whose INTERFACE_INCLUDE + # concatenates two paths in one string (upstream quirk). + if(PAIMON_ENABLE_LUCENE) + add_paimon_test(tantivy_tokenizer_test + SOURCES + tantivy_tokenizer_test.cpp + EXTRA_INCLUDES + ${LUCENE_INCLUDE_DIR} + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_lucene_index_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-tokenizer-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}" + PAIMON_TANTIVY_GOLDEN_DIR="${CMAKE_SOURCE_DIR}/test/test_data/tokenizer_golden" + ) + target_include_directories(paimon-tantivy-tokenizer-test SYSTEM + PRIVATE ${JIEBA_INCLUDE_DIR} ${JIEBA_DICT_DIR}) + endif() + + # Stage 4 — Writer test. Builds an Arrow batch, runs the writer through + # GlobalIndexFileManager + LocalFileSystem, then validates the packed + # on-disk format. Reader round-trip lives in Stage 6. + add_paimon_test(tantivy_writer_test + SOURCES + tantivy_writer_test.cpp + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + arrow + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-writer-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}") + + # Stage 6 — Reader + 5 query types end-to-end. 
+ add_paimon_test(tantivy_reader_test + SOURCES + tantivy_reader_test.cpp + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + arrow + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-reader-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}") + + # Stage 7 — limit + pre_filter + scoring. + add_paimon_test(tantivy_filter_limit_test + SOURCES + tantivy_filter_limit_test.cpp + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + arrow + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-filter-limit-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}") + + # Java → C++ cross-read test. Fixture produced by paimon-java's + # `TantivyIndexFixtureGen` (see docs/dev/tantivy_java_cross_read_plan.md) + # and checked in under test/test_data/java_tantivy_fixtures/. + add_paimon_test(tantivy_java_compat_test + SOURCES + tantivy_java_compat_test.cpp + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + arrow + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-java-compat-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}" + PAIMON_TANTIVY_JAVA_FIXTURE_DIR="${CMAKE_SOURCE_DIR}/test/test_data/java_tantivy_fixtures" + PAIMON_TANTIVY_CPP_FIXTURE_DIR="${CMAKE_SOURCE_DIR}/test/test_data/cpp_tantivy_fixtures" + ) + + # K4 — V3 streaming reader + W1 streaming writer integration coverage: + # ParseArchiveHeader fuzz, concurrent query on shared reader, concurrent + # reader create+drop lifecycle, streaming benchmark log. 
+ add_paimon_test(tantivy_streaming_test + SOURCES + tantivy_streaming_test.cpp + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + arrow + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-streaming-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}") + + # Stage 8 — TantivyGlobalIndex + factory + end-to-end integration test. + # `--whole-archive` is required so the static REGISTER_PAIMON_FACTORY + # symbols are not stripped out of the test binary. + add_paimon_test(tantivy_index_test + SOURCES + tantivy_index_test.cpp + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + arrow + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-index-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}") + + # Stage 9 — Cross-implementation coexistence. Links against BOTH the + # lucene and tantivy support static libs to verify they resolve their + # `REGISTER_PAIMON_FACTORY` registrations side by side and don't + # collide on shared symbols. Only built when lucene-fts is enabled. + if(PAIMON_ENABLE_LUCENE) + add_paimon_test(tantivy_lucene_coexist_test + SOURCES + tantivy_lucene_coexist_test.cpp + EXTRA_INCLUDES + ${LUCENE_INCLUDE_DIR} + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_lucene_index_static + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + arrow + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-lucene-coexist-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}") + + # Stage 10 — Equivalence + benchmark. 
Same link line as the coexist + # test (needs both impls); benchmark output goes to stderr. + add_paimon_test(tantivy_equivalence_test + SOURCES + tantivy_equivalence_test.cpp + EXTRA_INCLUDES + ${LUCENE_INCLUDE_DIR} + STATIC_LINK_LIBS + paimon_shared + test_utils_static + "-Wl,--whole-archive" + paimon_local_file_system_static + paimon_lucene_index_static + paimon_tantivy_support_static + "-Wl,--no-whole-archive" + paimon_tantivy_ffi + arrow + glog + fmt + ${GTEST_LINK_TOOLCHAIN}) + target_compile_definitions(paimon-tantivy-equivalence-test + PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}") + endif() +endif() diff --git a/src/paimon/global_index/tantivy/tantivy_archive_layout.cpp b/src/paimon/global_index/tantivy/tantivy_archive_layout.cpp new file mode 100644 index 000000000..1bd75320e --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_archive_layout.cpp @@ -0,0 +1,81 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + */ + +#include "paimon/global_index/tantivy/tantivy_archive_layout.h" + +#include +#include + +#include "fmt/format.h" +#include "paimon/fs/file_system.h" +#include "paimon/io/data_input_stream.h" + +namespace paimon::tantivy { + +namespace { + +/// Wrap the (non-owning) raw InputStream* in a shared_ptr-like handle so +/// DataInputStream — which takes `shared_ptr` — can be used +/// without transferring ownership. We use a no-op deleter to avoid double-free. +struct NoopDeleter { + void operator()(InputStream*) const {} +}; + +} // namespace + +Result ParseArchiveHeader(InputStream* in) { + if (in == nullptr) { + return Status::Invalid("ParseArchiveHeader: null input stream"); + } + + // DataInputStream defaults to BE — matches paimon-java archive format. 
+ std::shared_ptr wrapped(in, NoopDeleter{}); + DataInputStream dis(wrapped); + + PAIMON_RETURN_NOT_OK(dis.Seek(0)); + + PAIMON_ASSIGN_OR_RAISE(int32_t file_count, dis.ReadValue()); + if (file_count < 0) { + return Status::Invalid( + fmt::format("ParseArchiveHeader: negative file_count {}", file_count)); + } + + ArchiveLayout layout; + layout.count = static_cast(file_count); + layout.names.reserve(layout.count); + layout.offsets.reserve(layout.count); + layout.lengths.reserve(layout.count); + + for (int32_t i = 0; i < file_count; ++i) { + PAIMON_ASSIGN_OR_RAISE(int32_t name_len, dis.ReadValue()); + if (name_len <= 0 || name_len > 1 << 20) { + return Status::Invalid( + fmt::format("ParseArchiveHeader: bad name_len {} at entry {}", name_len, i)); + } + std::string name(static_cast(name_len), '\0'); + PAIMON_RETURN_NOT_OK(dis.Read(name.data(), static_cast(name_len))); + + PAIMON_ASSIGN_OR_RAISE(int64_t data_len, dis.ReadValue()); + if (data_len < 0) { + return Status::Invalid( + fmt::format("ParseArchiveHeader: negative data_len {} for '{}'", data_len, name)); + } + + PAIMON_ASSIGN_OR_RAISE(int64_t data_offset, dis.GetPos()); + + layout.names.push_back(std::move(name)); + layout.offsets.push_back(static_cast(data_offset)); + layout.lengths.push_back(static_cast(data_len)); + + // Skip past the payload without reading it. + PAIMON_RETURN_NOT_OK(dis.Seek(data_offset + data_len)); + } + + return layout; +} + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_archive_layout.h b/src/paimon/global_index/tantivy/tantivy_archive_layout.h new file mode 100644 index 000000000..2780dfbb9 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_archive_layout.h @@ -0,0 +1,49 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/result.h" + +namespace paimon { +class InputStream; +} // namespace paimon + +namespace paimon::tantivy { + +/// Parsed layout of a packed tantivy archive. Arrays are parallel; `count` is +/// their common length. +/// +/// Archive byte format (matches paimon-java `TantivyFullTextGlobalIndexReader. +/// parseArchiveHeader`; big-endian, no version header): +/// `[BE i32 file_count | (BE i32 name_len, name_utf8, BE i64 data_len, data)*]` +/// +/// `offsets[i]` is the archive-absolute byte offset of file `i`'s payload +/// (points past the per-entry header). `lengths[i]` is the payload size. +struct ArchiveLayout { + std::vector names; + std::vector offsets; + std::vector lengths; + std::size_t count = 0; +}; + +/// Read the archive header from `in` (seeking past payloads) and return the +/// layout. Does NOT read file payloads — only header bytes (a few KB). +/// +/// `in` must support `Seek` (all production `paimon::InputStream` subclasses +/// do; we call `Seek(cur + data_len)` to skip over each file's payload). +/// +/// On return, `in`'s internal position is at the end of the archive; callers +/// typically don't care (the stream is subsequently read via pread callbacks). +Result ParseArchiveHeader(InputStream* in); + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_defs.h b/src/paimon/global_index/tantivy/tantivy_defs.h new file mode 100644 index 000000000..0824d5148 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_defs.h @@ -0,0 +1,69 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#pragma once + +#include +#include + +namespace paimon::tantivy { + +/// Identifier used by GlobalIndexFileWriter::NewFileName to prefix on-disk +/// filenames. Tantivy and lucene file prefixes intentionally differ so a +/// reader can dispatch the right implementation by filename pattern. +static inline const char kIdentifier[] = "tantivy-fulltext"; + +/// Schema field names — fixed to match paimon-java (decision B1). Callers +/// MUST NOT rename these even though `TantivyGlobalIndexWriter::Create` accepts +/// a `field_name` argument (that argument is used only to extract the correct +/// arrow column; the tantivy schema field name is always `"text"`). +static inline const char kTantivyTextFieldName[] = "text"; +static inline const char kTantivyRowIdFieldName[] = "row_id"; + +/// Option-key prefix consumed by TantivyGlobalIndex (Stage 8). Matches the +/// lucene-fts convention so users can configure both implementations with a +/// uniform "." key style. +static inline const char kOptionKeyPrefix[] = "tantivy-fulltext."; + +/// Buffer size for streaming raw packed bytes from FFI to OutputStream +/// (Writer) and from InputStream into Rust (Reader, Stage 5+). +static inline const int32_t kDefaultReadBufferSize = 1024 * 1024; +/// Read buffer size knob for Stage 6 reader. +static inline const char kTantivyReadBufferSize[] = "read.buffer-size"; + +/// If true, omit term frequencies/positions when indexing (smaller index, but +/// no PhraseQuery support). Default false, mirroring lucene-fts. +static inline const char kTantivyWriteOmitTermFreqAndPositions[] = + "write.omit-term-freq-and-position"; + +/// Env var carrying jieba dictionary directory; consumed by both writer and +/// reader. Same name as lucene-fts: a single env var configures both backends. 
+static inline const char kJiebaDictDirEnv[] = "PAIMON_JIEBA_DICT_DIR"; + +/// Default tokenize mode if not specified in options. +static inline const char kDefaultJiebaTokenizeMode[] = "mix"; +/// Tokenize mode option key. Values: "mp", "mix", "full", "query". +/// "hmm" is rejected with Unsupported (jieba-rs does not expose standalone HMM). +static inline const char kJiebaTokenizeMode[] = "jieba.tokenize-mode"; + +/// Writer-side tokenizer selector. Values: +/// "default" (default) — tantivy built-in SimpleTokenizer; +/// "paimon_jieba" — jieba-rs CJK tokenizer; opt-in for Chinese workloads +/// "whitespace" / "raw" / "en_stem" — other tantivy built-ins +/// The reader side is schema-driven (P-TK) and auto-dispatches to whatever +/// tokenizer name is baked into the archive, so the default here also +/// determines what paimon-java sees when it cross-reads the archive. +static inline const char kTantivyWriteTokenizer[] = "tantivy.write.tokenizer"; +/// Default tokenizer for writer: tantivy built-in "default" (SimpleTokenizer), +/// chosen so paimon-cpp ↔ paimon-java cross-read works out of the box. +/// Chinese workloads must opt into "paimon_jieba" via kTantivyWriteTokenizer. +static inline const char kDefaultTantivyWriteTokenizer[] = "default"; + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_equivalence_test.cpp b/src/paimon/global_index/tantivy/tantivy_equivalence_test.cpp new file mode 100644 index 000000000..63e5db663 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_equivalence_test.cpp @@ -0,0 +1,400 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Stage 10: equivalence + benchmark. + * + * EQUIVALENCE: a parametric corpus × query battery that compares lucene-fts + * and tantivy-fulltext result *sets* (doc_id only — not score order, not score + * values). Coverage targets: + * - English bag-of-words: MATCH_ALL / MATCH_ANY / PHRASE + * - Chinese (jieba "query" mode): MATCH_ALL / MATCH_ANY / PHRASE + * - pre_filter intersection (no scoring) + * PREFIX and WILDCARD are NOT compared as required-equal: tantivy's RegexQuery + * walks a byte-level term dictionary, while lucene's PrefixQuery/WildcardQuery + * walk their own; edge cases (empty input, anchors, multi-byte UTF-8) diverge + * by design. Documented in docs/dev/execute.md Stage 10 decisions. + * + * BENCHMARK: build a 200-doc index per backend and time write + 100 queries. + * Prints to stderr; never fails on perf — guarding against perf regressions + * is out of scope for this stage. Numbers go in execute.md as a baseline. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/api.h" +#include "arrow/type.h" +#include "fmt/format.h" +#include "gtest/gtest.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_path_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/global_index_io_meta.h" +#include "paimon/global_index/global_index_reader.h" +#include "paimon/global_index/global_index_writer.h" +#include "paimon/global_index/global_indexer.h" +#include "paimon/global_index/global_indexer_factory.h" +#include "paimon/global_index/lucene/lucene_defs.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/predicate/full_text_search.h" +#include "paimon/testing/utils/testharness.h" + +#ifndef JIEBA_TEST_DICT_DIR +#error "JIEBA_TEST_DICT_DIR must be set at compile time" +#endif + +namespace paimon::tantivy::test { + +namespace { + +class FakeIndexPathFactory : public IndexPathFactory { + public: + explicit FakeIndexPathFactory(const std::string& root) : root_(root) {} + std::string NewPath() const override { + assert(false); + return ""; + } + std::string ToPath(const std::shared_ptr&) const override { + assert(false); + return ""; + } + std::string ToPath(const std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + bool IsExternalPath() const override { + return false; + } + + private: + std::string root_; +}; + +struct ReaderPair { + std::shared_ptr lucene; + std::shared_ptr tantivy; + std::unique_ptr lucene_root; + std::unique_ptr tantivy_root; +}; + +class TantivyEquivalenceTest : public ::testing::Test { + public: + void SetUp() override { + 
setenv(::paimon::lucene::kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + setenv(::paimon::tantivy::kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + } + + std::unique_ptr<::ArrowSchema> CreateArrowSchema( + const std::shared_ptr& data_type) const { + auto c_schema = std::make_unique<::ArrowSchema>(); + EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok()); + return c_schema; + } + + GlobalIndexIOMeta WriteOne(const std::string& factory_id, + const std::shared_ptr& data_type, + const std::map& options, + const std::shared_ptr& array, + const std::string& root) { + auto indexer_res = GlobalIndexerFactory::Get(factory_id, options); + EXPECT_TRUE(indexer_res.ok()) << indexer_res.status().ToString(); + // NB: std::move(result).value() picks the rvalue overload (returns T&&); + // std::move(result.value()) would call the const T& overload first → no move. + auto indexer = std::move(indexer_res).value(); + auto path_factory = std::make_shared(root); + auto file_writer = std::make_shared(fs_, path_factory); + auto writer_res = + indexer->CreateWriter("f0", CreateArrowSchema(data_type).get(), file_writer, pool_); + EXPECT_TRUE(writer_res.ok()) << writer_res.status().ToString(); + ::ArrowArray c_array; + EXPECT_TRUE(arrow::ExportArray(*array, &c_array).ok()); + std::vector relative_row_ids(array->length()); + for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i; + EXPECT_TRUE(writer_res.value()->AddBatch(&c_array, std::move(relative_row_ids)).ok()); + auto metas_res = writer_res.value()->Finish(); + EXPECT_TRUE(metas_res.ok()) << metas_res.status().ToString(); + return metas_res.value()[0]; + } + + std::shared_ptr OpenOne(const std::string& factory_id, + const std::shared_ptr& data_type, + const std::map& options, + const GlobalIndexIOMeta& meta, + const std::string& root) { + auto indexer = GlobalIndexerFactory::Get(factory_id, options).value(); + auto path_factory = std::make_shared(root); + auto file_reader = 
std::make_shared(fs_, path_factory); + return indexer->CreateReader(CreateArrowSchema(data_type).get(), file_reader, {meta}, pool_) + .value(); + } + + /// Build BOTH lucene + tantivy indexes for the same corpus + options. + /// Returns an opened-reader pair plus owning UniqueTestDirectory handles. + ReaderPair WriteAndOpenBoth(const std::shared_ptr& data_type, + const std::shared_ptr& array, + std::map lucene_opts, + const std::map& tantivy_opts) { + auto lroot = paimon::test::UniqueTestDirectory::Create(); + auto troot = paimon::test::UniqueTestDirectory::Create(); + EXPECT_TRUE(lroot && troot); + // lucene requires a tmp directory option; reuse lroot if caller didn't set one. + lucene_opts.emplace("lucene-fts.write.tmp.directory", lroot->Str()); + auto lmeta = WriteOne("lucene-fts", data_type, lucene_opts, array, lroot->Str()); + auto tmeta = WriteOne("tantivy-fulltext", data_type, tantivy_opts, array, troot->Str()); + ReaderPair p; + p.lucene = OpenOne("lucene-fts", data_type, lucene_opts, lmeta, lroot->Str()); + p.tantivy = OpenOne("tantivy-fulltext", data_type, tantivy_opts, tmeta, troot->Str()); + p.lucene_root = std::move(lroot); + p.tantivy_root = std::move(troot); + return p; + } + + static std::set Ids(const std::shared_ptr& result) { + const RoaringBitmap64* bitmap = nullptr; + Result br = Status::Invalid("none"); + if (auto scored = std::dynamic_pointer_cast(result)) { + br = scored->GetBitmap(); + } else if (auto plain = std::dynamic_pointer_cast(result)) { + br = plain->GetBitmap(); + } + EXPECT_TRUE(br.ok()) << br.status().ToString(); + bitmap = br.value(); + std::set out; + if (bitmap) { + for (auto it = bitmap->Begin(); it != bitmap->End(); ++it) { + out.insert(static_cast(*it)); + } + } + return out; + } + + /// Run a single FullTextSearch through both readers, return (lucene, tantivy) + /// doc id sets. 
+ std::pair, std::set> RunPair( + const ReaderPair& p, const std::string& q, FullTextSearch::SearchType t, + std::optional limit = std::nullopt, + std::optional filter = std::nullopt) { + auto lr = p.lucene->VisitFullTextSearch( + std::make_shared("f0", limit, q, t, filter)); + auto tr = p.tantivy->VisitFullTextSearch( + std::make_shared("f0", limit, q, t, filter)); + EXPECT_TRUE(lr.ok()) << "lucene: " << lr.status().ToString(); + EXPECT_TRUE(tr.ok()) << "tantivy: " << tr.status().ToString(); + return {Ids(lr.value()), Ids(tr.value())}; + } + + protected: + std::shared_ptr pool_ = GetDefaultPool(); + std::shared_ptr fs_ = std::make_shared(); +}; + +} // namespace + +TEST_F(TantivyEquivalenceTest, EnglishBagOfWordsBattery) { + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([ + ["alpha beta gamma delta"], + ["alpha alpha alpha beta"], + ["beta gamma delta epsilon"], + ["zeta eta theta iota"], + ["alpha gamma epsilon iota"], + ["lone outlier word here"], + ["alpha beta gamma alpha beta"], + ["delta epsilon zeta eta theta"], + ["nothing matches this row"], + ["alpha"] + ])") + .ValueOrDie(); + auto pair = WriteAndOpenBoth(data_type, array, {}, {}); + + struct Case { + std::string query; + FullTextSearch::SearchType type; + }; + std::vector cases = { + {"alpha", FullTextSearch::SearchType::MATCH_ALL}, + {"alpha", FullTextSearch::SearchType::MATCH_ANY}, + {"alpha beta", FullTextSearch::SearchType::MATCH_ALL}, + {"alpha beta", FullTextSearch::SearchType::MATCH_ANY}, + {"alpha gamma delta", FullTextSearch::SearchType::MATCH_ALL}, + {"alpha gamma delta", FullTextSearch::SearchType::MATCH_ANY}, + {"epsilon iota", FullTextSearch::SearchType::MATCH_ALL}, + {"alpha beta gamma", FullTextSearch::SearchType::PHRASE}, + {"beta gamma delta", FullTextSearch::SearchType::PHRASE}, + {"delta epsilon", FullTextSearch::SearchType::PHRASE}, + }; + for (const auto& c : cases) { + auto [l, t] = 
RunPair(pair, c.query, c.type); + EXPECT_EQ(l, t) << "diverge: query=" << c.query << " type=" << static_cast(c.type); + } +} + +TEST_F(TantivyEquivalenceTest, ChineseQueryModeBattery) { + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([ +["智能助手 AI 模块 开发"], +["智能助手 在 Python 开发 中"], +["AI 助手 开发 框架"], +["智能 模块 技术 实现"], +["发展方向 是 智能 助手"] + ])") + .ValueOrDie(); + std::map lopts = {{"lucene-fts.jieba.tokenize-mode", "query"}}; + std::map topts = { + {"tantivy-fulltext.tantivy.write.tokenizer", "paimon_jieba"}, + {"tantivy-fulltext.jieba.tokenize-mode", "query"}, + }; + auto pair = WriteAndOpenBoth(data_type, array, lopts, topts); + + struct Case { + std::string query; + FullTextSearch::SearchType type; + }; + // Note: jieba is shared (same dictionary), so tokenization should agree + // for plain Chinese text. Differences (if any) come from the lowercase / + // stopword normalization step — tested with neutral CJK terms below. 
+ std::vector cases = { + {"智能", FullTextSearch::SearchType::MATCH_ALL}, + {"智能 助手", FullTextSearch::SearchType::MATCH_ALL}, + {"模块", FullTextSearch::SearchType::MATCH_ANY}, + {"发展方向", FullTextSearch::SearchType::PHRASE}, + }; + for (const auto& c : cases) { + auto [l, t] = RunPair(pair, c.query, c.type); + EXPECT_EQ(l, t) << "diverge: query=" << c.query << " type=" << static_cast(c.type); + } +} + +TEST_F(TantivyEquivalenceTest, PreFilterIntersectionEquivalent) { + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([ + ["alpha beta"], + ["alpha gamma"], + ["alpha delta"], + ["beta gamma"], + ["beta delta"] + ])") + .ValueOrDie(); + auto pair = WriteAndOpenBoth(data_type, array, {}, {}); + + auto pf = RoaringBitmap64::From({0l, 2l, 4l}); + { + auto [l, t] = + RunPair(pair, "alpha", FullTextSearch::SearchType::MATCH_ALL, std::nullopt, pf); + EXPECT_EQ(l, t); + EXPECT_EQ(l, (std::set{0, 2})); + } + { + auto [l, t] = + RunPair(pair, "beta gamma", FullTextSearch::SearchType::MATCH_ANY, std::nullopt, pf); + EXPECT_EQ(l, t); + } + { + auto empty = RoaringBitmap64(); + auto [l, t] = + RunPair(pair, "alpha", FullTextSearch::SearchType::MATCH_ALL, std::nullopt, empty); + EXPECT_EQ(l, t); + EXPECT_TRUE(l.empty()); + } +} + +TEST_F(TantivyEquivalenceTest, BenchmarkBuildAndQuery) { + // Build a synthetic 200-doc corpus and time write + 100 random queries. + // This is a reportable baseline, NOT a perf gate — assertions only check + // semantic correctness (each query returns >= 0 docs without erroring). 
+ constexpr int kDocCount = 200; + constexpr int kQueryCount = 100; + std::vector vocab = {"alpha", "beta", "gamma", "delta", "epsilon", + "zeta", "eta", "theta", "iota", "kappa", + "lambda", "mu", "nu", "xi", "omicron"}; + std::mt19937 rng(0xC0DE); + std::uniform_int_distribution word_pick(0, vocab.size() - 1); + std::uniform_int_distribution word_count(3, 12); + + // Build the corpus as a JSON Arrow array. + std::string json = "["; + for (int i = 0; i < kDocCount; ++i) { + json += "[\""; + int n = word_count(rng); + for (int w = 0; w < n; ++w) { + if (w > 0) json += ' '; + json += vocab[word_pick(rng)]; + } + json += "\"]"; + if (i + 1 < kDocCount) json += ","; + } + json += "]"; + + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, json).ValueOrDie(); + + auto time_ms = [](auto&& fn) { + auto t0 = std::chrono::steady_clock::now(); + fn(); + auto t1 = std::chrono::steady_clock::now(); + return std::chrono::duration_cast(t1 - t0).count(); + }; + + // -------- Lucene: write + open + queries -------- + auto lroot = paimon::test::UniqueTestDirectory::Create(); + std::map lopt = {{"lucene-fts.write.tmp.directory", lroot->Str()}}; + GlobalIndexIOMeta lmeta{"", 0, nullptr}; + auto lwrite_ms = + time_ms([&] { lmeta = WriteOne("lucene-fts", data_type, lopt, array, lroot->Str()); }); + auto lreader = OpenOne("lucene-fts", data_type, lopt, lmeta, lroot->Str()); + + auto lquery_ms = time_ms([&] { + for (int i = 0; i < kQueryCount; ++i) { + const std::string& w = vocab[word_pick(rng)]; + auto r = lreader->VisitFullTextSearch(std::make_shared( + "f0", std::nullopt, w, FullTextSearch::SearchType::MATCH_ALL, std::nullopt)); + EXPECT_TRUE(r.ok()); + } + }); + + // -------- Tantivy: write + open + queries -------- + auto troot = paimon::test::UniqueTestDirectory::Create(); + GlobalIndexIOMeta tmeta{"", 0, nullptr}; + auto twrite_ms = + time_ms([&] { tmeta = WriteOne("tantivy-fulltext", 
data_type, {}, array, troot->Str()); }); + auto treader = OpenOne("tantivy-fulltext", data_type, {}, tmeta, troot->Str()); + + auto tquery_ms = time_ms([&] { + for (int i = 0; i < kQueryCount; ++i) { + const std::string& w = vocab[word_pick(rng)]; + auto r = treader->VisitFullTextSearch(std::make_shared( + "f0", std::nullopt, w, FullTextSearch::SearchType::MATCH_ALL, std::nullopt)); + EXPECT_TRUE(r.ok()); + } + }); + + std::cerr << fmt::format( + "[STAGE10-BENCH docs={} queries={}] lucene_write={}ms lucene_query={}ms" + " tantivy_write={}ms tantivy_query={}ms file_size_lucene={} file_size_tantivy={}\n", + kDocCount, kQueryCount, lwrite_ms, lquery_ms, twrite_ms, tquery_ms, lmeta.file_size, + tmeta.file_size); + SUCCEED() << "benchmark prints to stderr"; +} + +} // namespace paimon::tantivy::test diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_handle.h b/src/paimon/global_index/tantivy/tantivy_ffi_handle.h new file mode 100644 index 000000000..b4d4e51cf --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_ffi_handle.h @@ -0,0 +1,113 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * RAII wrappers for opaque FFI handles returned by paimon_tantivy_ffi. + * See docs/dev/tantivy_ffi_design.md §3 Category A. 
+ */ +#pragma once + +#include +#include +#include + +extern "C" { +#include "paimon_tantivy_ffi.h" // NOLINT(build/include_subdir) +} + +namespace paimon::tantivy { + +/// Deleter template; specialize per handle type with the matching free function. +/// Usage: +/// template <> struct FfiDeleter { +/// void operator()(paimon_tantivy_writer_t* p) const noexcept { +/// paimon_tantivy_writer_free(p); +/// } +/// }; +/// using WriterPtr = FfiUniquePtr; +template +struct FfiDeleter { + // Default unsupported so missing specializations fail at compile time + void operator()(Handle*) const noexcept { + static_assert(sizeof(Handle) == 0, "FfiDeleter must be specialized for this handle type"); + } +}; + +/// Generic RAII owning pointer for an FFI handle. +template +using FfiUniquePtr = std::unique_ptr>; + +/// Tokenizer handle (Stage 3). +template <> +struct FfiDeleter { + void operator()(PaimonJiebaTokenizer* p) const noexcept { + paimon_tantivy_tokenizer_free(p); + } +}; +using JiebaTokenizerPtr = FfiUniquePtr; + +/// Writer handle (Stage 4). +template <> +struct FfiDeleter { + void operator()(PaimonTantivyWriter* p) const noexcept { + paimon_tantivy_writer_free(p); + } +}; +using WriterPtr = FfiUniquePtr; + +/// Reader handle (Stage 6). +template <> +struct FfiDeleter { + void operator()(PaimonTantivyReader* p) const noexcept { + paimon_tantivy_reader_free(p); + } +}; +using ReaderPtr = FfiUniquePtr; + +/// Specialization: buffer_t is special - not an opaque handle but a value +/// struct owned on the stack. The contained `data` pointer is the Rust-owned +/// allocation; we call `paimon_tantivy_buffer_free` on the struct pointer. +/// Use BufferGuard to ensure free-on-scope-exit even on early return. 
+class BufferGuard { + public: + BufferGuard() noexcept { + buf_.data = nullptr; + buf_.len = 0; + buf_.capacity = 0; + } + BufferGuard(const BufferGuard&) = delete; + BufferGuard& operator=(const BufferGuard&) = delete; + BufferGuard(BufferGuard&&) = delete; + BufferGuard& operator=(BufferGuard&&) = delete; + + ~BufferGuard() noexcept { + paimon_tantivy_buffer_free(&buf_); + } + + PaimonTantivyBuffer* out() noexcept { + return &buf_; + } + + const uint8_t* data() const noexcept { + return buf_.data; + } + std::size_t size() const noexcept { + return buf_.len; + } + + private: + PaimonTantivyBuffer buf_{}; +}; + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_log.cpp b/src/paimon/global_index/tantivy/tantivy_ffi_log.cpp new file mode 100644 index 000000000..77d7420cb --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_ffi_log.cpp @@ -0,0 +1,67 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/global_index/tantivy/tantivy_ffi_log.h" + +#include +#include + +#include "glog/logging.h" + +extern "C" { +#include "paimon_tantivy_ffi.h" // NOLINT(build/include_subdir) +} + +namespace paimon::tantivy { +namespace { + +/// Level mapping matches Rust side (0=trace..4=error). +extern "C" void PaimonTantivyLogAdapter(int32_t level, const char* msg, std::size_t len) { + // msg is NOT null-terminated; slice with len. 
+ std::string s(msg, len); + switch (level) { + case 4: + LOG(ERROR) << "[tantivy] " << s; + break; + case 3: + LOG(WARNING) << "[tantivy] " << s; + break; + case 2: + LOG(INFO) << "[tantivy] " << s; + break; + case 1: + VLOG(1) << "[tantivy] " << s; + break; + case 0: + VLOG(2) << "[tantivy] " << s; + break; + default: + LOG(INFO) << "[tantivy:lvl=" << level << "] " << s; + break; + } +} + +} // namespace + +void InstallTantivyLogBridge() { + paimon_tantivy_set_log_callback(&PaimonTantivyLogAdapter); +} + +void UninstallTantivyLogBridge() { + paimon_tantivy_clear_log_callback(); +} + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_log.h b/src/paimon/global_index/tantivy/tantivy_ffi_log.h new file mode 100644 index 000000000..42ddcbdde --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_ffi_log.h @@ -0,0 +1,33 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Bridge tantivy (Rust) logs into paimon's logger. + * See docs/dev/tantivy_ffi_design.md §7. + * + * Registered once at TantivyGlobalIndexFactory static-init time. + */ +#pragma once + +namespace paimon::tantivy { + +/// Install the Rust -> C++ log callback. Idempotent; only the last caller's +/// callback is active. Threading: C callback runs on tantivy worker threads; +/// our adapter must be thread-safe (it routes to glog which is). 
+void InstallTantivyLogBridge(); + +/// Uninstall (revert to Rust stderr). Mostly useful for tests. +void UninstallTantivyLogBridge(); + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_status.h b/src/paimon/global_index/tantivy/tantivy_ffi_status.h new file mode 100644 index 000000000..8c64d839f --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_ffi_status.h @@ -0,0 +1,92 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Translation layer: PaimonTantivyStatus -> paimon::Status. + * See docs/dev/tantivy_ffi_design.md §2. + */ +#pragma once + +#include "fmt/format.h" +#include "paimon/status.h" + +extern "C" { +#include "paimon_tantivy_ffi.h" // NOLINT(build/include_subdir) +} + +namespace paimon::tantivy { + +/// Translate an FFI status code to a paimon::Status. OK returns Status::OK(). +/// On error, the returned Status carries the thread-local last_error() text +/// prefixed with the status code name for easier grep. +/// +/// Note: cbindgen emits `PaimonTantivyStatus` in the **global** namespace as +/// a C-style (unscoped) enum, so we accept it via its global type here. Its +/// enumerators are likewise global, so call sites can name them unqualified. 
+inline Status FfiStatusToStatus(::PaimonTantivyStatus code) {
+    if (code == PAIMON_TANTIVY_STATUS_OK) {
+        return Status::OK();
+    }
+    const char* err = paimon_tantivy_last_error();  // null is tolerated when formatting below
+    const char* name = [code]() -> const char* {  // short code name used in the message prefix
+        switch (code) {
+            case PAIMON_TANTIVY_STATUS_INVALID_ARGUMENT:
+                return "InvalidArgument";
+            case PAIMON_TANTIVY_STATUS_NOT_FOUND:
+                return "NotFound";
+            case PAIMON_TANTIVY_STATUS_IO_ERROR:
+                return "IoError";
+            case PAIMON_TANTIVY_STATUS_UNSUPPORTED:
+                return "Unsupported";
+            case PAIMON_TANTIVY_STATUS_TOKENIZER_ERROR:
+                return "TokenizerError";
+            case PAIMON_TANTIVY_STATUS_QUERY_PARSE_ERROR:
+                return "QueryParseError";
+            case PAIMON_TANTIVY_STATUS_INDEX_FORMAT_ERROR:
+                return "IndexFormatError";
+            case PAIMON_TANTIVY_STATUS_INTERNAL_ERROR:
+                return "InternalError";
+            default:
+                return "UnknownFfiStatus";
+        }
+    }();
+    std::string msg =  // layout: tantivy-ffi[Name(numeric code)]: error text
+        fmt::format("tantivy-ffi[{}({})]: {}", name, static_cast<int>(code), err ? err : "(null)");
+    switch (code) {  // pick the paimon::Status category for this code
+        case PAIMON_TANTIVY_STATUS_NOT_FOUND:
+            return Status::NotExist(msg);
+        case PAIMON_TANTIVY_STATUS_IO_ERROR:
+            return Status::IOError(msg);
+        case PAIMON_TANTIVY_STATUS_UNSUPPORTED:
+            return Status::NotImplemented(msg);
+        case PAIMON_TANTIVY_STATUS_INVALID_ARGUMENT:
+        case PAIMON_TANTIVY_STATUS_TOKENIZER_ERROR:
+        case PAIMON_TANTIVY_STATUS_QUERY_PARSE_ERROR:
+        case PAIMON_TANTIVY_STATUS_INDEX_FORMAT_ERROR:
+            return Status::Invalid(msg);
+        default:  // INTERNAL_ERROR and any unrecognized code
+            return Status::UnknownError(msg);
+    }
+}
+
+/// Like PAIMON_RETURN_NOT_OK but for FFI calls returning PaimonTantivyStatus.
+#define PAIMON_TANTIVY_RETURN_NOT_OK(expr) \ + do { \ + ::PaimonTantivyStatus _paimon_tantivy_status_ = (expr); \ + if (_paimon_tantivy_status_ != PAIMON_TANTIVY_STATUS_OK) { \ + return ::paimon::tantivy::FfiStatusToStatus(_paimon_tantivy_status_); \ + } \ + } while (0) + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_test.cpp b/src/paimon/global_index/tantivy/tantivy_ffi_test.cpp new file mode 100644 index 000000000..9c7d28f6c --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_ffi_test.cpp @@ -0,0 +1,138 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Stage 2: FFI common layer tests — error/buffer/log behave as documented. + * Does NOT build on real index yet (that's Stage 4+). 
+ */ + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "paimon/global_index/tantivy/tantivy_ffi_handle.h" +#include "paimon/global_index/tantivy/tantivy_ffi_log.h" +#include "paimon/global_index/tantivy/tantivy_ffi_status.h" + +extern "C" { +#include "paimon_tantivy_ffi.h" // NOLINT(build/include_subdir) +} + +namespace paimon::tantivy { + +// ------------------------- last_error contract ------------------------- + +TEST(TantivyFfiError, LastErrorIsNeverNull) { + // Before anything, last_error should be a valid non-null pointer to "" + const char* ptr = paimon_tantivy_last_error(); + ASSERT_NE(ptr, nullptr); + // Content is thread-local; for freshly-spawned thread it must be empty + std::atomic child_ok{false}; + std::thread t([&]() { + const char* p = paimon_tantivy_last_error(); + child_ok.store(p != nullptr && p[0] == '\0'); + }); + t.join(); + EXPECT_TRUE(child_ok.load()); +} + +// ------------------------- status translation ------------------------- + +TEST(TantivyFfiStatus, OkTranslates) { + Status s = FfiStatusToStatus(PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_OK); + EXPECT_TRUE(s.ok()) << s.ToString(); +} + +TEST(TantivyFfiStatus, ErrorCodeNamesShowUp) { + // Translate a few codes and ensure the name appears in the string form. 
+ struct Case { + PaimonTantivyStatus code; + const char* expected_substr; + }; + const Case cases[] = { + {PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_INVALID_ARGUMENT, "InvalidArgument"}, + {PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_NOT_FOUND, "NotFound"}, + {PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_IO_ERROR, "IoError"}, + {PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_UNSUPPORTED, "Unsupported"}, + {PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_TOKENIZER_ERROR, "TokenizerError"}, + }; + for (const auto& c : cases) { + Status s = FfiStatusToStatus(c.code); + EXPECT_FALSE(s.ok()); + EXPECT_NE(s.ToString().find(c.expected_substr), std::string::npos) + << "got: " << s.ToString(); + } +} + +// ------------------------- buffer lifetime ------------------------- + +TEST(TantivyFfiBuffer, EmptyBufferGuard) { + BufferGuard g; + EXPECT_EQ(g.size(), 0u); + EXPECT_EQ(g.data(), nullptr); + // Destructor must accept empty buffer +} + +// ------------------------- handle stress ------------------------- + +// Sanity stress: create/destroy a dummy "handle" via into_handle/free_handle. +// Since the Rust side doesn't yet export writer/reader, we stress via a +// temporary wrapping of the buffer API: alloc buffers repeatedly, ensure no +// crash (LSAN / ASAN would catch leaks). +TEST(TantivyFfiBuffer, StressAllocFree) { + for (int i = 0; i < 1000; ++i) { + BufferGuard g; + // We don't have a way to populate the buffer from C++ in Stage 2; + // this just exercises empty construction + destruction path. 
+ (void)g; + } +} + +// ------------------------- log bridge ------------------------- + +namespace { +std::atomic g_log_count{0}; +extern "C" void CountingLogCb(int32_t /*level*/, const char* /*msg*/, std::size_t /*len*/) { + g_log_count.fetch_add(1, std::memory_order_relaxed); +} +} // namespace + +TEST(TantivyFfiLog, SetCallbackIsIdempotent) { + g_log_count.store(0); + paimon_tantivy_set_log_callback(&CountingLogCb); + paimon_tantivy_set_log_callback(&CountingLogCb); + paimon_tantivy_clear_log_callback(); + // Should not crash even though called multiple times (idempotent install) + SUCCEED(); +} + +TEST(TantivyFfiLog, InstallBridgeThenUninstall) { + // Bridge to glog; must not crash. + InstallTantivyLogBridge(); + UninstallTantivyLogBridge(); + SUCCEED(); +} + +// ------------------------- version still works ------------------------- + +TEST(TantivyFfi, VersionReachable) { + const char* v = paimon_tantivy_version(); + ASSERT_NE(v, nullptr); + EXPECT_GT(std::strlen(v), 0u); +} + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_filter_limit_test.cpp b/src/paimon/global_index/tantivy/tantivy_filter_limit_test.cpp new file mode 100644 index 000000000..a11320b26 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_filter_limit_test.cpp @@ -0,0 +1,400 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Stage 7 test: cover the limit + pre_filter + scoring pathway. 
Uses the same + * write→read flow as paimon-tantivy-reader-test, but verifies that: + * - A `limit` produces a `BitmapScoredGlobalIndexResult` with non-empty + * scores ordered such that bitmap iteration order aligns with the score + * vector (paimon convention: doc-id-asc bitmap, parallel score vector). + * - A `pre_filter` excludes non-member rows even when they would otherwise + * dominate the top-N by score. + * - Combining both produces the intersection, with limit applied AFTER + * filtering (matches lucene-fts behavior). + */ + +#include +#include + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/api.h" +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_path_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/global_index/tantivy/tantivy_global_index_reader.h" +#include "paimon/global_index/tantivy/tantivy_global_index_writer.h" +#include "paimon/testing/utils/testharness.h" + +#ifndef JIEBA_TEST_DICT_DIR +#error "JIEBA_TEST_DICT_DIR must be set at compile time" +#endif + +namespace paimon::tantivy::test { + +namespace { + +class FakeIndexPathFactory : public IndexPathFactory { + public: + explicit FakeIndexPathFactory(const std::string& root) : root_(root) {} + std::string NewPath() const override { + assert(false); + return ""; + } + std::string ToPath(const std::shared_ptr&) const override { + assert(false); + return ""; + } + std::string ToPath(const std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + bool IsExternalPath() const override { + return false; + } + + private: + std::string root_; +}; + +class TantivyFilterLimitTest : public 
::testing::Test { + public: + void SetUp() override { + setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + } + + std::pair, GlobalIndexIOMeta> WriteAndOpen( + const std::shared_ptr& array, + const std::map& options) { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + EXPECT_TRUE(root_dir); + std::string root = root_dir->Str(); + kept_dirs_.push_back(std::move(root_dir)); + auto path_factory = std::make_shared(root); + auto fm = std::make_shared(fs_, path_factory); + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto writer_res = + TantivyGlobalIndexWriter::Create("f0", data_type, fm, options, GetDefaultPool()); + EXPECT_TRUE(writer_res.ok()) << writer_res.status().ToString(); + auto writer = writer_res.value(); + ::ArrowArray c_array; + EXPECT_TRUE(arrow::ExportArray(*array, &c_array).ok()); + std::vector relative_row_ids(array->length()); + for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i; + EXPECT_TRUE(writer->AddBatch(&c_array, std::move(relative_row_ids)).ok()); + auto metas_res = writer->Finish(); + EXPECT_TRUE(metas_res.ok()) << metas_res.status().ToString(); + return {fm, metas_res.value()[0]}; + } + + static std::vector BitmapToVec(const RoaringBitmap64& b) { + std::vector ids; + for (auto it = b.Begin(); it != b.End(); ++it) { + ids.push_back(static_cast(*it)); + } + std::sort(ids.begin(), ids.end()); + return ids; + } + + std::shared_ptr DataType() const { + return arrow::struct_({arrow::field("f0", arrow::utf8())}); + } + + protected: + std::shared_ptr fs_ = std::make_shared(); + std::vector> kept_dirs_; +}; + +} // namespace + +TEST_F(TantivyFilterLimitTest, LimitProducesScoredResultTopN) { + // Three docs with very different term frequencies for "doc"; limit=2 must + // pick the top 2 by score (doc 1 highest, then doc 2). 
+ auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc"], + ["doc doc doc doc doc"], + ["doc doc"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + auto fts = std::make_shared("f0", /*limit=*/2, "doc", + FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt); + fts->with_score = true; // v0.2: explicit score opt-in + auto res = reader->VisitFullTextSearch(fts); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + auto scored = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(scored) << "expected BitmapScoredGlobalIndexResult"; + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, scored->GetBitmap()); + auto ids = BitmapToVec(*bitmap); + EXPECT_EQ(ids, (std::vector{1, 2})); + EXPECT_EQ(scored->GetScores().size(), 2u); + // Per-doc scores must be > 0 and present in iteration (doc-id) order. + for (auto s : scored->GetScores()) { + EXPECT_GT(s, 0.0f); + } +} + +TEST_F(TantivyFilterLimitTest, NoLimitReturnsBitmapResult) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc"], ["doc doc"], ["other"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + auto res = reader->VisitFullTextSearch(std::make_shared( + "f0", /*limit=*/std::nullopt, "doc", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt)); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + // No limit ⇒ NOT a BitmapScoredGlobalIndexResult; just BitmapGlobalIndexResult. 
+ EXPECT_FALSE(std::dynamic_pointer_cast(res.value())); + auto plain = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(plain); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, plain->GetBitmap()); + EXPECT_EQ(BitmapToVec(*bitmap), (std::vector{0, 1})); +} + +TEST_F(TantivyFilterLimitTest, PreFilterIntersectsWithoutLimit) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["alpha"], ["alpha"], ["alpha"], ["beta"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + auto res = reader->VisitFullTextSearch(std::make_shared( + "f0", /*limit=*/std::nullopt, "alpha", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({0l, 2l, 100l}))); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + auto plain = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(plain); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, plain->GetBitmap()); + EXPECT_EQ(BitmapToVec(*bitmap), (std::vector{0, 2})); +} + +TEST_F(TantivyFilterLimitTest, PreFilterAppliedBeforeLimit) { + // doc 0 has highest score for "doc" but is excluded by pre_filter; the + // result must contain doc 1 only, even with limit=10. 
+ auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc doc doc doc doc"], + ["doc doc"], + ["doc"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + auto fts = std::make_shared("f0", /*limit=*/10, "doc", + FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({1l})); + fts->with_score = true; // v0.2: explicit score opt-in + auto res = reader->VisitFullTextSearch(fts); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + auto scored = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(scored); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, scored->GetBitmap()); + EXPECT_EQ(BitmapToVec(*bitmap), (std::vector{1})); + EXPECT_EQ(scored->GetScores().size(), 1u); +} + +TEST_F(TantivyFilterLimitTest, EmptyPreFilterReturnsEmpty) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["alpha"], ["beta"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + RoaringBitmap64 empty; // explicitly empty + auto res = reader->VisitFullTextSearch(std::make_shared( + "f0", /*limit=*/std::nullopt, "alpha", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/empty)); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + auto plain = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(plain); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, plain->GetBitmap()); + EXPECT_TRUE(bitmap->IsEmpty()); +} + +TEST_F(TantivyFilterLimitTest, LimitGreaterThanMatchesReturnsAll) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc"], ["doc doc"], ["other"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, 
GetDefaultPool())); + auto fts = std::make_shared("f0", /*limit=*/100, "doc", + FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt); + fts->with_score = true; // v0.2: explicit score opt-in + auto res = reader->VisitFullTextSearch(fts); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + auto scored = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(scored); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, scored->GetBitmap()); + EXPECT_EQ(BitmapToVec(*bitmap), (std::vector{0, 1})); + EXPECT_EQ(scored->GetScores().size(), 2u); +} + +// =========================================================================== +// v0.2: with_score × limit 4-path matrix guards +// =========================================================================== +// Decouple with_score from limit. The four combinations must each map to the +// correct concrete result type and content. See docs/dev/tantivy_bm25_score_contract.md §4. + +// Path A: with_score=false, limit=None → BitmapGlobalIndexResult, all rows, no score. +TEST_F(TantivyFilterLimitTest, WithScoreFalseLimitNone_AllRowsNoScore) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc"], ["doc doc"], ["doc doc doc"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + auto fts = std::make_shared("f0", /*limit=*/std::nullopt, "doc", + FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt); + fts->with_score = false; + auto res = reader->VisitFullTextSearch(fts); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + // Must NOT be scored. 
+ EXPECT_FALSE(std::dynamic_pointer_cast(res.value())); + auto plain = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(plain); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, plain->GetBitmap()); + EXPECT_EQ(BitmapToVec(*bitmap), (std::vector{0, 1, 2})); +} + +// Path B: with_score=false, limit=N → BitmapGlobalIndexResult, any N matches, +// no scoring (no BM25 sort). Used by `WHERE MATCH ... LIMIT N` without ORDER BY. +TEST_F(TantivyFilterLimitTest, WithScoreFalseLimitN_AnyNNoScore) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc"], + ["doc doc doc doc doc"], + ["doc doc"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + auto fts = std::make_shared("f0", /*limit=*/2, "doc", + FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt); + fts->with_score = false; + auto res = reader->VisitFullTextSearch(fts); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + // Must NOT be scored. + EXPECT_FALSE(std::dynamic_pointer_cast(res.value())); + auto plain = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(plain); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, plain->GetBitmap()); + // Only cardinality matters — selection order is arbitrary and depends on + // tantivy's posting iteration; the two returned row_ids must each be one + // of the three input docs. + EXPECT_EQ(bitmap->Cardinality(), 2u); + auto vec = BitmapToVec(*bitmap); + for (auto id : vec) { + EXPECT_TRUE(id == 0 || id == 1 || id == 2); + } +} + +// Path C (new in v0.2): with_score=true, limit=None → BitmapScoredGlobalIndexResult, +// all rows + all scores, ordered by row_id asc. 
+TEST_F(TantivyFilterLimitTest, WithScoreTrueLimitNone_AllRowsWithScore) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc"], ["doc doc"], ["doc doc doc"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + auto fts = std::make_shared("f0", /*limit=*/std::nullopt, "doc", + FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt); + fts->with_score = true; + auto res = reader->VisitFullTextSearch(fts); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + auto scored = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(scored) << "with_score=true must produce BitmapScoredGlobalIndexResult"; + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, scored->GetBitmap()); + EXPECT_EQ(BitmapToVec(*bitmap), (std::vector{0, 1, 2})); + // All 3 docs have scores; sizes must match. + EXPECT_EQ(scored->GetScores().size(), 3u); + for (auto s : scored->GetScores()) { + EXPECT_GT(s, 0.0f); + } +} + +// Path D: with_score=true, limit=N → BitmapScoredGlobalIndexResult, top-N with scores. +// Equivalent to the v0.1 happy-path (LimitProducesScoredResultTopN), kept here +// as an explicit anchor of the 4-path matrix. 
+TEST_F(TantivyFilterLimitTest, WithScoreTrueLimitN_TopNWithScore) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc"], + ["doc doc doc doc doc"], + ["doc doc"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + auto fts = std::make_shared("f0", /*limit=*/2, "doc", + FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt); + fts->with_score = true; + auto res = reader->VisitFullTextSearch(fts); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + auto scored = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(scored); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, scored->GetBitmap()); + EXPECT_EQ(bitmap->Cardinality(), 2u); + EXPECT_TRUE(bitmap->Contains(1)); // highest TF must be included + EXPECT_EQ(scored->GetScores().size(), 2u); +} + +// Migration guard: when caller omits `with_score`, the default is `false` — +// even with limit set, the result is a BitmapGlobalIndexResult (NOT scored). +// This catches v0.1 callers that relied on `limit >= 0` to implicitly get scores. +TEST_F(TantivyFilterLimitTest, WithScoreDefaultIsFalse) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["doc"], ["doc doc"], ["doc doc doc"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + // Note: NOT setting fts->with_score; relying on the default value. + auto fts = std::make_shared("f0", /*limit=*/2, "doc", + FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt); + auto res = reader->VisitFullTextSearch(fts); + ASSERT_TRUE(res.ok()) << res.status().ToString(); + // v0.2 contract: with_score defaults to false, so even with limit set the + // result is BitmapGlobalIndexResult (NOT BitmapScoredGlobalIndexResult). 
+ EXPECT_FALSE(std::dynamic_pointer_cast(res.value())) + << "v0.2: limit alone must NOT imply scoring; with_score=true is required"; + auto plain = std::dynamic_pointer_cast(res.value()); + ASSERT_TRUE(plain); +} + +} // namespace paimon::tantivy::test diff --git a/src/paimon/global_index/tantivy/tantivy_global_index.cpp b/src/paimon/global_index/tantivy/tantivy_global_index.cpp new file mode 100644 index 000000000..2eb0d1f79 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_global_index.cpp @@ -0,0 +1,71 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#include "paimon/global_index/tantivy/tantivy_global_index.h" + +#include "arrow/c/bridge.h" +#include "fmt/format.h" +#include "paimon/common/utils/options_utils.h" +#include "paimon/global_index/tantivy/tantivy_global_index_reader.h" +#include "paimon/global_index/tantivy/tantivy_global_index_writer.h" + +namespace paimon::tantivy { + +#define CHECK_NOT_NULL(pointer, error_msg) \ + do { \ + if (!(pointer)) { \ + return Status::Invalid(error_msg); \ + } \ + } while (0) + +TantivyGlobalIndex::TantivyGlobalIndex(const std::map& options) + : options_(OptionsUtils::FetchOptionsWithPrefix(kOptionKeyPrefix, options)) {} + +Result> TantivyGlobalIndex::CreateWriter( + const std::string& field_name, ::ArrowSchema* arrow_schema, + const std::shared_ptr& file_writer, + const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_type, + arrow::ImportType(arrow_schema)); + auto struct_type = std::dynamic_pointer_cast(arrow_type); + CHECK_NOT_NULL(struct_type, + "arrow schema must be struct type when create TantivyGlobalIndexWriter"); + auto index_field = struct_type->GetFieldByName(field_name); + CHECK_NOT_NULL( + index_field, + 
fmt::format("field {} not exist in arrow schema when create TantivyGlobalIndexWriter", + field_name)); + if (index_field->type()->id() != arrow::Type::type::STRING) { + return Status::Invalid("field type must be string"); + } + return TantivyGlobalIndexWriter::Create(field_name, arrow_type, file_writer, options_, pool); +} + +Result> TantivyGlobalIndex::CreateReader( + ::ArrowSchema* c_arrow_schema, const std::shared_ptr& file_reader, + const std::vector& files, const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_schema, + arrow::ImportSchema(c_arrow_schema)); + if (files.size() != 1) { + return Status::Invalid("tantivy index only has one index file per shard, now num: {}", + files.size()); + } + if (arrow_schema->num_fields() != 1) { + return Status::Invalid("TantivyGlobalIndex now only support one field"); + } + auto index_field = arrow_schema->field(0); + if (index_field->type()->id() != arrow::Type::type::STRING) { + return Status::Invalid("field type must be string"); + } + return TantivyGlobalIndexReader::Create(index_field->name(), files[0], file_reader, options_, + pool); +} + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_global_index.h b/src/paimon/global_index/tantivy/tantivy_global_index.h new file mode 100644 index 000000000..f380cafa1 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_global_index.h @@ -0,0 +1,47 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#pragma once + +#include +#include +#include +#include + +#include "arrow/type.h" +#include "paimon/global_index/global_indexer.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" + +namespace paimon::tantivy { + +/// `GlobalIndexer` implementation backed by tantivy-fulltext. Counterpart to +/// `LuceneGlobalIndex`; the two coexist (and are NOT cross-readable) per +/// migration plan §0 decision 1. Selection between them happens at the +/// factory layer via the `index_type` identifier. +class TantivyGlobalIndex : public GlobalIndexer { + public: + explicit TantivyGlobalIndex(const std::map& options); + + Result> CreateWriter( + const std::string& field_name, ::ArrowSchema* arrow_schema, + const std::shared_ptr& file_writer, + const std::shared_ptr& pool) const override; + + Result> CreateReader( + ::ArrowSchema* arrow_schema, const std::shared_ptr& file_reader, + const std::vector& files, + const std::shared_ptr& pool) const override; + + private: + /// Options after the `tantivy-fulltext.` prefix has been stripped. + std::map options_; +}; + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_factory.cpp b/src/paimon/global_index/tantivy/tantivy_global_index_factory.cpp new file mode 100644 index 000000000..0227d17bb --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_global_index_factory.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#include "paimon/global_index/tantivy/tantivy_global_index_factory.h" + +#include +#include +#include +#include + +#include "paimon/factories/factory.h" +#include "paimon/global_index/tantivy/tantivy_global_index.h" + +namespace paimon::tantivy { + +/// Identifier convention: lucene-fts uses "lucene-fts-global"; we use +/// "tantivy-fulltext-global" so `GlobalIndexerFactory::Get("tantivy-fulltext", ...)` +/// (which appends "-global") routes to us. Keeps both backends discoverable +/// via the same lookup path. +const char TantivyGlobalIndexFactory::IDENTIFIER[] = "tantivy-fulltext-global"; + +Result> TantivyGlobalIndexFactory::Create( + const std::map& options) const { + return std::make_unique(options); +} + +REGISTER_PAIMON_FACTORY(TantivyGlobalIndexFactory); + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_factory.h b/src/paimon/global_index/tantivy/tantivy_global_index_factory.h new file mode 100644 index 000000000..22d456e16 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_global_index_factory.h @@ -0,0 +1,39 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#pragma once + +#include +#include +#include + +#include "paimon/global_index/global_indexer.h" +#include "paimon/global_index/global_indexer_factory.h" + +namespace paimon::tantivy { + +/// Factory for creating tantivy-fulltext global indexers. Registered into +/// `FactoryCreator` via `REGISTER_PAIMON_FACTORY` so it is selectable +/// alongside `lucene-fts-global` by passing `index_type = "tantivy-fulltext"` +/// (the suffix `-global` is appended automatically by +/// `GlobalIndexerFactory::Get`). 
+class TantivyGlobalIndexFactory : public GlobalIndexerFactory { + public: + static const char IDENTIFIER[]; + + const char* Identifier() const override { + return IDENTIFIER; + } + + Result> Create( + const std::map& options) const override; +}; + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_reader.cpp b/src/paimon/global_index/tantivy/tantivy_global_index_reader.cpp new file mode 100644 index 000000000..4f0690ce5 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_global_index_reader.cpp @@ -0,0 +1,234 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#include "paimon/global_index/tantivy/tantivy_global_index_reader.h" + +#include +#include +#include +#include +#include // [BUG_QPLEAK_RUST] +#include + +#include "fmt/format.h" +#include "paimon/common/utils/options_utils.h" +#include "paimon/common/utils/rapidjson_util.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/tantivy/tantivy_archive_layout.h" +#include "paimon/global_index/tantivy/tantivy_ffi_log.h" // [BUG_QPLEAK_RUST] +#include "paimon/global_index/tantivy/tantivy_ffi_status.h" +#include "paimon/global_index/tantivy/tantivy_stream_ctx.h" + +namespace paimon::tantivy { + +namespace { + +// [BUG_QPLEAK_RUST] one-shot install of Rust log bridge so log::warn! in Rust +// surfaces in BE's cn.WARNING via glog. +void EnsureTantivyLogBridge() { + static std::once_flag flag; + std::call_once(flag, [] { InstallTantivyLogBridge(); }); +} + +/// Returns the jieba dictionary dir from the env var, or an empty string if the env +/// var is missing/empty. 
We intentionally do NOT error here: paimon-java tantivy +/// archives use the built-in `"default"` (SimpleTokenizer) and do not need jieba — +/// the Rust reader's tokenizer-registration branch skips dict_dir entirely in that +/// case (third_party/tantivy_ffi/src/reader.rs:111 → `let _ = (mode, dict_dir)`). +/// For archives that DO use jieba (paimon-cpp-written with `tantivy.write.tokenizer +/// = paimon_jieba`), the Rust side will surface a clear "create paimon_jieba +/// tokenizer" failure when it tries to load the dictionary from an empty path, so +/// the error stays actionable. +std::string GetJiebaDictionaryDir() { + const char* env_dir = std::getenv(kJiebaDictDirEnv); + if (env_dir && *env_dir != '\0') { + return std::string(env_dir); + } + return std::string(); +} + +} // namespace + +Result> TantivyGlobalIndexReader::Create( + const std::string& field_name, const GlobalIndexIOMeta& io_meta, + const std::shared_ptr& file_reader, + const std::map& options, const std::shared_ptr& pool) { + (void)field_name; // Rust-side knows the field via the schema embedded in meta.json + EnsureTantivyLogBridge(); // [BUG_QPLEAK_RUST] + + std::map write_options; + if (io_meta.metadata) { + PAIMON_RETURN_NOT_OK(RapidJsonUtil::FromJsonString( + std::string(io_meta.metadata->data(), io_meta.metadata->size()), &write_options)); + } + + PAIMON_ASSIGN_OR_RAISE( + std::string tokenize_mode, + OptionsUtils::GetValueFromMap(options, kJiebaTokenizeMode, std::string(""))); + if (tokenize_mode.empty()) { + // Reader-side option not set; look at the (possibly empty) write_options blob. + // When write_options is empty (paimon-java-written archive), the value below is + // a placeholder that satisfies FFI validation but is discarded at runtime — + // see the comment block above. Do NOT treat the placeholder as a real default + // for jieba indices; jieba archives written by paimon-cpp always stamp their + // chosen mode into metadata, so the placeholder branch never applies to them. 
+ PAIMON_ASSIGN_OR_RAISE( + tokenize_mode, OptionsUtils::GetValueFromMap(write_options, kJiebaTokenizeMode, + std::string(kDefaultJiebaTokenizeMode))); + } + PAIMON_ASSIGN_OR_RAISE( + bool omit_term_freq_and_positions, + OptionsUtils::GetValueFromMap(write_options, kTantivyWriteOmitTermFreqAndPositions, false)); + + std::string dict_dir = GetJiebaDictionaryDir(); + + // V3 streaming read path: + // 1) open stream + // 2) ParseArchiveHeader — reads only header bytes, seeks past payloads + // 3) wrap stream in StreamCtx (owned by Rust via release callback) + // 4) build PaimonStreamCallbacks → paimon_tantivy_reader_new_streaming + // Archive payloads are read lazily through read_at callbacks as tantivy + // accesses posting lists, meta.json, etc. + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr stream, + file_reader->GetInputStream(io_meta.file_path)); + PAIMON_ASSIGN_OR_RAISE(ArchiveLayout layout, ParseArchiveHeader(stream.get())); + + // Transfer stream ownership to a heap-allocated StreamCtx; Rust will + // `paimon_cpp_stream_release(ctx)` on reader drop, which `delete`s it. + auto* stream_ctx = new StreamCtx{std::move(stream), {}}; + PaimonStreamCallbacks callbacks{ + static_cast(stream_ctx), + paimon_cpp_stream_read_at, + paimon_cpp_stream_release, + }; + + // Build C-string array pointing into layout.names (stable during this call). + std::vector name_ptrs; + name_ptrs.reserve(layout.count); + for (const auto& n : layout.names) { + name_ptrs.push_back(n.c_str()); + } + + PaimonTantivyReader* raw = nullptr; + ::PaimonTantivyStatus st = paimon_tantivy_reader_new_streaming( + name_ptrs.data(), layout.offsets.data(), layout.lengths.data(), layout.count, callbacks, + tokenize_mode.c_str(), + /*with_position=*/!omit_term_freq_and_positions, dict_dir.c_str(), &raw); + if (st != PAIMON_TANTIVY_STATUS_OK) { + // On failure, Rust did NOT take ownership of ctx (FFI contract): + // release it here so the stream doesn't leak. 
+ paimon_cpp_stream_release(stream_ctx); + PAIMON_TANTIVY_RETURN_NOT_OK(st); + } + return std::shared_ptr( + new TantivyGlobalIndexReader(ReaderPtr(raw), pool)); +} + +Result> TantivyGlobalIndexReader::VisitFullTextSearch( + const std::shared_ptr& full_text_search) { + if (!full_text_search) { + return Status::Invalid("VisitFullTextSearch: null FullTextSearch pointer"); + } + + // Serialize pre_filter (if any) to croaring portable bytes for FFI. + // NB: Serialize() returns a pooled_unique_ptr with MemoryPool::AllocatorDelete; + // converting via raw.release() + shared_ptr(raw_ptr) would substitute + // std::default_delete, causing alloc/dealloc mismatch (malloc vs operator + // delete) — detected by ASAN on 2026-04-21. Move directly into shared_ptr + // so the pooled deleter is preserved in the control block. + PAIMON_UNIQUE_PTR pre_filter_bytes_owned; + const char* pre_filter_ptr = nullptr; + std::size_t pre_filter_len = 0; + if (full_text_search->pre_filter.has_value()) { + pre_filter_bytes_owned = full_text_search->pre_filter.value().Serialize(pool_.get()); + pre_filter_ptr = pre_filter_bytes_owned->data(); + pre_filter_len = pre_filter_bytes_owned->size(); + } + + int32_t limit_arg = full_text_search->limit.has_value() + ? static_cast(full_text_search->limit.value()) + : -1; + + float min_score_arg = + full_text_search->min_score.has_value() ? full_text_search->min_score.value() : 0.0f; + + BufferGuard out; + PaimonTantivyStatus st = paimon_tantivy_reader_search( + reader_.get(), static_cast(full_text_search->search_type), + full_text_search->query.data(), full_text_search->query.size(), + full_text_search->with_score, limit_arg, pre_filter_ptr, pre_filter_len, min_score_arg, + out.out()); + PAIMON_TANTIVY_RETURN_NOT_OK(st); + + // Decode `[u8 has_scores | u64 count | u64 row_ids[] | optional f32 scores[]]`. + // (B1 schema: row_id is the explicit u64 column read from the fast field.) 
+ if (out.size() < 9) { + return Status::Invalid( + fmt::format("tantivy reader output too small ({} bytes)", out.size())); + } + const uint8_t* p = out.data(); + bool has_scores = (p[0] != 0); + // v0.2 consistency check: the wire-level has_scores byte must match the caller's + // with_score flag. A mismatch would indicate FFI / wire-protocol drift. + if (has_scores != full_text_search->with_score) { + return Status::Invalid(fmt::format( + "tantivy wire protocol mismatch: caller with_score={} but buffer has_scores={}", + full_text_search->with_score, has_scores)); + } + uint64_t count; + std::memcpy(&count, p + 1, sizeof(uint64_t)); + std::size_t expected = 1 + 8 + count * 8 + (has_scores ? count * 4 : 0); + if (out.size() != expected) { + return Status::Invalid(fmt::format( + "tantivy reader output size mismatch: has_scores={} count={} expected {} bytes, got {}", + has_scores, count, expected, out.size())); + } + + const uint8_t* row_id_p = p + 9; + if (!has_scores) { + RoaringBitmap64 bitmap; + for (uint64_t i = 0; i < count; i++) { + uint64_t row_id; + std::memcpy(&row_id, row_id_p + i * 8, sizeof(uint64_t)); + bitmap.Add(static_cast(row_id)); + } + return std::make_shared( + [b = std::move(bitmap)]() -> Result { return b; }); + } + // has_scores=true: produce BitmapScoredGlobalIndexResult. Rust may send rows + // in either row_id-asc order (path C: with_score=true, limit=None) or score-desc + // order (path D: with_score=true, limit=Some). The bitmap iteration order is + // row_id-asc (RoaringBitmap set semantics), so we always re-sort by row_id here + // to keep `scores[i]` aligned with the i-th row_id from the bitmap iterator — + // matching the contract documented in BitmapScoredGlobalIndexResult. 
+ const uint8_t* score_p = row_id_p + count * 8; + std::vector> id_score_pairs; + id_score_pairs.reserve(count); + for (uint64_t i = 0; i < count; i++) { + uint64_t row_id; + std::memcpy(&row_id, row_id_p + i * 8, sizeof(uint64_t)); + float score; + std::memcpy(&score, score_p + i * 4, sizeof(float)); + id_score_pairs.emplace_back(static_cast(row_id), score); + } + // Sort by row_id ascending so scores align with bitmap iteration order. + std::sort(id_score_pairs.begin(), id_score_pairs.end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + RoaringBitmap64 bitmap; + std::vector scores; + scores.reserve(id_score_pairs.size()); + for (const auto& [id, sc] : id_score_pairs) { + bitmap.Add(id); + scores.push_back(sc); + } + return std::make_shared(std::move(bitmap), std::move(scores)); +} + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_reader.h b/src/paimon/global_index/tantivy/tantivy_global_index_reader.h new file mode 100644 index 000000000..d115504c9 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_global_index_reader.h @@ -0,0 +1,121 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#pragma once + +#include +#include +#include + +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/global_index_io_meta.h" +#include "paimon/global_index/global_index_reader.h" +#include "paimon/global_index/io/global_index_file_reader.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/global_index/tantivy/tantivy_ffi_handle.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/predicate/full_text_search.h" + +namespace paimon::tantivy { + +/// Tantivy-backed implementation of `GlobalIndexReader`. +/// +/// Mirrors LuceneGlobalIndexReader's surface but delegates query construction +/// + execution into Rust over FFI. Stage 6 supports the 5 FullTextSearch +/// SearchTypes (MATCH_ALL, MATCH_ANY, PHRASE, PREFIX, WILDCARD) without limit +/// or pre_filter — both of which Stage 7 layers on. +/// +/// All non-FullTextSearch visit methods return nullptr (matches +/// LuceneGlobalIndexReader): the FTS index has no contribution for non-FTS +/// predicates, framework treats nullptr as "no filter constraint". +class TantivyGlobalIndexReader : public GlobalIndexReader { + public: + static Result> Create( + const std::string& field_name, const GlobalIndexIOMeta& io_meta, + const std::shared_ptr& file_reader, + const std::map& options, const std::shared_ptr& pool); + + // === FunctionVisitor surface — non-FTS predicates fall back to full range. 
=== + + Result> VisitIsNotNull() override { + return CreateAllResult(); + } + Result> VisitIsNull() override { + return CreateAllResult(); + } + Result> VisitEqual(const Literal&) override { + return CreateAllResult(); + } + Result> VisitNotEqual(const Literal&) override { + return CreateAllResult(); + } + Result> VisitLessThan(const Literal&) override { + return CreateAllResult(); + } + Result> VisitLessOrEqual(const Literal&) override { + return CreateAllResult(); + } + Result> VisitGreaterThan(const Literal&) override { + return CreateAllResult(); + } + Result> VisitGreaterOrEqual(const Literal&) override { + return CreateAllResult(); + } + Result> VisitIn(const std::vector&) override { + return CreateAllResult(); + } + Result> VisitNotIn(const std::vector&) override { + return CreateAllResult(); + } + Result> VisitStartsWith(const Literal&) override { + return CreateAllResult(); + } + Result> VisitEndsWith(const Literal&) override { + return CreateAllResult(); + } + Result> VisitContains(const Literal&) override { + return CreateAllResult(); + } + Result> VisitLike(const Literal&) override { + return CreateAllResult(); + } + + Result> VisitVectorSearch( + const std::shared_ptr&) override { + return Status::Invalid( + "TantivyGlobalIndexReader is not supposed to handle vector search query"); + } + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) override; + + bool IsThreadSafe() const override { + return false; + } + + std::string GetIndexType() const override { + return kIdentifier; + } + + private: + TantivyGlobalIndexReader(ReaderPtr reader, std::shared_ptr pool) + : reader_(std::move(reader)), pool_(std::move(pool)) {} + + std::shared_ptr CreateAllResult() const { + return nullptr; + } + + /// Owning handle to the Rust-side reader. + ReaderPtr reader_; + /// MemoryPool used for serializing pre-filter bitmaps to bytes for FFI. 
+ std::shared_ptr pool_; +}; + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_writer.cpp b/src/paimon/global_index/tantivy/tantivy_global_index_writer.cpp new file mode 100644 index 000000000..f78bc6d41 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_global_index_writer.cpp @@ -0,0 +1,172 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#include "paimon/global_index/tantivy/tantivy_global_index_writer.h" + +#include + +#include "arrow/c/bridge.h" +#include "fmt/format.h" +#include "paimon/common/global_index/global_index_utils.h" +#include "paimon/common/utils/options_utils.h" +#include "paimon/common/utils/rapidjson_util.h" +#include "paimon/global_index/tantivy/tantivy_ffi_status.h" +#include "paimon/global_index/tantivy/tantivy_stream_ctx.h" + +namespace paimon::tantivy { + +#define CHECK_NOT_NULL(pointer, error_msg) \ + do { \ + if (!(pointer)) { \ + return Status::Invalid(error_msg); \ + } \ + } while (0) + +namespace { + +/// Resolve the jieba dictionary directory for the writer. Mirrors lucene-fts' +/// LuceneUtils::GetJiebaDictionaryDir but kept separate to avoid coupling +/// tantivy-fulltext to the lucene module. 
+Result GetJiebaDictionaryDir() { + const char* env_dir = std::getenv(kJiebaDictDirEnv); + if (env_dir && *env_dir != '\0') { + return std::string(env_dir); + } + return Status::Invalid( + fmt::format("jieba dictionary dir not found, please set {} env var", kJiebaDictDirEnv)); +} + +} // namespace + +Result> TantivyGlobalIndexWriter::Create( + const std::string& field_name, const std::shared_ptr& arrow_type, + const std::shared_ptr& file_writer, + const std::map& options, const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE( + bool omit_term_freq_and_positions, + OptionsUtils::GetValueFromMap(options, kTantivyWriteOmitTermFreqAndPositions, false)); + PAIMON_ASSIGN_OR_RAISE(std::string tokenize_mode, + OptionsUtils::GetValueFromMap(options, kJiebaTokenizeMode, + std::string(kDefaultJiebaTokenizeMode))); + PAIMON_ASSIGN_OR_RAISE(std::string tokenizer, OptionsUtils::GetValueFromMap( + options, kTantivyWriteTokenizer, + std::string(kDefaultTantivyWriteTokenizer))); + // Jieba dict is only needed when actually using jieba. For tantivy built-in + // tokenizers (e.g. "default") we don't force the caller to ship the jieba + // dict dir — pass an empty string and Rust skips jieba construction. 
+ std::string dict_dir; + if (tokenizer == "paimon_jieba") { + PAIMON_ASSIGN_OR_RAISE(dict_dir, GetJiebaDictionaryDir()); + } + + PaimonTantivyWriter* raw = nullptr; + PaimonTantivyStatus st = paimon_tantivy_writer_new( + field_name.c_str(), tokenize_mode.c_str(), + /*with_position=*/!omit_term_freq_and_positions, dict_dir.c_str(), tokenizer.c_str(), &raw); + PAIMON_TANTIVY_RETURN_NOT_OK(st); + WriterPtr writer(raw); + return std::shared_ptr(new TantivyGlobalIndexWriter( + field_name, arrow_type, std::move(writer), file_writer, options, pool)); +} + +TantivyGlobalIndexWriter::TantivyGlobalIndexWriter( + const std::string& field_name, const std::shared_ptr& arrow_type, + WriterPtr writer, const std::shared_ptr& file_writer, + const std::map& options, const std::shared_ptr& pool) + : pool_(pool), + field_name_(field_name), + arrow_type_(arrow_type), + writer_(std::move(writer)), + file_writer_(file_writer), + options_(options) {} + +Status TantivyGlobalIndexWriter::AddBatch(::ArrowArray* arrow_array, + std::vector&& relative_row_ids) { + // First-element check mirrors lucene; trust caller to feed sequential ids + // within a batch (same contract LuceneGlobalIndexWriter relies on). 
+ PAIMON_RETURN_NOT_OK( + GlobalIndexUtils::CheckRelativeRowIds(arrow_array, relative_row_ids, row_id_)); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr array, + arrow::ImportArray(arrow_array, arrow_type_)); + auto struct_array = std::dynamic_pointer_cast(array); + CHECK_NOT_NULL(struct_array, + "invalid input array in TantivyGlobalIndexWriter, must be struct array"); + auto field_array = struct_array->GetFieldByName(field_name_); + CHECK_NOT_NULL( + field_array, + fmt::format("invalid input array in TantivyGlobalIndexWriter, field {} not in input array", + field_name_)); + auto string_array = std::dynamic_pointer_cast(field_array); + CHECK_NOT_NULL(string_array, + fmt::format("invalid input array in TantivyGlobalIndexWriter, field array {} " + "is not a string array", + field_name_)); + + for (int64_t i = 0; i < string_array->length(); i++) { + const char* text_ptr = nullptr; + size_t text_len = 0; + if (!string_array->IsNull(i)) { + std::string_view view = string_array->Value(i); + text_ptr = view.data(); + text_len = view.size(); + } + // B1 schema: pass the caller-tracked row_id as an explicit u64 field. + PaimonTantivyStatus st = paimon_tantivy_writer_add( + writer_.get(), static_cast(row_id_), text_ptr, text_len); + PAIMON_TANTIVY_RETURN_NOT_OK(st); + row_id_++; + } + return Status::OK(); +} + +Result> TantivyGlobalIndexWriter::Finish() { + // W1 streaming finish: open the output file, pipe archive bytes from Rust + // through `paimon_cpp_writer_push` directly into the OutputStream. Peak + // RAM (Rust side) = 64KB buffer, independent of archive size. 
+ PAIMON_ASSIGN_OR_RAISE(std::string index_file_name, file_writer_->NewFileName(kIdentifier)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr out, + file_writer_->NewOutputStream(index_file_name)); + + WriteCtx ctx{out.get(), Status::OK()}; + PaimonWriteCallbacks cb{ + static_cast(&ctx), + paimon_cpp_writer_push, + }; + + int64_t rust_row_count = 0; + ::PaimonTantivyStatus st = + paimon_tantivy_writer_finish_streaming(writer_.get(), cb, &rust_row_count); + if (st != PAIMON_TANTIVY_STATUS_OK) { + // Prefer the detailed C++-side Status stashed by the write callback + // (if the failure originated there); fall back to FFI-derived status. + if (!ctx.last_error.ok()) { + return ctx.last_error; + } + PAIMON_TANTIVY_RETURN_NOT_OK(st); + } + if (rust_row_count != row_id_) { + return Status::Invalid( + fmt::format("tantivy writer row count {} mismatch paimon inner row count {}", + rust_row_count, row_id_)); + } + + PAIMON_RETURN_NOT_OK(out->Flush()); + PAIMON_RETURN_NOT_OK(out->Close()); + + PAIMON_ASSIGN_OR_RAISE(int64_t file_size, file_writer_->GetFileSize(index_file_name)); + std::string options_json; + PAIMON_RETURN_NOT_OK(RapidJsonUtil::ToJsonString(options_, &options_json)); + auto meta_bytes = std::make_shared(options_json, pool_.get()); + GlobalIndexIOMeta meta(file_writer_->ToPath(index_file_name), file_size, + /*metadata=*/meta_bytes); + return std::vector({meta}); +} + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_writer.h b/src/paimon/global_index/tantivy/tantivy_global_index_writer.h new file mode 100644 index 000000000..ed5421320 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_global_index_writer.h @@ -0,0 +1,68 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +#pragma once + +#include +#include +#include +#include + +#include "arrow/type.h" +#include "paimon/global_index/global_index_writer.h" +#include "paimon/global_index/io/global_index_file_writer.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/global_index/tantivy/tantivy_ffi_handle.h" + +namespace paimon::tantivy { + +/// Tantivy-backed implementation of GlobalIndexWriter. +/// +/// Mirrors LuceneGlobalIndexWriter's lifecycle: +/// Create() → AddBatch()* → Finish() +/// Each shard produces exactly one .index file via the GlobalIndexFileWriter, +/// containing the full packed tantivy on-disk index in a single contiguous blob. +/// +/// Indexes written by this class are NOT cross-readable with lucene-fts — see +/// migration plan §0 decision 1. The C++ side of this writer is intentionally +/// thin: index construction, segment merging, and packing all happen in Rust +/// behind the FFI boundary. +class TantivyGlobalIndexWriter : public GlobalIndexWriter { + public: + static Result> Create( + const std::string& field_name, const std::shared_ptr& arrow_type, + const std::shared_ptr& file_writer, + const std::map& options, const std::shared_ptr& pool); + + ~TantivyGlobalIndexWriter() override = default; + + Status AddBatch(::ArrowArray* arrow_array, std::vector&& relative_row_ids) override; + + Result> Finish() override; + + private: + TantivyGlobalIndexWriter(const std::string& field_name, + const std::shared_ptr& arrow_type, WriterPtr writer, + const std::shared_ptr& file_writer, + const std::map& options, + const std::shared_ptr& pool); + + std::shared_ptr pool_; + std::string field_name_; + std::shared_ptr arrow_type_; + /// Owning handle to the Rust-side writer. + WriterPtr writer_; + std::shared_ptr file_writer_; + std::map options_; + /// Last document index processed (matches caller-passed relative_row_ids). 
+ int64_t row_id_ = 0; +}; + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_index_test.cpp b/src/paimon/global_index/tantivy/tantivy_index_test.cpp new file mode 100644 index 000000000..81e3f365a --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_index_test.cpp @@ -0,0 +1,283 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Stage 8 integration test: end-to-end via TantivyGlobalIndex (writer + reader), + * mirroring src/paimon/global_index/lucene/lucene_global_index_test.cpp. 
+ * + * Validates parity with lucene-fts on: + * - file naming: "tantivy-fulltext-global-index-{uuid}.index" + * - meta JSON shape: option-prefix-stripped key/value pairs + * - 5 SearchTypes against an English corpus + * - 5 SearchTypes against a Chinese corpus (jieba "query" mode) + * - limit + pre_filter + scoring (Stage 7) interactions + * - factory registration: looking up "tantivy-fulltext" produces a tantivy indexer + */ + +#include +#include + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/api.h" +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_path_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/global_indexer_factory.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/global_index/tantivy/tantivy_global_index.h" +#include "paimon/global_index/tantivy/tantivy_global_index_factory.h" +#include "paimon/global_index/tantivy/tantivy_global_index_reader.h" +#include "paimon/testing/utils/testharness.h" + +#ifndef JIEBA_TEST_DICT_DIR +#error "JIEBA_TEST_DICT_DIR must be set at compile time" +#endif + +namespace paimon::tantivy::test { + +namespace { + +class FakeIndexPathFactory : public IndexPathFactory { + public: + explicit FakeIndexPathFactory(const std::string& root) : root_(root) {} + std::string NewPath() const override { + assert(false); + return ""; + } + std::string ToPath(const std::shared_ptr&) const override { + assert(false); + return ""; + } + std::string ToPath(const std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + bool IsExternalPath() const override { + return false; + } + + private: + 
std::string root_; +}; + +class TantivyGlobalIndexIntegrationTest : public ::testing::Test { + public: + void SetUp() override { + setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + } + + std::unique_ptr<::ArrowSchema> CreateArrowSchema( + const std::shared_ptr& data_type) const { + auto c_schema = std::make_unique<::ArrowSchema>(); + EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok()); + return c_schema; + } + + Result WriteGlobalIndex(const std::string& root, + const std::shared_ptr& data_type, + const std::map& options, + const std::shared_ptr& array, + int64_t /*unused_expected_range_end*/) const { + auto global_index = std::make_shared(options); + auto path_factory = std::make_shared(root); + auto file_writer = std::make_shared(fs_, path_factory); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr w, + global_index->CreateWriter("f0", CreateArrowSchema(data_type).get(), + file_writer, pool_)); + ::ArrowArray c_array; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array)); + std::vector relative_row_ids(array->length()); + for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i; + PAIMON_RETURN_NOT_OK(w->AddBatch(&c_array, std::move(relative_row_ids))); + PAIMON_ASSIGN_OR_RAISE(auto metas, w->Finish()); + EXPECT_EQ(metas.size(), 1u); + auto file_name = PathUtil::GetName(metas[0].file_path); + EXPECT_TRUE(StringUtils::StartsWith(file_name, "tantivy-fulltext-global-index-")) + << file_name; + EXPECT_TRUE(StringUtils::EndsWith(file_name, ".index")); + EXPECT_TRUE(metas[0].metadata); + return metas[0]; + } + + Result> CreateReader( + const std::string& root, const std::shared_ptr& data_type, + const std::map& options, const GlobalIndexIOMeta& meta) const { + auto global_index = std::make_shared(options); + auto path_factory = std::make_shared(root); + auto file_reader = std::make_shared(fs_, path_factory); + return global_index->CreateReader(CreateArrowSchema(data_type).get(), file_reader, {meta}, + pool_); + } + + 
void CheckResult(const std::shared_ptr& result, + const std::vector& expected_ids) const { + const RoaringBitmap64* bitmap = nullptr; + if (auto scored = std::dynamic_pointer_cast(result)) { + ASSERT_OK_AND_ASSIGN(bitmap, scored->GetBitmap()); + ASSERT_EQ(scored->GetScores().size(), expected_ids.size()); + } else if (auto plain = std::dynamic_pointer_cast(result)) { + ASSERT_OK_AND_ASSIGN(bitmap, plain->GetBitmap()); + } + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected_ids)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected_ids).ToString(); + } + + protected: + std::shared_ptr pool_ = GetDefaultPool(); + std::shared_ptr fs_ = std::make_shared(); + std::shared_ptr data_type_ = + arrow::struct_({arrow::field("f0", arrow::utf8())}); +}; + +} // namespace + +TEST_F(TantivyGlobalIndexIntegrationTest, EnglishCorpus) { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(root_dir); + std::string root = root_dir->Str(); + + std::map options = { + {"tantivy-fulltext.write.omit-term-freq-and-position", "false"}, + }; + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([ + ["This is an test document."], + ["This is an new document document document."], + ["Document document document document test."], + ["unordered user-defined doc id"] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto meta, WriteGlobalIndex(root, data_type_, options, array, 3)); + EXPECT_EQ(std::string(meta.metadata->data(), meta.metadata->size()), + R"({"write.omit-term-freq-and-position":"false"})"); + + ASSERT_OK_AND_ASSIGN(auto reader, CreateReader(root, data_type_, options, meta)); + auto t_reader = std::dynamic_pointer_cast(reader); + ASSERT_TRUE(t_reader); + EXPECT_EQ(t_reader->GetIndexType(), std::string(kIdentifier)); + + auto run = [&](const std::string& q, FullTextSearch::SearchType t, + std::optional limit = std::nullopt, + std::optional filter = std::nullopt) { + // Use scored path so 
`limit` returns top-N by BM25, matching test + // expectations (otherwise unscored Path B returns any-N, non-deterministic). + auto fts = std::make_shared("f0", limit, q, t, filter); + fts->with_score = true; + auto res = t_reader->VisitFullTextSearch(fts); + EXPECT_TRUE(res.ok()) << res.status().ToString(); + return res.value(); + }; + + CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL, 10), {2, 1, 0}); + CheckResult(run("document", FullTextSearch::SearchType::MATCH_ANY, 1), {2}); + CheckResult(run("test document", FullTextSearch::SearchType::MATCH_ALL, 10), {2, 0}); + CheckResult(run("test new", FullTextSearch::SearchType::MATCH_ANY, 10), {1, 0, 2}); + CheckResult(run("test document", FullTextSearch::SearchType::PHRASE, 10), {0}); + CheckResult(run("unordered", FullTextSearch::SearchType::MATCH_ALL, 10), {3}); + CheckResult(run("unorder", FullTextSearch::SearchType::PREFIX, 10), {3}); + CheckResult(run("*order*", FullTextSearch::SearchType::WILDCARD, 10), {3}); + CheckResult(run("*or*er*", FullTextSearch::SearchType::WILDCARD, 10), {3}); + + // pre_filter + CheckResult( + run("document", FullTextSearch::SearchType::MATCH_ALL, 10, RoaringBitmap64::From({0l, 1l})), + {0, 1}); + CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL, 10, + RoaringBitmap64::From({2l, 100l})), + {2}); + CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL, 10, + RoaringBitmap64::From({20l, 100l})), + {}); + + // No limit + CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL), {0, 1, 2}); + CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL, std::nullopt, + RoaringBitmap64::From({2l})), + {2}); + CheckResult(run("document test", FullTextSearch::SearchType::MATCH_ALL, std::nullopt, + RoaringBitmap64::From({1l, 2l, 3l, 100l})), + {2}); +} + +TEST_F(TantivyGlobalIndexIntegrationTest, ChineseCorpus) { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(root_dir); + std::string root = 
root_dir->Str(); + + std::map options = { + {"tantivy-fulltext.write.omit-term-freq-and-position", "false"}, + {"tantivy-fulltext.tantivy.write.tokenizer", "paimon_jieba"}, + {"tantivy-fulltext.jieba.tokenize-mode", "query"}, + }; + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([ +["QianWen 是一个基于 AI 的智能助手,类似于 Siri 和 Alexa。我们正在用 Python 开发 QianWen 的 Natural Language Understanding 模块,该模块支持多轮对话和意图识别功能,是新一代智能助手的核心技术之一。"], +["最近开源了一个新项目叫qianwen(全角字符),功能类似之前的 Qianwen,是一个面向 AI 应用的智能助手。它不仅支持 Machine Learning 和 NLP 技术,还提供了可扩展的开发框架,便于开发者构建自己的智能助手系统。"], +["我们在测试 qianwen-core v1.2 和 ai-engine-alpha 中的 bug,重点优化了 qianwen 的响应速度和稳定性。本次更新增强了核心模块的功能,提升了智能助手的开发效率,并修复了与 NLP 模块相关的多个问题。"], +["AI 助手开发中常用的技术包括 Speech Recognition、Natural Language Processing 和 Recommendation System。我们使用 TensorFlow 和 PyTorch 构建模型,开发了多个智能助手原型,支持语音交互和上下文理解功能,是当前热门的人工智能发展应用方向。"], +["新一代的 AI 助手代号为「千问」,内部命名为 QianwenX-2024,计划在 next quarter 发布。QianwenX 将集成更强的 multimodel 能力,支持图像和文本联合处理,进一步提升智能助手的理解能力和交互体验,是未来智能助手的重要发展方向。"] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto meta, WriteGlobalIndex(root, data_type_, options, array, 4)); + EXPECT_EQ( + std::string(meta.metadata->data(), meta.metadata->size()), + R"({"jieba.tokenize-mode":"query","tantivy.write.tokenizer":"paimon_jieba","write.omit-term-freq-and-position":"false"})"); + + ASSERT_OK_AND_ASSIGN(auto reader, CreateReader(root, data_type_, options, meta)); + auto t_reader = std::dynamic_pointer_cast(reader); + ASSERT_TRUE(t_reader); + + auto run = [&](const std::string& q, FullTextSearch::SearchType t, + std::optional limit = std::nullopt, + std::optional filter = std::nullopt) { + // Use scored path so `limit` returns top-N by BM25, matching test + // expectations (otherwise unscored Path B returns any-N, non-deterministic). 
+ auto fts = std::make_shared("f0", limit, q, t, filter); + fts->with_score = true; + auto res = t_reader->VisitFullTextSearch(fts); + EXPECT_TRUE(res.ok()) << res.status().ToString(); + return res.value(); + }; + + CheckResult(run("模块", FullTextSearch::SearchType::MATCH_ALL, 10), {0, 2}); + CheckResult(run("模块", FullTextSearch::SearchType::MATCH_ANY, 1), {0}); + CheckResult(run("模块技术", FullTextSearch::SearchType::MATCH_ALL, 10), {0}); + CheckResult(run("模块技术", FullTextSearch::SearchType::MATCH_ANY, 10), {0, 1, 2, 3}); + CheckResult(run("发展方向", FullTextSearch::SearchType::PHRASE, 10), {4}); + CheckResult(run("模块技术", FullTextSearch::SearchType::MATCH_ANY, 10, + RoaringBitmap64::From({1l, 3l, 4l})), + {1, 3}); + CheckResult(run("模块技术", FullTextSearch::SearchType::MATCH_ANY), {0, 1, 2, 3}); +} + +TEST_F(TantivyGlobalIndexIntegrationTest, FactoryLookupReturnsTantivyIndexer) { + std::map options = { + {"tantivy-fulltext.jieba.tokenize-mode", "query"}, + }; + // Identifier passed to GlobalIndexerFactory::Get is the prefix; "-global" + // is appended automatically. So "tantivy-fulltext" must route to our factory. + ASSERT_OK_AND_ASSIGN(std::unique_ptr indexer, + GlobalIndexerFactory::Get("tantivy-fulltext", options)); + ASSERT_TRUE(indexer); + auto* casted = dynamic_cast(indexer.get()); + ASSERT_TRUE(casted) << "factory did not return a TantivyGlobalIndex"; +} + +} // namespace paimon::tantivy::test diff --git a/src/paimon/global_index/tantivy/tantivy_java_compat_test.cpp b/src/paimon/global_index/tantivy/tantivy_java_compat_test.cpp new file mode 100644 index 000000000..fbfdd8fa2 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_java_compat_test.cpp @@ -0,0 +1,535 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * J6: cross-read test — paimon-java produces the tantivy archive, paimon-cpp + * V3 reader consumes it. + * + * The fixture (`english_simple.archive` + `english_simple.golden.json`) is + * generated by `TantivyIndexFixtureGen.java` in the paimon repo using the + * production `TantivyIndexWriter` + `packIndex` path. Ten pure-ASCII English + * documents (row_id 0..9) are indexed; for each SearchType we assert the V3 + * reader returns exactly the row_ids the Java side wrote — evidence that + * archive byte format, schema, and segment-level byte format all line up + * across the Java/C++ implementations. + * + * Architectural cross-checks this test guards: + * 1. Archive BE big-endian format parsing (ParseArchiveHeader) + * 2. Multi-segment layout (Java does not force-merge; 20+ files in fixture) + * 3. Schema interop: `row_id` u64 fast field written by Java, read by C++ V3 + * 4. Tokenizer parity on pure English (SimpleTokenizer ↔ paimon_jieba) + * 5. 
row_id caller-supplied invariant: reader returns the exact row_ids + * Java wrote (0..9), NOT tantivy-internal doc_ids + */ + +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/api.h" +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_path_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/tantivy/tantivy_archive_layout.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/global_index/tantivy/tantivy_global_index.h" +#include "paimon/global_index/tantivy/tantivy_global_index_reader.h" +#include "paimon/global_index/tantivy/tantivy_global_index_writer.h" +#include "paimon/predicate/full_text_search.h" +#include "paimon/testing/utils/testharness.h" + +#ifndef JIEBA_TEST_DICT_DIR +#error "JIEBA_TEST_DICT_DIR must be set at compile time" +#endif +#ifndef PAIMON_TANTIVY_JAVA_FIXTURE_DIR +#error "PAIMON_TANTIVY_JAVA_FIXTURE_DIR must be set at compile time" +#endif + +namespace paimon::tantivy::test { + +namespace { + +class FixturePathFactory : public IndexPathFactory { + public: + explicit FixturePathFactory(const std::string& root) : root_(root) {} + std::string NewPath() const override { + assert(false); + return ""; + } + std::string ToPath(const std::shared_ptr&) const override { + assert(false); + return ""; + } + std::string ToPath(const std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + bool IsExternalPath() const override { + return false; + } + + private: + std::string root_; +}; + +class JavaCompatTest : public ::testing::Test { + public: + void SetUp() override { + setenv(kJiebaDictDirEnv, 
JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + } + + /// Build a V3 TantivyGlobalIndexReader on top of the Java-produced fixture. + /// `fixture_name` is relative to `PAIMON_TANTIVY_JAVA_FIXTURE_DIR`. + std::shared_ptr OpenFixture(const std::string& fixture_name) { + std::string fixture_dir = PAIMON_TANTIVY_JAVA_FIXTURE_DIR; + std::string archive_path = PathUtil::JoinPath(fixture_dir, fixture_name); + + auto file_status = fs_->GetFileStatus(archive_path).value(); + int64_t file_size = file_status->GetLen(); + EXPECT_GT(file_size, 4) << "fixture archive must exist and be > 4 bytes"; + + // Empty metadata (options not needed for cross-read — we use defaults) + std::string metadata_json = "{}"; + auto meta_bytes = std::make_shared(metadata_json, pool_.get()); + + GlobalIndexIOMeta io_meta(archive_path, file_size, meta_bytes); + + std::map options; + auto global_index = std::make_shared(options); + auto path_factory = std::make_shared(fixture_dir); + auto file_reader = std::make_shared(fs_, path_factory); + + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto c_schema = std::make_unique<::ArrowSchema>(); + EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok()); + + auto reader_res = global_index->CreateReader(c_schema.get(), file_reader, {io_meta}, pool_); + EXPECT_TRUE(reader_res.ok()) << reader_res.status().ToString(); + return reader_res.value(); + } + + std::shared_ptr BuildFts(FullTextSearch::SearchType type, + const std::string& query) { + return std::make_shared( + /*_field_name=*/"f0", + /*_limit=*/std::optional{}, + /*_query=*/query, + /*_search_type=*/type, + /*_pre_filter=*/std::optional{}); + } + + /// Run the search and return the sorted row_ids from the result bitmap. 
+ std::vector RunSearchRowIds(const std::shared_ptr& reader, + FullTextSearch::SearchType type, + const std::string& query) { + auto fts = BuildFts(type, query); + auto result = reader->VisitFullTextSearch(fts); + EXPECT_TRUE(result.ok()) << result.status().ToString(); + std::shared_ptr r = result.value(); + + const RoaringBitmap64* bitmap = nullptr; + if (auto plain = std::dynamic_pointer_cast(r)) { + auto b = plain->GetBitmap(); + EXPECT_TRUE(b.ok()) << b.status().ToString(); + bitmap = b.value(); + } else if (auto scored = std::dynamic_pointer_cast(r)) { + auto b = scored->GetBitmap(); + EXPECT_TRUE(b.ok()) << b.status().ToString(); + bitmap = b.value(); + } + EXPECT_TRUE(bitmap != nullptr); + if (bitmap == nullptr) return {}; + + std::vector out; + for (auto it = bitmap->Begin(); it != bitmap->End(); ++it) { + out.push_back(static_cast(*it)); + } + std::sort(out.begin(), out.end()); + return out; + } + + protected: + std::shared_ptr pool_ = GetDefaultPool(); + std::shared_ptr fs_ = std::make_shared(); +}; + +} // namespace + +// ============================================================================ +// 1. Archive basics: opening the Java-produced fixture succeeds +// ============================================================================ + +TEST_F(JavaCompatTest, OpenJavaArchiveSucceeds) { + auto reader = OpenFixture("english_simple.archive"); + ASSERT_TRUE(reader != nullptr); +} + +// ============================================================================ +// 2. 
MATCH_ALL — single and multi-term +// ============================================================================ + +TEST_F(JavaCompatTest, MatchAll_Apple) { + auto reader = OpenFixture("english_simple.archive"); + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, "apple"); + // Docs containing "apple": 0 ("apple banana cherry"), 1 ("apple durian"), + // 4 ("apple cherry fig"), 7 ("apple") + EXPECT_EQ(ids, (std::vector{0, 1, 4, 7})); +} + +TEST_F(JavaCompatTest, MatchAll_AppleBanana_Intersection) { + auto reader = OpenFixture("english_simple.archive"); + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, "apple banana"); + // Only doc 0 contains both "apple" and "banana" + EXPECT_EQ(ids, (std::vector{0})); +} + +// ============================================================================ +// 3. MATCH_ANY — union +// ============================================================================ + +TEST_F(JavaCompatTest, MatchAny_DurianElderberry_Union) { + auto reader = OpenFixture("english_simple.archive"); + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY, "durian elderberry"); + // durian: 1, 6 elderberry: 5, 8 union: {1, 5, 6, 8} + EXPECT_EQ(ids, (std::vector{1, 5, 6, 8})); +} + +// ============================================================================ +// 4. 
PHRASE — consecutive term order matters +// ============================================================================ + +TEST_F(JavaCompatTest, Phrase_AppleBanana) { + auto reader = OpenFixture("english_simple.archive"); + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, "apple banana"); + // Only doc 0 has "apple banana" as consecutive phrase + EXPECT_EQ(ids, (std::vector{0})); +} + +TEST_F(JavaCompatTest, Phrase_BananaCherry) { + auto reader = OpenFixture("english_simple.archive"); + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, "banana cherry"); + // "banana cherry" consecutive in doc 0 ("apple banana cherry") and doc 2 ("banana cherry") + EXPECT_EQ(ids, (std::vector{0, 2})); +} + +// ============================================================================ +// 5. PREFIX — byte-level (not tokenized) via RegexQuery +// ============================================================================ + +TEST_F(JavaCompatTest, Prefix_Ap) { + auto reader = OpenFixture("english_simple.archive"); + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::PREFIX, "ap"); + // Tokens starting with "ap": "apple" → docs 0, 1, 4, 7 + EXPECT_EQ(ids, (std::vector{0, 1, 4, 7})); +} + +// ============================================================================ +// 6. WILDCARD — glob-style via regex +// ============================================================================ + +TEST_F(JavaCompatTest, Wildcard_Err) { + auto reader = OpenFixture("english_simple.archive"); + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::WILDCARD, "*err*"); + // Tokens matching *err*: "cherry" (0,2,4,6,9), "elderberry" (5,8) + EXPECT_EQ(ids, (std::vector{0, 2, 4, 5, 6, 8, 9})); +} + +// ============================================================================ +// 7. 
row_id invariant — must return the *caller-supplied* row_ids (not doc_ids) +// ============================================================================ + +TEST_F(JavaCompatTest, AllDocsReachableByRowId) { + auto reader = OpenFixture("english_simple.archive"); + // Union of all terms matches all 10 docs. + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY, + "apple banana cherry durian fig grape elderberry"); + EXPECT_EQ(ids, (std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); + // This confirms Java wrote row_ids 0..9 via `addDocument(rowId, text)` and + // paimon-cpp V3 reader extracted them via fast_fields().u64("row_id") — + // the schema B1 invariant survives round-trip across implementations. +} + +// ============================================================================ +// 8. Probe: real paimon-java production archive (handed over by Java team). +// Data was claimed to be (id INT, content STRING) with 5 rows but ids +// rewritten multiple times; dump layout + per-term hits so caller can +// reverse-engineer what's actually inside. 
+// ============================================================================ + +TEST_F(JavaCompatTest, ProductionSampleProbe) { + const std::string fixture_name = "production_sample.archive"; + const std::string fixture_dir = PAIMON_TANTIVY_JAVA_FIXTURE_DIR; + const std::string archive_path = PathUtil::JoinPath(fixture_dir, fixture_name); + + // 1) parse archive header, dump layout + auto stream_res = fs_->Open(archive_path); + ASSERT_TRUE(stream_res.ok()) << stream_res.status().ToString(); + std::shared_ptr stream = std::move(stream_res).value(); + auto layout_res = ParseArchiveHeader(stream.get()); + ASSERT_TRUE(layout_res.ok()) << layout_res.status().ToString(); + const auto& layout = layout_res.value(); + std::cerr << "[PROBE] archive=" << fixture_name << " file_count=" << layout.count << "\n"; + for (std::size_t i = 0; i < layout.count; ++i) { + std::cerr << " [" << i << "] " << layout.names[i] << " offset=" << layout.offsets[i] + << " length=" << layout.lengths[i] << "\n"; + } + + // 2) open reader and print the schema-declared tokenizer name + auto reader = OpenFixture(fixture_name); + ASSERT_TRUE(reader != nullptr); + + // 3) scan for keywords we'd expect based on user-provided text samples + // ("Apache Paimon / full-text search / vector / lumina / streaming / ..."). + // tokenizer is "default" — lowercased word-granular tokens. 
+ const std::vector probes = { + "apache", "paimon", "is", "a", "lake", "format", "supports", + "full", "text", "search", "in", "vector", "similarity", "using", + "lumina", "streaming", "and", "batch", "processing", "engine", + }; + + std::cerr << "[PROBE] MATCH_ALL per-term row_ids:\n"; + for (const auto& term : probes) { + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, term); + std::cerr << " " << term << " -> ["; + for (std::size_t i = 0; i < ids.size(); ++i) { + if (i > 0) std::cerr << ", "; + std::cerr << ids[i]; + } + std::cerr << "]\n"; + } + + // 4) union of everything to see every row_id present in the archive + std::string all_terms; + for (const auto& t : probes) { + if (!all_terms.empty()) all_terms += " "; + all_terms += t; + } + auto all_ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY, all_terms); + std::cerr << "[PROBE] union all probe terms -> row_id count=" << all_ids.size() << " ["; + for (std::size_t i = 0; i < all_ids.size(); ++i) { + if (i > 0) std::cerr << ", "; + std::cerr << all_ids[i]; + } + std::cerr << "]\n"; + + // 5) a few common phrases from the user's snippet + for (const auto& phrase : std::vector{ + "apache paimon", "full text", "vector similarity", "streaming and batch"}) { + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, phrase); + std::cerr << "[PROBE] PHRASE \"" << phrase << "\" -> ["; + for (std::size_t i = 0; i < ids.size(); ++i) { + if (i > 0) std::cerr << ", "; + std::cerr << ids[i]; + } + std::cerr << "]\n"; + } + + // sanity: the archive is readable at all — at least one probe term hits. 
+ bool any_hit = false; + for (const auto& term : probes) { + auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, term); + if (!ids.empty()) { + any_hit = true; + break; + } + } + EXPECT_TRUE(any_hit) << "no probe term hit; archive may be empty or schema mismatched"; +} + +// ============================================================================ +// 9. Reverse direction: paimon-cpp writes with tokenizer="default" → fixture +// consumed by paimon-java test. This test emits the archive into +// test/test_data/cpp_tantivy_fixtures/english_default.archive and +// round-trips it through the cpp reader first (schema-driven tokenizer +// dispatch picks "default" automatically via P-TK). +// ============================================================================ + +namespace { + +/// GlobalIndexFileWriter that emits to a single fixed filename under `root`. +/// Mirrors paimon-java's `FixedNameLocalFileWriter` from +/// `TantivyIndexFixtureGen.java`: `newFileName(prefix)` ignores the prefix and +/// always returns the caller-chosen name. Used to produce a stable fixture +/// path consumed by the paimon-java cross-read test. 
+class FixedNameGlobalIndexFileWriter : public GlobalIndexFileWriter { + public: + FixedNameGlobalIndexFileWriter(std::shared_ptr fs, std::string root, + std::string fixed_name) + : fs_(std::move(fs)), root_(std::move(root)), fixed_name_(std::move(fixed_name)) {} + + Result NewFileName(const std::string& /*prefix*/) const override { + return fixed_name_; + } + std::string ToPath(const std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + Result> NewOutputStream( + const std::string& file_name) const override { + return fs_->Create(ToPath(file_name), /*overwrite=*/true); + } + Result GetFileSize(const std::string& file_name) const override { + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr file_status, + fs_->GetFileStatus(ToPath(file_name))); + return file_status->GetLen(); + } + + private: + std::shared_ptr fs_; + std::string root_; + std::string fixed_name_; +}; + +/// Same 10-doc English corpus paimon-java uses in TantivyIndexFixtureGen +/// (pure ASCII, no punctuation inside words). SimpleTokenizer (tantivy's +/// "default") tokenizes identically on both sides for this subset, so the +/// golden row_ids match byte-for-byte between cpp-write and java-read. +constexpr const char* kEnglishDocs[] = { + "apple banana cherry", // 0 + "apple durian", // 1 + "banana cherry", // 2 + "fig grape", // 3 + "apple cherry fig", // 4 + "banana elderberry", // 5 + "cherry durian", // 6 + "apple", // 7 + "grape fig elderberry", // 8 + "cherry fig", // 9 +}; + +} // namespace + +TEST_F(JavaCompatTest, CppWriteDefaultTokenizerForJavaCrossRead) { + // 1) Produce an archive into test/test_data/cpp_tantivy_fixtures/ via the + // production TantivyGlobalIndexWriter, configured with tantivy's + // built-in "default" tokenizer (same as paimon-java's TEXT field). + const std::string out_dir = PAIMON_TANTIVY_CPP_FIXTURE_DIR; + const std::string fixture_name = "english_default.archive"; + // Ensure dir exists (CMake does NOT create it automatically). 
+ { + auto mk = fs_->Mkdirs(out_dir); + ASSERT_TRUE(mk.ok()) << mk.ToString(); + } + // Clean any prior fixture so each test run writes fresh bytes. + { + const std::string archive_path_cleanup = PathUtil::JoinPath(out_dir, fixture_name); + auto existing = fs_->GetFileStatus(archive_path_cleanup); + if (existing.ok()) { + ASSERT_TRUE(fs_->Delete(archive_path_cleanup, false).ok()); + } + } + + auto file_writer = std::make_shared(fs_, out_dir, fixture_name); + + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + + std::map options{ + {kTantivyWriteTokenizer, "default"}, + }; + auto writer_res = + TantivyGlobalIndexWriter::Create("f0", data_type, file_writer, options, pool_); + ASSERT_TRUE(writer_res.ok()) << writer_res.status().ToString(); + auto writer = writer_res.value(); + + // Build an arrow batch from kEnglishDocs. + std::string json = "["; + for (std::size_t i = 0; i < sizeof(kEnglishDocs) / sizeof(kEnglishDocs[0]); ++i) { + if (i > 0) json += ","; + json += "[\""; + json += kEnglishDocs[i]; + json += "\"]"; + } + json += "]"; + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, json).ValueOrDie(); + ::ArrowArray c_array; + ASSERT_TRUE(arrow::ExportArray(*array, &c_array).ok()); + std::vector relative_row_ids(array->length()); + for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i; + ASSERT_TRUE(writer->AddBatch(&c_array, std::move(relative_row_ids)).ok()); + auto metas_res = writer->Finish(); + ASSERT_TRUE(metas_res.ok()) << metas_res.status().ToString(); + ASSERT_EQ(metas_res.value().size(), 1u); + const auto& meta = metas_res.value().front(); + const std::string archive_path = meta.file_path; + std::cerr << "[CPP-WRITE] archive_path=" << archive_path << " file_size=" << meta.file_size + << "\n"; + + // 2) Archive header sanity: 16+ files, meta.json present, tokenizer in schema. 
+ auto stream_res = fs_->Open(archive_path); + ASSERT_TRUE(stream_res.ok()) << stream_res.status().ToString(); + std::shared_ptr stream = std::move(stream_res).value(); + auto layout_res = ParseArchiveHeader(stream.get()); + ASSERT_TRUE(layout_res.ok()) << layout_res.status().ToString(); + const auto& layout = layout_res.value(); + std::cerr << "[CPP-WRITE] file_count=" << layout.count << "\n"; + bool has_meta_json = false; + for (std::size_t i = 0; i < layout.count; ++i) { + if (layout.names[i] == "meta.json") has_meta_json = true; + } + EXPECT_TRUE(has_meta_json); + + // 3) Round-trip through the cpp reader first — P-TK must auto-register + // "default" from the schema so the search path works without passing + // any reader-side tokenizer config. + // Build a reader directly off the archive path (mirrors OpenFixture + // but rooted at the cpp fixtures dir). + auto file_status = fs_->GetFileStatus(archive_path).value(); + int64_t file_size = file_status->GetLen(); + auto meta_bytes = std::make_shared(std::string("{}"), pool_.get()); + GlobalIndexIOMeta io_meta(archive_path, file_size, meta_bytes); + auto reader_factory = + std::make_shared(std::map{}); + auto reader_path_factory = std::make_shared(out_dir); + auto reader_file_mgr = std::make_shared(fs_, reader_path_factory); + + auto c_schema = std::make_unique<::ArrowSchema>(); + ASSERT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok()); + + auto reader_res = + reader_factory->CreateReader(c_schema.get(), reader_file_mgr, {io_meta}, pool_); + ASSERT_TRUE(reader_res.ok()) << reader_res.status().ToString(); + auto reader = reader_res.value(); + + // Golden expectations (identical to paimon-java's english_simple.golden.json) + EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, "apple"), + (std::vector{0, 1, 4, 7})); + EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, "apple banana"), + (std::vector{0})); + EXPECT_EQ(RunSearchRowIds(reader, 
FullTextSearch::SearchType::MATCH_ANY, "durian elderberry"), + (std::vector{1, 5, 6, 8})); + EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, "apple banana"), + (std::vector{0})); + EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, "banana cherry"), + (std::vector{0, 2})); + EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::PREFIX, "ap"), + (std::vector{0, 1, 4, 7})); + EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::WILDCARD, "*err*"), + (std::vector{0, 2, 4, 5, 6, 8, 9})); + EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY, + "apple banana cherry durian fig grape elderberry"), + (std::vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); + + std::cerr << "[CPP-WRITE] SUCCESS: archive ready for paimon-java read at " << archive_path + << "\n"; +} + +} // namespace paimon::tantivy::test diff --git a/src/paimon/global_index/tantivy/tantivy_lucene_coexist_test.cpp b/src/paimon/global_index/tantivy/tantivy_lucene_coexist_test.cpp new file mode 100644 index 000000000..dbee3946a --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_lucene_coexist_test.cpp @@ -0,0 +1,294 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Stage 9 coexistence test: prove lucene-fts and tantivy-fulltext can be linked + * + instantiated + used in the same process without state collisions, and + * that GlobalIndexerFactory routes correctly between them via index_type. + * + * The two implementations are NOT cross-readable (migration plan §0 + * decision 1) — each reader only opens files written by its own writer. + * This test does NOT attempt a tantivy reader on a lucene file or vice + * versa; instead it verifies: + * + * - both factories register without symbol clashes + * - both writers can produce indexes side-by-side from identical input + * - both readers return semantically equivalent doc id sets for queries + * where tokenization differences don't matter (English bag-of-words) + * - the two indexes coexist on disk under distinct identifiers + * ("lucene-fts-global-index-*" vs "tantivy-fulltext-global-index-*") + */ + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/api.h" +#include "arrow/type.h" +#include "fmt/format.h" +#include "gtest/gtest.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_path_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/global_index_io_meta.h" +#include "paimon/global_index/global_index_reader.h" +#include "paimon/global_index/global_index_writer.h" +#include "paimon/global_index/global_indexer.h" +#include "paimon/global_index/global_indexer_factory.h" +#include "paimon/global_index/lucene/lucene_defs.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/predicate/full_text_search.h" +#include "paimon/testing/utils/testharness.h" + +#ifndef JIEBA_TEST_DICT_DIR +#error 
"JIEBA_TEST_DICT_DIR must be set at compile time" +#endif + +namespace paimon::tantivy::test { + +namespace { + +class FakeIndexPathFactory : public IndexPathFactory { + public: + explicit FakeIndexPathFactory(const std::string& root) : root_(root) {} + std::string NewPath() const override { + assert(false); + return ""; + } + std::string ToPath(const std::shared_ptr&) const override { + assert(false); + return ""; + } + std::string ToPath(const std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + bool IsExternalPath() const override { + return false; + } + + private: + std::string root_; +}; + +/// Adopt one of the two factory identifiers; everything else (paths, queries, +/// arrow plumbing) is shared. +struct ImplSpec { + std::string factory_id; // "lucene-fts" or "tantivy-fulltext" + std::string file_prefix; // "lucene-fts-global-index-" or "tantivy-fulltext-global-index-" + std::string option_prefix; // "lucene-fts." or "tantivy-fulltext." +}; + +class TantivyLuceneCoexistTest : public ::testing::Test { + public: + void SetUp() override { + setenv(::paimon::lucene::kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + setenv(::paimon::tantivy::kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + } + + std::unique_ptr<::ArrowSchema> CreateArrowSchema( + const std::shared_ptr& data_type) const { + auto c_schema = std::make_unique<::ArrowSchema>(); + EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok()); + return c_schema; + } + + Result WriteWith(const ImplSpec& impl, const std::string& root, + const std::shared_ptr& data_type, + const std::map& options, + const std::shared_ptr& array) const { + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr indexer, + GlobalIndexerFactory::Get(impl.factory_id, options)); + if (!indexer) { + return Status::Invalid(fmt::format("factory returned null for {}", impl.factory_id)); + } + auto path_factory = std::make_shared(root); + auto file_writer = std::make_shared(fs_, 
path_factory); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr w, + indexer->CreateWriter("f0", CreateArrowSchema(data_type).get(), file_writer, pool_)); + ::ArrowArray c_array; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array)); + std::vector relative_row_ids(array->length()); + for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i; + PAIMON_RETURN_NOT_OK(w->AddBatch(&c_array, std::move(relative_row_ids))); + PAIMON_ASSIGN_OR_RAISE(auto metas, w->Finish()); + EXPECT_EQ(metas.size(), 1u); + EXPECT_TRUE( + StringUtils::StartsWith(PathUtil::GetName(metas[0].file_path), impl.file_prefix)) + << metas[0].file_path << " did not start with " << impl.file_prefix; + return metas[0]; + } + + Result> OpenReader( + const ImplSpec& impl, const std::string& root, + const std::shared_ptr& data_type, + const std::map& options, const GlobalIndexIOMeta& meta) const { + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr indexer, + GlobalIndexerFactory::Get(impl.factory_id, options)); + auto path_factory = std::make_shared(root); + auto file_reader = std::make_shared(fs_, path_factory); + return indexer->CreateReader(CreateArrowSchema(data_type).get(), file_reader, {meta}, + pool_); + } + + static std::set ExtractDocIds(const std::shared_ptr& result) { + const RoaringBitmap64* bitmap = nullptr; + Result br = Status::Invalid("no result"); + if (auto scored = std::dynamic_pointer_cast(result)) { + br = scored->GetBitmap(); + } else if (auto plain = std::dynamic_pointer_cast(result)) { + br = plain->GetBitmap(); + } + EXPECT_TRUE(br.ok()) << br.status().ToString(); + bitmap = br.value(); + std::set out; + if (bitmap) { + for (auto it = bitmap->Begin(); it != bitmap->End(); ++it) { + out.insert(static_cast(*it)); + } + } + return out; + } + + protected: + std::shared_ptr pool_ = GetDefaultPool(); + std::shared_ptr fs_ = std::make_shared(); + + inline static const ImplSpec kLucene{"lucene-fts", "lucene-fts-global-index-", "lucene-fts."}; + inline static const ImplSpec 
kTantivy{"tantivy-fulltext", "tantivy-fulltext-global-index-", + "tantivy-fulltext."}; +}; + +} // namespace + +TEST_F(TantivyLuceneCoexistTest, BothFactoriesResolve) { + // No options needed; just verify both factories register and dispatch. + ASSERT_OK_AND_ASSIGN(auto lucene_indexer, GlobalIndexerFactory::Get("lucene-fts", {})); + ASSERT_OK_AND_ASSIGN(auto tantivy_indexer, GlobalIndexerFactory::Get("tantivy-fulltext", {})); + ASSERT_TRUE(lucene_indexer); + ASSERT_TRUE(tantivy_indexer); + // Sanity: factories return distinct types — different vtables → different + // GetIndexType() once we open a reader (not testable here without an + // index), so just check shared_ptr identity differs. + EXPECT_NE(static_cast(lucene_indexer.get()), static_cast(tantivy_indexer.get())); +} + +TEST_F(TantivyLuceneCoexistTest, SideBySideEnglishCorpusReturnsSameDocIds) { + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([ + ["alpha beta gamma document"], + ["alpha alpha document"], + ["gamma delta epsilon"], + ["alpha beta document document"] + ])") + .ValueOrDie(); + + auto lucene_root = paimon::test::UniqueTestDirectory::Create(); + auto tantivy_root = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(lucene_root && tantivy_root); + + // Lucene requires a tmp directory option; tantivy ignores unknown keys. + std::map lucene_options = { + {"lucene-fts.write.tmp.directory", lucene_root->Str()}}; + + // Write through BOTH factories side by side in the same process. 
+ ASSERT_OK_AND_ASSIGN(auto lucene_meta, + WriteWith(kLucene, lucene_root->Str(), data_type, lucene_options, array)); + ASSERT_OK_AND_ASSIGN(auto tantivy_meta, + WriteWith(kTantivy, tantivy_root->Str(), data_type, {}, array)); + + ASSERT_OK_AND_ASSIGN(auto lucene_reader, + OpenReader(kLucene, lucene_root->Str(), data_type, {}, lucene_meta)); + ASSERT_OK_AND_ASSIGN(auto tantivy_reader, + OpenReader(kTantivy, tantivy_root->Str(), data_type, {}, tantivy_meta)); + EXPECT_EQ(lucene_reader->GetIndexType(), std::string("lucene-fts")); + EXPECT_EQ(tantivy_reader->GetIndexType(), std::string("tantivy-fulltext")); + + auto run_pair = [&](const std::string& q, FullTextSearch::SearchType t) { + auto lr = lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt)); + auto tr = tantivy_reader->VisitFullTextSearch(std::make_shared( + "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt)); + EXPECT_TRUE(lr.ok()) << "lucene: " << lr.status().ToString(); + EXPECT_TRUE(tr.ok()) << "tantivy: " << tr.status().ToString(); + return std::make_pair(ExtractDocIds(lr.value()), ExtractDocIds(tr.value())); + }; + + // For an English bag-of-words corpus the two implementations should agree + // on which docs contain which terms — Lucene and tantivy both store + // lowercased word tokens. 
+ { + auto [l, t] = run_pair("document", FullTextSearch::SearchType::MATCH_ALL); + EXPECT_EQ(l, t) << "MATCH_ALL document — lucene vs tantivy doc id set differs"; + EXPECT_EQ(l, (std::set{0, 1, 3})); + } + { + auto [l, t] = run_pair("alpha beta", FullTextSearch::SearchType::MATCH_ALL); + EXPECT_EQ(l, t) << "MATCH_ALL 'alpha beta' — sets differ"; + EXPECT_EQ(l, (std::set{0, 3})); + } + { + auto [l, t] = run_pair("alpha epsilon", FullTextSearch::SearchType::MATCH_ANY); + EXPECT_EQ(l, t) << "MATCH_ANY 'alpha epsilon' — sets differ"; + EXPECT_EQ(l, (std::set{0, 1, 2, 3})); + } + { + auto [l, t] = run_pair("alpha beta", FullTextSearch::SearchType::PHRASE); + EXPECT_EQ(l, t) << "PHRASE 'alpha beta' — sets differ"; + EXPECT_EQ(l, (std::set{0, 3})); + } +} + +TEST_F(TantivyLuceneCoexistTest, IndependentLifecycleNoStateLeakage) { + // Build a lucene index and a tantivy index back-to-back many times in the + // same process; if either factory leaked global state across instances + // we'd see crashes or stale results. 
+ auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + + for (int round = 0; round < 3; ++round) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([ + ["round payload one"], + ["round payload two"] + ])") + .ValueOrDie(); + auto lroot = paimon::test::UniqueTestDirectory::Create(); + auto troot = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(lroot && troot); + + std::map lopt = { + {"lucene-fts.write.tmp.directory", lroot->Str()}}; + ASSERT_OK_AND_ASSIGN(auto lm, WriteWith(kLucene, lroot->Str(), data_type, lopt, array)); + ASSERT_OK_AND_ASSIGN(auto tm, WriteWith(kTantivy, troot->Str(), data_type, {}, array)); + ASSERT_OK_AND_ASSIGN(auto lr, OpenReader(kLucene, lroot->Str(), data_type, {}, lm)); + ASSERT_OK_AND_ASSIGN(auto tr, OpenReader(kTantivy, troot->Str(), data_type, {}, tm)); + + auto lq = lr->VisitFullTextSearch(std::make_shared( + "f0", std::nullopt, "payload", FullTextSearch::SearchType::MATCH_ALL, std::nullopt)); + auto tq = tr->VisitFullTextSearch(std::make_shared( + "f0", std::nullopt, "payload", FullTextSearch::SearchType::MATCH_ALL, std::nullopt)); + ASSERT_TRUE(lq.ok()); + ASSERT_TRUE(tq.ok()); + EXPECT_EQ(ExtractDocIds(lq.value()), (std::set{0, 1})) << "lucene round " << round; + EXPECT_EQ(ExtractDocIds(tq.value()), (std::set{0, 1})) + << "tantivy round " << round; + } +} + +} // namespace paimon::tantivy::test diff --git a/src/paimon/global_index/tantivy/tantivy_reader_test.cpp b/src/paimon/global_index/tantivy/tantivy_reader_test.cpp new file mode 100644 index 000000000..ba3fe6299 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_reader_test.cpp @@ -0,0 +1,220 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Stage 6 reader test: write an index via TantivyGlobalIndexWriter, persist + * it, then run all 5 FullTextSearch SearchTypes through TantivyGlobalIndexReader + * and assert matching local row ids. Mirrors the no-limit / no-pre_filter + * subset of paimon-lucene-index-test's TestSimple/TestSimpleChinese cases. + * + * limit / pre_filter coverage lands in Stage 7 (paimon-tantivy-filter-limit-test). + */ + +#include +#include + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/api.h" +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_path_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/global_index/tantivy/tantivy_global_index_reader.h" +#include "paimon/global_index/tantivy/tantivy_global_index_writer.h" +#include "paimon/testing/utils/testharness.h" + +#ifndef JIEBA_TEST_DICT_DIR +#error "JIEBA_TEST_DICT_DIR must be set at compile time" +#endif + +namespace paimon::tantivy::test { + +namespace { + +class FakeIndexPathFactory : public IndexPathFactory { + public: + explicit FakeIndexPathFactory(const std::string& root) : root_(root) {} + std::string NewPath() const override { + assert(false); + return ""; + } + std::string ToPath(const std::shared_ptr&) const override { + assert(false); + return ""; + } + std::string ToPath(const 
std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + bool IsExternalPath() const override { + return false; + } + + private: + std::string root_; +}; + +class TantivyReaderTest : public ::testing::Test { + public: + void SetUp() override { + setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + } + + /// Write `array` to a fresh test directory and return (file_manager, meta). + std::pair, GlobalIndexIOMeta> WriteAndOpen( + const std::shared_ptr& array, + const std::map& options) { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + EXPECT_TRUE(root_dir); + // Hold the directory alive across this test by leaking the + // unique_ptr's owned dir into a static — UniqueTestDirectory::Create + // returns RAII; need the path to outlive the function. + // Easier path: reach in via member, save root string, then wrap a + // fresh GlobalIndexFileManager pointing at that string. + std::string root = root_dir->Str(); + // keep the directory alive + kept_dirs_.push_back(std::move(root_dir)); + + auto path_factory = std::make_shared(root); + auto fm = std::make_shared(fs_, path_factory); + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto writer_res = + TantivyGlobalIndexWriter::Create("f0", data_type, fm, options, GetDefaultPool()); + EXPECT_TRUE(writer_res.ok()) << writer_res.status().ToString(); + auto writer = writer_res.value(); + ::ArrowArray c_array; + EXPECT_TRUE(arrow::ExportArray(*array, &c_array).ok()); + std::vector relative_row_ids(array->length()); + for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i; + EXPECT_TRUE(writer->AddBatch(&c_array, std::move(relative_row_ids)).ok()); + auto metas_res = writer->Finish(); + EXPECT_TRUE(metas_res.ok()) << metas_res.status().ToString(); + return {fm, metas_res.value()[0]}; + } + + static std::vector BitmapToVec(const std::shared_ptr& result) { + auto bg = std::dynamic_pointer_cast(result); + EXPECT_TRUE(bg) << 
"expected BitmapGlobalIndexResult"; + auto bitmap_res = bg->GetBitmap(); + EXPECT_TRUE(bitmap_res.ok()) << bitmap_res.status().ToString(); + const RoaringBitmap64* bitmap = bitmap_res.value(); + std::vector ids; + for (auto it = bitmap->Begin(); it != bitmap->End(); ++it) { + ids.push_back(static_cast(*it)); + } + std::sort(ids.begin(), ids.end()); + return ids; + } + + std::shared_ptr DataType() const { + return arrow::struct_({arrow::field("f0", arrow::utf8())}); + } + + protected: + std::shared_ptr fs_ = std::make_shared(); + /// Keep test directories alive for the duration of the test. + std::vector> kept_dirs_; +}; + +} // namespace + +TEST_F(TantivyReaderTest, EnglishMatchAllAndAny) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["This is an test document."], + ["This is an new document document document."], + ["Document document document document test."], + ["unordered user-defined doc id"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + + auto run = [&](const std::string& q, FullTextSearch::SearchType t) { + auto res = reader->VisitFullTextSearch(std::make_shared( + "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt)); + EXPECT_TRUE(res.ok()) << res.status().ToString(); + return BitmapToVec(res.value()); + }; + + EXPECT_EQ(run("document", FullTextSearch::SearchType::MATCH_ALL), + (std::vector{0, 1, 2})); + EXPECT_EQ(run("test document", FullTextSearch::SearchType::MATCH_ALL), + (std::vector{0, 2})); + EXPECT_EQ(run("test new", FullTextSearch::SearchType::MATCH_ANY), + (std::vector{0, 1, 2})); +} + +TEST_F(TantivyReaderTest, EnglishPhrasePrefixWildcard) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ + ["This is an test document."], + ["This is an new document document document."], + ["Document document document document test."], + ["unordered user-defined 
doc id"] + ])") + .ValueOrDie(); + auto [fm, meta] = WriteAndOpen(array, {}); + ASSERT_OK_AND_ASSIGN(auto reader, + TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool())); + + auto run = [&](const std::string& q, FullTextSearch::SearchType t) { + auto res = reader->VisitFullTextSearch(std::make_shared( + "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt)); + EXPECT_TRUE(res.ok()) << res.status().ToString(); + return BitmapToVec(res.value()); + }; + + // "test document" is consecutive only in row 0 ("an test document.") + EXPECT_EQ(run("test document", FullTextSearch::SearchType::PHRASE), (std::vector{0})); + EXPECT_EQ(run("unorder", FullTextSearch::SearchType::PREFIX), (std::vector{3})); + EXPECT_EQ(run("*order*", FullTextSearch::SearchType::WILDCARD), (std::vector{3})); + EXPECT_EQ(run("*or*er*", FullTextSearch::SearchType::WILDCARD), (std::vector{3})); +} + +TEST_F(TantivyReaderTest, ChineseQueryMode) { + auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([ +["QianWen 是一个基于 AI 的智能助手,类似于 Siri 和 Alexa。我们正在用 Python 开发 QianWen 的 Natural Language Understanding 模块,该模块支持多轮对话和意图识别功能,是新一代智能助手的核心技术之一。"], +["最近开源了一个新项目叫qianwen(全角字符),功能类似之前的 Qianwen,是一个面向 AI 应用的智能助手。它不仅支持 Machine Learning 和 NLP 技术,还提供了可扩展的开发框架,便于开发者构建自己的智能助手系统。"], +["我们在测试 qianwen-core v1.2 和 ai-engine-alpha 中的 bug,重点优化了 qianwen 的响应速度和稳定性。本次更新增强了核心模块的功能,提升了智能助手的开发效率,并修复了与 NLP 模块相关的多个问题。"], +["AI 助手开发中常用的技术包括 Speech Recognition、Natural Language Processing 和 Recommendation System。我们使用 TensorFlow 和 PyTorch 构建模型,开发了多个智能助手原型,支持语音交互和上下文理解功能,是当前热门的人工智能发展应用方向。"], +["新一代的 AI 助手代号为「千问」,内部命名为 QianwenX-2024,计划在 next quarter 发布。QianwenX 将集成更强的 multimodel 能力,支持图像和文本联合处理,进一步提升智能助手的理解能力和交互体验,是未来智能助手的重要发展方向。"] + ])") + .ValueOrDie(); + std::map options = { + {kTantivyWriteTokenizer, "paimon_jieba"}, + {kJiebaTokenizeMode, "query"}, + }; + auto [fm, meta] = WriteAndOpen(array, options); + ASSERT_OK_AND_ASSIGN( + auto reader, TantivyGlobalIndexReader::Create("f0", meta, fm, 
options, GetDefaultPool())); + + auto run = [&](const std::string& q, FullTextSearch::SearchType t) { + auto res = reader->VisitFullTextSearch(std::make_shared( + "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt)); + EXPECT_TRUE(res.ok()) << res.status().ToString(); + return BitmapToVec(res.value()); + }; + + EXPECT_EQ(run("模块", FullTextSearch::SearchType::MATCH_ALL), (std::vector{0, 2})); + EXPECT_EQ(run("模块技术", FullTextSearch::SearchType::MATCH_ALL), (std::vector{0})); + EXPECT_EQ(run("模块技术", FullTextSearch::SearchType::MATCH_ANY), + (std::vector{0, 1, 2, 3})); + EXPECT_EQ(run("发展方向", FullTextSearch::SearchType::PHRASE), (std::vector{4})); +} + +} // namespace paimon::tantivy::test diff --git a/src/paimon/global_index/tantivy/tantivy_smoke_test.cpp b/src/paimon/global_index/tantivy/tantivy_smoke_test.cpp new file mode 100644 index 000000000..04f7915c7 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_smoke_test.cpp @@ -0,0 +1,52 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * tantivy-fulltext Stage 1 smoke test: prove the Rust FFI bridge is callable from C++. + * Intentionally minimal — exercises only paimon_tantivy_version(). + * Later stages add real functional tests. 
+ */ + +#include +#include + +#include "gtest/gtest.h" + +extern "C" { +#include "paimon_tantivy_ffi.h" // NOLINT(build/include_subdir) +} + +namespace paimon::tantivy { + +TEST(TantivySmoke, VersionIsReachable) { + const char* version = paimon_tantivy_version(); + ASSERT_NE(version, nullptr) << "paimon_tantivy_version returned null"; + + const std::string v(version); + EXPECT_FALSE(v.empty()); + // build.rs pins version from Cargo.toml (CARGO_PKG_VERSION), semver "x.y.z" + EXPECT_NE(v.find('.'), std::string::npos) << "expected semver, got: " << v; +} + +TEST(TantivySmoke, VersionPointerIsStable) { + // The pointer is documented as 'static — two calls should return either + // the same pointer or at least equivalent string content. + const char* v1 = paimon_tantivy_version(); + const char* v2 = paimon_tantivy_version(); + ASSERT_NE(v1, nullptr); + ASSERT_NE(v2, nullptr); + EXPECT_EQ(std::strcmp(v1, v2), 0); +} + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_stream_ctx.cpp b/src/paimon/global_index/tantivy/tantivy_stream_ctx.cpp new file mode 100644 index 000000000..b45572a71 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_stream_ctx.cpp @@ -0,0 +1,78 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ */ + +#include "paimon/global_index/tantivy/tantivy_stream_ctx.h" + +#include + +#include "fmt/format.h" +#include "paimon/fs/file_system.h" + +namespace paimon::tantivy { + +extern "C" int32_t paimon_cpp_stream_read_at(void* ctx_ptr, uint64_t offset, std::size_t len, + uint8_t* out_buf) { + if (ctx_ptr == nullptr || out_buf == nullptr) { + return 1; + } + auto* ctx = static_cast(ctx_ptr); + std::lock_guard lock(ctx->pread_mu); + + std::size_t total = 0; + while (total < len) { + auto r = ctx->stream->Read(reinterpret_cast(out_buf + total), + static_cast(len - total), offset + total); + if (!r.ok()) { + return 1; + } + int32_t got = r.value(); + if (got <= 0) { + return 1; // unexpected EOF / 0-byte read + } + total += static_cast(got); + } + return 0; +} + +extern "C" void paimon_cpp_stream_release(void* ctx_ptr) { + if (ctx_ptr == nullptr) { + return; + } + auto* ctx = static_cast(ctx_ptr); + // ~shared_ptr closes the underlying stream. + delete ctx; +} + +extern "C" int32_t paimon_cpp_writer_push(void* ctx_ptr, const uint8_t* data, std::size_t len) { + if (ctx_ptr == nullptr) { + return 1; + } + auto* ctx = static_cast(ctx_ptr); + if (ctx->out == nullptr) { + ctx->last_error = Status::Invalid("writer_push: null OutputStream"); + return 1; + } + std::size_t total = 0; + while (total < len) { + auto r = ctx->out->Write(reinterpret_cast(data + total), + static_cast(len - total)); + if (!r.ok()) { + ctx->last_error = r.status(); + return 1; + } + int32_t written = r.value(); + if (written <= 0) { + ctx->last_error = Status::IOError(fmt::format( + "writer_push: short write (wrote {} of {} bytes)", written, len - total)); + return 1; + } + total += static_cast(written); + } + return 0; +} + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_stream_ctx.h b/src/paimon/global_index/tantivy/tantivy_stream_ctx.h new file mode 100644 index 000000000..532ca4e35 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_stream_ctx.h @@ 
-0,0 +1,62 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/status.h" + +namespace paimon { +class InputStream; +class OutputStream; +} // namespace paimon + +namespace paimon::tantivy { + +/// C++ side wrapper around a seekable InputStream, used as the `ctx` of +/// `PaimonStreamCallbacks` (V3). Lifetime is transferred to Rust via +/// `paimon_tantivy_reader_new_streaming`; Rust invokes `paimon_cpp_stream_release` +/// when the reader handle is freed, which `delete`s this struct. +/// +/// `pread_mu` is a defensive per-ctx lock: the underlying `InputStream::Read( +/// buffer, size, offset)` is declared pread-style (thread-safe, no position +/// mutation) but a few subclasses (notably `JindoInputStream`) have member- +/// variable races in practice. Rust also has its own `stream_mutex` that +/// serializes reads at the Directory level; `pread_mu` is belt-and-suspenders. +struct StreamCtx { + std::shared_ptr stream; + std::mutex pread_mu; +}; + +/// `ctx` of `PaimonWriteCallbacks` (W1). Holds a raw (non-owning) pointer to +/// a paimon `OutputStream` plus a sticky error for conveying write failures +/// back to the C++ caller of `TantivyGlobalIndexWriter::Finish`. +struct WriteCtx { + OutputStream* out = nullptr; + Status last_error = Status::OK(); +}; + +/// Rust -> C++ read callback. Reads `len` bytes starting at archive-absolute +/// `offset` into `out_buf`. Returns 0 on success, 1 on IO error. Thread-safe +/// (serialized via `StreamCtx::pread_mu`; Rust also holds its own mutex). +extern "C" int32_t paimon_cpp_stream_read_at(void* ctx_ptr, uint64_t offset, std::size_t len, + uint8_t* out_buf); + +/// Rust -> C++ release callback. Called exactly once when the Rust reader is +/// dropped. 
Deletes the ctx (which closes the underlying stream via ~shared_ptr). +extern "C" void paimon_cpp_stream_release(void* ctx_ptr); + +/// Rust -> C++ write push callback. Writes `len` bytes from `data` to the +/// underlying OutputStream. Returns 0 on success, 1 on IO error (with the +/// detailed Status stashed in `WriteCtx::last_error` for the caller to pick up). +extern "C" int32_t paimon_cpp_writer_push(void* ctx_ptr, const uint8_t* data, std::size_t len); + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_streaming_test.cpp b/src/paimon/global_index/tantivy/tantivy_streaming_test.cpp new file mode 100644 index 000000000..7c9a6e0f7 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_streaming_test.cpp @@ -0,0 +1,370 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * K4 streaming test: V3 Callback Directory + W1 streaming writer end-to-end. + * + * Coverage: + * 1. ParseArchiveHeaderFuzz — malformed header bytes rejected cleanly + * 2. ConcurrentQueryOnSameReader — 4 threads query same reader, serialized + * by Rust stream_mutex, results consistent, no race + * 3. ConcurrentCreateAndDropReaders — 10 threads each open/query/close their + * own reader on the same archive; no leaks, release exactly-once per reader + * 4. 
StreamingBenchmarkLog — builds a medium index, prints RSS/timing to + * stderr for baseline comparison (execute.md archival) + * + * We don't duplicate tests already covered by the Rust unit tests + * (callback_directory::tests::* for Directory semantics, writer::tests:: + * streaming_chunk_size_bounded_by_buffer for the 64KB buffer guarantee). + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_path_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/tantivy/tantivy_archive_layout.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/global_index/tantivy/tantivy_global_index.h" +#include "paimon/global_index/tantivy/tantivy_global_index_reader.h" +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/predicate/full_text_search.h" +#include "paimon/testing/utils/testharness.h" + +#ifndef JIEBA_TEST_DICT_DIR +#error "JIEBA_TEST_DICT_DIR must be set at compile time" +#endif + +namespace paimon::tantivy::test { + +namespace { + +class FakeIndexPathFactory : public IndexPathFactory { + public: + explicit FakeIndexPathFactory(const std::string& root) : root_(root) {} + std::string NewPath() const override { + assert(false); + return ""; + } + std::string ToPath(const std::shared_ptr&) const override { + assert(false); + return ""; + } + std::string ToPath(const std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + bool IsExternalPath() const override { + return false; + } + + private: + std::string root_; +}; + +/// Helper: build an archive with `n` 
documents, return the GlobalIndexIOMeta. +/// Holds the tmp dir alive (via `holder`) so it's cleaned up when the +/// WriteResult goes out of scope. +struct WriteResult { + std::unique_ptr holder; + std::string root_dir; + GlobalIndexIOMeta meta; +}; + +class StreamingTestFixture : public ::testing::Test { + public: + void SetUp() override { + setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + } + + WriteResult BuildArchive(std::size_t n_docs, + const std::string& text_template = "apple banana cherry {}") { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + EXPECT_TRUE(root_dir); + std::string root = root_dir->Str(); + + // Build arrow StringArray + arrow::StringBuilder sb; + for (std::size_t i = 0; i < n_docs; ++i) { + char buf[128]; + std::snprintf(buf, sizeof(buf), text_template.c_str(), i); + EXPECT_TRUE(sb.Append(buf).ok()); + } + auto text_array = sb.Finish().ValueOrDie(); + auto struct_array = + arrow::StructArray::Make({text_array}, {arrow::field("f0", arrow::utf8())}) + .ValueOrDie(); + + std::map options; + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto c_schema = std::make_unique<::ArrowSchema>(); + EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok()); + auto global_index = std::make_shared(options); + auto path_factory = std::make_shared(root); + auto file_writer = std::make_shared(fs_, path_factory); + auto w = global_index->CreateWriter("f0", c_schema.get(), file_writer, pool_).value(); + ::ArrowArray c_array; + EXPECT_TRUE(arrow::ExportArray(*struct_array, &c_array).ok()); + std::vector relative_row_ids(struct_array->length()); + for (int64_t i = 0; i < struct_array->length(); ++i) relative_row_ids[i] = i; + EXPECT_TRUE(w->AddBatch(&c_array, std::move(relative_row_ids)).ok()); + auto metas = w->Finish().value(); + EXPECT_EQ(metas.size(), 1u); + + // Move root_dir into the result — it stays alive as long as the + // caller holds WriteResult; cleaned up when TEST_F scope exits. 
+ return WriteResult{std::move(root_dir), std::move(root), metas[0]}; + } + + std::shared_ptr OpenReader(const std::string& root, + const GlobalIndexIOMeta& meta) { + std::map options; + auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())}); + auto c_schema = std::make_unique<::ArrowSchema>(); + EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok()); + auto global_index = std::make_shared(options); + auto path_factory = std::make_shared(root); + auto file_reader = std::make_shared(fs_, path_factory); + return global_index->CreateReader(c_schema.get(), file_reader, {meta}, pool_).value(); + } + + std::shared_ptr BuildMatchAll(const std::string& query) { + return std::make_shared( + /*_field_name=*/"f0", + /*_limit=*/std::optional{}, + /*_query=*/query, + /*_search_type=*/FullTextSearch::SearchType::MATCH_ALL, + /*_pre_filter=*/std::optional{}); + } + + protected: + std::shared_ptr pool_ = GetDefaultPool(); + std::shared_ptr fs_ = std::make_shared(); +}; + +// ========================================================================= +// 1. 
ParseArchiveHeader fuzz +// ========================================================================= + +TEST(ParseArchiveHeaderFuzz, TruncatedHeader) { + // Fewer than 4 bytes → DataInputStream::ReadValue fails + std::string bytes = "\x00\x00"; + ByteArrayInputStream in(bytes.data(), bytes.size()); + auto r = ParseArchiveHeader(&in); + EXPECT_FALSE(r.ok()) << "expected failure on truncated header"; +} + +TEST(ParseArchiveHeaderFuzz, NegativeFileCount) { + // BE int32 -1 = 0xFFFFFFFF + char bytes[4] = {static_cast(0xFF), static_cast(0xFF), static_cast(0xFF), + static_cast(0xFF)}; + ByteArrayInputStream in(bytes, 4); + auto r = ParseArchiveHeader(&in); + ASSERT_FALSE(r.ok()); + EXPECT_NE(r.status().message().find("negative file_count"), std::string::npos) + << r.status().ToString(); +} + +TEST(ParseArchiveHeaderFuzz, NameLenOutOfRange) { + // file_count=1, name_len=2GB (BE int32 0x7FFFFFFF) + char bytes[8] = {0, + 0, + 0, + 1, + static_cast(0x7F), + static_cast(0xFF), + static_cast(0xFF), + static_cast(0xFF)}; + ByteArrayInputStream in(bytes, 8); + auto r = ParseArchiveHeader(&in); + ASSERT_FALSE(r.ok()); + EXPECT_NE(r.status().message().find("bad name_len"), std::string::npos) + << r.status().ToString(); +} + +TEST(ParseArchiveHeaderFuzz, ZeroFileCountSucceeds) { + // file_count=0 is structurally valid; caller will fail later when + // tantivy::Index::open finds no meta.json, but parse itself OK. 
+ char bytes[4] = {0, 0, 0, 0}; + ByteArrayInputStream in(bytes, 4); + auto r = ParseArchiveHeader(&in); + ASSERT_TRUE(r.ok()) << r.status().ToString(); + EXPECT_EQ(r.value().count, 0u); +} + +TEST(ParseArchiveHeaderFuzz, PayloadLenNegative) { + // file_count=1, name_len=1, name="a", data_len=-1 (BE int64 0xFFFFFFFFFFFFFFFF) + char bytes[4 + 4 + 1 + 8] = { + // file_count=1 + 0, + 0, + 0, + 1, + // name_len=1 + 0, + 0, + 0, + 1, + // name='a' + 'a', + // data_len = -1 (BE int64 0xFFFFFFFFFFFFFFFF) + static_cast(0xFF), + static_cast(0xFF), + static_cast(0xFF), + static_cast(0xFF), + static_cast(0xFF), + static_cast(0xFF), + static_cast(0xFF), + static_cast(0xFF), + }; + ByteArrayInputStream in(bytes, sizeof(bytes)); + auto r = ParseArchiveHeader(&in); + ASSERT_FALSE(r.ok()); + EXPECT_NE(r.status().message().find("negative data_len"), std::string::npos) + << r.status().ToString(); +} + +// ========================================================================= +// 2. Concurrent query on same reader +// ========================================================================= + +TEST_F(StreamingTestFixture, ConcurrentQueryOnSameReader) { + // 50 docs containing "apple" in every one (all should match) + auto wr = BuildArchive(50, "apple banana {}"); + auto reader = OpenReader(wr.root_dir, wr.meta); + + auto fts = BuildMatchAll("apple"); + + // 4 threads × 20 queries each, all must return 50 rowIds + constexpr int kThreads = 4; + constexpr int kIters = 20; + std::vector threads; + std::atomic failures{0}; + for (int t = 0; t < kThreads; ++t) { + threads.emplace_back([&] { + for (int i = 0; i < kIters; ++i) { + auto result = reader->VisitFullTextSearch(fts); + if (!result.ok() || !result.value()) { + failures++; + continue; + } + std::shared_ptr r = result.value(); + auto plain = std::dynamic_pointer_cast(r); + if (!plain) { + failures++; + continue; + } + auto bres = plain->GetBitmap(); + if (!bres.ok() || bres.value() == nullptr || bres.value()->Cardinality() != 50) 
{ + failures++; + } + } + }); + } + for (auto& th : threads) th.join(); + EXPECT_EQ(failures.load(), 0) << "concurrent queries produced inconsistent results"; +} + +// ========================================================================= +// 3. Concurrent reader open + close +// ========================================================================= + +TEST_F(StreamingTestFixture, ConcurrentCreateAndDropReaders) { + // One archive, many readers opening/closing it concurrently. + // Validates exactly-once release (no UAF under ASAN) and open/close race safety. + auto wr = BuildArchive(20); + + constexpr int kThreads = 10; + std::vector threads; + std::atomic failures{0}; + for (int t = 0; t < kThreads; ++t) { + threads.emplace_back([&, t] { + for (int i = 0; i < 5; ++i) { + auto reader = OpenReader(wr.root_dir, wr.meta); + if (!reader) { + failures++; + continue; + } + auto fts = BuildMatchAll("apple"); + auto r = reader->VisitFullTextSearch(fts); + if (!r.ok()) { + failures++; + } + // reader drops here → Rust Arc::drop → paimon_cpp_stream_release + } + (void)t; + }); + } + for (auto& th : threads) th.join(); + EXPECT_EQ(failures.load(), 0); +} + +// ========================================================================= +// 4. 
Benchmark log (non-assertion; archived to execute.md) +// ========================================================================= + +TEST_F(StreamingTestFixture, StreamingBenchmarkLog) { + auto rss_kb = []() { + struct rusage ru; + getrusage(RUSAGE_SELF, &ru); + // Linux: KB; macOS: bytes + return static_cast(ru.ru_maxrss); + }; + + int64_t rss_before = rss_kb(); + auto t0 = std::chrono::steady_clock::now(); + auto wr = BuildArchive(200); + auto t1 = std::chrono::steady_clock::now(); + int64_t rss_after_write = rss_kb(); + + auto reader = OpenReader(wr.root_dir, wr.meta); + auto t2 = std::chrono::steady_clock::now(); + int64_t rss_after_open = rss_kb(); + + auto fts = BuildMatchAll("apple"); + auto result = reader->VisitFullTextSearch(fts); + auto t3 = std::chrono::steady_clock::now(); + + auto write_ms = std::chrono::duration_cast(t1 - t0).count(); + auto open_ms = std::chrono::duration_cast(t2 - t1).count(); + auto query_ms = std::chrono::duration_cast(t3 - t2).count(); + + std::fprintf(stderr, + "[BENCHMARK] V3 streaming (200 docs): " + "write=%" PRId64 "ms open=%" PRId64 "ms query=%" PRId64 + "ms " + "rss_before=%" PRId64 "KB rss_after_write=%" PRId64 "KB rss_after_open=%" PRId64 + "KB\n", + static_cast(write_ms), static_cast(open_ms), + static_cast(query_ms), rss_before, rss_after_write, rss_after_open); + EXPECT_TRUE(result.ok()); + SUCCEED(); +} + +} // namespace +} // namespace paimon::tantivy::test diff --git a/src/paimon/global_index/tantivy/tantivy_tokenizer_test.cpp b/src/paimon/global_index/tantivy/tantivy_tokenizer_test.cpp new file mode 100644 index 000000000..27ec788a1 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_tokenizer_test.cpp @@ -0,0 +1,278 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Stage 3 golden-sample test: cppjieba vs jieba-rs (PaimonJiebaTokenizer) diff. + * + * For each mode (mp / mix / full / query), tokenize every line of + * `test/test_data/tokenizer_golden/golden_*.txt` twice: once with cppjieba + * (the existing JiebaTokenizer::CutWithMode + Normalize), once with the + * FFI-exposed PaimonJiebaTokenizer. Compare the token text sequences. + * Diffs are advisory only (logged to stderr) — per + * docs/dev/tokenizer_diff_report.md we do not require cppjieba<->jieba-rs parity. + * + * `hmm` mode is tested separately: FFI must return Unsupported. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "cppjieba/Jieba.hpp" +#include "gtest/gtest.h" +#include "paimon/global_index/lucene/jieba_analyzer.h" +#include "paimon/global_index/lucene/lucene_utils.h" +#include "paimon/global_index/tantivy/tantivy_ffi_handle.h" +#include "paimon/global_index/tantivy/tantivy_ffi_status.h" + +extern "C" { +#include "paimon_tantivy_ffi.h" // NOLINT(build/include_subdir) +} + +#ifndef JIEBA_TEST_DICT_DIR +#error "JIEBA_TEST_DICT_DIR must be set at compile time for this test" +#endif + +#ifndef PAIMON_TANTIVY_GOLDEN_DIR +#error "PAIMON_TANTIVY_GOLDEN_DIR must be set at compile time for this test" +#endif + +namespace paimon::tantivy { +namespace { + +/// Load lines from all `golden_*.txt` files (the strict corpus). +/// Files named `known_diffs*.txt` are excluded — those document known +/// cppjieba↔jieba-rs divergences and are inspected separately. 
+std::vector LoadGoldenLines() { + std::vector lines; + namespace fs = std::filesystem; + for (const auto& entry : fs::directory_iterator(PAIMON_TANTIVY_GOLDEN_DIR)) { + if (!entry.is_regular_file()) continue; + const std::string name = entry.path().filename().string(); + if (name.rfind("golden_", 0) != 0 || entry.path().extension() != ".txt") continue; + std::ifstream fin(entry.path()); + std::string line; + while (std::getline(fin, line)) { + lines.push_back(line); + } + } + return lines; +} + +/// Load lines from `known_diffs*.txt` — known divergent edge cases documented +/// in docs/dev/tokenizer_diff_report.md. +std::vector LoadKnownDiffLines() { + std::vector lines; + namespace fs = std::filesystem; + for (const auto& entry : fs::directory_iterator(PAIMON_TANTIVY_GOLDEN_DIR)) { + if (!entry.is_regular_file()) continue; + const std::string name = entry.path().filename().string(); + if (name.rfind("known_diffs", 0) != 0 || entry.path().extension() != ".txt") continue; + std::ifstream fin(entry.path()); + std::string line; + while (std::getline(fin, line)) { + lines.push_back(line); + } + } + return lines; +} + +/// Tokenize via cppjieba + Normalize (mirrors JiebaAnalyzer runtime path). +std::vector TokenizeWithCppjieba(const cppjieba::Jieba& jieba, const std::string& mode, + const std::string& text) { + std::vector terms; + ::paimon::lucene::JiebaTokenizer::CutWithMode(mode, &jieba, text, &terms); + std::vector normalized_views; + ::paimon::lucene::JiebaTokenizer::Normalize(jieba.extractor.GetStopWords(), &terms, + &normalized_views); + std::vector result; + result.reserve(normalized_views.size()); + for (auto v : normalized_views) result.emplace_back(v); + return result; +} + +/// Parse the FFI `tokenize` output (tab-separated: from\tto\tpos\ttext\n) and +/// return only the token text sequence. 
+std::vector ExtractTokenTexts(const PaimonTantivyBuffer& buf) { + std::vector out; + if (buf.len == 0) return out; + std::string s(reinterpret_cast(buf.data), buf.len); + std::istringstream in(s); + std::string row; + while (std::getline(in, row)) { + // extract text field = after 3rd '\t' + size_t p1 = row.find('\t'); + if (p1 == std::string::npos) continue; + size_t p2 = row.find('\t', p1 + 1); + if (p2 == std::string::npos) continue; + size_t p3 = row.find('\t', p2 + 1); + if (p3 == std::string::npos) continue; + out.emplace_back(row.substr(p3 + 1)); + } + return out; +} + +std::vector TokenizeWithTantivy(PaimonJiebaTokenizer* tok, const std::string& text) { + BufferGuard buf; + PaimonTantivyStatus st = + paimon_tantivy_tokenizer_tokenize(tok, text.data(), text.size(), buf.out()); + EXPECT_EQ(st, PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_OK) + << "FFI tokenize failed: " << paimon_tantivy_last_error(); + return ExtractTokenTexts(*buf.out()); +} + +/// Build a cppjieba::Jieba instance mirroring the one used at runtime. 
+std::unique_ptr MakeJieba() { + const std::string d = JIEBA_TEST_DICT_DIR; + return std::make_unique(d + "/jieba.dict.utf8", d + "/hmm_model.utf8", + d + "/user.dict.utf8", d + "/idf.utf8", + d + "/stop_words.utf8"); +} + +struct DiffReport { + size_t total = 0; + size_t differ = 0; + std::vector sample_diffs; // first N diffs +}; + +void RunDiff(const std::vector& lines, const std::string& mode, DiffReport* report) { + auto jieba = MakeJieba(); + std::string dict_dir = JIEBA_TEST_DICT_DIR; + + PaimonJiebaTokenizer* handle = nullptr; + PaimonTantivyStatus st = paimon_tantivy_tokenizer_new(mode.c_str(), /*with_position=*/true, + dict_dir.c_str(), &handle); + ASSERT_EQ(st, PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_OK) + << "tokenizer_new failed for mode=" << mode << ": " << paimon_tantivy_last_error(); + + for (const auto& line : lines) { + if (line.empty()) continue; + auto a = TokenizeWithCppjieba(*jieba, mode, line); + auto b = TokenizeWithTantivy(handle, line); + report->total++; + if (a != b) { + report->differ++; + if (report->sample_diffs.size() < 10) { + std::ostringstream os; + os << "LINE: " << line << "\n cppjieba: ["; + for (size_t i = 0; i < a.size(); ++i) { + if (i) os << ","; + os << a[i]; + } + os << "]\n jieba-rs: ["; + for (size_t i = 0; i < b.size(); ++i) { + if (i) os << ","; + os << b[i]; + } + os << "]"; + report->sample_diffs.push_back(os.str()); + } + } + } + + paimon_tantivy_tokenizer_free(handle); +} + +} // namespace + +TEST(TantivyTokenizer, HmmModeReturnsUnsupported) { + std::string dict_dir = JIEBA_TEST_DICT_DIR; + PaimonJiebaTokenizer* handle = nullptr; + PaimonTantivyStatus st = + paimon_tantivy_tokenizer_new("hmm", /*with_position=*/true, dict_dir.c_str(), &handle); + EXPECT_EQ(st, PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_UNSUPPORTED); + EXPECT_EQ(handle, nullptr); + std::string err = paimon_tantivy_last_error(); + EXPECT_NE(err.find("hmm"), std::string::npos); +} + +// ---------------- positive jieba-rs behavior assertions 
---------------- +// +// Per decision in docs/dev/tokenizer_diff_report.md: we do NOT require +// byte-level parity with cppjieba (the two coexist; neither reads the other's index). Instead assert +// jieba-rs produces expected token sequences for a curated set of inputs. + +struct JiebaRsCase { + std::string mode; + std::string input; + std::vector expected; +}; + +class JiebaRsBehavior : public ::testing::TestWithParam {}; + +TEST_P(JiebaRsBehavior, ProducesExpectedTokens) { + const auto& c = GetParam(); + std::string dict_dir = JIEBA_TEST_DICT_DIR; + PaimonJiebaTokenizer* handle = nullptr; + PaimonTantivyStatus st = paimon_tantivy_tokenizer_new(c.mode.c_str(), /*with_position=*/true, + dict_dir.c_str(), &handle); + ASSERT_EQ(st, PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_OK) << paimon_tantivy_last_error(); + auto got = TokenizeWithTantivy(handle, c.input); + EXPECT_EQ(got, c.expected) << "mode=" << c.mode << " input=" << c.input; + paimon_tantivy_tokenizer_free(handle); +} + +INSTANTIATE_TEST_SUITE_P( + BasicCases, JiebaRsBehavior, + ::testing::Values(JiebaRsCase{"mix", "Hello World", {"hello", "world"}}, + JiebaRsCase{"mix", "HELLO", {"hello"}}, + JiebaRsCase{"mix", "中国人民", {"中国", "人民"}}, + // 他 and 了 are listed in stop_words.utf8, so Normalize filters them out + JiebaRsCase{"mix", "他来到了网易杭研大厦", {"来到", "网易", "杭研", "大厦"}}, + JiebaRsCase{"full", "中国", {"中", "中国", "国"}}, + JiebaRsCase{"query", "中国人民", {"中国", "人民"}})); + +// ---------------- advisory: log diffs vs cppjieba ---------------- +// +// These tests never fail on token diffs (only on FFI errors); they print diffs to stderr for +// human review, feeding docs/dev/tokenizer_diff_report.md. They cover both +// strict and known-diffs corpora. + +class AdvisoryDiffTest : public ::testing::TestWithParam {}; + +TEST_P(AdvisoryDiffTest, LogsStrictGoldenDiffs) { + const auto mode = GetParam(); + DiffReport report; + RunDiff(LoadGoldenLines(), mode, &report); + const double rate = report.total > 0 ?
static_cast(report.differ) / report.total : 0.0; + std::cerr << "ADVISORY-STRICT mode=" << mode << " total=" << report.total + << " differ=" << report.differ << " rate=" << rate << "\n"; + for (const auto& d : report.sample_diffs) std::cerr << d << "\n"; + SUCCEED() << "Advisory only: review docs/dev/tokenizer_diff_report.md"; +} + +TEST_P(AdvisoryDiffTest, LogsKnownDiffs) { + const auto mode = GetParam(); + DiffReport report; + auto lines = LoadKnownDiffLines(); + if (lines.empty()) GTEST_SKIP(); + RunDiff(lines, mode, &report); + const double rate = report.total > 0 ? static_cast(report.differ) / report.total : 0.0; + std::cerr << "ADVISORY-KNOWN mode=" << mode << " total=" << report.total + << " differ=" << report.differ << " rate=" << rate << "\n"; + for (const auto& d : report.sample_diffs) std::cerr << d << "\n"; + SUCCEED(); +} + +INSTANTIATE_TEST_SUITE_P(AllModes, AdvisoryDiffTest, + ::testing::Values("mp", "mix", "full", "query"), + [](const testing::TestParamInfo& info) { + return info.param; + }); + +} // namespace paimon::tantivy diff --git a/src/paimon/global_index/tantivy/tantivy_writer_test.cpp b/src/paimon/global_index/tantivy/tantivy_writer_test.cpp new file mode 100644 index 000000000..8aeca0078 --- /dev/null +++ b/src/paimon/global_index/tantivy/tantivy_writer_test.cpp @@ -0,0 +1,272 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Stage 4 writer test: build a tantivy-fulltext global index from an Arrow batch, + * persist it through GlobalIndexFileManager, then verify the resulting file + * conforms to the packing format documented in tantivy_defs.h: + * + * [i32 BE file_count (no version header) | + * (i32 BE name_len | name | i64 BE file_len | file_bytes)*] + * + * Stage 6 (reader) will round-trip these bytes back to a queryable index; + * this stage only checks structural validity + meta correctness. + */ + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/api.h" +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/common/utils/string_utils.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_path_factory.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/global_index/tantivy/tantivy_defs.h" +#include "paimon/global_index/tantivy/tantivy_global_index_writer.h" +#include "paimon/testing/utils/testharness.h" + +#ifndef JIEBA_TEST_DICT_DIR +#error "JIEBA_TEST_DICT_DIR must be set at compile time" +#endif + +namespace paimon::tantivy::test { + +namespace { + +class FakeIndexPathFactory : public IndexPathFactory { + public: + explicit FakeIndexPathFactory(const std::string& root) : root_(root) {} + std::string NewPath() const override { + assert(false); + return ""; + } + std::string ToPath(const std::shared_ptr&) const override { + assert(false); + return ""; + } + std::string ToPath(const std::string& file_name) const override { + return PathUtil::JoinPath(root_, file_name); + } + bool IsExternalPath() const override { + return false; + } + + private: + std::string root_; +}; + +/// Read the entire file at `path` into a byte buffer.
+std::vector ReadFile(const std::string& path) { + std::ifstream in(path, std::ios::binary); + EXPECT_TRUE(in.good()) << "open " << path; + // EXPECT_* is non-fatal, so stop here on a failed open: otherwise tellg() + // reports failure and the size derived from it is meaningless. + if (!in.good()) return {}; + in.seekg(0, std::ios::end); + auto sz = static_cast(in.tellg()); + in.seekg(0, std::ios::beg); + std::vector buf(sz); + in.read(reinterpret_cast(buf.data()), sz); + // A short read sets failbit; surface it instead of returning partial data silently. + EXPECT_TRUE(in.good()) << "read " << path; + return buf; +} + +/// Read a big-endian integer from a raw pointer. +/// Reads exactly sizeof(T) bytes starting at `p`; the caller must ensure they exist. +template +T ReadBE(const uint8_t* p) { + T v = 0; + for (std::size_t i = 0; i < sizeof(T); ++i) { + v = static_cast((v << 8) | static_cast(p[i])); + } + return v; +} + +/// One file entry of the packed archive, as located by ParsePacked. +struct PackedEntry { + std::string name; + int64_t length = 0; + std::size_t offset = 0; // offset in the buffer where bytes start +}; + +/// Parse the packing header into a list of entries; verifies that the offsets +/// and lengths cover the full buffer with no leftover bytes. +/// Format (Java-compatible, big-endian, no version header): +/// [i32 BE file_count | (i32 BE name_len | name | i64 BE file_len | bytes)*] +/// Each bounds check records a non-fatal test failure and then returns the +/// entries parsed so far, so a truncated buffer is never read past its end. +std::vector ParsePacked(const std::vector& bytes) { + std::vector entries; + EXPECT_GE(bytes.size(), 4u); + if (bytes.size() < 4u) return entries; // no room for file_count + int32_t file_count = ReadBE(bytes.data()); + EXPECT_GT(file_count, 0); + std::size_t off = 4; + for (int32_t i = 0; i < file_count; ++i) { + // Guards use `size - off` (off <= size holds throughout) so they cannot overflow. + EXPECT_LE(off + 4, bytes.size()); + if (bytes.size() - off < 4u) return entries; + int32_t nlen = ReadBE(bytes.data() + off); + off += 4; + EXPECT_GT(nlen, 0); + EXPECT_LE(off + static_cast(nlen), bytes.size()); + if (nlen <= 0 || bytes.size() - off < static_cast<std::size_t>(nlen)) return entries; + std::string name(reinterpret_cast(bytes.data() + off), + static_cast(nlen)); + off += nlen; + EXPECT_LE(off + 8, bytes.size()); + if (bytes.size() - off < 8u) return entries; + int64_t flen = ReadBE(bytes.data() + off); + off += 8; + EXPECT_GE(flen, 0); + EXPECT_LE(off + static_cast(flen), bytes.size()); + if (flen < 0 || bytes.size() - off < static_cast<uint64_t>(flen)) return entries; + entries.push_back({name, flen, off}); + off += static_cast(flen); + } + EXPECT_EQ(off, bytes.size()) << "trailing bytes after pack"; + return entries; +} + +/// Fixture for the writer tests: SetUp() exports the jieba dictionary directory +/// through the environment, and WriteIndex() runs one Arrow batch through +/// TantivyGlobalIndexWriter (Create -> AddBatch -> Finish) for field "f0". +class TantivyGlobalIndexWriterTest : public ::testing::Test { + public: + void SetUp() override { + // Make jieba dict dir visible to the writer's GetJiebaDictionaryDir + // (it reads the env var 
directly). + setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); + } + + std::unique_ptr<::ArrowSchema> CreateArrowSchema( + const std::shared_ptr& data_type) const { + auto c_schema = std::make_unique<::ArrowSchema>(); + EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok()); + return c_schema; + } + + // Writes `array` as a single batch under `root` and returns what Finish() reports. + // Row ids are assigned as 0..length-1 in array order. + Result> WriteIndex( + const std::string& root, const std::shared_ptr& data_type, + const std::map& options, + const std::shared_ptr& array) { + auto path_factory = std::make_shared(root); + auto file_writer = std::make_shared(fs_, path_factory); + PAIMON_ASSIGN_OR_RAISE(auto writer, TantivyGlobalIndexWriter::Create( + "f0", data_type, file_writer, options, pool_)); + ::ArrowArray c_array; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array)); + std::vector relative_row_ids(array->length()); + for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i; + PAIMON_RETURN_NOT_OK(writer->AddBatch(&c_array, std::move(relative_row_ids))); + return writer->Finish(); + } + + protected: + std::shared_ptr pool_ = GetDefaultPool(); + std::shared_ptr fs_ = std::make_shared(); + std::shared_ptr data_type_ = + arrow::struct_({arrow::field("f0", arrow::utf8())}); +}; + +} // namespace + +TEST_F(TantivyGlobalIndexWriterTest, EnglishCorpusProducesValidPackedIndex) { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(root_dir); + std::string root = root_dir->Str(); + + std::map options = { + {kTantivyWriteOmitTermFreqAndPositions, "false"}, + }; + std::shared_ptr array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([ + ["This is an test document."], + ["This is an new document document document."], + ["Document document document document test."], + ["unordered user-defined doc id"] + ])") + .ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto metas, WriteIndex(root, data_type_, options, array)); + ASSERT_EQ(metas.size(), 1u); + const auto& meta = metas[0]; + + auto file_name = 
PathUtil::GetName(meta.file_path); + EXPECT_TRUE(StringUtils::StartsWith(file_name, "tantivy-fulltext-global-index-")) + << "file_name=" << file_name; + EXPECT_TRUE(StringUtils::EndsWith(file_name, ".index")); + ASSERT_TRUE(meta.metadata); + EXPECT_EQ(std::string(meta.metadata->data(), meta.metadata->size()), + R"({"write.omit-term-freq-and-position":"false"})"); + EXPECT_GT(meta.file_size, 8); + + auto bytes = ReadFile(meta.file_path); + ASSERT_EQ(static_cast(bytes.size()), meta.file_size); + auto entries = ParsePacked(bytes); + EXPECT_FALSE(entries.empty()); + bool has_meta_json = false; + for (const auto& e : entries) { + if (e.name == "meta.json") has_meta_json = true; + } + EXPECT_TRUE(has_meta_json) << "expected meta.json in packed entries"; +} + +TEST_F(TantivyGlobalIndexWriterTest, ChineseCorpusProducesValidPackedIndex) { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(root_dir); + std::string root = root_dir->Str(); + + std::map options = { + {kTantivyWriteOmitTermFreqAndPositions, "false"}, + {kTantivyWriteTokenizer, "paimon_jieba"}, + {kJiebaTokenizeMode, "query"}, + }; + std::shared_ptr array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([ + ["千问是一个智能助手"], + ["新一代AI助手发布"] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto metas, WriteIndex(root, data_type_, options, array)); + ASSERT_EQ(metas.size(), 1u); + const auto& meta = metas[0]; + auto bytes = ReadFile(meta.file_path); + ASSERT_EQ(static_cast(bytes.size()), meta.file_size); + auto entries = ParsePacked(bytes); + EXPECT_FALSE(entries.empty()); +} + +TEST_F(TantivyGlobalIndexWriterTest, NullStringRowsBecomeEmptyDocuments) { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(root_dir); + std::string root = root_dir->Str(); + + std::map options; + std::shared_ptr array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([ + ["nonempty"], + [null], + ["another"] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto metas, 
WriteIndex(root, data_type_, options, array)); + ASSERT_EQ(metas.size(), 1u); +} + +TEST_F(TantivyGlobalIndexWriterTest, RejectsHmmTokenizeMode) { + auto root_dir = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(root_dir); + auto path_factory = std::make_shared(root_dir->Str()); + auto file_writer = std::make_shared(fs_, path_factory); + // hmm rejection only fires when the jieba tokenizer is actually constructed, + // so this test must explicitly opt into jieba (default tokenizer skips + // jieba construction entirely). + std::map options = { + {kTantivyWriteTokenizer, "paimon_jieba"}, + {kJiebaTokenizeMode, "hmm"}, + }; + auto res = TantivyGlobalIndexWriter::Create("f0", data_type_, file_writer, options, pool_); + ASSERT_FALSE(res.ok()); + EXPECT_TRUE(res.status().IsNotImplemented()) << res.status().ToString(); +} + +} // namespace paimon::tantivy::test diff --git a/test/test_data/cpp_tantivy_fixtures/english_default.archive b/test/test_data/cpp_tantivy_fixtures/english_default.archive new file mode 100644 index 000000000..d195af7ec Binary files /dev/null and b/test/test_data/cpp_tantivy_fixtures/english_default.archive differ diff --git a/test/test_data/java_tantivy_fixtures/README.md b/test/test_data/java_tantivy_fixtures/README.md new file mode 100644 index 000000000..f13a5e162 --- /dev/null +++ b/test/test_data/java_tantivy_fixtures/README.md @@ -0,0 +1,49 @@ +# Java → C++ tantivy 跨端读 fixture + +> 生成于 **2026-04-23**,用于 J6 `paimon-tantivy-java-compat-test`。 + +## 内容 + +| 文件 | 作用 | +|---|---| +| `english_simple.archive` | 由 paimon-java 的 `TantivyIndexWriter + packIndex` 路径生成的 BE archive;10 条纯英文文档,row_ids 0..9 | +| `english_simple.golden.json` | 人类可读 golden,每个 query type 的 expected row_ids | + +## 版本锁定 + +| 组件 | 版本 | +|---|---| +| tantivy crate | **0.22.1** | +| paimon-tantivy-jni | git sha 生成时最新(commit 在 paimon 仓) | +| schema | B1:`row_id` u64 stored+indexed+fast + `text` TEXT | +| archive 字节格式 | Java-compat 大端 + 无 version | + +任何组件升级(特别是 **tantivy 
版本**)都可能导致段文件二进制不兼容 — 需**重新 regen**: + +```bash +# 1. 构建 Java native lib(若 Rust 变了) +cd /path/to/paimon/paimon-tantivy/paimon-tantivy-jni/rust && cargo build --release +cp target/release/libtantivy_jni.dylib \ + ../src/main/resources/native/darwin-aarch64/ + +# 2. mvn install + 跑 fixture gen +cd /path/to/paimon +mvn install -pl paimon-tantivy/paimon-tantivy-index -am -DskipTests -Denforcer.skip=true +mvn -pl paimon-tantivy/paimon-tantivy-index test \ + -Dtest=TantivyIndexFixtureGen -DfailIfNoTests=false \ + -Denforcer.skip=true \ + -DfixtureOutDir=/path/to/paimon-cpp/test/test_data/java_tantivy_fixtures +``` + +## 检验 + +``` +xxd english_simple.archive | head -1 +# 00000000: 00 00 00 16 ... ← BE int32 file_count = 22(Java 不 force-merge,多段) +``` + +## 相关文档 + +- `docs/dev/tantivy_java_cross_read_plan.md` — J6 整体 plan +- `docs/dev/test_execute.md` — J6 本次执行日志 +- `docs/dev/tantivy_java_compat_plan.md` — paimon-cpp 与 paimon-java 对齐总方案 diff --git a/test/test_data/java_tantivy_fixtures/english_simple.archive b/test/test_data/java_tantivy_fixtures/english_simple.archive new file mode 100644 index 000000000..c08499578 Binary files /dev/null and b/test/test_data/java_tantivy_fixtures/english_simple.archive differ diff --git a/test/test_data/java_tantivy_fixtures/english_simple.golden.json b/test/test_data/java_tantivy_fixtures/english_simple.golden.json new file mode 100644 index 000000000..9776b720b --- /dev/null +++ b/test/test_data/java_tantivy_fixtures/english_simple.golden.json @@ -0,0 +1,25 @@ +{ + "description": "10 English docs; row_ids 0..9; generated by TantivyIndexFixtureGen via TantivyFullTextGlobalIndexWriter production path; consumed by paimon-cpp V3 reader cross-read test (J6).", + "docs": [ + {"row_id": 0, "text": "apple banana cherry"}, + {"row_id": 1, "text": "apple durian"}, + {"row_id": 2, "text": "banana cherry"}, + {"row_id": 3, "text": "fig grape"}, + {"row_id": 4, "text": "apple cherry fig"}, + {"row_id": 5, "text": "banana elderberry"}, + {"row_id": 
6, "text": "cherry durian"}, + {"row_id": 7, "text": "apple"}, + {"row_id": 8, "text": "grape fig elderberry"}, + {"row_id": 9, "text": "cherry fig"} + ], + "queries": [ + {"type": "match_all", "query": "apple", "expected_row_ids": [0, 1, 4, 7]}, + {"type": "match_all", "query": "apple banana", "expected_row_ids": [0]}, + {"type": "match_any", "query": "durian elderberry", "expected_row_ids": [1, 5, 6, 8]}, + {"type": "phrase", "query": "apple banana", "expected_row_ids": [0]}, + {"type": "phrase", "query": "banana cherry", "expected_row_ids": [0, 2]}, + {"type": "prefix", "query": "ap", "expected_row_ids": [0, 1, 4, 7]}, + {"type": "wildcard", "query": "*err*", "expected_row_ids": [0, 2, 4, 5, 6, 8, 9]}, + {"type": "match_any", "query": "apple banana cherry durian fig grape elderberry", "expected_row_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]} + ] +} diff --git a/test/test_data/java_tantivy_fixtures/production_sample.archive b/test/test_data/java_tantivy_fixtures/production_sample.archive new file mode 100644 index 000000000..0f8297189 Binary files /dev/null and b/test/test_data/java_tantivy_fixtures/production_sample.archive differ diff --git a/test/test_data/tokenizer_golden/README.md b/test/test_data/tokenizer_golden/README.md new file mode 100644 index 000000000..d51861e8f --- /dev/null +++ b/test/test_data/tokenizer_golden/README.md @@ -0,0 +1,21 @@ +# Tokenizer 黄金样本 + +供 `paimon-tantivy-tokenizer-test` 比对 cppjieba vs jieba-rs 的分词输出。 + +## 文件 + +- `golden_synthetic.txt` — 手写边界 case(混合中英文、数字、标点、emoji、空白、超长词…) +- `golden_corpus.txt` — 公开语料短句摘录(通用知识、无版权敏感) + +## 使用 + +测试代码(见 `src/paimon/global_index/tantivy/tantivy_tokenizer_test.cpp`): +1. 逐行读取 +2. 每行用 cppjieba `JiebaTokenizer::CutWithMode` + `Normalize` 得到 token 序列 A +3. 每行用 jieba-rs FFI `paimon_tantivy_tokenizer_tokenize` 得到 token 序列 B +4. 比对 A 和 B:如果完全相同则本行 pass;否则记入 diff 报告 +5. 
通过条件:diff 率 ≤ 1%(见 plan Stage 3 验收标准) + +## 扩充 + +后续补充业务 query log 时,新增文件 `golden_business.txt` 放在同目录,测试代码自动扫描 `golden_*.txt`。 diff --git a/test/test_data/tokenizer_golden/golden_corpus.txt b/test/test_data/tokenizer_golden/golden_corpus.txt new file mode 100644 index 000000000..38c7c887e --- /dev/null +++ b/test/test_data/tokenizer_golden/golden_corpus.txt @@ -0,0 +1,20 @@ +人工智能是计算机科学的一个分支 +机器学习是人工智能的核心领域 +深度学习使用神经网络进行模式识别 +大语言模型基于 Transformer 架构 +开源软件促进了全球技术合作 +Rust 语言以内存安全著称 +Python 广泛应用于数据科学 +分布式系统需要处理网络分区问题 +数据库事务保证原子性一致性隔离性持久性 +编程的艺术在于解决复杂问题 +搜索引擎依赖倒排索引加速查询 +自然语言处理技术日新月异 +云计算降低了基础设施成本 +开发者社区推动了技术进步 +版本控制系统是协作的基石 +操作系统管理计算机的硬件资源 +编译器将源代码翻译成机器指令 +算法的时间复杂度决定了执行效率 +数据结构的选择影响程序性能 +网络协议定义了通信的规则 diff --git a/test/test_data/tokenizer_golden/golden_synthetic.txt b/test/test_data/tokenizer_golden/golden_synthetic.txt new file mode 100644 index 000000000..65b144741 --- /dev/null +++ b/test/test_data/tokenizer_golden/golden_synthetic.txt @@ -0,0 +1,38 @@ +Hello World +hello world +HELLO WORLD +Hello 世界 +你好世界 +中国人民共和国 +我爱北京天安门 +北京是中华人民共和国的首都 +南京市长江大桥 +他来到了网易杭研大厦 +小明硕士毕业于中国科学院计算所,后在日本京都大学深造 +工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作 +结婚的和尚未结婚的 +程序员用Python和Rust写代码 +this is a test 这是一个测试 +Rust tantivy 全文索引 +C++ 到 Rust 的 FFI 桥接 +cpp cppjieba jieba-rs +分词器 tokenizer +全文 search +倒排索引 inverted index +paimon-cpp tantivy-fts +100个中文字符被分词器处理 +超长词最长词最长词最长词最长词最长词最长词 +... +!@#$%^&*() +"hello" +'quoted' +content +{json: "value"} +[1,2,3] +line1 +line2 +CJK 标点、。!? 
+全角:ABC123 +ABC123 混合数字字母 +abc123 +ABC123 diff --git a/test/test_data/tokenizer_golden/known_diffs.txt b/test/test_data/tokenizer_golden/known_diffs.txt new file mode 100644 index 000000000..23073bd37 --- /dev/null +++ b/test/test_data/tokenizer_golden/known_diffs.txt @@ -0,0 +1,18 @@ +abc_123 +foo.bar.baz +https://example.com/path?q=1 +email@example.com +192.168.1.1 +2026-04-20 +12:34:56 +$100 ¥200 €300 +100% +3.14 +-1 -2 -3 +a b c d e + + tab tab +mixed space tab +空 白 和 tab + leading and trailing +中英混合 Mixed CN EN diff --git a/third_party/tantivy_ffi/Cargo.lock b/third_party/tantivy_ffi/Cargo.lock new file mode 100644 index 000000000..be9056ad8 --- /dev/null +++ b/third_party/tantivy_ffi/Cargo.lock @@ -0,0 +1,1859 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "allocator-api2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c880a97d28a3681c0267bd29cff89621202715b065127cd445fa0f0fe0aa2880" + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + 
"is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "arc-swap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" +dependencies = [ + "rustversion", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "base64" +version = "0.22.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "bitpacking" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019" +dependencies = [ + "crunchy", +] + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cbindgen" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "befbfd072a8e81c02f8c507aefce431fe5e7d051f83d48a23ffc9b9fe5a11799" +dependencies = [ + "clap", + "heck", + "indexmap", + "log", + "proc-macro2", + "quote", + "serde", + "serde_json", + "syn", + "tempfile", + "toml", +] + +[[package]] +name = "cc" +version = "1.2.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + +[[package]] +name = "census" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "croaring" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0e813b58ac55ac5ccea5ec63beb8c80f37dedd78da3f594c848313415a08c8c" +dependencies = [ + "allocator-api2 0.4.0", + "croaring-sys", +] + +[[package]] +name = "croaring-sys" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f34e9ee8e65c0d46c9d0fe55ce80b477d0bfae4c786c6694687b9c70e8267027" +dependencies = [ + "cc", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "downcast-rs" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastdivide" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "fs4" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" +dependencies = [ + "rustix 0.38.44", + "windows-sys 0.52.0", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2 0.2.21", + "equivalent", + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2 0.2.21", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "htmlescape" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "include-flate" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23e233413926ef735f7d87024466cfda5a4b87467730846bd82ea7d504121347" +dependencies = [ + "include-flate-codegen", + "include-flate-compress", +] + +[[package]] +name = "include-flate-codegen" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e7148f24ef8922cc0e5574ebb908729ccdd3a110c440a45165733fedadd9969" +dependencies = [ + "include-flate-compress", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "include-flate-compress" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74783a9ed407e844e99d5e7a57bd650acbfa124cf6e97ffd790ba59d8ab8e7ff" +dependencies = [ + "libflate", + "zstd", +] + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.0", + "serde", + "serde_core", +] + +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jieba-macros" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" +dependencies = [ + "phf_codegen", +] + +[[package]] +name = "jieba-rs" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5dd552bbb95d578520ee68403bf8aaf0dbbb2ce55b0854d019f9350ad61040a" +dependencies = [ + "cedarwood", + "fxhash", + "include-flate", + "jieba-macros", + "lazy_static", + "phf", + "regex", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "levenshtein_automata" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" + +[[package]] +name = "libc" +version = "0.2.185" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f" + +[[package]] +name = "libflate" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd96e993e5f3368b0cb8497dae6c860c22af8ff18388c61c6c0b86c58d86b5df" +dependencies = [ + "adler32", + "crc32fast", + "dary_heap", + "libflate_lz77", + "no_std_io2", +] + +[[package]] +name = "libflate_lz77" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff7a10e427698aef6eef269482776debfef63384d30f13aad39a1a95e0e098fd" +dependencies = [ + "hashbrown 0.16.1", + "no_std_io2", + "rle-decode-fast", +] + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.5", +] + +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" + +[[package]] +name = "measure_time" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc" +dependencies = [ + "instant", + "log", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "murmurhash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" + +[[package]] +name = "no_std_io2" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b51ed7824b6e07d354605f4abb3d9d300350701299da96642ee084f5ce631550" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "num-conv" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "oneshot" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" + +[[package]] +name = "ownedbytes" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "paimon_tantivy_ffi" +version = "0.1.0" +dependencies = [ + "cbindgen", + "croaring", + "jieba-rs", + "log", + "tantivy", + "tempfile", +] + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + 
"proc-macro-error-attr2", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand", +] + +[[package]] +name = "rayon" +version = "1.12.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.12.1", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_spanned" +version = "1.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26" +dependencies = [ + "serde_core", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + +[[package]] +name = "sketches-ddsketch" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" +dependencies = [ + "serde", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tantivy" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96599ea6fccd844fc833fed21d2eecac2e6a7c1afd9e044057391d78b1feb141" +dependencies = [ + "aho-corasick", + "arc-swap", + "base64", + "bitpacking", + "byteorder", + "census", + "crc32fast", + 
"crossbeam-channel", + "downcast-rs", + "fastdivide", + "fnv", + "fs4", + "htmlescape", + "itertools", + "levenshtein_automata", + "log", + "lru", + "lz4_flex", + "measure_time", + "memmap2", + "num_cpus", + "once_cell", + "oneshot", + "rayon", + "regex", + "rust-stemmers", + "rustc-hash", + "serde", + "serde_json", + "sketches-ddsketch", + "smallvec", + "tantivy-bitpacker", + "tantivy-columnar", + "tantivy-common", + "tantivy-fst", + "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", + "tempfile", + "thiserror", + "time", + "uuid", + "winapi", +] + +[[package]] +name = "tantivy-bitpacker" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e" +dependencies = [ + "downcast-rs", + "fastdivide", + "itertools", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] + +[[package]] +name = "tantivy-common" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4" +dependencies = [ + "async-trait", + "byteorder", + "ownedbytes", + "serde", + "time", +] + +[[package]] +name = "tantivy-fst" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" +dependencies = [ + "byteorder", + "regex-syntax", + "utf8-ranges", +] + +[[package]] +name = "tantivy-query-grammar" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82" +dependencies = [ + "nom", +] 
+ +[[package]] +name = "tantivy-sstable" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e" +dependencies = [ + "tantivy-bitpacker", + "tantivy-common", + "tantivy-fst", + "zstd", +] + +[[package]] +name = "tantivy-stacker" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8" +dependencies = [ + "murmurhash32", + "rand_distr", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04" +dependencies = [ + "serde", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix 1.1.4", + "windows-sys 0.61.2", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = 
"0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "toml" +version = "0.9.12+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf92845e79fc2e2def6a5d828f0801e29a2f8acc037becc5ab08595c7d5e9863" +dependencies = [ + "indexmap", + "serde_core", + "serde_spanned", + "toml_datetime", + "toml_parser", + "toml_writer", + "winnow 0.7.15", +] + +[[package]] +name = "toml_datetime" +version = "0.7.5+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_parser" +version = "1.1.2+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" +dependencies = [ + "winnow 1.0.1", +] + +[[package]] +name = "toml_writer" +version = "1.1.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "utf8-ranges" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + "serde_core", + "wasm-bindgen", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.118" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "web-sys" +version = "0.3.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" + +[[package]] +name = "winnow" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] 
+name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/third_party/tantivy_ffi/Cargo.toml b/third_party/tantivy_ffi/Cargo.toml new file mode 100644 index 000000000..4b5d76a5d --- /dev/null +++ b/third_party/tantivy_ffi/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "paimon_tantivy_ffi" +version = "0.1.0" +edition = "2021" +description = "C FFI layer wrapping tantivy + jieba-rs for paimon-cpp tantivy-fts global index" +license = "Apache-2.0" +publish = false + +[lib] +name = "paimon_tantivy_ffi" +# staticlib: 给 CMake + Corrosion 链接成 libpaimon_tantivy_ffi.a +# rlib: 给 cargo test 生成 test 可执行文件时能用到 Rust 原生 linkage +crate-type = ["staticlib", "rlib"] + +[dependencies] +tantivy = 
"0.22"
+jieba-rs = "0.7"
+croaring = "2.0"
+log = "0.4"
+tempfile = "3"
+
+[build-dependencies]
+cbindgen = "0.29"
+
+[profile.release]
+opt-level = 3
+lto = "thin"
+codegen-units = 1
+panic = "abort"
+
+[profile.dev]
+# FFI errors are reported through status codes; a Rust-side panic should abort rather than unwind across the FFI boundary
+panic = "abort"
diff --git a/third_party/tantivy_ffi/build.rs b/third_party/tantivy_ffi/build.rs
new file mode 100644
index 000000000..cc8da5574
--- /dev/null
+++ b/third_party/tantivy_ffi/build.rs
@@ -0,0 +1,39 @@
+//! build.rs: runs cbindgen to generate the C header paimon_tantivy_ffi.h
+//!
+//! Output path: $OUT_DIR/paimon_tantivy_ffi.h
+//! Corrosion (on the CMake side) reads OUT_DIR from cargo metadata and adds the header to the C++ include path.
+
+use std::env;
+use std::path::PathBuf;
+
+fn main() {
+    let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
+    let header_path = out_dir.join("paimon_tantivy_ffi.h");
+
+    let cfg = cbindgen::Config::from_file(PathBuf::from(&crate_dir).join("cbindgen.toml"))
+        .expect("cbindgen.toml must exist at crate root");
+
+    match cbindgen::Builder::new()
+        .with_crate(&crate_dir)
+        .with_config(cfg)
+        .generate()
+    {
+        Ok(bindings) => {
+            bindings.write_to_file(&header_path);
+            println!(
+                "cargo:rerun-if-changed={}",
+                PathBuf::from(&crate_dir).join("src").display()
+            );
+            println!("cargo:rerun-if-changed=cbindgen.toml");
+            // Expose the header directory to Corrosion / the upstream CMake build
+            println!("cargo:include={}", out_dir.display());
+            eprintln!("cbindgen: wrote {}", header_path.display());
+        }
+        Err(e) => {
+            // A cbindgen failure is not necessarily fatal (e.g. CI skips it when no Rust code changed): warn and continue.
+            // Use `cargo:warning=` because plain build-script stderr is hidden by cargo on successful builds.
+            println!("cargo:warning=cbindgen generation failed: {e:?}");
+        }
+    }
+}
diff --git a/third_party/tantivy_ffi/cbindgen.toml b/third_party/tantivy_ffi/cbindgen.toml
new file mode 100644
index 000000000..a8b5237fa
--- /dev/null
+++ b/third_party/tantivy_ffi/cbindgen.toml
@@ -0,0 +1,48 @@
+# cbindgen configuration: generates the C header from the Rust FFI
+# Invoked by build.rs; output goes to $OUT_DIR/paimon_tantivy_ffi.h
+# CMake obtains $OUT_DIR through Corrosion and adds it to the C++ target's include path
+
+language = "C"
+
+# Banner placed at the top of the generated header
+header = """
+/* Copyright 2026-present Alibaba Inc. */
+/*
+ * AUTO-GENERATED by cbindgen from Rust sources under third_party/tantivy_ffi - DO NOT EDIT.
+ *
+ * C ABI for paimon_tantivy_ffi. See docs/dev/tantivy_ffi_design.md for contract.
+ */
+#pragma once
+"""
+
+include_guard = "PAIMON_TANTIVY_FFI_H"
+cpp_compat = true
+pragma_once = false # already written by hand in `header` above
+documentation = true
+documentation_style = "c"
+line_length = 100
+tab_width = 4
+
+[export]
+# No prefix is added to type names (the Rust type names already carry the PaimonTantivy... prefix).
+# Function names naturally start with paimon_tantivy_ (they are named that way in the Rust sources).
+prefix = ""
+# Force-export types that are used only as handles/return values (when no FFI function takes or returns them directly,
+# cbindgen does not export them by default; list them explicitly here).
+include = ["PaimonTantivyStatus"]
+
+[export.rename]
+# Rust enum name -> typedef name in C (e.g. to avoid a duplicated prefix)
+
+[fn]
+prefix = ""
+args = "auto"
+rename_args = "None"
+
+[enum]
+rename_variants = "ScreamingSnakeCase"
+prefix_with_name = true
+derive_helper_methods = false
+
+[parse]
+parse_deps = false
diff --git a/third_party/tantivy_ffi/rust-toolchain.toml b/third_party/tantivy_ffi/rust-toolchain.toml
new file mode 100644
index 000000000..8a8c36646
--- /dev/null
+++ b/third_party/tantivy_ffi/rust-toolchain.toml
@@ -0,0 +1,11 @@
+# Pin the Rust toolchain used to build paimon_tantivy_ffi. Without this,
+# Corrosion's FindRust.cmake invokes `rustup which rustc --toolchain ''`
+# which fails on fresh CMake configure (no rust-toolchain → empty toolchain
+# name → rustup rejects it). See docs/dev/execute.md Stage 11 for context.
+# +# Only the `channel` is pinned — no extra components, because rustup in +# CI/containers may lack network access to fetch clippy/rustfmt, and build +# doesn't need them. +[toolchain] +channel = "stable" +profile = "minimal" diff --git a/third_party/tantivy_ffi/src/buffer.rs b/third_party/tantivy_ffi/src/buffer.rs new file mode 100644 index 000000000..36ad0b905 --- /dev/null +++ b/third_party/tantivy_ffi/src/buffer.rs @@ -0,0 +1,111 @@ +//! `paimon_tantivy_buffer_t`: Rust-allocated byte buffer returned to C++. +//! +//! Contract (see docs/dev/tantivy_ffi_design.md §3 Category B): +//! - Buffer is allocated by Rust (as a `Box<[u8]>`) +//! - C++ reads `data[0..len]`, **must not** write past len +//! - C++ must call `paimon_tantivy_buffer_free()` exactly once per non-empty buffer +//! - Empty (len=0) buffer has null `data`; buffer_free accepts it as no-op +//! +//! This struct is #[repr(C)] so cbindgen generates a matching C struct. + +use std::ptr; + +#[repr(C)] +pub struct PaimonTantivyBuffer { + /// Pointer to `len` bytes. Null iff len == 0. + pub data: *mut u8, + /// Number of valid bytes. + pub len: usize, + /// Internal capacity hint for Rust-side reconstruction. C++ treats as opaque. + pub capacity: usize, +} + +impl PaimonTantivyBuffer { + /// Build a buffer from owned bytes; consumes the Vec. + pub(crate) fn from_vec(mut v: Vec) -> Self { + if v.is_empty() { + return Self::empty(); + } + v.shrink_to_fit(); + let len = v.len(); + let capacity = v.capacity(); + let data = v.as_mut_ptr(); + std::mem::forget(v); + Self { data, len, capacity } + } + + pub(crate) fn empty() -> Self { + Self { + data: ptr::null_mut(), + len: 0, + capacity: 0, + } + } +} + +/// Free a buffer returned by any Rust FFI function. Safe to call on an empty +/// buffer (len=0 / data=null). Must only be called once per buffer. +/// +/// SAFETY: `buf` must be either null, or point to a live `paimon_tantivy_buffer_t` +/// produced by this crate and not yet freed. 
+#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_buffer_free(buf: *mut PaimonTantivyBuffer) { + if buf.is_null() { + return; + } + let b = unsafe { &mut *buf }; + if b.len != 0 && !b.data.is_null() { + // Reconstruct the Vec and drop it + let v = unsafe { Vec::from_raw_parts(b.data, b.len, b.capacity) }; + drop(v); + } + b.data = ptr::null_mut(); + b.len = 0; + b.capacity = 0; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_has_null_data() { + let b = PaimonTantivyBuffer::empty(); + assert!(b.data.is_null()); + assert_eq!(b.len, 0); + } + + #[test] + fn from_vec_roundtrip() { + let src = vec![1u8, 2, 3, 4, 5]; + let src_clone = src.clone(); + let mut b = PaimonTantivyBuffer::from_vec(src); + assert_eq!(b.len, 5); + assert!(!b.data.is_null()); + let view: &[u8] = unsafe { std::slice::from_raw_parts(b.data, b.len) }; + assert_eq!(view, src_clone.as_slice()); + unsafe { paimon_tantivy_buffer_free(&mut b) }; + assert!(b.data.is_null()); + assert_eq!(b.len, 0); + } + + #[test] + fn free_null_is_noop() { + unsafe { paimon_tantivy_buffer_free(std::ptr::null_mut()) }; + } + + #[test] + fn free_empty_is_noop() { + let mut b = PaimonTantivyBuffer::empty(); + unsafe { paimon_tantivy_buffer_free(&mut b) }; + } + + #[test] + fn stress_alloc_free() { + // LSAN would catch any leak + for i in 0..5_000usize { + let mut b = PaimonTantivyBuffer::from_vec(vec![42u8; i.min(256)]); + unsafe { paimon_tantivy_buffer_free(&mut b) }; + } + } +} diff --git a/third_party/tantivy_ffi/src/callback_directory.rs b/third_party/tantivy_ffi/src/callback_directory.rs new file mode 100644 index 000000000..6ef64a170 --- /dev/null +++ b/third_party/tantivy_ffi/src/callback_directory.rs @@ -0,0 +1,498 @@ +//! PaimonCallbackDirectory: streaming tantivy `Directory` backed by C FFI +//! callbacks. Replaces the V1 `PaimonDirectory` (RamDirectory wrapper) with a +//! callback-driven design that mirrors Java paimon-tantivy-jni's `JniDirectory`. +//! +//! ## Why callback-based? 
+//! +//! V1 loaded the entire archive (100MB+) into `RamDirectory` at reader +//! construction, giving ~2x archive peak RAM and paying the whole download +//! cost up front even for small queries. V3 keeps just the `HashMap` layout and issues pread calls through the FFI callback whenever +//! tantivy asks for bytes — peak RAM is ~KB, startup is ~header size. +//! +//! ## Concurrency +//! +//! V3 serializes `read_at` via `stream_mutex` (same as Java JniDir's +//! `stream_lock`). pread-style callbacks in principle allow concurrent reads, +//! but some `paimon::InputStream` subclasses (notably `JindoInputStream`) +//! have shared-state races, so V3 plays it safe. V3.5 removes the mutex — +//! see `docs/dev/tantivy_directory_upgrade_plan.md` §5. + +use std::collections::HashMap; +use std::ffi::c_void; +use std::fmt; +use std::io; +use std::ops::Range; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; + +use tantivy::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError}; +use tantivy::directory::{ + AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite, + WatchCallback, WatchHandle, WritePtr, +}; +use tantivy::HasLen; + +// ========================================================================= +// FFI types +// ========================================================================= + +/// pread-style callback table passed from C++ at reader construction. +/// +/// `ctx` is an opaque pointer to C++'s `StreamCtx` (holding a +/// `paimon::InputStream`). Rust never dereferences it — only forwards it +/// into the callback functions. `release` is called exactly once when the +/// last `Arc` is dropped. 
+#[repr(C)] +pub struct PaimonStreamCallbacks { + pub ctx: *mut c_void, + pub read_at: + extern "C" fn(ctx: *mut c_void, offset: u64, len: usize, out_buf: *mut u8) -> i32, + pub release: extern "C" fn(ctx: *mut c_void), +} + +// ========================================================================= +// Internal state +// ========================================================================= + +#[derive(Clone, Debug)] +struct FileMeta { + offset: u64, + length: u64, +} + +/// RAII wrapper owning the FFI callbacks. On drop, invokes `release(ctx)`. +/// Shared across clones of `PaimonCallbackDirectory` via `Arc`. +struct CallbackCtx { + callbacks: PaimonStreamCallbacks, +} + +impl Drop for CallbackCtx { + fn drop(&mut self) { + // Calling an extern "C" fn pointer from safe Rust is legal; the + // contract safety relies on the C++ side providing a valid ctx. + (self.callbacks.release)(self.callbacks.ctx); + } +} + +// Safety: callbacks.ctx is treated as opaque; C++ owner is responsible for +// the ctx being usable across threads. Rust's stream_mutex serializes +// read_at calls, and release is only invoked once (when Arc refcount hits 0). +unsafe impl Send for CallbackCtx {} +unsafe impl Sync for CallbackCtx {} + +// ========================================================================= +// PaimonCallbackDirectory +// ========================================================================= + +#[derive(Clone)] +pub struct PaimonCallbackDirectory { + /// name → (offset, length) in the stream. Immutable after construction. + layout: Arc>, + /// FFI callbacks + their ctx lifetime. + ctx: Arc, + /// tantivy writes small atomic files (`.lock`, in some paths `meta.json`) + /// via `atomic_write`; we keep them in memory instead of pushing back + /// through C++ (read-only archive). Shared across clones. 
+ atomic_data: Arc>>>, + /// V3 保守路线:串行 seek+read(对齐 Java JniDir `stream_lock`)。 + /// V3.5 升级去掉此锁,见 `tantivy_directory_upgrade_plan.md` §5。 + stream_mutex: Arc>, +} + +impl fmt::Debug for PaimonCallbackDirectory { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PaimonCallbackDirectory") + .field("files", &self.layout.keys().collect::>()) + .finish() + } +} + +impl PaimonCallbackDirectory { + /// Construct a new directory from the C++-parsed archive layout + callbacks. + /// The ctx ownership transfers to this Directory; `release` is invoked on + /// drop of the last clone. + pub fn new( + entries: Vec<(String, u64, u64)>, + callbacks: PaimonStreamCallbacks, + ) -> Self { + let mut layout = HashMap::with_capacity(entries.len()); + for (name, offset, length) in entries { + layout.insert(PathBuf::from(name), FileMeta { offset, length }); + } + Self { + layout: Arc::new(layout), + ctx: Arc::new(CallbackCtx { callbacks }), + atomic_data: Arc::new(Mutex::new(HashMap::new())), + stream_mutex: Arc::new(Mutex::new(())), + } + } + + /// Perform an FFI pread. Serialized via `stream_mutex` (V3 invariant). + fn pread(&self, offset: u64, len: usize) -> io::Result> { + let _guard = self.stream_mutex.lock().map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("stream_mutex poisoned: {e}")) + })?; + let mut buf = vec![0u8; len]; + // Calling extern "C" fn pointer — safe from Rust's POV (ABI is C); + // the contract safety (ctx validity, buffer ownership) is on the C++ side. + let rc = + (self.ctx.callbacks.read_at)(self.ctx.callbacks.ctx, offset, len, buf.as_mut_ptr()); + if rc != 0 { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("pread callback rc={rc} offset={offset} len={len}"), + )); + } + Ok(buf) + } + + /// Sorted file names, for diagnostic / test use. 
+ #[cfg(test)] + pub(crate) fn file_names(&self) -> Vec { + let mut names: Vec = self + .layout + .keys() + .map(|p| p.to_string_lossy().into_owned()) + .collect(); + names.sort(); + names + } +} + +// ========================================================================= +// FileHandle +// ========================================================================= + +#[derive(Clone)] +struct PaimonCallbackFileHandle { + directory: PaimonCallbackDirectory, + file_offset: u64, + file_length: u64, +} + +impl fmt::Debug for PaimonCallbackFileHandle { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PaimonCallbackFileHandle") + .field("offset", &self.file_offset) + .field("length", &self.file_length) + .finish() + } +} + +impl HasLen for PaimonCallbackFileHandle { + fn len(&self) -> usize { + self.file_length as usize + } +} + +impl FileHandle for PaimonCallbackFileHandle { + fn read_bytes(&self, range: Range) -> io::Result { + let start = self.file_offset + range.start as u64; + let len = range.end - range.start; + let data = self.directory.pread(start, len)?; + Ok(OwnedBytes::new(data)) + } +} + +// ========================================================================= +// Directory trait (13 methods for tantivy 0.22) +// ========================================================================= + +impl Directory for PaimonCallbackDirectory { + fn get_file_handle(&self, path: &Path) -> Result, OpenReadError> { + let meta = self + .layout + .get(path) + .ok_or_else(|| OpenReadError::FileDoesNotExist(path.to_path_buf()))?; + Ok(Arc::new(PaimonCallbackFileHandle { + directory: self.clone(), + file_offset: meta.offset, + file_length: meta.length, + })) + } + + fn exists(&self, path: &Path) -> Result { + let in_layout = self.layout.contains_key(path); + let in_atomic = self.atomic_data.lock().unwrap().contains_key(path); + Ok(in_layout || in_atomic) + } + + fn atomic_read(&self, path: &Path) -> Result, OpenReadError> { + if let Some(data) 
= self.atomic_data.lock().unwrap().get(path) { + return Ok(data.clone()); + } + let meta = self + .layout + .get(path) + .ok_or_else(|| OpenReadError::FileDoesNotExist(path.to_path_buf()))?; + self.pread(meta.offset, meta.length as usize) + .map_err(|e| OpenReadError::wrap_io_error(e, path.to_path_buf())) + } + + fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> { + self.atomic_data + .lock() + .unwrap() + .insert(path.to_path_buf(), data.to_vec()); + Ok(()) + } + + fn delete(&self, _path: &Path) -> Result<(), DeleteError> { + // read-only archive: ignore + Ok(()) + } + + fn open_write(&self, _path: &Path) -> Result { + // tantivy needs this for lock files when opening an index; provide a + // dummy in-memory writer (same trick as Java JniDirectory). + let buf: Vec = Vec::new(); + Ok(io::BufWriter::new(Box::new(VecTerminatingWrite(buf)))) + } + + fn sync_directory(&self) -> io::Result<()> { + Ok(()) + } + + fn acquire_lock(&self, _lock: &Lock) -> Result { + // Read-only: no actual locking. + Ok(DirectoryLock::from(Box::new(()))) + } + + fn watch(&self, _watch_callback: WatchCallback) -> tantivy::Result { + Ok(WatchHandle::empty()) + } +} + +/// Throwaway writer for `open_write` — tantivy creates it for lock files but +/// the bytes never matter in a read-only archive. 
+struct VecTerminatingWrite(Vec); + +impl io::Write for VecTerminatingWrite { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.0.extend_from_slice(buf); + Ok(buf.len()) + } + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +impl TerminatingWrite for VecTerminatingWrite { + fn terminate_ref(&mut self, _token: AntiCallToken) -> io::Result<()> { + Ok(()) + } +} + +// ========================================================================= +// Test support (pub(crate) — used by reader.rs tests too) +// ========================================================================= + +#[cfg(test)] +pub(crate) mod test_support { + use super::*; + use std::sync::atomic::{AtomicUsize, Ordering}; + + /// Mock backend: an in-memory buffer serving pread requests. Counters + /// expose behavior for test assertions (read count / release count). + pub(crate) struct MockBackend { + pub data: Vec, + pub read_count: AtomicUsize, + pub release_count: AtomicUsize, + } + + extern "C" fn mock_read_at( + ctx: *mut c_void, + offset: u64, + len: usize, + out_buf: *mut u8, + ) -> i32 { + let backend = unsafe { &*(ctx as *const MockBackend) }; + backend.read_count.fetch_add(1, Ordering::SeqCst); + let data = &backend.data; + let end = (offset as usize).saturating_add(len); + if end > data.len() { + return 1; // out of range + } + unsafe { + std::ptr::copy_nonoverlapping(data.as_ptr().add(offset as usize), out_buf, len); + } + 0 + } + + extern "C" fn mock_release(ctx: *mut c_void) { + // Reclaim the strong ref that `Arc::into_raw` leaked at construction. + let backend = unsafe { Arc::from_raw(ctx as *const MockBackend) }; + backend.release_count.fetch_add(1, Ordering::SeqCst); + // `arc` drops here → decrement; test still holds its own clone. + } + + /// Build a mock-backed directory for tests. Returns (dir, backend clone). + /// The backend Arc is shared — drop the directory to trigger release. 
+ pub(crate) fn build_mock_directory( + data: Vec, + entries: Vec<(String, u64, u64)>, + ) -> (PaimonCallbackDirectory, Arc) { + let backend = Arc::new(MockBackend { + data, + read_count: AtomicUsize::new(0), + release_count: AtomicUsize::new(0), + }); + let ctx_ptr = Arc::into_raw(backend.clone()) as *mut c_void; + let cb = PaimonStreamCallbacks { + ctx: ctx_ptr, + read_at: mock_read_at, + release: mock_release, + }; + let dir = PaimonCallbackDirectory::new(entries, cb); + (dir, backend) + } + + /// Parse a packed archive blob (BE, no version header, matching + /// `writer::pack_index_dir`) and build a mock-backed directory. Used by + /// `reader.rs::tests` since writer.finish currently still returns a Vec. + pub(crate) fn build_directory_from_archive( + packed: Vec, + ) -> (PaimonCallbackDirectory, Arc) { + let entries = parse_archive_header(&packed); + build_mock_directory(packed, entries) + } + + /// Parse the archive header — mirrors the layout that + /// C++ `ParseArchiveHeader` will produce in production (K3). 
+ fn parse_archive_header(bytes: &[u8]) -> Vec<(String, u64, u64)> { + let mut off = 0usize; + let file_count = i32::from_be_bytes(bytes[off..off + 4].try_into().unwrap()) as usize; + off += 4; + let mut entries = Vec::with_capacity(file_count); + for _ in 0..file_count { + let nlen = i32::from_be_bytes(bytes[off..off + 4].try_into().unwrap()) as usize; + off += 4; + let name = + std::str::from_utf8(&bytes[off..off + nlen]).unwrap().to_owned(); + off += nlen; + let flen = i64::from_be_bytes(bytes[off..off + 8].try_into().unwrap()) as u64; + off += 8; + let data_offset = off as u64; + entries.push((name, data_offset, flen)); + off += flen as usize; + } + entries + } +} + +#[cfg(test)] +mod tests { + use super::test_support::*; + use super::*; + + #[test] + fn file_handle_reads_correct_bytes() { + let data = b"hello world".to_vec(); + let entries = vec![("foo.txt".to_string(), 0, 11)]; + let (dir, _backend) = build_mock_directory(data, entries); + + let handle = dir.get_file_handle(Path::new("foo.txt")).unwrap(); + let bytes = handle.read_bytes(0..5).unwrap(); + assert_eq!(&bytes[..], b"hello"); + let bytes = handle.read_bytes(6..11).unwrap(); + assert_eq!(&bytes[..], b"world"); + } + + #[test] + fn missing_file_returns_error() { + let (dir, _backend) = build_mock_directory(vec![], vec![]); + let err = dir.get_file_handle(Path::new("nonexistent")).unwrap_err(); + match err { + OpenReadError::FileDoesNotExist(p) => { + assert_eq!(p.to_string_lossy(), "nonexistent") + } + other => panic!("expected FileDoesNotExist, got {other:?}"), + } + } + + #[test] + fn pread_out_of_range_propagates_error() { + let data = b"short".to_vec(); + let entries = vec![("bad.txt".to_string(), 0, 100)]; // 长度超出 data + let (dir, _backend) = build_mock_directory(data, entries); + let handle = dir.get_file_handle(Path::new("bad.txt")).unwrap(); + let err = handle.read_bytes(0..100).unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::Other); + } + + #[test] + fn 
atomic_write_read_roundtrip_and_exists() { + let (dir, _backend) = build_mock_directory(vec![], vec![]); + dir.atomic_write(Path::new(".lock"), b"locked").unwrap(); + let data = dir.atomic_read(Path::new(".lock")).unwrap(); + assert_eq!(data, b"locked"); + assert!(dir.exists(Path::new(".lock")).unwrap()); + assert!(!dir.exists(Path::new("gone")).unwrap()); + } + + #[test] + fn release_called_exactly_once_on_last_drop() { + let entries = vec![("a".to_string(), 0, 5)]; + let (dir, backend) = build_mock_directory(b"hello".to_vec(), entries); + assert_eq!(backend.release_count.load(std::sync::atomic::Ordering::SeqCst), 0); + drop(dir); + assert_eq!(backend.release_count.load(std::sync::atomic::Ordering::SeqCst), 1); + } + + #[test] + fn cloned_directory_shares_ctx_and_atomic_data() { + let (dir, backend) = build_mock_directory(vec![], vec![]); + let dir2 = dir.clone(); + dir.atomic_write(Path::new("x"), b"hello").unwrap(); + assert!(dir2.exists(Path::new("x")).unwrap()); // shared atomic_data + drop(dir); + assert_eq!(backend.release_count.load(std::sync::atomic::Ordering::SeqCst), 0); // ctx still held by dir2 + drop(dir2); + assert_eq!(backend.release_count.load(std::sync::atomic::Ordering::SeqCst), 1); + } + + #[test] + fn concurrent_pread_results_correct_under_stream_mutex() { + use std::thread; + + let data: Vec = (0..1000).map(|i| (i % 256) as u8).collect(); + let entries = vec![("data".to_string(), 0, 1000)]; + let (dir, backend) = build_mock_directory(data.clone(), entries); + let handle: Arc = + dir.get_file_handle(Path::new("data")).unwrap(); + + let threads: Vec<_> = (0..8) + .map(|_| { + let h = handle.clone(); + let expected = data.clone(); + thread::spawn(move || { + for _ in 0..20 { + let bytes = h.read_bytes(100..200).unwrap(); + assert_eq!(&bytes[..], &expected[100..200]); + } + }) + }) + .collect(); + + for t in threads { + t.join().unwrap(); + } + assert_eq!( + backend.read_count.load(std::sync::atomic::Ordering::SeqCst), + 8 * 20 + ); + } + + 
#[test] + fn file_names_sorted() { + let entries = vec![ + ("z.idx".to_string(), 0, 10), + ("a.meta".to_string(), 10, 20), + ("m.term".to_string(), 30, 5), + ]; + let (dir, _backend) = build_mock_directory(vec![0u8; 100], entries); + let names = dir.file_names(); + assert_eq!(names, vec!["a.meta", "m.term", "z.idx"]); + } +} diff --git a/third_party/tantivy_ffi/src/error.rs b/third_party/tantivy_ffi/src/error.rs new file mode 100644 index 000000000..80f16df65 --- /dev/null +++ b/third_party/tantivy_ffi/src/error.rs @@ -0,0 +1,137 @@ +//! Error model for paimon_tantivy_ffi. +//! +//! See docs/dev/tantivy_ffi_design.md §2. Contract: +//! - Every fallible FFI function returns `paimon_tantivy_status_t` +//! - Failure sets `last_error` (thread-local) with human-readable text +//! - C++ calls `paimon_tantivy_last_error()` after a non-OK status to fetch text +//! - Pointer returned by `last_error()` is thread-local and valid until the +//! next failing FFI call on the same thread. C++ must NOT free it. + +use std::cell::RefCell; +use std::ffi::c_char; +use std::ffi::CString; + +/// Status codes. Values are stable ABI; append-only. +#[repr(i32)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum PaimonTantivyStatus { + Ok = 0, + InvalidArgument = 1, + NotFound = 2, + IoError = 3, + Unsupported = 4, + TokenizerError = 5, + QueryParseError = 6, + IndexFormatError = 7, + InternalError = 99, +} + +thread_local! { + /// Pre-allocated empty string so `paimon_tantivy_last_error()` can always + /// return a valid non-null pointer. + static LAST_ERROR: RefCell = RefCell::new(CString::new("").unwrap()); +} + +/// Record an error message for the current thread. Called by fallible FFI +/// functions right before returning a non-OK status. +pub(crate) fn set_last_error(msg: impl Into) { + // Interior nul bytes would make CString::new fail; strip them as a safety net. 
+ let s: String = msg.into().replace('\0', "\u{FFFD}"); + LAST_ERROR.with(|cell| { + // CString::new clones the bytes and appends a nul terminator. + *cell.borrow_mut() = CString::new(s).unwrap_or_else(|_| CString::new("").unwrap()); + }); +} + +/// Clear the current thread's error slot. Called at the top of fallible APIs +/// so a subsequent successful call doesn't return stale text. +#[allow(dead_code)] +pub(crate) fn clear_last_error() { + LAST_ERROR.with(|cell| { + *cell.borrow_mut() = CString::new("").unwrap(); + }); +} + +/// Macro that wraps a `Result`-returning block: sets last_error on +/// Err and returns the given status code; returns Ok value on success. +#[macro_export] +macro_rules! ffi_try { + ($expr:expr, $err_status:expr) => {{ + match $expr { + Ok(v) => v, + Err(e) => { + $crate::error::set_last_error(format!("{e}")); + return $err_status; + } + } + }}; +} + +/// Return the last error text for the calling thread. Always non-null; returns +/// pointer to "" when there is no error recorded yet. Pointer is thread-local; +/// C++ must NOT free it; treat as valid until the next failing FFI call on +/// the same thread. 
+#[no_mangle] +pub extern "C" fn paimon_tantivy_last_error() -> *const c_char { + LAST_ERROR.with(|cell| cell.borrow().as_ptr()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::CStr; + + #[test] + fn initial_last_error_is_empty() { + let ptr = paimon_tantivy_last_error(); + assert!(!ptr.is_null()); + let s = unsafe { CStr::from_ptr(ptr) }.to_str().unwrap(); + assert_eq!(s, ""); + } + + #[test] + fn set_then_retrieve() { + set_last_error("boom"); + let s = unsafe { CStr::from_ptr(paimon_tantivy_last_error()) } + .to_str() + .unwrap(); + assert_eq!(s, "boom"); + } + + #[test] + fn clear_resets_to_empty() { + set_last_error("x"); + clear_last_error(); + let s = unsafe { CStr::from_ptr(paimon_tantivy_last_error()) } + .to_str() + .unwrap(); + assert_eq!(s, ""); + } + + #[test] + fn embedded_nul_is_stripped() { + set_last_error("a\0b"); + let s = unsafe { CStr::from_ptr(paimon_tantivy_last_error()) } + .to_str() + .unwrap(); + assert_eq!(s, "a\u{FFFD}b"); + } + + #[test] + fn thread_local_isolation() { + set_last_error("main"); + let t = std::thread::spawn(|| { + let s = unsafe { CStr::from_ptr(paimon_tantivy_last_error()) } + .to_str() + .unwrap(); + s.to_owned() + }) + .join() + .unwrap(); + assert_eq!(t, ""); + let s = unsafe { CStr::from_ptr(paimon_tantivy_last_error()) } + .to_str() + .unwrap(); + assert_eq!(s, "main"); + } +} diff --git a/third_party/tantivy_ffi/src/handle.rs b/third_party/tantivy_ffi/src/handle.rs new file mode 100644 index 000000000..175c75e05 --- /dev/null +++ b/third_party/tantivy_ffi/src/handle.rs @@ -0,0 +1,106 @@ +//! Opaque handle helpers. +//! +//! Contract (see docs/dev/tantivy_ffi_design.md §3 Category A): +//! - Rust creates handles with `Box::into_raw(Box::new(T))` +//! - C++ must free with the matching `xxx_free(*mut T)` function, once +//! - Functions accepting handles treat null as invalid argument + +use std::ffi::c_void; + +/// Consume `T`, return a raw opaque pointer suitable for C++. 
+#[inline] +pub(crate) fn into_handle(value: T) -> *mut T { + Box::into_raw(Box::new(value)) +} + +/// Reconstitute a `Box` from an FFI-provided pointer and drop it. +/// SAFETY: caller must pass a pointer previously returned by `into_handle::`, +/// and must not use it again after this call. +#[inline] +pub(crate) unsafe fn free_handle(handle: *mut T) { + if handle.is_null() { + return; + } + drop(unsafe { Box::from_raw(handle) }); +} + +/// Borrow an `&T` from an FFI-provided pointer. Returns None on null. +/// SAFETY: caller must ensure the pointer was previously returned by +/// `into_handle::` and is still alive (not freed). +#[inline] +pub(crate) unsafe fn borrow_handle<'a, T>(handle: *const T) -> Option<&'a T> { + if handle.is_null() { + None + } else { + Some(unsafe { &*handle }) + } +} + +/// Borrow `&mut T` from an FFI-provided pointer. Returns None on null. +/// SAFETY: same as `borrow_handle`, plus caller must ensure there is no +/// concurrent access via another pointer (writer/reader handles are +/// documented as thread-unsafe). +#[inline] +pub(crate) unsafe fn borrow_handle_mut<'a, T>(handle: *mut T) -> Option<&'a mut T> { + if handle.is_null() { + None + } else { + Some(unsafe { &mut *handle }) + } +} + +/// Opaque ctx pointer from C++ (passed through to Rust Directory callbacks). +/// Type-erased on purpose: only C++ side knows the concrete type. 
+pub(crate) type OpaqueCtx = *mut c_void; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn into_then_free() { + struct X(i32); + let h: *mut X = into_handle(X(42)); + assert!(!h.is_null()); + unsafe { free_handle(h) }; + // no leak (LSAN would catch if compiled with sanitizers) + } + + #[test] + fn free_null_is_noop() { + let h: *mut i32 = std::ptr::null_mut(); + unsafe { free_handle(h) }; + } + + #[test] + fn borrow_roundtrip() { + let h = into_handle(42i32); + unsafe { + assert_eq!(*borrow_handle(h as *const i32).unwrap(), 42); + *borrow_handle_mut(h).unwrap() = 7; + assert_eq!(*borrow_handle(h as *const i32).unwrap(), 7); + free_handle(h); + } + } + + #[test] + fn borrow_null_is_none() { + unsafe { + assert!(borrow_handle::(std::ptr::null()).is_none()); + assert!(borrow_handle_mut::(std::ptr::null_mut()).is_none()); + } + } + + #[test] + fn stress_many_create_destroy() { + // smoke stress: many allocations, no leak + for i in 0..10_000 { + let h = into_handle(vec![i; 8]); + unsafe { + let v = borrow_handle(h as *const Vec).unwrap(); + assert_eq!(v.len(), 8); + free_handle(h); + } + } + } +} diff --git a/third_party/tantivy_ffi/src/lib.rs b/third_party/tantivy_ffi/src/lib.rs new file mode 100644 index 000000000..c96dac998 --- /dev/null +++ b/third_party/tantivy_ffi/src/lib.rs @@ -0,0 +1,84 @@ +//! paimon_tantivy_ffi: C ABI layer for tantivy + jieba-rs, +//! consumed by paimon-cpp's `tantivy-fulltext` global index. +//! +//! See `docs/dev/tantivy_ffi_design.md` for the contract. +//! +//! Stage 1: scaffold + version FFI. +//! Stage 2: error / handle / buffer / log modules. +//! Stage 3: tokenizer. +//! Stage 4: writer. +//! Later stages fill in directory / reader / query. 
+ +#![deny(unsafe_op_in_unsafe_fn)] + +use std::ffi::c_char; + +pub mod error; +pub mod handle; +pub mod buffer; +pub mod log_bridge; +pub mod tokenizer; +pub mod writer; +pub mod callback_directory; +pub mod reader; + +// Re-export public FFI symbols at crate root so cbindgen picks them up. +pub use buffer::{paimon_tantivy_buffer_free, PaimonTantivyBuffer}; +pub use error::{paimon_tantivy_last_error, PaimonTantivyStatus}; +pub use log_bridge::{ + paimon_tantivy_clear_log_callback, paimon_tantivy_set_log_callback, PaimonTantivyLogFn, +}; +pub use tokenizer::{ + paimon_tantivy_tokenizer_free, paimon_tantivy_tokenizer_new, + paimon_tantivy_tokenizer_tokenize, PaimonJiebaTokenizer, +}; +pub use writer::{ + paimon_tantivy_writer_add, paimon_tantivy_writer_finish_streaming, + paimon_tantivy_writer_free, paimon_tantivy_writer_new, PaimonTantivyWriter, + PaimonWriteCallbacks, +}; +pub use callback_directory::{PaimonCallbackDirectory, PaimonStreamCallbacks}; +pub use reader::{ + paimon_tantivy_reader_free, paimon_tantivy_reader_new_streaming, + paimon_tantivy_reader_search, PaimonTantivyReader, +}; + +/// Semantic version of this crate, **'static lifetime**; C++ must NOT free. +/// Format: `""` (git sha postfix can be added later via build.rs). +/// Returned as a NUL-terminated UTF-8 C string. 
+#[no_mangle] +pub extern "C" fn paimon_tantivy_version() -> *const c_char { + concat!(env!("CARGO_PKG_VERSION"), "\0").as_ptr() as *const c_char +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::CStr; + + #[test] + fn version_is_non_empty() { + let ptr = paimon_tantivy_version(); + assert!(!ptr.is_null()); + let s = unsafe { CStr::from_ptr(ptr) }.to_str().unwrap(); + assert!(!s.is_empty(), "version must be non-empty"); + assert!(s.contains('.'), "version must look like semver, got {s:?}"); + } + + #[test] + fn tantivy_and_jieba_are_linked() { + let _ = tantivy::schema::Schema::builder(); + let _ = jieba_rs::Jieba::new(); + } + + #[test] + fn croaring_serialize_roundtrip() { + use croaring::Bitmap; + let mut b = Bitmap::new(); + b.add(42); + b.add(100); + let bytes = b.serialize::(); + let b2 = Bitmap::deserialize::(&bytes); + assert_eq!(b.cardinality(), b2.cardinality()); + } +} diff --git a/third_party/tantivy_ffi/src/log_bridge.rs b/third_party/tantivy_ffi/src/log_bridge.rs new file mode 100644 index 000000000..380832c81 --- /dev/null +++ b/third_party/tantivy_ffi/src/log_bridge.rs @@ -0,0 +1,103 @@ +//! Log bridge: tantivy internally emits log records via the `log` crate +//! (via `tantivy::debug` / `info` etc.). This module registers a global +//! `log::Log` implementation that forwards records to a C callback. +//! +//! Contract (see docs/dev/tantivy_ffi_design.md §7): +//! - C++ calls `paimon_tantivy_set_log_callback(cb)` once at process startup +//! - Passing null unregisters (reverts to stderr) +//! - Callback receives (level, msg_ptr, msg_len); pointer is non-null, +//! UTF-8, NOT null-terminated, valid only for the duration of the call +//! - Level mapping: 0=trace 1=debug 2=info 3=warn 4=error +//! - Callback must be thread-safe: tantivy writes from worker threads +//! +//! NOTE: tantivy uses `tracing` in newer versions and `log` in others. +//! Our current `tantivy = "0.22"` uses `log` (verified Stage 0.5 probe). +//! 
If a future upgrade switches to `tracing`, install a `tracing-log` +//! bridge here. + +use std::ffi::c_char; +use std::sync::atomic::{AtomicPtr, Ordering}; + +pub type PaimonTantivyLogFn = extern "C" fn(level: i32, msg: *const c_char, len: usize); + +static CALLBACK: AtomicPtr<()> = AtomicPtr::new(std::ptr::null_mut()); + +struct LogBridge; + +impl log::Log for LogBridge { + fn enabled(&self, _: &log::Metadata) -> bool { + true + } + + fn log(&self, record: &log::Record) { + let level = match record.level() { + log::Level::Trace => 0, + log::Level::Debug => 1, + log::Level::Info => 2, + log::Level::Warn => 3, + log::Level::Error => 4, + }; + let msg = format!("[{}] {}", record.target(), record.args()); + let ptr = CALLBACK.load(Ordering::Acquire); + if ptr.is_null() { + // Fallback: stderr + eprintln!("{msg}"); + return; + } + // SAFETY: ptr was installed as PaimonTantivyLogFn via transmute below + let cb: PaimonTantivyLogFn = unsafe { std::mem::transmute(ptr) }; + cb(level, msg.as_ptr() as *const c_char, msg.len()); + } + + fn flush(&self) {} +} + +static LOGGER: LogBridge = LogBridge; + +/// Install a non-null callback. First call also registers `LogBridge` as +/// the global `log` crate sink. Subsequent calls swap the callback atomically. +/// Thread-safety: safe to call from any thread. +/// +/// Note: we use separate `set`/`clear` functions instead of `Option` +/// because cbindgen translates `Option` into an opaque struct +/// rather than a nullable C function pointer. +#[no_mangle] +pub extern "C" fn paimon_tantivy_set_log_callback(cb: PaimonTantivyLogFn) { + let ptr = cb as *mut (); + CALLBACK.store(ptr, Ordering::Release); + let _ = log::set_logger(&LOGGER); + log::set_max_level(log::LevelFilter::Info); +} + +/// Clear the installed callback (revert to Rust-side stderr fallback). 
+#[no_mangle] +pub extern "C" fn paimon_tantivy_clear_log_callback() { + CALLBACK.store(std::ptr::null_mut(), Ordering::Release); +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicUsize, Ordering}; + + // Simple test callback that counts invocations + static COUNT: AtomicUsize = AtomicUsize::new(0); + extern "C" fn counting_cb(_: i32, _: *const c_char, _: usize) { + COUNT.fetch_add(1, Ordering::SeqCst); + } + + #[test] + fn install_then_log() { + COUNT.store(0, Ordering::SeqCst); + paimon_tantivy_set_log_callback(counting_cb); + log::info!("hello"); + assert!(COUNT.load(Ordering::SeqCst) >= 1); + } + + #[test] + fn clear_reverts_to_stderr() { + paimon_tantivy_set_log_callback(counting_cb); + paimon_tantivy_clear_log_callback(); + log::warn!("goes to stderr"); + } +} diff --git a/third_party/tantivy_ffi/src/reader.rs b/third_party/tantivy_ffi/src/reader.rs new file mode 100644 index 000000000..b2032299c --- /dev/null +++ b/third_party/tantivy_ffi/src/reader.rs @@ -0,0 +1,1237 @@ +//! PaimonTantivyReader: query side of tantivy-fulltext. +//! +//! Constructs a tantivy Index from a packed-blob produced by writer.rs (via +//! PaimonDirectory), registers the same `paimon_jieba` tokenizer, and runs +//! one of 5 search types (mirrors `paimon::FullTextSearch::SearchType`): +//! +//! 1 MATCH_ALL — tokenize query, BooleanQuery (Must) +//! 2 MATCH_ANY — tokenize query, BooleanQuery (Should) +//! 3 PHRASE — tokenize query, PhraseQuery +//! 4 PREFIX — RegexQuery `.*` (no tokenization, mirrors lucene-fts) +//! 5 WILDCARD — RegexQuery from glob pattern (`*` → `.*`, `?` → `.`, others escaped) +//! +//! Decision B1 (paimon-java compat): row_id is stored as an explicit u64 field +//! (`fast` for O(1) retrieval). Reader translates tantivy DocAddress → row_id +//! via `fast_fields().u64("row_id").first(doc_id)` per segment. +//! +//! FFI return format (little-endian, **doc identifiers are u64 row_ids**): +//! 
`[u8 has_scores | u64 count | u64 row_id[count] | optional f32 score[count]]` + +use std::ffi::{c_char, CStr}; +use std::path::Path; + +use croaring::{Portable, Treemap}; +use tantivy::collector::{Collector, SegmentCollector}; +use tantivy::columnar::Column; +use tantivy::query::{BooleanQuery, Occur, PhraseQuery, Query, RegexQuery, TermQuery}; +use tantivy::schema::{Field, IndexRecordOption}; +use tantivy::{DocAddress, DocId, Index, IndexReader, ReloadPolicy, Score, SegmentOrdinal, + SegmentReader, Term}; + +use crate::buffer::PaimonTantivyBuffer; +use crate::callback_directory::{PaimonCallbackDirectory, PaimonStreamCallbacks}; +use crate::error::{set_last_error, PaimonTantivyStatus}; +use crate::handle::{borrow_handle_mut, free_handle, into_handle}; +use crate::tokenizer::{PaimonJiebaTokenizer, TokenizeMode}; +use crate::writer::{PAIMON_ROW_ID_FIELD_NAME, PAIMON_TEXT_FIELD_NAME, PAIMON_TOKENIZER_NAME}; + +/// Numeric encoding of `paimon::FullTextSearch::SearchType`. Kept in sync +/// with include/paimon/predicate/full_text_search.h. +#[repr(i32)] +#[derive(Clone, Copy, Debug)] +pub enum SearchType { + MatchAll = 1, + MatchAny = 2, + Phrase = 3, + Prefix = 4, + Wildcard = 5, +} + +impl SearchType { + fn from_i32(v: i32) -> Option { + match v { + 1 => Some(Self::MatchAll), + 2 => Some(Self::MatchAny), + 3 => Some(Self::Phrase), + 4 => Some(Self::Prefix), + 5 => Some(Self::Wildcard), + _ => None, + } + } +} + +pub struct PaimonTantivyReader { + /// Held alive so `IndexReader::searcher()` + `index.tokenizers()` stay + /// usable for the reader's lifetime. + index: Index, + reader: IndexReader, + text_field: Field, + /// Name of the tokenizer the `text` field is actually bound to in the open + /// index's schema (read from `meta.json` at construction time). Query-side + /// tokenization looks this up in `index.tokenizers()` every time + tokenizer_name: String, +} + +impl PaimonTantivyReader { + /// Construct a reader from a pre-built callback-backed Directory. 
+ /// Layout (file names + offsets + lengths) must come from the caller + /// (C++ side `ParseArchiveHeader`); Rust does not re-parse the archive. + pub fn new( + directory: PaimonCallbackDirectory, + mode: TokenizeMode, + with_position: bool, + dict_dir: &Path, + ) -> Result { + let index = Index::open(directory) + .map_err(|e| format!("tantivy::Index::open: {e}"))?; + + // Resolve fields by their fixed names (B1: schema is `row_id` + `text`). + let schema = index.schema(); + let text_field = schema.get_field(PAIMON_TEXT_FIELD_NAME).map_err(|e| { + format!("tantivy index missing '{PAIMON_TEXT_FIELD_NAME}' field: {e}") + })?; + + // Read the tokenizer name the `text` field was actually written with + // (lives in meta.json's schema). Auto-aligns cpp query-side tokenizer + // with whatever the writer side used. + let tokenizer_name = match schema.get_field_entry(text_field).field_type() { + tantivy::schema::FieldType::Str(text_options) => text_options + .get_indexing_options() + .map(|io| io.tokenizer().to_string()) + .unwrap_or_else(|| "default".to_string()), + other => { + return Err(format!( + "text field has non-TEXT type: {other:?} (schema corrupted?)" + )); + } + }; + + // Only register paimon_jieba if the index actually uses it. The + // tantivy-builtin "default" / "raw" / "en_stem" etc. are pre-registered + // by the TokenizerManager — no setup needed for those. + if tokenizer_name == PAIMON_TOKENIZER_NAME { + // `Path::is_empty` is unstable; check via OsStr. 
+ if dict_dir.as_os_str().is_empty() { + return Err(format!( + "paimon_jieba tokenizer required by archive schema but dict dir \ + is empty — set the PAIMON_JIEBA_DICT_DIR env var to a directory \ + containing jieba.dict.utf8 / hmm_model.utf8 / user.dict.utf8 / \ + idf.utf8 / stop_words.utf8" + )); + } + let jieba = PaimonJiebaTokenizer::new(dict_dir, mode, with_position) + .map_err(|e| format!("create paimon_jieba tokenizer: {e}"))?; + index.tokenizers().register(PAIMON_TOKENIZER_NAME, jieba); + } else { + // For other known-safe names we trust tantivy's builtin registry. + // `mode` / `dict_dir` are unused in this branch — no-op; we still + // require them in the ABI for backward-compat with the jieba case. + let _ = (mode, dict_dir); + } + + // Sanity: the tokenizer MUST be resolvable now; otherwise query-time + // lookup fails mid-flight. + if index.tokenizers().get(&tokenizer_name).is_none() { + return Err(format!( + "tokenizer {tokenizer_name:?} referenced by text field is not \ + registered; add it to TokenizerManager before opening the reader" + )); + } + + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + .map_err(|e| format!("build IndexReader: {e}"))?; + + Ok(Self { + index, + reader, + text_field, + tokenizer_name, + }) + } + + /// Tokenize the query string using the *same* tokenizer the index's text + /// field was built with. Looks up `self.tokenizer_name` in the index's + /// `TokenizerManager` — which was populated by `new()` with either + /// `paimon_jieba` (if cpp wrote the index) or a tantivy builtin like + /// `default` (if paimon-java wrote it). + fn tokenize_query(&self, query: &str) -> Vec { + // `TokenizerManager::get` returns a fresh clone per call — safe to use + // across threads / calls. If the tokenizer was missing we'd have + // failed in `new()`; we still defend with `unwrap_or_default`. 
+ let mut analyzer = match self.index.tokenizers().get(&self.tokenizer_name) { + Some(a) => a, + None => return Vec::new(), + }; + let mut stream = analyzer.token_stream(query); + let mut out = Vec::new(); + while stream.advance() { + out.push(stream.token().text.clone()); + } + out + } + + fn build_match_query(&self, query: &str, occur: Occur) -> Result, String> { + let terms = self.tokenize_query(query); + if terms.is_empty() { + return Err(format!("query {query:?} produced no tokens after analysis")); + } + if terms.len() == 1 { + let term = Term::from_field_text(self.text_field, &terms[0]); + return Ok(Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))); + } + let clauses: Vec<(Occur, Box)> = terms + .iter() + .map(|t| { + let term = Term::from_field_text(self.text_field, t); + let q: Box = + Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)); + (occur, q) + }) + .collect(); + Ok(Box::new(BooleanQuery::new(clauses))) + } + + fn build_phrase_query(&self, query: &str) -> Result, String> { + let terms = self.tokenize_query(query); + if terms.is_empty() { + return Err(format!("phrase query {query:?} produced no tokens")); + } + if terms.len() == 1 { + // PhraseQuery requires >=2 terms in tantivy; degrade to TermQuery. + let term = Term::from_field_text(self.text_field, &terms[0]); + return Ok(Box::new(TermQuery::new(term, IndexRecordOption::WithFreqsAndPositions))); + } + let tantivy_terms: Vec = terms + .iter() + .map(|t| Term::from_field_text(self.text_field, t)) + .collect(); + Ok(Box::new(PhraseQuery::new(tantivy_terms))) + } + + fn build_prefix_query(&self, query: &str) -> Result, String> { + if query.is_empty() { + return Err("prefix query is empty".into()); + } + // Mirror lucene-fts: don't tokenize prefix; match indexed term bytes + // starting with the given prefix verbatim. 
+ let pattern = format!("{}.*", regex_escape(query)); + RegexQuery::from_pattern(&pattern, self.text_field) + .map(|q| Box::new(q) as Box) + .map_err(|e| format!("RegexQuery from prefix {query:?}: {e}")) + } + + fn build_wildcard_query(&self, query: &str) -> Result, String> { + if query.is_empty() { + return Err("wildcard query is empty".into()); + } + let pattern = wildcard_to_regex(query); + RegexQuery::from_pattern(&pattern, self.text_field) + .map(|q| Box::new(q) as Box) + .map_err(|e| format!("RegexQuery from wildcard {query:?} (pattern {pattern}): {e}")) + } + + fn build_query(&self, search_type: SearchType, query: &str) -> Result, String> { + match search_type { + SearchType::MatchAll => self.build_match_query(query, Occur::Must), + SearchType::MatchAny => self.build_match_query(query, Occur::Should), + SearchType::Phrase => self.build_phrase_query(query), + SearchType::Prefix => self.build_prefix_query(query), + SearchType::Wildcard => self.build_wildcard_query(query), + } + } + + /// Return all matching row_ids (no scoring, no limit, no pre_filter). + /// row_ids come from the explicit `row_id` u64 fast field, supporting + /// multi-segment indexes (e.g. produced by paimon-java without force-merge). + pub fn search_all(&self, search_type: SearchType, query: &str) -> Result, String> { + let q = self.build_query(search_type, query)?; + let searcher = self.reader.searcher(); + let mut ids: Vec = searcher + .search(&*q, &RowIdCollector) + .map_err(|e| format!("tantivy search: {e}"))?; + ids.sort_unstable(); + ids.dedup(); + Ok(ids) + } + + /// 4-path dispatch on `(with_score, limit)` — see `docs/dev/tantivy_bm25_score_contract.md` + /// §4. 
+ /// + /// | with_score | limit | path | collector | sort | truncate | output score | + /// |------------|--------|------|------------------------|----------------|----------|--------------| + /// | false | None | A | RowIdCollector | row_id asc | — | ❌ | + /// | false | Some(n)| B | AllScoredCollector | score desc | top n | ❌ (dropped) | + /// | true | None | C | AllScoredCollector | row_id asc | — | ✅ | + /// | true | Some(n)| D | AllScoredCollector | score desc | top n | ✅ | + /// + /// Pre-filter is a `Treemap` of paimon row_ids (not tantivy doc_ids), applied BEFORE + /// truncation so high-score matches outside the filter don't crowd out valid ones. + /// + /// **v0.2 contract change**: previously `limit.is_some()` implicitly triggered scoring; now + /// scoring is gated solely by `with_score`. See changelog in tantivy_ffi_design.md §4.6. + pub fn search_with_limit_and_filter( + &self, + search_type: SearchType, + query: &str, + with_score: bool, + limit: Option, + pre_filter: Option<&Treemap>, + min_score: Option, + ) -> Result)>, String> { + let q = self.build_query(search_type, query)?; + let searcher = self.reader.searcher(); + match (with_score, limit) { + // Path A: all rows, no score. RowIdCollector reads the `row_id` fast + // field inline per segment (opened once), avoiding a DocSetCollector + // HashSet and per-doc handle — hot path for high-cardinality counts. + (false, None) => { + let mut row_ids: Vec = searcher + .search(&*q, &RowIdCollector) + .map_err(|e| format!("tantivy search: {e}"))?; + if let Some(filter) = pre_filter { + row_ids.retain(|id| filter.contains(*id)); + } + row_ids.sort_unstable(); + row_ids.dedup(); + Ok(row_ids.into_iter().map(|id| (id, None)).collect()) + } + // Path B: any N matches, unscored. Used by SR's `WHERE MATCH ... LIMIT N` (no + // ORDER BY): pushes the limit down so each shard stops collecting once N hits + // are gathered per segment instead of materialising the full posting list. 
+ // If the caller wants top-N by BM25 they should set `with_score=true` (Path D) + // and ignore the score values. + (false, Some(n)) => { + if n == 0 { + return Ok(Vec::new()); + } + if min_score.is_some() { + // min_score requires scoring — fall back to collect_scored path + let mut filtered = self.collect_scored(&*q, &searcher, pre_filter)?; + if let Some(threshold) = min_score { + filtered.retain(|(s, _)| *s > threshold); + } + let truncated = Self::sort_by_score_desc_truncate(filtered, n); + Ok(truncated.into_iter().map(|(_, id)| (id, None)).collect()) + } else if let Some(filter) = pre_filter { + // pre_filter present: it MUST be applied to the full match set + // before truncation. LimitedDocSetCollector stops after the + // first N raw matches, which could all be filtered out while + // valid matches exist further down the posting list — that + // would under-return (fewer than N, or even empty). So collect + // every matching row_id (filter-aware), then truncate to N. + let mut row_ids: Vec = searcher + .search(&*q, &RowIdCollector) + .map_err(|e| format!("tantivy search: {e}"))?; + row_ids.retain(|id| filter.contains(*id)); + row_ids.sort_unstable(); + row_ids.dedup(); + row_ids.truncate(n); + Ok(row_ids.into_iter().map(|id| (id, None)).collect()) + } else { + // No pre_filter: fast path — stop collecting once N matches are + // gathered per segment instead of materialising the full posting list. + let collector = LimitedDocSetCollector::new(n); + let mut docset = searcher + .search(&*q, &collector) + .map_err(|e| format!("tantivy search: {e}"))?; + let mut by_segment: std::collections::HashMap> = + std::collections::HashMap::new(); + for addr in docset.drain(..) 
{ + by_segment.entry(addr.segment_ord).or_default().push(addr.doc_id); + } + let mut row_ids: Vec = Vec::new(); + for (segment_ord, doc_ids) in by_segment.iter() { + let segment_reader = searcher.segment_reader(*segment_ord); + let fast = segment_reader + .fast_fields() + .u64(PAIMON_ROW_ID_FIELD_NAME) + .map_err(|e| format!("fast_fields().u64('row_id') on segment {}: {e}", + segment_ord))?; + for &doc_id in doc_ids { + row_ids.push(fast.first(doc_id).unwrap_or(0)); + } + } + row_ids.sort_unstable(); + row_ids.dedup(); + row_ids.truncate(n); + Ok(row_ids.into_iter().map(|id| (id, None)).collect()) + } + } + // Path C: all rows + all scores, sorted by row_id asc to match the + // BitmapScoredGlobalIndexResult contract (bitmap iter order == score order). + (true, None) => { + let mut filtered = self.collect_scored(&*q, &searcher, pre_filter)?; + if let Some(threshold) = min_score { + filtered.retain(|(s, _)| *s > threshold); + } + filtered.sort_unstable_by(|a, b| a.1.cmp(&b.1)); + Ok(filtered.into_iter().map(|(s, id)| (id, Some(s))).collect()) + } + // Path D: top-N by BM25 with scores. + (true, Some(n)) => { + if n == 0 { + return Ok(Vec::new()); + } + let mut filtered = self.collect_scored(&*q, &searcher, pre_filter)?; + if let Some(threshold) = min_score { + filtered.retain(|(s, _)| *s > threshold); + } + let truncated = Self::sort_by_score_desc_truncate(filtered, n); + Ok(truncated.into_iter().map(|(s, id)| (id, Some(s))).collect()) + } + } + } + + /// Helper for paths B/C/D: run AllScoredCollector, translate doc_id → row_id, apply pre_filter. + /// Groups results by segment so the fast field column handle is opened once per segment + /// (same rationale as Path A — avoids per-match Column allocation). 
+ fn collect_scored( + &self, + q: &dyn Query, + searcher: &tantivy::Searcher, + pre_filter: Option<&Treemap>, + ) -> Result, String> { + let scored = searcher + .search(q, &AllScoredCollector) + .map_err(|e| format!("tantivy search: {e}"))?; + let mut by_segment: std::collections::HashMap> = + std::collections::HashMap::new(); + for (s, addr) in scored.into_iter() { + by_segment.entry(addr.segment_ord).or_default().push((s, addr.doc_id)); + } + let mut result: Vec<(Score, u64)> = Vec::new(); + for (segment_ord, entries) in by_segment.iter() { + let segment_reader = searcher.segment_reader(*segment_ord); + let fast = segment_reader + .fast_fields() + .u64(PAIMON_ROW_ID_FIELD_NAME) + .map_err(|e| format!("fast_fields().u64('row_id') on segment {}: {e}", + segment_ord))?; + for &(score, doc_id) in entries { + let rid = fast.first(doc_id).unwrap_or(0); + if pre_filter.map_or(true, |t| t.contains(rid)) { + result.push((score, rid)); + } + } + } + Ok(result) + } + + /// Helper for paths B/D: sort (score, row_id) by score desc with row_id asc tie-break, + /// then truncate to `n` items. + fn sort_by_score_desc_truncate(mut v: Vec<(Score, u64)>, n: usize) -> Vec<(Score, u64)> { + v.sort_unstable_by(|a, b| { + b.0.partial_cmp(&a.0) + .unwrap_or(std::cmp::Ordering::Equal) + .then(a.1.cmp(&b.1)) + }); + v.truncate(n); + v + } + + #[cfg(test)] + pub(crate) fn tokenizer_name(&self) -> &str { + &self.tokenizer_name + } + + #[cfg(test)] + pub(crate) fn debug_index(&self) -> &Index { + &self.index + } +} + +/// Escape regex metacharacters, but leave the input as a verbatim literal. +fn regex_escape(input: &str) -> String { + let mut out = String::with_capacity(input.len() + 4); + for ch in input.chars() { + match ch { + '.' | '+' | '*' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\' => { + out.push('\\'); + out.push(ch); + } + _ => out.push(ch), + } + } + out +} + +/// Translate a glob-style wildcard ('*' = any, '?' 
= single char) into a +/// regex pattern, escaping all other regex metacharacters. +fn wildcard_to_regex(input: &str) -> String { + let mut out = String::with_capacity(input.len() + 4); + for ch in input.chars() { + match ch { + '*' => out.push_str(".*"), + '?' => out.push('.'), + '.' | '+' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\' => { + out.push('\\'); + out.push(ch); + } + _ => out.push(ch), + } + } + out +} + +/// Collector that reads the explicit `row_id` u64 fast field directly into a +/// `Vec`, opening the column once per segment in `for_segment`. Replaces +/// the DocSetCollector → HashSet → per-doc translate path for unscored queries. +struct RowIdCollector; + +struct RowIdSegmentCollector { + row_id: Column, + ids: Vec, +} + +impl SegmentCollector for RowIdSegmentCollector { + type Fruit = Vec; + + fn collect(&mut self, doc: DocId, _score: Score) { + self.ids.push(self.row_id.first(doc).unwrap_or(0)); + } + + fn harvest(self) -> Vec { + self.ids + } +} + +impl Collector for RowIdCollector { + type Fruit = Vec; + type Child = RowIdSegmentCollector; + + fn for_segment( + &self, _ord: SegmentOrdinal, segment: &SegmentReader, + ) -> tantivy::Result { + let row_id = segment.fast_fields().u64(PAIMON_ROW_ID_FIELD_NAME)?; + Ok(RowIdSegmentCollector { row_id, ids: Vec::new() }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits(&self, segs: Vec>) -> tantivy::Result> { + Ok(segs.into_iter().flatten().collect()) + } +} + +/// Collector that returns at most `limit` DocAddresses across all segments, +/// no scoring. Shared atomic counter caps the global total so per-shard +/// transfer stays bounded for plain `LIMIT N` queries (no ORDER BY). 
+struct LimitedDocSetCollector { + limit: usize, + counter: std::sync::Arc, +} + +impl LimitedDocSetCollector { + fn new(limit: usize) -> Self { + Self { limit, counter: std::sync::Arc::new(std::sync::atomic::AtomicU64::new(0)) } + } +} + +struct LimitedDocSetSegmentCollector { + segment_ord: SegmentOrdinal, + docs: Vec, + counter: std::sync::Arc, + limit: u64, +} + +impl SegmentCollector for LimitedDocSetSegmentCollector { + type Fruit = Vec; + + fn collect(&mut self, doc: DocId, _score: Score) { + // Best-effort cap: if multiple segments are scanned concurrently the + // atomic ensures we never accept more than `limit` rows total. + let prev = self.counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if prev < self.limit { + self.docs.push(doc); + } + } + + fn harvest(self) -> Self::Fruit { + let segment_ord = self.segment_ord; + self.docs.into_iter().map(|d| DocAddress::new(segment_ord, d)).collect() + } +} + +impl Collector for LimitedDocSetCollector { + type Fruit = Vec; + type Child = LimitedDocSetSegmentCollector; + + fn for_segment( + &self, segment_ord: SegmentOrdinal, _segment: &SegmentReader, + ) -> tantivy::Result { + Ok(LimitedDocSetSegmentCollector { + segment_ord, + docs: Vec::new(), + counter: self.counter.clone(), + limit: self.limit as u64, + }) + } + + fn requires_scoring(&self) -> bool { false } + + fn merge_fruits( + &self, segment_fruits: Vec>, + ) -> tantivy::Result> { + let mut result: Vec = segment_fruits.into_iter().flatten().collect(); + result.truncate(self.limit); + Ok(result) + } +} + +/// Custom Collector that returns ALL matching (score, DocAddress) tuples, +/// without truncation. tantivy's stock `TopDocs::with_limit(N)` would force +/// us to either pick N upfront (wrong when pre_filter rejects high-score +/// docs) or pass `usize::MAX` (which still enforces a binary heap on every +/// push). Our collector is just a plain Vec append, then merge. 
+struct AllScoredCollector; + +struct AllScoredSegmentCollector { + segment_ord: SegmentOrdinal, + docs: Vec<(Score, DocId)>, +} + +impl SegmentCollector for AllScoredSegmentCollector { + type Fruit = Vec<(Score, DocAddress)>; + + fn collect(&mut self, doc: DocId, score: Score) { + self.docs.push((score, doc)); + } + + fn harvest(self) -> Self::Fruit { + let segment_ord = self.segment_ord; + self.docs + .into_iter() + .map(|(s, d)| (s, DocAddress::new(segment_ord, d))) + .collect() + } +} + +impl Collector for AllScoredCollector { + type Fruit = Vec<(Score, DocAddress)>; + type Child = AllScoredSegmentCollector; + + fn for_segment( + &self, + segment_ord: SegmentOrdinal, + _segment: &SegmentReader, + ) -> tantivy::Result { + Ok(AllScoredSegmentCollector { + segment_ord, + docs: Vec::new(), + }) + } + + fn requires_scoring(&self) -> bool { + true + } + + fn merge_fruits( + &self, + segment_fruits: Vec>, + ) -> tantivy::Result> { + Ok(segment_fruits.into_iter().flatten().collect()) + } +} + +// ============================ FFI surface ============================ + +/// Construct a streaming reader from a layout table + pread callbacks. +/// +/// The layout arrays (names / offsets / lengths) are produced by C++-side +/// `ParseArchiveHeader` after reading only the archive header bytes. Payload +/// bytes are fetched lazily through `callbacks.read_at` as tantivy reads. 
+/// +/// # Arguments +/// * `file_names` — array of `file_count` UTF-8 NUL-terminated C strings +/// * `file_offsets` / `file_lengths` — u64 arrays (archive-absolute offsets and lengths) +/// * `file_count` — number of entries in each of the three arrays +/// * `callbacks` — pread + release callbacks; `ctx` ownership transfers to Rust +/// * `mode_cstr` — tokenize mode ("mp"/"mix"/"full"/"query"; "hmm" → Unsupported) +/// * `with_position` — whether text field was indexed with positions +/// * `dict_dir_cstr` — paimon_jieba dictionary directory +/// * `out` — receives the reader handle on success +/// +/// # Safety +/// All pointer args must be valid for the duration of the call; ctx lifetime +/// extends until `callbacks.release` is invoked (when reader handle is freed). +#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_reader_new_streaming( + file_names: *const *const c_char, + file_offsets: *const u64, + file_lengths: *const u64, + file_count: usize, + callbacks: PaimonStreamCallbacks, + mode_cstr: *const c_char, + with_position: bool, + dict_dir_cstr: *const c_char, + out: *mut *mut PaimonTantivyReader, +) -> PaimonTantivyStatus { + if mode_cstr.is_null() || dict_dir_cstr.is_null() || out.is_null() { + set_last_error("paimon_tantivy_reader_new_streaming: null mandatory argument"); + // NOTE: we cannot call callbacks.release here because we don't know + // if the caller populated it yet. Caller must manage ctx on failure. 
+ return PaimonTantivyStatus::InvalidArgument; + } + if file_count > 0 + && (file_names.is_null() || file_offsets.is_null() || file_lengths.is_null()) + { + set_last_error("file_names/offsets/lengths must be non-null when file_count > 0"); + return PaimonTantivyStatus::InvalidArgument; + } + + let mode_str = match unsafe { CStr::from_ptr(mode_cstr) }.to_str() { + Ok(s) => s, + Err(e) => { + set_last_error(format!("mode not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let dict_dir = match unsafe { CStr::from_ptr(dict_dir_cstr) }.to_str() { + Ok(s) => s, + Err(e) => { + set_last_error(format!("dict_dir not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let mode = match TokenizeMode::parse(mode_str) { + Some(m) => m, + None => { + set_last_error(format!( + "unknown tokenize mode {mode_str:?}; expected mp/mix/full/query" + )); + return PaimonTantivyStatus::InvalidArgument; + } + }; + + // Copy the C string array into owned Rust entries so the directory doesn't + // depend on caller-supplied lifetime. + let mut entries: Vec<(String, u64, u64)> = Vec::with_capacity(file_count); + for i in 0..file_count { + let name_ptr = unsafe { *file_names.add(i) }; + if name_ptr.is_null() { + set_last_error(format!("file_names[{i}] is null")); + return PaimonTantivyStatus::InvalidArgument; + } + let name = match unsafe { CStr::from_ptr(name_ptr) }.to_str() { + Ok(s) => s.to_owned(), + Err(e) => { + set_last_error(format!("file_names[{i}] not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let offset = unsafe { *file_offsets.add(i) }; + let length = unsafe { *file_lengths.add(i) }; + entries.push((name, offset, length)); + } + + // Build callback directory (ctx ownership transfers here; release fires on drop). 
+ let directory = PaimonCallbackDirectory::new(entries, callbacks); + + match PaimonTantivyReader::new(directory, mode, with_position, Path::new(dict_dir)) { + Ok(r) => { + unsafe { *out = into_handle(r) }; + PaimonTantivyStatus::Ok + } + Err(e) => { + let unsupported = e.contains("'hmm' is not supported"); + let bad_format = e.contains("tantivy::Index::open") + || e.contains("missing 'text' field"); + set_last_error(e); + if unsupported { + PaimonTantivyStatus::Unsupported + } else if bad_format { + PaimonTantivyStatus::IndexFormatError + } else { + PaimonTantivyStatus::InternalError + } + } + } +} + +/// Run a query and emit results into `out`. +/// +/// Output bytes (little-endian): +/// `[u8 has_scores | u64 count | u64 row_ids[count] | optional f32 scores[count]]` +/// +/// `has_scores=1` iff `limit >= 0` (caller asked for scoring + limit). +/// +/// `limit < 0` ⇒ no limit, no scoring; sorted ascending by row_id. +/// `limit >= 0` ⇒ top-N by descending score (pre_filter applied first). +/// `pre_filter_bytes`: serialized croaring `Roaring64Map::write` (portable), +/// containing paimon **row_ids** (not tantivy doc_ids); null+0 = no filter. +/// +/// SAFETY: `reader` must be a live handle; `query` and `pre_filter_bytes` +/// may be null+0 or readable slices; `out` non-null. 
+#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_reader_search( + reader: *mut PaimonTantivyReader, + search_type: i32, + query: *const c_char, + query_len: usize, + with_score: bool, + limit: i32, + pre_filter_bytes: *const c_char, + pre_filter_len: usize, + min_score: f32, + out: *mut PaimonTantivyBuffer, +) -> PaimonTantivyStatus { + if out.is_null() { + set_last_error("reader_search: out is null"); + return PaimonTantivyStatus::InvalidArgument; + } + let Some(r) = (unsafe { borrow_handle_mut::(reader) }) else { + set_last_error("reader_search: null reader handle"); + return PaimonTantivyStatus::InvalidArgument; + }; + let st = match SearchType::from_i32(search_type) { + Some(s) => s, + None => { + set_last_error(format!("unknown search_type {search_type}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + if query.is_null() && query_len != 0 { + set_last_error("query is null but len > 0"); + return PaimonTantivyStatus::InvalidArgument; + } + let query_str = if query_len == 0 { + "" + } else { + let slice = unsafe { std::slice::from_raw_parts(query as *const u8, query_len) }; + match std::str::from_utf8(slice) { + Ok(s) => s, + Err(e) => { + set_last_error(format!("query not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + } + }; + + let pre_filter: Option = if pre_filter_bytes.is_null() && pre_filter_len == 0 { + None + } else if pre_filter_bytes.is_null() { + set_last_error("pre_filter_bytes is null but len > 0"); + return PaimonTantivyStatus::InvalidArgument; + } else { + let slice = unsafe { + std::slice::from_raw_parts(pre_filter_bytes as *const u8, pre_filter_len) + }; + match Treemap::try_deserialize::(slice) { + Some(t) => Some(t), + None => { + set_last_error(format!( + "pre_filter not a valid Roaring64Map portable serialization ({} bytes)", + pre_filter_len + )); + return PaimonTantivyStatus::InvalidArgument; + } + } + }; + + let limit_opt: Option = if limit < 0 { None } else { Some(limit as usize) }; + let 
min_score_opt: Option = if min_score > 0.0 { Some(min_score) } else { None }; + + match r.search_with_limit_and_filter(st, query_str, with_score, limit_opt, pre_filter.as_ref(), min_score_opt) + { + Ok(rows) => { + // v0.2: has_scores is decoupled from limit — it equals with_score directly. + let has_scores = with_score; + let count = rows.len() as u64; + // 1 byte has_scores + 8 bytes count + 8 bytes per row_id + optional 4 bytes per score + let mut buf = Vec::with_capacity( + 1 + 8 + rows.len() * 8 + if has_scores { rows.len() * 4 } else { 0 }, + ); + buf.push(if has_scores { 1u8 } else { 0u8 }); + buf.extend_from_slice(&count.to_le_bytes()); + for (id, _) in &rows { + buf.extend_from_slice(&id.to_le_bytes()); // u64 row_id LE + } + if has_scores { + for (_, score) in &rows { + let s = score.unwrap_or(0.0); + buf.extend_from_slice(&s.to_le_bytes()); + } + } + unsafe { *out = PaimonTantivyBuffer::from_vec(buf) }; + PaimonTantivyStatus::Ok + } + Err(e) => { + let parse_err = e.contains("RegexQuery from") + || e.contains("phrase query") + || e.contains("produced no tokens"); + set_last_error(e); + if parse_err { + PaimonTantivyStatus::QueryParseError + } else { + PaimonTantivyStatus::InternalError + } + } + } +} + +/// Destroy a reader handle. Safe on null. 
+#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_reader_free(reader: *mut PaimonTantivyReader) { + unsafe { free_handle(reader) }; +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::callback_directory::test_support::build_directory_from_archive; + use crate::writer::PaimonTantivyWriter; + use std::path::PathBuf; + + fn dict_dir() -> PathBuf { + std::env::var("PAIMON_JIEBA_DICT_DIR") + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from("/tmp/nonexistent-dict")) + } + + fn build(docs: &[&str]) -> Vec { + let mut w = PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir(), "paimon_jieba").unwrap(); + for (i, d) in docs.iter().enumerate() { + w.add(i as u64, d).unwrap(); + } + w.finish().unwrap().1 + } + + fn open(packed: &[u8]) -> PaimonTantivyReader { + // Simulate production flow: parse archive header → build layout → + // back PaimonCallbackDirectory with a mock pread that reads from the + // packed Vec. Once C++ `ParseArchiveHeader` (K3) is in place, prod + // uses the same PaimonCallbackDirectory path. 
+ let (dir, _backend) = build_directory_from_archive(packed.to_vec()); + PaimonTantivyReader::new(dir, TokenizeMode::Mix, true, &dict_dir()).unwrap() + } + + #[test] + fn match_all_single_term() { + let bytes = build(&["hello world", "hello there", "world peace"]); + let r = open(&bytes); + let ids = r.search_all(SearchType::MatchAll, "hello").unwrap(); + assert_eq!(ids, vec![0u64, 1]); + } + + #[test] + fn match_all_two_terms_intersection() { + let bytes = build(&["hello world", "hello there", "world peace"]); + let r = open(&bytes); + let ids = r.search_all(SearchType::MatchAll, "hello world").unwrap(); + assert_eq!(ids, vec![0u64]); + } + + #[test] + fn match_any_two_terms_union() { + let bytes = build(&["hello world", "hello there", "world peace"]); + let r = open(&bytes); + let ids = r.search_all(SearchType::MatchAny, "hello peace").unwrap(); + assert_eq!(ids, vec![0u64, 1, 2]); + } + + #[test] + fn phrase_only_consecutive() { + let bytes = build(&["hello world there", "world hello there"]); + let r = open(&bytes); + let ids = r.search_all(SearchType::Phrase, "hello world").unwrap(); + assert_eq!(ids, vec![0u64]); + } + + #[test] + fn prefix_matches_indexed_terms() { + let bytes = build(&["unordered user-defined doc id"]); + let r = open(&bytes); + let ids = r.search_all(SearchType::Prefix, "unorder").unwrap(); + assert_eq!(ids, vec![0u64]); + } + + #[test] + fn wildcard_with_star() { + let bytes = build(&["unordered", "ordered", "border"]); + let r = open(&bytes); + let ids = r.search_all(SearchType::Wildcard, "*order*").unwrap(); + assert_eq!(ids, vec![0u64, 1, 2]); + } + + #[test] + fn empty_query_for_match_returns_query_parse_error() { + let bytes = build(&["hello"]); + let r = open(&bytes); + let err = r.search_all(SearchType::MatchAll, "").unwrap_err(); + assert!(err.contains("no tokens"), "got: {err}"); + } + + #[test] + fn wildcard_helper_escapes_dots() { + assert_eq!(wildcard_to_regex("a*b"), "a.*b"); + assert_eq!(wildcard_to_regex("a?b"), "a.b"); + 
assert_eq!(wildcard_to_regex("a.b"), r"a\.b"); + assert_eq!(wildcard_to_regex("*a*"), ".*a.*"); + } + + // ----- limit + pre_filter + scoring (B1: row_id-based) ----- + + #[test] + fn limit_returns_top_n_with_scores() { + let bytes = build(&[ + "doc", // 0: low score (1 occurrence) + "doc doc doc doc doc", // 1: high score (5 occurrences) + "doc doc", // 2: medium score + ]); + let r = open(&bytes); + let rows = r + .search_with_limit_and_filter(SearchType::MatchAll, "doc", true, Some(2), None, None) + .unwrap(); + assert_eq!(rows.len(), 2); + // doc 1 has highest TF, expect first + assert_eq!(rows[0].0, 1u64); + assert!(rows[0].1.is_some()); + assert!(rows[1].1.is_some()); + // Scores monotonically decreasing + assert!(rows[0].1.unwrap() >= rows[1].1.unwrap()); + } + + #[test] + fn no_limit_returns_all_unscored() { + let bytes = build(&["hello world", "world hello", "world peace"]); + let r = open(&bytes); + let rows = r + .search_with_limit_and_filter(SearchType::MatchAll, "world", false, None, None, None) + .unwrap(); + let ids: Vec = rows.iter().map(|(id, _)| *id).collect(); + assert_eq!(ids, vec![0u64, 1, 2]); + assert!(rows.iter().all(|(_, s)| s.is_none())); + } + + #[test] + fn pre_filter_no_limit_intersects() { + let bytes = build(&["alpha beta", "alpha gamma", "beta gamma"]); + let r = open(&bytes); + // pre_filter = {0, 2}; query "alpha" matches {0, 1}; expect intersection {0} + let mut tm = Treemap::new(); + tm.add(0); + tm.add(2); + let rows = r + .search_with_limit_and_filter(SearchType::MatchAll, "alpha", false, None, Some(&tm), None) + .unwrap(); + let ids: Vec = rows.iter().map(|(id, _)| *id).collect(); + assert_eq!(ids, vec![0u64]); + } + + #[test] + fn pre_filter_with_limit_filters_before_topn() { + // doc 0 has highest TF for "doc" but is NOT in pre_filter → must NOT + // be in result, even with limit=1. 
+ let bytes = build(&[ + "doc doc doc doc doc", // 0: highest TF, but excluded + "doc doc", // 1: medium TF, included + "doc", // 2: low TF, excluded + ]); + let r = open(&bytes); + let mut tm = Treemap::new(); + tm.add(1); // only doc 1 passes pre_filter + let rows = r + .search_with_limit_and_filter(SearchType::MatchAll, "doc", true, Some(10), Some(&tm), None) + .unwrap(); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0].0, 1u64); + } + + #[test] + fn unscored_limit_with_pre_filter_applies_filter_before_truncate() { + // Regression (review finding #1): with_score=false + limit=N + pre_filter + // must apply the filter to the FULL match set before truncating to N. + // All three docs match "doc" but only row_id 2 (the LAST one) passes the + // pre_filter; a truncate-before-filter impl (LimitedDocSetCollector that + // stops at N raw matches, then filters) would collect doc 0, filter it + // out, and wrongly return empty instead of {2}. + let bytes = build(&["doc", "doc", "doc"]); + let r = open(&bytes); + let mut tm = Treemap::new(); + tm.add(2); // only row_id 2 passes the pre_filter + let rows = r + .search_with_limit_and_filter(SearchType::MatchAll, "doc", false, Some(1), Some(&tm), None) + .unwrap(); + let ids: Vec = rows.iter().map(|(id, _)| *id).collect(); + assert_eq!(ids, vec![2u64], "pre_filter must be applied before LIMIT truncation"); + assert!(rows.iter().all(|(_, s)| s.is_none())); + } + + #[test] + fn empty_pre_filter_returns_empty() { + let bytes = build(&["alpha", "beta"]); + let r = open(&bytes); + let tm = Treemap::new(); // empty + let rows = r + .search_with_limit_and_filter(SearchType::MatchAll, "alpha", false, None, Some(&tm), None) + .unwrap(); + assert!(rows.is_empty()); + } + + #[test] + fn limit_zero_returns_empty_without_running_query() { + let bytes = build(&["alpha", "beta"]); + let r = open(&bytes); + let rows = r + .search_with_limit_and_filter(SearchType::MatchAll, "alpha", true, Some(0), None, None) + .unwrap(); + 
assert!(rows.is_empty()); + } + + // ----- B1: row_id is independent of doc_id ----- + + #[test] + fn pre_filter_uses_row_id_not_doc_id() { + // Build with non-contiguous row_ids so doc_id ≠ row_id. Then verify + // pre_filter operates on row_id values, not internal tantivy doc_ids. + let mut w = PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir(), "paimon_jieba").unwrap(); + w.add(100, "alpha").unwrap(); + w.add(200, "alpha").unwrap(); + w.add(300, "alpha").unwrap(); + let bytes = w.finish().unwrap().1; + let r = open(&bytes); + + // pre_filter = {200} as row_id (doc_id would be 1) + let mut tm = Treemap::new(); + tm.add(200); + let rows = r + .search_with_limit_and_filter(SearchType::MatchAll, "alpha", false, None, Some(&tm), None) + .unwrap(); + let ids: Vec = rows.iter().map(|(id, _)| *id).collect(); + assert_eq!(ids, vec![200u64], "pre_filter must operate on row_id, not doc_id"); + } + + #[test] + fn search_returns_caller_supplied_row_ids() { + // Same setup: row_ids 100/200/300, verify search_all returns those values. + let mut w = PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir(), "paimon_jieba").unwrap(); + w.add(100, "doc").unwrap(); + w.add(200, "doc").unwrap(); + w.add(300, "doc").unwrap(); + let bytes = w.finish().unwrap().1; + let r = open(&bytes); + let ids = r.search_all(SearchType::MatchAll, "doc").unwrap(); + assert_eq!(ids, vec![100u64, 200, 300]); + } + + #[test] + fn tokenizer_name_reflects_paimon_jieba_schema_for_cpp_written_index() { + // cpp-written index: PaimonTantivyWriter binds the text field to + // `paimon_jieba`. Reader must pick that up from meta.json (not hardcode). + let bytes = build(&["hello world"]); + let r = open(&bytes); + assert_eq!(r.tokenizer_name(), PAIMON_TOKENIZER_NAME); + + // tokenize sanity: jieba mode="mix" picks `hello` + `world` from ASCII. 
+ let q = r.tokenize_query("hello world"); + assert_eq!(q, vec!["hello".to_string(), "world".to_string()]); + } + + #[test] + fn tokenizer_name_reflects_default_schema_for_externally_written_index() { + // Simulate a paimon-java-shaped index: text field bound to the + // builtin `default` tokenizer (SimpleTokenizer + LowerCaser), not jieba. + // Build it directly via tantivy (bypassing PaimonTantivyWriter's jieba + // schema) so we can prove the reader auto-switches to the builtin. + use crate::callback_directory::test_support::build_mock_directory; + use tantivy::directory::Directory; + use tantivy::schema::{IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions}; + use tantivy::{doc, Index}; + + // Build a minimal index with field "text" bound to "default". + let mut sb = Schema::builder(); + let row_id_f = sb.add_u64_field( + "row_id", + NumericOptions::default().set_stored().set_indexed().set_fast(), + ); + let text_opts = TextOptions::default().set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("default") // ← key: match paimon-java's TEXT default + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ); + let text_f = sb.add_text_field("text", text_opts); + let schema = sb.build(); + let tmp = tempfile::Builder::new() + .prefix("paimon-tantivy-dyn-tk-") + .tempdir() + .unwrap(); + let index = Index::create_in_dir(tmp.path(), schema).unwrap(); + let mut writer = index.writer(15_000_000).unwrap(); + writer + .add_document(doc!(row_id_f => 0u64, text_f => "Hello World")) + .unwrap(); + writer + .add_document(doc!(row_id_f => 1u64, text_f => "Apple.Banana")) + .unwrap(); + writer.commit().unwrap(); + writer.wait_merging_threads().unwrap(); + + // Pack the index dir into our archive format so the callback directory + // can serve it. Reuse writer.rs's format by streaming entries manually. 
+ let mut data = Vec::new(); + let mut entries = Vec::<(String, u64, u64)>::new(); + let dir_iter = std::fs::read_dir(tmp.path()).unwrap(); + let mut files: Vec<_> = dir_iter + .filter_map(|e| e.ok()) + .filter(|e| e.file_type().ok().map_or(false, |t| t.is_file())) + .filter(|e| !e.file_name().to_string_lossy().starts_with('.')) + .collect(); + files.sort_by_key(|e| e.file_name()); + data.extend_from_slice(&(files.len() as i32).to_be_bytes()); + for e in &files { + let name = e.file_name().to_string_lossy().into_owned(); + let bytes = std::fs::read(e.path()).unwrap(); + data.extend_from_slice(&(name.len() as i32).to_be_bytes()); + data.extend_from_slice(name.as_bytes()); + data.extend_from_slice(&(bytes.len() as i64).to_be_bytes()); + let off = data.len() as u64; + data.extend_from_slice(&bytes); + entries.push((name, off, bytes.len() as u64)); + } + + let (dir, _backend) = build_mock_directory(data, entries); + let r = PaimonTantivyReader::new(dir, TokenizeMode::Mix, true, &dict_dir()).unwrap(); + + // Reader must pick up `default` from schema, not hardcode `paimon_jieba`. 
+ assert_eq!(r.tokenizer_name(), "default"); + + // Query tokenization now goes through tantivy's builtin default + // (SimpleTokenizer + LowerCaser): + // "Apple.Banana" → ["apple", "banana"] (dot is non-alnum, split) + // "Hello World" → ["hello", "world"] (space split + lowercase) + let q1 = r.tokenize_query("Hello World"); + assert_eq!(q1, vec!["hello".to_string(), "world".to_string()]); + let q2 = r.tokenize_query("Apple.Banana"); + assert_eq!(q2, vec!["apple".to_string(), "banana".to_string()]); + + // And the search path works across tokenizer: + let ids = r.search_all(SearchType::MatchAll, "hello").unwrap(); + assert_eq!(ids, vec![0u64]); + let ids = r.search_all(SearchType::MatchAll, "apple").unwrap(); + assert_eq!(ids, vec![1u64]); + } + + #[test] + fn reader_aggregates_row_ids_across_segments() { + // Multi-thread default writer + many docs => may produce multiple + // segments before force-merge. After finish(), force-merge collapses + // to one segment, but this test still validates the row_id retrieval + // path works for ≥1 segment. + let mut w = PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir(), "paimon_jieba").unwrap(); + for i in 0..200u64 { + w.add(i * 7, &format!("docmark_{i} apple")).unwrap(); + } + let bytes = w.finish().unwrap().1; + let r = open(&bytes); + let ids = r.search_all(SearchType::MatchAll, "apple").unwrap(); + assert_eq!(ids.len(), 200); + for i in 0..200u64 { + assert!(ids.contains(&(i * 7)), "missing row_id={}", i * 7); + } + } +} diff --git a/third_party/tantivy_ffi/src/tokenizer.rs b/third_party/tantivy_ffi/src/tokenizer.rs new file mode 100644 index 000000000..2ab4c5e96 --- /dev/null +++ b/third_party/tantivy_ffi/src/tokenizer.rs @@ -0,0 +1,447 @@ +//! PaimonJiebaTokenizer: tantivy Tokenizer impl wrapping jieba-rs. +//! +//! Contract (see docs/dev/tantivy_ffi_design.md §4.2 and migration plan Stage 3): +//! - Behavior-equivalent with `JiebaAnalyzer` in src/paimon/global_index/lucene/ +//! 
- 5 modes: mp / hmm / mix / full / query +//! - `hmm` is Unsupported (jieba-rs has no standalone HMM entry point) +//! - `mp` accepts cut(hmm=false) but does not replicate cppjieba's +//! max_word_len truncation (docs/dev/tantivy_ffi_design.md §9.3 entry) +//! - Normalize: skip pure whitespace, skip stop_words, lowercase ASCII-only tokens +//! - Token offsets: byte offsets into the original UTF-8 string +//! - `with_position=false`: all tokens emitted at `position=0` (disables PhraseQuery) +//! - Custom dict dir: loads `jieba.dict.utf8` (+optional `user.dict.utf8`) from +//! `$PAIMON_JIEBA_DICT_DIR`; stop_words.utf8 loaded if present + +use std::collections::HashSet; +use std::ffi::{c_char, CStr}; +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::path::Path; +use std::sync::Arc; + +use jieba_rs::Jieba; +use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; + +use crate::buffer::PaimonTantivyBuffer; +use crate::error::{set_last_error, PaimonTantivyStatus}; +use crate::handle::{borrow_handle, free_handle, into_handle}; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum TokenizeMode { + Mp, + Hmm, + Mix, + Full, + Query, +} + +impl TokenizeMode { + pub(crate) fn parse(s: &str) -> Option { + match s { + "mp" => Some(Self::Mp), + "hmm" => Some(Self::Hmm), + "mix" => Some(Self::Mix), + "full" => Some(Self::Full), + "query" => Some(Self::Query), + _ => None, + } + } +} + +#[derive(Clone)] +pub struct PaimonJiebaTokenizer { + jieba: Arc, + mode: TokenizeMode, + with_position: bool, + stop_words: Arc>, +} + +impl PaimonJiebaTokenizer { + pub fn new( + dict_dir: &Path, + mode: TokenizeMode, + with_position: bool, + ) -> Result { + if mode == TokenizeMode::Hmm { + return Err( + "tokenize mode 'hmm' is not supported (jieba-rs does not expose standalone HMM)" + .into(), + ); + } + let jieba = load_jieba(dict_dir)?; + let stop_words = load_stop_words(dict_dir); + Ok(Self { + jieba: Arc::new(jieba), + mode, + with_position, + stop_words: 
Arc::new(stop_words), + }) + } + + /// Directly tokenize, returning a Vec of (offset_start, offset_end, text) tuples. + /// Used both by the tantivy Tokenizer impl and the standalone `tokenize` FFI. + pub fn tokenize_raw(&self, text: &str) -> Vec<(usize, usize, String)> { + // Use jieba-rs's cut variants which return Vec<&'a str>; compute byte offsets + // via pointer arithmetic (each &str is a slice of the original). + let cuts: Vec<&str> = match self.mode { + TokenizeMode::Mp => self.jieba.cut(text, false), + TokenizeMode::Hmm => Vec::new(), // unreachable (caught in new()) + TokenizeMode::Mix => self.jieba.cut(text, true), + TokenizeMode::Full => self.jieba.cut_all(text), + TokenizeMode::Query => self.jieba.cut_for_search(text, true), + }; + + let text_start = text.as_ptr() as usize; + let mut out = Vec::with_capacity(cuts.len()); + for piece in cuts { + // skip pure whitespace + if piece.chars().all(char::is_whitespace) { + continue; + } + // skip stop words (compare original case) + if self.stop_words.contains(piece) { + continue; + } + // offset calc + let start = piece.as_ptr() as usize - text_start; + let end = start + piece.len(); + // lowercase only if pure ASCII alphanumeric (match cppjieba Normalize behavior) + let token_text = if is_ascii_alnum(piece) { + piece.to_ascii_lowercase() + } else { + piece.to_string() + }; + out.push((start, end, token_text)); + } + out + } +} + +fn is_ascii_alnum(s: &str) -> bool { + !s.is_empty() && s.bytes().all(|b| b.is_ascii_alphanumeric()) +} + +fn load_jieba(dict_dir: &Path) -> Result { + let main_dict = dict_dir.join("jieba.dict.utf8"); + let mut jieba = if main_dict.exists() { + let file = File::open(&main_dict) + .map_err(|e| format!("open {}: {e}", main_dict.display()))?; + let mut rdr = BufReader::new(file); + Jieba::with_dict(&mut rdr).map_err(|e| format!("load jieba dict: {e:?}"))? + } else { + // No custom dict; use jieba-rs builtin + Jieba::new() + }; + // Optional user dict. 
cppjieba's user.dict.utf8 is lenient: lines are + // `word [freq] [tag]` where freq can be omitted (e.g. "蓝翔 nz"), but + // jieba-rs's load_dict strictly requires `word freq [tag]` and fails if + // freq is not an integer. We parse line-by-line with `add_word` to stay + // compatible. + let user_dict = dict_dir.join("user.dict.utf8"); + if user_dict.exists() { + let file = File::open(&user_dict) + .map_err(|e| format!("open {}: {e}", user_dict.display()))?; + for (n, line_res) in BufReader::new(file).lines().enumerate() { + let line = match line_res { + Ok(l) => l, + Err(_) => continue, // skip unreadable lines + }; + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') { + continue; + } + let mut it = trimmed.split_whitespace(); + let word = it.next().unwrap(); // non-empty guaranteed + let next = it.next(); + let freq = next.and_then(|s| s.parse::().ok()); + let tag = match (freq, next) { + (Some(_), _) => it.next(), // [tag] + (None, tok) => tok, // (no freq) + }; + // `add_word` returns the assigned frequency; ignore it. For lines + // with bogus content we silently keep going, matching cppjieba's + // tolerant behavior. 
+ let _ = jieba.add_word(word, freq, tag); + let _ = n; // keep for potential debug + } + } + Ok(jieba) +} + +fn load_stop_words(dict_dir: &Path) -> HashSet { + let path = dict_dir.join("stop_words.utf8"); + let mut out = HashSet::new(); + if let Ok(f) = File::open(&path) { + for line in BufReader::new(f).lines().map_while(Result::ok) { + let w = line.trim(); + if !w.is_empty() { + out.insert(w.to_owned()); + } + } + } + out +} + +// ----------------- tantivy Tokenizer integration ----------------- + +pub struct PaimonJiebaTokenStream { + tokens: Vec, + index: usize, +} + +impl TokenStream for PaimonJiebaTokenStream { + fn advance(&mut self) -> bool { + self.index += 1; + self.index <= self.tokens.len() + } + + fn token(&self) -> &Token { + &self.tokens[self.index - 1] + } + + fn token_mut(&mut self) -> &mut Token { + &mut self.tokens[self.index - 1] + } +} + +impl Tokenizer for PaimonJiebaTokenizer { + type TokenStream<'a> = PaimonJiebaTokenStream; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + let raw = self.tokenize_raw(text); + let tokens: Vec = raw + .into_iter() + .enumerate() + .map(|(i, (s, e, t))| Token { + offset_from: s, + offset_to: e, + position: if self.with_position { i } else { 0 }, + text: t, + position_length: 1, + }) + .collect(); + PaimonJiebaTokenStream { tokens, index: 0 } + } +} + +// ----------------- FFI surface ----------------- + +/// Create a tokenizer handle. Returns OK and writes *out on success; returns +/// status and sets last_error on failure. +/// +/// SAFETY: `mode_cstr` and `dict_dir_cstr` must be NUL-terminated UTF-8; +/// `out` must be a valid non-null pointer. 
+#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_tokenizer_new( + mode_cstr: *const c_char, + with_position: bool, + dict_dir_cstr: *const c_char, + out: *mut *mut PaimonJiebaTokenizer, +) -> PaimonTantivyStatus { + if mode_cstr.is_null() || dict_dir_cstr.is_null() || out.is_null() { + set_last_error("paimon_tantivy_tokenizer_new: null argument"); + return PaimonTantivyStatus::InvalidArgument; + } + let mode_s = match unsafe { CStr::from_ptr(mode_cstr) }.to_str() { + Ok(s) => s, + Err(e) => { + set_last_error(format!("mode not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let dict_s = match unsafe { CStr::from_ptr(dict_dir_cstr) }.to_str() { + Ok(s) => s, + Err(e) => { + set_last_error(format!("dict_dir not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let mode = match TokenizeMode::parse(mode_s) { + Some(m) => m, + None => { + set_last_error(format!( + "unknown tokenize mode {mode_s:?}; expected one of mp/hmm/mix/full/query" + )); + return PaimonTantivyStatus::InvalidArgument; + } + }; + match PaimonJiebaTokenizer::new(Path::new(dict_s), mode, with_position) { + Ok(t) => { + unsafe { *out = into_handle(t) }; + PaimonTantivyStatus::Ok + } + Err(e) => { + let is_hmm_unsupported = e.contains("'hmm' is not supported"); + set_last_error(e); + if is_hmm_unsupported { + PaimonTantivyStatus::Unsupported + } else { + PaimonTantivyStatus::TokenizerError + } + } + } +} + +/// Free a tokenizer handle. Safe on null. +#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_tokenizer_free(tok: *mut PaimonJiebaTokenizer) { + unsafe { free_handle(tok) }; +} + +/// Tokenize a string and return a newline-delimited list of tokens as bytes. +/// Used for Stage 3 golden-sample tests (easy to diff from C++). +/// +/// Output format: +/// `\t\t\t\n` for each token. +/// +/// SAFETY: `tok` must be a valid handle; `text` must point to `text_len` UTF-8 bytes; +/// `out` must be non-null. 
+#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_tokenizer_tokenize( + tok: *const PaimonJiebaTokenizer, + text: *const c_char, + text_len: usize, + out: *mut PaimonTantivyBuffer, +) -> PaimonTantivyStatus { + if out.is_null() { + set_last_error("paimon_tantivy_tokenizer_tokenize: out is null"); + return PaimonTantivyStatus::InvalidArgument; + } + let Some(tokenizer) = (unsafe { borrow_handle::(tok) }) else { + set_last_error("paimon_tantivy_tokenizer_tokenize: null tokenizer handle"); + return PaimonTantivyStatus::InvalidArgument; + }; + if text.is_null() && text_len != 0 { + set_last_error("text is null but len > 0"); + return PaimonTantivyStatus::InvalidArgument; + } + let text_str = if text_len == 0 { + "" + } else { + let slice = unsafe { std::slice::from_raw_parts(text as *const u8, text_len) }; + match std::str::from_utf8(slice) { + Ok(s) => s, + Err(e) => { + set_last_error(format!("text not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + } + }; + let raw = tokenizer.tokenize_raw(text_str); + let mut buf = String::new(); + for (i, (s, e, t)) in raw.iter().enumerate() { + let pos = if tokenizer.with_position { i } else { 0 }; + buf.push_str(&format!("{s}\t{e}\t{pos}\t{t}\n")); + } + let bytes = buf.into_bytes(); + unsafe { *out = PaimonTantivyBuffer::from_vec(bytes) }; + PaimonTantivyStatus::Ok +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::CString; + + fn dict_dir_from_env() -> std::path::PathBuf { + std::env::var("PAIMON_JIEBA_DICT_DIR") + .map(std::path::PathBuf::from) + .unwrap_or_else(|_| std::path::PathBuf::from("/tmp/nonexistent-dict")) + } + + #[test] + fn mode_parse() { + for (s, m) in [ + ("mp", TokenizeMode::Mp), + ("hmm", TokenizeMode::Hmm), + ("mix", TokenizeMode::Mix), + ("full", TokenizeMode::Full), + ("query", TokenizeMode::Query), + ] { + assert_eq!(TokenizeMode::parse(s), Some(m)); + } + assert!(TokenizeMode::parse("bogus").is_none()); + } + + #[test] + fn hmm_mode_returns_unsupported() { + let tok 
= PaimonJiebaTokenizer::new( + &dict_dir_from_env(), + TokenizeMode::Hmm, + true, + ); + match tok { + Err(e) => assert!(e.contains("'hmm' is not supported"), "got: {e}"), + Ok(_) => panic!("expected Err"), + } + } + + #[test] + fn tokenize_mix_default_dict_smoke() { + // If no custom dict dir, jieba-rs builtin is used. + let t = PaimonJiebaTokenizer::new(Path::new("/tmp/nonexistent-dict"), TokenizeMode::Mix, true) + .unwrap(); + let raw = t.tokenize_raw("他来到了网易杭研大厦"); + let texts: Vec<&str> = raw.iter().map(|(_, _, s)| s.as_str()).collect(); + assert!(texts.contains(&"网易")); + assert!(texts.contains(&"大厦")); + } + + #[test] + fn ascii_alnum_is_lowercased() { + let t = PaimonJiebaTokenizer::new(Path::new("/tmp/nx"), TokenizeMode::Mix, true).unwrap(); + let raw = t.tokenize_raw("Hello World 中国"); + let texts: Vec<&str> = raw.iter().map(|(_, _, s)| s.as_str()).collect(); + assert!(texts.contains(&"hello")); + assert!(texts.contains(&"world")); + assert!(texts.contains(&"中国")); + } + + #[test] + fn with_position_false_emits_zero_position() { + let t = PaimonJiebaTokenizer::new(Path::new("/tmp/nx"), TokenizeMode::Mix, false).unwrap(); + let raw = t.tokenize_raw("中国人"); + // Can't check position on raw tuples; check via tantivy Token stream: + let mut t2 = t.clone(); + let mut stream = ::token_stream(&mut t2, "中国人"); + let mut positions = Vec::new(); + while stream.advance() { + positions.push(stream.token().position); + } + assert!(!raw.is_empty()); + assert!(positions.iter().all(|&p| p == 0)); + } + + #[test] + fn ffi_roundtrip() { + let dict = dict_dir_from_env(); + let dict_str = dict.to_str().unwrap(); + let mode = CString::new("mix").unwrap(); + let dict_c = CString::new(dict_str).unwrap(); + let mut handle: *mut PaimonJiebaTokenizer = std::ptr::null_mut(); + unsafe { + let st = paimon_tantivy_tokenizer_new( + mode.as_ptr(), + true, + dict_c.as_ptr(), + &mut handle, + ); + assert_eq!(st, PaimonTantivyStatus::Ok); + assert!(!handle.is_null()); + + let input = 
"Hello 中国"; + let input_c = CString::new(input).unwrap(); + let mut buf = PaimonTantivyBuffer::empty(); + let st2 = paimon_tantivy_tokenizer_tokenize( + handle, + input_c.as_ptr(), + input.len(), + &mut buf, + ); + assert_eq!(st2, PaimonTantivyStatus::Ok); + assert!(buf.len > 0); + crate::buffer::paimon_tantivy_buffer_free(&mut buf); + paimon_tantivy_tokenizer_free(handle); + } + } +} diff --git a/third_party/tantivy_ffi/src/writer.rs b/third_party/tantivy_ffi/src/writer.rs new file mode 100644 index 000000000..291408ef6 --- /dev/null +++ b/third_party/tantivy_ffi/src/writer.rs @@ -0,0 +1,769 @@ +//! PaimonTantivyWriter: Writer for tantivy-fulltext global index. +//! +//! Contract (see docs/dev/tantivy_java_compat_plan.md §2.5 + §5.1 J2): +//! - `writer_new(field_name, mode, with_position, dict_dir, out)` — create on a +//! private tmp dir backed by MmapDirectory + PaimonJiebaTokenizer. +//! `field_name` is **ignored** by the Rust schema (kept for FFI ABI +//! compatibility); schema field names are fixed (`row_id`, `text`) to match +//! paimon-java `paimon-tantivy-jni/rust/src/lib.rs:55-66`. +//! - `writer_add(writer, row_id, text, len)` — add a single document with the +//! caller-supplied `row_id` (u64) and a TEXT field +//! - `writer_finish(writer, out_row_count, out_buf)` — commit + force-merge to +//! single segment + pack all on-disk index files into a Rust-allocated buffer +//! - `writer_free(writer)` — destroy (RAII removes tmp dir) +//! +//! Packing format (big-endian, **cross-readable with paimon-java archive**; +//! see `paimon-tantivy-index/README.md` §Archive File Format): +//! `[i32 BE file_count | +//! 
(i32 BE name_len | name_bytes | i64 BE file_len | file_bytes)*]` + +use std::ffi::{c_char, c_void, CStr}; +use std::fs::File; +use std::io::Read; +use std::path::{Path, PathBuf}; + +use tantivy::schema::{ + Field, IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions, +}; +use tantivy::{doc, Index, IndexWriter, TantivyDocument}; +use tempfile::TempDir; + +use crate::error::{set_last_error, PaimonTantivyStatus}; +use crate::handle::{borrow_handle_mut, free_handle, into_handle}; +use crate::tokenizer::{PaimonJiebaTokenizer, TokenizeMode}; + +/// Schema field names. Fixed to match paimon-java's tantivy schema so that +/// indexes are cross-readable. Both fields are required. +pub const PAIMON_ROW_ID_FIELD_NAME: &str = "row_id"; +pub const PAIMON_TEXT_FIELD_NAME: &str = "text"; + +/// Name registered with the tantivy `TokenizerManager`. Reader must register +/// the same name to make stored term dictionaries readable. +pub const PAIMON_TOKENIZER_NAME: &str = "paimon_jieba"; + +/// Heap budget for the in-process IndexWriter (50 MB; tantivy minimum is ~3 MB). +/// Default multi-threaded writer (`Index::writer(heap)`) splits this budget +/// across `min(num_cpus, MAX_NUM_THREAD=8)` worker threads. +const WRITER_HEAP_SIZE: usize = 50_000_000; + +pub struct PaimonTantivyWriter { + /// Owned tmp dir; cleaned up when this struct drops. + tmpdir: TempDir, + /// `row_id` u64 field (stored + indexed + fast). Reader retrieves the + /// caller-supplied row_id via `fast_fields().u64("row_id").first(doc_id)`. + row_id_field: Field, + /// `text` TEXT field tokenized via the registered jieba tokenizer. + text_field: Field, + /// tantivy index instance, file-backed in `tmpdir`. + index: Index, + /// Active writer; consumed by `wait_merging_threads()` in `finish`. + writer: Option, + /// Documents added since construction. 
+ row_count: i64, +} + +impl PaimonTantivyWriter { + pub fn new( + field_name: &str, + mode: TokenizeMode, + with_position: bool, + dict_dir: &Path, + tokenizer_name: &str, + ) -> Result { + if field_name.is_empty() { + return Err("field_name must be non-empty".into()); + } + // Schema is fixed to match paimon-java (decision B1): row_id (u64 + // stored+indexed+fast) + text (TEXT). The caller-supplied `field_name` + // parameter is currently ignored by the Rust schema (kept for FFI + // backward-compatibility); the C++ side still uses it to extract the + // right column from arrow batches. + let _ = field_name; // intentionally unused on the Rust side + let mut schema_builder = Schema::builder(); + let row_id_field = schema_builder.add_u64_field( + PAIMON_ROW_ID_FIELD_NAME, + NumericOptions::default() + .set_stored() + .set_indexed() + .set_fast(), + ); + let index_option = if with_position { + IndexRecordOption::WithFreqsAndPositions + } else { + IndexRecordOption::Basic + }; + // Empty input falls back to tantivy's built-in "default" (SimpleTokenizer), + // matching the cpp-side default in `tantivy_defs.h::kDefaultTantivyWriteTokenizer`. + // Cross-read with paimon-java works out of the box; CJK callers must + // pass "paimon_jieba" explicitly. 
+ let effective_tokenizer = if tokenizer_name.is_empty() { + "default" + } else { + tokenizer_name + }; + let text_options = TextOptions::default().set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer(effective_tokenizer) + .set_index_option(index_option), + ); + let text_field = schema_builder.add_text_field(PAIMON_TEXT_FIELD_NAME, text_options); + let schema = schema_builder.build(); + + let tmpdir = tempfile::Builder::new() + .prefix("paimon-tantivy-") + .tempdir() + .map_err(|e| format!("create tmp dir: {e}"))?; + + let index = Index::create_in_dir(tmpdir.path(), schema) + .map_err(|e| format!("create tantivy index: {e}"))?; + // When caller picks "paimon_jieba" we construct + register the jieba + // tokenizer. For any tantivy built-in name ("default", "whitespace", + // "raw", "en_stem", ...) tantivy's TokenizerManager already has it + // registered via `TokenizerManager::default()`; no-op here. This lets + // paimon-cpp emit archives cross-readable by paimon-java's default + // TEXT tokenizer path. + if effective_tokenizer == PAIMON_TOKENIZER_NAME { + let tokenizer = PaimonJiebaTokenizer::new(dict_dir, mode, with_position) + .map_err(|e| format!("create tokenizer: {e}"))?; + index + .tokenizers() + .register(PAIMON_TOKENIZER_NAME, tokenizer); + } + + // Default multi-threaded writer (B1 schema stores row_id explicitly so + // we no longer need single-threaded ordering invariants). tantivy will + // use min(num_cpus, MAX_NUM_THREAD=8) workers, splitting heap budget. 
+ let writer: IndexWriter = index + .writer(WRITER_HEAP_SIZE) + .map_err(|e| format!("create index writer: {e}"))?; + + Ok(Self { + tmpdir, + row_id_field, + text_field, + index, + writer: Some(writer), + row_count: 0, + }) + } + + pub fn add(&mut self, row_id: u64, text: &str) -> Result<(), String> { + let writer = self + .writer + .as_mut() + .ok_or_else(|| "writer already finished".to_string())?; + let document: TantivyDocument = doc!( + self.row_id_field => row_id, + self.text_field => text, + ); + writer + .add_document(document) + .map_err(|e| format!("add document: {e}"))?; + self.row_count += 1; + Ok(()) + } + + /// Commit + force-merge + GC on-disk index. Extracted from `finish_*` + /// so both streaming and test paths can share it. + fn commit_and_merge(&mut self) -> Result<(), String> { + let mut writer = self + .writer + .take() + .ok_or_else(|| "writer already finished".to_string())?; + writer.commit().map_err(|e| format!("commit: {e}"))?; + + let segment_metas = self + .index + .searchable_segment_metas() + .map_err(|e| format!("list segments: {e}"))?; + if segment_metas.len() > 1 { + let segment_ids: Vec<_> = segment_metas.iter().map(|m| m.id()).collect(); + writer + .merge(&segment_ids) + .wait() + .map_err(|e| format!("merge: {e}"))?; + } + writer + .garbage_collect_files() + .wait() + .map_err(|e| format!("garbage_collect_files: {e}"))?; + writer + .wait_merging_threads() + .map_err(|e| format!("wait_merging_threads: {e}"))?; + Ok(()) + } + + /// Streaming finish (W1 production path): commit + force-merge + push + /// archive bytes through the FFI callback in 64KB chunks. Peak RAM + /// independent of archive size — one stack buffer + a few KB metadata. + pub fn finish_streaming( + &mut self, + cb: &PaimonWriteCallbacks, + ) -> Result { + self.commit_and_merge()?; + let ctx = cb.ctx; + let write_fn = cb.write; + pack_index_dir_stream(self.tmpdir.path(), |bytes| { + // Calling extern "C" fn pointer is safe; C++ side owns ctx validity. 
+ let rc = (write_fn)(ctx, bytes.as_ptr(), bytes.len()); + if rc != 0 { + return Err(format!("write callback rc={rc} len={}", bytes.len())); + } + Ok(()) + })?; + Ok(self.row_count) + } + + /// Test-only convenience: collect streaming output into a `Vec`. + /// Rust unit tests / integration tests use this; production path is + /// `finish_streaming`. + #[cfg(test)] + pub(crate) fn finish(&mut self) -> Result<(i64, Vec), String> { + self.commit_and_merge()?; + let mut out: Vec = Vec::new(); + pack_index_dir_stream(self.tmpdir.path(), |bytes| { + out.extend_from_slice(bytes); + Ok(()) + })?; + Ok((self.row_count, out)) + } + + #[cfg(test)] + pub(crate) fn tmpdir_path(&self) -> &Path { + self.tmpdir.path() + } +} + +// ========================================================================= +// Streaming pack (W1) +// ========================================================================= + +/// Streaming pack buffer size. Bigger than Java packIndex's 8KB for throughput, +/// still far below any archive size we care about. +const WRITER_STREAM_BUFFER_SIZE: usize = 64 * 1024; + +/// Callback table passed from C++ for streaming writer output (W1). +/// +/// `ctx` is an opaque pointer to C++'s `WriteCtx` (holding a `paimon::OutputStream`). +/// `write` is called in-order by Rust (not concurrently) to push bytes. +#[repr(C)] +pub struct PaimonWriteCallbacks { + pub ctx: *mut c_void, + /// Returns 0 on success, non-zero to signal C++ side error (Rust aborts pack). + pub write: extern "C" fn(ctx: *mut c_void, data: *const u8, len: usize) -> i32, +} + +/// Walk tempdir + pack into the Java-compatible archive format, pushing each +/// chunk through `write_fn`. Peak RAM = one 64KB stack buffer + a few KB of +/// entry metadata (name + PathBuf + u64 length). Mirrors Java +/// `TantivyFullTextGlobalIndexWriter.packIndex` but with a bigger buffer. +/// +/// Archive format (BE, no version): `[i32 file_count | (i32 name_len, name, +/// i64 file_len, file_bytes)*]`. 
Files sorted alphabetically for deterministic +/// output; `.`-prefixed (lock) files and non-regular entries skipped. +fn pack_index_dir_stream(dir: &Path, mut write_fn: F) -> Result<(), String> +where + F: FnMut(&[u8]) -> Result<(), String>, +{ + let entries = collect_dir_entries(dir)?; + + // Header: BE i32 file_count + write_fn(&(entries.len() as i32).to_be_bytes())?; + + let mut buf = [0u8; WRITER_STREAM_BUFFER_SIZE]; + for (name, path, file_len) in &entries { + // Per-entry header: name_len, name, data_len + write_fn(&(name.len() as i32).to_be_bytes())?; + write_fn(name.as_bytes())?; + write_fn(&(*file_len as i64).to_be_bytes())?; + + // Payload: 64KB buffer loop + let mut f = File::open(path) + .map_err(|e| format!("open {}: {e}", path.display()))?; + let mut pushed: u64 = 0; + loop { + let n = f + .read(&mut buf) + .map_err(|e| format!("read {}: {e}", path.display()))?; + if n == 0 { + break; + } + write_fn(&buf[..n])?; + pushed += n as u64; + } + if pushed != *file_len { + return Err(format!( + "file {} changed size during packing: header said {}, streamed {}", + name, file_len, pushed + )); + } + } + Ok(()) +} + +/// Enumerate the tempdir: sorted (name, path, len) for regular non-`.lock` files. +fn collect_dir_entries(dir: &Path) -> Result, String> { + let mut entries: Vec<(String, PathBuf, u64)> = Vec::new(); + let read_dir = + std::fs::read_dir(dir).map_err(|e| format!("read tmp dir {}: {e}", dir.display()))?; + for entry_res in read_dir { + let entry = entry_res.map_err(|e| format!("read entry: {e}"))?; + let name = match entry.file_name().into_string() { + Ok(n) => n, + Err(_) => continue, + }; + if name.starts_with('.') { + continue; + } + let ft = entry + .file_type() + .map_err(|e| format!("file_type for {}: {e}", entry.path().display()))?; + if !ft.is_file() { + continue; + } + let len = entry + .metadata() + .map_err(|e| format!("metadata for {}: {e}", entry.path().display()))? 
+ .len(); + entries.push((name, entry.path(), len)); + } + entries.sort_by(|a, b| a.0.cmp(&b.0)); + Ok(entries) +} + +// ============================ FFI surface ============================ + +/// Create a writer handle on a private tmp dir. +/// +/// SAFETY: all C-string args must be NUL-terminated UTF-8; `out` non-null. +#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_writer_new( + field_name_cstr: *const c_char, + mode_cstr: *const c_char, + with_position: bool, + dict_dir_cstr: *const c_char, + tokenizer_cstr: *const c_char, + out: *mut *mut PaimonTantivyWriter, +) -> PaimonTantivyStatus { + if field_name_cstr.is_null() + || mode_cstr.is_null() + || dict_dir_cstr.is_null() + || tokenizer_cstr.is_null() + || out.is_null() + { + set_last_error("paimon_tantivy_writer_new: null argument"); + return PaimonTantivyStatus::InvalidArgument; + } + let field_name = match unsafe { CStr::from_ptr(field_name_cstr) }.to_str() { + Ok(s) => s, + Err(e) => { + set_last_error(format!("field_name not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let mode_str = match unsafe { CStr::from_ptr(mode_cstr) }.to_str() { + Ok(s) => s, + Err(e) => { + set_last_error(format!("mode not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let dict_dir = match unsafe { CStr::from_ptr(dict_dir_cstr) }.to_str() { + Ok(s) => s, + Err(e) => { + set_last_error(format!("dict_dir not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let tokenizer_name = match unsafe { CStr::from_ptr(tokenizer_cstr) }.to_str() { + Ok(s) => s, + Err(e) => { + set_last_error(format!("tokenizer not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + }; + let mode = match TokenizeMode::parse(mode_str) { + Some(m) => m, + None => { + set_last_error(format!( + "unknown tokenize mode {mode_str:?}; expected one of mp/hmm/mix/full/query" + )); + return PaimonTantivyStatus::InvalidArgument; + } + }; + match PaimonTantivyWriter::new( + 
field_name, + mode, + with_position, + Path::new(dict_dir), + tokenizer_name, + ) { + Ok(w) => { + unsafe { *out = into_handle(w) }; + PaimonTantivyStatus::Ok + } + Err(e) => { + // hmm-mode rejection bubbles through tokenizer construction. + let unsupported = e.contains("'hmm' is not supported"); + set_last_error(e); + if unsupported { + PaimonTantivyStatus::Unsupported + } else { + PaimonTantivyStatus::InternalError + } + } + } +} + +/// Add a single document. `text` need not be NUL-terminated; treat as a slice +/// of `text_len` UTF-8 bytes. Empty text (len=0) inserts an empty-text doc. +/// `row_id` is the caller-supplied paimon row id (u64), stored in a fast field +/// for retrieval by the reader. +/// +/// SAFETY: `writer` must be a live handle from `writer_new`. +#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_writer_add( + writer: *mut PaimonTantivyWriter, + row_id: u64, + text: *const c_char, + text_len: usize, +) -> PaimonTantivyStatus { + let Some(w) = (unsafe { borrow_handle_mut::(writer) }) else { + set_last_error("paimon_tantivy_writer_add: null writer handle"); + return PaimonTantivyStatus::InvalidArgument; + }; + if text.is_null() && text_len != 0 { + set_last_error("text is null but len > 0"); + return PaimonTantivyStatus::InvalidArgument; + } + let text_str = if text_len == 0 { + "" + } else { + let slice = unsafe { std::slice::from_raw_parts(text as *const u8, text_len) }; + match std::str::from_utf8(slice) { + Ok(s) => s, + Err(e) => { + set_last_error(format!("text not utf-8: {e}")); + return PaimonTantivyStatus::InvalidArgument; + } + } + }; + match w.add(row_id, text_str) { + Ok(()) => PaimonTantivyStatus::Ok, + Err(e) => { + set_last_error(e); + PaimonTantivyStatus::InternalError + } + } +} + +/// Commit + force-merge + stream archive bytes through `callbacks.write` in +/// 64KB chunks (W1). May only be called once per writer; subsequent calls +/// return InvalidArgument with last_error="writer already finished". 
+/// Peak Rust RAM ≈ 64KB + entry metadata (independent of archive size). +/// +/// The callback is invoked **serially** (not concurrently) within this call; +/// C++ side can write directly to paimon OutputStream without locking. +/// +/// SAFETY: `writer` must be a live handle; `out_row_count` non-null. +/// `callbacks.write` / `callbacks.ctx` must remain valid for the duration of +/// the call (callback is consumed in-place, not retained). +#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_writer_finish_streaming( + writer: *mut PaimonTantivyWriter, + callbacks: PaimonWriteCallbacks, + out_row_count: *mut i64, +) -> PaimonTantivyStatus { + if out_row_count.is_null() { + set_last_error("paimon_tantivy_writer_finish_streaming: null out_row_count"); + return PaimonTantivyStatus::InvalidArgument; + } + let Some(w) = (unsafe { borrow_handle_mut::(writer) }) else { + set_last_error("paimon_tantivy_writer_finish_streaming: null writer handle"); + return PaimonTantivyStatus::InvalidArgument; + }; + match w.finish_streaming(&callbacks) { + Ok(rows) => { + unsafe { *out_row_count = rows }; + PaimonTantivyStatus::Ok + } + Err(e) => { + let already_finished = e == "writer already finished"; + let io_err = e.starts_with("write callback rc=") + || e.starts_with("open ") + || e.starts_with("read "); + set_last_error(e); + if already_finished { + PaimonTantivyStatus::InvalidArgument + } else if io_err { + PaimonTantivyStatus::IoError + } else { + PaimonTantivyStatus::InternalError + } + } + } +} + +/// Destroy a writer handle. Safe on null. Tmp dir is removed via Drop. +#[no_mangle] +pub unsafe extern "C" fn paimon_tantivy_writer_free(writer: *mut PaimonTantivyWriter) { + unsafe { free_handle(writer) }; +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::CString; + + /// Test dict dir for jieba; defaults to a non-existent path so jieba-rs uses + /// its built-in dict (which is enough for these smoke tests). 
+ fn dict_dir_from_env() -> std::path::PathBuf { + std::env::var("PAIMON_JIEBA_DICT_DIR") + .map(std::path::PathBuf::from) + .unwrap_or_else(|_| std::path::PathBuf::from("/tmp/nonexistent-dict")) + } + + #[test] + fn empty_field_name_rejected() { + let err = PaimonTantivyWriter::new("", TokenizeMode::Mix, true, Path::new("/tmp/nx"), "paimon_jieba") + .err() + .unwrap(); + assert!(err.contains("field_name"), "got: {err}"); + } + + #[test] + fn hmm_mode_rejected() { + let err = + PaimonTantivyWriter::new("f0", TokenizeMode::Hmm, true, Path::new("/tmp/nx"), "paimon_jieba") + .err() + .unwrap(); + assert!(err.contains("'hmm' is not supported"), "got: {err}"); + } + + #[test] + fn create_add_finish_roundtrip() { + let mut w = + PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap(); + w.add(0, "hello world").unwrap(); + w.add(1, "中国人民").unwrap(); + w.add(2, "").unwrap(); // empty doc + let (rows, bytes) = w.finish().unwrap(); + assert_eq!(rows, 3); + assert!(bytes.len() > 4); + + // Validate header (Java-compatible: BE int32 file_count, no version) + let file_count = i32::from_be_bytes(bytes[0..4].try_into().unwrap()); + assert!(file_count > 0, "expected >0 packed files"); + + // Walk entries (BE) + let mut off: usize = 4; + let mut names = Vec::new(); + for _ in 0..file_count { + let nlen = i32::from_be_bytes(bytes[off..off + 4].try_into().unwrap()) as usize; + off += 4; + let name = std::str::from_utf8(&bytes[off..off + nlen]).unwrap().to_owned(); + off += nlen; + let flen = i64::from_be_bytes(bytes[off..off + 8].try_into().unwrap()) as usize; + off += 8; + assert!(off + flen <= bytes.len(), "file {name} extends past buffer"); + off += flen; + names.push(name); + } + assert_eq!(off, bytes.len(), "trailing bytes after pack"); + // tantivy must produce at least meta.json + assert!(names.iter().any(|n| n == "meta.json"), "names={names:?}"); + } + + #[test] + fn schema_field_names_are_fixed() { + // Schema must be `row_id` 
(u64) + `text` (TEXT) regardless of caller's + // field_name argument — matches paimon-java for cross-readability. + let w = + PaimonTantivyWriter::new("ignored_name", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba") + .unwrap(); + let schema = w.index.schema(); + assert!(schema.get_field(PAIMON_ROW_ID_FIELD_NAME).is_ok(), + "schema must have row_id field"); + assert!(schema.get_field(PAIMON_TEXT_FIELD_NAME).is_ok(), + "schema must have text field"); + // Caller-supplied name must NOT appear + assert!(schema.get_field("ignored_name").is_err(), + "caller-supplied field_name must be ignored"); + } + + #[test] + fn archive_uses_big_endian_no_version_header() { + // Strong guard: header must be BE int32 file_count, NOT LE int32 + // version=1 + LE int32 file_count. Any regression to LE/version-header + // would silently break paimon-java cross-read. + let mut w = + PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap(); + w.add(0, "hello").unwrap(); + let (_, bytes) = w.finish().unwrap(); + let header_be = i32::from_be_bytes(bytes[0..4].try_into().unwrap()); + let header_le = i32::from_le_bytes(bytes[0..4].try_into().unwrap()); + // BE file_count is small (single-segment force-merge: ~6-7 files) + assert!(header_be > 0 && header_be < 100, + "expected sensible BE file_count, got BE={header_be} LE={header_le}"); + // LE-decoded header would be a huge number (e.g. 0x06000000), ensuring + // we did NOT regress to the old LE+version layout. + assert_ne!(header_be, header_le, "buffer must be BE-encoded"); + } + + #[test] + fn multi_thread_writer_default() { + // B1 schema stores row_id explicitly so we no longer enforce + // single-threaded writer. Just verify many docs across threads land + // correctly and force-merge collapses to a single segment. 
+ let mut w = + PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap(); + for i in 0..200u64 { + w.add(i, &format!("row {i} apple banana")).unwrap(); + } + let (rows, bytes) = w.finish().unwrap(); + assert_eq!(rows, 200); + assert!(bytes.len() > 4); + // After force-merge there must be exactly one meta.json + segment files. + let file_count = i32::from_be_bytes(bytes[0..4].try_into().unwrap()); + assert!(file_count >= 2, "force-merged single segment needs ≥ 2 files (meta + segment), got {file_count}"); + } + + #[test] + fn finish_twice_errors() { + let mut w = + PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap(); + w.add(0, "hi").unwrap(); + let _ = w.finish().unwrap(); + let err = w.finish().err().unwrap(); + assert!(err.contains("already finished"), "got: {err}"); + } + + /// Mock collector for FFI streaming tests: push bytes into a Box> + /// pointed to by `ctx`. (No Arc / atomic needed — test is single-threaded.) + extern "C" fn mock_write_collect(ctx: *mut c_void, data: *const u8, len: usize) -> i32 { + let vec = unsafe { &mut *(ctx as *mut Vec) }; + let slice = unsafe { std::slice::from_raw_parts(data, len) }; + vec.extend_from_slice(slice); + 0 + } + + /// Mock that counts the largest single `write` call — sanity check that + /// Rust streams with small chunks (≤ 64KB buffer + header fields). 
+ extern "C" fn mock_write_max_chunk( + ctx: *mut c_void, + _data: *const u8, + len: usize, + ) -> i32 { + let max = unsafe { &mut *(ctx as *mut usize) }; + if len > *max { + *max = len; + } + 0 + } + + #[test] + fn ffi_full_path_streaming() { + unsafe { + let field = CString::new("f0").unwrap(); + let mode = CString::new("mix").unwrap(); + let dict = CString::new(dict_dir_from_env().to_str().unwrap()).unwrap(); + let tokenizer = CString::new("paimon_jieba").unwrap(); + let mut handle: *mut PaimonTantivyWriter = std::ptr::null_mut(); + let st = paimon_tantivy_writer_new( + field.as_ptr(), + mode.as_ptr(), + true, + dict.as_ptr(), + tokenizer.as_ptr(), + &mut handle, + ); + assert_eq!(st, PaimonTantivyStatus::Ok); + assert!(!handle.is_null()); + + let txt = "hello world"; + let st = + paimon_tantivy_writer_add(handle, 42u64, txt.as_ptr() as *const c_char, txt.len()); + assert_eq!(st, PaimonTantivyStatus::Ok); + + // Streaming finish: collect bytes into a Vec via FFI callback + let mut out: Vec = Vec::new(); + let cb = PaimonWriteCallbacks { + ctx: &mut out as *mut _ as *mut c_void, + write: mock_write_collect, + }; + let mut rows: i64 = 0; + let st = paimon_tantivy_writer_finish_streaming(handle, cb, &mut rows); + assert_eq!(st, PaimonTantivyStatus::Ok); + assert_eq!(rows, 1); + // BE file_count at byte 0,> 0 + let file_count = i32::from_be_bytes(out[0..4].try_into().unwrap()); + assert!(file_count > 0); + + // double finish must error + let mut out2: Vec = Vec::new(); + let cb2 = PaimonWriteCallbacks { + ctx: &mut out2 as *mut _ as *mut c_void, + write: mock_write_collect, + }; + let mut rows2: i64 = 0; + let st = paimon_tantivy_writer_finish_streaming(handle, cb2, &mut rows2); + assert_eq!(st, PaimonTantivyStatus::InvalidArgument); + + paimon_tantivy_writer_free(handle); + } + } + + #[test] + fn streaming_chunk_size_bounded_by_buffer() { + // After force-merge, a 200-doc index still streams in chunks ≤ 64KB + // (payload) / or small header-field chunks. 
Peak chunk ≤ 64KB. + let mut w = + PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap(); + for i in 0..200u64 { + w.add(i, &format!("row {i} apple banana")).unwrap(); + } + let mut max_chunk: usize = 0; + let cb = PaimonWriteCallbacks { + ctx: &mut max_chunk as *mut _ as *mut c_void, + write: mock_write_max_chunk, + }; + let rows = w.finish_streaming(&cb).unwrap(); + assert_eq!(rows, 200); + assert!( + max_chunk <= WRITER_STREAM_BUFFER_SIZE, + "streaming chunk size {} exceeded buffer {}", + max_chunk, + WRITER_STREAM_BUFFER_SIZE + ); + } + + #[test] + fn streaming_write_callback_error_propagates() { + extern "C" fn always_fail(_ctx: *mut c_void, _data: *const u8, _len: usize) -> i32 { + 7 + } + let mut w = + PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap(); + w.add(0, "hello").unwrap(); + let cb = PaimonWriteCallbacks { + ctx: std::ptr::null_mut(), + write: always_fail, + }; + let err = w.finish_streaming(&cb).unwrap_err(); + assert!(err.contains("write callback rc=7"), "got: {err}"); + } + + #[test] + fn ffi_null_writer_invalid() { + unsafe { + let txt = "x"; + let st = paimon_tantivy_writer_add( + std::ptr::null_mut(), + 0u64, + txt.as_ptr() as *const c_char, + txt.len(), + ); + assert_eq!(st, PaimonTantivyStatus::InvalidArgument); + } + } +}