diff --git a/.devcontainer/Dockerfile.template b/.devcontainer/Dockerfile.template
index c28a0e1ae..9069085bb 100644
--- a/.devcontainer/Dockerfile.template
+++ b/.devcontainer/Dockerfile.template
@@ -17,12 +17,32 @@
 
 # Adapted from Apache Iceberg C++
 # https://github.com/apache/iceberg-cpp/blob/main/.devcontainer/Dockerfile.template
-
+#
 # This Dockerfile is used to build a development container for Paimon C++.
-# It is based on the Ubuntu image and installs necessary dependencies.
+# Base: Ubuntu 24.04. Rust toolchain is installed via Dev Container
+# Feature `ghcr.io/devcontainers/features/rust:1` (see devcontainer.json),
+# so it does NOT appear in this Dockerfile.
 
 FROM ubuntu:24.04
 
+# Switch apt to Aliyun mirror for faster downloads (covers both
+# x86_64 archive.ubuntu.com and aarch64 ports.ubuntu.com paths).
+# If you are outside mainland China or your network has its own internal
+# mirror, edit or remove this block.
+RUN sed -i \
+        -e 's|http://archive.ubuntu.com/ubuntu|http://mirrors.aliyun.com/ubuntu|g' \
+        -e 's|http://security.ubuntu.com/ubuntu|http://mirrors.aliyun.com/ubuntu|g' \
+        -e 's|http://ports.ubuntu.com/ubuntu-ports|http://mirrors.aliyun.com/ubuntu-ports|g' \
+        /etc/apt/sources.list.d/ubuntu.sources
+
+# Point rustup at USTC mirror so the Dev Container Feature
+# `ghcr.io/devcontainers/features/rust:1` (and any later `rustup` calls)
+# download the Rust toolchain from a China-friendly CDN instead of
+# the default static.rust-lang.org. Set as ENV so it is inherited by
+# every subsequent layer (including features installed after this image).
+ENV RUSTUP_DIST_SERVER=https://mirrors.ustc.edu.cn/rust-static \
+    RUSTUP_UPDATE_ROOT=https://mirrors.ustc.edu.cn/rust-static/rustup
+
 # Install necessary packages
 RUN apt update && \
     apt install -y \
@@ -48,6 +68,16 @@ RUN apt update && \
     vim \
     wget \
     sudo \
+    # ---- additions for tantivy-fts migration (Rust + Sanitizer + LLVM) ----
+    clang \
+    clang-format \
+    clang-tidy \
+    lld \
+    llvm \
+    libclang-rt-dev \
+    gdb \
+    lldb \
+    valgrind \
     && rm -rf /var/lib/apt/lists/*
 
 # Add a user for development
diff --git a/.devcontainer/centos7/Dockerfile b/.devcontainer/centos7/Dockerfile
new file mode 100644
index 000000000..c4fe3a5a0
--- /dev/null
+++ b/.devcontainer/centos7/Dockerfile
@@ -0,0 +1,229 @@
+# Copyright 2026-present Alibaba Inc.
+#
+# Licensed under the Apache License, Version 2.0.
+#
+# CentOS 7 cross-build verification image for paimon-cpp + tantivy-fts.
+#
+# Purpose:
+#   Prove the tantivy-fts stack builds on the OLDEST reasonable Linux target
+#   (glibc 2.17, EOL 2024-06-30). The default Ubuntu 24.04 dev container
+#   proves nothing about glibc compatibility; this image does.
+#
+# Build:
+#   docker build -t paimon-cpp-centos7:latest -f .devcontainer/centos7/Dockerfile .
+#
+# Run:
+#   docker run -d --name paimon-centos7 \
+#     --privileged \
+#     -v "$(pwd):/workspaces/paimon-cpp" \
+#     paimon-cpp-centos7:latest sleep infinity
+#   docker exec -it paimon-centos7 bash -l
+#
+# Inside the container:
+#   scl enable devtoolset-11 rh-python38 -- bash        # activate modern gcc + python
+#   source /opt/paimon-env.sh                           # PATH for rust, cmake
+#   cd /workspaces/paimon-cpp
+#   git lfs install --local && git lfs pull             # critical: boost & friends are LFS
+#   ./scripts/tantivy_smoke.sh
+
+# ---------- Base ----------
+# CentOS 7 reached EOL 2024-06-30; its default mirrorlist.centos.org is down.
+# The stock repo files therefore fail on `yum install`; they are replaced
+# further down with baseurls that point at an archived (vault) mirror.
+#
+# Base image: pulled from quay.io (the CentOS community registry) by default; override with the
+# CENTOS7_IMAGE build arg when quay.io is unreachable (e.g. registry.aliyuncs.com/library/centos:7).
+# Pinned to linux/amd64 because run.sh prefetches an x86_64 rustup-init that is COPY'd in below.
+ARG CENTOS7_IMAGE=quay.io/centos/centos:centos7
+FROM --platform=linux/amd64 ${CENTOS7_IMAGE}
+
+# Repoint yum at aliyun's CentOS 7 vault mirror — vault.centos.org itself
+# works but is slow/blocked from many CN networks; the aliyun mirror is a
+# complete rsync and reliably fast. We overwrite CentOS-Base.repo rather
+# than sed-patch it so the result is deterministic regardless of what the
+# upstream image ships. fastestmirror plugin is disabled because its ping
+# probes against the retired mirror list add ~60s to every `yum install`.
+RUN echo -e '[base]\n\
+name=CentOS-7 - Base - aliyun vault\n\
+baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/os/$basearch/\n\
+gpgcheck=0\n\
+enabled=1\n\
+\n\
+[updates]\n\
+name=CentOS-7 - Updates - aliyun vault\n\
+baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/updates/$basearch/\n\
+gpgcheck=0\n\
+enabled=1\n\
+\n\
+[extras]\n\
+name=CentOS-7 - Extras - aliyun vault\n\
+baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/extras/$basearch/\n\
+gpgcheck=0\n\
+enabled=1\n\
+\n\
+[centosplus]\n\
+name=CentOS-7 - Plus - aliyun vault\n\
+baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/centosplus/$basearch/\n\
+gpgcheck=0\n\
+enabled=0\n' > /etc/yum.repos.d/CentOS-Base.repo \
+    && rm -f /etc/yum.repos.d/CentOS-CR.repo \
+             /etc/yum.repos.d/CentOS-Debuginfo.repo \
+             /etc/yum.repos.d/CentOS-Media.repo \
+             /etc/yum.repos.d/CentOS-Sources.repo \
+             /etc/yum.repos.d/CentOS-Vault.repo \
+             /etc/yum.repos.d/CentOS-fasttrack.repo \
+             /etc/yum.repos.d/CentOS-x86_64-kernel.repo \
+    && if [ -f /etc/yum/pluginconf.d/fastestmirror.conf ]; then \
+         sed -i 's/^enabled=1/enabled=0/' /etc/yum/pluginconf.d/fastestmirror.conf; \
+       fi \
+    && yum clean all \
+    && yum makecache
+
+# ---------- Base toolchain ----------
+# EPEL provides git-lfs, ninja-build, a newer python3 than the base 3.6.
+# SCL (Software Collections) provides devtoolset-11 (gcc 11) and rh-python38
+# without overriding the system gcc/python. CentOS 7's default gcc 4.8 is
+# too old for C++17/20 used by lucene++ and our tantivy wrapper.
+#
+# Same story as CentOS-Base.repo: both epel + SCL default to mirrorlist
+# endpoints that are effectively dead; overwrite with aliyun URLs that we
+# know respond.
+RUN yum install -y epel-release centos-release-scl \
+    && echo -e '[epel]\n\
+name=Extra Packages for Enterprise Linux 7 - aliyun\n\
+baseurl=https://mirrors.aliyun.com/epel/7/$basearch\n\
+gpgcheck=0\n\
+enabled=1\n' > /etc/yum.repos.d/epel.repo \
+    && rm -f /etc/yum.repos.d/epel-testing.repo /etc/yum.repos.d/epel.repo.rpmnew \
+    && rm -f /etc/yum.repos.d/CentOS-SCLo-*.repo \
+             /etc/yum.repos.d/CentOS-SCLo-*.repo.rpmnew \
+    && echo -e '[centos-sclo-rh]\n\
+name=CentOS-7 - SCLo rh - aliyun vault\n\
+baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/sclo/$basearch/rh/\n\
+gpgcheck=0\n\
+enabled=1\n\
+\n\
+[centos-sclo-sclo]\n\
+name=CentOS-7 - SCLo sclo - aliyun vault\n\
+baseurl=https://mirrors.aliyun.com/centos-vault/7.9.2009/sclo/$basearch/sclo/\n\
+gpgcheck=0\n\
+enabled=1\n' > /etc/yum.repos.d/CentOS-SCLo-scl.repo \
+    && yum clean all && yum makecache \
+    && yum install -y \
+        devtoolset-11-gcc \
+        devtoolset-11-gcc-c++ \
+        devtoolset-11-binutils \
+        devtoolset-11-libasan-devel \
+        devtoolset-11-libubsan-devel \
+        rh-python38 \
+        rh-python38-python-pip \
+        git \
+        git-lfs \
+        ninja-build \
+        make \
+        patch \
+        curl \
+        wget \
+        unzip \
+        which \
+        file \
+        sudo \
+        openssl-devel \
+        zlib-devel \
+        libffi-devel \
+        bzip2-devel \
+        xz-devel \
+        perl-IPC-Cmd \
+    && yum clean all
+
+# Enable the SCL collections for all subsequent shells (including RUN).
+ENV BASH_ENV=/etc/profile.d/scl-enable.sh
+SHELL ["/bin/bash", "-c"]
+RUN printf '%s\n' \
+        'source scl_source enable devtoolset-11' \
+        'source scl_source enable rh-python38' \
+    > /etc/profile.d/scl-enable.sh \
+    && chmod +x /etc/profile.d/scl-enable.sh
+
+# ---------- CMake (must be >= 3.22 for Corrosion) ----------
+# CentOS 7's cmake package is 2.8.12; EPEL cmake3 is 3.17 — still too old.
+# Install via pip in the rh-python38 SCL so we get a modern CMake without
+# touching the system /usr/bin. Point pip at aliyun's pypi mirror: default
+# pypi.org is 10-30s per request from CN, aliyun responds in <1s.
+ENV PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ \
+    PIP_TRUSTED_HOST=mirrors.aliyun.com
+RUN source /etc/profile.d/scl-enable.sh \
+    && python3 -m pip install --upgrade pip \
+    && python3 -m pip install 'cmake==3.28.*' ninja
+
+# ---------- Rust toolchain ----------
+# Install rustup as root into /opt/rust so all users share the same toolchain.
+# Use the USTC mirror to keep downloads fast in CN; the CI runner version of
+# this is mirrored in ci/scripts/setup_rust.sh.
+ENV RUSTUP_HOME=/opt/rust/rustup \
+    CARGO_HOME=/opt/rust/cargo \
+    RUSTUP_DIST_SERVER=https://mirrors.ustc.edu.cn/rust-static \
+    RUSTUP_UPDATE_ROOT=https://mirrors.ustc.edu.cn/rust-static/rustup
+# In-container network for Docker Desktop builds is unreliable through many
+# CN mirrors (observed: curl 7.29 on CentOS 7 + rsproxy.cn HTTP/2 path ⇒
+# partial-read truncations; USTC ⇒ 5xx; rustup sh installer ⇒ 403 from
+# legacy cipher). The most reliable fix is to sidestep the issue entirely:
+# pre-download rustup-init on the host (where network is solid) and COPY it
+# into the image. See .devcontainer/centos7/run.sh for the prefetch step.
+COPY .devcontainer/centos7/rustup-init.bin /tmp/rustup-init
+RUN chmod +x /tmp/rustup-init \
+    && /tmp/rustup-init -y --default-toolchain stable --profile minimal --no-modify-path \
+    && rm -f /tmp/rustup-init \
+    && mkdir -p $CARGO_HOME \
+    && echo -e '[source.crates-io]\n\
+replace-with = "rsproxy-sparse"\n\
+\n\
+[source.rsproxy]\n\
+registry = "https://rsproxy.cn/crates.io-index"\n\
+\n\
+[source.rsproxy-sparse]\n\
+registry = "sparse+https://rsproxy.cn/index/"\n\
+\n\
+[registries.rsproxy]\n\
+index = "https://rsproxy.cn/crates.io-index"\n\
+\n\
+[net]\n\
+git-fetch-with-cli = true\n' > $CARGO_HOME/config.toml \
+    && $CARGO_HOME/bin/cargo install cbindgen --version 0.29.2 --locked \
+    && chmod -R a+rwx /opt/rust \
+    && $CARGO_HOME/bin/rustc --version \
+    && $CARGO_HOME/bin/cargo --version \
+    && $CARGO_HOME/bin/cbindgen --version
+
+# ---------- Environment file consumed by every shell ----------
+# Sets PATH for rust / cmake / cargo so `docker exec paimon-centos7 bash -l`
+# and interactive sessions have the toolchain on $PATH.
+RUN printf '%s\n' \
+        'export PATH=/opt/rust/cargo/bin:$PATH' \
+        '# cmake + ninja live under the rh-python38 SCL; path prefix differs by arch.' \
+        '# `command -v cmake` confirms which one is in use.' \
+    > /opt/paimon-env.sh \
+    && chmod +x /opt/paimon-env.sh \
+    && printf '%s\n' 'source /opt/paimon-env.sh' >> /etc/profile.d/scl-enable.sh
+
+# ---------- Non-root user ----------
+# Build as `paimon` (uid 1000) so LFS objects under the mount stay owned by
+# your host user, matching the main Ubuntu dev container.
+RUN useradd -m -u 1000 -s /bin/bash paimon \
+    && echo 'paimon ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/paimon
+
+USER paimon
+WORKDIR /workspaces/paimon-cpp
+
+# Default command: `docker run --rm paimon-cpp-centos7:latest` (no arguments) prints the tool versions.
+CMD ["bash", "-lc", "\
+    echo '--- CentOS 7 cross-build image sanity check ---'; \
+    cat /etc/centos-release; \
+    echo '--- glibc ---'; ldd --version | head -1; \
+    echo '--- gcc   ---'; gcc --version | head -1; \
+    echo '--- cmake ---'; cmake --version | head -1; \
+    echo '--- ninja ---'; ninja --version; \
+    echo '--- rust  ---'; rustc --version; \
+    echo '--- cargo ---'; cargo --version; \
+    echo '--- cbindgen ---'; cbindgen --version; \
+    echo 'Ready. Mount paimon-cpp at /workspaces/paimon-cpp and run ./scripts/tantivy_smoke.sh'"]
diff --git a/.devcontainer/centos7/run.sh b/.devcontainer/centos7/run.sh
new file mode 100755
index 000000000..54e6bfbde
--- /dev/null
+++ b/.devcontainer/centos7/run.sh
@@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+#
+# Copyright 2026-present Alibaba Inc.
+#
+# Licensed under the Apache License, Version 2.0.
+#
+# One-shot helper to build + launch + smoke-test the CentOS 7 verification
+# container. Run from the paimon-cpp repo root.
+#
+# Usage:
+#   ./.devcontainer/centos7/run.sh build         # build image only
+#   ./.devcontainer/centos7/run.sh up            # start container (detached)
+#   ./.devcontainer/centos7/run.sh shell         # exec into it
+#   ./.devcontainer/centos7/run.sh smoke         # run scripts/tantivy_smoke.sh inside
+#   ./.devcontainer/centos7/run.sh down          # stop + remove
+
+set -euo pipefail
+
+IMAGE=paimon-cpp-centos7:latest
+CONTAINER=paimon-centos7
+
+here=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+repo=$(cd "${here}/../.." && pwd)
+
+cmd=${1:-help}
+
+case "${cmd}" in
+    build)
+        # Prefetch rustup-init on the host. In-container network from Docker
+        # Desktop builds is unreliable for CN mirrors (TLS/HTTP2 issues with
+        # old curl/wget on CentOS 7), but host curl works. The image copies
+        # this blob in. Override mirror with RUSTUP_INIT_URL=... if needed.
+        rustup_init="${here}/rustup-init.bin"
+        rustup_url="${RUSTUP_INIT_URL:-https://mirrors.ustc.edu.cn/rust-static/rustup/dist/x86_64-unknown-linux-gnu/rustup-init}"
+        if [ ! -s "${rustup_init}" ]; then
+            echo "==> Prefetching rustup-init from ${rustup_url}"
+            curl --proto '=https' --tlsv1.2 -sSfL --retry 5 --retry-delay 5 \
+                -o "${rustup_init}" "${rustup_url}"
+        fi
+        # Override base image with CENTOS7_IMAGE=... if quay.io is unreachable.
+        # Common fallbacks you may need to docker-pull into local cache first:
+        #   CENTOS7_IMAGE=quay.io/centos/centos:centos7   (default)
+        #   CENTOS7_IMAGE=registry.aliyuncs.com/library/centos:7
+        if [ -n "${CENTOS7_IMAGE:-}" ]; then
+            docker build -t "${IMAGE}" -f "${here}/Dockerfile" \
+                --build-arg "CENTOS7_IMAGE=${CENTOS7_IMAGE}" "${repo}"
+        else
+            docker build -t "${IMAGE}" -f "${here}/Dockerfile" "${repo}"
+        fi
+        ;;
+    up)
+        docker rm -f "${CONTAINER}" 2>/dev/null || true
+        # Mount host SSH keys read-only (mirrors paimon-dev) so SSH git clones
+        # of internal repos (e.g. aliorc_ep) can authenticate; skipped when
+        # ~/.ssh is absent. The `${arr[@]+...}` expansion below keeps an empty
+        # array from tripping `set -u` on bash < 4.4 (e.g. macOS bash 3.2).
+        ssh_mount=()
+        if [ -d "${HOME}/.ssh" ]; then
+            ssh_mount=(-v "${HOME}/.ssh:/home/paimon/.ssh:ro")
+        fi
+        docker run -d \
+            --name "${CONTAINER}" \
+            --privileged \
+            -v "${repo}:/workspaces/paimon-cpp" \
+            -v "paimon-centos7-cargo-registry:/opt/rust/cargo/registry" \
+            -v "paimon-centos7-build:/workspaces/paimon-cpp/build-centos7" \
+            ${ssh_mount[@]+"${ssh_mount[@]}"} \
+            "${IMAGE}" sleep infinity
+        # Named volumes mount as root-owned; `paimon` user (uid 1000) needs
+        # write access to build-centos7 and the cargo registry cache.
+        # Also set up the gitlab.alibaba-inc.com url rewrite so aliorc_ep
+        # (and any other ExternalProject pointing at internal gitlab via
+        # http://) picks up the mounted SSH key.
+        docker exec --user root "${CONTAINER}" bash -c '
+            chown -R paimon:paimon /workspaces/paimon-cpp/build-centos7 \
+                                   /opt/rust/cargo/registry
+        '
+        docker exec "${CONTAINER}" bash -c '
+            git config --global url."git@gitlab.alibaba-inc.com:".insteadOf \
+                "http://gitlab.alibaba-inc.com/"
+        '
+        echo "Container started. \`${0} shell\` to enter."
+        ;;
+    shell)
+        docker exec -it "${CONTAINER}" bash -l
+        ;;
+    smoke)
+        # Ensure container is up first; no-op if already running.
+        if ! docker ps --format '{{.Names}}' | grep -qx "${CONTAINER}"; then
+            echo "Container ${CONTAINER} not running; starting it."
+            "$0" up
+        fi
+        # Two env vars pass through for Rosetta 2 (Apple Silicon) compat:
+        # MALLOC_CHECK_=0 disables glibc 2.17 extra malloc integrity checks
+        #   that fire false positives under Rosetta's x86_64 emulation.
+        # ARROW_USER_SIMD_LEVEL=SSE4_2 keeps arrow runtime-dispatched kernels
+        #   on SSE4.2 only (Rosetta does not support AVX2/BMI2/AVX-512).
+        # Both are no-ops on real x86_64 CentOS 7 hardware.
+        # Use a distinct build dir inside the container so it does not clash
+        # with the Ubuntu dev container's build/ dir on the same volume.
+        # Propagate PAIMON_ENABLE_ALIORC so `PAIMON_ENABLE_ALIORC=OFF` env
+        # on the host reaches the cmake inside the container.
+        docker exec \
+            -e "PAIMON_ENABLE_ALIORC=${PAIMON_ENABLE_ALIORC:-ON}" \
+            -e "MALLOC_CHECK_=0" \
+            -e "ARROW_USER_SIMD_LEVEL=SSE4_2" \
+            "${CONTAINER}" bash -lc '
+            set -eux
+            cd /workspaces/paimon-cpp
+            git lfs install --local
+            git lfs pull
+            cmake -S . -B build-centos7 \
+                -G Ninja \
+                -DCMAKE_BUILD_TYPE=Release \
+                -DPAIMON_BUILD_TESTS=ON \
+                -DPAIMON_ENABLE_FSLIB=OFF \
+                -DPAIMON_ENABLE_LUMINA=OFF \
+                -DPAIMON_ENABLE_LANCE=OFF \
+                -DPAIMON_ENABLE_JINDO=OFF \
+                -DPAIMON_ENABLE_LUCENE=ON \
+                -DPAIMON_ENABLE_ORC=ON \
+                -DPAIMON_ENABLE_ALIORC="${PAIMON_ENABLE_ALIORC:-ON}" \
+                -DPAIMON_ENABLE_AVRO=ON
+            # ALIORC clones from internal gitlab. `up` mounts $HOME/.ssh and
+            # configures the url.insteadOf rewrite, so by default ALIORC works
+            # for alibaba-inc users. External users without gitlab access can
+            # opt out with `PAIMON_ENABLE_ALIORC=OFF ./run.sh smoke`.
+            cmake --build build-centos7 -j "$(nproc)"
+            ctest --test-dir build-centos7 \
+                -R "paimon-lucene-index-test|paimon-global-index-test|paimon-tantivy-.*-test" \
+                --output-on-failure
+        '
+        ;;
+    down)
+        docker rm -f "${CONTAINER}" 2>/dev/null || true
+        ;;
+    help|*)
+        sed -n '2,/^$/p' "$0"  # print only the header comment (stops at the first blank line)
+        ;;
+esac
diff --git a/.devcontainer/devcontainer.json.template b/.devcontainer/devcontainer.json.template
index 856a89dc3..992f62aba 100644
--- a/.devcontainer/devcontainer.json.template
+++ b/.devcontainer/devcontainer.json.template
@@ -20,6 +20,10 @@
 // Adapted from Apache Iceberg C++
 // https://github.com/apache/iceberg-cpp/blob/main/.devcontainer/devcontainer.json.template
 
+// Default Paimon C++ Dev Container.
+// On Apple Silicon hosts this runs as native aarch64 Linux (fast).
+// For x86_64 verification, use the variant under .devcontainer/x86_64/.
+
 {
 	"name": "Paimon CPP Dev Container",
 	"build": {
@@ -34,16 +38,36 @@
 		"seccomp=unconfined",
 		"--privileged"
 	],
+	"features": {
+		"ghcr.io/devcontainers/features/rust:1": {
+			"version": "stable",
+			"profile": "default"
+		}
+	},
 	"mounts": [
-		"source=${localEnv:HOME}/.ssh,target=/home/paimon/.ssh,type=bind,readonly"
+		"source=${localEnv:HOME}/.ssh,target=/home/paimon/.ssh,type=bind,readonly",
+		"source=paimon-cargo-registry,target=/usr/local/cargo/registry,type=volume", // the rust feature sets CARGO_HOME=/usr/local/cargo
+		"source=paimon-cargo-git,target=/usr/local/cargo/git,type=volume",
+		"source=paimon-rust-target,target=${containerWorkspaceFolder}/third_party/tantivy_ffi/target,type=volume",
+		"source=paimon-build,target=${containerWorkspaceFolder}/build,type=volume",
+		"source=paimon-ccache,target=/home/paimon/.ccache,type=volume"
 	],
+	"postCreateCommand": "sudo chown -R paimon:paimon ${containerWorkspaceFolder}/build ${containerWorkspaceFolder}/third_party/tantivy_ffi/target /home/paimon/.ccache /usr/local/cargo/registry /usr/local/cargo/git 2>/dev/null || true; cargo install cbindgen --locked || true; rustup component add rust-src rust-analyzer clippy rustfmt || true",
 	"customizations": {
 		"vscode": {
 			"extensions": [
-				"eamodio.gitlens"
+				"eamodio.gitlens",
+				"rust-lang.rust-analyzer",
+				"vadimcn.vscode-lldb",
+				"llvm-vs-code-extensions.vscode-clangd",
+				"ms-vscode.cmake-tools",
+				"twxs.cmake"
 			],
 			"settings": {
-				"editor.formatOnSave": true
+				"editor.formatOnSave": true,
+				"rust-analyzer.linkedProjects": [
+					"third_party/tantivy_ffi/Cargo.toml"
+				]
 			}
 		}
 	}
diff --git a/.devcontainer/x86_64/devcontainer.json.template b/.devcontainer/x86_64/devcontainer.json.template
new file mode 100644
index 000000000..baa400990
--- /dev/null
+++ b/.devcontainer/x86_64/devcontainer.json.template
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// x86_64 variant of the Paimon CPP Dev Container.
+// On Apple Silicon hosts this runs under QEMU emulation (5-10x slower).
+// Use it ONLY for cross-architecture verification (Stage 11), not daily dev.
+//
+// Reuses the same Dockerfile as the default container; only the platform differs.
+//
+// Uses dedicated named volumes (suffix `-amd64`) so build/cargo cache do not
+// collide with the native aarch64 container.
+
+{
+	"name": "Paimon CPP Dev Container (x86_64 via QEMU)",
+	"build": {
+		"dockerfile": "../Dockerfile",
+		"options": [
+			"--platform=linux/amd64"
+		]
+	},
+	"runArgs": [
+		"--platform=linux/amd64",
+		"--ulimit=core=-1",
+		"--cap-add=SYS_ADMIN",
+		"--cap-add=SYS_PTRACE",
+		"--cap-add=PERFMON",
+		"--security-opt",
+		"seccomp=unconfined",
+		"--privileged"
+	],
+	"features": {
+		"ghcr.io/devcontainers/features/rust:1": {
+			"version": "stable",
+			"profile": "default"
+		}
+	},
+	"mounts": [
+		"source=${localEnv:HOME}/.ssh,target=/home/paimon/.ssh,type=bind,readonly",
+		"source=paimon-cargo-registry-amd64,target=/usr/local/cargo/registry,type=volume", // the rust feature sets CARGO_HOME=/usr/local/cargo
+		"source=paimon-cargo-git-amd64,target=/usr/local/cargo/git,type=volume",
+		"source=paimon-rust-target-amd64,target=${containerWorkspaceFolder}/third_party/tantivy_ffi/target,type=volume",
+		"source=paimon-build-amd64,target=${containerWorkspaceFolder}/build,type=volume",
+		"source=paimon-ccache-amd64,target=/home/paimon/.ccache,type=volume"
+	],
+	"postCreateCommand": "sudo chown -R paimon:paimon ${containerWorkspaceFolder}/build ${containerWorkspaceFolder}/third_party/tantivy_ffi/target /home/paimon/.ccache /usr/local/cargo/registry /usr/local/cargo/git 2>/dev/null || true; cargo install cbindgen --locked || true; rustup component add rust-src rust-analyzer clippy rustfmt || true",
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"eamodio.gitlens",
+				"rust-lang.rust-analyzer",
+				"vadimcn.vscode-lldb",
+				"llvm-vs-code-extensions.vscode-clangd",
+				"ms-vscode.cmake-tools"
+			],
+			"settings": {
+				"editor.formatOnSave": true
+			}
+		}
+	}
+}
diff --git a/.github/workflows/build_release.yaml b/.github/workflows/build_release.yaml
index 6e984bd19..152048cc9 100644
--- a/.github/workflows/build_release.yaml
+++ b/.github/workflows/build_release.yaml
@@ -44,6 +44,9 @@ jobs:
         uses: ./.github/actions/setup-ccache
         with:
           cache-key-prefix: ccache-clang-release
+      - name: Install Rust toolchain (tantivy-fts)
+        shell: bash
+        run: ci/scripts/setup_rust.sh
       - name: Build Paimon
         shell: bash
         env:
@@ -67,6 +70,9 @@ jobs:
         uses: ./.github/actions/setup-ccache
         with:
           cache-key-prefix: ccache-gcc-release
+      - name: Install Rust toolchain (tantivy-fts)
+        shell: bash
+        run: ci/scripts/setup_rust.sh
       - name: Build Paimon
         shell: bash
         env:
diff --git a/.github/workflows/clang_test.yaml b/.github/workflows/clang_test.yaml
index dd11dd725..824a5d45d 100644
--- a/.github/workflows/clang_test.yaml
+++ b/.github/workflows/clang_test.yaml
@@ -45,6 +45,9 @@ jobs:
         uses: ./.github/actions/setup-ccache
         with:
           cache-key-prefix: ccache-clang-test
+      - name: Install Rust toolchain (tantivy-fts)
+        shell: bash
+        run: ci/scripts/setup_rust.sh
       - name: Build Paimon
         shell: bash
         env:
diff --git a/.github/workflows/gcc_test.yaml b/.github/workflows/gcc_test.yaml
index e97954608..af6e0ddbd 100644
--- a/.github/workflows/gcc_test.yaml
+++ b/.github/workflows/gcc_test.yaml
@@ -44,6 +44,9 @@ jobs:
         uses: ./.github/actions/setup-ccache
         with:
           cache-key-prefix: ccache-gcc-test
+      - name: Install Rust toolchain (tantivy-fts)
+        shell: bash
+        run: ci/scripts/setup_rust.sh
       - name: Build Paimon
         shell: bash
         env:
diff --git a/.github/workflows/test_with_sanitizer.yaml b/.github/workflows/test_with_sanitizer.yaml
index b2b90d97b..a083a773b 100644
--- a/.github/workflows/test_with_sanitizer.yaml
+++ b/.github/workflows/test_with_sanitizer.yaml
@@ -44,6 +44,9 @@ jobs:
         uses: ./.github/actions/setup-ccache
         with:
           cache-key-prefix: ccache-sanitizer
+      - name: Install Rust toolchain (tantivy-fts)
+        shell: bash
+        run: ci/scripts/setup_rust.sh
       - name: Build Paimon
         shell: bash
         env:
diff --git a/.gitignore b/.gitignore
index 57e007860..0626cbc0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,8 +14,7 @@
 
 # Build directories
 build
-build-release
-build-debug
+build-*/
 output
 
 # IDE settings
@@ -24,8 +23,20 @@ output
 .cache
 
 # Devcontainer configuration
+# Track only *.template files (and subdirectory structure that contains them).
 .devcontainer/*
 !.devcontainer/*.template
+!.devcontainer/x86_64/
+.devcontainer/x86_64/*
+!.devcontainer/x86_64/*.template
+# CentOS 7 cross-build image: track raw Dockerfile + helper script (not
+# templated because the image is built from the repo root directly).
+!.devcontainer/centos7/
+.devcontainer/centos7/*
+!.devcontainer/centos7/Dockerfile
+!.devcontainer/centos7/run.sh
+# rustup-init.bin is a 20 MB prefetched binary — not source, don't commit.
+.devcontainer/centos7/rustup-init.bin
 
 # Temporary and backup files
 *~
@@ -48,3 +59,6 @@ FlameGraph
 
 # Third party dependencies archives
 third_party/*.tar.gz
+
+# Rust / Cargo build artifacts
+third_party/tantivy_ffi/target/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 154a38d97..b53964563 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.16)
+cmake_minimum_required(VERSION 3.22)
+# 3.22 is the minimum for Corrosion-rs (Rust-C++ FFI integration, see third_party/tantivy_ffi).
+# Ubuntu 24.04 ships CMake 3.28 by default; CentOS 8+/RHEL 9+ ship 3.20+.
+# To build on older distributions, see docs/dev/tantivy_fts_migration_plan.md.
 message(STATUS "Building using CMake version: ${CMAKE_VERSION}")
 
 # https://cmake.org/cmake/help/latest/policy/CMP0135.html
@@ -55,6 +58,8 @@ option(PAIMON_ENABLE_LANCE "Whether to enable lance file format" OFF)
 option(PAIMON_ENABLE_JINDO "Whether to enable jindo file system" OFF)
 option(PAIMON_ENABLE_LUMINA "Whether to enable lumina vector index" OFF)
 option(PAIMON_ENABLE_LUCENE "Whether to enable lucene index" OFF)
+option(PAIMON_ENABLE_TANTIVY
+       "Whether to enable tantivy-fulltext global index (Rust FFI, experimental)" ON)
 if(PAIMON_ENABLE_ORC)
     add_definitions(-DPAIMON_ENABLE_ORC)
 endif()
@@ -87,6 +92,10 @@ if(PAIMON_ENABLE_LUCENE)
     add_definitions(-DPAIMON_ENABLE_LUCENE)
 endif()
 
+if(PAIMON_ENABLE_TANTIVY)
+    add_definitions(-DPAIMON_ENABLE_TANTIVY)
+endif()
+
 add_definitions(-DSNAPPY_CODEC_AVAILABLE)
 add_definitions(-DZSTD_CODEC_AVAILABLE)
 add_definitions(-DRAPIDJSON_HAS_STDSTRING)
@@ -303,6 +312,21 @@ if(PAIMON_ENABLE_LUMINA)
             DESTINATION ${CMAKE_INSTALL_LIBDIR})
 endif()
 
+# ---- tantivy-fulltext Rust FFI via Corrosion-rs --------------------------------
+# See docs/dev/tantivy_fts_migration_plan.md Stage 1.
+#
+# Corrosion wraps the Cargo crate as a CMake target named `paimon_tantivy_ffi`.
+# `corrosion_experimental_cbindgen` runs cbindgen from CMake and writes the
+# header to a stable path; it also adds that path to the target's INTERFACE
+# include dirs so C++ consumers pick it up via target_link_libraries.
+if(PAIMON_ENABLE_TANTIVY)
+    include(CorrosionFetch)
+    corrosion_import_crate(MANIFEST_PATH third_party/tantivy_ffi/Cargo.toml CRATES
+                           paimon_tantivy_ffi)
+    corrosion_experimental_cbindgen(TARGET paimon_tantivy_ffi HEADER_NAME
+                                    paimon_tantivy_ffi.h)
+endif()
+
 if(PAIMON_ENABLE_LUCENE)
     set(PAIMON_DICT_DEST "share/paimon/dict")
 
@@ -491,6 +515,9 @@ add_subdirectory(src/paimon/format/avro)
 add_subdirectory(src/paimon/format/lance)
 add_subdirectory(src/paimon/global_index/lumina)
 add_subdirectory(src/paimon/global_index/lucene)
+if(PAIMON_ENABLE_TANTIVY)
+    add_subdirectory(src/paimon/global_index/tantivy)
+endif()
 add_subdirectory(src/paimon/testing/mock)
 add_subdirectory(src/paimon/testing/utils)
 add_subdirectory(test/inte)
diff --git a/ci/scripts/build_paimon.sh b/ci/scripts/build_paimon.sh
index f1d0423de..145711f2f 100755
--- a/ci/scripts/build_paimon.sh
+++ b/ci/scripts/build_paimon.sh
@@ -36,6 +36,7 @@ pushd ${build_dir}
 
 ENABLE_LUMINA="ON"
 ENABLE_LANCE="ON"
+ENABLE_TANTIVY="ON"
 if [[ "${CC:-}" == *"gcc-8"* ]] || [[ "${CXX:-}" == *"g++-8"* ]]; then
     ENABLE_LUMINA="OFF" # Lumina is only supported on GCC 9 or higher.
     ENABLE_LANCE="OFF"
@@ -43,6 +44,7 @@ if [[ "${CC:-}" == *"gcc-8"* ]] || [[ "${CXX:-}" == *"g++-8"* ]]; then
     # which requires a higher version of glibc,
     # but Ubuntu 22.04 and above no longer ships with gcc-8 by default.
     # Consider supporting Lance from source compilation in the future
+    ENABLE_TANTIVY="OFF" # tantivy-fts (Rust FFI) is not built on the gcc-8 image.
 fi
 
 CMAKE_ARGS=(
@@ -53,6 +55,7 @@ CMAKE_ARGS=(
     "-DPAIMON_ENABLE_JINDO=ON"
     "-DPAIMON_ENABLE_LUMINA=${ENABLE_LUMINA}"
     "-DPAIMON_ENABLE_LUCENE=ON"
+    "-DPAIMON_ENABLE_TANTIVY=${ENABLE_TANTIVY}"
 )
 
 if [[ "${enable_sanitizer}" == "true" ]]; then
diff --git a/ci/scripts/setup_rust.sh b/ci/scripts/setup_rust.sh
new file mode 100755
index 000000000..99b63ea05
--- /dev/null
+++ b/ci/scripts/setup_rust.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+#
+# Copyright 2026-present Alibaba Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Install the Rust toolchain + cbindgen required to build the
+# tantivy-fts FFI crate (third_party/tantivy_ffi) from CI.
+#
+# The dev container (see .devcontainer/) already has these preinstalled;
+# this script is for the GitHub Actions runners. Called by
+# .github/workflows/gcc_test.yaml and test_with_sanitizer.yaml before
+# ci/scripts/build_paimon.sh.
+#
+# Idempotent: a second invocation is a no-op when the tools already exist.
+
+set -eux
+
+RUSTUP_VERSION=${RUSTUP_VERSION:-1.29.0}
+# 1.88.0 is the minimum required by transitive crates (e.g. time 0.3.47).
+RUST_VERSION=${RUST_VERSION:-1.88.0}
+CBINDGEN_VERSION=${CBINDGEN_VERSION:-0.29.2}
+
+# Install rustup + default toolchain if cargo isn't on PATH yet.
+if ! command -v cargo >/dev/null 2>&1; then
+    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \
+        | sh -s -- -y --default-toolchain "${RUST_VERSION}" --profile minimal --no-modify-path
+fi
+
+# Export for the remainder of the CI job.
+export PATH="${HOME}/.cargo/bin:${PATH}"
+echo "${HOME}/.cargo/bin" >> "${GITHUB_PATH:-/dev/null}" || true
+
+rustup toolchain install "${RUST_VERSION}" --profile minimal
+rustup default "${RUST_VERSION}"
+rustup component add rustfmt clippy
+
+# cbindgen is used by the crate's build.rs to emit the C header that the
+# C++ side includes. Corrosion will also run cbindgen at CMake configure
+# time; both paths need it available.
+if ! command -v cbindgen >/dev/null 2>&1; then
+    cargo install cbindgen --version "${CBINDGEN_VERSION}" --locked
+fi
+
+rustc --version
+cargo --version
+cbindgen --version
diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake
index d6d3b4a58..0b97943bf 100644
--- a/cmake_modules/BuildUtils.cmake
+++ b/cmake_modules/BuildUtils.cmake
@@ -94,6 +94,7 @@ function(add_paimon_lib LIB_NAME)
     endif()
     # Necessary to make static linking into other shared libraries work properly
     set_property(TARGET ${LIB_NAME}_objlib PROPERTY POSITION_INDEPENDENT_CODE 1)
+    target_link_libraries(${LIB_NAME}_objlib PUBLIC paimon_sanitizer_flags)
     if(ARG_DEPENDENCIES)
         # In static-only builds, some dependency names are still declared as
         # *_shared. Map them to *_static when the shared target is unavailable.
@@ -181,8 +182,14 @@ function(add_paimon_lib LIB_NAME)
                                 PRIVATE
                                 -Wl,--exclude-libs,ALL
                                 -Wl,-Bsymbolic
-                                -Wl,-z,defs
                                 -Wl,--gc-sections)
+            # -z defs (--no-undefined) rejects the __asan_*/__ubsan_*/__tsan_* symbols
+            # that sanitizer-instrumented shared libraries legitimately leave undefined
+            # (they are resolved at load time from the executable's sanitizer
+            # runtime). Only enforce it for non-sanitizer builds.
+            if(NOT PAIMON_USE_ASAN AND NOT PAIMON_USE_UBSAN AND NOT PAIMON_USE_TSAN)
+                target_link_options(${LIB_NAME}_shared PRIVATE -Wl,-z,defs)
+            endif()
         endif()
 
         install(TARGETS ${LIB_NAME}_shared ${INSTALL_IS_OPTIONAL}
@@ -334,6 +341,10 @@ function(add_test_case REL_TEST_NAME)
         target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors)
     endif()
     target_compile_options(${TEST_NAME} PRIVATE -fno-access-control)
+    # Test sources write raw bytes as char/vector<char> lists like {1, -1, ...}; on aarch64
+    # char is unsigned by default, which triggers -Wnarrowing. Disable it for tests only to
+    # avoid littering them with static_cast<char>(-1); production code (src/paimon/...) keeps it.
+    target_compile_options(${TEST_NAME} PRIVATE -Wno-narrowing)
 
     add_test(${TEST_NAME}
              ${BUILD_SUPPORT_DIR}/run-test.sh
diff --git a/cmake_modules/CorrosionFetch.cmake b/cmake_modules/CorrosionFetch.cmake
new file mode 100644
index 000000000..fff8fe655
--- /dev/null
+++ b/cmake_modules/CorrosionFetch.cmake
@@ -0,0 +1,75 @@
+# Copyright 2026-present Alibaba Inc.
+#
+# Licensed under the Apache License, Version 2.0.
+#
+# Pull Corrosion-rs via FetchContent so we can import Cargo crates as CMake
+# targets. Used to bring in third_party/tantivy_ffi for the tantivy-fulltext
+# global index (see docs/dev/tantivy_fts_migration_plan.md).
+#
+# Pinned to v0.5.2 via PAIMON_CORROSION_TAG below. Requires CMake >= 3.22.
+
+include(FetchContent)
+
+# Corrosion does heavy cargo/rustc work at configure+build time; pin tag for
+# reproducibility and allow override via CMake cache variable (-D) for offline builds.
+set(PAIMON_CORROSION_TAG
+    "v0.5.2"
+    CACHE STRING "Git tag of corrosion-rs to fetch; change only when upgrading. v0.5.1+
+    is required for rustup >= 1.28 whose `rustup toolchain list --verbose`
+    output format broke v0.5.0's FindRust.cmake regex.")
+
+set(PAIMON_CORROSION_REPO
+    "https://github.com/corrosion-rs/corrosion.git"
+    CACHE STRING "Override to a private mirror for offline / firewalled builds.")
+
+# Help Corrosion find rustc/cargo when CMake is invoked without a login shell
+# or when rustup is installed to a non-default location. We try, in order:
+#   1. Existing Rust_COMPILER cache variable (user override)
+#   2. $CARGO_HOME/bin/rustc (when env var set)
+#   3. $HOME/.cargo/bin/rustc (rustup's default install)
+#   4. Fallback: let Corrosion's FindRust.cmake try its own detection
+function(_paimon_find_rustup_bin _var _name)
+    if(DEFINED ENV{CARGO_HOME} AND EXISTS "$ENV{CARGO_HOME}/bin/${_name}")
+        set(${_var}
+            "$ENV{CARGO_HOME}/bin/${_name}"
+            PARENT_SCOPE)
+    elseif(DEFINED ENV{HOME} AND EXISTS "$ENV{HOME}/.cargo/bin/${_name}")
+        set(${_var}
+            "$ENV{HOME}/.cargo/bin/${_name}"
+            PARENT_SCOPE)
+    endif()
+endfunction()
+
+if(NOT DEFINED Rust_COMPILER OR Rust_COMPILER STREQUAL "")
+    _paimon_find_rustup_bin(_rustc_path rustc)
+    if(_rustc_path)
+        set(Rust_COMPILER
+            "${_rustc_path}"
+            CACHE FILEPATH "rustc")
+    endif()
+endif()
+if(NOT DEFINED Rust_CARGO OR Rust_CARGO STREQUAL "")
+    _paimon_find_rustup_bin(_cargo_path cargo)
+    if(_cargo_path)
+        set(Rust_CARGO
+            "${_cargo_path}"
+            CACHE FILEPATH "cargo")
+    endif()
+endif()
+# Corrosion reads `rustup which rustc`; from a non-login shell $PATH may miss
+# ~/.cargo/bin, so prepend it. Literal entry match: paths may hold regex chars.
+if(DEFINED Rust_COMPILER)
+    get_filename_component(_rustup_bin_dir "${Rust_COMPILER}" DIRECTORY)
+    string(FIND ":$ENV{PATH}:" ":${_rustup_bin_dir}:" _rustup_path_idx)
+    if(_rustup_bin_dir AND _rustup_path_idx EQUAL -1)
+        set(ENV{PATH} "${_rustup_bin_dir}:$ENV{PATH}")
+    endif()
+endif()
+message(STATUS "Corrosion: Rust_COMPILER=${Rust_COMPILER}")
+message(STATUS "Corrosion: Rust_CARGO=${Rust_CARGO}")
+
+fetchcontent_declare(Corrosion
+                     GIT_REPOSITORY "${PAIMON_CORROSION_REPO}"
+                     GIT_TAG "${PAIMON_CORROSION_TAG}"
+                     GIT_SHALLOW TRUE)
+fetchcontent_makeavailable(Corrosion)
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 271011a0d..428814aeb 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -744,6 +744,11 @@ macro(build_lucene)
         "-DBoost_INCLUDE_DIR=${BOOST_INCLUDE_DIR}"
         "-DBoost_LIBRARY_DIR=${BOOST_LIBRARY_DIR}"
         "-DBOOST_ROOT=${BOOST_INSTALL}"
+        # Force FindBoost module mode only; ignore system BoostConfig.cmake and
+        # system library paths so lucene_ep links against our vendored boost 1.66,
+        # not a system-installed newer version (e.g. 1.83) with ABI differences.
+        "-DBoost_NO_BOOST_CMAKE=ON"
+        "-DBoost_NO_SYSTEM_PATHS=ON"
         "-DBoost_CHRONO_FOUND=TRUE"
         "-DBoost_THREAD_FOUND=TRUE"
         "-DZLIB_INCLUDE_DIRS=${ZLIB_INCLUDE_DIR}"
@@ -1879,5 +1884,9 @@ endif()
 if(PAIMON_ENABLE_LUCENE)
     build_boost()
     build_lucene()
+endif()
+# jieba (dict + headers) is needed by BOTH lucene-fts and the tantivy jieba
+# tokenizer; build it whenever either backend is on, not only under lucene.
+if(PAIMON_ENABLE_LUCENE OR PAIMON_ENABLE_TANTIVY)
     build_jieba()
 endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 34818475e..787bfb714 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-cmake_minimum_required(VERSION 3.16)
+cmake_minimum_required(VERSION 3.22)
 
 project(example)
 
diff --git a/include/paimon/predicate/full_text_search.h b/include/paimon/predicate/full_text_search.h
index 93425050c..bbf82eae1 100644
--- a/include/paimon/predicate/full_text_search.h
+++ b/include/paimon/predicate/full_text_search.h
@@ -55,13 +55,24 @@ struct PAIMON_EXPORT FullTextSearch {
 
     std::shared_ptr<FullTextSearch> ReplacePreFilter(
         const std::optional<RoaringBitmap64>& _pre_filter) const {
-        return std::make_shared<FullTextSearch>(field_name, limit, query, search_type, _pre_filter);
+        auto replaced =
+            std::make_shared<FullTextSearch>(field_name, limit, query, search_type, _pre_filter);
+        // `with_score` / `min_score` are not constructor args (they have in-class
+        // defaults), so carry them over explicitly — otherwise rewrapping the
+        // pre_filter (e.g. in OffsetGlobalIndexReader) would silently reset a
+        // scored / min_score query back to the unscored default.
+        replaced->with_score = with_score;
+        replaced->min_score = min_score;
+        return replaced;
     }
 
     /// Name of the field to search within (must be a full-text indexed field).
     std::string field_name;
-    /// Maximum number of documents to return. If set, limit ordered by top scores. Otherwise, no
-    /// score return.
+    /// Maximum number of documents to return.
+    ///
+    /// **v0.2 contract change**: `limit` is now purely a truncation switch — it is orthogonal
+    /// to `with_score`. Set `with_score = true` if you want BM25 scores in the result; setting
+    /// `limit >= 0` no longer implies scoring.
     std::optional<int32_t> limit;
     /// The query string to search for. The interpretation depends on search_type:
     ///
@@ -85,5 +96,26 @@ struct PAIMON_EXPORT FullTextSearch {
     /// Only rows whose global row ID is present in `pre_filter` will be included during search.
     /// If not set, all rows will be included.
     std::optional<RoaringBitmap64> pre_filter;
+    /// Whether to compute and return BM25 relevance scores.
+    ///
+    /// The 4-path matrix:
+    /// - `with_score=false, limit=nullopt` → BitmapGlobalIndexResult (all rows, no score)
+    /// - `with_score=false, limit=N`       → BitmapGlobalIndexResult (any N matches, unscored)
+    /// - `with_score=true,  limit=nullopt` → BitmapScoredGlobalIndexResult (all rows + all scores)
+    /// - `with_score=true,  limit=N`       → BitmapScoredGlobalIndexResult (top-N by BM25 + scores)
+    ///
+    /// For plain `LIMIT N` without ORDER BY (the common case in SR's predicate
+    /// pushdown) set `with_score=false, limit=N` — the unscored fast path. If
+    /// you want top-N by relevance, use `with_score=true, limit=N` and drop the
+    /// scores in the caller if not needed.
+    ///
+    /// Default is `false` to avoid silent score computation overhead for callers that don't need
+    /// it.
+    bool with_score = false;
+    /// Minimum BM25 score threshold (exclusive). Results with score ≤ this value are excluded.
+    /// Only meaningful when `with_score = true`; setting `limit` alone does not enable scoring.
+    /// Applied before truncation so low-score documents never occupy limit slots.
+    /// Default is nullopt (no threshold filtering).
+    std::optional<float> min_score;
 };
 }  // namespace paimon
diff --git a/scripts/tantivy_smoke.sh b/scripts/tantivy_smoke.sh
new file mode 100755
index 000000000..4a9255716
--- /dev/null
+++ b/scripts/tantivy_smoke.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+# tantivy-fts 迁移期 smoke 测试脚本。
+#
+# 用途: 在 Dev Container 内一键回归 lucene-fts + tantivy-fts 相关测试。
+# 设计哲学: 命令行越拼越长容易出错,封装成一个脚本各 Stage 持续维护。
+#
+# 用法:
+#   ./scripts/tantivy_smoke.sh                # default: release, no sanitizer
+#   ./scripts/tantivy_smoke.sh --asan         # ASAN 构建
+#   ./scripts/tantivy_smoke.sh --tsan         # TSAN 构建
+#   ./scripts/tantivy_smoke.sh --configure    # 仅 cmake configure
+#   ./scripts/tantivy_smoke.sh --build        # 仅 cmake build (跳过 configure)
+#   ./scripts/tantivy_smoke.sh --tests-only   # 仅 ctest (假定已 build 过)
+#
+# 维护约定:
+#   - Stage 1+ 每加一个新 ctest target 就更新下面 TEST_REGEX
+#   - Stage 11 加 --with-asan / --with-tsan 完整路径
+
+set -e
+
+CMAKE_BUILD_TYPE="Release"
+USE_ASAN="OFF"
+USE_TSAN="OFF"
+BUILD_DIR_SUFFIX=""
+DO_CONFIGURE=1
+DO_BUILD=1
+DO_TEST=1
+
+# ctest regex: Stage acceptance runs only this set of tests, not the full ctest suite (~531s is too slow).
+# Contents = lucene-fts reference baseline + the tantivy-fts targets added by the current and earlier Stages.
+# Append new targets here as each Stage completes. Only Stage 11 should run the full ctest suite.
+TEST_REGEX='paimon-lucene-index-test|paimon-global-index-test|paimon-tantivy-smoke-test|paimon-tantivy-ffi-test|paimon-tantivy-tokenizer-test|paimon-tantivy-writer-test|paimon-tantivy-reader-test|paimon-tantivy-filter-limit-test|paimon-tantivy-index-test|paimon-tantivy-lucene-coexist-test|paimon-tantivy-equivalence-test|paimon-tantivy-streaming-test|paimon-tantivy-java-compat-test'
+
+while [ $# -gt 0 ]; do  # consume one CLI flag per iteration
+    case "$1" in
+        --asan)        USE_ASAN="ON";  CMAKE_BUILD_TYPE="Debug"; BUILD_DIR_SUFFIX="-asan" ;;
+        --tsan)        USE_TSAN="ON";  CMAKE_BUILD_TYPE="Debug"; BUILD_DIR_SUFFIX="-tsan" ;;
+        --configure)   DO_BUILD=0; DO_TEST=0 ;;
+        --build)       DO_CONFIGURE=0; DO_TEST=0 ;;
+        --tests-only)  DO_CONFIGURE=0; DO_BUILD=0 ;;
+        -h|--help)     sed -n '2,17p' "$0"; exit 0 ;;  # header comment is script lines 2-17
+        *)             echo "Unknown option: $1" >&2; exit 2 ;;  # diagnostics go to stderr
+    esac
+    shift
+done
+
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+BUILD_DIR="${REPO_ROOT}/build${BUILD_DIR_SUFFIX}"
+
+cd "${REPO_ROOT}"
+
+if [ "${DO_CONFIGURE}" = "1" ]; then
+    echo "==> cmake configure (${BUILD_DIR})"
+    cmake -S . -B "${BUILD_DIR}" \
+        -DCMAKE_BUILD_TYPE="${CMAKE_BUILD_TYPE}" \
+        -DPAIMON_BUILD_TESTS=ON \
+        -DPAIMON_USE_ASAN="${USE_ASAN}" \
+        -DPAIMON_USE_TSAN="${USE_TSAN}" \
+        -DPAIMON_ENABLE_FSLIB=OFF \
+        -DPAIMON_ENABLE_LUMINA=OFF \
+        -DPAIMON_ENABLE_LANCE=OFF \
+        -DPAIMON_ENABLE_JINDO=OFF \
+        -DPAIMON_ENABLE_LUCENE=ON \
+        -DPAIMON_ENABLE_ORC=ON \
+        -DPAIMON_ENABLE_ALIORC=ON \
+        -DPAIMON_ENABLE_AVRO=ON \
+        -G Ninja
+fi
+
+if [ "${DO_BUILD}" = "1" ]; then
+    echo "==> cmake build"
+    cmake --build "${BUILD_DIR}" -j
+fi
+
+if [ "${DO_TEST}" = "1" ]; then
+    echo "==> ctest (${TEST_REGEX})"
+    ctest --test-dir "${BUILD_DIR}" -R "${TEST_REGEX}" --output-on-failure
+fi
+
+echo "==> tantivy_smoke.sh DONE"
diff --git a/src/paimon/common/data/binary_row_test.cpp b/src/paimon/common/data/binary_row_test.cpp
index acfc259ce..34694c3a9 100644
--- a/src/paimon/common/data/binary_row_test.cpp
+++ b/src/paimon/common/data/binary_row_test.cpp
@@ -338,8 +338,9 @@ TEST_F(BinaryRowTest, TestBinary) {
     auto pool = GetDefaultPool();
     BinaryRow row(2);
     BinaryRowWriter writer(&row, 0, pool.get());
-    char chars1[3] = {1, -1, 5};
-    char chars2[8] = {1, -1, 5, 5, 1, 5, 1, 5};
+    // explicit cast to avoid -Wnarrowing on platforms where char is unsigned (e.g. aarch64)
+    char chars1[3] = {1, static_cast<char>(-1), 5};
+    char chars2[8] = {1, static_cast<char>(-1), 5, 5, 1, 5, 1, 5};
     std::string str1(chars1, 3);
     std::string str2(chars2, 8);
     Bytes bytes1(str1, pool.get());
diff --git a/src/paimon/common/global_index/offset_global_index_reader_test.cpp b/src/paimon/common/global_index/offset_global_index_reader_test.cpp
index d4996bceb..0c4e4c1f5 100644
--- a/src/paimon/common/global_index/offset_global_index_reader_test.cpp
+++ b/src/paimon/common/global_index/offset_global_index_reader_test.cpp
@@ -22,6 +22,7 @@
 #include "gtest/gtest.h"
 #include "paimon/global_index/bitmap_global_index_result.h"
 #include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/predicate/full_text_search.h"
 #include "paimon/predicate/literal.h"
 #include "paimon/testing/utils/testharness.h"
 #include "paimon/utils/roaring_bitmap64.h"
@@ -112,9 +113,14 @@ class FakeGlobalIndexReader : public GlobalIndexReader {
 
     Result<std::shared_ptr<GlobalIndexResult>> VisitFullTextSearch(
         const std::shared_ptr<FullTextSearch>& full_text_search) override {
+        captured_fts = full_text_search;
         return MakeResult(default_result_);
     }
 
+    // Captures the (possibly pre_filter-rewritten) FullTextSearch the offset
+    // reader forwarded, so tests can assert field propagation.
+    std::shared_ptr<FullTextSearch> captured_fts;
+
     bool IsThreadSafe() const override {
         return true;
     }
@@ -331,6 +337,37 @@ TEST_F(OffsetGlobalIndexReaderTest, TestVisitFullTextSearchWithOffset) {
     CheckResult(result, {10, 13, 15});
 }
 
+TEST_F(OffsetGlobalIndexReaderTest, TestVisitFullTextSearchPreservesScoreFlags) {
+    // Regression (review finding #2): rewriting the pre_filter global->local ids
+    // in the offset reader must NOT drop with_score / min_score. Before the fix,
+    // FullTextSearch::ReplacePreFilter rebuilt via the 5-arg ctor and silently
+    // reset both back to their defaults, turning a scored / min_score query
+    // unscored as soon as it crossed any offset shard.
+    auto fake_reader = std::make_shared<FakeGlobalIndexReader>();
+    fake_reader->SetDefaultResult({0, 3, 5});
+    auto offset_reader = std::make_shared<OffsetGlobalIndexReader>(fake_reader, 10);
+
+    // pre_filter must be set so the offset reader takes the rewrite path.
+    auto fts = std::make_shared<FullTextSearch>(
+        "f0", /*limit=*/7, "q", FullTextSearch::SearchType::MATCH_ALL,
+        /*pre_filter=*/RoaringBitmap64::From({10l, 13l, 15l}));
+    fts->with_score = true;
+    fts->min_score = 1.5f;
+
+    ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitFullTextSearch(fts));
+    CheckResult(result, {10, 13, 15});
+
+    ASSERT_TRUE(fake_reader->captured_fts);
+    EXPECT_TRUE(fake_reader->captured_fts->with_score)
+        << "with_score must survive the pre_filter rewrite";
+    ASSERT_TRUE(fake_reader->captured_fts->min_score.has_value())
+        << "min_score must survive the pre_filter rewrite";
+    EXPECT_FLOAT_EQ(fake_reader->captured_fts->min_score.value(), 1.5f);
+    // limit and the offset-rewritten local pre_filter should still be present.
+    EXPECT_EQ(fake_reader->captured_fts->limit, std::optional<int32_t>(7));
+    ASSERT_TRUE(fake_reader->captured_fts->pre_filter.has_value());
+}
+
 TEST_F(OffsetGlobalIndexReaderTest, TestVisitVectorSearchWithOffset) {
     auto fake_reader = std::make_shared<FakeGlobalIndexReader>();
     fake_reader->SetVectorSearchResult({0, 2, 5}, {0.9f, 0.7f, 0.3f});
diff --git a/src/paimon/global_index/tantivy/CMakeLists.txt b/src/paimon/global_index/tantivy/CMakeLists.txt
new file mode 100644
index 000000000..6039bdde5
--- /dev/null
+++ b/src/paimon/global_index/tantivy/CMakeLists.txt
@@ -0,0 +1,277 @@
+# Copyright 2026-present Alibaba Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# tantivy-fulltext global index (Rust FFI). See docs/dev/tantivy_fts_migration_plan.md.
+# Stage 4 grows the support lib with the C++ writer wrapper + writer test.
+
+if(NOT PAIMON_ENABLE_TANTIVY)
+    return()
+endif()
+
+set(PAIMON_TANTIVY_SUPPORT_SRCS
+    tantivy_ffi_log.cpp
+    tantivy_archive_layout.cpp
+    tantivy_stream_ctx.cpp
+    tantivy_global_index_writer.cpp
+    tantivy_global_index_reader.cpp
+    tantivy_global_index.cpp
+    tantivy_global_index_factory.cpp)
+
+add_paimon_lib(paimon_tantivy_support
+               SOURCES
+               ${PAIMON_TANTIVY_SUPPORT_SRCS}
+               DEPENDENCIES
+               paimon_shared
+               paimon_tantivy_ffi
+               STATIC_LINK_LIBS
+               paimon_tantivy_ffi
+               arrow
+               glog
+               fmt
+               SHARED_LINK_LIBS
+               paimon_shared
+               SHARED_LINK_FLAGS
+               ${PAIMON_VERSION_SCRIPT_FLAGS})
+# Corrosion's paimon_tantivy_ffi target carries INTERFACE_INCLUDE_DIRECTORIES
+# (cbindgen-generated header path). The objlib in add_paimon_lib doesn't link
+# against deps, so its compile step misses include dirs. Wire them explicitly.
+target_link_libraries(paimon_tantivy_support_objlib PUBLIC paimon_tantivy_ffi)
+
+if(PAIMON_BUILD_TESTS)
+    add_paimon_test(tantivy_smoke_test
+                    SOURCES
+                    tantivy_smoke_test.cpp
+                    STATIC_LINK_LIBS
+                    paimon_tantivy_ffi
+                    ${GTEST_LINK_TOOLCHAIN})
+
+    add_paimon_test(tantivy_ffi_test
+                    SOURCES
+                    tantivy_ffi_test.cpp
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    "-Wl,--whole-archive"
+                    paimon_tantivy_support_static
+                    "-Wl,--no-whole-archive"
+                    paimon_tantivy_ffi
+                    glog
+                    fmt
+                    ${GTEST_LINK_TOOLCHAIN})
+
+    # Golden-sample tokenizer diff (cppjieba vs jieba-rs). Links against the
+    # lucene index module to reuse JiebaTokenizer::CutWithMode + Normalize, so it
+    # can only be built when lucene-fts is enabled (the C++ JiebaTokenizer lives
+    # in the lucene module). Guarded so the default LUCENE=OFF / TANTIVY=ON build
+    # doesn't try to link the non-existent paimon_lucene_index_static.
+    # Note: we mirror the lucene-fts test's link line (see lucene/CMakeLists.txt)
+    # rather than using the `jieba` imported target, whose INTERFACE_INCLUDE
+    # concatenates two paths in one string (upstream quirk).
+    if(PAIMON_ENABLE_LUCENE)
+        add_paimon_test(tantivy_tokenizer_test
+                        SOURCES
+                        tantivy_tokenizer_test.cpp
+                        EXTRA_INCLUDES
+                        ${LUCENE_INCLUDE_DIR}
+                        STATIC_LINK_LIBS
+                        paimon_shared
+                        test_utils_static
+                        "-Wl,--whole-archive"
+                        paimon_local_file_system_static
+                        paimon_lucene_index_static
+                        "-Wl,--no-whole-archive"
+                        paimon_tantivy_ffi
+                        ${GTEST_LINK_TOOLCHAIN})
+        target_compile_definitions(paimon-tantivy-tokenizer-test
+                                   PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}"
+                                           PAIMON_TANTIVY_GOLDEN_DIR="${CMAKE_SOURCE_DIR}/test/test_data/tokenizer_golden"
+        )
+        target_include_directories(paimon-tantivy-tokenizer-test SYSTEM
+                                   PRIVATE ${JIEBA_INCLUDE_DIR} ${JIEBA_DICT_DIR})
+    endif()
+
+    # Stage 4 — Writer test. Builds an Arrow batch, runs the writer through
+    # GlobalIndexFileManager + LocalFileSystem, then validates the packed
+    # on-disk format. Reader round-trip lives in Stage 6.
+    add_paimon_test(tantivy_writer_test
+                    SOURCES
+                    tantivy_writer_test.cpp
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    test_utils_static
+                    "-Wl,--whole-archive"
+                    paimon_local_file_system_static
+                    paimon_tantivy_support_static
+                    "-Wl,--no-whole-archive"
+                    paimon_tantivy_ffi
+                    arrow
+                    glog
+                    fmt
+                    ${GTEST_LINK_TOOLCHAIN})
+    target_compile_definitions(paimon-tantivy-writer-test
+                               PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}")
+
+    # Stage 6 — Reader + 5 query types end-to-end.
+    add_paimon_test(tantivy_reader_test
+                    SOURCES
+                    tantivy_reader_test.cpp
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    test_utils_static
+                    "-Wl,--whole-archive"
+                    paimon_local_file_system_static
+                    paimon_tantivy_support_static
+                    "-Wl,--no-whole-archive"
+                    paimon_tantivy_ffi
+                    arrow
+                    glog
+                    fmt
+                    ${GTEST_LINK_TOOLCHAIN})
+    target_compile_definitions(paimon-tantivy-reader-test
+                               PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}")
+
+    # Stage 7 — limit + pre_filter + scoring.
+    add_paimon_test(tantivy_filter_limit_test
+                    SOURCES
+                    tantivy_filter_limit_test.cpp
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    test_utils_static
+                    "-Wl,--whole-archive"
+                    paimon_local_file_system_static
+                    paimon_tantivy_support_static
+                    "-Wl,--no-whole-archive"
+                    paimon_tantivy_ffi
+                    arrow
+                    glog
+                    fmt
+                    ${GTEST_LINK_TOOLCHAIN})
+    target_compile_definitions(paimon-tantivy-filter-limit-test
+                               PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}")
+
+    # Java → C++ cross-read test. Fixture produced by paimon-java's
+    # `TantivyIndexFixtureGen` (see docs/dev/tantivy_java_cross_read_plan.md)
+    # and checked in under test/test_data/java_tantivy_fixtures/.
+    add_paimon_test(tantivy_java_compat_test
+                    SOURCES
+                    tantivy_java_compat_test.cpp
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    test_utils_static
+                    "-Wl,--whole-archive"
+                    paimon_local_file_system_static
+                    paimon_tantivy_support_static
+                    "-Wl,--no-whole-archive"
+                    paimon_tantivy_ffi
+                    arrow
+                    glog
+                    fmt
+                    ${GTEST_LINK_TOOLCHAIN})
+    target_compile_definitions(paimon-tantivy-java-compat-test
+                               PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}"
+                                       PAIMON_TANTIVY_JAVA_FIXTURE_DIR="${CMAKE_SOURCE_DIR}/test/test_data/java_tantivy_fixtures"
+                                       PAIMON_TANTIVY_CPP_FIXTURE_DIR="${CMAKE_SOURCE_DIR}/test/test_data/cpp_tantivy_fixtures"
+    )
+
+    # K4 — V3 streaming reader + W1 streaming writer integration coverage:
+    # ParseArchiveHeader fuzz, concurrent query on shared reader, concurrent
+    # reader create+drop lifecycle, streaming benchmark log.
+    add_paimon_test(tantivy_streaming_test
+                    SOURCES
+                    tantivy_streaming_test.cpp
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    test_utils_static
+                    "-Wl,--whole-archive"
+                    paimon_local_file_system_static
+                    paimon_tantivy_support_static
+                    "-Wl,--no-whole-archive"
+                    paimon_tantivy_ffi
+                    arrow
+                    glog
+                    fmt
+                    ${GTEST_LINK_TOOLCHAIN})
+    target_compile_definitions(paimon-tantivy-streaming-test
+                               PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}")
+
+    # Stage 8 — TantivyGlobalIndex + factory + end-to-end integration test.
+    # `--whole-archive` is required so the static REGISTER_PAIMON_FACTORY
+    # symbols are not stripped out of the test binary.
+    add_paimon_test(tantivy_index_test
+                    SOURCES
+                    tantivy_index_test.cpp
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    test_utils_static
+                    "-Wl,--whole-archive"
+                    paimon_local_file_system_static
+                    paimon_tantivy_support_static
+                    "-Wl,--no-whole-archive"
+                    paimon_tantivy_ffi
+                    arrow
+                    glog
+                    fmt
+                    ${GTEST_LINK_TOOLCHAIN})
+    target_compile_definitions(paimon-tantivy-index-test
+                               PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}")
+
+    # Stage 9 — Cross-implementation coexistence. Links against BOTH the
+    # lucene and tantivy support static libs to verify they resolve their
+    # `REGISTER_PAIMON_FACTORY` registrations side by side and don't
+    # collide on shared symbols. Only built when lucene-fts is enabled.
+    if(PAIMON_ENABLE_LUCENE)
+        add_paimon_test(tantivy_lucene_coexist_test
+                        SOURCES
+                        tantivy_lucene_coexist_test.cpp
+                        EXTRA_INCLUDES
+                        ${LUCENE_INCLUDE_DIR}
+                        STATIC_LINK_LIBS
+                        paimon_shared
+                        test_utils_static
+                        "-Wl,--whole-archive"
+                        paimon_local_file_system_static
+                        paimon_lucene_index_static
+                        paimon_tantivy_support_static
+                        "-Wl,--no-whole-archive"
+                        paimon_tantivy_ffi
+                        arrow
+                        glog
+                        fmt
+                        ${GTEST_LINK_TOOLCHAIN})
+        target_compile_definitions(paimon-tantivy-lucene-coexist-test
+                                   PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}")
+
+        # Stage 10 — Equivalence + benchmark. Same link line as the coexist
+        # test (needs both impls); benchmark output goes to stderr.
+        add_paimon_test(tantivy_equivalence_test
+                        SOURCES
+                        tantivy_equivalence_test.cpp
+                        EXTRA_INCLUDES
+                        ${LUCENE_INCLUDE_DIR}
+                        STATIC_LINK_LIBS
+                        paimon_shared
+                        test_utils_static
+                        "-Wl,--whole-archive"
+                        paimon_local_file_system_static
+                        paimon_lucene_index_static
+                        paimon_tantivy_support_static
+                        "-Wl,--no-whole-archive"
+                        paimon_tantivy_ffi
+                        arrow
+                        glog
+                        fmt
+                        ${GTEST_LINK_TOOLCHAIN})
+        target_compile_definitions(paimon-tantivy-equivalence-test
+                                   PRIVATE JIEBA_TEST_DICT_DIR="${JIEBA_DICT_DIR}")
+    endif()
+endif()
diff --git a/src/paimon/global_index/tantivy/tantivy_archive_layout.cpp b/src/paimon/global_index/tantivy/tantivy_archive_layout.cpp
new file mode 100644
index 000000000..1bd75320e
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_archive_layout.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ */
+
+#include "paimon/global_index/tantivy/tantivy_archive_layout.h"
+
+#include <cstring>
+#include <memory>
+
+#include "fmt/format.h"
+#include "paimon/fs/file_system.h"
+#include "paimon/io/data_input_stream.h"
+
+namespace paimon::tantivy {
+
+namespace {
+
+/// No-op deleter: lets a caller-owned raw `InputStream*` be handed to
+/// DataInputStream — which takes `shared_ptr<InputStream>` — without
+/// transferring ownership, so the stream is never freed from here.
+struct NoopDeleter {
+    void operator()(InputStream*) const {}
+};
+
+}  // namespace
+
+// Parses the archive header; see tantivy_archive_layout.h for the byte format and contract.
+Result<ArchiveLayout> ParseArchiveHeader(InputStream* in) {
+    if (in == nullptr) {
+        return Status::Invalid("ParseArchiveHeader: null input stream");
+    }
+    // Non-owning wrapper. DataInputStream defaults to BE — matches paimon-java.
+    std::shared_ptr<InputStream> wrapped(in, NoopDeleter{});
+    DataInputStream dis(wrapped);
+    PAIMON_RETURN_NOT_OK(dis.Seek(0));
+    PAIMON_ASSIGN_OR_RAISE(int32_t file_count, dis.ReadValue<int32_t>());
+    if (file_count < 0) {
+        return Status::Invalid(
+            fmt::format("ParseArchiveHeader: negative file_count {}", file_count));
+    }
+    ArchiveLayout layout;
+    layout.count = static_cast<std::size_t>(file_count);
+    // Untrusted count: cap the reservation so a corrupt header cannot exhaust memory.
+    const std::size_t reserve_hint = layout.count < 1024 ? layout.count : 1024;
+    layout.names.reserve(reserve_hint);
+    layout.offsets.reserve(reserve_hint);
+    layout.lengths.reserve(reserve_hint);
+
+    for (int32_t i = 0; i < file_count; ++i) {
+        PAIMON_ASSIGN_OR_RAISE(int32_t name_len, dis.ReadValue<int32_t>());
+        if (name_len <= 0 || name_len > 1 << 20) {
+            return Status::Invalid(
+                fmt::format("ParseArchiveHeader: bad name_len {} at entry {}", name_len, i));
+        }
+        std::string name(static_cast<std::size_t>(name_len), '\0');
+        PAIMON_RETURN_NOT_OK(dis.Read(name.data(), static_cast<uint32_t>(name_len)));
+
+        PAIMON_ASSIGN_OR_RAISE(int64_t data_len, dis.ReadValue<int64_t>());
+        if (data_len < 0) {
+            return Status::Invalid(
+                fmt::format("ParseArchiveHeader: negative data_len {} for '{}'", data_len, name));
+        }
+        PAIMON_ASSIGN_OR_RAISE(int64_t data_offset, dis.GetPos());
+        // The payload end is computed in int64 below; refuse sizes that would overflow.
+        if (data_offset < 0 || data_len > INT64_MAX - data_offset) {
+            return Status::Invalid(
+                fmt::format("ParseArchiveHeader: data_len {} overflows for '{}'", data_len, name));
+        }
+        layout.names.push_back(std::move(name));
+        layout.offsets.push_back(static_cast<uint64_t>(data_offset));
+        layout.lengths.push_back(static_cast<uint64_t>(data_len));
+        // Skip past the payload without reading it.
+        PAIMON_RETURN_NOT_OK(dis.Seek(data_offset + data_len));
+    }
+    return layout;
+}
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_archive_layout.h b/src/paimon/global_index/tantivy/tantivy_archive_layout.h
new file mode 100644
index 000000000..2780dfbb9
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_archive_layout.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paimon/result.h"
+
+namespace paimon {
+class InputStream;
+}  // namespace paimon
+
+namespace paimon::tantivy {
+
+/// Parsed layout of a packed tantivy archive. Arrays are parallel; `count` is
+/// their common length.
+///
+/// Archive byte format (matches paimon-java `TantivyFullTextGlobalIndexReader.
+/// parseArchiveHeader`; big-endian, no version header):
+///   `[BE i32 file_count | (BE i32 name_len, name_utf8, BE i64 data_len, data)*]`
+///
+/// `offsets[i]` is the archive-absolute byte offset of file `i`'s payload
+/// (points past the per-entry header). `lengths[i]` is the payload size.
+struct ArchiveLayout {
+    std::vector<std::string> names;  // file name of entry i (UTF-8 bytes as stored)
+    std::vector<uint64_t> offsets;   // archive-absolute offset of entry i's payload
+    std::vector<uint64_t> lengths;   // payload size of entry i, in bytes
+    std::size_t count = 0;           // number of entries; common length of the vectors
+};
+
+/// Read the archive header from `in` (seeking past payloads) and return the
+/// layout. Does NOT read file payloads — only header bytes (a few KB).
+///
+/// `in` must support `Seek` (all production `paimon::InputStream` subclasses
+/// do; we call `Seek(cur + data_len)` to skip over each file's payload).
+///
+/// On success `in` is left just past the last payload it skipped (on error the
+/// position is unspecified); later reads go through pread callbacks anyway.
+Result<ArchiveLayout> ParseArchiveHeader(InputStream* in);
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_defs.h b/src/paimon/global_index/tantivy/tantivy_defs.h
new file mode 100644
index 000000000..0824d5148
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_defs.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace paimon::tantivy {
+
+/// Identifier used by GlobalIndexFileWriter::NewFileName to prefix on-disk
+/// filenames. Tantivy and lucene file prefixes intentionally differ so a
+/// reader can dispatch the right implementation by filename pattern.
+static inline const char kIdentifier[] = "tantivy-fulltext";
+
+/// Schema field names — fixed to match paimon-java (decision B1). Callers
+/// MUST NOT rename these even though `TantivyGlobalIndexWriter::Create` accepts
+/// a `field_name` argument (that argument is used only to extract the correct
+/// arrow column; the tantivy schema field name is always `"text"`).
+static inline const char kTantivyTextFieldName[] = "text";
+static inline const char kTantivyRowIdFieldName[] = "row_id";
+
+/// Option-key prefix consumed by TantivyGlobalIndex (Stage 8). Matches the
+/// lucene-fts convention so users can configure both implementations with a
+/// uniform "<impl>.<knob>" key style.
+static inline const char kOptionKeyPrefix[] = "tantivy-fulltext.";
+
+/// Buffer size for streaming raw packed bytes from FFI to OutputStream
+/// (Writer) and from InputStream into Rust (Reader, Stage 5+). Value: 1 MiB.
+static inline const int32_t kDefaultReadBufferSize = 1024 * 1024;
+/// Option key for the read buffer size used by the Stage 6 reader.
+static inline const char kTantivyReadBufferSize[] = "read.buffer-size";
+
+/// Option key: if true, omit term frequencies/positions when indexing (smaller
+/// index, but no PhraseQuery support). Default false, mirroring lucene-fts.
+static inline const char kTantivyWriteOmitTermFreqAndPositions[] =
+    "write.omit-term-freq-and-position";
+
+/// Env var carrying jieba dictionary directory; consumed by both writer and
+/// reader. Same name as lucene-fts: a single env var configures both backends.
+static inline const char kJiebaDictDirEnv[] = "PAIMON_JIEBA_DICT_DIR";
+
+/// Default tokenize mode if not specified in options.
+static inline const char kDefaultJiebaTokenizeMode[] = "mix";
+/// Tokenize mode option key. Values: "mp", "mix", "full", "query".
+/// "hmm" is rejected with Unsupported (jieba-rs does not expose standalone HMM).
+static inline const char kJiebaTokenizeMode[] = "jieba.tokenize-mode";
+
+/// Writer-side tokenizer selector. Values:
+///   "default" (default) — tantivy built-in SimpleTokenizer;
+///   "paimon_jieba" — jieba-rs CJK tokenizer; opt-in for Chinese workloads;
+///   "whitespace" / "raw" / "en_stem" — other tantivy built-ins
+/// The reader side is schema-driven (P-TK) and auto-dispatches to whatever
+/// tokenizer name is baked into the archive, so the default here also
+/// determines what paimon-java sees when it cross-reads the archive.
+static inline const char kTantivyWriteTokenizer[] = "tantivy.write.tokenizer";
+/// Default tokenizer for writer: tantivy built-in "default" (SimpleTokenizer),
+/// chosen so paimon-cpp ↔ paimon-java cross-read works out of the box.
+/// Chinese workloads must opt into "paimon_jieba" via kTantivyWriteTokenizer.
+static inline const char kDefaultTantivyWriteTokenizer[] = "default";
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_equivalence_test.cpp b/src/paimon/global_index/tantivy/tantivy_equivalence_test.cpp
new file mode 100644
index 000000000..63e5db663
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_equivalence_test.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Stage 10: equivalence + benchmark.
+ *
+ * EQUIVALENCE: a parametric corpus × query battery that compares lucene-fts
+ * and tantivy-fulltext result *sets* (doc_id only — not score order, not score
+ * values). Coverage targets:
+ *   - English bag-of-words: MATCH_ALL / MATCH_ANY / PHRASE
+ *   - Chinese (jieba "query" mode): MATCH_ALL / MATCH_ANY / PHRASE
+ *   - Pre_filter intersection (no scoring)
+ * PREFIX and WILDCARD are NOT compared as required-equal: tantivy's RegexQuery
+ * walks byte-level term dictionary, lucene's PrefixQuery/WildcardQuery walks
+ * its own; edge cases (empty input, anchors, multi-byte UTF-8) diverge by
+ * design. Documented in docs/dev/execute.md Stage 10 decisions.
+ *
+ * BENCHMARK: build a 200-doc index per backend and time write + 100 queries.
+ * Prints to stderr; never fails on perf — guarding against perf regressions
+ * is out of scope for this stage. Numbers go in execute.md as a baseline.
+ */
+
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/api.h"
+#include "arrow/type.h"
+#include "fmt/format.h"
+#include "gtest/gtest.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/core/global_index/global_index_file_manager.h"
+#include "paimon/core/index/index_path_factory.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/global_index/global_index_io_meta.h"
+#include "paimon/global_index/global_index_reader.h"
+#include "paimon/global_index/global_index_writer.h"
+#include "paimon/global_index/global_indexer.h"
+#include "paimon/global_index/global_indexer_factory.h"
+#include "paimon/global_index/lucene/lucene_defs.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/predicate/full_text_search.h"
+#include "paimon/testing/utils/testharness.h"
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time"
+#endif
+
+namespace paimon::tantivy::test {
+
+namespace {
+
+class FakeIndexPathFactory : public IndexPathFactory {
+ public:
+    explicit FakeIndexPathFactory(const std::string& root) : root_(root) {}
+    std::string NewPath() const override {
+        assert(false);  // not expected to be reached in these tests
+        return "";
+    }
+    std::string ToPath(const std::shared_ptr<IndexFileMeta>&) const override {
+        assert(false);  // not expected to be reached in these tests
+        return "";
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(root_, file_name);  // the only overload with real behavior
+    }
+    bool IsExternalPath() const override {
+        return false;
+    }
+
+ private:
+    std::string root_;
+};
+
+struct ReaderPair {
+    std::shared_ptr<GlobalIndexReader> lucene;   // reader opened via the "lucene-fts" indexer
+    std::shared_ptr<GlobalIndexReader> tantivy;  // reader opened via "tantivy-fulltext"
+    std::unique_ptr<paimon::test::UniqueTestDirectory> lucene_root;   // dir of lucene files
+    std::unique_ptr<paimon::test::UniqueTestDirectory> tantivy_root;  // dir of tantivy files
+};
+
+class TantivyEquivalenceTest : public ::testing::Test {
+ public:
+    void SetUp() override {
+        setenv(::paimon::lucene::kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1);
+        setenv(::paimon::tantivy::kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1);
+    }
+
+    std::unique_ptr<::ArrowSchema> CreateArrowSchema(
+        const std::shared_ptr<arrow::DataType>& data_type) const {
+        auto c_schema = std::make_unique<::ArrowSchema>();
+        EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok());
+        return c_schema;
+    }
+
+    /// Index `array` with indexer `factory_id` under `root`; returns the first file meta.
+    GlobalIndexIOMeta WriteOne(const std::string& factory_id,
+                               const std::shared_ptr<arrow::DataType>& data_type,
+                               const std::map<std::string, std::string>& options,
+                               const std::shared_ptr<arrow::Array>& array,
+                               const std::string& root) {
+        auto indexer_res = GlobalIndexerFactory::Get(factory_id, options);
+        EXPECT_TRUE(indexer_res.ok()) << indexer_res.status().ToString();
+        // std::move(res).value() selects the rvalue overload; std::move(res.value()) would copy.
+        auto indexer = std::move(indexer_res).value();
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_writer = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        auto writer_res =
+            indexer->CreateWriter("f0", CreateArrowSchema(data_type).get(), file_writer, pool_);
+        EXPECT_TRUE(writer_res.ok()) << writer_res.status().ToString();
+        ::ArrowArray c_array;
+        EXPECT_TRUE(arrow::ExportArray(*array, &c_array).ok());
+        std::vector<int64_t> relative_row_ids(array->length());
+        for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i;
+        EXPECT_TRUE(writer_res.value()->AddBatch(&c_array, std::move(relative_row_ids)).ok());
+        auto metas_res = writer_res.value()->Finish();
+        EXPECT_TRUE(metas_res.ok()) << metas_res.status().ToString();
+        return metas_res.value()[0];
+    }
+
+    std::shared_ptr<GlobalIndexReader> OpenOne(const std::string& factory_id,
+                                               const std::shared_ptr<arrow::DataType>& data_type,
+                                               const std::map<std::string, std::string>& options,
+                                               const GlobalIndexIOMeta& meta,
+                                               const std::string& root) {
+        auto indexer = GlobalIndexerFactory::Get(factory_id, options).value();
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_reader = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        return indexer->CreateReader(CreateArrowSchema(data_type).get(), file_reader, {meta}, pool_)
+            .value();
+    }
+
+    /// Build BOTH lucene + tantivy indexes for the same corpus + options.
+    /// Returns an opened-reader pair plus owning UniqueTestDirectory handles.
+    ReaderPair WriteAndOpenBoth(const std::shared_ptr<arrow::DataType>& data_type,
+                                const std::shared_ptr<arrow::Array>& array,
+                                std::map<std::string, std::string> lucene_opts,
+                                const std::map<std::string, std::string>& tantivy_opts) {
+        auto lroot = paimon::test::UniqueTestDirectory::Create();
+        auto troot = paimon::test::UniqueTestDirectory::Create();
+        EXPECT_TRUE(lroot && troot);
+        // lucene requires a tmp directory option; reuse lroot if caller didn't set one.
+        lucene_opts.emplace("lucene-fts.write.tmp.directory", lroot->Str());
+        auto lmeta = WriteOne("lucene-fts", data_type, lucene_opts, array, lroot->Str());
+        auto tmeta = WriteOne("tantivy-fulltext", data_type, tantivy_opts, array, troot->Str());
+        ReaderPair p;
+        p.lucene = OpenOne("lucene-fts", data_type, lucene_opts, lmeta, lroot->Str());
+        p.tantivy = OpenOne("tantivy-fulltext", data_type, tantivy_opts, tmeta, troot->Str());
+        p.lucene_root = std::move(lroot);
+        p.tantivy_root = std::move(troot);
+        return p;
+    }
+
+    static std::set<int64_t> Ids(const std::shared_ptr<GlobalIndexResult>& result) {
+        std::set<int64_t> out;
+        Result<const RoaringBitmap64*> br = Status::Invalid("none");
+        if (auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result)) {
+            br = scored->GetBitmap();
+        } else if (auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result)) {
+            br = plain->GetBitmap();
+        }
+        EXPECT_TRUE(br.ok()) << br.status().ToString();
+        if (!br.ok() || br.value() == nullptr) {
+            return out;  // failure already reported above; never dereference a failed Result
+        }
+        const RoaringBitmap64* bitmap = br.value();
+        for (auto it = bitmap->Begin(); it != bitmap->End(); ++it) {
+            out.insert(static_cast<int64_t>(*it));
+        }
+        return out;
+    }
+
+    /// Run one FullTextSearch on both readers; returns (lucene, tantivy) doc id sets.
+    std::pair<std::set<int64_t>, std::set<int64_t>> RunPair(
+        const ReaderPair& p, const std::string& q, FullTextSearch::SearchType t,
+        std::optional<int32_t> limit = std::nullopt,
+        std::optional<RoaringBitmap64> filter = std::nullopt) {
+        auto lr = p.lucene->VisitFullTextSearch(
+            std::make_shared<FullTextSearch>("f0", limit, q, t, filter));
+        auto tr = p.tantivy->VisitFullTextSearch(
+            std::make_shared<FullTextSearch>("f0", limit, q, t, filter));
+        EXPECT_TRUE(lr.ok()) << "lucene: " << lr.status().ToString();
+        EXPECT_TRUE(tr.ok()) << "tantivy: " << tr.status().ToString();
+        if (!lr.ok() || !tr.ok()) return {};  // failures are reported above
+        return {Ids(lr.value()), Ids(tr.value())};
+    }
+
+ protected:
+    std::shared_ptr<MemoryPool> pool_ = GetDefaultPool();
+    std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+};
+
+}  // namespace
+
+TEST_F(TantivyEquivalenceTest, EnglishBagOfWordsBattery) {
+    auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([
+        ["alpha beta gamma delta"],
+        ["alpha alpha alpha beta"],
+        ["beta gamma delta epsilon"],
+        ["zeta eta theta iota"],
+        ["alpha gamma epsilon iota"],
+        ["lone outlier word here"],
+        ["alpha beta gamma alpha beta"],
+        ["delta epsilon zeta eta theta"],
+        ["nothing matches this row"],
+        ["alpha"]
+    ])")
+                     .ValueOrDie();
+    auto pair = WriteAndOpenBoth(data_type, array, {}, {});  // no caller-supplied options
+
+    struct Case {
+        std::string query;
+        FullTextSearch::SearchType type;
+    };
+    std::vector<Case> cases = {  // only the doc id sets are compared, not scores
+        {"alpha", FullTextSearch::SearchType::MATCH_ALL},
+        {"alpha", FullTextSearch::SearchType::MATCH_ANY},
+        {"alpha beta", FullTextSearch::SearchType::MATCH_ALL},
+        {"alpha beta", FullTextSearch::SearchType::MATCH_ANY},
+        {"alpha gamma delta", FullTextSearch::SearchType::MATCH_ALL},
+        {"alpha gamma delta", FullTextSearch::SearchType::MATCH_ANY},
+        {"epsilon iota", FullTextSearch::SearchType::MATCH_ALL},
+        {"alpha beta gamma", FullTextSearch::SearchType::PHRASE},
+        {"beta gamma delta", FullTextSearch::SearchType::PHRASE},
+        {"delta epsilon", FullTextSearch::SearchType::PHRASE},
+    };
+    for (const auto& c : cases) {
+        auto [l, t] = RunPair(pair, c.query, c.type);
+        EXPECT_EQ(l, t) << "diverge: query=" << c.query << " type=" << static_cast<int>(c.type);
+    }
+}
+
+TEST_F(TantivyEquivalenceTest, ChineseQueryModeBattery) {
+    auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([
+["智能助手 AI 模块 开发"],
+["智能助手 在 Python 开发 中"],
+["AI 助手 开发 框架"],
+["智能 模块 技术 实现"],
+["发展方向 是 智能 助手"]
+    ])")
+                     .ValueOrDie();
+    std::map<std::string, std::string> lopts = {{"lucene-fts.jieba.tokenize-mode", "query"}};
+    std::map<std::string, std::string> topts = {
+        {"tantivy-fulltext.tantivy.write.tokenizer", "paimon_jieba"},
+        {"tantivy-fulltext.jieba.tokenize-mode", "query"},
+    };
+    auto pair = WriteAndOpenBoth(data_type, array, lopts, topts);
+
+    struct Case {
+        std::string query;
+        FullTextSearch::SearchType type;
+    };
+    // Note: both backends load the same jieba dictionary (JIEBA_TEST_DICT_DIR), so
+    // tokenization should agree for plain Chinese text. Differences (if any) come
+    // from lowercase / stopword normalization, hence the neutral CJK terms below.
+    std::vector<Case> cases = {
+        {"智能", FullTextSearch::SearchType::MATCH_ALL},
+        {"智能 助手", FullTextSearch::SearchType::MATCH_ALL},
+        {"模块", FullTextSearch::SearchType::MATCH_ANY},
+        {"发展方向", FullTextSearch::SearchType::PHRASE},
+    };
+    for (const auto& c : cases) {
+        auto [l, t] = RunPair(pair, c.query, c.type);
+        EXPECT_EQ(l, t) << "diverge: query=" << c.query << " type=" << static_cast<int>(c.type);
+    }
+}
+
+TEST_F(TantivyEquivalenceTest, PreFilterIntersectionEquivalent) {
+    auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([
+        ["alpha beta"],
+        ["alpha gamma"],
+        ["alpha delta"],
+        ["beta gamma"],
+        ["beta delta"]
+    ])")
+                     .ValueOrDie();
+    auto pair = WriteAndOpenBoth(data_type, array, {}, {});
+
+    auto pf = RoaringBitmap64::From({0l, 2l, 4l});  // pre-filter: only rows 0, 2, 4 are eligible
+    {
+        auto [l, t] =
+            RunPair(pair, "alpha", FullTextSearch::SearchType::MATCH_ALL, std::nullopt, pf);
+        EXPECT_EQ(l, t);
+        EXPECT_EQ(l, (std::set<int64_t>{0, 2}));  // "alpha" rows are 0..2; row 1 is filtered out
+    }
+    {
+        auto [l, t] =
+            RunPair(pair, "beta gamma", FullTextSearch::SearchType::MATCH_ANY, std::nullopt, pf);
+        EXPECT_EQ(l, t);  // only cross-backend agreement is asserted for this case
+    }
+    {
+        auto empty = RoaringBitmap64();  // empty pre-filter: no row is eligible
+        auto [l, t] =
+            RunPair(pair, "alpha", FullTextSearch::SearchType::MATCH_ALL, std::nullopt, empty);
+        EXPECT_EQ(l, t);
+        EXPECT_TRUE(l.empty());
+    }
+}
+
+TEST_F(TantivyEquivalenceTest, BenchmarkBuildAndQuery) {
+    // Build a synthetic 200-doc corpus and time write + 100 random queries.
+    // This is a reportable baseline, NOT a perf gate — assertions only check that
+    // writes and queries succeed. Both backends are timed on the same query words.
+    constexpr int kDocCount = 200;
+    constexpr int kQueryCount = 100;
+    std::vector<std::string> vocab = {"alpha",  "beta", "gamma", "delta", "epsilon",
+                                      "zeta",   "eta",  "theta", "iota",  "kappa",
+                                      "lambda", "mu",   "nu",    "xi",    "omicron"};
+    std::mt19937 rng(0xC0DE);
+    std::uniform_int_distribution<size_t> word_pick(0, vocab.size() - 1);
+    std::uniform_int_distribution<int> word_count(3, 12);
+    // Build the corpus as a JSON Arrow array.
+    std::string json = "[";
+    for (int i = 0; i < kDocCount; ++i) {
+        json += "[\"";
+        int n = word_count(rng);
+        for (int w = 0; w < n; ++w) {
+            if (w > 0) json += ' ';
+            json += vocab[word_pick(rng)];
+        }
+        json += "\"]";
+        if (i + 1 < kDocCount) json += ",";
+    }
+    json += "]";
+
+    auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, json).ValueOrDie();
+    // Draw the query words once so both backends are timed on the same workload.
+    std::vector<std::string> queries(kQueryCount);
+    for (auto& q : queries) q = vocab[word_pick(rng)];
+
+    auto time_ms = [](auto&& fn) {
+        auto t0 = std::chrono::steady_clock::now();
+        fn();
+        auto t1 = std::chrono::steady_clock::now();
+        return std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
+    };
+
+    // -------- Lucene: write + open + queries --------
+    auto lroot = paimon::test::UniqueTestDirectory::Create();
+    std::map<std::string, std::string> lopt = {{"lucene-fts.write.tmp.directory", lroot->Str()}};
+    GlobalIndexIOMeta lmeta{"", 0, nullptr};
+    auto lwrite_ms =
+        time_ms([&] { lmeta = WriteOne("lucene-fts", data_type, lopt, array, lroot->Str()); });
+    auto lreader = OpenOne("lucene-fts", data_type, lopt, lmeta, lroot->Str());
+
+    auto lquery_ms = time_ms([&] {
+        for (const std::string& w : queries) {
+            auto r = lreader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+                "f0", std::nullopt, w, FullTextSearch::SearchType::MATCH_ALL, std::nullopt));
+            EXPECT_TRUE(r.ok());
+        }
+    });
+
+    // -------- Tantivy: write + open + queries --------
+    auto troot = paimon::test::UniqueTestDirectory::Create();
+    GlobalIndexIOMeta tmeta{"", 0, nullptr};
+    auto twrite_ms =
+        time_ms([&] { tmeta = WriteOne("tantivy-fulltext", data_type, {}, array, troot->Str()); });
+    auto treader = OpenOne("tantivy-fulltext", data_type, {}, tmeta, troot->Str());
+
+    auto tquery_ms = time_ms([&] {
+        for (const std::string& w : queries) {
+            auto r = treader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+                "f0", std::nullopt, w, FullTextSearch::SearchType::MATCH_ALL, std::nullopt));
+            EXPECT_TRUE(r.ok());
+        }
+    });
+
+    std::cerr << fmt::format(
+        "[STAGE10-BENCH docs={} queries={}] lucene_write={}ms lucene_query={}ms"
+        " tantivy_write={}ms tantivy_query={}ms file_size_lucene={} file_size_tantivy={}\n",
+        kDocCount, kQueryCount, lwrite_ms, lquery_ms, twrite_ms, tquery_ms, lmeta.file_size,
+        tmeta.file_size);
+    SUCCEED() << "benchmark prints to stderr";
+}
+
+}  // namespace paimon::tantivy::test
diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_handle.h b/src/paimon/global_index/tantivy/tantivy_ffi_handle.h
new file mode 100644
index 000000000..b4d4e51cf
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_ffi_handle.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * RAII wrappers for opaque FFI handles returned by paimon_tantivy_ffi.
+ * See docs/dev/tantivy_ffi_design.md §3 Category A.
+ */
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+extern "C" {
+#include "paimon_tantivy_ffi.h"  // NOLINT(build/include_subdir)
+}
+
+namespace paimon::tantivy {
+
+/// Deleter template; specialize per handle type with the matching free function.
+/// Usage:
+///   template <> struct FfiDeleter<PaimonTantivyWriter> {
+///       void operator()(PaimonTantivyWriter* p) const noexcept {
+///           paimon_tantivy_writer_free(p);
+///       }
+///   };
+///   using WriterPtr = FfiUniquePtr<PaimonTantivyWriter>;
+template <typename Handle>
+struct FfiDeleter {
+    // Dependent condition that can never hold: an unspecialized handle type fails to compile.
+    void operator()(Handle*) const noexcept {
+        static_assert(sizeof(Handle) == 0, "FfiDeleter must be specialized for this handle type");
+    }
+};
+
+/// Owning `std::unique_ptr` for an FFI handle; frees through `FfiDeleter<Handle>`.
+template <typename Handle>
+using FfiUniquePtr = std::unique_ptr<Handle, FfiDeleter<Handle>>;
+
+/// Tokenizer handle (Stage 3); released with `paimon_tantivy_tokenizer_free`.
+template <>
+struct FfiDeleter<PaimonJiebaTokenizer> {
+    void operator()(PaimonJiebaTokenizer* p) const noexcept {
+        paimon_tantivy_tokenizer_free(p);
+    }
+};
+using JiebaTokenizerPtr = FfiUniquePtr<PaimonJiebaTokenizer>;
+
+/// Writer handle (Stage 4); released with `paimon_tantivy_writer_free`.
+template <>
+struct FfiDeleter<PaimonTantivyWriter> {
+    void operator()(PaimonTantivyWriter* p) const noexcept {
+        paimon_tantivy_writer_free(p);
+    }
+};
+using WriterPtr = FfiUniquePtr<PaimonTantivyWriter>;
+
+/// Reader handle (Stage 6); released with `paimon_tantivy_reader_free`.
+template <>
+struct FfiDeleter<PaimonTantivyReader> {
+    void operator()(PaimonTantivyReader* p) const noexcept {
+        paimon_tantivy_reader_free(p);
+    }
+};
+using ReaderPtr = FfiUniquePtr<PaimonTantivyReader>;
+
+/// RAII guard for `PaimonTantivyBuffer`, which is not an opaque handle but a
+/// value struct held here by value; its `data` pointer is the Rust-owned
+/// allocation. The destructor always calls `paimon_tantivy_buffer_free` on the
+/// struct, so the buffer is released on scope exit even on early return.
+class BufferGuard {
+ public:
+    BufferGuard() noexcept {
+        buf_.data = nullptr;
+        buf_.len = 0;
+        buf_.capacity = 0;
+    }
+    BufferGuard(const BufferGuard&) = delete;
+    BufferGuard& operator=(const BufferGuard&) = delete;
+    BufferGuard(BufferGuard&&) = delete;
+    BufferGuard& operator=(BufferGuard&&) = delete;
+
+    ~BufferGuard() noexcept {
+        paimon_tantivy_buffer_free(&buf_);  // also hit when still zeroed — TODO confirm OK
+    }
+
+    PaimonTantivyBuffer* out() noexcept {  // wrapped struct, e.g. as FFI out-param
+        return &buf_;
+    }
+
+    const uint8_t* data() const noexcept {
+        return buf_.data;
+    }
+    std::size_t size() const noexcept {
+        return buf_.len;
+    }
+
+ private:
+    PaimonTantivyBuffer buf_{};
+};
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_log.cpp b/src/paimon/global_index/tantivy/tantivy_ffi_log.cpp
new file mode 100644
index 000000000..77d7420cb
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_ffi_log.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "paimon/global_index/tantivy/tantivy_ffi_log.h"
+
+#include <cstring>
+#include <string>
+
+#include "glog/logging.h"
+
+extern "C" {
+#include "paimon_tantivy_ffi.h"  // NOLINT(build/include_subdir)
+}
+
+namespace paimon::tantivy {
+namespace {
+
+/// Level mapping matches Rust side (0=trace..4=error).
+extern "C" void PaimonTantivyLogAdapter(int32_t level, const char* msg, std::size_t len) {
+    // msg is NOT null-terminated; slice with len.
+    std::string s(msg, len);
+    switch (level) {
+        case 4:
+            LOG(ERROR) << "[tantivy] " << s;
+            break;
+        case 3:
+            LOG(WARNING) << "[tantivy] " << s;
+            break;
+        case 2:
+            LOG(INFO) << "[tantivy] " << s;
+            break;
+        case 1:
+            VLOG(1) << "[tantivy] " << s;
+            break;
+        case 0:
+            VLOG(2) << "[tantivy] " << s;
+            break;
+        default:
+            LOG(INFO) << "[tantivy:lvl=" << level << "] " << s;
+            break;
+    }
+}
+
+}  // namespace
+
+void InstallTantivyLogBridge() {
+    paimon_tantivy_set_log_callback(&PaimonTantivyLogAdapter);  // Rust logs -> glog adapter
+}
+
+void UninstallTantivyLogBridge() {
+    paimon_tantivy_clear_log_callback();  // drops the callback registered by Install
+}
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_log.h b/src/paimon/global_index/tantivy/tantivy_ffi_log.h
new file mode 100644
index 000000000..42ddcbdde
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_ffi_log.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Bridge tantivy (Rust) logs into paimon's logger.
+ * See docs/dev/tantivy_ffi_design.md §7.
+ *
+ * Registered once at TantivyGlobalIndexFactory static-init time.
+ */
+#pragma once
+
+namespace paimon::tantivy {
+
+/// Install the Rust -> C++ log callback. Idempotent; only the last caller's
+/// callback is active. Threading: the C callback runs on tantivy worker
+/// threads, so the adapter must be thread-safe (it only routes to glog).
+void InstallTantivyLogBridge();
+
+/// Uninstall the callback (revert to Rust stderr). Mostly useful for tests.
+void UninstallTantivyLogBridge();
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_status.h b/src/paimon/global_index/tantivy/tantivy_ffi_status.h
new file mode 100644
index 000000000..8c64d839f
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_ffi_status.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Translation layer: paimon_tantivy_status_t -> paimon::Status.
+ * See docs/dev/tantivy_ffi_design.md §2.
+ */
+#pragma once
+
+#include "fmt/format.h"
+#include "paimon/status.h"
+
+extern "C" {
+#include "paimon_tantivy_ffi.h"  // NOLINT(build/include_subdir)
+}
+
+namespace paimon::tantivy {
+
+/// Translate an FFI status code into a paimon::Status.
+///
+/// PAIMON_TANTIVY_STATUS_OK yields Status::OK(). Every other code yields an
+/// error Status whose message is `tantivy-ffi[<Name>(<code>)]: <text>`, where
+/// <text> is paimon_tantivy_last_error() (thread-local on the Rust side) or
+/// "(null)" if that pointer is null. The code name is included for easier grep.
+///
+/// Code -> Status mapping:
+///   NOT_FOUND                                   -> NotExist
+///   IO_ERROR                                    -> IOError
+///   UNSUPPORTED                                 -> NotImplemented
+///   INVALID_ARGUMENT, TOKENIZER_ERROR,
+///   QUERY_PARSE_ERROR, INDEX_FORMAT_ERROR       -> Invalid
+///   INTERNAL_ERROR and unrecognized values      -> UnknownError
+///
+/// Note: cbindgen emits `PaimonTantivyStatus` in the **global** namespace as
+/// a C-style (unscoped) enum, so it is accepted via its global type here and
+/// its enumerators are found by ordinary unqualified lookup at call sites.
+inline Status FfiStatusToStatus(::PaimonTantivyStatus code) {
+    if (code == PAIMON_TANTIVY_STATUS_OK) {
+        return Status::OK();
+    }
+    // Builds the message for a given code name; last_error() is read exactly
+    // once, on the single switch arm that ends up being taken.
+    const auto describe = [code](const char* name) {
+        const char* detail = paimon_tantivy_last_error();
+        return fmt::format("tantivy-ffi[{}({})]: {}", name, static_cast<int>(code),
+                           detail ? detail : "(null)");
+    };
+    switch (code) {
+        case PAIMON_TANTIVY_STATUS_NOT_FOUND:
+            return Status::NotExist(describe("NotFound"));
+        case PAIMON_TANTIVY_STATUS_IO_ERROR:
+            return Status::IOError(describe("IoError"));
+        case PAIMON_TANTIVY_STATUS_UNSUPPORTED:
+            return Status::NotImplemented(describe("Unsupported"));
+        case PAIMON_TANTIVY_STATUS_INVALID_ARGUMENT:
+            return Status::Invalid(describe("InvalidArgument"));
+        case PAIMON_TANTIVY_STATUS_TOKENIZER_ERROR:
+            return Status::Invalid(describe("TokenizerError"));
+        case PAIMON_TANTIVY_STATUS_QUERY_PARSE_ERROR:
+            return Status::Invalid(describe("QueryParseError"));
+        case PAIMON_TANTIVY_STATUS_INDEX_FORMAT_ERROR:
+            return Status::Invalid(describe("IndexFormatError"));
+        case PAIMON_TANTIVY_STATUS_INTERNAL_ERROR:
+            return Status::UnknownError(describe("InternalError"));
+        default:
+            return Status::UnknownError(describe("UnknownFfiStatus"));
+    }
+}
+
+/// Like PAIMON_RETURN_NOT_OK but for FFI calls returning PaimonTantivyStatus.
+/// `expr` is evaluated exactly once; on a non-OK code the enclosing function
+/// returns the Status produced by FfiStatusToStatus for that code.
+#define PAIMON_TANTIVY_RETURN_NOT_OK(expr)                                        \
+    do {                                                                          \
+        ::PaimonTantivyStatus _paimon_tantivy_status_ = (expr);                   \
+        if (_paimon_tantivy_status_ != PAIMON_TANTIVY_STATUS_OK) {                \
+            return ::paimon::tantivy::FfiStatusToStatus(_paimon_tantivy_status_); \
+        }                                                                         \
+    } while (0)
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_ffi_test.cpp b/src/paimon/global_index/tantivy/tantivy_ffi_test.cpp
new file mode 100644
index 000000000..9c7d28f6c
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_ffi_test.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Stage 2: FFI common layer tests — error/buffer/log behave as documented.
+ * Does NOT build on real index yet (that's Stage 4+).
+ */
+
+#include <atomic>
+#include <cstring>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_handle.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_log.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_status.h"
+
+extern "C" {
+#include "paimon_tantivy_ffi.h"  // NOLINT(build/include_subdir)
+}
+
+namespace paimon::tantivy {
+
+// ------------------------- last_error contract -------------------------
+
+TEST(TantivyFfiError, LastErrorIsNeverNull) {
+    // On the calling thread the pointer only has to be non-null.
+    ASSERT_NE(paimon_tantivy_last_error(), nullptr);
+    // The error text is thread-local, so a freshly-spawned thread that has made
+    // no FFI call yet must observe a non-null pointer to an empty string.
+    std::atomic<bool> fresh_thread_sees_empty{false};
+    std::thread worker([&fresh_thread_sees_empty]() {
+        const char* text = paimon_tantivy_last_error();
+        fresh_thread_sees_empty.store(text != nullptr && text[0] == '\0');
+    });
+    worker.join();
+    EXPECT_TRUE(fresh_thread_sees_empty.load());
+}
+
+// ------------------------- status translation -------------------------
+
+TEST(TantivyFfiStatus, OkTranslates) {
+    Status translated = FfiStatusToStatus(PAIMON_TANTIVY_STATUS_OK);
+    EXPECT_TRUE(translated.ok()) << translated.ToString();
+}
+
+TEST(TantivyFfiStatus, ErrorCodeNamesShowUp) {
+    // Each listed error code must surface its symbolic name in ToString().
+    struct Expectation {
+        PaimonTantivyStatus code;
+        const char* name;
+    };
+    const Expectation expectations[] = {
+        {PAIMON_TANTIVY_STATUS_INVALID_ARGUMENT, "InvalidArgument"},
+        {PAIMON_TANTIVY_STATUS_NOT_FOUND, "NotFound"},
+        {PAIMON_TANTIVY_STATUS_IO_ERROR, "IoError"},
+        {PAIMON_TANTIVY_STATUS_UNSUPPORTED, "Unsupported"},
+        {PAIMON_TANTIVY_STATUS_TOKENIZER_ERROR, "TokenizerError"},
+    };
+    for (const Expectation& e : expectations) {
+        Status translated = FfiStatusToStatus(e.code);
+        EXPECT_FALSE(translated.ok());
+        const std::string rendered = translated.ToString();
+        EXPECT_NE(rendered.find(e.name), std::string::npos) << "got: " << rendered;
+    }
+}
+
+// ------------------------- buffer lifetime -------------------------
+
+TEST(TantivyFfiBuffer, EmptyBufferGuard) {
+    // A default-constructed guard is empty; its destructor must accept that.
+    BufferGuard empty_guard;
+    EXPECT_EQ(empty_guard.data(), nullptr);
+    EXPECT_EQ(empty_guard.size(), 0u);
+}
+
+// ------------------------- handle stress -------------------------
+
+// Sanity stress for BufferGuard: Stage 2 has no way to populate a buffer from
+// C++ (the Rust side does not export writer/reader yet), so this only repeats
+// the empty construction + destruction path 1000 times. It must not crash;
+// LSAN / ASAN would catch leaks.
+TEST(TantivyFfiBuffer, StressAllocFree) {
+    constexpr int kIterations = 1000;
+    for (int round = 0; round < kIterations; ++round) {
+        // Constructed and destroyed once per iteration.
+        BufferGuard scratch;
+        (void)scratch;
+    }
+}
+
+// ------------------------- log bridge -------------------------
+
+namespace {
+std::atomic<int> g_log_count{0};  // bumped once per CountingLogCb invocation
+extern "C" void CountingLogCb(int32_t /*level*/, const char* /*msg*/, std::size_t /*len*/) {
+    g_log_count.fetch_add(1, std::memory_order_relaxed);  // arguments are ignored
+}
+}  // namespace
+
+TEST(TantivyFfiLog, SetCallbackIsIdempotent) {
+    g_log_count.store(0);
+    paimon_tantivy_set_log_callback(&CountingLogCb);
+    paimon_tantivy_set_log_callback(&CountingLogCb);
+    paimon_tantivy_clear_log_callback();
+    // Smoke test only: installing twice and then clearing must not crash.
+    SUCCEED();  // g_log_count is reset above but not asserted on
+}
+
+TEST(TantivyFfiLog, InstallBridgeThenUninstall) {
+    // Smoke test: installing the glog bridge and removing it again must not crash.
+    InstallTantivyLogBridge();
+    UninstallTantivyLogBridge();
+    SUCCEED();
+}
+
+// ------------------------- version still works -------------------------
+
+TEST(TantivyFfi, VersionReachable) {
+    const char* version = paimon_tantivy_version();
+    ASSERT_NE(version, nullptr);
+    EXPECT_NE(version[0], '\0');  // the version string must not be empty
+}
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_filter_limit_test.cpp b/src/paimon/global_index/tantivy/tantivy_filter_limit_test.cpp
new file mode 100644
index 000000000..a11320b26
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_filter_limit_test.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Stage 7 test: cover the limit + pre_filter + scoring pathway. Uses the same
+ * write→read flow as paimon-tantivy-reader-test, but verifies that:
+ *   - A `limit` produces a `BitmapScoredGlobalIndexResult` with non-empty
+ *     scores ordered such that bitmap iteration order aligns with the score
+ *     vector (paimon convention: doc-id-asc bitmap, parallel score vector).
+ *   - A `pre_filter` excludes non-member rows even when they would otherwise
+ *     dominate the top-N by score.
+ *   - Combining both produces the intersection, with limit applied AFTER
+ *     filtering (matches lucene-fts behavior).
+ */
+
+#include <memory>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/api.h"
+#include "arrow/type.h"
+#include "gtest/gtest.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/core/global_index/global_index_file_manager.h"
+#include "paimon/core/index/index_path_factory.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_reader.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_writer.h"
+#include "paimon/testing/utils/testharness.h"
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time"
+#endif
+
+namespace paimon::tantivy::test {
+
+namespace {
+
+class FakeIndexPathFactory : public IndexPathFactory {  // test double rooted at one dir
+ public:
+    explicit FakeIndexPathFactory(const std::string& root) : root_(root) {}
+    std::string NewPath() const override {
+        assert(false);  // not expected to be called by these tests
+        return "";      // reached only when assert is compiled out (NDEBUG)
+    }
+    std::string ToPath(const std::shared_ptr<IndexFileMeta>&) const override {
+        assert(false);  // not expected to be called by these tests
+        return "";      // reached only when assert is compiled out (NDEBUG)
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(root_, file_name);  // root_ joined with file_name
+    }
+    bool IsExternalPath() const override {
+        return false;
+    }
+
+ private:
+    std::string root_;  // directory that file names are joined onto
+};
+
+class TantivyFilterLimitTest : public ::testing::Test {
+ public:
+    void SetUp() override {
+        setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1);
+    }
+
+    // Indexes column "f0" of `array` under a fresh test directory (kept alive in
+    // kept_dirs_) and returns the file manager plus the first index meta.
+    std::pair<std::shared_ptr<GlobalIndexFileManager>, GlobalIndexIOMeta> WriteAndOpen(
+        const std::shared_ptr<arrow::Array>& array,
+        const std::map<std::string, std::string>& options) {
+        kept_dirs_.push_back(paimon::test::UniqueTestDirectory::Create());
+        EXPECT_TRUE(kept_dirs_.back());
+        auto paths = std::make_shared<FakeIndexPathFactory>(kept_dirs_.back()->Str());
+        auto manager = std::make_shared<GlobalIndexFileManager>(fs_, paths);
+        auto created =
+            TantivyGlobalIndexWriter::Create("f0", DataType(), manager, options, GetDefaultPool());
+        EXPECT_TRUE(created.ok()) << created.status().ToString();
+        auto writer = created.value();
+        ::ArrowArray exported;
+        EXPECT_TRUE(arrow::ExportArray(*array, &exported).ok());
+        // Row i of the batch gets relative row id i.
+        std::vector<int64_t> row_ids(array->length());
+        for (int64_t row = 0; row < array->length(); ++row) row_ids[row] = row;
+        EXPECT_TRUE(writer->AddBatch(&exported, std::move(row_ids)).ok());
+        auto finished = writer->Finish();
+        EXPECT_TRUE(finished.ok()) << finished.status().ToString();
+        return {manager, finished.value()[0]};
+    }
+
+    static std::vector<int64_t> BitmapToVec(const RoaringBitmap64& bitmap) {
+        std::vector<int64_t> sorted_ids;
+        for (auto cursor = bitmap.Begin(); cursor != bitmap.End(); ++cursor) {
+            sorted_ids.push_back(static_cast<int64_t>(*cursor));
+        }
+        std::sort(sorted_ids.begin(), sorted_ids.end());  // callers compare ascending lists
+        return sorted_ids;
+    }
+
+    std::shared_ptr<arrow::DataType> DataType() const {
+        return arrow::struct_({arrow::field("f0", arrow::utf8())});
+    }
+
+ protected:
+    std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+    std::vector<std::unique_ptr<paimon::test::UniqueTestDirectory>> kept_dirs_;
+};
+
+}  // namespace
+
+TEST_F(TantivyFilterLimitTest, LimitProducesScoredResultTopN) {
+    // Term frequency of "doc" differs per row (1, 5 and 2 occurrences); with
+    // limit=2 the scored result is expected to keep rows 1 and 2.
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc"],
+        ["doc doc doc doc doc"],
+        ["doc doc"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto query = std::make_shared<FullTextSearch>("f0", /*limit=*/2, "doc",
+                                                  FullTextSearch::SearchType::MATCH_ALL,
+                                                  /*pre_filter=*/std::nullopt);
+    query->with_score = true;  // v0.2: explicit score opt-in
+    auto result = reader->VisitFullTextSearch(query);
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value());
+    ASSERT_TRUE(scored) << "expected BitmapScoredGlobalIndexResult";
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, scored->GetBitmap());
+    EXPECT_EQ(BitmapToVec(*rows), (std::vector<int64_t>{1, 2}));
+    // Scores run parallel to the bitmap: one entry per returned row, and every
+    // entry must be strictly positive.
+    EXPECT_EQ(scored->GetScores().size(), 2u);
+    for (auto score : scored->GetScores()) {
+        EXPECT_GT(score, 0.0f);
+    }
+}
+
+TEST_F(TantivyFilterLimitTest, NoLimitReturnsBitmapResult) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc"], ["doc doc"], ["other"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto result = reader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+        "f0", /*limit=*/std::nullopt, "doc", FullTextSearch::SearchType::MATCH_ALL,
+        /*pre_filter=*/std::nullopt));
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    // No limit and no with_score: expect the plain bitmap result, not the scored one.
+    EXPECT_FALSE(std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value()));
+    auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result.value());
+    ASSERT_TRUE(plain);
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, plain->GetBitmap());
+    EXPECT_EQ(BitmapToVec(*rows), (std::vector<int64_t>{0, 1}));
+}
+
+TEST_F(TantivyFilterLimitTest, PreFilterIntersectsWithoutLimit) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["alpha"], ["alpha"], ["alpha"], ["beta"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto result = reader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+        "f0", /*limit=*/std::nullopt, "alpha", FullTextSearch::SearchType::MATCH_ALL,
+        /*pre_filter=*/RoaringBitmap64::From({0l, 2l, 100l})));
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result.value());
+    ASSERT_TRUE(plain);
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, plain->GetBitmap());
+    EXPECT_EQ(BitmapToVec(*rows), (std::vector<int64_t>{0, 2}));  // row 1 is filtered out
+}
+
+TEST_F(TantivyFilterLimitTest, PreFilterAppliedBeforeLimit) {
+    // Row 0 repeats "doc" most often, but the pre_filter only admits row 1, so
+    // row 1 must be the sole hit even though limit=10 leaves room for more.
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc doc doc doc doc"],
+        ["doc doc"],
+        ["doc"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto query = std::make_shared<FullTextSearch>("f0", /*limit=*/10, "doc",
+                                                  FullTextSearch::SearchType::MATCH_ALL,
+                                                  /*pre_filter=*/RoaringBitmap64::From({1l}));
+    query->with_score = true;  // v0.2: explicit score opt-in
+    auto result = reader->VisitFullTextSearch(query);
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value());
+    ASSERT_TRUE(scored);
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, scored->GetBitmap());
+    EXPECT_EQ(BitmapToVec(*rows), (std::vector<int64_t>{1}));
+    EXPECT_EQ(scored->GetScores().size(), 1u);
+}
+
+TEST_F(TantivyFilterLimitTest, EmptyPreFilterReturnsEmpty) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["alpha"], ["beta"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    RoaringBitmap64 no_rows;  // present but empty pre_filter: admits nothing
+    auto result = reader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+        "f0", /*limit=*/std::nullopt, "alpha", FullTextSearch::SearchType::MATCH_ALL,
+        /*pre_filter=*/no_rows));
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result.value());
+    ASSERT_TRUE(plain);
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, plain->GetBitmap());
+    EXPECT_TRUE(rows->IsEmpty());
+}
+
+TEST_F(TantivyFilterLimitTest, LimitGreaterThanMatchesReturnsAll) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc"], ["doc doc"], ["other"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto query = std::make_shared<FullTextSearch>("f0", /*limit=*/100, "doc",
+                                                  FullTextSearch::SearchType::MATCH_ALL,
+                                                  /*pre_filter=*/std::nullopt);
+    query->with_score = true;  // v0.2: explicit score opt-in
+    auto result = reader->VisitFullTextSearch(query);
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value());
+    ASSERT_TRUE(scored);
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, scored->GetBitmap());
+    EXPECT_EQ(BitmapToVec(*rows), (std::vector<int64_t>{0, 1}));
+    EXPECT_EQ(scored->GetScores().size(), 2u);  // limit=100 exceeds the two matches
+}
+
+// ===========================================================================
+// v0.2: with_score × limit 4-path matrix guards
+// ===========================================================================
+// Decouple with_score from limit. The four combinations must each map to the
+// correct concrete result type and content. See docs/dev/tantivy_bm25_score_contract.md §4.
+
+// Path A: with_score=false, limit=None → BitmapGlobalIndexResult, all rows, no score.
+TEST_F(TantivyFilterLimitTest, WithScoreFalseLimitNone_AllRowsNoScore) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc"], ["doc doc"], ["doc doc doc"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto query = std::make_shared<FullTextSearch>("f0", /*limit=*/std::nullopt, "doc",
+                                                  FullTextSearch::SearchType::MATCH_ALL,
+                                                  /*pre_filter=*/std::nullopt);
+    query->with_score = false;
+    auto result = reader->VisitFullTextSearch(query);
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    // The result must be the plain bitmap type, never the scored one.
+    EXPECT_FALSE(std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value()));
+    auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result.value());
+    ASSERT_TRUE(plain);
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, plain->GetBitmap());
+    EXPECT_EQ(BitmapToVec(*rows), (std::vector<int64_t>{0, 1, 2}));
+}
+
+// Path B: with_score=false, limit=N → BitmapGlobalIndexResult holding any N
+// matches, unscored (no BM25 sort). Used by `WHERE MATCH ... LIMIT N` without ORDER BY.
+TEST_F(TantivyFilterLimitTest, WithScoreFalseLimitN_AnyNNoScore) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc"],
+        ["doc doc doc doc doc"],
+        ["doc doc"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto query = std::make_shared<FullTextSearch>("f0", /*limit=*/2, "doc",
+                                                  FullTextSearch::SearchType::MATCH_ALL,
+                                                  /*pre_filter=*/std::nullopt);
+    query->with_score = false;
+    auto result = reader->VisitFullTextSearch(query);
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    // The result must be the plain bitmap type, never the scored one.
+    EXPECT_FALSE(std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value()));
+    auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result.value());
+    ASSERT_TRUE(plain);
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, plain->GetBitmap());
+    // Which two rows come back is arbitrary (it depends on tantivy's posting
+    // iteration), so only the cardinality is pinned, plus a sanity check on
+    // the id range below.
+    EXPECT_EQ(rows->Cardinality(), 2u);
+    // Every returned id must be one of the three indexed rows (0, 1 or 2).
+    for (auto id : BitmapToVec(*rows)) {
+        EXPECT_TRUE(id >= 0 && id <= 2);
+    }
+}
+
+// Path C (new in v0.2): with_score=true, limit=None → BitmapScoredGlobalIndexResult
+// carrying every matching row and a score for each, ordered by row_id asc.
+TEST_F(TantivyFilterLimitTest, WithScoreTrueLimitNone_AllRowsWithScore) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc"], ["doc doc"], ["doc doc doc"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto query = std::make_shared<FullTextSearch>("f0", /*limit=*/std::nullopt, "doc",
+                                                  FullTextSearch::SearchType::MATCH_ALL,
+                                                  /*pre_filter=*/std::nullopt);
+    query->with_score = true;
+    auto result = reader->VisitFullTextSearch(query);
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value());
+    ASSERT_TRUE(scored) << "with_score=true must produce BitmapScoredGlobalIndexResult";
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, scored->GetBitmap());
+    EXPECT_EQ(BitmapToVec(*rows), (std::vector<int64_t>{0, 1, 2}));
+    // Three matching rows, so three scores, each strictly positive.
+    EXPECT_EQ(scored->GetScores().size(), 3u);
+    for (auto score : scored->GetScores()) {
+        EXPECT_GT(score, 0.0f);
+    }
+}
+
+// Path D: with_score=true, limit=N → BitmapScoredGlobalIndexResult, top-N with scores.
+// Equivalent to the v0.1 happy-path (LimitProducesScoredResultTopN); repeated
+// here so the 4-path matrix has an explicit anchor for every combination.
+TEST_F(TantivyFilterLimitTest, WithScoreTrueLimitN_TopNWithScore) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc"],
+        ["doc doc doc doc doc"],
+        ["doc doc"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    auto query = std::make_shared<FullTextSearch>("f0", /*limit=*/2, "doc",
+                                                  FullTextSearch::SearchType::MATCH_ALL,
+                                                  /*pre_filter=*/std::nullopt);
+    query->with_score = true;
+    auto result = reader->VisitFullTextSearch(query);
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value());
+    ASSERT_TRUE(scored);
+    ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* rows, scored->GetBitmap());
+    EXPECT_EQ(rows->Cardinality(), 2u);
+    EXPECT_TRUE(rows->Contains(1));  // highest TF must be included
+    EXPECT_EQ(scored->GetScores().size(), 2u);
+}
+
+// Migration guard: a caller that never touches `with_score` gets the default,
+// `false`, so even with a limit the result is a BitmapGlobalIndexResult (NOT scored).
+// This catches v0.1 callers that relied on `limit >= 0` to implicitly get scores.
+TEST_F(TantivyFilterLimitTest, WithScoreDefaultIsFalse) {
+    auto input = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["doc"], ["doc doc"], ["doc doc doc"]
+    ])")
+                     .ValueOrDie();
+    auto [mgr, meta] = WriteAndOpen(input, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, mgr, {}, GetDefaultPool()));
+    // Deliberately NOT assigning query->with_score: the default is under test.
+    auto query = std::make_shared<FullTextSearch>("f0", /*limit=*/2, "doc",
+                                                  FullTextSearch::SearchType::MATCH_ALL,
+                                                  /*pre_filter=*/std::nullopt);
+    auto result = reader->VisitFullTextSearch(query);
+    ASSERT_TRUE(result.ok()) << result.status().ToString();
+    // v0.2 contract: a limit on its own does not switch scoring on, so the
+    // scored cast must fail and the plain cast must succeed.
+    EXPECT_FALSE(std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result.value()))
+        << "v0.2: limit alone must NOT imply scoring; with_score=true is required";
+    auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result.value());
+    ASSERT_TRUE(plain);
+}
+
+}  // namespace paimon::tantivy::test
diff --git a/src/paimon/global_index/tantivy/tantivy_global_index.cpp b/src/paimon/global_index/tantivy/tantivy_global_index.cpp
new file mode 100644
index 000000000..2eb0d1f79
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_global_index.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#include "paimon/global_index/tantivy/tantivy_global_index.h"
+
+#include "arrow/c/bridge.h"
+#include "fmt/format.h"
+#include "paimon/common/utils/options_utils.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_reader.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_writer.h"
+
+namespace paimon::tantivy {
+
+#define CHECK_NOT_NULL(pointer, error_msg)     \
+    do {                                       \
+        if (!(pointer)) {                      \
+            return Status::Invalid(error_msg); \
+        }                                      \
+    } while (0)
+
+TantivyGlobalIndex::TantivyGlobalIndex(const std::map<std::string, std::string>& options)
+    : options_(OptionsUtils::FetchOptionsWithPrefix(kOptionKeyPrefix, options)) {}
+
+Result<std::shared_ptr<GlobalIndexWriter>> TantivyGlobalIndex::CreateWriter(
+    const std::string& field_name, ::ArrowSchema* arrow_schema,
+    const std::shared_ptr<GlobalIndexFileWriter>& file_writer,
+    const std::shared_ptr<MemoryPool>& pool) const {
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::DataType> arrow_type,
+                                      arrow::ImportType(arrow_schema));
+    auto row_type = std::dynamic_pointer_cast<arrow::StructType>(arrow_type);
+    CHECK_NOT_NULL(row_type,
+                   "arrow schema must be struct type when create TantivyGlobalIndexWriter");
+    auto target_field = row_type->GetFieldByName(field_name);
+    CHECK_NOT_NULL(
+        target_field,
+        fmt::format("field {} not exist in arrow schema when create TantivyGlobalIndexWriter",
+                    field_name));
+    if (target_field->type()->id() != arrow::Type::STRING) {  // only string columns
+        return Status::Invalid("field type must be string");
+    }
+    return TantivyGlobalIndexWriter::Create(field_name, arrow_type, file_writer, options_, pool);
+}
+
+Result<std::shared_ptr<GlobalIndexReader>> TantivyGlobalIndex::CreateReader(
+    ::ArrowSchema* c_arrow_schema, const std::shared_ptr<GlobalIndexFileReader>& file_reader,
+    const std::vector<GlobalIndexIOMeta>& files, const std::shared_ptr<MemoryPool>& pool) const {
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Schema> arrow_schema,
+                                      arrow::ImportSchema(c_arrow_schema));
+    if (files.size() != 1) {  // exactly one index file is expected per shard
+        return Status::Invalid(fmt::format(
+            "tantivy index only has one index file per shard, now num: {}", files.size()));
+    }
+    if (arrow_schema->num_fields() != 1) {  // the single field is the indexed column
+        return Status::Invalid("TantivyGlobalIndex now only support one field");
+    }
+    auto index_field = arrow_schema->field(0);
+    if (index_field->type()->id() != arrow::Type::type::STRING) {  // only string columns
+        return Status::Invalid("field type must be string");
+    }
+    return TantivyGlobalIndexReader::Create(index_field->name(), files[0], file_reader, options_,
+                                            pool);
+}
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_global_index.h b/src/paimon/global_index/tantivy/tantivy_global_index.h
new file mode 100644
index 000000000..f380cafa1
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_global_index.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/type.h"
+#include "paimon/global_index/global_indexer.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+
+namespace paimon::tantivy {
+
+/// `GlobalIndexer` implementation backed by tantivy-fulltext. Counterpart to
+/// `LuceneGlobalIndex`; the two coexist (and are NOT cross-readable) per
+/// migration plan §0 decision 1. Selection between them happens at the
+/// factory layer via the `index_type` identifier.
+class TantivyGlobalIndex : public GlobalIndexer {
+ public:
+    explicit TantivyGlobalIndex(const std::map<std::string, std::string>& options);
+
+    Result<std::shared_ptr<GlobalIndexWriter>> CreateWriter(
+        const std::string& field_name, ::ArrowSchema* arrow_schema,
+        const std::shared_ptr<GlobalIndexFileWriter>& file_writer,
+        const std::shared_ptr<MemoryPool>& pool) const override;
+
+    Result<std::shared_ptr<GlobalIndexReader>> CreateReader(
+        ::ArrowSchema* arrow_schema, const std::shared_ptr<GlobalIndexFileReader>& file_reader,
+        const std::vector<GlobalIndexIOMeta>& files,
+        const std::shared_ptr<MemoryPool>& pool) const override;
+
+ private:
+    /// Options after the `tantivy-fulltext.` prefix has been stripped.
+    std::map<std::string, std::string> options_;
+};
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_factory.cpp b/src/paimon/global_index/tantivy/tantivy_global_index_factory.cpp
new file mode 100644
index 000000000..0227d17bb
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_global_index_factory.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#include "paimon/global_index/tantivy/tantivy_global_index_factory.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "paimon/factories/factory.h"
+#include "paimon/global_index/tantivy/tantivy_global_index.h"
+
+namespace paimon::tantivy {
+
+/// Identifier convention: lucene-fts uses "lucene-fts-global"; we use
+/// "tantivy-fulltext-global" so `GlobalIndexerFactory::Get("tantivy-fulltext", ...)`
+/// (which appends "-global") routes to us. Keeps both backends discoverable
+/// via the same lookup path.
+const char TantivyGlobalIndexFactory::IDENTIFIER[] = "tantivy-fulltext-global";
+
+Result<std::unique_ptr<GlobalIndexer>> TantivyGlobalIndexFactory::Create(
+    const std::map<std::string, std::string>& options) const {
+    return std::make_unique<TantivyGlobalIndex>(options);
+}
+
+REGISTER_PAIMON_FACTORY(TantivyGlobalIndexFactory);
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_factory.h b/src/paimon/global_index/tantivy/tantivy_global_index_factory.h
new file mode 100644
index 000000000..22d456e16
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_global_index_factory.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "paimon/global_index/global_indexer.h"
+#include "paimon/global_index/global_indexer_factory.h"
+
+namespace paimon::tantivy {
+
+/// Factory for creating tantivy-fulltext global indexers. Registered into
+/// `FactoryCreator` via `REGISTER_PAIMON_FACTORY` so it is selectable
+/// alongside `lucene-fts-global` by passing `index_type = "tantivy-fulltext"`
+/// (the suffix `-global` is appended automatically by
+/// `GlobalIndexerFactory::Get`).
+class TantivyGlobalIndexFactory : public GlobalIndexerFactory {
+ public:
+    static const char IDENTIFIER[];
+
+    const char* Identifier() const override {
+        return IDENTIFIER;
+    }
+
+    Result<std::unique_ptr<GlobalIndexer>> Create(
+        const std::map<std::string, std::string>& options) const override;
+};
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_reader.cpp b/src/paimon/global_index/tantivy/tantivy_global_index_reader.cpp
new file mode 100644
index 000000000..4f0690ce5
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_global_index_reader.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#include "paimon/global_index/tantivy/tantivy_global_index_reader.h"
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <mutex>  // [BUG_QPLEAK_RUST]
+#include <vector>
+
+#include "fmt/format.h"
+#include "paimon/common/utils/options_utils.h"
+#include "paimon/common/utils/rapidjson_util.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/tantivy/tantivy_archive_layout.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_log.h"  // [BUG_QPLEAK_RUST]
+#include "paimon/global_index/tantivy/tantivy_ffi_status.h"
+#include "paimon/global_index/tantivy/tantivy_stream_ctx.h"
+
+namespace paimon::tantivy {
+
+namespace {
+
+// [BUG_QPLEAK_RUST] one-shot install of Rust log bridge so log::warn! in Rust
+// surfaces in BE's cn.WARNING via glog.
+void EnsureTantivyLogBridge() {
+    static std::once_flag flag;
+    std::call_once(flag, [] { InstallTantivyLogBridge(); });
+}
+
+/// Returns the jieba dictionary dir from the env var, or an empty string if the env
+/// var is missing/empty. We intentionally do NOT error here: paimon-java tantivy
+/// archives use the built-in `"default"` (SimpleTokenizer) and do not need jieba —
+/// the Rust reader's tokenizer-registration branch skips dict_dir entirely in that
+/// case (third_party/tantivy_ffi/src/reader.rs:111 → `let _ = (mode, dict_dir)`).
+/// For archives that DO use jieba (paimon-cpp-written with `tantivy.write.tokenizer
+/// = paimon_jieba`), the Rust side will surface a clear "create paimon_jieba
+/// tokenizer" failure when it tries to load the dictionary from an empty path, so
+/// the error stays actionable.
+std::string GetJiebaDictionaryDir() {
+    const char* dict_env = std::getenv(kJiebaDictDirEnv);
+    // An unset variable and an empty one are treated alike: both produce an
+    // empty path rather than an error.
+    const bool has_value = dict_env != nullptr && dict_env[0] != '\0';
+    return has_value ? std::string(dict_env) : std::string();
+}
+
+}  // namespace
+
+Result<std::shared_ptr<TantivyGlobalIndexReader>> TantivyGlobalIndexReader::Create(
+    const std::string& field_name, const GlobalIndexIOMeta& io_meta,
+    const std::shared_ptr<GlobalIndexFileReader>& file_reader,
+    const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool) {
+    (void)field_name;          // Rust-side knows the field via the schema embedded in meta.json
+    EnsureTantivyLogBridge();  // [BUG_QPLEAK_RUST]
+
+    std::map<std::string, std::string> write_options;
+    if (io_meta.metadata) {
+        PAIMON_RETURN_NOT_OK(RapidJsonUtil::FromJsonString(
+            std::string(io_meta.metadata->data(), io_meta.metadata->size()), &write_options));
+    }
+
+    PAIMON_ASSIGN_OR_RAISE(
+        std::string tokenize_mode,
+        OptionsUtils::GetValueFromMap(options, kJiebaTokenizeMode, std::string("")));
+    if (tokenize_mode.empty()) {
+        // Reader-side option not set; look at the (possibly empty) write_options blob.
+        // When write_options is empty (paimon-java-written archive), the value below is
+        // a placeholder that satisfies FFI validation but is discarded at runtime —
+        // see the comment block above. Do NOT treat the placeholder as a real default
+        // for jieba indices; jieba archives written by paimon-cpp always stamp their
+        // chosen mode into metadata, so the placeholder branch never applies to them.
+        PAIMON_ASSIGN_OR_RAISE(
+            tokenize_mode, OptionsUtils::GetValueFromMap(write_options, kJiebaTokenizeMode,
+                                                         std::string(kDefaultJiebaTokenizeMode)));
+    }
+    PAIMON_ASSIGN_OR_RAISE(
+        bool omit_term_freq_and_positions,
+        OptionsUtils::GetValueFromMap(write_options, kTantivyWriteOmitTermFreqAndPositions, false));
+
+    std::string dict_dir = GetJiebaDictionaryDir();
+
+    // V3 streaming read path:
+    //   1) open stream
+    //   2) ParseArchiveHeader — reads only header bytes, seeks past payloads
+    //   3) wrap stream in StreamCtx (owned by Rust via release callback)
+    //   4) build PaimonStreamCallbacks → paimon_tantivy_reader_new_streaming
+    // Archive payloads are read lazily through read_at callbacks as tantivy
+    // accesses posting lists, meta.json, etc.
+    PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<InputStream> stream,
+                           file_reader->GetInputStream(io_meta.file_path));
+    PAIMON_ASSIGN_OR_RAISE(ArchiveLayout layout, ParseArchiveHeader(stream.get()));
+
+    // Build C-string array pointing into layout.names (stable during this call).
+    // Done before stream_ctx is allocated so a failed vector allocation cannot leak it.
+    std::vector<const char*> name_ptrs;
+    name_ptrs.reserve(layout.count);
+    for (const auto& n : layout.names) {
+        name_ptrs.push_back(n.c_str());
+    }
+
+    // Transfer stream ownership to a heap-allocated StreamCtx; Rust will
+    // `paimon_cpp_stream_release(ctx)` on reader drop, which `delete`s it.
+    auto* stream_ctx = new StreamCtx{std::move(stream), {}};
+    PaimonStreamCallbacks callbacks{
+        static_cast<void*>(stream_ctx),
+        paimon_cpp_stream_read_at,
+        paimon_cpp_stream_release,
+    };
+    PaimonTantivyReader* raw = nullptr;
+    ::PaimonTantivyStatus st = paimon_tantivy_reader_new_streaming(
+        name_ptrs.data(), layout.offsets.data(), layout.lengths.data(), layout.count, callbacks,
+        tokenize_mode.c_str(),
+        /*with_position=*/!omit_term_freq_and_positions, dict_dir.c_str(), &raw);
+    if (st != PAIMON_TANTIVY_STATUS_OK) {
+        // On failure, Rust did NOT take ownership of ctx (FFI contract):
+        // release it here so the stream doesn't leak.
+        paimon_cpp_stream_release(stream_ctx);
+        PAIMON_TANTIVY_RETURN_NOT_OK(st);
+    }
+    return std::shared_ptr<TantivyGlobalIndexReader>(
+        new TantivyGlobalIndexReader(ReaderPtr(raw), pool));
+}
+
+Result<std::shared_ptr<GlobalIndexResult>> TantivyGlobalIndexReader::VisitFullTextSearch(
+    const std::shared_ptr<FullTextSearch>& full_text_search) {
+    if (!full_text_search) {
+        return Status::Invalid("VisitFullTextSearch: null FullTextSearch pointer");
+    }
+
+    // Serialize pre_filter (if any) to croaring portable bytes for FFI. The bytes stay
+    // in the PAIMON_UNIQUE_PTR returned by Serialize() so its pool-aware deleter frees
+    // them; re-wrapping the raw pointer would swap in std::default_delete (ASAN-caught).
+    PAIMON_UNIQUE_PTR<Bytes> pre_filter_bytes_owned;
+    const char* pre_filter_ptr = nullptr;
+    std::size_t pre_filter_len = 0;
+    if (full_text_search->pre_filter.has_value()) {
+        pre_filter_bytes_owned = full_text_search->pre_filter.value().Serialize(pool_.get());
+        pre_filter_ptr = pre_filter_bytes_owned->data();
+        pre_filter_len = pre_filter_bytes_owned->size();
+    }
+
+    int32_t limit_arg = full_text_search->limit.has_value()
+                            ? static_cast<int32_t>(full_text_search->limit.value())
+                            : -1;
+
+    float min_score_arg =
+        full_text_search->min_score.has_value() ? full_text_search->min_score.value() : 0.0f;
+
+    BufferGuard out;
+    PaimonTantivyStatus st = paimon_tantivy_reader_search(
+        reader_.get(), static_cast<int32_t>(full_text_search->search_type),
+        full_text_search->query.data(), full_text_search->query.size(),
+        full_text_search->with_score, limit_arg, pre_filter_ptr, pre_filter_len, min_score_arg,
+        out.out());
+    PAIMON_TANTIVY_RETURN_NOT_OK(st);
+
+    // Decode `[u8 has_scores | u64 count | u64 row_ids[] | optional f32 scores[]]`.
+    // (B1 schema: row_id is the explicit u64 column read from the fast field.)
+    if (out.size() < 9) {
+        return Status::Invalid(
+            fmt::format("tantivy reader output too small ({} bytes)", out.size()));
+    }
+    const uint8_t* p = out.data();
+    bool has_scores = (p[0] != 0);
+    // v0.2 consistency check: the wire-level has_scores byte must match the caller's
+    // with_score flag. A mismatch would indicate FFI / wire-protocol drift.
+    if (has_scores != full_text_search->with_score) {
+        return Status::Invalid(fmt::format(
+            "tantivy wire protocol mismatch: caller with_score={} but buffer has_scores={}",
+            full_text_search->with_score, has_scores));
+    }
+    uint64_t count;
+    std::memcpy(&count, p + 1, sizeof(uint64_t));
+    // `count` comes off the wire: bound it by division before trusting the product,
+    // otherwise a huge value could wrap `expected` and slip past the size check.
+    const std::size_t row_bytes = has_scores ? 8 + 4 : 8;  // u64 row_id (+ f32 score)
+    std::size_t expected = 1 + 8 + count * row_bytes;
+    if (count > (out.size() - 9) / row_bytes || out.size() != expected) {
+        return Status::Invalid(fmt::format(
+            "tantivy reader output size mismatch: has_scores={} count={} expected {} bytes, got {}",
+            has_scores, count, expected, out.size()));
+    }
+
+    const uint8_t* row_id_p = p + 9;
+    if (!has_scores) {
+        RoaringBitmap64 bitmap;
+        for (uint64_t i = 0; i < count; i++) {
+            uint64_t row_id;
+            std::memcpy(&row_id, row_id_p + i * 8, sizeof(uint64_t));
+            bitmap.Add(static_cast<int64_t>(row_id));
+        }
+        return std::make_shared<BitmapGlobalIndexResult>(
+            [b = std::move(bitmap)]() -> Result<RoaringBitmap64> { return b; });
+    }
+    // has_scores=true: produce BitmapScoredGlobalIndexResult. Rust may send rows
+    // in either row_id-asc order (path C: with_score=true, limit=None) or score-desc
+    // order (path D: with_score=true, limit=Some). The bitmap iteration order is
+    // row_id-asc (RoaringBitmap set semantics), so we always re-sort by row_id here
+    // to keep `scores[i]` aligned with the i-th row_id from the bitmap iterator —
+    // matching the contract documented in BitmapScoredGlobalIndexResult.
+    const uint8_t* score_p = row_id_p + count * 8;
+    std::vector<std::pair<int64_t, float>> id_score_pairs;
+    id_score_pairs.reserve(count);
+    for (uint64_t i = 0; i < count; i++) {
+        uint64_t row_id;
+        std::memcpy(&row_id, row_id_p + i * 8, sizeof(uint64_t));
+        float score;
+        std::memcpy(&score, score_p + i * 4, sizeof(float));
+        id_score_pairs.emplace_back(static_cast<int64_t>(row_id), score);
+    }
+    // Sort by row_id ascending so scores align with bitmap iteration order.
+    std::sort(id_score_pairs.begin(), id_score_pairs.end(),
+              [](const auto& a, const auto& b) { return a.first < b.first; });
+    RoaringBitmap64 bitmap;
+    std::vector<float> scores;
+    scores.reserve(id_score_pairs.size());
+    for (const auto& [id, sc] : id_score_pairs) {
+        bitmap.Add(id);
+        scores.push_back(sc);
+    }
+    return std::make_shared<BitmapScoredGlobalIndexResult>(std::move(bitmap), std::move(scores));
+}
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_reader.h b/src/paimon/global_index/tantivy/tantivy_global_index_reader.h
new file mode 100644
index 000000000..d115504c9
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_global_index_reader.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/global_index/global_index_io_meta.h"
+#include "paimon/global_index/global_index_reader.h"
+#include "paimon/global_index/io/global_index_file_reader.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_handle.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/predicate/full_text_search.h"
+
+namespace paimon::tantivy {
+
+/// Tantivy-backed implementation of `GlobalIndexReader`.
+///
+/// Mirrors LuceneGlobalIndexReader's surface but delegates query construction
+/// + execution into Rust over FFI. Supports the 5 FullTextSearch SearchTypes
+/// (MATCH_ALL, MATCH_ANY, PHRASE, PREFIX, WILDCARD); `VisitFullTextSearch`
+/// also forwards `with_score`, `limit`, `pre_filter` and `min_score`.
+///
+/// All non-FTS predicate visitors return nullptr (matches
+/// LuceneGlobalIndexReader), which the framework treats as "no filter
+/// constraint"; `VisitVectorSearch` is rejected with `Status::Invalid`.
+class TantivyGlobalIndexReader : public GlobalIndexReader {
+ public:
+    static Result<std::shared_ptr<TantivyGlobalIndexReader>> Create(
+        const std::string& field_name, const GlobalIndexIOMeta& io_meta,
+        const std::shared_ptr<GlobalIndexFileReader>& file_reader,
+        const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool);
+
+    // === FunctionVisitor surface — non-FTS predicates return nullptr (no constraint). ===
+
+    Result<std::shared_ptr<GlobalIndexResult>> VisitIsNotNull() override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitIsNull() override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitEqual(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitNotEqual(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitLessThan(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitLessOrEqual(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitGreaterThan(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitGreaterOrEqual(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitIn(const std::vector<Literal>&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitNotIn(const std::vector<Literal>&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitStartsWith(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitEndsWith(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitContains(const Literal&) override {
+        return CreateAllResult();
+    }
+    Result<std::shared_ptr<GlobalIndexResult>> VisitLike(const Literal&) override {
+        return CreateAllResult();
+    }
+
+    Result<std::shared_ptr<ScoredGlobalIndexResult>> VisitVectorSearch(
+        const std::shared_ptr<VectorSearch>&) override {
+        return Status::Invalid(
+            "TantivyGlobalIndexReader is not supposed to handle vector search query");
+    }
+
+    Result<std::shared_ptr<GlobalIndexResult>> VisitFullTextSearch(
+        const std::shared_ptr<FullTextSearch>& full_text_search) override;
+
+    bool IsThreadSafe() const override {
+        return false;
+    }
+
+    std::string GetIndexType() const override {
+        return kIdentifier;
+    }
+
+ private:
+    TantivyGlobalIndexReader(ReaderPtr reader, std::shared_ptr<MemoryPool> pool)
+        : reader_(std::move(reader)), pool_(std::move(pool)) {}
+
+    std::shared_ptr<GlobalIndexResult> CreateAllResult() const {
+        return nullptr;  // shared answer of every non-FTS predicate visitor above
+    }
+
+    /// Owning handle to the Rust-side reader.
+    ReaderPtr reader_;
+    /// MemoryPool used for serializing pre-filter bitmaps to bytes for FFI.
+    std::shared_ptr<MemoryPool> pool_;
+};
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_writer.cpp b/src/paimon/global_index/tantivy/tantivy_global_index_writer.cpp
new file mode 100644
index 000000000..f78bc6d41
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_global_index_writer.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#include "paimon/global_index/tantivy/tantivy_global_index_writer.h"
+
+#include <cstdlib>
+
+#include "arrow/c/bridge.h"
+#include "fmt/format.h"
+#include "paimon/common/global_index/global_index_utils.h"
+#include "paimon/common/utils/options_utils.h"
+#include "paimon/common/utils/rapidjson_util.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_status.h"
+#include "paimon/global_index/tantivy/tantivy_stream_ctx.h"
+
+namespace paimon::tantivy {
+
+#define CHECK_NOT_NULL(pointer, error_msg)     \
+    do {                                       \
+        if (!(pointer)) {                      \
+            return Status::Invalid(error_msg); \
+        }                                      \
+    } while (0)
+
+namespace {
+
+/// Resolve the jieba dictionary directory for the writer. Mirrors lucene-fts'
+/// LuceneUtils::GetJiebaDictionaryDir but kept separate to avoid coupling
+/// tantivy-fulltext to the lucene module.
+Result<std::string> GetJiebaDictionaryDir() {
+    const char* dict_env = std::getenv(kJiebaDictDirEnv);
+    if (dict_env == nullptr || dict_env[0] == '\0') {
+        return Status::Invalid(fmt::format(
+            "jieba dictionary dir not found, please set {} env var", kJiebaDictDirEnv));
+    }
+    return std::string(dict_env);
+}
+
+}  // namespace
+
+Result<std::shared_ptr<TantivyGlobalIndexWriter>> TantivyGlobalIndexWriter::Create(
+    const std::string& field_name, const std::shared_ptr<arrow::DataType>& arrow_type,
+    const std::shared_ptr<GlobalIndexFileWriter>& file_writer,
+    const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool) {
+    PAIMON_ASSIGN_OR_RAISE(
+        bool skip_positions,
+        OptionsUtils::GetValueFromMap(options, kTantivyWriteOmitTermFreqAndPositions, false));
+    PAIMON_ASSIGN_OR_RAISE(std::string jieba_mode,
+                           OptionsUtils::GetValueFromMap(options, kJiebaTokenizeMode,
+                                                         std::string(kDefaultJiebaTokenizeMode)));
+    PAIMON_ASSIGN_OR_RAISE(std::string tokenizer, OptionsUtils::GetValueFromMap(
+                                                      options, kTantivyWriteTokenizer,
+                                                      std::string(kDefaultTantivyWriteTokenizer)));
+    // The jieba dictionary is resolved only for the jieba tokenizer. Built-in
+    // tantivy tokenizers (e.g. "default") get an empty path, so callers are not
+    // forced to ship the dictionary and Rust skips jieba construction.
+    std::string jieba_dict_dir;
+    if (tokenizer == "paimon_jieba") {
+        PAIMON_ASSIGN_OR_RAISE(jieba_dict_dir, GetJiebaDictionaryDir());
+    }
+
+    PaimonTantivyWriter* raw_writer = nullptr;
+    PaimonTantivyStatus status = paimon_tantivy_writer_new(
+        field_name.c_str(), jieba_mode.c_str(),
+        /*with_position=*/!skip_positions, jieba_dict_dir.c_str(), tokenizer.c_str(), &raw_writer);
+    PAIMON_TANTIVY_RETURN_NOT_OK(status);
+    WriterPtr owned_writer(raw_writer);
+    return std::shared_ptr<TantivyGlobalIndexWriter>(new TantivyGlobalIndexWriter(
+        field_name, arrow_type, std::move(owned_writer), file_writer, options, pool));
+}
+
+TantivyGlobalIndexWriter::TantivyGlobalIndexWriter(
+    const std::string& field_name, const std::shared_ptr<arrow::DataType>& arrow_type,
+    WriterPtr writer, const std::shared_ptr<GlobalIndexFileWriter>& file_writer,
+    const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool)
+    : pool_(pool),
+      field_name_(field_name),
+      arrow_type_(arrow_type),
+      writer_(std::move(writer)),
+      file_writer_(file_writer),
+      options_(options) {}
+
+Status TantivyGlobalIndexWriter::AddBatch(::ArrowArray* arrow_array,
+                                          std::vector<int64_t>&& relative_row_ids) {
+    // Same contract LuceneGlobalIndexWriter relies on: the first id is checked
+    // (mirrors lucene) and the caller is trusted to feed sequential ids per batch.
+    PAIMON_RETURN_NOT_OK(
+        GlobalIndexUtils::CheckRelativeRowIds(arrow_array, relative_row_ids, row_id_));
+    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr<arrow::Array> imported,
+                                      arrow::ImportArray(arrow_array, arrow_type_));
+    auto batch_struct = std::dynamic_pointer_cast<arrow::StructArray>(imported);
+    CHECK_NOT_NULL(batch_struct,
+                   "invalid input array in TantivyGlobalIndexWriter, must be struct array");
+    auto column = batch_struct->GetFieldByName(field_name_);
+    CHECK_NOT_NULL(
+        column,
+        fmt::format("invalid input array in TantivyGlobalIndexWriter, field {} not in input array",
+                    field_name_));
+    auto text_column = std::dynamic_pointer_cast<arrow::StringArray>(column);
+    CHECK_NOT_NULL(text_column,
+                   fmt::format("invalid input array in TantivyGlobalIndexWriter, field array {} "
+                               "is not a string array",
+                               field_name_));
+
+    const int64_t num_rows = text_column->length();
+    for (int64_t row = 0; row < num_rows; ++row, ++row_id_) {
+        // Null rows cross the FFI boundary as (nullptr, 0).
+        const bool is_null = text_column->IsNull(row);
+        std::string_view text;
+        if (!is_null) {
+            text = text_column->Value(row);
+        }
+        // B1 schema: pass the caller-tracked row_id as an explicit u64 field.
+        PaimonTantivyStatus add_status =
+            paimon_tantivy_writer_add(writer_.get(), static_cast<uint64_t>(row_id_),
+                                      is_null ? nullptr : text.data(), text.size());
+        PAIMON_TANTIVY_RETURN_NOT_OK(add_status);
+    }
+    return Status::OK();
+}
+
+Result<std::vector<GlobalIndexIOMeta>> TantivyGlobalIndexWriter::Finish() {
+    // W1 streaming finish: the output file is opened first, then Rust pipes the
+    // archive bytes through `paimon_cpp_writer_push` straight into the
+    // OutputStream. Peak RAM (Rust side) = 64KB buffer, independent of archive size.
+    PAIMON_ASSIGN_OR_RAISE(std::string archive_name, file_writer_->NewFileName(kIdentifier));
+    PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<OutputStream> sink,
+                           file_writer_->NewOutputStream(archive_name));
+
+    WriteCtx push_ctx{sink.get(), Status::OK()};
+    PaimonWriteCallbacks push_cb{
+        static_cast<void*>(&push_ctx),
+        paimon_cpp_writer_push,
+    };
+
+    int64_t indexed_rows = 0;
+    ::PaimonTantivyStatus finish_status =
+        paimon_tantivy_writer_finish_streaming(writer_.get(), push_cb, &indexed_rows);
+    if (finish_status != PAIMON_TANTIVY_STATUS_OK) {
+        // A Status stashed by the write callback carries more detail than the
+        // FFI code, so it wins; otherwise report the FFI-derived status.
+        if (!push_ctx.last_error.ok()) {
+            return push_ctx.last_error;
+        }
+        PAIMON_TANTIVY_RETURN_NOT_OK(finish_status);
+    }
+    if (row_id_ != indexed_rows) {
+        return Status::Invalid(
+            fmt::format("tantivy writer row count {} mismatch paimon inner row count {}",
+                        indexed_rows, row_id_));
+    }
+
+    PAIMON_RETURN_NOT_OK(sink->Flush());
+    PAIMON_RETURN_NOT_OK(sink->Close());
+
+    PAIMON_ASSIGN_OR_RAISE(int64_t file_size, file_writer_->GetFileSize(archive_name));
+    std::string metadata_json;
+    PAIMON_RETURN_NOT_OK(RapidJsonUtil::ToJsonString(options_, &metadata_json));
+    auto metadata = std::make_shared<Bytes>(metadata_json, pool_.get());
+    GlobalIndexIOMeta io_meta(file_writer_->ToPath(archive_name), file_size,
+                              /*metadata=*/metadata);
+    return std::vector<GlobalIndexIOMeta>({io_meta});
+}
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_global_index_writer.h b/src/paimon/global_index/tantivy/tantivy_global_index_writer.h
new file mode 100644
index 000000000..ed5421320
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_global_index_writer.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/type.h"
+#include "paimon/global_index/global_index_writer.h"
+#include "paimon/global_index/io/global_index_file_writer.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_handle.h"
+
+namespace paimon::tantivy {
+
+/// Tantivy-backed implementation of GlobalIndexWriter.
+///
+/// Mirrors LuceneGlobalIndexWriter's lifecycle:
+///   Create() → AddBatch()* → Finish()
+/// Each shard produces exactly one .index file via the GlobalIndexFileWriter,
+/// containing the full packed tantivy on-disk index in a single contiguous blob.
+///
+/// Indexes written by this class are NOT cross-readable with lucene-fts — see
+/// migration plan §0 decision 1. The C++ side of this writer is intentionally
+/// thin: index construction, segment merging, and packing all happen in Rust
+/// behind the FFI boundary.
+class TantivyGlobalIndexWriter : public GlobalIndexWriter {
+ public:
+    static Result<std::shared_ptr<TantivyGlobalIndexWriter>> Create(
+        const std::string& field_name, const std::shared_ptr<arrow::DataType>& arrow_type,
+        const std::shared_ptr<GlobalIndexFileWriter>& file_writer,
+        const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool);
+
+    ~TantivyGlobalIndexWriter() override = default;
+
+    Status AddBatch(::ArrowArray* arrow_array, std::vector<int64_t>&& relative_row_ids) override;
+
+    Result<std::vector<GlobalIndexIOMeta>> Finish() override;
+
+ private:
+    TantivyGlobalIndexWriter(const std::string& field_name,
+                             const std::shared_ptr<arrow::DataType>& arrow_type, WriterPtr writer,
+                             const std::shared_ptr<GlobalIndexFileWriter>& file_writer,
+                             const std::map<std::string, std::string>& options,
+                             const std::shared_ptr<MemoryPool>& pool);
+
+    std::shared_ptr<MemoryPool> pool_;  // allocates the metadata Bytes in Finish()
+    std::string field_name_;
+    std::shared_ptr<arrow::DataType> arrow_type_;
+    /// Owning handle to the Rust-side writer.
+    WriterPtr writer_;
+    std::shared_ptr<GlobalIndexFileWriter> file_writer_;  // names, opens and sizes the output file
+    std::map<std::string, std::string> options_;  // serialized to JSON as the file's metadata
+    /// Paimon-side row count; Finish() fails if the Rust-side row count differs from it.
+    int64_t row_id_ = 0;
+};
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_index_test.cpp b/src/paimon/global_index/tantivy/tantivy_index_test.cpp
new file mode 100644
index 000000000..81e3f365a
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_index_test.cpp
@@ -0,0 +1,283 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Stage 8 integration test: end-to-end via TantivyGlobalIndex (writer + reader),
+ * mirroring src/paimon/global_index/lucene/lucene_global_index_test.cpp.
+ *
+ * Validates parity with lucene-fts on:
+ *   - file naming: "tantivy-fulltext-global-index-{uuid}.index"
+ *   - meta JSON shape: option-prefix-stripped key/value pairs
+ *   - 5 SearchTypes against an English corpus
+ *   - MATCH_ALL / MATCH_ANY / PHRASE against a Chinese corpus (jieba "query" mode)
+ *   - limit + pre_filter + scoring (Stage 7) interactions
+ *   - factory registration: looking up "tantivy-fulltext" produces a tantivy indexer
+ */
+
+#include <memory>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/api.h"
+#include "arrow/type.h"
+#include "gtest/gtest.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/common/utils/string_utils.h"
+#include "paimon/core/global_index/global_index_file_manager.h"
+#include "paimon/core/index/index_path_factory.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/global_index/global_indexer_factory.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/global_index/tantivy/tantivy_global_index.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_factory.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_reader.h"
+#include "paimon/testing/utils/testharness.h"
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time"
+#endif
+
+namespace paimon::tantivy::test {
+
+namespace {
+
+class FakeIndexPathFactory : public IndexPathFactory {
+ public:
+    explicit FakeIndexPathFactory(const std::string& root) : base_dir_(root) {}
+    bool IsExternalPath() const override {
+        return false;
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(base_dir_, file_name);  // the only overload the tests rely on
+    }
+    std::string ToPath(const std::shared_ptr<IndexFileMeta>&) const override {
+        assert(false);  // stub: not expected to be called
+        return std::string();
+    }
+    std::string NewPath() const override {
+        assert(false);  // stub: not expected to be called
+        return std::string();
+    }
+
+ private:
+    std::string base_dir_;
+};
+
+class TantivyGlobalIndexIntegrationTest : public ::testing::Test {
+ public:
+    void SetUp() override {  // exports JIEBA_TEST_DICT_DIR under kJiebaDictDirEnv
+        setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1);
+    }
+
+    std::unique_ptr<::ArrowSchema> CreateArrowSchema(
+        const std::shared_ptr<arrow::DataType>& data_type) const {  // C-ABI export of data_type
+        auto c_schema = std::make_unique<::ArrowSchema>();
+        EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok());
+        return c_schema;
+    }
+
+    Result<GlobalIndexIOMeta> WriteGlobalIndex(const std::string& root,
+                                               const std::shared_ptr<arrow::DataType>& data_type,
+                                               const std::map<std::string, std::string>& options,
+                                               const std::shared_ptr<arrow::Array>& array,
+                                               int64_t /*unused_expected_range_end*/) const {
+        auto global_index = std::make_shared<TantivyGlobalIndex>(options);
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_writer = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        PAIMON_ASSIGN_OR_RAISE(std::shared_ptr<GlobalIndexWriter> w,
+                               global_index->CreateWriter("f0", CreateArrowSchema(data_type).get(),
+                                                          file_writer, pool_));
+        ::ArrowArray c_array;
+        PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array));
+        std::vector<int64_t> relative_row_ids(array->length());  // identity ids 0..n-1
+        for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i;
+        PAIMON_RETURN_NOT_OK(w->AddBatch(&c_array, std::move(relative_row_ids)));
+        PAIMON_ASSIGN_OR_RAISE(auto metas, w->Finish());  // checked below before metas[0] is read
+        if (metas.size() != 1u) return Status::Invalid("expected exactly one index file meta");
+        auto file_name = PathUtil::GetName(metas[0].file_path);
+        EXPECT_TRUE(StringUtils::StartsWith(file_name, "tantivy-fulltext-global-index-"))
+            << file_name;
+        EXPECT_TRUE(StringUtils::EndsWith(file_name, ".index"));
+        EXPECT_TRUE(metas[0].metadata);
+        return metas[0];
+    }
+
+    Result<std::shared_ptr<GlobalIndexReader>> CreateReader(
+        const std::string& root, const std::shared_ptr<arrow::DataType>& data_type,
+        const std::map<std::string, std::string>& options, const GlobalIndexIOMeta& meta) const {
+        auto global_index = std::make_shared<TantivyGlobalIndex>(options);
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_reader = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        return global_index->CreateReader(CreateArrowSchema(data_type).get(), file_reader, {meta},
+                                          pool_);
+    }
+
+    void CheckResult(const std::shared_ptr<GlobalIndexResult>& result,
+                     const std::vector<int64_t>& expected_ids) const {
+        const RoaringBitmap64* bitmap = nullptr;  // stays null for a null or unknown result type
+        if (auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result)) {
+            ASSERT_OK_AND_ASSIGN(bitmap, scored->GetBitmap());
+            ASSERT_EQ(scored->GetScores().size(), expected_ids.size());  // one score per hit
+        } else if (auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result)) {
+            ASSERT_OK_AND_ASSIGN(bitmap, plain->GetBitmap());
+        }
+        ASSERT_TRUE(bitmap);
+        ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected_ids))  // set equality; order-insensitive
+            << "result=" << bitmap->ToString()
+            << ", expected=" << RoaringBitmap64::From(expected_ids).ToString();
+    }
+
+ protected:
+    std::shared_ptr<MemoryPool> pool_ = GetDefaultPool();
+    std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+    std::shared_ptr<arrow::DataType> data_type_ =
+        arrow::struct_({arrow::field("f0", arrow::utf8())});
+};
+
+}  // namespace
+
+TEST_F(TantivyGlobalIndexIntegrationTest, EnglishCorpus) {
+    auto root_dir = paimon::test::UniqueTestDirectory::Create();
+    ASSERT_TRUE(root_dir);
+    std::string root = root_dir->Str();
+
+    std::map<std::string, std::string> options = {
+        {"tantivy-fulltext.write.omit-term-freq-and-position", "false"},
+    };
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([
+        ["This is an test document."],
+        ["This is an new document document document."],
+        ["Document document document document test."],
+        ["unordered user-defined doc id"]
+    ])")
+                     .ValueOrDie();
+    ASSERT_OK_AND_ASSIGN(auto meta, WriteGlobalIndex(root, data_type_, options, array, 3));
+    EXPECT_EQ(std::string(meta.metadata->data(), meta.metadata->size()),
+              R"({"write.omit-term-freq-and-position":"false"})");
+
+    ASSERT_OK_AND_ASSIGN(auto reader, CreateReader(root, data_type_, options, meta));
+    auto t_reader = std::dynamic_pointer_cast<TantivyGlobalIndexReader>(reader);
+    ASSERT_TRUE(t_reader);
+    EXPECT_EQ(t_reader->GetIndexType(), std::string(kIdentifier));
+
+    auto run = [&](const std::string& q, FullTextSearch::SearchType t,
+                   std::optional<int32_t> limit = std::nullopt,
+                   std::optional<RoaringBitmap64> filter = std::nullopt) {
+        // Scored path: with `limit`, top-N by BM25 is returned (unscored Path B would return an
+        // arbitrary N). A failed search yields nullptr, which CheckResult rejects via ASSERT.
+        auto fts = std::make_shared<FullTextSearch>("f0", limit, q, t, filter);
+        fts->with_score = true;
+        auto res = t_reader->VisitFullTextSearch(fts);
+        EXPECT_TRUE(res.ok()) << res.status().ToString();
+        return res.ok() ? res.value() : nullptr;
+    };
+
+    CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL, 10), {2, 1, 0});
+    CheckResult(run("document", FullTextSearch::SearchType::MATCH_ANY, 1), {2});
+    CheckResult(run("test document", FullTextSearch::SearchType::MATCH_ALL, 10), {2, 0});
+    CheckResult(run("test new", FullTextSearch::SearchType::MATCH_ANY, 10), {1, 0, 2});
+    CheckResult(run("test document", FullTextSearch::SearchType::PHRASE, 10), {0});
+    CheckResult(run("unordered", FullTextSearch::SearchType::MATCH_ALL, 10), {3});
+    CheckResult(run("unorder", FullTextSearch::SearchType::PREFIX, 10), {3});
+    CheckResult(run("*order*", FullTextSearch::SearchType::WILDCARD, 10), {3});
+    CheckResult(run("*or*er*", FullTextSearch::SearchType::WILDCARD, 10), {3});
+
+    // pre_filter
+    CheckResult(
+        run("document", FullTextSearch::SearchType::MATCH_ALL, 10, RoaringBitmap64::From({0l, 1l})),
+        {0, 1});
+    CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL, 10,
+                    RoaringBitmap64::From({2l, 100l})),
+                {2});
+    CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL, 10,
+                    RoaringBitmap64::From({20l, 100l})),
+                {});
+
+    // No limit
+    CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL), {0, 1, 2});
+    CheckResult(run("document", FullTextSearch::SearchType::MATCH_ALL, std::nullopt,
+                    RoaringBitmap64::From({2l})),
+                {2});
+    CheckResult(run("document test", FullTextSearch::SearchType::MATCH_ALL, std::nullopt,
+                    RoaringBitmap64::From({1l, 2l, 3l, 100l})),
+                {2});
+}
+
+TEST_F(TantivyGlobalIndexIntegrationTest, ChineseCorpus) {
+    auto root_dir = paimon::test::UniqueTestDirectory::Create();
+    ASSERT_TRUE(root_dir);
+    std::string root = root_dir->Str();
+
+    std::map<std::string, std::string> options = {
+        {"tantivy-fulltext.write.omit-term-freq-and-position", "false"},
+        {"tantivy-fulltext.tantivy.write.tokenizer", "paimon_jieba"},
+        {"tantivy-fulltext.jieba.tokenize-mode", "query"},
+    };
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([
+["QianWen 是一个基于 AI 的智能助手，类似于 Siri 和 Alexa。我们正在用 Python 开发 QianWen 的 Natural Language Understanding 模块，该模块支持多轮对话和意图识别功能，是新一代智能助手的核心技术之一。"],
+["最近开源了一个新项目叫ｑｉａｎｗｅｎ（全角字符），功能类似之前的 Qianwen，是一个面向 AI 应用的智能助手。它不仅支持 Machine Learning 和 NLP 技术，还提供了可扩展的开发框架，便于开发者构建自己的智能助手系统。"],
+["我们在测试 qianwen-core v1.2 和 ai-engine-alpha 中的 bug，重点优化了 qianwen 的响应速度和稳定性。本次更新增强了核心模块的功能，提升了智能助手的开发效率，并修复了与 NLP 模块相关的多个问题。"],
+["AI 助手开发中常用的技术包括 Speech Recognition、Natural Language Processing 和 Recommendation System。我们使用 TensorFlow 和 PyTorch 构建模型，开发了多个智能助手原型，支持语音交互和上下文理解功能，是当前热门的人工智能发展应用方向。"],
+["新一代的 AI 助手代号为「千问」，内部命名为 QianwenX-2024，计划在 next quarter 发布。QianwenX 将集成更强的 multimodel 能力，支持图像和文本联合处理，进一步提升智能助手的理解能力和交互体验，是未来智能助手的重要发展方向。"]
+    ])")
+                     .ValueOrDie();
+    ASSERT_OK_AND_ASSIGN(auto meta, WriteGlobalIndex(root, data_type_, options, array, 4));
+    EXPECT_EQ(
+        std::string(meta.metadata->data(), meta.metadata->size()),
+        R"({"jieba.tokenize-mode":"query","tantivy.write.tokenizer":"paimon_jieba","write.omit-term-freq-and-position":"false"})");
+
+    ASSERT_OK_AND_ASSIGN(auto reader, CreateReader(root, data_type_, options, meta));
+    auto t_reader = std::dynamic_pointer_cast<TantivyGlobalIndexReader>(reader);
+    ASSERT_TRUE(t_reader);
+
+    auto run = [&](const std::string& q, FullTextSearch::SearchType t,
+                   std::optional<int32_t> limit = std::nullopt,
+                   std::optional<RoaringBitmap64> filter = std::nullopt) {
+        // Scored path: with `limit`, top-N by BM25 is returned (unscored Path B would return an
+        // arbitrary N). A failed search yields nullptr, which CheckResult rejects via ASSERT.
+        auto fts = std::make_shared<FullTextSearch>("f0", limit, q, t, filter);
+        fts->with_score = true;
+        auto res = t_reader->VisitFullTextSearch(fts);
+        EXPECT_TRUE(res.ok()) << res.status().ToString();
+        return res.ok() ? res.value() : nullptr;
+    };
+
+    CheckResult(run("模块", FullTextSearch::SearchType::MATCH_ALL, 10), {0, 2});
+    CheckResult(run("模块", FullTextSearch::SearchType::MATCH_ANY, 1), {0});
+    CheckResult(run("模块技术", FullTextSearch::SearchType::MATCH_ALL, 10), {0});
+    CheckResult(run("模块技术", FullTextSearch::SearchType::MATCH_ANY, 10), {0, 1, 2, 3});
+    CheckResult(run("发展方向", FullTextSearch::SearchType::PHRASE, 10), {4});
+    CheckResult(run("模块技术", FullTextSearch::SearchType::MATCH_ANY, 10,
+                    RoaringBitmap64::From({1l, 3l, 4l})),
+                {1, 3});
+    CheckResult(run("模块技术", FullTextSearch::SearchType::MATCH_ANY), {0, 1, 2, 3});
+}
+
+TEST_F(TantivyGlobalIndexIntegrationTest, FactoryLookupReturnsTantivyIndexer) {
+    // GlobalIndexerFactory::Get receives only the prefix ("-global" is appended
+    // automatically), so the bare "tantivy-fulltext" must resolve to our factory.
+    std::map<std::string, std::string> options = {
+        {"tantivy-fulltext.jieba.tokenize-mode", "query"},
+    };
+    ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalIndexer> indexer,
+                         GlobalIndexerFactory::Get("tantivy-fulltext", options));
+    ASSERT_TRUE(indexer);
+    const bool is_tantivy = dynamic_cast<TantivyGlobalIndex*>(indexer.get()) != nullptr;
+    ASSERT_TRUE(is_tantivy) << "factory did not return a TantivyGlobalIndex";
+}
+
+}  // namespace paimon::tantivy::test
diff --git a/src/paimon/global_index/tantivy/tantivy_java_compat_test.cpp b/src/paimon/global_index/tantivy/tantivy_java_compat_test.cpp
new file mode 100644
index 000000000..fbfdd8fa2
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_java_compat_test.cpp
@@ -0,0 +1,535 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * J6: cross-read test — paimon-java produces the tantivy archive, paimon-cpp
+ * V3 reader consumes it.
+ *
+ * The fixture (`english_simple.archive` + `english_simple.golden.json`) is
+ * generated by `TantivyIndexFixtureGen.java` in the paimon repo using the
+ * production `TantivyIndexWriter` + `packIndex` path. Ten pure-ASCII English
+ * documents (row_id 0..9) are indexed; for each SearchType we assert the V3
+ * reader returns exactly the row_ids the Java side wrote — evidence that
+ * archive byte format, schema, and segment-level byte format all line up
+ * across the Java/C++ implementations.
+ *
+ * Architectural cross-checks this test guards:
+ *   1. Big-endian (BE) archive format parsing (ParseArchiveHeader)
+ *   2. Multi-segment layout (Java does not force-merge; 20+ files in fixture)
+ *   3. Schema interop: `row_id` u64 fast field written by Java, read by C++ V3
+ *   4. Tokenizer parity on pure English (SimpleTokenizer ↔ paimon_jieba)
+ *   5. row_id caller-supplied invariant: reader returns the exact row_ids
+ *      Java wrote (0..9), NOT tantivy-internal doc_ids
+ */
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/api.h"
+#include "arrow/type.h"
+#include "gtest/gtest.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/core/global_index/global_index_file_manager.h"
+#include "paimon/core/index/index_path_factory.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/global_index/tantivy/tantivy_archive_layout.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/global_index/tantivy/tantivy_global_index.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_reader.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_writer.h"
+#include "paimon/predicate/full_text_search.h"
+#include "paimon/testing/utils/testharness.h"
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time"
+#endif
+#ifndef PAIMON_TANTIVY_JAVA_FIXTURE_DIR
+#error "PAIMON_TANTIVY_JAVA_FIXTURE_DIR must be set at compile time"
+#endif
+
+namespace paimon::tantivy::test {
+
+namespace {
+
+class FixturePathFactory : public IndexPathFactory {
+ public:
+    explicit FixturePathFactory(const std::string& root) : fixture_root_(root) {}
+    bool IsExternalPath() const override {
+        return false;
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(fixture_root_, file_name);  // resolves names under the root
+    }
+    std::string ToPath(const std::shared_ptr<IndexFileMeta>&) const override {
+        assert(false);  // stub: not expected to be called
+        return std::string();
+    }
+    std::string NewPath() const override {
+        assert(false);  // stub: not expected to be called
+        return std::string();
+    }
+
+ private:
+    std::string fixture_root_;
+};
+
+class JavaCompatTest : public ::testing::Test {
+ public:
+    void SetUp() override {
+        setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1);
+    }
+
+    /// Build a V3 reader over the Java-produced fixture `fixture_name` (relative to
+    /// `PAIMON_TANTIVY_JAVA_FIXTURE_DIR`); on error records a gtest failure and returns nullptr.
+    std::shared_ptr<GlobalIndexReader> OpenFixture(const std::string& fixture_name) {
+        std::string fixture_dir = PAIMON_TANTIVY_JAVA_FIXTURE_DIR;
+        std::string archive_path = PathUtil::JoinPath(fixture_dir, fixture_name);
+
+        auto status_res = fs_->GetFileStatus(archive_path);
+        EXPECT_TRUE(status_res.ok()) << archive_path << ": " << status_res.status().ToString();
+        if (!status_res.ok()) return nullptr;  // missing fixture: fail instead of unwrapping
+        int64_t file_size = status_res.value()->GetLen();
+        EXPECT_GT(file_size, 4) << "fixture archive must exist and be > 4 bytes";
+
+        // Empty metadata (options not needed for cross-read — we use defaults)
+        std::string metadata_json = "{}";
+        auto meta_bytes = std::make_shared<Bytes>(metadata_json, pool_.get());
+        GlobalIndexIOMeta io_meta(archive_path, file_size, meta_bytes);
+
+        std::map<std::string, std::string> options;
+        auto global_index = std::make_shared<TantivyGlobalIndex>(options);
+        auto path_factory = std::make_shared<FixturePathFactory>(fixture_dir);
+        auto file_reader = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+
+        auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+        auto c_schema = std::make_unique<::ArrowSchema>();
+        EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok());
+        auto reader_res = global_index->CreateReader(c_schema.get(), file_reader, {io_meta}, pool_);
+        EXPECT_TRUE(reader_res.ok()) << reader_res.status().ToString();
+        return reader_res.ok() ? reader_res.value() : nullptr;
+    }
+
+    std::shared_ptr<FullTextSearch> BuildFts(FullTextSearch::SearchType type,
+                                             const std::string& query) {
+        return std::make_shared<FullTextSearch>(
+            /*_field_name=*/"f0",
+            /*_limit=*/std::optional<int32_t>{},
+            /*_query=*/query,
+            /*_search_type=*/type,
+            /*_pre_filter=*/std::optional<RoaringBitmap64>{});
+    }
+
+    /// Run the search; returns the sorted row_ids, or {} once a gtest failure was recorded.
+    std::vector<int64_t> RunSearchRowIds(const std::shared_ptr<GlobalIndexReader>& reader,
+                                         FullTextSearch::SearchType type,
+                                         const std::string& query) {
+        EXPECT_TRUE(reader != nullptr) << "reader is null (fixture failed to open?)";
+        if (reader == nullptr) return {};
+        auto result = reader->VisitFullTextSearch(BuildFts(type, query));
+        EXPECT_TRUE(result.ok()) << result.status().ToString();
+        if (!result.ok()) return {};  // never unwrap an error Result
+        std::shared_ptr<GlobalIndexResult> r = result.value();
+        const RoaringBitmap64* bitmap = nullptr;
+        if (auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(r)) {
+            auto b = plain->GetBitmap();
+            EXPECT_TRUE(b.ok()) << b.status().ToString();
+            if (b.ok()) bitmap = b.value();
+        } else if (auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(r)) {
+            auto b = scored->GetBitmap();
+            EXPECT_TRUE(b.ok()) << b.status().ToString();
+            if (b.ok()) bitmap = b.value();
+        }
+        EXPECT_TRUE(bitmap != nullptr);
+        if (bitmap == nullptr) return {};
+        std::vector<int64_t> out;
+        for (auto it = bitmap->Begin(); it != bitmap->End(); ++it) {
+            out.push_back(static_cast<int64_t>(*it));
+        }
+        std::sort(out.begin(), out.end());
+        return out;
+    }
+
+ protected:
+    std::shared_ptr<MemoryPool> pool_ = GetDefaultPool();
+    std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+};
+
+}  // namespace
+
+// ============================================================================
+// 1. Archive basics: opening the Java-produced fixture succeeds
+// ============================================================================
+
+TEST_F(JavaCompatTest, OpenJavaArchiveSucceeds) {
+    const std::shared_ptr<GlobalIndexReader> opened = OpenFixture("english_simple.archive");
+    ASSERT_FALSE(opened == nullptr);  // a reader object must come back for the Java archive
+}
+
+// ============================================================================
+// 2. MATCH_ALL — single and multi-term
+// ============================================================================
+
+TEST_F(JavaCompatTest, MatchAll_Apple) {
+    // "apple" appears in docs 0 ("apple banana cherry"), 1 ("apple durian"),
+    // 4 ("apple cherry fig") and 7 ("apple").
+    const std::vector<int64_t> want = {0, 1, 4, 7};
+    auto reader = OpenFixture("english_simple.archive");
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, "apple"), want);
+}
+
+TEST_F(JavaCompatTest, MatchAll_AppleBanana_Intersection) {
+    const std::vector<int64_t> want = {0};  // doc 0 alone holds both "apple" and "banana"
+    auto reader = OpenFixture("english_simple.archive");
+    auto got = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, "apple banana");
+    EXPECT_EQ(got, want);
+}
+
+// ============================================================================
+// 3. MATCH_ANY — union
+// ============================================================================
+
+TEST_F(JavaCompatTest, MatchAny_DurianElderberry_Union) {
+    const std::vector<int64_t> want = {1, 5, 6, 8};  // durian {1, 6} + elderberry {5, 8}
+    auto reader = OpenFixture("english_simple.archive");
+    auto got = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY, "durian elderberry");
+    EXPECT_EQ(got, want);
+}
+
+// ============================================================================
+// 4. PHRASE — consecutive term order matters
+// ============================================================================
+
+TEST_F(JavaCompatTest, Phrase_AppleBanana) {
+    const std::vector<int64_t> want = {0};  // only doc 0 has "apple banana" back to back
+    auto reader = OpenFixture("english_simple.archive");
+    auto got = RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, "apple banana");
+    EXPECT_EQ(got, want);
+}
+
+TEST_F(JavaCompatTest, Phrase_BananaCherry) {
+    const std::vector<int64_t> want = {0, 2};  // doc 0 "apple banana cherry", doc 2 "banana cherry"
+    auto reader = OpenFixture("english_simple.archive");
+    auto got = RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, "banana cherry");
+    EXPECT_EQ(got, want);
+}
+
+// ============================================================================
+// 5. PREFIX — byte-level (not tokenized) via RegexQuery
+// ============================================================================
+
+TEST_F(JavaCompatTest, Prefix_Ap) {
+    const std::vector<int64_t> want = {0, 1, 4, 7};  // "ap" prefixes only "apple"
+    auto reader = OpenFixture("english_simple.archive");
+    auto got = RunSearchRowIds(reader, FullTextSearch::SearchType::PREFIX, "ap");
+    EXPECT_EQ(got, want);
+}
+
+// ============================================================================
+// 6. WILDCARD — glob-style via regex
+// ============================================================================
+
+TEST_F(JavaCompatTest, Wildcard_Err) {
+    const std::vector<int64_t> want = {0, 2, 4, 5, 6, 8, 9};  // "cherry" and "elderberry" rows
+    auto reader = OpenFixture("english_simple.archive");
+    auto got = RunSearchRowIds(reader, FullTextSearch::SearchType::WILDCARD, "*err*");
+    EXPECT_EQ(got, want);
+}
+
+// ============================================================================
+// 7. row_id invariant — must return the *caller-supplied* row_ids (not doc_ids)
+// ============================================================================
+
+TEST_F(JavaCompatTest, AllDocsReachableByRowId) {
+    // The union of all terms matches all 10 docs, so MATCH_ANY must return every
+    // caller-supplied row_id. That confirms Java wrote row_ids 0..9 via
+    // `addDocument(rowId, text)` and the paimon-cpp V3 reader extracted them via
+    // fast_fields().u64("row_id") — the schema B1 invariant survives the round-trip.
+    const std::vector<int64_t> want = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    auto reader = OpenFixture("english_simple.archive");
+    const std::string terms = "apple banana cherry durian fig grape elderberry";
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY, terms), want);
+}
+
+// ============================================================================
+// 8. Probe: real paimon-java production archive (handed over by Java team).
+//    Data was claimed to be (id INT, content STRING) with 5 rows but ids
+//    rewritten multiple times; dump layout + per-term hits so caller can
+//    reverse-engineer what's actually inside.
+// ============================================================================
+
+TEST_F(JavaCompatTest, ProductionSampleProbe) {
+    const std::string fixture_name = "production_sample.archive";
+    const std::string fixture_dir = PAIMON_TANTIVY_JAVA_FIXTURE_DIR;
+    const std::string archive_path = PathUtil::JoinPath(fixture_dir, fixture_name);
+
+    // 1) parse archive header, dump layout
+    auto stream_res = fs_->Open(archive_path);
+    ASSERT_TRUE(stream_res.ok()) << stream_res.status().ToString();
+    std::shared_ptr<InputStream> stream = std::move(stream_res).value();
+    auto layout_res = ParseArchiveHeader(stream.get());
+    ASSERT_TRUE(layout_res.ok()) << layout_res.status().ToString();
+    const auto& layout = layout_res.value();
+    std::cerr << "[PROBE] archive=" << fixture_name << " file_count=" << layout.count << "\n";
+    for (std::size_t i = 0; i < layout.count; ++i) {
+        std::cerr << "  [" << i << "] " << layout.names[i] << "  offset=" << layout.offsets[i]
+                  << "  length=" << layout.lengths[i] << "\n";
+    }
+
+    // 2) open a reader over the same archive (nothing is printed in this step)
+    auto reader = OpenFixture(fixture_name);
+    ASSERT_TRUE(reader != nullptr);
+
+    // 3) scan for keywords we'd expect based on user-provided text samples
+    //    ("Apache Paimon / full-text search / vector / lumina / streaming / ...").
+    //    tokenizer is "default" — lowercased word-granular tokens.
+    const std::vector<std::string> probes = {
+        "apache", "paimon",    "is",     "a",     "lake",       "format",     "supports",
+        "full",   "text",      "search", "in",    "vector",     "similarity", "using",
+        "lumina", "streaming", "and",    "batch", "processing", "engine",
+    };
+
+    std::cerr << "[PROBE] MATCH_ALL per-term row_ids:\n";
+    for (const auto& term : probes) {
+        auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, term);
+        std::cerr << "  " << term << " -> [";
+        for (std::size_t i = 0; i < ids.size(); ++i) {
+            if (i > 0) std::cerr << ", ";
+            std::cerr << ids[i];
+        }
+        std::cerr << "]\n";
+    }
+
+    // 4) union of everything to see every row_id present in the archive
+    std::string all_terms;
+    for (const auto& t : probes) {
+        if (!all_terms.empty()) all_terms += " ";
+        all_terms += t;
+    }
+    auto all_ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY, all_terms);
+    std::cerr << "[PROBE] union all probe terms -> row_id count=" << all_ids.size() << " [";
+    for (std::size_t i = 0; i < all_ids.size(); ++i) {
+        if (i > 0) std::cerr << ", ";
+        std::cerr << all_ids[i];
+    }
+    std::cerr << "]\n";
+
+    // 5) a few common phrases from the user's snippet
+    for (const auto& phrase : std::vector<std::string>{
+             "apache paimon", "full text", "vector similarity", "streaming and batch"}) {
+        auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, phrase);
+        std::cerr << "[PROBE] PHRASE \"" << phrase << "\" -> [";
+        for (std::size_t i = 0; i < ids.size(); ++i) {
+            if (i > 0) std::cerr << ", ";
+            std::cerr << ids[i];
+        }
+        std::cerr << "]\n";
+    }
+
+    // sanity: the archive is readable at all — at least one probe term hits.
+    bool any_hit = false;
+    for (const auto& term : probes) {
+        auto ids = RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, term);
+        if (!ids.empty()) {
+            any_hit = true;
+            break;
+        }
+    }
+    EXPECT_TRUE(any_hit) << "no probe term hit; archive may be empty or schema mismatched";
+}
+
+// ============================================================================
+// 9. Reverse direction: paimon-cpp writes with tokenizer="default" → fixture
+//    consumed by paimon-java test. This test emits the archive into
+//    test/test_data/cpp_tantivy_fixtures/english_default.archive and
+//    round-trips it through the cpp reader first (schema-driven tokenizer
+//    dispatch picks "default" automatically via P-TK).
+// ============================================================================
+
+namespace {
+
+/// GlobalIndexFileWriter that emits to a single fixed filename under `root`.
+/// Mirrors paimon-java's `FixedNameLocalFileWriter` from
+/// `TantivyIndexFixtureGen.java`: `newFileName(prefix)` ignores the prefix and
+/// always returns the caller-chosen name. Used to produce a stable fixture
+/// path consumed by the paimon-java cross-read test.
+class FixedNameGlobalIndexFileWriter : public GlobalIndexFileWriter {
+ public:
+    FixedNameGlobalIndexFileWriter(std::shared_ptr<FileSystem> fs, std::string root,
+                                   std::string fixed_name)
+        : fs_(std::move(fs)), root_(std::move(root)), fixed_name_(std::move(fixed_name)) {}
+
+    Result<std::string> NewFileName(const std::string& /*prefix*/) const override {
+        return fixed_name_;  // same name on every call; the prefix is ignored
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(root_, file_name);  // file names resolve under root_
+    }
+    Result<std::unique_ptr<OutputStream>> NewOutputStream(
+        const std::string& file_name) const override {
+        return fs_->Create(ToPath(file_name), /*overwrite=*/true);  // overwrite requested
+    }
+    Result<int64_t> GetFileSize(const std::string& file_name) const override {
+        PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<FileStatus> file_status,
+                               fs_->GetFileStatus(ToPath(file_name)));
+        return file_status->GetLen();  // length as reported by the file system
+    }
+
+ private:
+    std::shared_ptr<FileSystem> fs_;  // used for Create() and GetFileStatus()
+    std::string root_;                // directory that file names are joined onto
+    std::string fixed_name_;          // the only name NewFileName() ever returns
+};
+
+/// Same 10-doc English corpus paimon-java uses in TantivyIndexFixtureGen
+/// (pure ASCII, no punctuation inside words). SimpleTokenizer (tantivy's
+/// "default") tokenizes identically on both sides for this subset, so the
+/// golden row_ids match byte-for-byte between cpp-write and java-read.
+constexpr const char* kEnglishDocs[] = {
+    "apple banana cherry",   // row_id 0
+    "apple durian",          // row_id 1
+    "banana cherry",         // row_id 2
+    "fig grape",             // row_id 3
+    "apple cherry fig",      // row_id 4
+    "banana elderberry",     // row_id 5
+    "cherry durian",         // row_id 6
+    "apple",                 // row_id 7
+    "grape fig elderberry",  // row_id 8
+    "cherry fig",            // row_id 9
+};
+
+}  // namespace
+
+TEST_F(JavaCompatTest, CppWriteDefaultTokenizerForJavaCrossRead) {
+    // 1) Produce an archive into test/test_data/cpp_tantivy_fixtures/ via the
+    //    production TantivyGlobalIndexWriter, configured with tantivy's
+    //    built-in "default" tokenizer (same as paimon-java's TEXT field).
+    const std::string out_dir = PAIMON_TANTIVY_CPP_FIXTURE_DIR;
+    const std::string fixture_name = "english_default.archive";
+    // Ensure dir exists (CMake does NOT create it automatically).
+    {
+        auto mk = fs_->Mkdirs(out_dir);
+        ASSERT_TRUE(mk.ok()) << mk.ToString();
+    }
+    // Clean any prior fixture so each test run writes fresh bytes.
+    {
+        const std::string archive_path_cleanup = PathUtil::JoinPath(out_dir, fixture_name);
+        auto existing = fs_->GetFileStatus(archive_path_cleanup);
+        if (existing.ok()) {
+            ASSERT_TRUE(fs_->Delete(archive_path_cleanup, false).ok());
+        }
+    }
+
+    auto file_writer = std::make_shared<FixedNameGlobalIndexFileWriter>(fs_, out_dir, fixture_name);
+
+    auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+
+    std::map<std::string, std::string> options{
+        {kTantivyWriteTokenizer, "default"},
+    };
+    auto writer_res =
+        TantivyGlobalIndexWriter::Create("f0", data_type, file_writer, options, pool_);
+    ASSERT_TRUE(writer_res.ok()) << writer_res.status().ToString();
+    auto writer = writer_res.value();
+
+    // Build an arrow batch from kEnglishDocs: one single-field struct row per doc.
+    std::string json = "[";
+    for (std::size_t i = 0; i < sizeof(kEnglishDocs) / sizeof(kEnglishDocs[0]); ++i) {
+        if (i > 0) json += ",";
+        json += "[\"";
+        json += kEnglishDocs[i];
+        json += "\"]";
+    }
+    json += "]";
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, json).ValueOrDie();
+    ::ArrowArray c_array;
+    ASSERT_TRUE(arrow::ExportArray(*array, &c_array).ok());
+    std::vector<int64_t> relative_row_ids(array->length());
+    for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i;
+    ASSERT_TRUE(writer->AddBatch(&c_array, std::move(relative_row_ids)).ok());
+    auto metas_res = writer->Finish();
+    ASSERT_TRUE(metas_res.ok()) << metas_res.status().ToString();
+    ASSERT_EQ(metas_res.value().size(), 1u);
+    const auto& meta = metas_res.value().front();
+    const std::string archive_path = meta.file_path;
+    std::cerr << "[CPP-WRITE] archive_path=" << archive_path << " file_size=" << meta.file_size
+              << "\n";
+
+    // 2) Archive header sanity: the header parses and lists a "meta.json" entry.
+    auto stream_res = fs_->Open(archive_path);
+    ASSERT_TRUE(stream_res.ok()) << stream_res.status().ToString();
+    std::shared_ptr<InputStream> stream = std::move(stream_res).value();
+    auto layout_res = ParseArchiveHeader(stream.get());
+    ASSERT_TRUE(layout_res.ok()) << layout_res.status().ToString();
+    const auto& layout = layout_res.value();
+    std::cerr << "[CPP-WRITE] file_count=" << layout.count << "\n";
+    bool has_meta_json = false;
+    for (std::size_t i = 0; i < layout.count; ++i) {
+        if (layout.names[i] == "meta.json") has_meta_json = true;
+    }
+    EXPECT_TRUE(has_meta_json);
+
+    // 3) Round-trip through the cpp reader first. The reader is created with an
+    //    empty option map, so the tokenizer must be resolved from the archive's
+    //    schema rather than reader-side config. The reader is built directly off
+    //    the archive path (like OpenFixture, but rooted at the cpp fixtures dir).
+    auto file_status_res = fs_->GetFileStatus(archive_path);
+    ASSERT_TRUE(file_status_res.ok()) << file_status_res.status().ToString();
+    int64_t file_size = file_status_res.value()->GetLen();
+    auto meta_bytes = std::make_shared<Bytes>(std::string("{}"), pool_.get());
+    GlobalIndexIOMeta io_meta(archive_path, file_size, meta_bytes);
+    auto reader_factory =
+        std::make_shared<TantivyGlobalIndex>(std::map<std::string, std::string>{});
+    auto reader_path_factory = std::make_shared<FixturePathFactory>(out_dir);
+    auto reader_file_mgr = std::make_shared<GlobalIndexFileManager>(fs_, reader_path_factory);
+
+    auto c_schema = std::make_unique<::ArrowSchema>();
+    ASSERT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok());
+
+    auto reader_res =
+        reader_factory->CreateReader(c_schema.get(), reader_file_mgr, {io_meta}, pool_);
+    ASSERT_TRUE(reader_res.ok()) << reader_res.status().ToString();
+    auto reader = reader_res.value();
+
+    // Golden expectations (identical to paimon-java's english_simple.golden.json)
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, "apple"),
+              (std::vector<int64_t>{0, 1, 4, 7}));
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ALL, "apple banana"),
+              (std::vector<int64_t>{0}));
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY, "durian elderberry"),
+              (std::vector<int64_t>{1, 5, 6, 8}));
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, "apple banana"),
+              (std::vector<int64_t>{0}));
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::PHRASE, "banana cherry"),
+              (std::vector<int64_t>{0, 2}));
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::PREFIX, "ap"),
+              (std::vector<int64_t>{0, 1, 4, 7}));
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::WILDCARD, "*err*"),
+              (std::vector<int64_t>{0, 2, 4, 5, 6, 8, 9}));
+    EXPECT_EQ(RunSearchRowIds(reader, FullTextSearch::SearchType::MATCH_ANY,
+                              "apple banana cherry durian fig grape elderberry"),
+              (std::vector<int64_t>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}));
+
+    std::cerr << "[CPP-WRITE] SUCCESS: archive ready for paimon-java read at " << archive_path
+              << "\n";
+}
+
+}  // namespace paimon::tantivy::test
diff --git a/src/paimon/global_index/tantivy/tantivy_lucene_coexist_test.cpp b/src/paimon/global_index/tantivy/tantivy_lucene_coexist_test.cpp
new file mode 100644
index 000000000..dbee3946a
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_lucene_coexist_test.cpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Stage 9 coexistence test: prove lucene-fts and tantivy-fulltext can be linked
+ * + instantiated + used in the same process without state collisions, and
+ * that GlobalIndexerFactory routes correctly between them via index_type.
+ *
+ * The two implementations are NOT cross-readable (migration plan §0
+ * decision 1) — each reader only opens files written by its own writer.
+ * This test does NOT attempt a tantivy reader on a lucene file or vice
+ * versa; instead it verifies:
+ *
+ *   - both factories register without symbol clashes
+ *   - both writers can produce indexes side-by-side from identical input
+ *   - both readers return semantically equivalent doc id sets for queries
+ *     where tokenization differences don't matter (English bag-of-words)
+ *   - the two indexes coexist on disk under distinct identifiers
+ *     ("lucene-fts-global-index-*" vs "tantivy-fulltext-global-index-*")
+ */
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/api.h"
+#include "arrow/type.h"
+#include "fmt/format.h"
+#include "gtest/gtest.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/common/utils/string_utils.h"
+#include "paimon/core/global_index/global_index_file_manager.h"
+#include "paimon/core/index/index_path_factory.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/global_index/global_index_io_meta.h"
+#include "paimon/global_index/global_index_reader.h"
+#include "paimon/global_index/global_index_writer.h"
+#include "paimon/global_index/global_indexer.h"
+#include "paimon/global_index/global_indexer_factory.h"
+#include "paimon/global_index/lucene/lucene_defs.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/predicate/full_text_search.h"
+#include "paimon/testing/utils/testharness.h"
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time"
+#endif
+
+namespace paimon::tantivy::test {
+
+namespace {
+
+class FakeIndexPathFactory : public IndexPathFactory {
+ public:
+    explicit FakeIndexPathFactory(const std::string& root) : root_(root) {}
+    std::string NewPath() const override {
+        assert(false);  // unsupported by this fake
+        return "";      // only reached when assert() is compiled out (NDEBUG)
+    }
+    std::string ToPath(const std::shared_ptr<IndexFileMeta>&) const override {
+        assert(false);  // unsupported by this fake
+        return "";      // only reached when assert() is compiled out (NDEBUG)
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(root_, file_name);  // file names resolve under root_
+    }
+    bool IsExternalPath() const override {
+        return false;
+    }
+
+ private:
+    std::string root_;  // directory that file names are joined onto
+};
+
+/// Adopt one of the two factory identifiers; everything else (paths, queries,
+/// arrow plumbing) is shared.
+struct ImplSpec {
+    std::string factory_id;     // id passed to GlobalIndexerFactory::Get, e.g. "lucene-fts"
+    std::string file_prefix;    // expected file-name prefix, e.g. "lucene-fts-global-index-"
+    std::string option_prefix;  // "lucene-fts." or "tantivy-fulltext."; not read by these tests
+};
+
+class TantivyLuceneCoexistTest : public ::testing::Test {
+ public:
+    /// Points both implementations' jieba dictionary env vars at JIEBA_TEST_DICT_DIR.
+    void SetUp() override {
+        setenv(::paimon::lucene::kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1);
+        setenv(::paimon::tantivy::kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1);
+    }
+    /// Exports `data_type` into a freshly allocated ::ArrowSchema (EXPECTs the export is ok).
+    std::unique_ptr<::ArrowSchema> CreateArrowSchema(
+        const std::shared_ptr<arrow::DataType>& data_type) const {
+        auto c_schema = std::make_unique<::ArrowSchema>();
+        EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok());
+        return c_schema;
+    }
+    /// Indexes `array` (row ids 0..n-1) via `impl`'s factory under `root`; returns its one meta.
+    Result<GlobalIndexIOMeta> WriteWith(const ImplSpec& impl, const std::string& root,
+                                        const std::shared_ptr<arrow::DataType>& data_type,
+                                        const std::map<std::string, std::string>& options,
+                                        const std::shared_ptr<arrow::Array>& array) const {
+        PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<GlobalIndexer> indexer,
+                               GlobalIndexerFactory::Get(impl.factory_id, options));
+        if (!indexer) {
+            return Status::Invalid(fmt::format("factory returned null for {}", impl.factory_id));
+        }
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_writer = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        PAIMON_ASSIGN_OR_RAISE(
+            std::shared_ptr<GlobalIndexWriter> w,
+            indexer->CreateWriter("f0", CreateArrowSchema(data_type).get(), file_writer, pool_));
+        ::ArrowArray c_array;
+        PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array));
+        std::vector<int64_t> relative_row_ids(array->length());
+        for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i;
+        PAIMON_RETURN_NOT_OK(w->AddBatch(&c_array, std::move(relative_row_ids)));
+        PAIMON_ASSIGN_OR_RAISE(auto metas, w->Finish());
+        if (metas.size() != 1u) return Status::Invalid(impl.factory_id + ": expected 1 index file");
+        EXPECT_TRUE(
+            StringUtils::StartsWith(PathUtil::GetName(metas[0].file_path), impl.file_prefix))
+            << metas[0].file_path << " did not start with " << impl.file_prefix;
+        return metas[0];
+    }
+    /// Opens a reader over `meta` via `impl`'s factory, resolving file names under `root`.
+    Result<std::shared_ptr<GlobalIndexReader>> OpenReader(
+        const ImplSpec& impl, const std::string& root,
+        const std::shared_ptr<arrow::DataType>& data_type,
+        const std::map<std::string, std::string>& options, const GlobalIndexIOMeta& meta) const {
+        PAIMON_ASSIGN_OR_RAISE(std::unique_ptr<GlobalIndexer> indexer,
+                               GlobalIndexerFactory::Get(impl.factory_id, options));
+        if (!indexer) return Status::Invalid("factory returned null for " + impl.factory_id);
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_reader = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        return indexer->CreateReader(CreateArrowSchema(data_type).get(), file_reader, {meta},
+                                     pool_);
+    }
+    /// Doc ids in `result`'s bitmap; an unsupported or failed result EXPECT-fails and gives {}.
+    static std::set<int64_t> ExtractDocIds(const std::shared_ptr<GlobalIndexResult>& result) {
+        Result<const RoaringBitmap64*> br = Status::Invalid("no result");
+        if (auto scored = std::dynamic_pointer_cast<BitmapScoredGlobalIndexResult>(result)) {
+            br = scored->GetBitmap();
+        } else if (auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result)) {
+            br = plain->GetBitmap();
+        }
+        EXPECT_TRUE(br.ok()) << br.status().ToString();
+        std::set<int64_t> out;
+        if (!br.ok() || br.value() == nullptr) return out;
+        const RoaringBitmap64* bitmap = br.value();
+        for (auto it = bitmap->Begin(); it != bitmap->End(); ++it) {
+            out.insert(static_cast<int64_t>(*it));
+        }
+        return out;
+    }
+
+ protected:
+    std::shared_ptr<MemoryPool> pool_ = GetDefaultPool();
+    std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+
+    inline static const ImplSpec kLucene{"lucene-fts", "lucene-fts-global-index-", "lucene-fts."};
+    inline static const ImplSpec kTantivy{"tantivy-fulltext", "tantivy-fulltext-global-index-",
+                                          "tantivy-fulltext."};
+};
+
+}  // namespace
+
+TEST_F(TantivyLuceneCoexistTest, BothFactoriesResolve) {
+    // No options needed; just verify both factories register and dispatch.
+    ASSERT_OK_AND_ASSIGN(auto lucene_indexer, GlobalIndexerFactory::Get("lucene-fts", {}));
+    ASSERT_OK_AND_ASSIGN(auto tantivy_indexer, GlobalIndexerFactory::Get("tantivy-fulltext", {}));
+    ASSERT_TRUE(lucene_indexer);
+    ASSERT_TRUE(tantivy_indexer);
+    // Sanity only: the two indexers are distinct objects. This does not prove
+    // they are distinct types; routing by index type is asserted through
+    // GetIndexType() in SideBySideEnglishCorpusReturnsSameDocIds.
+    EXPECT_NE(static_cast<void*>(lucene_indexer.get()), static_cast<void*>(tantivy_indexer.get()));
+}
+
+TEST_F(TantivyLuceneCoexistTest, SideBySideEnglishCorpusReturnsSameDocIds) {
+    auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([
+        ["alpha beta gamma document"],
+        ["alpha alpha document"],
+        ["gamma delta epsilon"],
+        ["alpha beta document document"]
+    ])")
+                     .ValueOrDie();
+
+    auto lucene_root = paimon::test::UniqueTestDirectory::Create();
+    auto tantivy_root = paimon::test::UniqueTestDirectory::Create();
+    ASSERT_TRUE(lucene_root && tantivy_root);
+
+    // Lucene is given a tmp directory option for writing; tantivy is written with no options.
+    std::map<std::string, std::string> lucene_options = {
+        {"lucene-fts.write.tmp.directory", lucene_root->Str()}};
+
+    // Write through BOTH factories side by side in the same process.
+    ASSERT_OK_AND_ASSIGN(auto lucene_meta,
+                         WriteWith(kLucene, lucene_root->Str(), data_type, lucene_options, array));
+    ASSERT_OK_AND_ASSIGN(auto tantivy_meta,
+                         WriteWith(kTantivy, tantivy_root->Str(), data_type, {}, array));
+
+    ASSERT_OK_AND_ASSIGN(auto lucene_reader,
+                         OpenReader(kLucene, lucene_root->Str(), data_type, {}, lucene_meta));
+    ASSERT_OK_AND_ASSIGN(auto tantivy_reader,
+                         OpenReader(kTantivy, tantivy_root->Str(), data_type, {}, tantivy_meta));
+    EXPECT_EQ(lucene_reader->GetIndexType(), std::string("lucene-fts"));
+    EXPECT_EQ(tantivy_reader->GetIndexType(), std::string("tantivy-fulltext"));
+
+    auto run_pair = [&](const std::string& q, FullTextSearch::SearchType t) {
+        auto lr = lucene_reader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+            "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt));
+        auto tr = tantivy_reader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+            "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt));
+        EXPECT_TRUE(lr.ok()) << "lucene: " << lr.status().ToString();
+        EXPECT_TRUE(tr.ok()) << "tantivy: " << tr.status().ToString();
+        // A failed search has no value(); hand back empty sets so the golden checks fail.
+        if (!lr.ok() || !tr.ok()) return std::make_pair(std::set<int64_t>{}, std::set<int64_t>{});
+        return std::make_pair(ExtractDocIds(lr.value()), ExtractDocIds(tr.value()));
+    };
+
+    // On this English bag-of-words corpus both implementations are expected to agree.
+    {
+        auto [l, t] = run_pair("document", FullTextSearch::SearchType::MATCH_ALL);
+        EXPECT_EQ(l, t) << "MATCH_ALL document — lucene vs tantivy doc id set differs";
+        EXPECT_EQ(l, (std::set<int64_t>{0, 1, 3}));
+    }
+    {
+        auto [l, t] = run_pair("alpha beta", FullTextSearch::SearchType::MATCH_ALL);
+        EXPECT_EQ(l, t) << "MATCH_ALL 'alpha beta' — sets differ";
+        EXPECT_EQ(l, (std::set<int64_t>{0, 3}));
+    }
+    {
+        auto [l, t] = run_pair("alpha epsilon", FullTextSearch::SearchType::MATCH_ANY);
+        EXPECT_EQ(l, t) << "MATCH_ANY 'alpha epsilon' — sets differ";
+        EXPECT_EQ(l, (std::set<int64_t>{0, 1, 2, 3}));
+    }
+    {
+        auto [l, t] = run_pair("alpha beta", FullTextSearch::SearchType::PHRASE);
+        EXPECT_EQ(l, t) << "PHRASE 'alpha beta' — sets differ";
+        EXPECT_EQ(l, (std::set<int64_t>{0, 3}));
+    }
+}
+
+TEST_F(TantivyLuceneCoexistTest, IndependentLifecycleNoStateLeakage) {
+    // Repeatedly create one lucene index and one tantivy index, each in a fresh
+    // directory, within a single process. Global state leaking between factory
+    // instances would surface as a crash or as stale search results.
+    auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+
+    for (int round = 0; round < 3; ++round) {
+        auto batch = arrow::ipc::internal::json::ArrayFromJSON(data_type, R"([
+            ["round payload one"],
+            ["round payload two"]
+        ])")
+                         .ValueOrDie();
+        auto ldir = paimon::test::UniqueTestDirectory::Create();
+        auto tdir = paimon::test::UniqueTestDirectory::Create();
+        ASSERT_TRUE(ldir && tdir);
+
+        std::map<std::string, std::string> lopts = {
+            {"lucene-fts.write.tmp.directory", ldir->Str()}};
+        ASSERT_OK_AND_ASSIGN(auto lmeta, WriteWith(kLucene, ldir->Str(), data_type, lopts, batch));
+        ASSERT_OK_AND_ASSIGN(auto tmeta, WriteWith(kTantivy, tdir->Str(), data_type, {}, batch));
+        ASSERT_OK_AND_ASSIGN(auto lreader, OpenReader(kLucene, ldir->Str(), data_type, {}, lmeta));
+        ASSERT_OK_AND_ASSIGN(auto treader, OpenReader(kTantivy, tdir->Str(), data_type, {}, tmeta));
+        // Both readers must report every row of this round's batch for the shared term.
+        auto lhits = lreader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+            "f0", std::nullopt, "payload", FullTextSearch::SearchType::MATCH_ALL, std::nullopt));
+        auto thits = treader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+            "f0", std::nullopt, "payload", FullTextSearch::SearchType::MATCH_ALL, std::nullopt));
+        ASSERT_TRUE(lhits.ok());
+        ASSERT_TRUE(thits.ok());
+        const std::set<int64_t> expected_ids{0, 1};
+        EXPECT_EQ(ExtractDocIds(lhits.value()), expected_ids) << "lucene round " << round;
+        EXPECT_EQ(ExtractDocIds(thits.value()), expected_ids) << "tantivy round " << round;
+    }
+}
+
+}  // namespace paimon::tantivy::test
diff --git a/src/paimon/global_index/tantivy/tantivy_reader_test.cpp b/src/paimon/global_index/tantivy/tantivy_reader_test.cpp
new file mode 100644
index 000000000..ba3fe6299
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_reader_test.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Stage 6 reader test: write an index via TantivyGlobalIndexWriter, persist
+ * it, then run all 5 FullTextSearch SearchTypes through TantivyGlobalIndexReader
+ * and assert matching local row ids. Mirrors the no-limit / no-pre_filter
+ * subset of paimon-lucene-index-test's TestSimple/TestSimpleChinese cases.
+ *
+ * limit / pre_filter coverage lands in Stage 7 (paimon-tantivy-filter-limit-test).
+ */
+
+#include <memory>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/api.h"
+#include "arrow/type.h"
+#include "gtest/gtest.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/core/global_index/global_index_file_manager.h"
+#include "paimon/core/index/index_path_factory.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_reader.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_writer.h"
+#include "paimon/testing/utils/testharness.h"
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time"
+#endif
+
+namespace paimon::tantivy::test {
+
+namespace {
+
+class FakeIndexPathFactory : public IndexPathFactory {
+ public:
+    explicit FakeIndexPathFactory(const std::string& root) : root_(root) {}
+    std::string NewPath() const override {
+        assert(false);  // unsupported by this fake
+        return "";      // only reached when assert() is compiled out (NDEBUG)
+    }
+    std::string ToPath(const std::shared_ptr<IndexFileMeta>&) const override {
+        assert(false);  // unsupported by this fake
+        return "";      // only reached when assert() is compiled out (NDEBUG)
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(root_, file_name);  // file names resolve under root_
+    }
+    bool IsExternalPath() const override {
+        return false;
+    }
+
+ private:
+    std::string root_;  // directory that file names are joined onto
+};
+
+class TantivyReaderTest : public ::testing::Test {
+ public:
+    void SetUp() override {
+        setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1);
+    }
+
+    /// Write `array` to a fresh test directory and return (file_manager, meta).
+    std::pair<std::shared_ptr<GlobalIndexFileManager>, GlobalIndexIOMeta> WriteAndOpen(
+        const std::shared_ptr<arrow::Array>& array,
+        const std::map<std::string, std::string>& options) {
+        auto root_dir = paimon::test::UniqueTestDirectory::Create();
+        EXPECT_TRUE(root_dir);
+        // Park the directory handle in kept_dirs_ so the directory outlives this call;
+        // only its path string is needed to build the file manager below.
+        std::string root = root_dir->Str();
+        kept_dirs_.push_back(std::move(root_dir));
+
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto fm = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        auto data_type = DataType();
+        auto writer_res =
+            TantivyGlobalIndexWriter::Create("f0", data_type, fm, options, GetDefaultPool());
+        EXPECT_TRUE(writer_res.ok()) << writer_res.status().ToString();
+        auto writer = writer_res.value();
+        ::ArrowArray c_array;
+        EXPECT_TRUE(arrow::ExportArray(*array, &c_array).ok());
+        std::vector<int64_t> relative_row_ids(array->length());
+        for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i;
+        EXPECT_TRUE(writer->AddBatch(&c_array, std::move(relative_row_ids)).ok());
+        auto metas_res = writer->Finish();
+        EXPECT_TRUE(metas_res.ok()) << metas_res.status().ToString();
+        return {fm, metas_res.value()[0]};
+    }
+
+    /// Sorted row ids from `result`'s bitmap. A wrong result type or a failed GetBitmap()
+    /// EXPECT-fails; those cases (and a null bitmap) return {} instead of dereferencing.
+    static std::vector<int64_t> BitmapToVec(const std::shared_ptr<GlobalIndexResult>& result) {
+        std::vector<int64_t> ids;
+        auto bg = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(result);
+        EXPECT_TRUE(bg) << "expected BitmapGlobalIndexResult";
+        if (!bg) return ids;
+        auto bitmap_res = bg->GetBitmap();
+        EXPECT_TRUE(bitmap_res.ok()) << bitmap_res.status().ToString();
+        if (!bitmap_res.ok() || bitmap_res.value() == nullptr) return ids;
+        const RoaringBitmap64* bitmap = bitmap_res.value();
+        for (auto it = bitmap->Begin(); it != bitmap->End(); ++it) {
+            ids.push_back(static_cast<int64_t>(*it));
+        }
+        std::sort(ids.begin(), ids.end());
+        return ids;
+    }
+
+    std::shared_ptr<arrow::DataType> DataType() const {
+        return arrow::struct_({arrow::field("f0", arrow::utf8())});
+    }
+
+ protected:
+    std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+    /// Keep test directories alive for the duration of the test.
+    std::vector<std::unique_ptr<paimon::test::UniqueTestDirectory>> kept_dirs_;
+};
+
+}  // namespace
+
+TEST_F(TantivyReaderTest, EnglishMatchAllAndAny) {
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["This is an test document."],
+        ["This is an new document document document."],
+        ["Document document document document test."],
+        ["unordered user-defined doc id"]
+    ])")
+                     .ValueOrDie();
+    auto [fm, meta] = WriteAndOpen(array, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool()));
+
+    auto run = [&](const std::string& q, FullTextSearch::SearchType t) {
+        auto res = reader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+            "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt));
+        EXPECT_TRUE(res.ok()) << res.status().ToString();
+        return res.ok() ? BitmapToVec(res.value()) : std::vector<int64_t>{};  // empty on error
+    };
+
+    EXPECT_EQ(run("document", FullTextSearch::SearchType::MATCH_ALL),
+              (std::vector<int64_t>{0, 1, 2}));  // row 3 has no "document"
+    EXPECT_EQ(run("test document", FullTextSearch::SearchType::MATCH_ALL),
+              (std::vector<int64_t>{0, 2}));  // rows holding both terms
+    EXPECT_EQ(run("test new", FullTextSearch::SearchType::MATCH_ANY),
+              (std::vector<int64_t>{0, 1, 2}));  // rows holding either term
+}
+
+TEST_F(TantivyReaderTest, EnglishPhrasePrefixWildcard) {
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+        ["This is an test document."],
+        ["This is an new document document document."],
+        ["Document document document document test."],
+        ["unordered user-defined doc id"]
+    ])")
+                     .ValueOrDie();
+    auto [fm, meta] = WriteAndOpen(array, {});
+    ASSERT_OK_AND_ASSIGN(auto reader,
+                         TantivyGlobalIndexReader::Create("f0", meta, fm, {}, GetDefaultPool()));
+
+    auto run = [&](const std::string& q, FullTextSearch::SearchType t) {
+        auto res = reader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+            "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt));
+        EXPECT_TRUE(res.ok()) << res.status().ToString();
+        return res.ok() ? BitmapToVec(res.value()) : std::vector<int64_t>{};  // empty on error
+    };
+
+    // "test document" is consecutive only in row 0 ("an test document.")
+    EXPECT_EQ(run("test document", FullTextSearch::SearchType::PHRASE), (std::vector<int64_t>{0}));
+    EXPECT_EQ(run("unorder", FullTextSearch::SearchType::PREFIX), (std::vector<int64_t>{3}));
+    EXPECT_EQ(run("*order*", FullTextSearch::SearchType::WILDCARD), (std::vector<int64_t>{3}));
+    EXPECT_EQ(run("*or*er*", FullTextSearch::SearchType::WILDCARD), (std::vector<int64_t>{3}));
+}
+
+TEST_F(TantivyReaderTest, ChineseQueryMode) {
+    auto array = arrow::ipc::internal::json::ArrayFromJSON(DataType(), R"([
+["QianWen 是一个基于 AI 的智能助手，类似于 Siri 和 Alexa。我们正在用 Python 开发 QianWen 的 Natural Language Understanding 模块，该模块支持多轮对话和意图识别功能，是新一代智能助手的核心技术之一。"],
+["最近开源了一个新项目叫ｑｉａｎｗｅｎ（全角字符），功能类似之前的 Qianwen，是一个面向 AI 应用的智能助手。它不仅支持 Machine Learning 和 NLP 技术，还提供了可扩展的开发框架，便于开发者构建自己的智能助手系统。"],
+["我们在测试 qianwen-core v1.2 和 ai-engine-alpha 中的 bug，重点优化了 qianwen 的响应速度和稳定性。本次更新增强了核心模块的功能，提升了智能助手的开发效率，并修复了与 NLP 模块相关的多个问题。"],
+["AI 助手开发中常用的技术包括 Speech Recognition、Natural Language Processing 和 Recommendation System。我们使用 TensorFlow 和 PyTorch 构建模型，开发了多个智能助手原型，支持语音交互和上下文理解功能，是当前热门的人工智能发展应用方向。"],
+["新一代的 AI 助手代号为「千问」，内部命名为 QianwenX-2024，计划在 next quarter 发布。QianwenX 将集成更强的 multimodel 能力，支持图像和文本联合处理，进一步提升智能助手的理解能力和交互体验，是未来智能助手的重要发展方向。"]
+    ])")
+                     .ValueOrDie();
+    std::map<std::string, std::string> options = {
+        {kTantivyWriteTokenizer, "paimon_jieba"},
+        {kJiebaTokenizeMode, "query"},
+    };
+    auto [fm, meta] = WriteAndOpen(array, options);
+    ASSERT_OK_AND_ASSIGN(
+        auto reader, TantivyGlobalIndexReader::Create("f0", meta, fm, options, GetDefaultPool()));
+
+    auto run = [&](const std::string& q, FullTextSearch::SearchType t) {
+        auto res = reader->VisitFullTextSearch(std::make_shared<FullTextSearch>(
+            "f0", /*limit=*/std::nullopt, q, t, /*pre_filter=*/std::nullopt));
+        EXPECT_TRUE(res.ok()) << res.status().ToString();
+        return res.ok() ? BitmapToVec(res.value()) : std::vector<int64_t>{};  // empty on error
+    };
+
+    EXPECT_EQ(run("模块", FullTextSearch::SearchType::MATCH_ALL), (std::vector<int64_t>{0, 2}));
+    EXPECT_EQ(run("模块技术", FullTextSearch::SearchType::MATCH_ALL), (std::vector<int64_t>{0}));
+    EXPECT_EQ(run("模块技术", FullTextSearch::SearchType::MATCH_ANY),
+              (std::vector<int64_t>{0, 1, 2, 3}));
+    EXPECT_EQ(run("发展方向", FullTextSearch::SearchType::PHRASE), (std::vector<int64_t>{4}));
+}
+
+}  // namespace paimon::tantivy::test
diff --git a/src/paimon/global_index/tantivy/tantivy_smoke_test.cpp b/src/paimon/global_index/tantivy/tantivy_smoke_test.cpp
new file mode 100644
index 000000000..04f7915c7
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_smoke_test.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * tantivy-fulltext Stage 1 smoke test: prove the Rust FFI bridge is callable from C++.
+ * Intentionally minimal — exercises only paimon_tantivy_version().
+ * Later stages add real functional tests.
+ */
+
+#include <cstring>
+#include <string>
+
+#include "gtest/gtest.h"
+
+extern "C" {
+#include "paimon_tantivy_ffi.h"  // NOLINT(build/include_subdir)
+}
+
+namespace paimon::tantivy {
+
+TEST(TantivySmoke, VersionIsReachable) {
+    // The FFI call must hand back a usable C string before anything else is inspected.
+    const char* raw = paimon_tantivy_version();
+    ASSERT_NE(raw, nullptr) << "paimon_tantivy_version returned null";
+    const std::string text = raw;
+    EXPECT_FALSE(text.empty());
+    // Only the presence of a '.' separator is checked, as a cheap proxy for semver "x.y.z".
+    EXPECT_NE(text.find('.'), std::string::npos) << "expected semver, got: " << text;
+}
+
+TEST(TantivySmoke, VersionPointerIsStable) {
+    // Two back-to-back calls must agree on the string content; pointer identity is
+    // deliberately not required, only that both results are non-null and compare equal.
+    const char* first = paimon_tantivy_version();
+    const char* second = paimon_tantivy_version();
+    ASSERT_NE(first, nullptr);
+    ASSERT_NE(second, nullptr);
+    EXPECT_EQ(std::strcmp(first, second), 0);
+}
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_stream_ctx.cpp b/src/paimon/global_index/tantivy/tantivy_stream_ctx.cpp
new file mode 100644
index 000000000..b45572a71
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_stream_ctx.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ */
+
+#include "paimon/global_index/tantivy/tantivy_stream_ctx.h"
+
+#include <cstring>
+
+#include "fmt/format.h"
+#include "paimon/fs/file_system.h"
+
+namespace paimon::tantivy {
+
+// Rust -> C++ read callback: fills out_buf with exactly `len` bytes read at `offset`.
+// Returns 0 on success and 1 on any failure (null argument, read error, or EOF before `len`).
+extern "C" int32_t paimon_cpp_stream_read_at(void* ctx_ptr, uint64_t offset, std::size_t len,
+                                             uint8_t* out_buf) {
+    if (ctx_ptr == nullptr || out_buf == nullptr) return 1;
+    auto* ctx = static_cast<StreamCtx*>(ctx_ptr);
+    if (ctx->stream == nullptr) return 1;  // mirrors the null-OutputStream guard in writer_push
+    std::lock_guard<std::mutex> lock(ctx->pread_mu);  // serialize reads on this ctx
+    // Read() takes a uint32_t size and returns int32_t, so cap each request at INT32_MAX
+    // instead of truncating `len - total` with a narrowing cast (>= 4 GiB lengths broke).
+    constexpr std::size_t kMaxChunk = static_cast<std::size_t>(INT32_MAX);
+    std::size_t total = 0;
+    while (total < len) {
+        const std::size_t want = (len - total < kMaxChunk) ? (len - total) : kMaxChunk;
+        auto r = ctx->stream->Read(reinterpret_cast<char*>(out_buf + total),
+                                   static_cast<uint32_t>(want), offset + total);
+        if (!r.ok() || r.value() <= 0) {
+            return 1;  // IO error, unexpected EOF, or 0-byte read
+        }
+        total += static_cast<std::size_t>(r.value());
+    }
+    return 0;
+}
+
+extern "C" void paimon_cpp_stream_release(void* ctx_ptr) {
+    // Rust hands the ctx back as an opaque pointer; a null handle is simply ignored.
+    if (ctx_ptr != nullptr) {
+        // Destroying the StreamCtx drops its shared_ptr<InputStream>; the stream object itself
+        // goes away here only if this was the last reference to it.
+        delete static_cast<StreamCtx*>(ctx_ptr);
+    }
+}
+
+extern "C" int32_t paimon_cpp_writer_push(void* ctx_ptr, const uint8_t* data, std::size_t len) {
+    if (ctx_ptr == nullptr) return 1;  // no WriteCtx available to record an error in
+    auto* ctx = static_cast<WriteCtx*>(ctx_ptr);
+    if (ctx->out == nullptr) {
+        ctx->last_error = Status::Invalid("writer_push: null OutputStream");
+        return 1;
+    }
+    // Write() takes uint32_t and returns int32_t: cap requests at INT32_MAX, never truncate.
+    constexpr std::size_t kMaxChunk = static_cast<std::size_t>(INT32_MAX);
+    std::size_t total = 0;
+    while (total < len) {
+        const std::size_t want = (len - total < kMaxChunk) ? (len - total) : kMaxChunk;
+        auto r = ctx->out->Write(reinterpret_cast<const char*>(data + total),
+                                 static_cast<uint32_t>(want));
+        if (!r.ok()) {
+            ctx->last_error = r.status();
+            return 1;
+        }
+        if (r.value() <= 0) {  // no progress: report it instead of looping forever
+            ctx->last_error = Status::IOError(fmt::format(
+                "writer_push: short write (wrote {} of {} bytes)", r.value(), want));
+            return 1;
+        }
+        total += static_cast<std::size_t>(r.value());
+    }
+    return 0;
+}
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_stream_ctx.h b/src/paimon/global_index/tantivy/tantivy_stream_ctx.h
new file mode 100644
index 000000000..532ca4e35
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_stream_ctx.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+
+#include "paimon/status.h"
+
+namespace paimon {
+class InputStream;
+class OutputStream;
+}  // namespace paimon
+
+namespace paimon::tantivy {
+
+/// C++ side wrapper around a seekable InputStream, used as the `ctx` of
+/// `PaimonStreamCallbacks` (V3). Lifetime is transferred to Rust via
+/// `paimon_tantivy_reader_new_streaming`; Rust invokes `paimon_cpp_stream_release`
+/// when the reader handle is freed, which `delete`s this struct.
+///
+/// `pread_mu` is a defensive per-ctx lock: the underlying `InputStream::Read(
+/// buffer, size, offset)` is declared pread-style (thread-safe, no position
+/// mutation) but a few subclasses (notably `JindoInputStream`) have member-
+/// variable races in practice. Rust also has its own `stream_mutex` that
+/// serializes reads at the Directory level; `pread_mu` is belt-and-suspenders.
+struct StreamCtx {
+    std::shared_ptr<InputStream> stream;  // stream read by paimon_cpp_stream_read_at
+    std::mutex pread_mu;                  // held for the whole of each read_at call
+};
+
+/// `ctx` of `PaimonWriteCallbacks` (W1). Holds a raw (non-owning) pointer to
+/// a paimon `OutputStream` plus a sticky error for conveying write failures
+/// back to the C++ caller of `TantivyGlobalIndexWriter::Finish`.
+struct WriteCtx {
+    OutputStream* out = nullptr;       // not owned; null makes paimon_cpp_writer_push fail
+    Status last_error = Status::OK();  // set by paimon_cpp_writer_push when a write fails
+};
+
+/// Rust -> C++ read callback. Fills `out_buf` with exactly `len` bytes starting at
+/// archive-absolute `offset`. Returns 0 on success; 1 on a null argument, a failed read, or
+/// a read that returns <= 0 bytes before `len` is reached. Serialized via `StreamCtx::pread_mu`.
+extern "C" int32_t paimon_cpp_stream_read_at(void* ctx_ptr, uint64_t offset, std::size_t len,
+                                             uint8_t* out_buf);
+
+/// Rust -> C++ release callback, expected to run once per ctx when the Rust reader is
+/// dropped. Deletes the `StreamCtx`, releasing its `shared_ptr<InputStream>`; null is a no-op.
+extern "C" void paimon_cpp_stream_release(void* ctx_ptr);
+
+/// Rust -> C++ write push callback. Writes all `len` bytes from `data` to the underlying
+/// OutputStream, looping over partial writes. Returns 0 on success, 1 on failure; except for
+/// a null `ctx_ptr`, the cause is stashed in `WriteCtx::last_error` for the caller to pick up.
+extern "C" int32_t paimon_cpp_writer_push(void* ctx_ptr, const uint8_t* data, std::size_t len);
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_streaming_test.cpp b/src/paimon/global_index/tantivy/tantivy_streaming_test.cpp
new file mode 100644
index 000000000..7c9a6e0f7
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_streaming_test.cpp
@@ -0,0 +1,370 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * K4 streaming test: V3 Callback Directory + W1 streaming writer end-to-end.
+ *
+ * Coverage:
+ *   1. ParseArchiveHeaderFuzz — malformed header bytes rejected cleanly
+ *   2. ConcurrentQueryOnSameReader — 4 threads query same reader, serialized
+ *      by Rust stream_mutex, results consistent, no race
+ *   3. ConcurrentCreateAndDropReaders — 10 threads each open/query/close their
+ *      own reader on the same archive; no leaks, release exactly-once per reader
+ *   4. StreamingBenchmarkLog — builds a medium index, prints RSS/timing to
+ *      stderr for baseline comparison (execute.md archival)
+ *
+ * We don't duplicate tests already covered by the Rust unit tests
+ * (callback_directory::tests::* for Directory semantics, writer::tests::
+ * streaming_chunk_size_bounded_by_buffer for the 64KB buffer guarantee).
+ */
+
+#include <sys/resource.h>
+
+#include <atomic>
+#include <chrono>
+#include <cinttypes>
+#include <cstring>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+#include "arrow/type.h"
+#include "gtest/gtest.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/core/global_index/global_index_file_manager.h"
+#include "paimon/core/index/index_path_factory.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/global_index/bitmap_global_index_result.h"
+#include "paimon/global_index/bitmap_scored_global_index_result.h"
+#include "paimon/global_index/tantivy/tantivy_archive_layout.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/global_index/tantivy/tantivy_global_index.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_reader.h"
+#include "paimon/io/byte_array_input_stream.h"
+#include "paimon/predicate/full_text_search.h"
+#include "paimon/testing/utils/testharness.h"
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time"
+#endif
+
+namespace paimon::tantivy::test {
+
+namespace {
+
+class FakeIndexPathFactory : public IndexPathFactory {
+ public:
+    explicit FakeIndexPathFactory(const std::string& root) : root_(root) {}
+    std::string NewPath() const override {
+        assert(false);  // unsupported by this fake; aborts when assertions are enabled
+        return "";
+    }
+    std::string ToPath(const std::shared_ptr<IndexFileMeta>&) const override {
+        assert(false);  // unsupported by this fake; aborts when assertions are enabled
+        return "";
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(root_, file_name);  // resolve the file name under root_
+    }
+    bool IsExternalPath() const override {
+        return false;
+    }
+
+ private:
+    std::string root_;  // directory passed to the constructor
+};
+
+/// Result of `StreamingTestFixture::BuildArchive`: the index meta plus the directory it was
+/// written to. `holder` owns the temporary test directory object (presumably removed when it
+/// is destroyed — TODO confirm), so keep the WriteResult alive while `root_dir` is in use.
+struct WriteResult {
+    std::unique_ptr<paimon::test::UniqueTestDirectory> holder;  // owns the temp dir object
+    std::string root_dir;                                       // holder->Str() at build time
+    GlobalIndexIOMeta meta;                                     // metas[0] returned by Finish()
+};
+
+class StreamingTestFixture : public ::testing::Test {
+ public:
+    void SetUp() override { setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1); }
+
+    /// Indexes `n_docs` one-column ("f0") documents into a unique test dir; returns its meta.
+    WriteResult BuildArchive(std::size_t n_docs,
+                             const std::string& text_template = "apple banana cherry {}") {
+        auto root_dir = paimon::test::UniqueTestDirectory::Create();
+        EXPECT_TRUE(root_dir);
+        std::string root = root_dir->Str();
+        // Build arrow StringArray; the first "{}" of the template becomes the document index.
+        arrow::StringBuilder sb;
+        const auto slot = text_template.find("{}");
+        for (std::size_t i = 0; i < n_docs; ++i) {
+            std::string doc = text_template;
+            if (slot != std::string::npos) doc.replace(slot, 2, std::to_string(i));
+            EXPECT_TRUE(sb.Append(doc).ok());
+        }
+        auto text_array = sb.Finish().ValueOrDie();
+        auto struct_array =
+            arrow::StructArray::Make({text_array}, {arrow::field("f0", arrow::utf8())})
+                .ValueOrDie();
+
+        std::map<std::string, std::string> options;
+        auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+        auto c_schema = std::make_unique<::ArrowSchema>();
+        EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok());
+        auto global_index = std::make_shared<TantivyGlobalIndex>(options);
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_writer = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        auto w = global_index->CreateWriter("f0", c_schema.get(), file_writer, pool_).value();
+        ::ArrowArray c_array;
+        EXPECT_TRUE(arrow::ExportArray(*struct_array, &c_array).ok());
+        std::vector<int64_t> relative_row_ids(struct_array->length());
+        for (int64_t i = 0; i < struct_array->length(); ++i) relative_row_ids[i] = i;
+        EXPECT_TRUE(w->AddBatch(&c_array, std::move(relative_row_ids)).ok());
+        auto metas = w->Finish().value();
+        EXPECT_EQ(metas.size(), 1u);
+
+        // Move root_dir into the result — it stays alive as long as the
+        // caller holds WriteResult; cleaned up when TEST_F scope exits.
+        return WriteResult{std::move(root_dir), std::move(root), metas[0]};
+    }
+
+    /// Opens a reader on the archive described by `meta`, resolving files under `root`.
+    std::shared_ptr<GlobalIndexReader> OpenReader(const std::string& root,
+                                                  const GlobalIndexIOMeta& meta) {
+        std::map<std::string, std::string> options;
+        auto data_type = arrow::struct_({arrow::field("f0", arrow::utf8())});
+        auto c_schema = std::make_unique<::ArrowSchema>();
+        EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok());
+        auto global_index = std::make_shared<TantivyGlobalIndex>(options);
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_reader = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        return global_index->CreateReader(c_schema.get(), file_reader, {meta}, pool_).value();
+    }
+
+    std::shared_ptr<FullTextSearch> BuildMatchAll(const std::string& query) {
+        return std::make_shared<FullTextSearch>(
+            /*_field_name=*/"f0",
+            /*_limit=*/std::optional<int32_t>{},
+            /*_query=*/query,
+            /*_search_type=*/FullTextSearch::SearchType::MATCH_ALL,
+            /*_pre_filter=*/std::optional<RoaringBitmap64>{});
+    }
+
+ protected:
+    std::shared_ptr<MemoryPool> pool_ = GetDefaultPool();
+    std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+};
+
+// =========================================================================
+// 1. ParseArchiveHeader fuzz
+// =========================================================================
+
+TEST(ParseArchiveHeaderFuzz, TruncatedHeader) {
+    // Two bytes only: fewer than the 4 a leading int32 needs, so the header parse must fail.
+    std::string bytes(2, '\0');  // a "\x00\x00" literal would yield an empty string instead
+    ByteArrayInputStream in(bytes.data(), bytes.size());
+    auto r = ParseArchiveHeader(&in);
+    EXPECT_FALSE(r.ok()) << "expected failure on truncated header";
+}
+
+TEST(ParseArchiveHeaderFuzz, NegativeFileCount) {
+    // Four 0xFF bytes decode to -1 when read as a big-endian int32 file_count.
+    std::string header(4, static_cast<char>(0xFF));
+    ByteArrayInputStream stream(header.data(), header.size());
+    auto parsed = ParseArchiveHeader(&stream);
+    ASSERT_FALSE(parsed.ok());
+    // The rejection must name the offending field.
+    EXPECT_NE(parsed.status().message().find("negative file_count"), std::string::npos)
+        << parsed.status().ToString();
+}
+
+TEST(ParseArchiveHeaderFuzz, NameLenOutOfRange) {
+    // Two big-endian int32 fields: file_count = 1, then name_len = 0x7FFFFFFF (~2 GiB).
+    char header[8] = {0,
+                      0,
+                      0,
+                      1,  // file_count
+                      static_cast<char>(0x7F),
+                      static_cast<char>(0xFF),
+                      static_cast<char>(0xFF),
+                      static_cast<char>(0xFF)};  // name_len
+    ByteArrayInputStream stream(header, sizeof(header));
+    auto parsed = ParseArchiveHeader(&stream);
+    ASSERT_FALSE(parsed.ok());
+    EXPECT_NE(parsed.status().message().find("bad name_len"), std::string::npos)
+        << parsed.status().ToString();
+}
+
+TEST(ParseArchiveHeaderFuzz, ZeroFileCountSucceeds) {
+    // An all-zero count field is accepted by the parser: the header is well formed, it just
+    // lists no files. Whatever consumes the header later has to cope with the empty archive.
+    std::string header(4, '\0');
+    ByteArrayInputStream stream(header.data(), header.size());
+    auto parsed = ParseArchiveHeader(&stream);
+    ASSERT_TRUE(parsed.ok()) << parsed.status().ToString();
+    EXPECT_EQ(parsed.value().count, 0u);
+}
+
+TEST(ParseArchiveHeaderFuzz, PayloadLenNegative) {
+    // One entry named "a" whose data_len field is all 0xFF bytes, i.e. -1 as a BE int64.
+    char header[4 + 4 + 1 + 8] = {
+        // file_count = 1 (BE int32)
+        0,
+        0,
+        0,
+        1,
+        // name_len = 1 (BE int32)
+        0,
+        0,
+        0,
+        1,
+        // name = "a"
+        'a',
+        // data_len = -1 (BE int64, eight 0xFF bytes)
+        static_cast<char>(0xFF),
+        static_cast<char>(0xFF),
+        static_cast<char>(0xFF),
+        static_cast<char>(0xFF),
+        static_cast<char>(0xFF),
+        static_cast<char>(0xFF),
+        static_cast<char>(0xFF),
+        static_cast<char>(0xFF),
+    };
+    ByteArrayInputStream stream(header, sizeof(header));
+    auto parsed = ParseArchiveHeader(&stream);
+    ASSERT_FALSE(parsed.ok());
+    EXPECT_NE(parsed.status().message().find("negative data_len"), std::string::npos)
+        << parsed.status().ToString();
+}
+
+// =========================================================================
+// 2. Concurrent query on same reader
+// =========================================================================
+
+TEST_F(StreamingTestFixture, ConcurrentQueryOnSameReader) {
+    // Every one of the 50 documents contains "apple", so each query must report 50 rows.
+    auto wr = BuildArchive(50, "apple banana {}");
+    auto reader = OpenReader(wr.root_dir, wr.meta);
+    auto fts = BuildMatchAll("apple");
+
+    // One query attempt; true only when a plain bitmap result with cardinality 50 comes back.
+    auto query_once = [&]() -> bool {
+        auto result = reader->VisitFullTextSearch(fts);
+        if (!result.ok() || !result.value()) {
+            return false;
+        }
+        std::shared_ptr<GlobalIndexResult> generic = result.value();
+        auto plain = std::dynamic_pointer_cast<BitmapGlobalIndexResult>(generic);
+        if (!plain) {
+            return false;
+        }
+        auto bitmap = plain->GetBitmap();
+        return bitmap.ok() && bitmap.value() != nullptr && bitmap.value()->Cardinality() == 50;
+    };
+
+    // 4 threads x 20 queries each, all against the same reader instance.
+    constexpr int kThreads = 4;
+    constexpr int kIters = 20;
+    std::atomic<int> failures{0};
+    std::vector<std::thread> workers;
+    for (int t = 0; t < kThreads; ++t) {
+        workers.emplace_back([&] {
+            for (int i = 0; i < kIters; ++i) {
+                if (!query_once()) failures++;
+            }
+        });
+    }
+    for (auto& worker : workers) worker.join();
+    EXPECT_EQ(failures.load(), 0) << "concurrent queries produced inconsistent results";
+}
+
+// =========================================================================
+// 3. Concurrent reader open + close
+// =========================================================================
+
+TEST_F(StreamingTestFixture, ConcurrentCreateAndDropReaders) {
+    // A single archive is opened, queried and dropped by many threads at once; the test
+    // passes when every open yields a reader and every query returns an OK status.
+    auto wr = BuildArchive(20);
+
+    constexpr int kThreads = 10;
+    constexpr int kRoundsPerThread = 5;
+    std::atomic<int> failures{0};
+    auto worker = [&] {
+        for (int round = 0; round < kRoundsPerThread; ++round) {
+            auto reader = OpenReader(wr.root_dir, wr.meta);
+            if (!reader) {
+                failures++;
+                continue;
+            }
+            auto fts = BuildMatchAll("apple");
+            auto outcome = reader->VisitFullTextSearch(fts);
+            if (!outcome.ok()) {
+                failures++;
+            }
+        }
+    };
+    std::vector<std::thread> threads;
+    for (int t = 0; t < kThreads; ++t) {
+        threads.emplace_back(worker);
+    }
+    for (auto& th : threads) th.join();
+    EXPECT_EQ(failures.load(), 0);
+}
+
+// =========================================================================
+// 4. Benchmark log (non-assertion; archived to execute.md)
+// =========================================================================
+
+TEST_F(StreamingTestFixture, StreamingBenchmarkLog) {
+    using Clock = std::chrono::steady_clock;
+    // Peak resident set size as reported by getrusage (kilobytes on Linux, bytes on macOS).
+    auto peak_rss = []() {
+        struct rusage usage;
+        getrusage(RUSAGE_SELF, &usage);
+        return static_cast<int64_t>(usage.ru_maxrss);
+    };
+    auto elapsed_ms = [](Clock::time_point from, Clock::time_point to) -> int64_t {
+        return std::chrono::duration_cast<std::chrono::milliseconds>(to - from).count();
+    };
+
+    int64_t rss_before = peak_rss();
+    auto t0 = Clock::now();
+    auto wr = BuildArchive(200);
+    auto t1 = Clock::now();
+    int64_t rss_after_write = peak_rss();
+
+    auto reader = OpenReader(wr.root_dir, wr.meta);
+    auto t2 = Clock::now();
+    int64_t rss_after_open = peak_rss();
+
+    auto fts = BuildMatchAll("apple");
+    auto result = reader->VisitFullTextSearch(fts);
+    auto t3 = Clock::now();
+
+    std::fprintf(stderr,
+                 "[BENCHMARK] V3 streaming (200 docs): "
+                 "write=%" PRId64 "ms open=%" PRId64 "ms query=%" PRId64
+                 "ms "
+                 "rss_before=%" PRId64 "KB rss_after_write=%" PRId64 "KB rss_after_open=%" PRId64
+                 "KB\n",
+                 elapsed_ms(t0, t1), elapsed_ms(t1, t2), elapsed_ms(t2, t3), rss_before,
+                 rss_after_write, rss_after_open);
+    EXPECT_TRUE(result.ok());
+    SUCCEED();
+}
+
+}  // namespace
+}  // namespace paimon::tantivy::test
diff --git a/src/paimon/global_index/tantivy/tantivy_tokenizer_test.cpp b/src/paimon/global_index/tantivy/tantivy_tokenizer_test.cpp
new file mode 100644
index 000000000..27ec788a1
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_tokenizer_test.cpp
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Stage 3 golden-sample test: cppjieba vs jieba-rs (PaimonJiebaTokenizer) diff.
+ *
+ * For each mode (mp / mix / full / query), tokenize every line of
+ * `test/test_data/tokenizer_golden/golden_*.txt` twice: once with cppjieba
+ * (the existing JiebaTokenizer::CutWithMode + Normalize), once with the
+ * FFI-exposed PaimonJiebaTokenizer. Compare the token text sequences.
+ * Diffs are advisory only (logged to stderr) — per
+ * docs/dev/tokenizer_diff_report.md we do not require cppjieba<->jieba-rs parity.
+ *
+ * `hmm` mode is tested separately: FFI must return Unsupported.
+ */
+
+#include <algorithm>
+#include <filesystem>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "cppjieba/Jieba.hpp"
+#include "gtest/gtest.h"
+#include "paimon/global_index/lucene/jieba_analyzer.h"
+#include "paimon/global_index/lucene/lucene_utils.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_handle.h"
+#include "paimon/global_index/tantivy/tantivy_ffi_status.h"
+
+extern "C" {
+#include "paimon_tantivy_ffi.h"  // NOLINT(build/include_subdir)
+}
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time for this test"
+#endif
+
+#ifndef PAIMON_TANTIVY_GOLDEN_DIR
+#error "PAIMON_TANTIVY_GOLDEN_DIR must be set at compile time for this test"
+#endif
+
+namespace paimon::tantivy {
+namespace {
+
+/// Load lines from all `golden_*.txt` files (the strict corpus).
+/// Files named `known_diffs*.txt` are excluded — those document known
+/// cppjieba↔jieba-rs divergences and are inspected separately.
+std::vector<std::string> LoadGoldenLines() {
+    namespace fs = std::filesystem;
+    std::vector<std::string> collected;
+    for (const auto& dirent : fs::directory_iterator(PAIMON_TANTIVY_GOLDEN_DIR)) {
+        const fs::path& file = dirent.path();
+        const bool is_golden = file.filename().string().rfind("golden_", 0) == 0;
+        if (!dirent.is_regular_file() || !is_golden || file.extension() != ".txt") continue;
+        // Append every line of the matching file, blank lines included.
+        std::ifstream input(file);
+        for (std::string row; std::getline(input, row);) {
+            collected.push_back(row);
+        }
+    }
+    return collected;
+}
+
+/// Load lines from `known_diffs*.txt` — known divergent edge cases documented
+/// in docs/dev/tokenizer_diff_report.md.
+std::vector<std::string> LoadKnownDiffLines() {
+    namespace fs = std::filesystem;
+    std::vector<std::string> collected;
+    for (const auto& dirent : fs::directory_iterator(PAIMON_TANTIVY_GOLDEN_DIR)) {
+        const fs::path& file = dirent.path();
+        const bool is_known_diff = file.filename().string().rfind("known_diffs", 0) == 0;
+        if (!dirent.is_regular_file() || !is_known_diff || file.extension() != ".txt") continue;
+        // Append every line of the matching file, blank lines included.
+        std::ifstream input(file);
+        for (std::string row; std::getline(input, row);) {
+            collected.push_back(row);
+        }
+    }
+    return collected;
+}
+
+/// Tokenize via cppjieba + Normalize (mirrors JiebaAnalyzer runtime path).
+std::vector<std::string> TokenizeWithCppjieba(const cppjieba::Jieba& jieba, const std::string& mode,
+                                              const std::string& text) {
+    using ::paimon::lucene::JiebaTokenizer;
+    // Segment with cppjieba, then normalize. Normalize hands back string_views (presumably
+    // into `segments` — TODO confirm), so they are copied into owning strings before the
+    // function returns.
+    std::vector<std::string> segments;
+    JiebaTokenizer::CutWithMode(mode, &jieba, text, &segments);
+    std::vector<std::string_view> views;
+    JiebaTokenizer::Normalize(jieba.extractor.GetStopWords(), &segments, &views);
+    return std::vector<std::string>(views.begin(), views.end());
+}
+
+/// Parse the FFI `tokenize` output (tab-separated: from\tto\tpos\ttext\n) and
+/// return only the token text sequence.
+std::vector<std::string> ExtractTokenTexts(const PaimonTantivyBuffer& buf) {
+    std::vector<std::string> texts;
+    if (buf.len == 0) {
+        return texts;
+    }
+    // Each row is "from\tto\tpos\ttext"; rows with fewer than three tabs are skipped and
+    // everything after the third tab (further tabs included) is kept as the token text.
+    std::istringstream rows(std::string(reinterpret_cast<const char*>(buf.data), buf.len));
+    for (std::string row; std::getline(rows, row);) {
+        size_t cut = std::string::npos;  // position of the most recent tab found
+        int tabs_seen = 0;
+        do {
+            cut = row.find('\t', cut + 1);  // npos + 1 wraps to 0 for the first search
+        } while (cut != std::string::npos && ++tabs_seen < 3);
+        if (cut != std::string::npos) texts.emplace_back(row.substr(cut + 1));
+    }
+    return texts;
+}
+
+std::vector<std::string> TokenizeWithTantivy(PaimonJiebaTokenizer* tok, const std::string& text) {
+    BufferGuard guard;  // presumably releases the FFI buffer on scope exit — TODO confirm
+    const PaimonTantivyStatus status =
+        paimon_tantivy_tokenizer_tokenize(tok, text.data(), text.size(), guard.out());
+    EXPECT_EQ(status, PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_OK)
+        << "FFI tokenize failed: " << paimon_tantivy_last_error();
+    return ExtractTokenTexts(*guard.out());  // parsed even when the status was not OK
+}
+
+/// Build a cppjieba::Jieba instance mirroring the one used at runtime.
+std::unique_ptr<cppjieba::Jieba> MakeJieba() {
+    const std::string dir = std::string(JIEBA_TEST_DICT_DIR) + "/";  // dictionary files live here
+    return std::make_unique<cppjieba::Jieba>(dir + "jieba.dict.utf8", dir + "hmm_model.utf8",
+                                             dir + "user.dict.utf8", dir + "idf.utf8",
+                                             dir + "stop_words.utf8");
+}
+
+struct DiffReport {
+    size_t total = 0;                       // non-empty lines compared by RunDiff
+    size_t differ = 0;                      // lines whose two token sequences differed
+    std::vector<std::string> sample_diffs;  // formatted details for the first 10 differing lines
+};
+
+// Tokenizes every non-empty line with both cppjieba and the FFI tokenizer for `mode`,
+// counting comparisons and mismatches in `report`. A tokenizer construction failure is a
+// fatal gtest assertion that returns from this function only.
+void RunDiff(const std::vector<std::string>& lines, const std::string& mode, DiffReport* report) {
+    auto jieba = MakeJieba();
+    const std::string dict_dir = JIEBA_TEST_DICT_DIR;
+
+    PaimonJiebaTokenizer* tokenizer = nullptr;
+    const PaimonTantivyStatus status = paimon_tantivy_tokenizer_new(
+        mode.c_str(), /*with_position=*/true, dict_dir.c_str(), &tokenizer);
+    ASSERT_EQ(status, PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_OK)
+        << "tokenizer_new failed for mode=" << mode << ": " << paimon_tantivy_last_error();
+
+    // Renders tokens as a comma-separated list (no brackets, no trailing comma).
+    auto join = [](const std::vector<std::string>& tokens) {
+        std::ostringstream joined;
+        for (size_t i = 0; i < tokens.size(); ++i) {
+            joined << (i ? "," : "") << tokens[i];
+        }
+        return joined.str();
+    };
+
+    for (const auto& line : lines) {
+        if (line.empty()) continue;
+        const auto from_cppjieba = TokenizeWithCppjieba(*jieba, mode, line);
+        const auto from_jieba_rs = TokenizeWithTantivy(tokenizer, line);
+        report->total++;
+        if (from_cppjieba == from_jieba_rs) continue;
+        report->differ++;
+        // Keep full details for the first 10 differing lines only.
+        if (report->sample_diffs.size() >= 10) continue;
+        report->sample_diffs.push_back("LINE: " + line + "\n  cppjieba: [" + join(from_cppjieba) +
+                                       "]\n  jieba-rs: [" + join(from_jieba_rs) + "]");
+    }
+
+    paimon_tantivy_tokenizer_free(tokenizer);
+}
+
+}  // namespace
+
+TEST(TantivyTokenizer, HmmModeReturnsUnsupported) {
+    // Requesting "hmm" must be refused: Unsupported status, no handle, error text naming it.
+    const std::string dict_dir = JIEBA_TEST_DICT_DIR;
+    PaimonJiebaTokenizer* tokenizer = nullptr;
+    const PaimonTantivyStatus status = paimon_tantivy_tokenizer_new(
+        "hmm", /*with_position=*/true, dict_dir.c_str(), &tokenizer);
+    EXPECT_EQ(status, PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_UNSUPPORTED);
+    EXPECT_EQ(tokenizer, nullptr);
+    EXPECT_NE(std::string(paimon_tantivy_last_error()).find("hmm"), std::string::npos);
+}
+
+// ---------------- positive jieba-rs behavior assertions ----------------
+//
+// Per decision in docs/dev/tokenizer_diff_report.md: we do NOT require
+// byte-level parity with cppjieba (both coexist; neither reads the other's index). Instead assert
+// jieba-rs produces expected token sequences for a curated set of inputs.
+
+struct JiebaRsCase {
+    std::string mode;                   // tokenizer mode passed to paimon_tantivy_tokenizer_new
+    std::string input;                  // text to tokenize
+    std::vector<std::string> expected;  // exact token texts expected back, in order
+};
+
+class JiebaRsBehavior : public ::testing::TestWithParam<JiebaRsCase> {};  // one case per run
+
+TEST_P(JiebaRsBehavior, ProducesExpectedTokens) {
+    const JiebaRsCase& tc = GetParam();
+    const std::string dict_dir = JIEBA_TEST_DICT_DIR;
+    PaimonJiebaTokenizer* tokenizer = nullptr;
+    const PaimonTantivyStatus rc = paimon_tantivy_tokenizer_new(
+        tc.mode.c_str(), /*with_position=*/true, dict_dir.c_str(), &tokenizer);
+    ASSERT_EQ(rc, PaimonTantivyStatus::PAIMON_TANTIVY_STATUS_OK) << paimon_tantivy_last_error();
+    const auto tokens = TokenizeWithTantivy(tokenizer, tc.input);
+    EXPECT_EQ(tokens, tc.expected) << "mode=" << tc.mode << " input=" << tc.input;
+    paimon_tantivy_tokenizer_free(tokenizer);  // skipped if the ASSERT above fails
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    BasicCases, JiebaRsBehavior,
+    ::testing::Values(JiebaRsCase{"mix", "Hello World", {"hello", "world"}},
+                      JiebaRsCase{"mix", "HELLO", {"hello"}},
+                      JiebaRsCase{"mix", "中国人民", {"中国", "人民"}},
+                      // 他/了 are listed in stop_words.utf8 and get filtered out by Normalize
+                      JiebaRsCase{"mix", "他来到了网易杭研大厦", {"来到", "网易", "杭研", "大厦"}},
+                      JiebaRsCase{"full", "中国", {"中", "中国", "国"}},
+                      JiebaRsCase{"query", "中国人民", {"中国", "人民"}}));
+
+// ---------------- advisory: log diffs vs cppjieba ----------------
+//
+// These tests never fail; they exist to print diffs to stderr for
+// human review, feeding docs/dev/tokenizer_diff_report.md. They cover both
+// strict and known-diffs corpora.
+
+class AdvisoryDiffTest : public ::testing::TestWithParam<std::string> {};  // param: mode name
+
+TEST_P(AdvisoryDiffTest, LogsStrictGoldenDiffs) {
+    DiffReport report;
+    RunDiff(LoadGoldenLines(), GetParam(), &report);
+    double rate = 0.0;  // stays 0 when no line was compared
+    if (report.total > 0) rate = static_cast<double>(report.differ) / report.total;
+    std::cerr << "ADVISORY-STRICT mode=" << GetParam() << " total=" << report.total
+              << " differ=" << report.differ << " rate=" << rate << "\n";
+    for (const auto& sample : report.sample_diffs) std::cerr << sample << "\n";
+    SUCCEED() << "Advisory only: review docs/dev/tokenizer_diff_report.md";
+}
+
+TEST_P(AdvisoryDiffTest, LogsKnownDiffs) {
+    const auto known = LoadKnownDiffLines();
+    if (known.empty()) GTEST_SKIP();  // nothing was loaded from known_diffs*.txt
+    DiffReport report;
+    RunDiff(known, GetParam(), &report);
+    double rate = 0.0;  // stays 0 when no line was compared
+    if (report.total > 0) rate = static_cast<double>(report.differ) / report.total;
+    std::cerr << "ADVISORY-KNOWN mode=" << GetParam() << " total=" << report.total
+              << " differ=" << report.differ << " rate=" << rate << "\n";
+    for (const auto& sample : report.sample_diffs) std::cerr << sample << "\n";
+    SUCCEED();
+}
+
+INSTANTIATE_TEST_SUITE_P(AllModes, AdvisoryDiffTest,
+                         ::testing::Values("mp", "mix", "full", "query"),
+                         [](const testing::TestParamInfo<std::string>& info) {
+                             return info.param;  // test-name suffix is the mode string itself
+                         });
+
+}  // namespace paimon::tantivy
diff --git a/src/paimon/global_index/tantivy/tantivy_writer_test.cpp b/src/paimon/global_index/tantivy/tantivy_writer_test.cpp
new file mode 100644
index 000000000..8aeca0078
--- /dev/null
+++ b/src/paimon/global_index/tantivy/tantivy_writer_test.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Stage 4 writer test: build a tantivy-fulltext global index from an Arrow batch,
+ * persist it through GlobalIndexFileManager, then verify the resulting file
+ * conforms to the packing format documented in tantivy_defs.h:
+ *
+ *   [i32 BE file_count |
+ *     (i32 BE name_len | name | i64 BE file_len | file_bytes)*]
+ *
+ * Stage 6 (reader) will round-trip these bytes back to a queryable index;
+ * this stage only checks structural validity + meta correctness.
+ */
+
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+#include "arrow/ipc/api.h"
+#include "arrow/type.h"
+#include "gtest/gtest.h"
+#include "paimon/common/utils/path_util.h"
+#include "paimon/common/utils/string_utils.h"
+#include "paimon/core/global_index/global_index_file_manager.h"
+#include "paimon/core/index/index_path_factory.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/global_index/tantivy/tantivy_defs.h"
+#include "paimon/global_index/tantivy/tantivy_global_index_writer.h"
+#include "paimon/testing/utils/testharness.h"
+
+#ifndef JIEBA_TEST_DICT_DIR
+#error "JIEBA_TEST_DICT_DIR must be set at compile time"
+#endif
+
+namespace paimon::tantivy::test {
+
+namespace {
+
+class FakeIndexPathFactory : public IndexPathFactory {  // stub: only ToPath(name) is real
+ public:
+    explicit FakeIndexPathFactory(const std::string& root) : root_(root) {}
+    std::string NewPath() const override {
+        ADD_FAILURE() << "unexpected call: FakeIndexPathFactory::NewPath()";
+        return "";
+    }
+    std::string ToPath(const std::shared_ptr<IndexFileMeta>&) const override {
+        ADD_FAILURE() << "unexpected call: FakeIndexPathFactory::ToPath(IndexFileMeta)";
+        return "";
+    }
+    std::string ToPath(const std::string& file_name) const override {
+        return PathUtil::JoinPath(root_, file_name);
+    }
+    bool IsExternalPath() const override {
+        return false;
+    }
+
+ private:
+    std::string root_;
+};
+
+/// Read the whole file at `path`; returns {} if it cannot be opened (failure recorded) or sized.
+std::vector<uint8_t> ReadFile(const std::string& path) {
+    std::ifstream in(path, std::ios::binary | std::ios::ate);
+    EXPECT_TRUE(in.good()) << "open " << path;
+    const std::streamoff end = in.good() ? static_cast<std::streamoff>(in.tellg()) : -1;
+    if (end < 0) return {};  // a -1 from tellg() must not be cast to a huge size_t
+    std::vector<uint8_t> buf(static_cast<std::size_t>(end));
+    in.seekg(0, std::ios::beg);
+    in.read(reinterpret_cast<char*>(buf.data()), static_cast<std::streamsize>(buf.size()));
+    return buf;
+}
+
+/// Decode the sizeof(T) bytes at `p` as one big-endian (most significant byte first) integer.
+template <typename T>
+T ReadBE(const uint8_t* p) {
+    uint64_t bits = 0;  // unsigned scratch value; narrowed to T once all bytes are folded in
+    for (const uint8_t* cur = p; cur != p + sizeof(T); ++cur) {
+        bits = (bits << 8) | *cur;
+    }
+    return static_cast<T>(bits);
+}
+
+struct PackedEntry {
+    std::string name;       // entry file name
+    int64_t length{0};      // payload size in bytes
+    std::size_t offset{0};  // position in the buffer at which the payload bytes begin
+};
+
+/// Parse a packed archive; a truncated or invalid field records a failure and stops parsing.
+/// Format (Java-compatible, big-endian, no version header):
+///   [i32 BE file_count | (i32 BE name_len | name | i64 BE file_len | bytes)*]
+std::vector<PackedEntry> ParsePacked(const std::vector<uint8_t>& bytes) {
+    std::vector<PackedEntry> entries;
+    std::size_t off = 0, at = 0;  // read cursor; start of the field last claimed by take()
+    auto take = [&](int64_t n, const char* what) {
+        const bool ok = n >= 0 && static_cast<uint64_t>(n) <= bytes.size() - off;
+        EXPECT_TRUE(ok) << "truncated or invalid " << what << " at offset " << off;
+        at = off;
+        if (ok) off += static_cast<std::size_t>(n);
+        return ok;
+    };
+    if (!take(4, "file_count")) return entries;
+    const int32_t file_count = ReadBE<int32_t>(bytes.data() + at);
+    EXPECT_GT(file_count, 0);
+    for (int32_t i = 0; i < file_count; ++i) {
+        if (!take(4, "name_len")) return entries;
+        const int32_t nlen = ReadBE<int32_t>(bytes.data() + at);
+        EXPECT_GT(nlen, 0);
+        if (!take(nlen, "name")) return entries;
+        const std::string name(reinterpret_cast<const char*>(bytes.data() + at), off - at);
+        if (!take(8, "file_len")) return entries;
+        const int64_t flen = ReadBE<int64_t>(bytes.data() + at);
+        if (!take(flen, "file bytes")) return entries;
+        entries.push_back({name, flen, at});
+    }
+    EXPECT_EQ(off, bytes.size()) << "trailing bytes after pack";
+    return entries;
+}
+
+class TantivyGlobalIndexWriterTest : public ::testing::Test {
+ public:
+    void SetUp() override {
+        // The writer looks up the jieba dictionary directory through this environment
+        // variable, so a failed export must abort the test instead of being ignored.
+        ASSERT_EQ(setenv(kJiebaDictDirEnv, JIEBA_TEST_DICT_DIR, /*overwrite=*/1), 0);
+    }
+
+    std::unique_ptr<::ArrowSchema> CreateArrowSchema(
+        const std::shared_ptr<arrow::DataType>& data_type) const {
+        auto c_schema = std::make_unique<::ArrowSchema>();
+        EXPECT_TRUE(arrow::ExportType(*data_type, c_schema.get()).ok());
+        return c_schema;
+    }
+
+    Result<std::vector<GlobalIndexIOMeta>> WriteIndex(
+        const std::string& root, const std::shared_ptr<arrow::DataType>& data_type,
+        const std::map<std::string, std::string>& options,
+        const std::shared_ptr<arrow::Array>& array) {
+        auto path_factory = std::make_shared<FakeIndexPathFactory>(root);
+        auto file_writer = std::make_shared<GlobalIndexFileManager>(fs_, path_factory);
+        PAIMON_ASSIGN_OR_RAISE(auto writer, TantivyGlobalIndexWriter::Create(
+                                                "f0", data_type, file_writer, options, pool_));
+        ::ArrowArray c_array;  // filled in by ExportArray before it is handed to the writer
+        PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array));
+        std::vector<int64_t> relative_row_ids(array->length());
+        for (int64_t i = 0; i < array->length(); ++i) relative_row_ids[i] = i;  // row i -> id i
+        PAIMON_RETURN_NOT_OK(writer->AddBatch(&c_array, std::move(relative_row_ids)));
+        return writer->Finish();
+    }
+
+ protected:
+    std::shared_ptr<MemoryPool> pool_ = GetDefaultPool();
+    std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+    std::shared_ptr<arrow::DataType> data_type_ =
+        arrow::struct_({arrow::field("f0", arrow::utf8())});
+};
+
+}  // namespace
+
+TEST_F(TantivyGlobalIndexWriterTest, EnglishCorpusProducesValidPackedIndex) {
+    // Write four English rows, then check the index file's name, metadata and packed layout.
+    auto test_dir = paimon::test::UniqueTestDirectory::Create();
+    ASSERT_TRUE(test_dir);
+    const std::string root = test_dir->Str();
+    std::map<std::string, std::string> options;
+    options.emplace(kTantivyWriteOmitTermFreqAndPositions, "false");
+
+    auto parsed = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([
+            ["This is an test document."],
+            ["This is an new document document document."],
+            ["Document document document document test."],
+            ["unordered user-defined doc id"]
+        ])");
+    std::shared_ptr<arrow::Array> array = parsed.ValueOrDie();
+
+    ASSERT_OK_AND_ASSIGN(auto index_metas, WriteIndex(root, data_type_, options, array));
+    ASSERT_EQ(index_metas.size(), 1u);
+    const auto& meta = index_metas.front();
+
+    const auto index_file_name = PathUtil::GetName(meta.file_path);
+    EXPECT_TRUE(StringUtils::StartsWith(index_file_name, "tantivy-fulltext-global-index-"))
+        << "file_name=" << index_file_name;
+    EXPECT_TRUE(StringUtils::EndsWith(index_file_name, ".index"));
+    ASSERT_TRUE(meta.metadata);
+    const std::string metadata_json(meta.metadata->data(), meta.metadata->size());
+    EXPECT_EQ(metadata_json, R"({"write.omit-term-freq-and-position":"false"})");
+    EXPECT_GT(meta.file_size, 8);
+
+    const auto archive = ReadFile(meta.file_path);
+    ASSERT_EQ(static_cast<int64_t>(archive.size()), meta.file_size);
+    const auto entries = ParsePacked(archive);
+    EXPECT_FALSE(entries.empty());
+    bool found_meta_json = false;
+    for (std::size_t i = 0; i < entries.size() && !found_meta_json; ++i) {
+        found_meta_json = entries[i].name == "meta.json";
+    }
+    EXPECT_TRUE(found_meta_json) << "expected meta.json in packed entries";
+}
+
+TEST_F(TantivyGlobalIndexWriterTest, ChineseCorpusProducesValidPackedIndex) {
+    // Index two Chinese rows via the paimon_jieba tokenizer and check the archive parses.
+    auto test_dir = paimon::test::UniqueTestDirectory::Create();
+    ASSERT_TRUE(test_dir);
+    const std::string root = test_dir->Str();
+    std::map<std::string, std::string> options;
+    options.emplace(kTantivyWriteOmitTermFreqAndPositions, "false");
+    options.emplace(kTantivyWriteTokenizer, "paimon_jieba");
+    options.emplace(kJiebaTokenizeMode, "query");
+
+    auto parsed = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([
+            ["千问是一个智能助手"],
+            ["新一代AI助手发布"]
+        ])");
+    std::shared_ptr<arrow::Array> array = parsed.ValueOrDie();
+    ASSERT_OK_AND_ASSIGN(auto index_metas, WriteIndex(root, data_type_, options, array));
+    ASSERT_EQ(index_metas.size(), 1u);
+    const auto& meta = index_metas.front();
+    const auto archive = ReadFile(meta.file_path);
+    ASSERT_EQ(static_cast<int64_t>(archive.size()), meta.file_size);
+    const auto entries = ParsePacked(archive);
+    EXPECT_FALSE(entries.empty());
+}
+
+TEST_F(TantivyGlobalIndexWriterTest, NullStringRowsBecomeEmptyDocuments) {
+    // A null row between two non-null rows must still write; only the meta count is checked.
+    auto test_dir = paimon::test::UniqueTestDirectory::Create();
+    ASSERT_TRUE(test_dir);
+    const std::string root = test_dir->Str();
+    const std::map<std::string, std::string> no_options;
+    auto parsed = arrow::ipc::internal::json::ArrayFromJSON(data_type_, R"([
+            ["nonempty"],
+            [null],
+            ["another"]
+        ])");
+    std::shared_ptr<arrow::Array> array = parsed.ValueOrDie();
+    ASSERT_OK_AND_ASSIGN(auto index_metas, WriteIndex(root, data_type_, no_options, array));
+    ASSERT_EQ(index_metas.size(), 1u);
+}
+
+TEST_F(TantivyGlobalIndexWriterTest, RejectsHmmTokenizeMode) {
+    auto test_dir = paimon::test::UniqueTestDirectory::Create();
+    ASSERT_TRUE(test_dir);
+    auto paths = std::make_shared<FakeIndexPathFactory>(test_dir->Str());
+    auto file_manager = std::make_shared<GlobalIndexFileManager>(fs_, paths);
+    // The hmm mode is only rejected while the jieba tokenizer is being constructed,
+    // and the default tokenizer never constructs it, so jieba has to be selected
+    // explicitly for the rejection to be reachable.
+    std::map<std::string, std::string> options;
+    options.emplace(kTantivyWriteTokenizer, "paimon_jieba");
+    options.emplace(kJiebaTokenizeMode, "hmm");
+
+    auto created = TantivyGlobalIndexWriter::Create("f0", data_type_, file_manager, options, pool_);
+    ASSERT_FALSE(created.ok());
+    EXPECT_TRUE(created.status().IsNotImplemented()) << created.status().ToString();
+}
+
+}  // namespace paimon::tantivy::test
diff --git a/test/test_data/cpp_tantivy_fixtures/english_default.archive b/test/test_data/cpp_tantivy_fixtures/english_default.archive
new file mode 100644
index 000000000..d195af7ec
Binary files /dev/null and b/test/test_data/cpp_tantivy_fixtures/english_default.archive differ
diff --git a/test/test_data/java_tantivy_fixtures/README.md b/test/test_data/java_tantivy_fixtures/README.md
new file mode 100644
index 000000000..f13a5e162
--- /dev/null
+++ b/test/test_data/java_tantivy_fixtures/README.md
@@ -0,0 +1,49 @@
+# Java → C++ tantivy 跨端读 fixture
+
+> 生成于 **2026-04-23**,用于 J6 `paimon-tantivy-java-compat-test`。
+
+## 内容
+
+| 文件 | 作用 |
+|---|---|
+| `english_simple.archive` | 由 paimon-java 的 `TantivyIndexWriter + packIndex` 路径生成的 BE archive;10 条纯英文文档,row_ids 0..9 |
+| `english_simple.golden.json` | 人类可读 golden,每个 query type 的 expected row_ids |
+
+## 版本锁定
+
+| 组件 | 版本 |
+|---|---|
+| tantivy crate | **0.22.1** |
+| paimon-tantivy-jni | git sha 生成时最新(commit 在 paimon 仓) |
+| schema | B1:`row_id` u64 stored+indexed+fast + `text` TEXT |
+| archive 字节格式 | Java-compat 大端 + 无 version |
+
+任何组件升级(特别是 **tantivy 版本**)都可能导致段文件二进制不兼容 — 需**重新生成 fixture**:
+
+```bash
+# 1. 构建 Java native lib(若 Rust 变了)
+cd /path/to/paimon/paimon-tantivy/paimon-tantivy-jni/rust && cargo build --release
+cp target/release/libtantivy_jni.dylib \
+   ../src/main/resources/native/darwin-aarch64/
+
+# 2. mvn install + 跑 fixture gen
+cd /path/to/paimon
+mvn install -pl paimon-tantivy/paimon-tantivy-index -am -DskipTests -Denforcer.skip=true
+mvn -pl paimon-tantivy/paimon-tantivy-index test \
+    -Dtest=TantivyIndexFixtureGen -DfailIfNoTests=false \
+    -Denforcer.skip=true \
+    -DfixtureOutDir=/path/to/paimon-cpp/test/test_data/java_tantivy_fixtures
+```
+
+## 检验
+
+```
+xxd english_simple.archive | head -1
+# 00000000: 00 00 00 16 ...   ← BE int32 file_count = 22(Java 不 force-merge,多段)
+```
+
+## 相关文档
+
+- `docs/dev/tantivy_java_cross_read_plan.md` — J6 整体 plan
+- `docs/dev/test_execute.md` — J6 本次执行日志
+- `docs/dev/tantivy_java_compat_plan.md` — paimon-cpp 与 paimon-java 对齐总方案
diff --git a/test/test_data/java_tantivy_fixtures/english_simple.archive b/test/test_data/java_tantivy_fixtures/english_simple.archive
new file mode 100644
index 000000000..c08499578
Binary files /dev/null and b/test/test_data/java_tantivy_fixtures/english_simple.archive differ
diff --git a/test/test_data/java_tantivy_fixtures/english_simple.golden.json b/test/test_data/java_tantivy_fixtures/english_simple.golden.json
new file mode 100644
index 000000000..9776b720b
--- /dev/null
+++ b/test/test_data/java_tantivy_fixtures/english_simple.golden.json
@@ -0,0 +1,25 @@
+{
+  "description": "10 English docs; row_ids 0..9; generated by TantivyIndexFixtureGen via TantivyFullTextGlobalIndexWriter production path; consumed by paimon-cpp V3 reader cross-read test (J6).",
+  "docs": [
+    {"row_id": 0, "text": "apple banana cherry"},
+    {"row_id": 1, "text": "apple durian"},
+    {"row_id": 2, "text": "banana cherry"},
+    {"row_id": 3, "text": "fig grape"},
+    {"row_id": 4, "text": "apple cherry fig"},
+    {"row_id": 5, "text": "banana elderberry"},
+    {"row_id": 6, "text": "cherry durian"},
+    {"row_id": 7, "text": "apple"},
+    {"row_id": 8, "text": "grape fig elderberry"},
+    {"row_id": 9, "text": "cherry fig"}
+  ],
+  "queries": [
+    {"type": "match_all", "query": "apple", "expected_row_ids": [0, 1, 4, 7]},
+    {"type": "match_all", "query": "apple banana", "expected_row_ids": [0]},
+    {"type": "match_any", "query": "durian elderberry", "expected_row_ids": [1, 5, 6, 8]},
+    {"type": "phrase", "query": "apple banana", "expected_row_ids": [0]},
+    {"type": "phrase", "query": "banana cherry", "expected_row_ids": [0, 2]},
+    {"type": "prefix", "query": "ap", "expected_row_ids": [0, 1, 4, 7]},
+    {"type": "wildcard", "query": "*err*", "expected_row_ids": [0, 2, 4, 5, 6, 8, 9]},
+    {"type": "match_any", "query": "apple banana cherry durian fig grape elderberry", "expected_row_ids": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
+  ]
+}
diff --git a/test/test_data/java_tantivy_fixtures/production_sample.archive b/test/test_data/java_tantivy_fixtures/production_sample.archive
new file mode 100644
index 000000000..0f8297189
Binary files /dev/null and b/test/test_data/java_tantivy_fixtures/production_sample.archive differ
diff --git a/test/test_data/tokenizer_golden/README.md b/test/test_data/tokenizer_golden/README.md
new file mode 100644
index 000000000..d51861e8f
--- /dev/null
+++ b/test/test_data/tokenizer_golden/README.md
@@ -0,0 +1,21 @@
+# Tokenizer 黄金样本
+
+供 `paimon-tantivy-tokenizer-test` 比对 cppjieba vs jieba-rs 的分词输出。
+
+## 文件
+
+- `golden_synthetic.txt` — 手写边界 case（混合中英文、数字、标点、emoji、空白、超长词…）
+- `golden_corpus.txt` — 公开语料短句摘录（通用知识、无版权敏感）
+
+## 使用
+
+测试代码（见 `src/paimon/global_index/tantivy/tantivy_tokenizer_test.cpp`）：
+1. 逐行读取
+2. 每行用 cppjieba `JiebaTokenizer::CutWithMode` + `Normalize` 得到 token 序列 A
+3. 每行用 jieba-rs FFI `paimon_tantivy_tokenizer_tokenize` 得到 token 序列 B
+4. 比对 A 和 B：如果完全相同则本行 pass；否则记入 diff 报告
+5. 通过条件：diff 率 ≤ 1%（见 plan Stage 3 验收标准）
+
+## 扩充
+
+后续补充业务 query log 时，新增文件 `golden_business.txt` 放在同目录，测试代码自动扫描 `golden_*.txt`。
diff --git a/test/test_data/tokenizer_golden/golden_corpus.txt b/test/test_data/tokenizer_golden/golden_corpus.txt
new file mode 100644
index 000000000..38c7c887e
--- /dev/null
+++ b/test/test_data/tokenizer_golden/golden_corpus.txt
@@ -0,0 +1,20 @@
+人工智能是计算机科学的一个分支
+机器学习是人工智能的核心领域
+深度学习使用神经网络进行模式识别
+大语言模型基于 Transformer 架构
+开源软件促进了全球技术合作
+Rust 语言以内存安全著称
+Python 广泛应用于数据科学
+分布式系统需要处理网络分区问题
+数据库事务保证原子性一致性隔离性持久性
+编程的艺术在于解决复杂问题
+搜索引擎依赖倒排索引加速查询
+自然语言处理技术日新月异
+云计算降低了基础设施成本
+开发者社区推动了技术进步
+版本控制系统是协作的基石
+操作系统管理计算机的硬件资源
+编译器将源代码翻译成机器指令
+算法的时间复杂度决定了执行效率
+数据结构的选择影响程序性能
+网络协议定义了通信的规则
diff --git a/test/test_data/tokenizer_golden/golden_synthetic.txt b/test/test_data/tokenizer_golden/golden_synthetic.txt
new file mode 100644
index 000000000..65b144741
--- /dev/null
+++ b/test/test_data/tokenizer_golden/golden_synthetic.txt
@@ -0,0 +1,38 @@
+Hello World
+hello world
+HELLO WORLD
+Hello 世界
+你好世界
+中国人民共和国
+我爱北京天安门
+北京是中华人民共和国的首都
+南京市长江大桥
+他来到了网易杭研大厦
+小明硕士毕业于中国科学院计算所,后在日本京都大学深造
+工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作
+结婚的和尚未结婚的
+程序员用Python和Rust写代码
+this is a test 这是一个测试
+Rust tantivy 全文索引
+C++ 到 Rust 的 FFI 桥接
+cpp cppjieba jieba-rs
+分词器 tokenizer
+全文 search
+倒排索引 inverted index
+paimon-cpp tantivy-fts
+100个中文字符被分词器处理
+超长词最长词最长词最长词最长词最长词最长词
+...
+!@#$%^&*()
+"hello"
+'quoted'
+<tag>content</tag>
+{json: "value"}
+[1,2,3]
+line1
+line2
+CJK 标点、。！？
+全角：ＡＢＣ１２３
+ABC123 混合数字字母
+abc123
+ABC123
diff --git a/test/test_data/tokenizer_golden/known_diffs.txt b/test/test_data/tokenizer_golden/known_diffs.txt
new file mode 100644
index 000000000..23073bd37
--- /dev/null
+++ b/test/test_data/tokenizer_golden/known_diffs.txt
@@ -0,0 +1,18 @@
+abc_123
+foo.bar.baz
+https://example.com/path?q=1
+email@example.com
+192.168.1.1
+2026-04-20
+12:34:56
+$100 ¥200 €300
+100%
+3.14
+-1 -2 -3
+a b c d e
+
+	tab	tab
+mixed space       tab
+空   白 和	tab
+  leading and trailing
+中英混合 Mixed CN EN
diff --git a/third_party/tantivy_ffi/Cargo.lock b/third_party/tantivy_ffi/Cargo.lock
new file mode 100644
index 000000000..be9056ad8
--- /dev/null
+++ b/third_party/tantivy_ffi/Cargo.lock
@@ -0,0 +1,1859 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "adler32"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
+[[package]]
+name = "allocator-api2"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c880a97d28a3681c0267bd29cff89621202715b065127cd445fa0f0fe0aa2880"
+
+[[package]]
+name = "anstream"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
+
+[[package]]
+name = "anstyle-parse"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.102"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+
+[[package]]
+name = "arc-swap"
+version = "1.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "bitflags"
+version = "2.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
+
+[[package]]
+name = "bitpacking"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019"
+dependencies = [
+ "crunchy",
+]
+
+[[package]]
+name = "bumpalo"
+version = "3.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "cbindgen"
+version = "0.29.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "befbfd072a8e81c02f8c507aefce431fe5e7d051f83d48a23ffc9b9fe5a11799"
+dependencies = [
+ "clap",
+ "heck",
+ "indexmap",
+ "log",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn",
+ "tempfile",
+ "toml",
+]
+
+[[package]]
+name = "cc"
+version = "1.2.60"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
+dependencies = [
+ "find-msvc-tools",
+ "jobserver",
+ "libc",
+ "shlex",
+]
+
+[[package]]
+name = "cedarwood"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
+dependencies = [
+ "smallvec",
+]
+
+[[package]]
+name = "census"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "clap"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
+dependencies = [
+ "clap_builder",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+
+[[package]]
+name = "clap_lex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
+
+[[package]]
+name = "crc32fast"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "croaring"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0e813b58ac55ac5ccea5ec63beb8c80f37dedd78da3f594c848313415a08c8c"
+dependencies = [
+ "allocator-api2 0.4.0",
+ "croaring-sys",
+]
+
+[[package]]
+name = "croaring-sys"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f34e9ee8e65c0d46c9d0fe55ce80b477d0bfae4c786c6694687b9c70e8267027"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "crossbeam-channel"
+version = "0.5.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
+[[package]]
+name = "dary_heap"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe"
+
+[[package]]
+name = "deranged"
+version = "0.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
+dependencies = [
+ "powerfmt",
+ "serde_core",
+]
+
+[[package]]
+name = "downcast-rs"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
+name = "errno"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "fastdivide"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471"
+
+[[package]]
+name = "fastrand"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
+
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
+[[package]]
+name = "fs4"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8"
+dependencies = [
+ "rustix 0.38.44",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "fxhash"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
+dependencies = [
+ "byteorder",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 5.3.0",
+ "wasip2",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 6.0.0",
+ "wasip2",
+ "wasip3",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+dependencies = [
+ "allocator-api2 0.2.21",
+ "equivalent",
+ "foldhash 0.1.5",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2 0.2.21",
+ "equivalent",
+ "foldhash 0.2.0",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
+[[package]]
+name = "htmlescape"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163"
+
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
+[[package]]
+name = "include-flate"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23e233413926ef735f7d87024466cfda5a4b87467730846bd82ea7d504121347"
+dependencies = [
+ "include-flate-codegen",
+ "include-flate-compress",
+]
+
+[[package]]
+name = "include-flate-codegen"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e7148f24ef8922cc0e5574ebb908729ccdd3a110c440a45165733fedadd9969"
+dependencies = [
+ "include-flate-compress",
+ "proc-macro-error2",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "include-flate-compress"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74783a9ed407e844e99d5e7a57bd650acbfa124cf6e97ffd790ba59d8ab8e7ff"
+dependencies = [
+ "libflate",
+ "zstd",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.17.0",
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "instant"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+
+[[package]]
+name = "jieba-macros"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192"
+dependencies = [
+ "phf_codegen",
+]
+
+[[package]]
+name = "jieba-rs"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5dd552bbb95d578520ee68403bf8aaf0dbbb2ce55b0854d019f9350ad61040a"
+dependencies = [
+ "cedarwood",
+ "fxhash",
+ "include-flate",
+ "jieba-macros",
+ "lazy_static",
+ "phf",
+ "regex",
+]
+
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom 0.3.4",
+ "libc",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca"
+dependencies = [
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
+[[package]]
+name = "levenshtein_automata"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25"
+
+[[package]]
+name = "libc"
+version = "0.2.185"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
+
+[[package]]
+name = "libflate"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd96e993e5f3368b0cb8497dae6c860c22af8ff18388c61c6c0b86c58d86b5df"
+dependencies = [
+ "adler32",
+ "crc32fast",
+ "dary_heap",
+ "libflate_lz77",
+ "no_std_io2",
+]
+
+[[package]]
+name = "libflate_lz77"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff7a10e427698aef6eef269482776debfef63384d30f13aad39a1a95e0e098fd"
+dependencies = [
+ "hashbrown 0.16.1",
+ "no_std_io2",
+ "rle-decode-fast",
+]
+
+[[package]]
+name = "libm"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.4.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
+
+[[package]]
+name = "log"
+version = "0.4.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+
+[[package]]
+name = "lru"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
+dependencies = [
+ "hashbrown 0.15.5",
+]
+
+[[package]]
+name = "lz4_flex"
+version = "0.11.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a"
+
+[[package]]
+name = "measure_time"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc"
+dependencies = [
+ "instant",
+ "log",
+]
+
+[[package]]
+name = "memchr"
+version = "2.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+
+[[package]]
+name = "memmap2"
+version = "0.9.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "minimal-lexical"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
+
+[[package]]
+name = "murmurhash32"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b"
+
+[[package]]
+name = "no_std_io2"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b51ed7824b6e07d354605f4abb3d9d300350701299da96642ee084f5ce631550"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "nom"
+version = "7.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
+dependencies = [
+ "memchr",
+ "minimal-lexical",
+]
+
+[[package]]
+name = "num-conv"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+ "libm",
+]
+
+[[package]]
+name = "num_cpus"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+
+[[package]]
+name = "oneshot"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107"
+
+[[package]]
+name = "ownedbytes"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558"
+dependencies = [
+ "stable_deref_trait",
+]
+
+[[package]]
+name = "paimon_tantivy_ffi"
+version = "0.1.0"
+dependencies = [
+ "cbindgen",
+ "croaring",
+ "jieba-rs",
+ "log",
+ "tantivy",
+ "tempfile",
+]
+
+[[package]]
+name = "phf"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+dependencies = [
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
+dependencies = [
+ "phf_shared",
+ "rand",
+]
+
+[[package]]
+name = "phf_shared"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
+dependencies = [
+ "siphasher",
+]
+
+[[package]]
+name = "pkg-config"
+version = "0.3.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
+
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro-error-attr2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
+name = "proc-macro-error2"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802"
+dependencies = [
+ "proc-macro-error-attr2",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
+[[package]]
+name = "r-efi"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
+
+[[package]]
+name = "rand"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom 0.2.17",
+]
+
+[[package]]
+name = "rand_distr"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
+dependencies = [
+ "num-traits",
+ "rand",
+]
+
+[[package]]
+name = "rayon"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+
+[[package]]
+name = "rle-decode-fast"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
+
+[[package]]
+name = "rust-stemmers"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
+dependencies = [
+ "serde",
+ "serde_derive",
+]
+
+[[package]]
+name = "rustc-hash"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
+
+[[package]]
+name = "rustix"
+version = "0.38.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys 0.4.15",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "rustix"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys 0.12.1",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "semver"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "serde_spanned"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "siphasher"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e"
+
+[[package]]
+name = "sketches-ddsketch"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tantivy"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96599ea6fccd844fc833fed21d2eecac2e6a7c1afd9e044057391d78b1feb141"
+dependencies = [
+ "aho-corasick",
+ "arc-swap",
+ "base64",
+ "bitpacking",
+ "byteorder",
+ "census",
+ "crc32fast",
+ "crossbeam-channel",
+ "downcast-rs",
+ "fastdivide",
+ "fnv",
+ "fs4",
+ "htmlescape",
+ "itertools",
+ "levenshtein_automata",
+ "log",
+ "lru",
+ "lz4_flex",
+ "measure_time",
+ "memmap2",
+ "num_cpus",
+ "once_cell",
+ "oneshot",
+ "rayon",
+ "regex",
+ "rust-stemmers",
+ "rustc-hash",
+ "serde",
+ "serde_json",
+ "sketches-ddsketch",
+ "smallvec",
+ "tantivy-bitpacker",
+ "tantivy-columnar",
+ "tantivy-common",
+ "tantivy-fst",
+ "tantivy-query-grammar",
+ "tantivy-stacker",
+ "tantivy-tokenizer-api",
+ "tempfile",
+ "thiserror",
+ "time",
+ "uuid",
+ "winapi",
+]
+
+[[package]]
+name = "tantivy-bitpacker"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df"
+dependencies = [
+ "bitpacking",
+]
+
+[[package]]
+name = "tantivy-columnar"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e"
+dependencies = [
+ "downcast-rs",
+ "fastdivide",
+ "itertools",
+ "serde",
+ "tantivy-bitpacker",
+ "tantivy-common",
+ "tantivy-sstable",
+ "tantivy-stacker",
+]
+
+[[package]]
+name = "tantivy-common"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4"
+dependencies = [
+ "async-trait",
+ "byteorder",
+ "ownedbytes",
+ "serde",
+ "time",
+]
+
+[[package]]
+name = "tantivy-fst"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18"
+dependencies = [
+ "byteorder",
+ "regex-syntax",
+ "utf8-ranges",
+]
+
+[[package]]
+name = "tantivy-query-grammar"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "tantivy-sstable"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e"
+dependencies = [
+ "tantivy-bitpacker",
+ "tantivy-common",
+ "tantivy-fst",
+ "zstd",
+]
+
+[[package]]
+name = "tantivy-stacker"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8"
+dependencies = [
+ "murmurhash32",
+ "rand_distr",
+ "tantivy-common",
+]
+
+[[package]]
+name = "tantivy-tokenizer-api"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "tempfile"
+version = "3.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
+dependencies = [
+ "fastrand",
+ "getrandom 0.4.2",
+ "once_cell",
+ "rustix 1.1.4",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "time"
+version = "0.3.47"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
+dependencies = [
+ "deranged",
+ "itoa",
+ "num-conv",
+ "powerfmt",
+ "serde_core",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
+
+[[package]]
+name = "time-macros"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
+dependencies = [
+ "num-conv",
+ "time-core",
+]
+
+[[package]]
+name = "toml"
+version = "0.9.12+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf92845e79fc2e2def6a5d828f0801e29a2f8acc037becc5ab08595c7d5e9863"
+dependencies = [
+ "indexmap",
+ "serde_core",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_parser",
+ "toml_writer",
+ "winnow 0.7.15",
+]
+
+[[package]]
+name = "toml_datetime"
+version = "0.7.5+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "toml_parser"
+version = "1.1.2+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526"
+dependencies = [
+ "winnow 1.0.1",
+]
+
+[[package]]
+name = "toml_writer"
+version = "1.1.1+spec-1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
+[[package]]
+name = "utf8-ranges"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "uuid"
+version = "1.23.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76"
+dependencies = [
+ "getrandom 0.4.2",
+ "js-sys",
+ "serde_core",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+
+[[package]]
+name = "wasip2"
+version = "1.0.3+wasi-0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
+dependencies = [
+ "wit-bindgen 0.57.1",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+dependencies = [
+ "wit-bindgen 0.51.0",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap",
+ "wasm-encoder",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags",
+ "hashbrown 0.15.5",
+ "indexmap",
+ "semver",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "winnow"
+version = "0.7.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
+
+[[package]]
+name = "winnow"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5"
+
+[[package]]
+name = "wit-bindgen"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+dependencies = [
+ "wit-bindgen-rust-macro",
+]
+
+[[package]]
+name = "wit-bindgen"
+version = "0.57.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
+
+[[package]]
+name = "wit-bindgen-core"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck",
+ "indexmap",
+ "prettyplease",
+ "syn",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags",
+ "indexmap",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
+dependencies = [
+ "anyhow",
+ "id-arena",
+ "indexmap",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
+
+[[package]]
+name = "zstd"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "7.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
+dependencies = [
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.16+zstd.1.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
+dependencies = [
+ "cc",
+ "pkg-config",
+]
diff --git a/third_party/tantivy_ffi/Cargo.toml b/third_party/tantivy_ffi/Cargo.toml
new file mode 100644
index 000000000..4b5d76a5d
--- /dev/null
+++ b/third_party/tantivy_ffi/Cargo.toml
@@ -0,0 +1,33 @@
+[package]
+name = "paimon_tantivy_ffi"
+version = "0.1.0"
+edition = "2021"
+description = "C FFI layer wrapping tantivy + jieba-rs for paimon-cpp tantivy-fts global index"
+license = "Apache-2.0"
+publish = false
+
+[lib]
+name = "paimon_tantivy_ffi"
+# staticlib: linked by CMake + Corrosion as libpaimon_tantivy_ffi.a
+# rlib:      lets `cargo test` use native Rust linkage when building test executables
+crate-type = ["staticlib", "rlib"]
+
+[dependencies]
+tantivy = "0.22"
+jieba-rs = "0.7"
+croaring = "2.0"
+log = "0.4"
+tempfile = "3"
+
+[build-dependencies]
+cbindgen = "0.29"
+
+[profile.release]
+opt-level = 3
+lto = "thin"
+codegen-units = 1
+panic = "abort"
+
+[profile.dev]
+# FFI errors are reported via status codes; a Rust-side panic must abort rather than unwind across the FFI boundary
+panic = "abort"
diff --git a/third_party/tantivy_ffi/build.rs b/third_party/tantivy_ffi/build.rs
new file mode 100644
index 000000000..cc8da5574
--- /dev/null
+++ b/third_party/tantivy_ffi/build.rs
@@ -0,0 +1,38 @@
+//! build.rs: runs cbindgen to generate the C header `paimon_tantivy_ffi.h`.
+//!
+//! Output path: `$OUT_DIR/paimon_tantivy_ffi.h`
+//! Corrosion (CMake side) reads OUT_DIR from cargo metadata and adds the header to the C++ include path.
+
+use std::env;
+use std::path::PathBuf;
+
+/// Build-script entry point: generates the C header with cbindgen.
+///
+/// Rerun triggers are emitted before generation so they are registered even when
+/// cbindgen fails. A failure is reported through `cargo:warning=` (ordinary build
+/// script output is hidden during a normal build) and the build continues.
+fn main() {
+    let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("set by cargo"));
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("set by cargo"));
+    let header_path = out_dir.join("paimon_tantivy_ffi.h");
+    println!("cargo:rerun-if-changed={}", crate_dir.join("src").display());
+    println!("cargo:rerun-if-changed=cbindgen.toml");
+
+    let cfg = cbindgen::Config::from_file(crate_dir.join("cbindgen.toml"))
+        .expect("cbindgen.toml must exist at crate root");
+
+    match cbindgen::Builder::new().with_crate(&crate_dir).with_config(cfg).generate() {
+        Ok(bindings) => {
+            bindings.write_to_file(&header_path);
+            // Expose the header directory to Corrosion / the upstream CMake build.
+            println!("cargo:include={}", out_dir.display());
+            eprintln!("cbindgen: wrote {}", header_path.display());
+        }
+        Err(e) => {
+            // A cbindgen failure is deliberately non-fatal. The message is flattened
+            // because a `cargo:` directive has to fit on a single line.
+            let msg = format!("{e:?}").replace('\n', " ");
+            println!("cargo:warning=cbindgen generation failed: {msg}");
+        }
+    }
+}
diff --git a/third_party/tantivy_ffi/cbindgen.toml b/third_party/tantivy_ffi/cbindgen.toml
new file mode 100644
index 000000000..a8b5237fa
--- /dev/null
+++ b/third_party/tantivy_ffi/cbindgen.toml
@@ -0,0 +1,48 @@
+# cbindgen configuration: Rust FFI -> C header generator
+# Invoked by build.rs; output goes to $OUT_DIR/paimon_tantivy_ffi.h
+# CMake obtains $OUT_DIR through Corrosion and adds it to the C++ target's include path
+
+language = "C"
+
+# Banner emitted at the top of the generated header
+header = """
+/* Copyright 2026-present Alibaba Inc. */
+/*
+ * AUTO-GENERATED by cbindgen from Rust sources under third_party/tantivy_ffi - DO NOT EDIT.
+ *
+ * C ABI for paimon_tantivy_ffi. See docs/dev/tantivy_ffi_design.md for contract.
+ */
+#pragma once
+"""
+
+include_guard = "PAIMON_TANTIVY_FFI_H"
+cpp_compat = true
+pragma_once = false   # already written by hand inside `header` above
+documentation = true
+documentation_style = "c"
+line_length = 100
+tab_width = 4
+
+[export]
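+# No prefix is added to type names (the Rust types already carry the PaimonTantivy... prefix).
+# Function names already start with paimon_tantivy_ (they are named that way in the Rust sources).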
+prefix = ""
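+# Force-export types that are only used as handles/return values (cbindgen skips a type by
+# default when no FFI function takes or returns it directly, so list such types explicitly).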
+include = ["PaimonTantivyStatus"]
+
+[export.rename]
+# Rust enum name -> C typedef name (e.g. to avoid a duplicated prefix)
+
+[fn]
+prefix = ""
+args = "auto"
+rename_args = "None"
+
+[enum]
+rename_variants = "ScreamingSnakeCase"
+prefix_with_name = true
+derive_helper_methods = false
+
+[parse]
+parse_deps = false
diff --git a/third_party/tantivy_ffi/rust-toolchain.toml b/third_party/tantivy_ffi/rust-toolchain.toml
new file mode 100644
index 000000000..8a8c36646
--- /dev/null
+++ b/third_party/tantivy_ffi/rust-toolchain.toml
@@ -0,0 +1,11 @@
+# Pin the Rust toolchain used to build paimon_tantivy_ffi. Without this,
+# Corrosion's FindRust.cmake invokes `rustup which rustc --toolchain ''`
+# which fails on fresh CMake configure (no rust-toolchain → empty toolchain
+# name → rustup rejects it). See docs/dev/execute.md Stage 11 for context.
+#
+# Only the `channel` (plus the `minimal` profile) is specified — no extra components,
+# because rustup in CI/containers may lack network access to fetch clippy/rustfmt,
+# and the build doesn't need them.
+[toolchain]
+channel = "stable"
+profile = "minimal"
diff --git a/third_party/tantivy_ffi/src/buffer.rs b/third_party/tantivy_ffi/src/buffer.rs
new file mode 100644
index 000000000..36ad0b905
--- /dev/null
+++ b/third_party/tantivy_ffi/src/buffer.rs
@@ -0,0 +1,111 @@
+//! `paimon_tantivy_buffer_t`: Rust-allocated byte buffer returned to C++.
+//!
+//! Contract (see docs/dev/tantivy_ffi_design.md §3 Category B):
+//! - Buffer is allocated by Rust (a leaked `Vec<u8>`, rebuilt from `data`/`len`/`capacity` on free)
+//! - C++ reads `data[0..len]`, **must not** write past len
+//! - C++ must call `paimon_tantivy_buffer_free()` exactly once per non-empty buffer
+//! - Empty (len=0) buffer has null `data`; buffer_free accepts it as no-op
+//!
+//! This struct is #[repr(C)] so cbindgen generates a matching C struct.
+
+use std::ptr;
+
+/// Byte buffer returned to C++; allocated by Rust and released via `paimon_tantivy_buffer_free`.
+#[repr(C)]
+pub struct PaimonTantivyBuffer {
+    /// Start of the `len` readable bytes. Null iff `len == 0`.
+    pub data: *mut u8,
+    /// Count of valid bytes behind `data`.
+    pub len: usize,
+    /// Capacity of the allocation, kept so Rust can rebuild the `Vec`. Opaque to C++.
+    pub capacity: usize,
+}
+
+impl PaimonTantivyBuffer {
+    /// Takes ownership of `bytes`; an empty vector yields [`Self::empty`].
+    pub(crate) fn from_vec(mut bytes: Vec<u8>) -> Self {
+        if bytes.is_empty() {
+            return Self::empty();
+        }
+        bytes.shrink_to_fit();
+        // `ManuallyDrop` suppresses the destructor, so the allocation outlives this call.
+        let mut bytes = std::mem::ManuallyDrop::new(bytes);
+        Self {
+            len: bytes.len(),
+            capacity: bytes.capacity(),
+            data: bytes.as_mut_ptr(),
+        }
+    }
+
+    /// The empty buffer: null `data`, zero `len` and `capacity`.
+    pub(crate) fn empty() -> Self {
+        Self { data: ptr::null_mut(), len: 0, capacity: 0 }
+    }
+}
+
+/// Releases the allocation behind a buffer produced by this crate and resets the
+/// struct to the empty state. A null `buf` or an empty buffer is a no-op.
+///
+/// # Safety
+/// `buf` must be null or point to a live `paimon_tantivy_buffer_t` produced by
+/// this crate whose allocation has not been released yet.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_buffer_free(buf: *mut PaimonTantivyBuffer) {
+    // SAFETY: per the contract above, a non-null `buf` points to a live struct.
+    if let Some(slot) = unsafe { buf.as_mut() } {
+        if slot.len != 0 && !slot.data.is_null() {
+            // SAFETY: relies on the caller contract that this triple came from a leaked `Vec<u8>`.
+            drop(unsafe { Vec::from_raw_parts(slot.data, slot.len, slot.capacity) });
+        }
+        // Reset the fields so the struct reads as an empty buffer afterwards.
+        slot.data = ptr::null_mut();
+        slot.len = 0;
+        slot.capacity = 0;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn empty_has_null_data() {
+        let buffer = PaimonTantivyBuffer::empty();
+        assert_eq!(buffer.len, 0);
+        assert!(buffer.data.is_null());
+    }
+
+    #[test]
+    fn from_vec_roundtrip() {
+        let expected = [1u8, 2, 3, 4, 5];
+        let mut buffer = PaimonTantivyBuffer::from_vec(expected.to_vec());
+        assert!(!buffer.data.is_null());
+        assert_eq!(buffer.len, expected.len());
+        // SAFETY: `data` points to `len` initialized bytes until the buffer is freed.
+        let contents = unsafe { std::slice::from_raw_parts(buffer.data, buffer.len) };
+        assert_eq!(contents, &expected[..]);
+        unsafe { paimon_tantivy_buffer_free(&mut buffer) };
+        assert_eq!(buffer.len, 0);
+        assert!(buffer.data.is_null());
+    }
+
+    #[test]
+    fn free_null_is_noop() {
+        unsafe { paimon_tantivy_buffer_free(std::ptr::null_mut()) };
+    }
+
+    #[test]
+    fn free_empty_is_noop() {
+        let mut buffer = PaimonTantivyBuffer::empty();
+        unsafe { paimon_tantivy_buffer_free(&mut buffer) };
+    }
+
+    #[test]
+    fn stress_alloc_free() {
+        // Repeated alloc/free cycles; a leak sanitizer (LSAN) would flag any leak.
+        for size in (0..5_000usize).map(|i| i.min(256)) {
+            let mut buffer = PaimonTantivyBuffer::from_vec(vec![42u8; size]);
+            unsafe { paimon_tantivy_buffer_free(&mut buffer) };
+        }
+    }
+}
diff --git a/third_party/tantivy_ffi/src/callback_directory.rs b/third_party/tantivy_ffi/src/callback_directory.rs
new file mode 100644
index 000000000..6ef64a170
--- /dev/null
+++ b/third_party/tantivy_ffi/src/callback_directory.rs
@@ -0,0 +1,498 @@
+//! PaimonCallbackDirectory: streaming tantivy `Directory` backed by C FFI
+//! callbacks. Replaces the V1 `PaimonDirectory` (RamDirectory wrapper) with a
+//! callback-driven design that mirrors Java paimon-tantivy-jni's `JniDirectory`.
+//!
+//! ## Why callback-based?
+//!
+//! V1 loaded the entire archive (100MB+) into `RamDirectory` at reader
+//! construction, giving ~2x archive peak RAM and paying the whole download
+//! cost up front even for small queries. V3 keeps just the `HashMap<PathBuf,
+//! FileMeta>` layout and issues pread calls through the FFI callback whenever
+//! tantivy asks for bytes — peak RAM is ~KB, startup is ~header size.
+//!
+//! ## Concurrency
+//!
+//! V3 serializes `read_at` via `stream_mutex` (same as Java JniDir's
+//! `stream_lock`). pread-style callbacks in principle allow concurrent reads,
+//! but some `paimon::InputStream` subclasses (notably `JindoInputStream`)
+//! have shared-state races, so V3 plays it safe. V3.5 removes the mutex —
+//! see `docs/dev/tantivy_directory_upgrade_plan.md` §5.
+
+use std::collections::HashMap;
+use std::ffi::c_void;
+use std::fmt;
+use std::io;
+use std::ops::Range;
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, Mutex};
+
+use tantivy::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
+use tantivy::directory::{
+    AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
+    WatchCallback, WatchHandle, WritePtr,
+};
+use tantivy::HasLen;
+
+// =========================================================================
+// FFI types
+// =========================================================================
+
+/// pread-style callback table passed from C++ at reader construction.
+///
+/// `ctx` is an opaque pointer to C++'s `StreamCtx` (holding a `paimon::InputStream`);
+/// Rust only forwards it to the callbacks. `read_at` must fill `out_buf` with `len`
+/// bytes from stream `offset` and return 0; any other value becomes an I/O error.
+/// `release` is called exactly once, when the last `Arc<CallbackCtx>` is dropped.
+#[repr(C)]
+pub struct PaimonStreamCallbacks {
+    pub ctx: *mut c_void,
+    pub read_at:
+        extern "C" fn(ctx: *mut c_void, offset: u64, len: usize, out_buf: *mut u8) -> i32,
+    pub release: extern "C" fn(ctx: *mut c_void),
+}
+
+// =========================================================================
+// Internal state
+// =========================================================================
+
+#[derive(Clone, Debug)]
+struct FileMeta {
+    offset: u64, // absolute position of the file's first byte in the stream
+    length: u64, // size of the file in bytes
+}
+
+/// Owns the FFI callback table and hands `ctx` back to C++ through `release` on
+/// drop. All clones of `PaimonCallbackDirectory` share one instance via `Arc`.
+struct CallbackCtx {
+    callbacks: PaimonStreamCallbacks,
+}
+
+impl Drop for CallbackCtx {
+    fn drop(&mut self) {
+        // No `unsafe` needed for an `extern "C"` fn pointer; C++ must keep `ctx` valid.
+        let PaimonStreamCallbacks { ctx, release, .. } = self.callbacks;
+        release(ctx);
+    }
+}
+
+// SAFETY: `callbacks.ctx` is opaque to Rust; the C++ owner must guarantee it can
+// be used from any thread. `read_at` calls are serialized by `stream_mutex`, and
+// `release` is only invoked once (when the Arc refcount hits 0).
+unsafe impl Send for CallbackCtx {}
+unsafe impl Sync for CallbackCtx {}
+
+// =========================================================================
+// PaimonCallbackDirectory
+// =========================================================================
+
+#[derive(Clone)]
+pub struct PaimonCallbackDirectory {
+    /// name → (offset, length) in the stream. Immutable after construction.
+    layout: Arc<HashMap<PathBuf, FileMeta>>,
+    /// FFI callbacks + their ctx lifetime.
+    ctx: Arc<CallbackCtx>,
+    /// tantivy writes small atomic files (`.lock`, in some paths `meta.json`)
+    /// via `atomic_write`; we keep them in memory instead of pushing back
+    /// through C++ (read-only archive). Shared across clones.
+    atomic_data: Arc<Mutex<HashMap<PathBuf, Vec<u8>>>>,
+    /// V3 conservative approach: serialize seek+read (matches Java JniDir `stream_lock`).
+    /// The V3.5 upgrade removes this lock; see `tantivy_directory_upgrade_plan.md` §5.
+    stream_mutex: Arc<Mutex<()>>,
+}
+
+impl fmt::Debug for PaimonCallbackDirectory {
+    /// Shows only the archived file names, in the map's iteration order.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let files: Vec<&PathBuf> = self.layout.keys().collect();
+        f.debug_struct("PaimonCallbackDirectory").field("files", &files).finish()
+    }
+}
+
+impl PaimonCallbackDirectory {
+    /// Builds a directory from the C++-parsed archive layout — `(name, offset, length)`
+    /// entries, last one wins on a repeated name — plus the stream callbacks. Ownership
+    /// of the ctx moves in; `release` runs when the last clone is dropped.
+    pub fn new(entries: Vec<(String, u64, u64)>, callbacks: PaimonStreamCallbacks) -> Self {
+        let layout: HashMap<PathBuf, FileMeta> = entries
+            .into_iter()
+            .map(|(name, offset, length)| (PathBuf::from(name), FileMeta { offset, length }))
+            .collect();
+        Self {
+            layout: Arc::new(layout),
+            ctx: Arc::new(CallbackCtx { callbacks }),
+            atomic_data: Arc::new(Mutex::new(HashMap::new())),
+            stream_mutex: Arc::new(Mutex::new(())),
+        }
+    }
+
+    /// Reads `len` bytes at absolute stream `offset` via the FFI callback, serialized
+    /// through `stream_mutex` (V3 invariant). A zero-length read returns an empty
+    /// vector without calling into C++, since an empty `Vec` only has a dangling
+    /// pointer to offer. Errors if the mutex is poisoned or the callback returns
+    /// a non-zero code.
+    fn pread(&self, offset: u64, len: usize) -> io::Result<Vec<u8>> {
+        if len == 0 {
+            return Ok(Vec::new());
+        }
+        let _guard = self.stream_mutex.lock().map_err(|e| {
+            io::Error::new(io::ErrorKind::Other, format!("stream_mutex poisoned: {e}"))
+        })?;
+        let mut buf = vec![0u8; len];
+        let callbacks = &self.ctx.callbacks;
+        // `extern "C"` fn pointers are callable without `unsafe`; ctx validity and
+        // filling exactly `len` bytes are the C++ side's responsibility.
+        match (callbacks.read_at)(callbacks.ctx, offset, len, buf.as_mut_ptr()) {
+            0 => Ok(buf),
+            rc => Err(io::Error::new(
+                io::ErrorKind::Other,
+                format!("pread callback rc={rc} offset={offset} len={len}"),
+            )),
+        }
+    }
+
+    /// Sorted file names, for diagnostic / test use.
+    #[cfg(test)]
+    pub(crate) fn file_names(&self) -> Vec<String> {
+        let mut names: Vec<String> =
+            self.layout.keys().map(|p| p.to_string_lossy().into_owned()).collect();
+        names.sort();
+        names
+    }
+}
+
+// =========================================================================
+// FileHandle
+// =========================================================================
+
+/// Read handle for one archived file: `file_length` bytes at `file_offset` in the stream.
+#[derive(Clone)]
+struct PaimonCallbackFileHandle {
+    directory: PaimonCallbackDirectory,
+    file_offset: u64,
+    file_length: u64,
+}
+
+impl fmt::Debug for PaimonCallbackFileHandle {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut out = f.debug_struct("PaimonCallbackFileHandle");
+        out.field("offset", &self.file_offset).field("length", &self.file_length);
+        out.finish()
+    }
+}
+
+impl HasLen for PaimonCallbackFileHandle {
+    fn len(&self) -> usize {
+        self.file_length as usize // this file's length only; NOTE(review): `as` truncates where usize < 64 bits
+    }
+}
+
+impl FileHandle for PaimonCallbackFileHandle {
+    /// `range` is relative to this file; it is shifted by `file_offset` before the
+    /// pread. No bounds check against `file_length` happens here.
+    fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
+        let absolute = self.file_offset + range.start as u64;
+        let len = range.end - range.start;
+        self.directory.pread(absolute, len).map(OwnedBytes::new)
+    }
+}
+
+// =========================================================================
+// Directory trait (13 methods for tantivy 0.22)
+// =========================================================================
+
+impl Directory for PaimonCallbackDirectory {
+    /// Handle over an archived file; `FileDoesNotExist` when `path` is not in the layout.
+    fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
+        match self.layout.get(path) {
+            Some(meta) => Ok(Arc::new(PaimonCallbackFileHandle {
+                directory: self.clone(),
+                file_offset: meta.offset,
+                file_length: meta.length,
+            })),
+            None => Err(OpenReadError::FileDoesNotExist(path.to_path_buf())),
+        }
+    }
+
+    /// True when `path` is an archived file or was stored through `atomic_write`.
+    fn exists(&self, path: &Path) -> Result<bool, OpenReadError> {
+        let archived = self.layout.contains_key(path);
+        let written = self.atomic_data.lock().unwrap().contains_key(path);
+        Ok(archived || written)
+    }
+
+    /// Serves in-memory `atomic_write` data first, else reads the whole archived file.
+    fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
+        if let Some(data) = self.atomic_data.lock().unwrap().get(path).cloned() {
+            return Ok(data);
+        }
+        let meta = match self.layout.get(path) {
+            Some(meta) => meta,
+            None => return Err(OpenReadError::FileDoesNotExist(path.to_path_buf())),
+        };
+        self.pread(meta.offset, meta.length as usize)
+            .map_err(|e| OpenReadError::wrap_io_error(e, path.to_path_buf()))
+    }
+
+    /// Keeps `data` in memory only; nothing is written back to the archive.
+    fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
+        self.atomic_data.lock().unwrap().insert(path.to_path_buf(), data.to_vec());
+        Ok(())
+    }
+
+    /// Read-only archive: deletion requests are accepted and ignored.
+    fn delete(&self, _path: &Path) -> Result<(), DeleteError> {
+        Ok(())
+    }
+
+    /// tantivy needs a writer for lock files when opening an index; hand out a
+    /// throwaway in-memory one (same trick as Java JniDirectory).
+    fn open_write(&self, _path: &Path) -> Result<WritePtr, OpenWriteError> {
+        let sink = VecTerminatingWrite(Vec::new());
+        Ok(io::BufWriter::new(Box::new(sink)))
+    }
+
+    fn sync_directory(&self) -> io::Result<()> {
+        Ok(())
+    }
+
+    /// Read-only: no actual locking takes place.
+    fn acquire_lock(&self, _lock: &Lock) -> Result<DirectoryLock, LockError> {
+        Ok(DirectoryLock::from(Box::new(())))
+    }
+
+    fn watch(&self, _watch_callback: WatchCallback) -> tantivy::Result<WatchHandle> {
+        Ok(WatchHandle::empty())
+    }
+}
+
+/// Discard-style writer returned by `open_write`: tantivy creates it for lock files,
+/// but the buffered bytes never matter in a read-only archive.
+struct VecTerminatingWrite(Vec<u8>);
+
+impl io::Write for VecTerminatingWrite {
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        // `Vec<u8>`'s own `Write` impl appends the whole slice and reports its length.
+        io::Write::write(&mut self.0, buf)
+    }
+    fn flush(&mut self) -> io::Result<()> {
+        Ok(())
+    }
+}
+
+impl TerminatingWrite for VecTerminatingWrite {
+    fn terminate_ref(&mut self, _token: AntiCallToken) -> io::Result<()> {
+        Ok(())
+    }
+}
+
+// =========================================================================
+// Test support (pub(crate) — used by reader.rs tests too)
+// =========================================================================
+
+#[cfg(test)]
+pub(crate) mod test_support {
+    use super::*;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    /// In-memory stand-in for the C++ stream: serves pread requests from `data`
+    /// and counts reads / releases so tests can assert on callback traffic.
+    pub(crate) struct MockBackend {
+        pub data: Vec<u8>,
+        pub read_count: AtomicUsize,
+        pub release_count: AtomicUsize,
+    }
+
+    extern "C" fn mock_read_at(
+        ctx: *mut c_void,
+        offset: u64,
+        len: usize,
+        out_buf: *mut u8,
+    ) -> i32 {
+        let backend = unsafe { &*(ctx as *const MockBackend) };
+        backend.read_count.fetch_add(1, Ordering::SeqCst);
+        let start = offset as usize;
+        match backend.data.get(start..start.saturating_add(len)) {
+            Some(src) => {
+                // SAFETY (test-only): the caller hands in a buffer of at least `len` bytes.
+                unsafe { std::ptr::copy_nonoverlapping(src.as_ptr(), out_buf, len) };
+                0
+            }
+            None => 1, // out of range
+        }
+    }
+
+    extern "C" fn mock_release(ctx: *mut c_void) {
+        // Take back the strong reference that `Arc::into_raw` leaked at construction.
+        let reclaimed = unsafe { Arc::from_raw(ctx as *const MockBackend) };
+        reclaimed.release_count.fetch_add(1, Ordering::SeqCst);
+        // `reclaimed` drops here → refcount decrement; the test keeps its own clone.
+    }
+
+    /// Creates a directory served by a fresh `MockBackend`. Returns the directory
+    /// and a second handle to the backend; dropping the directory triggers release.
+    pub(crate) fn build_mock_directory(
+        data: Vec<u8>,
+        entries: Vec<(String, u64, u64)>,
+    ) -> (PaimonCallbackDirectory, Arc<MockBackend>) {
+        let backend = Arc::new(MockBackend {
+            data,
+            read_count: AtomicUsize::new(0),
+            release_count: AtomicUsize::new(0),
+        });
+        // The leaked strong reference is reclaimed later by `mock_release`.
+        let callbacks = PaimonStreamCallbacks {
+            ctx: Arc::into_raw(Arc::clone(&backend)) as *mut c_void,
+            read_at: mock_read_at,
+            release: mock_release,
+        };
+        let dir = PaimonCallbackDirectory::new(entries, callbacks);
+        (dir, backend)
+    }
+
+    /// Builds a mock-backed directory from a packed archive blob (BE, no version
+    /// header, matching `writer::pack_index_dir`). Used by `reader.rs::tests`
+    /// since writer.finish currently still returns a Vec<u8>.
+    pub(crate) fn build_directory_from_archive(
+        packed: Vec<u8>,
+    ) -> (PaimonCallbackDirectory, Arc<MockBackend>) {
+        let layout = parse_archive_header(&packed);
+        build_mock_directory(packed, layout)
+    }
+
+    /// Parses the archive header (mirrors C++ `ParseArchiveHeader`): BE i32 file count,
+    /// then per file a BE i32 name length, the UTF-8 name, a BE i64 data length, the data.
+    fn parse_archive_header(bytes: &[u8]) -> Vec<(String, u64, u64)> {
+        let be_i32 = |at: usize| i32::from_be_bytes(bytes[at..at + 4].try_into().unwrap());
+        let be_i64 = |at: usize| i64::from_be_bytes(bytes[at..at + 8].try_into().unwrap());
+        let file_count = be_i32(0) as usize;
+        let mut cursor = 4usize;
+        let mut entries = Vec::with_capacity(file_count);
+        for _ in 0..file_count {
+            let name_len = be_i32(cursor) as usize;
+            cursor += 4;
+            let name = std::str::from_utf8(&bytes[cursor..cursor + name_len]).unwrap().to_owned();
+            cursor += name_len;
+            let file_len = be_i64(cursor) as u64;
+            cursor += 8;
+            // `cursor` now sits on the first data byte of this file.
+            entries.push((name, cursor as u64, file_len));
+            cursor += file_len as usize;
+        }
+        entries
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::test_support::*;
+    use super::*;
+
+    #[test]
+    fn file_handle_reads_correct_bytes() {
+        let layout = vec![("foo.txt".to_string(), 0, 11)];
+        let (dir, _backend) = build_mock_directory(b"hello world".to_vec(), layout);
+        let handle = dir.get_file_handle(Path::new("foo.txt")).unwrap();
+
+        let head = handle.read_bytes(0..5).unwrap();
+        assert_eq!(&head[..], b"hello");
+        // Skip the space at index 5 and read the second word.
+        let tail = handle.read_bytes(6..11).unwrap();
+        assert_eq!(&tail[..], b"world");
+    }
+
+    #[test]
+    fn missing_file_returns_error() {
+        let (dir, _backend) = build_mock_directory(vec![], vec![]);
+        // Any failure other than `FileDoesNotExist` for the requested path is a bug.
+        match dir.get_file_handle(Path::new("nonexistent")).unwrap_err() {
+            OpenReadError::FileDoesNotExist(missing) => {
+                assert_eq!(missing.to_string_lossy(), "nonexistent")
+            }
+            other => panic!("expected FileDoesNotExist, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn pread_out_of_range_propagates_error() {
+        // The layout claims 100 bytes, but the backend only holds 5.
+        let layout = vec![("bad.txt".to_string(), 0, 100)];
+        let (dir, _backend) = build_mock_directory(b"short".to_vec(), layout);
+        let handle = dir.get_file_handle(Path::new("bad.txt")).unwrap();
+        let failure = handle.read_bytes(0..100).unwrap_err();
+        assert_eq!(failure.kind(), io::ErrorKind::Other);
+    }
+
+    #[test]
+    fn atomic_write_read_roundtrip_and_exists() {
+        let (dir, _backend) = build_mock_directory(vec![], vec![]);
+        let lock_path = Path::new(".lock");
+        dir.atomic_write(lock_path, b"locked").unwrap();
+        assert_eq!(dir.atomic_read(lock_path).unwrap(), b"locked");
+        assert!(dir.exists(lock_path).unwrap());
+        assert!(!dir.exists(Path::new("gone")).unwrap());
+    }
+
+    #[test]
+    fn release_called_exactly_once_on_last_drop() {
+        let (dir, backend) = build_mock_directory(b"hello".to_vec(), vec![("a".to_string(), 0, 5)]);
+        let releases = || backend.release_count.load(std::sync::atomic::Ordering::SeqCst);
+        assert_eq!(releases(), 0);
+        drop(dir);
+        assert_eq!(releases(), 1);
+    }
+
+    #[test]
+    fn cloned_directory_shares_ctx_and_atomic_data() {
+        let (first, backend) = build_mock_directory(vec![], vec![]);
+        let second = first.clone();
+        first.atomic_write(Path::new("x"), b"hello").unwrap();
+        assert!(second.exists(Path::new("x")).unwrap()); // shared atomic_data
+        drop(first);
+        assert_eq!(backend.release_count.load(std::sync::atomic::Ordering::SeqCst), 0); // ctx still held by `second`
+        drop(second);
+        assert_eq!(backend.release_count.load(std::sync::atomic::Ordering::SeqCst), 1);
+    }
+
+    #[test]
+    fn concurrent_pread_results_correct_under_stream_mutex() {
+        const WORKERS: usize = 8;
+        const READS_PER_WORKER: usize = 20;
+
+        let data: Vec<u8> = (0..1000).map(|i| (i % 256) as u8).collect();
+        let layout = vec![("data".to_string(), 0, 1000)];
+        let (dir, backend) = build_mock_directory(data.clone(), layout);
+        let handle: Arc<dyn FileHandle> =
+            dir.get_file_handle(Path::new("data")).unwrap();
+
+        let mut workers = Vec::with_capacity(WORKERS);
+        for _ in 0..WORKERS {
+            let reader = Arc::clone(&handle);
+            let expected = data.clone();
+            workers.push(std::thread::spawn(move || {
+                for _ in 0..READS_PER_WORKER {
+                    let bytes = reader.read_bytes(100..200).unwrap();
+                    assert_eq!(&bytes[..], &expected[100..200]);
+                }
+            }));
+        }
+
+        for worker in workers {
+            worker.join().unwrap();
+        }
+
+        // Every read must have reached the backend exactly once.
+        let reads = backend.read_count.load(std::sync::atomic::Ordering::SeqCst);
+        assert_eq!(reads, WORKERS * READS_PER_WORKER);
+    }
+
+    #[test]
+    fn file_names_sorted() {
+        // Entries are deliberately supplied out of alphabetical order.
+        let layout = vec![
+            ("z.idx".to_string(), 0, 10),
+            ("a.meta".to_string(), 10, 20),
+            ("m.term".to_string(), 30, 5),
+        ];
+        let (dir, _backend) = build_mock_directory(vec![0u8; 100], layout);
+        assert_eq!(dir.file_names(), vec!["a.meta", "m.term", "z.idx"]);
+    }
+}
diff --git a/third_party/tantivy_ffi/src/error.rs b/third_party/tantivy_ffi/src/error.rs
new file mode 100644
index 000000000..80f16df65
--- /dev/null
+++ b/third_party/tantivy_ffi/src/error.rs
@@ -0,0 +1,137 @@
+//! Error model for paimon_tantivy_ffi.
+//!
+//! See docs/dev/tantivy_ffi_design.md §2. Contract:
+//! - Every fallible FFI function returns `paimon_tantivy_status_t`
+//! - Failure sets `last_error` (thread-local) with human-readable text
+//! - C++ calls `paimon_tantivy_last_error()` after a non-OK status to fetch text
+//! - Pointer returned by `last_error()` is thread-local and valid only until the
+//!   next FFI call on the same thread that sets or clears the error. C++ must NOT free it.
+
+use std::cell::RefCell;
+use std::ffi::c_char;
+use std::ffi::CString;
+
+/// Status codes returned by fallible FFI functions (`#[repr(i32)]`). Values are stable ABI; append-only.
+#[repr(i32)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub enum PaimonTantivyStatus {
+    Ok = 0, // success; every other value is a failure status
+    InvalidArgument = 1,
+    NotFound = 2,
+    IoError = 3,
+    Unsupported = 4,
+    TokenizerError = 5,
+    QueryParseError = 6,
+    IndexFormatError = 7,
+    InternalError = 99,
+}
+
+thread_local! {
+    /// Per-thread error slot. Starts out as an owned empty C string so that
+    /// `paimon_tantivy_last_error()` never has to hand out a null pointer.
+    static LAST_ERROR: RefCell<CString> = RefCell::new(CString::default());
+}
+
+/// Record an error message for the current thread. Called by fallible FFI
+/// functions right before returning a non-OK status.
+pub(crate) fn set_last_error(msg: impl Into<String>) {
+    // Interior nul bytes would make CString::new fail; strip them as a safety net.
+    let s: String = msg.into().replace('\0', "\u{FFFD}");
+    LAST_ERROR.with(|cell| {
+        // CString::new clones the bytes and appends a nul terminator.
+        *cell.borrow_mut() = CString::new(s).unwrap_or_else(|_| CString::new("").unwrap());
+    });
+}
+
+/// Clear the current thread's error slot. Called at the top of fallible APIs
+/// so a subsequent successful call doesn't return stale text.
+#[allow(dead_code)]
+pub(crate) fn clear_last_error() {
+    LAST_ERROR.with(|cell| {
+        *cell.borrow_mut() = CString::new("").unwrap();
+    });
+}
+
+/// Unwrap a `Result` inside an FFI function: evaluates to the `Ok` value, or on `Err`
+/// records the error's `Display` text via `set_last_error` and returns `$err_status`.
+#[macro_export]
+macro_rules! ffi_try {
+    ($expr:expr, $err_status:expr) => {{
+        match $expr {
+            Ok(value) => value,
+            Err(err) => {
+                $crate::error::set_last_error(err.to_string());
+                return $err_status;
+            }
+        }
+    }};
+}
+
+/// Return the last error text for the calling thread. Always non-null; returns
+/// pointer to "" when there is no error recorded yet. Pointer is thread-local;
+/// C++ must NOT free it. The buffer is replaced whenever this thread's error slot is
+/// set or cleared, so copy the text out before making another FFI call on the same thread.
+#[no_mangle]
+pub extern "C" fn paimon_tantivy_last_error() -> *const c_char {
+    LAST_ERROR.with(|cell| cell.borrow().as_ptr())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::ffi::CStr;
+
+    // Every test reads the slot the way a C++ caller would: through the pointer
+    // returned by `paimon_tantivy_last_error()`.
+
+    /// Copy the calling thread's current error text out of the FFI pointer so the
+    /// result stays usable after later `set_last_error` / `clear_last_error` calls.
+    fn current_text() -> String {
+        let raw = paimon_tantivy_last_error();
+        // SAFETY: `raw` points at this thread's NUL-terminated CString, which is not
+        // replaced before the text has been copied into the returned String.
+        let text = unsafe { CStr::from_ptr(raw) };
+        text.to_str().unwrap().to_owned()
+    }
+
+    #[test]
+    fn initial_last_error_is_empty() {
+        // Relies on nothing having written this thread's slot before the test body runs.
+        assert!(!paimon_tantivy_last_error().is_null());
+        assert_eq!(current_text(), "");
+    }
+
+    #[test]
+    fn set_then_retrieve() {
+        set_last_error("boom");
+        let got = current_text();
+        assert_eq!(got, "boom");
+    }
+
+    #[test]
+    fn clear_resets_to_empty() {
+        set_last_error("x");
+        // Clearing must leave a valid empty string behind, not a stale message.
+        clear_last_error();
+        assert_eq!(current_text(), "");
+    }
+
+    #[test]
+    fn embedded_nul_is_stripped() {
+        // "Stripped" here means replaced: each interior NUL comes back as U+FFFD.
+        set_last_error("a\0b");
+        assert_eq!(current_text(), "a\u{FFFD}b");
+    }
+
+    #[test]
+    fn thread_local_isolation() {
+        set_last_error("main");
+
+        // A freshly spawned thread has its own slot, which is still empty...
+        let seen_by_child = std::thread::spawn(current_text).join().unwrap();
+        assert_eq!(seen_by_child, "");
+
+        // ...and reading it there leaves this thread's message in place.
+        assert_eq!(current_text(), "main");
+    }
+}
diff --git a/third_party/tantivy_ffi/src/handle.rs b/third_party/tantivy_ffi/src/handle.rs
new file mode 100644
index 000000000..175c75e05
--- /dev/null
+++ b/third_party/tantivy_ffi/src/handle.rs
@@ -0,0 +1,106 @@
+//! Opaque handle helpers.
+//!
+//! Contract (see docs/dev/tantivy_ffi_design.md §3 Category A):
+//! - Rust creates handles with `Box::into_raw(Box::new(T))`
+//! - C++ must free with the matching `xxx_free(*mut T)` function, once
+//! - Functions accepting handles treat null as invalid argument
+
+use std::ffi::c_void;
+
+/// Consume `value`: box it on the heap and return the raw pointer as an opaque handle for C++.
+#[inline]
+pub(crate) fn into_handle<T>(value: T) -> *mut T {
+    Box::into_raw(Box::new(value)) // Rust gives up ownership until `Box::from_raw` reclaims it
+}
+
+/// Drop the value behind `handle` by taking ownership back as a `Box<T>`; null is a no-op.
+/// SAFETY: a non-null `handle` must have been returned by `into_handle::<T>`, must not
+/// have been freed already, and must not be used again after this call.
+#[inline]
+pub(crate) unsafe fn free_handle<T>(handle: *mut T) {
+    if !handle.is_null() {
+        // SAFETY: non-null here; origin and single use are the caller's obligation.
+        drop(unsafe { Box::from_raw(handle) });
+    }
+}
+
+/// Reborrow an FFI-provided pointer as `&T`; a null pointer yields `None`.
+/// SAFETY: a non-null `handle` must have been returned by `into_handle::<T>` and must
+/// stay alive (not freed) for as long as the returned reference is used.
+#[inline]
+pub(crate) unsafe fn borrow_handle<'a, T>(handle: *const T) -> Option<&'a T> {
+    // `as_ref` on a raw pointer has exactly the required shape: `None` for null,
+    // `Some(&*handle)` otherwise.
+    //
+    // SAFETY: validity of a non-null `handle` is the caller's obligation (see above).
+    unsafe { handle.as_ref() }
+}
+
+/// Reborrow an FFI-provided pointer as `&mut T`; a null pointer yields `None`.
+/// SAFETY: same as `borrow_handle`, and additionally nothing else may access the value
+/// through another pointer while the returned reference is in use (writer/reader
+/// handles are documented as thread-unsafe).
+#[inline]
+pub(crate) unsafe fn borrow_handle_mut<'a, T>(handle: *mut T) -> Option<&'a mut T> {
+    // `as_mut` on a raw pointer has exactly the required shape: `None` for null,
+    // `Some(&mut *handle)` otherwise.
+    //
+    // SAFETY: validity and exclusivity of a non-null `handle` are the caller's obligation.
+    unsafe { handle.as_mut() }
+}
+
+/// Opaque ctx pointer supplied by C++ (passed through to the Rust Directory callbacks).
+/// Type-erased on purpose: only the C++ side knows the concrete type behind the `c_void`.
+pub(crate) type OpaqueCtx = *mut c_void;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn into_then_free() {
+        struct Payload(i32);
+        let raw: *mut Payload = into_handle(Payload(42));
+        assert!(!raw.is_null());
+        // SAFETY: `raw` came from `into_handle` just above and is freed exactly once.
+        unsafe { free_handle(raw) };
+    }
+
+    #[test]
+    fn free_null_is_noop() {
+        // SAFETY: `free_handle` returns early on null without touching the pointer.
+        unsafe { free_handle(std::ptr::null_mut::<i32>()) };
+    }
+
+    #[test]
+    fn borrow_roundtrip() {
+        let raw = into_handle(42i32);
+        // SAFETY: `raw` stays alive until the final `free_handle`; borrows do not overlap.
+        unsafe {
+            assert_eq!(borrow_handle(raw as *const i32), Some(&42));
+            *borrow_handle_mut(raw).unwrap() = 7;
+            assert_eq!(borrow_handle(raw as *const i32), Some(&7));
+            free_handle(raw);
+        }
+    }
+
+    #[test]
+    fn borrow_null_is_none() {
+        // SAFETY: both helpers check for null before any dereference.
+        let shared = unsafe { borrow_handle::<i32>(std::ptr::null()) };
+        let exclusive = unsafe { borrow_handle_mut::<i32>(std::ptr::null_mut()) };
+        assert!(shared.is_none() && exclusive.is_none());
+    }
+
+    #[test]
+    fn stress_many_create_destroy() {
+        for round in 0..10_000 {
+            let raw: *mut Vec<i32> = into_handle(vec![round; 8]);
+            // SAFETY: `raw` is live until `free_handle` and never used afterwards.
+            unsafe {
+                assert_eq!(borrow_handle(raw as *const Vec<i32>).unwrap().len(), 8);
+                free_handle(raw);
+            }
+        }
+    }
+}
diff --git a/third_party/tantivy_ffi/src/lib.rs b/third_party/tantivy_ffi/src/lib.rs
new file mode 100644
index 000000000..c96dac998
--- /dev/null
+++ b/third_party/tantivy_ffi/src/lib.rs
@@ -0,0 +1,84 @@
+//! paimon_tantivy_ffi: C ABI layer for tantivy + jieba-rs,
+//! consumed by paimon-cpp's `tantivy-fulltext` global index.
+//!
+//! See `docs/dev/tantivy_ffi_design.md` for the contract.
+//!
+//! Stage 1: scaffold + version FFI.
+//! Stage 2: error / handle / buffer / log modules.
+//! Stage 3: tokenizer.
+//! Stage 4: writer.
+//! Later stages fill in directory / reader / query.
+
+#![deny(unsafe_op_in_unsafe_fn)]
+
+use std::ffi::c_char;
+
+pub mod error;
+pub mod handle;
+pub mod buffer;
+pub mod log_bridge;
+pub mod tokenizer;
+pub mod writer;
+pub mod callback_directory;
+pub mod reader;
+
+// Re-export public FFI symbols at crate root so cbindgen picks them up.
+pub use buffer::{paimon_tantivy_buffer_free, PaimonTantivyBuffer};
+pub use error::{paimon_tantivy_last_error, PaimonTantivyStatus};
+pub use log_bridge::{
+    paimon_tantivy_clear_log_callback, paimon_tantivy_set_log_callback, PaimonTantivyLogFn,
+};
+pub use tokenizer::{
+    paimon_tantivy_tokenizer_free, paimon_tantivy_tokenizer_new,
+    paimon_tantivy_tokenizer_tokenize, PaimonJiebaTokenizer,
+};
+pub use writer::{
+    paimon_tantivy_writer_add, paimon_tantivy_writer_finish_streaming,
+    paimon_tantivy_writer_free, paimon_tantivy_writer_new, PaimonTantivyWriter,
+    PaimonWriteCallbacks,
+};
+pub use callback_directory::{PaimonCallbackDirectory, PaimonStreamCallbacks};
+pub use reader::{
+    paimon_tantivy_reader_free, paimon_tantivy_reader_new_streaming,
+    paimon_tantivy_reader_search, PaimonTantivyReader,
+};
+
+/// Semantic version of this crate (`CARGO_PKG_VERSION`; a git sha postfix can be added later
+/// via build.rs) as a NUL-terminated UTF-8 C string. **'static lifetime**; C++ must NOT free.
+#[no_mangle]
+pub extern "C" fn paimon_tantivy_version() -> *const c_char {
+    const VERSION: &str = concat!(env!("CARGO_PKG_VERSION"), "\0");
+    VERSION.as_ptr().cast()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::ffi::CStr;
+
+    #[test]
+    fn version_is_non_empty() {
+        let raw = paimon_tantivy_version();
+        assert!(!raw.is_null());
+        // SAFETY: `raw` is non-null and points at a NUL-terminated string literal.
+        let s = unsafe { CStr::from_ptr(raw) }.to_str().unwrap();
+        assert!(!s.is_empty(), "version must be non-empty");
+        assert!(s.contains('.'), "version must look like semver, got {s:?}");
+    }
+
+    #[test]
+    fn tantivy_and_jieba_are_linked() {
+        drop((tantivy::schema::Schema::builder(), jieba_rs::Jieba::new()));
+    }
+
+    #[test]
+    fn croaring_serialize_roundtrip() {
+        use croaring::{Bitmap, Portable};
+        let mut original = Bitmap::new();
+        original.add(42);
+        original.add(100);
+        let encoded = original.serialize::<Portable>();
+        let decoded = Bitmap::deserialize::<Portable>(&encoded);
+        assert_eq!(original.cardinality(), decoded.cardinality());
+    }
+}
diff --git a/third_party/tantivy_ffi/src/log_bridge.rs b/third_party/tantivy_ffi/src/log_bridge.rs
new file mode 100644
index 000000000..380832c81
--- /dev/null
+++ b/third_party/tantivy_ffi/src/log_bridge.rs
@@ -0,0 +1,103 @@
+//! Log bridge: tantivy internally emits log records via the `log` crate
+//! (via `tantivy::debug` / `info` etc.). This module registers a global
+//! `log::Log` implementation that forwards records to a C callback.
+//!
+//! Contract (see docs/dev/tantivy_ffi_design.md §7):
+//! - C++ calls `paimon_tantivy_set_log_callback(cb)` once at process startup
+//! - `cb` must be non-null; `paimon_tantivy_clear_log_callback()` unregisters it (reverts to stderr)
+//! - Callback receives (level, msg_ptr, msg_len); pointer is non-null,
+//!   UTF-8, NOT null-terminated, valid only for the duration of the call
+//! - Level mapping: 0=trace 1=debug 2=info 3=warn 4=error
+//! - Callback must be thread-safe: tantivy writes from worker threads
+//!
+//! NOTE: tantivy uses `tracing` in newer versions and `log` in others.
+//! Our current `tantivy = "0.22"` uses `log` (verified Stage 0.5 probe).
+//! If a future upgrade switches to `tracing`, install a `tracing-log`
+//! bridge here.
+
+use std::ffi::c_char;
+use std::sync::atomic::{AtomicPtr, Ordering};
+
+pub type PaimonTantivyLogFn = extern "C" fn(level: i32, msg: *const c_char, len: usize); // `msg` is `len` bytes of UTF-8, NOT NUL-terminated
+
+static CALLBACK: AtomicPtr<()> = AtomicPtr::new(std::ptr::null_mut()); // null = no callback installed; records go to stderr
+
+struct LogBridge;
+
+impl log::Log for LogBridge {
+    fn enabled(&self, _: &log::Metadata) -> bool {
+        true
+    }
+
+    /// Forward `[target] message` to the installed C callback, or to stderr if none is set.
+    fn log(&self, record: &log::Record) {
+        let msg = format!("[{}] {}", record.target(), record.args());
+        let installed = CALLBACK.load(Ordering::Acquire);
+        if installed.is_null() {
+            eprintln!("{msg}");
+            return;
+        }
+        let code = match record.level() {
+            log::Level::Error => 4,
+            log::Level::Warn => 3,
+            log::Level::Info => 2,
+            log::Level::Debug => 1,
+            log::Level::Trace => 0,
+        };
+        // SAFETY: the only non-null value stored in CALLBACK is a `PaimonTantivyLogFn`.
+        let callback: PaimonTantivyLogFn = unsafe { std::mem::transmute(installed) };
+        callback(code, msg.as_ptr() as *const c_char, msg.len());
+    }
+
+    fn flush(&self) {}
+}
+
+static LOGGER: LogBridge = LogBridge; // the instance handed to `log::set_logger`
+
+/// Install `cb` (must be non-null) as the log destination; safe to call from any thread.
+/// Each call atomically replaces the stored callback, attempts to register `LogBridge`
+/// as the global `log` sink (only the first registration can succeed) and sets the max
+/// level to `Info`.
+///
+/// `set`/`clear` are separate functions because cbindgen translates
+/// `Option<extern "C" fn>` into an opaque struct rather than a nullable C function pointer.
+#[no_mangle]
+pub extern "C" fn paimon_tantivy_set_log_callback(cb: PaimonTantivyLogFn) {
+    CALLBACK.store(cb as *mut (), Ordering::Release);
+    // A failed `set_logger` only means a logger is already registered; ignore it.
+    let _ = log::set_logger(&LOGGER);
+    log::set_max_level(log::LevelFilter::Info);
+}
+
+/// Clear the installed callback by storing null; `LogBridge::log` then falls back to stderr.
+#[no_mangle]
+pub extern "C" fn paimon_tantivy_clear_log_callback() {
+    CALLBACK.store(std::ptr::null_mut(), Ordering::Release);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    // Simple test callback that counts invocations
+    static COUNT: AtomicUsize = AtomicUsize::new(0);
+    extern "C" fn counting_cb(_: i32, _: *const c_char, _: usize) {
+        COUNT.fetch_add(1, Ordering::SeqCst);
+    }
+
+    // CALLBACK is process-global and #[test]s run in parallel, so install and clear are
+    // exercised in one test; as separate tests, a concurrent clear could land between
+    // the install and the log call and make the count assertion fail spuriously.
+    #[test]
+    fn install_then_log_then_clear() {
+        COUNT.store(0, Ordering::SeqCst);
+        paimon_tantivy_set_log_callback(counting_cb);
+        log::info!("hello");
+        assert!(COUNT.load(Ordering::SeqCst) >= 1);
+
+        paimon_tantivy_clear_log_callback();
+        assert!(CALLBACK.load(Ordering::Acquire).is_null());
+        log::warn!("goes to stderr");
+    }
+}
diff --git a/third_party/tantivy_ffi/src/reader.rs b/third_party/tantivy_ffi/src/reader.rs
new file mode 100644
index 000000000..b2032299c
--- /dev/null
+++ b/third_party/tantivy_ffi/src/reader.rs
@@ -0,0 +1,1237 @@
+//! PaimonTantivyReader: query side of tantivy-fulltext.
+//!
+//! Constructs a tantivy Index from a packed-blob produced by writer.rs (via
+//! PaimonDirectory), registers the same `paimon_jieba` tokenizer, and runs
+//! one of 5 search types (mirrors `paimon::FullTextSearch::SearchType`):
+//!
+//!   1 MATCH_ALL — tokenize query, BooleanQuery (Must)
+//!   2 MATCH_ANY — tokenize query, BooleanQuery (Should)
+//!   3 PHRASE    — tokenize query, PhraseQuery
+//!   4 PREFIX    — RegexQuery `<escaped>.*` (no tokenization, mirrors lucene-fts)
+//!   5 WILDCARD  — RegexQuery from glob pattern (`*` → `.*`, `?` → `.`, others escaped)
+//!
+//! Decision B1 (paimon-java compat): row_id is stored as an explicit u64 field
+//! (`fast` for O(1) retrieval). Reader translates tantivy DocAddress → row_id
+//! via `fast_fields().u64("row_id").first(doc_id)` per segment.
+//!
+//! FFI return format (little-endian, **doc identifiers are u64 row_ids**):
+//!   `[u8 has_scores | u64 count | u64 row_id[count] | optional f32 score[count]]`
+
+use std::ffi::{c_char, CStr};
+use std::path::Path;
+
+use croaring::{Portable, Treemap};
+use tantivy::collector::{Collector, SegmentCollector};
+use tantivy::columnar::Column;
+use tantivy::query::{BooleanQuery, Occur, PhraseQuery, Query, RegexQuery, TermQuery};
+use tantivy::schema::{Field, IndexRecordOption};
+use tantivy::{DocAddress, DocId, Index, IndexReader, ReloadPolicy, Score, SegmentOrdinal,
+               SegmentReader, Term};
+
+use crate::buffer::PaimonTantivyBuffer;
+use crate::callback_directory::{PaimonCallbackDirectory, PaimonStreamCallbacks};
+use crate::error::{set_last_error, PaimonTantivyStatus};
+use crate::handle::{borrow_handle_mut, free_handle, into_handle};
+use crate::tokenizer::{PaimonJiebaTokenizer, TokenizeMode};
+use crate::writer::{PAIMON_ROW_ID_FIELD_NAME, PAIMON_TEXT_FIELD_NAME, PAIMON_TOKENIZER_NAME};
+
+/// Numeric encoding of `paimon::FullTextSearch::SearchType`. Kept in sync
+/// with include/paimon/predicate/full_text_search.h.
+#[repr(i32)]
+#[derive(Clone, Copy, Debug)]
+pub enum SearchType {
+    MatchAll = 1, // analyzed query tokens combined with `Occur::Must`
+    MatchAny = 2, // analyzed query tokens combined with `Occur::Should`
+    Phrase = 3, // analyzed query tokens as a `PhraseQuery`
+    Prefix = 4, // untokenized prefix, run as a `RegexQuery`
+    Wildcard = 5, // pattern converted to a `RegexQuery`
+}
+
+impl SearchType {
+    /// Decode the numeric search type received over FFI. Values without a matching
+    /// variant (anything outside 1..=5) yield `None`.
+    fn from_i32(v: i32) -> Option<Self> {
+        use SearchType::{MatchAll, MatchAny, Phrase, Prefix, Wildcard};
+        // The discriminants are the wire values, so compare against them directly.
+        [MatchAll, MatchAny, Phrase, Prefix, Wildcard]
+            .iter()
+            .copied()
+            .find(|kind| *kind as i32 == v)
+    }
+}
+
+pub struct PaimonTantivyReader {
+    /// Held alive so `IndexReader::searcher()` + `index.tokenizers()` stay
+    /// usable for the reader's lifetime.
+    index: Index,
+    reader: IndexReader, // built with `ReloadPolicy::Manual` in `new()`; yields a searcher per query
+    text_field: Field, // the `text` field resolved from the index schema in `new()`
+    /// Name of the tokenizer the `text` field is actually bound to in the open
+    /// index's schema (read from `meta.json` at construction time). Query-side
+    /// tokenization looks this up in `index.tokenizers()` on every call.
+    tokenizer_name: String,
+}
+
+impl PaimonTantivyReader {
+    /// Open the index over a pre-built callback-backed Directory and bind the text tokenizer.
+    /// Layout (file names + offsets + lengths) must come from the caller (C++ side
+    /// `ParseArchiveHeader`); Rust does not re-parse the archive. Errors are `String`s.
+    pub fn new(
+        directory: PaimonCallbackDirectory,
+        mode: TokenizeMode,
+        with_position: bool,
+        dict_dir: &Path,
+    ) -> Result<Self, String> {
+        let index = Index::open(directory)
+            .map_err(|e| format!("tantivy::Index::open: {e}"))?;
+
+        // Resolve fields by their fixed names (B1: schema is `row_id` + `text`).
+        let schema = index.schema();
+        let text_field = schema.get_field(PAIMON_TEXT_FIELD_NAME).map_err(|e| {
+            format!("tantivy index missing '{PAIMON_TEXT_FIELD_NAME}' field: {e}")
+        })?;
+
+        // Read the tokenizer name the `text` field was actually written with
+        // (lives in meta.json's schema). Auto-aligns cpp query-side tokenizer
+        // with whatever the writer side used.
+        let tokenizer_name = match schema.get_field_entry(text_field).field_type() {
+            tantivy::schema::FieldType::Str(text_options) => text_options
+                .get_indexing_options()
+                .map(|io| io.tokenizer().to_string())
+                .unwrap_or_else(|| "default".to_string()),
+            other => {
+                return Err(format!(
+                    "text field has non-TEXT type: {other:?} (schema corrupted?)"
+                ));
+            }
+        };
+
+        // Only register paimon_jieba if the index actually uses it. The
+        // tantivy-builtin "default" / "raw" / "en_stem" etc. are pre-registered
+        // by the TokenizerManager — no setup needed for those.
+        if tokenizer_name == PAIMON_TOKENIZER_NAME {
+            // `Path::is_empty` is unstable; check via OsStr.
+            if dict_dir.as_os_str().is_empty() {
+                return Err(format!(
+                    "paimon_jieba tokenizer required by archive schema but dict dir \
+                     is empty — set the PAIMON_JIEBA_DICT_DIR env var to a directory \
+                     containing jieba.dict.utf8 / hmm_model.utf8 / user.dict.utf8 / \
+                     idf.utf8 / stop_words.utf8"
+                ));
+            }
+            let jieba = PaimonJiebaTokenizer::new(dict_dir, mode, with_position)
+                .map_err(|e| format!("create paimon_jieba tokenizer: {e}"))?;
+            index.tokenizers().register(PAIMON_TOKENIZER_NAME, jieba);
+        } else {
+            // For other known-safe names we trust tantivy's builtin registry.
+            // `mode` / `dict_dir` (and `with_position`) are unused in this branch; we still
+            // require them in the ABI for backward-compat with the jieba case.
+            let _ = (mode, dict_dir);
+        }
+
+        // Sanity: the tokenizer MUST be resolvable now; otherwise query-time
+        // lookup fails mid-flight.
+        if index.tokenizers().get(&tokenizer_name).is_none() {
+            return Err(format!(
+                "tokenizer {tokenizer_name:?} referenced by text field is not \
+                 registered; add it to TokenizerManager before opening the reader"
+            ));
+        }
+
+        let reader = index
+            .reader_builder()
+            .reload_policy(ReloadPolicy::Manual)
+            .try_into()
+            .map_err(|e| format!("build IndexReader: {e}"))?;
+
+        Ok(Self {
+            index,
+            reader,
+            text_field,
+            tokenizer_name,
+        })
+    }
+
+    /// Analyze `query` with the tokenizer registered under `self.tokenizer_name` in the
+    /// index's `TokenizerManager` and return the token texts in stream order. That name
+    /// comes from the text field's schema, so query-side analysis matches whatever the
+    /// index was written with (`paimon_jieba`, or a tantivy builtin such as `default`).
+    ///
+    /// Returns an empty list when the name does not resolve.
+    fn tokenize_query(&self, query: &str) -> Vec<String> {
+        let mut tokens = Vec::new();
+        // `new()` refuses to construct a reader whose tokenizer is unregistered, so the
+        // lookup is expected to succeed; the `if let` is purely defensive. Per the original
+        // author's note, `TokenizerManager::get` hands back a fresh clone on each call.
+        if let Some(mut analyzer) = self.index.tokenizers().get(&self.tokenizer_name) {
+            let mut stream = analyzer.token_stream(query);
+            // Drain the stream, keeping only each token's text.
+            while stream.advance() {
+                tokens.push(stream.token().text.clone());
+            }
+        }
+        tokens
+    }
+
+    /// Build the query for MATCH_ALL / MATCH_ANY: one `TermQuery` per analyzed token of
+    /// `query`, combined under `occur`. Errors when analysis yields no tokens.
+    fn build_match_query(&self, query: &str, occur: Occur) -> Result<Box<dyn Query>, String> {
+        let term_query = |text: &str| -> Box<dyn Query> {
+            let term = Term::from_field_text(self.text_field, text);
+            Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))
+        };
+        let tokens = self.tokenize_query(query);
+        match tokens.as_slice() {
+            [] => Err(format!("query {query:?} produced no tokens after analysis")),
+            // A lone token needs no boolean wrapper.
+            [only] => Ok(term_query(only.as_str())),
+            many => {
+                // Every clause carries the same `occur`.
+                let clauses: Vec<(Occur, Box<dyn Query>)> =
+                    many.iter().map(|t| (occur, term_query(t.as_str()))).collect();
+                Ok(Box::new(BooleanQuery::new(clauses)))
+            }
+        }
+    }
+
+    /// Build a `PhraseQuery` over the analyzed tokens of `query`, in order. A single token
+    /// degrades to a `TermQuery`; no tokens at all is an error.
+    fn build_phrase_query(&self, query: &str) -> Result<Box<dyn Query>, String> {
+        let tokens = self.tokenize_query(query);
+        match tokens.as_slice() {
+            [] => Err(format!("phrase query {query:?} produced no tokens")),
+            [only] => {
+                // PhraseQuery requires >=2 terms in tantivy; degrade to TermQuery.
+                let term = Term::from_field_text(self.text_field, only);
+                Ok(Box::new(TermQuery::new(term, IndexRecordOption::WithFreqsAndPositions)))
+            }
+            many => Ok(Box::new(PhraseQuery::new(
+                many.iter().map(|t| Term::from_field_text(self.text_field, t)).collect(),
+            ))),
+        }
+    }
+
+    /// Match indexed terms that start with `query` verbatim — no tokenization, mirroring
+    /// lucene-fts — by running the regex `<escaped query>.*`. Rejects an empty `query`.
+    fn build_prefix_query(&self, query: &str) -> Result<Box<dyn Query>, String> {
+        if query.is_empty() {
+            return Err("prefix query is empty".into());
+        }
+        let pattern = format!("{}.*", regex_escape(query));
+        let regex_query = RegexQuery::from_pattern(&pattern, self.text_field)
+            .map_err(|e| format!("RegexQuery from prefix {query:?}: {e}"))?;
+        Ok(Box::new(regex_query))
+    }
+
+    fn build_wildcard_query(&self, query: &str) -> Result<Box<dyn Query>, String> {
+        if query.is_empty() {
+            return Err("wildcard query is empty".into());
+        }
+        let pattern = wildcard_to_regex(query); // per the module docs: `*` → `.*`, `?` → `.`
+        let regex_query = RegexQuery::from_pattern(&pattern, self.text_field)
+            .map_err(|e| format!("RegexQuery from wildcard {query:?} (pattern {pattern}): {e}"))?;
+        Ok(Box::new(regex_query))
+    }
+
+    fn build_query(&self, search_type: SearchType, query: &str) -> Result<Box<dyn Query>, String> {
+        match search_type { // MATCH_ALL and MATCH_ANY share a builder and differ only in `Occur`
+            SearchType::Phrase => self.build_phrase_query(query),
+            SearchType::Prefix => self.build_prefix_query(query),
+            SearchType::Wildcard => self.build_wildcard_query(query),
+            SearchType::MatchAll => self.build_match_query(query, Occur::Must),
+            SearchType::MatchAny => self.build_match_query(query, Occur::Should),
+        }
+    }
+
+    /// Collect the row_id of every document matching `query`, ascending and de-duplicated
+    /// (no scoring, no limit, no pre_filter). Ids come from the explicit `row_id` u64 fast
+    /// field, supporting multi-segment indexes (e.g. from paimon-java without force-merge).
+    pub fn search_all(&self, search_type: SearchType, query: &str) -> Result<Vec<u64>, String> {
+        let compiled = self.build_query(search_type, query)?;
+        let collected: Result<Vec<u64>, _> =
+            self.reader.searcher().search(&*compiled, &RowIdCollector);
+        let mut row_ids = collected.map_err(|e| format!("tantivy search: {e}"))?;
+        // Sort first: `dedup` only drops adjacent repeats.
+        row_ids.sort_unstable();
+        row_ids.dedup();
+        Ok(row_ids)
+    }
+
+    /// 4-path dispatch on `(with_score, limit)` — see `docs/dev/tantivy_bm25_score_contract.md` §4.
+    ///
+    /// | with_score | limit  | path | collector              | sort           | truncate | output score |
+    /// |------------|--------|------|------------------------|----------------|----------|--------------|
+    /// | false      | None   |  A   | RowIdCollector         | row_id asc     | —        | ❌           |
+    /// | false      | Some(n)|  B   | LimitedDocSet / RowId¹ | row_id asc¹    | n        | ❌           |
+    /// | true       | None   |  C   | AllScoredCollector     | row_id asc     | —        | ✅           |
+    /// | true       | Some(n)|  D   | AllScoredCollector     | score desc     | top n    | ✅           |
+    ///
+    /// ¹ B with `min_score` set falls back to the scored path: score desc, top n, scores dropped.
+    /// `min_score` keeps only hits scoring strictly above it on paths B/C/D; path A ignores it.
+    /// Pre-filter is a `Treemap` of paimon row_ids (not tantivy doc_ids), applied BEFORE
+    /// truncation so high-score matches outside the filter don't crowd out valid ones.
+    /// **v0.2 contract change**: previously `limit.is_some()` implicitly triggered scoring; now
+    /// scoring is gated solely by `with_score`. See changelog in tantivy_ffi_design.md §4.6.
+    pub fn search_with_limit_and_filter(
+        &self,
+        search_type: SearchType,
+        query: &str,
+        with_score: bool,
+        limit: Option<usize>,
+        pre_filter: Option<&Treemap>,
+        min_score: Option<f32>,
+    ) -> Result<Vec<(u64, Option<f32>)>, String> {
+        let q = self.build_query(search_type, query)?;
+        let searcher = self.reader.searcher();
+        match (with_score, limit) {
+            // Path A: all rows, no score. RowIdCollector reads the `row_id` fast
+            // field inline per segment (opened once), avoiding a DocSetCollector
+            // HashSet and per-doc handle — hot path for high-cardinality counts.
+            (false, None) => {
+                let mut row_ids: Vec<u64> = searcher
+                    .search(&*q, &RowIdCollector)
+                    .map_err(|e| format!("tantivy search: {e}"))?;
+                if let Some(filter) = pre_filter {
+                    row_ids.retain(|id| filter.contains(*id));
+                }
+                row_ids.sort_unstable();
+                row_ids.dedup();
+                Ok(row_ids.into_iter().map(|id| (id, None)).collect())
+            }
+            // Path B: any N matches, unscored. Used by SR's `WHERE MATCH ... LIMIT N` (no
+            // ORDER BY): pushes the limit down so each shard stops collecting once N hits
+            // are gathered per segment instead of materialising the full posting list.
+            // If the caller wants top-N by BM25 they should set `with_score=true` (Path D)
+            // and ignore the score values.
+            (false, Some(n)) => {
+                if n == 0 {
+                    return Ok(Vec::new());
+                }
+                if min_score.is_some() {
+                    // min_score requires scoring — fall back to collect_scored path
+                    let mut filtered = self.collect_scored(&*q, &searcher, pre_filter)?;
+                    if let Some(threshold) = min_score {
+                        filtered.retain(|(s, _)| *s > threshold);
+                    }
+                    let truncated = Self::sort_by_score_desc_truncate(filtered, n);
+                    Ok(truncated.into_iter().map(|(_, id)| (id, None)).collect())
+                } else if let Some(filter) = pre_filter {
+                    // pre_filter present: it MUST be applied to the full match set
+                    // before truncation. LimitedDocSetCollector stops after the
+                    // first N raw matches, which could all be filtered out while
+                    // valid matches exist further down the posting list — that
+                    // would under-return (fewer than N, or even empty). So collect
+                    // every matching row_id (filter-aware), then truncate to N.
+                    let mut row_ids: Vec<u64> = searcher
+                        .search(&*q, &RowIdCollector)
+                        .map_err(|e| format!("tantivy search: {e}"))?;
+                    row_ids.retain(|id| filter.contains(*id));
+                    row_ids.sort_unstable();
+                    row_ids.dedup();
+                    row_ids.truncate(n);
+                    Ok(row_ids.into_iter().map(|id| (id, None)).collect())
+                } else {
+                    // No pre_filter: fast path — stop collecting once N matches are
+                    // gathered per segment instead of materialising the full posting list.
+                    let collector = LimitedDocSetCollector::new(n);
+                    let mut docset = searcher
+                        .search(&*q, &collector)
+                        .map_err(|e| format!("tantivy search: {e}"))?;
+                    let mut by_segment: std::collections::HashMap<SegmentOrdinal, Vec<DocId>> =
+                        std::collections::HashMap::new();
+                    for addr in docset.drain(..) {
+                        by_segment.entry(addr.segment_ord).or_default().push(addr.doc_id);
+                    }
+                    let mut row_ids: Vec<u64> = Vec::new();
+                    for (segment_ord, doc_ids) in by_segment.iter() {
+                        let segment_reader = searcher.segment_reader(*segment_ord);
+                        let fast = segment_reader
+                            .fast_fields()
+                            .u64(PAIMON_ROW_ID_FIELD_NAME)
+                            .map_err(|e| format!("fast_fields().u64('row_id') on segment {}: {e}",
+                                                 segment_ord))?;
+                        for &doc_id in doc_ids {
+                            // NOTE(review): a doc with no row_id value is reported as row 0 — confirm intended.
+                            row_ids.push(fast.first(doc_id).unwrap_or(0));
+                        }
+                    }
+                    row_ids.sort_unstable();
+                    row_ids.dedup();
+                    row_ids.truncate(n);
+                    Ok(row_ids.into_iter().map(|id| (id, None)).collect())
+                }
+            }
+            // Path C: all rows + all scores, sorted by row_id asc to match the
+            // BitmapScoredGlobalIndexResult contract (bitmap iter order == score order).
+            (true, None) => {
+                let mut filtered = self.collect_scored(&*q, &searcher, pre_filter)?;
+                if let Some(threshold) = min_score {
+                    filtered.retain(|(s, _)| *s > threshold);
+                }
+                filtered.sort_unstable_by(|a, b| a.1.cmp(&b.1));
+                Ok(filtered.into_iter().map(|(s, id)| (id, Some(s))).collect())
+            }
+            // Path D: top-N by BM25 with scores.
+            (true, Some(n)) => {
+                if n == 0 {
+                    return Ok(Vec::new());
+                }
+                let mut filtered = self.collect_scored(&*q, &searcher, pre_filter)?;
+                if let Some(threshold) = min_score {
+                    filtered.retain(|(s, _)| *s > threshold);
+                }
+                let truncated = Self::sort_by_score_desc_truncate(filtered, n);
+                Ok(truncated.into_iter().map(|(s, id)| (id, Some(s))).collect())
+            }
+        }
+    }
+
+    /// Runs `q` with `AllScoredCollector` and returns every hit as a
+    /// `(score, row_id)` pair, keeping only row ids contained in `pre_filter`
+    /// when a filter is supplied.
+    ///
+    /// Hits are bucketed by segment first so the `row_id` fast-field column is
+    /// opened once per segment instead of once per hit. Segments are visited in
+    /// hash-map order, so the order of the returned vector is unspecified.
+    ///
+    /// A document that has no `row_id` value is reported as row id 0.
+    ///
+    /// # Errors
+    /// Returns a formatted message when the search itself fails or when the
+    /// `row_id` fast field cannot be opened on a segment.
+    fn collect_scored(
+        &self,
+        q: &dyn Query,
+        searcher: &tantivy::Searcher,
+        pre_filter: Option<&Treemap>,
+    ) -> Result<Vec<(Score, u64)>, String> {
+        let hits = match searcher.search(q, &AllScoredCollector) {
+            Ok(found) => found,
+            Err(e) => return Err(format!("tantivy search: {e}")),
+        };
+
+        // Bucket the hits by segment ordinal.
+        let mut per_segment: std::collections::HashMap<SegmentOrdinal, Vec<(Score, DocId)>> =
+            std::collections::HashMap::new();
+        for (score, address) in hits {
+            per_segment
+                .entry(address.segment_ord)
+                .or_default()
+                .push((score, address.doc_id));
+        }
+
+        let mut kept: Vec<(Score, u64)> = Vec::new();
+        for (&ord, segment_hits) in &per_segment {
+            let row_id_column = searcher
+                .segment_reader(ord)
+                .fast_fields()
+                .u64(PAIMON_ROW_ID_FIELD_NAME)
+                .map_err(|e| format!("fast_fields().u64('row_id') on segment {}: {e}", ord))?;
+            kept.extend(segment_hits.iter().filter_map(|&(score, doc_id)| {
+                let row_id = row_id_column.first(doc_id).unwrap_or(0);
+                match pre_filter {
+                    Some(allowed) if !allowed.contains(row_id) => None,
+                    _ => Some((score, row_id)),
+                }
+            }));
+        }
+        Ok(kept)
+    }
+
+    /// Helper for paths B/D: orders `(score, row_id)` pairs by score descending,
+    /// breaking ties by row_id ascending, then keeps at most the first `n` items.
+    ///
+    /// Scores are compared with `f32::total_cmp`, which is a total order even
+    /// when a NaN score is present. The previous `partial_cmp` +
+    /// `unwrap_or(Equal)` comparator treated NaN as equal to everything, which
+    /// is not transitive; `sort_unstable_by` may panic or leave the slice in an
+    /// unspecified order when the comparator is not a total order.
+    fn sort_by_score_desc_truncate(mut v: Vec<(Score, u64)>, n: usize) -> Vec<(Score, u64)> {
+        v.sort_unstable_by(|a, b| b.0.total_cmp(&a.0).then_with(|| a.1.cmp(&b.1)));
+        v.truncate(n);
+        v
+    }
+
+    /// Test-only accessor: name of the tokenizer stored in `self.tokenizer_name`.
+    #[cfg(test)]
+    pub(crate) fn tokenizer_name(&self) -> &str {
+        &self.tokenizer_name
+    }
+
+    /// Test-only accessor: borrows the underlying tantivy `Index`.
+    #[cfg(test)]
+    pub(crate) fn debug_index(&self) -> &Index {
+        &self.index
+    }
+}
+
+/// Returns `input` with every regex metacharacter prefixed by a backslash, so
+/// the result matches `input` literally when used as a regex pattern.
+///
+/// Escaped characters: `. + * ? ( ) [ ] { } | ^ $ \`. Everything else is
+/// copied through unchanged.
+fn regex_escape(input: &str) -> String {
+    const META: [char; 14] = [
+        '.', '+', '*', '?', '(', ')', '[', ']', '{', '}', '|', '^', '$', '\\',
+    ];
+    let mut escaped = String::with_capacity(input.len() + 4);
+    for c in input.chars() {
+        if META.contains(&c) {
+            escaped.push('\\');
+        }
+        escaped.push(c);
+    }
+    escaped
+}
+
+/// Converts a glob-style wildcard into a regex pattern.
+///
+/// `*` becomes `.*` (any run of characters), `?` becomes `.` (exactly one
+/// character), and the remaining regex metacharacters
+/// (`. + ( ) [ ] { } | ^ $ \`) are backslash-escaped so they match literally.
+fn wildcard_to_regex(input: &str) -> String {
+    input
+        .chars()
+        .fold(String::with_capacity(input.len() + 4), |mut pattern, c| {
+            match c {
+                '*' => pattern.push_str(".*"),
+                '?' => pattern.push('.'),
+                meta if matches!(
+                    meta,
+                    '.' | '+' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '\\'
+                ) =>
+                {
+                    pattern.push('\\');
+                    pattern.push(meta);
+                }
+                plain => pattern.push(plain),
+            }
+            pattern
+        })
+}
+
+/// Collector for unscored queries that reads the explicit `row_id` u64 fast
+/// field directly into a `Vec<u64>`, opening the column once per segment in
+/// `for_segment`. Replaces the DocSetCollector → HashSet → per-doc translate
+/// path. The merged result is the concatenation of the per-segment vectors;
+/// it is neither sorted nor deduplicated here.
+struct RowIdCollector;
+
+/// Per-segment state of `RowIdCollector`.
+struct RowIdSegmentCollector {
+    // `row_id` fast-field column of the segment being collected.
+    row_id: Column<u64>,
+    // Row ids of the documents collected so far, in collection order.
+    ids: Vec<u64>,
+}
+
+impl SegmentCollector for RowIdSegmentCollector {
+    type Fruit = Vec<u64>;
+
+    /// Records the row id of `doc`; the score is ignored. A document without
+    /// a `row_id` value is recorded as 0.
+    fn collect(&mut self, doc: DocId, _score: Score) {
+        let row_id = self.row_id.first(doc).unwrap_or(0);
+        self.ids.push(row_id);
+    }
+
+    /// Hands back the row ids gathered for this segment.
+    fn harvest(self) -> Self::Fruit {
+        let Self { ids, .. } = self;
+        ids
+    }
+}
+
+impl Collector for RowIdCollector {
+    type Fruit = Vec<u64>;
+    type Child = RowIdSegmentCollector;
+
+    /// Opens the `row_id` fast-field column of `segment` once and starts an
+    /// empty per-segment collector; fails if the column cannot be opened.
+    fn for_segment(
+        &self,
+        _ord: SegmentOrdinal,
+        segment: &SegmentReader,
+    ) -> tantivy::Result<RowIdSegmentCollector> {
+        Ok(RowIdSegmentCollector {
+            row_id: segment.fast_fields().u64(PAIMON_ROW_ID_FIELD_NAME)?,
+            ids: Vec::new(),
+        })
+    }
+
+    /// Row ids are all that is needed, so scoring is switched off.
+    fn requires_scoring(&self) -> bool {
+        false
+    }
+
+    /// Concatenates the per-segment row-id vectors in the order given.
+    fn merge_fruits(&self, segs: Vec<Vec<u64>>) -> tantivy::Result<Vec<u64>> {
+        let mut merged: Vec<u64> = Vec::new();
+        for segment_ids in segs {
+            merged.extend(segment_ids);
+        }
+        Ok(merged)
+    }
+}
+
+/// Collector that returns at most `limit` DocAddresses across all segments,
+/// no scoring. Shared atomic counter caps the global total so per-shard
+/// transfer stays bounded for plain `LIMIT N` queries (no ORDER BY).
+struct LimitedDocSetCollector {
+    // Maximum number of DocAddresses returned by `merge_fruits`.
+    limit: usize,
+    // Count of documents offered so far; cloned into every segment collector
+    // so all segments draw from the same budget.
+    counter: std::sync::Arc<std::sync::atomic::AtomicU64>,
+}
+
+impl LimitedDocSetCollector {
+    /// Creates a collector capped at `limit` documents, with the shared
+    /// counter starting at zero.
+    fn new(limit: usize) -> Self {
+        let counter = std::sync::Arc::new(std::sync::atomic::AtomicU64::default());
+        Self { limit, counter }
+    }
+}
+
+/// Per-segment state of `LimitedDocSetCollector`.
+struct LimitedDocSetSegmentCollector {
+    // Ordinal of the segment being collected; used to build DocAddresses.
+    segment_ord: SegmentOrdinal,
+    // Doc ids accepted from this segment.
+    docs: Vec<DocId>,
+    // Counter shared with the parent collector and its other segment collectors.
+    counter: std::sync::Arc<std::sync::atomic::AtomicU64>,
+    // Global cap, copied from the parent collector's `limit`.
+    limit: u64,
+}
+
+impl SegmentCollector for LimitedDocSetSegmentCollector {
+    type Fruit = Vec<DocAddress>;
+
+    /// Accepts `doc` while the shared budget has room; the score is ignored.
+    ///
+    /// The shared counter only ever grows, so once it has reached `limit` no
+    /// later document can be accepted. A plain load detects that state and
+    /// returns early, so the (potentially very many) matches that arrive
+    /// after the cap is hit no longer each pay for an atomic read-modify-write
+    /// on a cache line shared with the other segment collectors. The set of
+    /// accepted documents is the same as without the early return.
+    fn collect(&mut self, doc: DocId, _score: Score) {
+        use std::sync::atomic::Ordering::Relaxed;
+
+        if self.counter.load(Relaxed) >= self.limit {
+            return;
+        }
+        // The atomic increment hands out each slot below `limit` exactly once,
+        // so at most `limit` documents are accepted in total even if several
+        // segments are collected concurrently.
+        if self.counter.fetch_add(1, Relaxed) < self.limit {
+            self.docs.push(doc);
+        }
+    }
+
+    /// Converts the accepted doc ids of this segment into `DocAddress`es.
+    fn harvest(self) -> Self::Fruit {
+        let segment_ord = self.segment_ord;
+        self.docs.into_iter().map(|d| DocAddress::new(segment_ord, d)).collect()
+    }
+}
+
+impl Collector for LimitedDocSetCollector {
+    type Fruit = Vec<DocAddress>;
+    type Child = LimitedDocSetSegmentCollector;
+
+    /// Starts an empty per-segment collector that shares this collector's
+    /// counter and cap. The segment reader itself is not needed.
+    fn for_segment(
+        &self,
+        segment_ord: SegmentOrdinal,
+        _segment: &SegmentReader,
+    ) -> tantivy::Result<Self::Child> {
+        let child = LimitedDocSetSegmentCollector {
+            segment_ord,
+            docs: Vec::new(),
+            counter: self.counter.clone(),
+            limit: self.limit as u64,
+        };
+        Ok(child)
+    }
+
+    /// Only document addresses are returned, so scoring is switched off.
+    fn requires_scoring(&self) -> bool {
+        false
+    }
+
+    /// Concatenates the per-segment results in the order given and cuts the
+    /// combined list down to `limit` entries.
+    fn merge_fruits(
+        &self,
+        segment_fruits: Vec<Vec<DocAddress>>,
+    ) -> tantivy::Result<Vec<DocAddress>> {
+        let mut merged: Vec<DocAddress> = Vec::new();
+        for fruit in segment_fruits {
+            merged.extend(fruit);
+        }
+        merged.truncate(self.limit);
+        Ok(merged)
+    }
+}
+
+/// Custom Collector that returns ALL matching (score, DocAddress) tuples,
+/// without truncation. tantivy's stock `TopDocs::with_limit(N)` would force
+/// us to either pick N upfront (wrong when pre_filter rejects high-score
+/// docs) or pass `usize::MAX` (which still enforces a binary heap on every
+/// push). Our collector is just a plain Vec append, then merge.
+///
+/// The merged result is the concatenation of the per-segment vectors; no
+/// ordering by score is applied here.
+struct AllScoredCollector;
+
+/// Per-segment state of `AllScoredCollector`.
+struct AllScoredSegmentCollector {
+    // Ordinal of the segment being collected; used to build DocAddresses.
+    segment_ord: SegmentOrdinal,
+    // (score, doc id) of every document collected so far, in collection order.
+    docs: Vec<(Score, DocId)>,
+}
+
+impl SegmentCollector for AllScoredSegmentCollector {
+    type Fruit = Vec<(Score, DocAddress)>;
+
+    /// Appends the hit together with its score; nothing is ever dropped.
+    fn collect(&mut self, doc: DocId, score: Score) {
+        let hit = (score, doc);
+        self.docs.push(hit);
+    }
+
+    /// Turns the segment-local doc ids into `DocAddress`es, keeping the
+    /// scores and the collection order.
+    fn harvest(self) -> Self::Fruit {
+        let Self { segment_ord, docs } = self;
+        let mut fruit = Vec::with_capacity(docs.len());
+        for (score, doc) in docs {
+            fruit.push((score, DocAddress::new(segment_ord, doc)));
+        }
+        fruit
+    }
+}
+
+impl Collector for AllScoredCollector {
+    type Fruit = Vec<(Score, DocAddress)>;
+    type Child = AllScoredSegmentCollector;
+
+    /// Starts an empty per-segment collector; the segment reader is not needed.
+    fn for_segment(
+        &self,
+        segment_ord: SegmentOrdinal,
+        _segment: &SegmentReader,
+    ) -> tantivy::Result<Self::Child> {
+        let docs = Vec::new();
+        Ok(AllScoredSegmentCollector { segment_ord, docs })
+    }
+
+    /// Scores are part of the result, so scoring must stay enabled.
+    fn requires_scoring(&self) -> bool {
+        true
+    }
+
+    /// Concatenates the per-segment results in the order given.
+    fn merge_fruits(
+        &self,
+        segment_fruits: Vec<Vec<(Score, DocAddress)>>,
+    ) -> tantivy::Result<Vec<(Score, DocAddress)>> {
+        let mut merged: Vec<(Score, DocAddress)> = Vec::new();
+        for fruit in segment_fruits {
+            merged.extend(fruit);
+        }
+        Ok(merged)
+    }
+}
+
+// ============================ FFI surface ============================
+
+/// Construct a streaming reader from a layout table + pread callbacks.
+///
+/// The layout arrays (names / offsets / lengths) are produced by C++-side
+/// `ParseArchiveHeader` after reading only the archive header bytes. Payload
+/// bytes are fetched lazily through `callbacks.read_at` as tantivy reads.
+///
+/// # Arguments
+/// * `file_names` — array of `file_count` UTF-8 NUL-terminated C strings
+/// * `file_offsets` / `file_lengths` — u64 arrays (archive-absolute offsets and lengths)
+/// * `file_count` — number of entries in each of the three arrays
+/// * `callbacks` — pread + release callbacks; `ctx` ownership transfers to Rust
+/// * `mode_cstr` — tokenize mode ("mp"/"mix"/"full"/"query"; "hmm" → Unsupported)
+/// * `with_position` — whether text field was indexed with positions
+/// * `dict_dir_cstr` — paimon_jieba dictionary directory
+/// * `out` — receives the reader handle on success
+///
+/// # Returns
+/// * `Ok` — `*out` holds a new reader handle.
+/// * `InvalidArgument` — a mandatory pointer is null, a string is not UTF-8,
+///   the mode name is unknown, or an entry of `file_names` is null.
+/// * `Unsupported` / `IndexFormatError` / `InternalError` — reader
+///   construction failed; the status is chosen by matching substrings of the
+///   error message, which is also stored via `set_last_error`.
+///
+/// `*out` is written only on success.
+///
+/// NOTE(review): `callbacks` is handed to `PaimonCallbackDirectory::new` only
+/// after all argument validation. Every `InvalidArgument` return therefore
+/// happens before Rust has taken the callbacks, and `callbacks.release` is not
+/// invoked on those paths. If reader construction fails afterwards, the
+/// directory has already been built and is dropped here, which (per the
+/// comment at the construction site) fires `release`. Callers presumably need
+/// to distinguish the two cases to avoid leaking or double-releasing `ctx` —
+/// confirm against the C++ side.
+///
+/// # Safety
+/// All pointer args must be valid for the duration of the call; ctx lifetime
+/// extends until `callbacks.release` is invoked (when reader handle is freed).
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_reader_new_streaming(
+    file_names: *const *const c_char,
+    file_offsets: *const u64,
+    file_lengths: *const u64,
+    file_count: usize,
+    callbacks: PaimonStreamCallbacks,
+    mode_cstr: *const c_char,
+    with_position: bool,
+    dict_dir_cstr: *const c_char,
+    out: *mut *mut PaimonTantivyReader,
+) -> PaimonTantivyStatus {
+    if mode_cstr.is_null() || dict_dir_cstr.is_null() || out.is_null() {
+        set_last_error("paimon_tantivy_reader_new_streaming: null mandatory argument");
+        // NOTE: we cannot call callbacks.release here because we don't know
+        // if the caller populated it yet. Caller must manage ctx on failure.
+        return PaimonTantivyStatus::InvalidArgument;
+    }
+    // The three layout arrays are only dereferenced when there is at least one
+    // entry, so null is tolerated for an empty layout.
+    if file_count > 0
+        && (file_names.is_null() || file_offsets.is_null() || file_lengths.is_null())
+    {
+        set_last_error("file_names/offsets/lengths must be non-null when file_count > 0");
+        return PaimonTantivyStatus::InvalidArgument;
+    }
+
+    // SAFETY: `mode_cstr` was checked non-null above; the caller guarantees it
+    // points to a NUL-terminated string valid for the duration of the call.
+    let mode_str = match unsafe { CStr::from_ptr(mode_cstr) }.to_str() {
+        Ok(s) => s,
+        Err(e) => {
+            set_last_error(format!("mode not utf-8: {e}"));
+            return PaimonTantivyStatus::InvalidArgument;
+        }
+    };
+    // SAFETY: `dict_dir_cstr` was checked non-null above; same caller contract
+    // as for `mode_cstr`.
+    let dict_dir = match unsafe { CStr::from_ptr(dict_dir_cstr) }.to_str() {
+        Ok(s) => s,
+        Err(e) => {
+            set_last_error(format!("dict_dir not utf-8: {e}"));
+            return PaimonTantivyStatus::InvalidArgument;
+        }
+    };
+    let mode = match TokenizeMode::parse(mode_str) {
+        Some(m) => m,
+        None => {
+            set_last_error(format!(
+                "unknown tokenize mode {mode_str:?}; expected mp/mix/full/query"
+            ));
+            return PaimonTantivyStatus::InvalidArgument;
+        }
+    };
+
+    // Copy the C string array into owned Rust entries so the directory doesn't
+    // depend on caller-supplied lifetime.
+    let mut entries: Vec<(String, u64, u64)> = Vec::with_capacity(file_count);
+    for i in 0..file_count {
+        // SAFETY: `file_names` is non-null when `file_count > 0` (checked
+        // above) and the caller guarantees it holds `file_count` entries.
+        let name_ptr = unsafe { *file_names.add(i) };
+        if name_ptr.is_null() {
+            set_last_error(format!("file_names[{i}] is null"));
+            return PaimonTantivyStatus::InvalidArgument;
+        }
+        // SAFETY: `name_ptr` is non-null; the caller guarantees each entry is
+        // a NUL-terminated string.
+        let name = match unsafe { CStr::from_ptr(name_ptr) }.to_str() {
+            Ok(s) => s.to_owned(),
+            Err(e) => {
+                set_last_error(format!("file_names[{i}] not utf-8: {e}"));
+                return PaimonTantivyStatus::InvalidArgument;
+            }
+        };
+        // SAFETY: both arrays are non-null when `file_count > 0` and the
+        // caller guarantees they hold `file_count` elements; `i < file_count`.
+        let offset = unsafe { *file_offsets.add(i) };
+        let length = unsafe { *file_lengths.add(i) };
+        entries.push((name, offset, length));
+    }
+
+    // Build callback directory (ctx ownership transfers here; release fires on drop).
+    let directory = PaimonCallbackDirectory::new(entries, callbacks);
+
+    match PaimonTantivyReader::new(directory, mode, with_position, Path::new(dict_dir)) {
+        Ok(r) => {
+            // SAFETY: `out` was checked non-null above; the caller guarantees
+            // it is valid for writes.
+            unsafe { *out = into_handle(r) };
+            PaimonTantivyStatus::Ok
+        }
+        Err(e) => {
+            // The reader reports errors as strings, so the status code is
+            // derived from known substrings of the message.
+            let unsupported = e.contains("'hmm' is not supported");
+            let bad_format = e.contains("tantivy::Index::open")
+                || e.contains("missing 'text' field");
+            set_last_error(e);
+            if unsupported {
+                PaimonTantivyStatus::Unsupported
+            } else if bad_format {
+                PaimonTantivyStatus::IndexFormatError
+            } else {
+                PaimonTantivyStatus::InternalError
+            }
+        }
+    }
+}
+
+/// Run a query and emit results into `out`.
+///
+/// Output bytes (little-endian):
+///   `[u8 has_scores | u64 count | u64 row_ids[count] | optional f32 scores[count]]`
+///
+/// `has_scores` equals `with_score`; it does not depend on `limit`. The
+/// `scores` array is present only when `has_scores=1`, and a row without a
+/// score is written as `0.0`.
+///
+/// `limit < 0` ⇒ no limit; `limit >= 0` ⇒ at most `limit` rows (`limit == 0`
+/// yields an empty result).
+/// With `with_score=true` and `limit >= 0` the rows are the top-N by
+/// descending score (pre_filter applied first, ties broken by ascending
+/// row_id); with `with_score=true` and `limit < 0` all matches are returned
+/// in ascending row_id order.
+/// `min_score`: forwarded as a threshold only when it is `> 0.0`; any other
+/// value (zero, negative, NaN) means "no threshold". In the scored paths a row
+/// is kept only if its score is strictly greater than the threshold.
+/// `pre_filter_bytes`: serialized croaring `Roaring64Map::write` (portable),
+///   containing paimon **row_ids** (not tantivy doc_ids); null+0 = no filter.
+///
+/// Returns `InvalidArgument` for a null `out` or reader handle, an unknown
+/// `search_type`, a null pointer paired with a non-zero length, a non-UTF-8
+/// query, or an undecodable pre_filter; `QueryParseError` or `InternalError`
+/// when the search fails (chosen by matching substrings of the error message).
+/// `*out` is written only on success.
+///
+/// SAFETY: `reader` must be a live handle; `query` and `pre_filter_bytes`
+/// may be null+0 or readable slices; `out` non-null.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_reader_search(
+    reader: *mut PaimonTantivyReader,
+    search_type: i32,
+    query: *const c_char,
+    query_len: usize,
+    with_score: bool,
+    limit: i32,
+    pre_filter_bytes: *const c_char,
+    pre_filter_len: usize,
+    min_score: f32,
+    out: *mut PaimonTantivyBuffer,
+) -> PaimonTantivyStatus {
+    if out.is_null() {
+        set_last_error("reader_search: out is null");
+        return PaimonTantivyStatus::InvalidArgument;
+    }
+    let Some(r) = (unsafe { borrow_handle_mut::<PaimonTantivyReader>(reader) }) else {
+        set_last_error("reader_search: null reader handle");
+        return PaimonTantivyStatus::InvalidArgument;
+    };
+    let st = match SearchType::from_i32(search_type) {
+        Some(s) => s,
+        None => {
+            set_last_error(format!("unknown search_type {search_type}"));
+            return PaimonTantivyStatus::InvalidArgument;
+        }
+    };
+    if query.is_null() && query_len != 0 {
+        set_last_error("query is null but len > 0");
+        return PaimonTantivyStatus::InvalidArgument;
+    }
+    // A zero length is treated as the empty query regardless of the pointer.
+    let query_str = if query_len == 0 {
+        ""
+    } else {
+        // SAFETY: `query` is non-null here (null with len > 0 was rejected
+        // above) and the caller guarantees `query_len` readable bytes.
+        let slice = unsafe { std::slice::from_raw_parts(query as *const u8, query_len) };
+        match std::str::from_utf8(slice) {
+            Ok(s) => s,
+            Err(e) => {
+                set_last_error(format!("query not utf-8: {e}"));
+                return PaimonTantivyStatus::InvalidArgument;
+            }
+        }
+    };
+
+    // NOTE(review): unlike `query`, a non-null pointer with length 0 is NOT
+    // treated as "no filter" — it goes to the deserializer below. Confirm this
+    // is what the C++ caller expects.
+    let pre_filter: Option<Treemap> = if pre_filter_bytes.is_null() && pre_filter_len == 0 {
+        None
+    } else if pre_filter_bytes.is_null() {
+        set_last_error("pre_filter_bytes is null but len > 0");
+        return PaimonTantivyStatus::InvalidArgument;
+    } else {
+        // SAFETY: `pre_filter_bytes` is non-null in this branch and the caller
+        // guarantees `pre_filter_len` readable bytes.
+        let slice = unsafe {
+            std::slice::from_raw_parts(pre_filter_bytes as *const u8, pre_filter_len)
+        };
+        match Treemap::try_deserialize::<Portable>(slice) {
+            Some(t) => Some(t),
+            None => {
+                set_last_error(format!(
+                    "pre_filter not a valid Roaring64Map portable serialization ({} bytes)",
+                    pre_filter_len
+                ));
+                return PaimonTantivyStatus::InvalidArgument;
+            }
+        }
+    };
+
+    // Negative limit means "unlimited"; the cast is lossless for limit >= 0.
+    let limit_opt: Option<usize> = if limit < 0 { None } else { Some(limit as usize) };
+    // Only a strictly positive threshold is forwarded (NaN compares false).
+    let min_score_opt: Option<f32> = if min_score > 0.0 { Some(min_score) } else { None };
+
+    match r.search_with_limit_and_filter(st, query_str, with_score, limit_opt, pre_filter.as_ref(), min_score_opt)
+    {
+        Ok(rows) => {
+            // v0.2: has_scores is decoupled from limit — it equals with_score directly.
+            let has_scores = with_score;
+            let count = rows.len() as u64;
+            // 1 byte has_scores + 8 bytes count + 8 bytes per row_id + optional 4 bytes per score
+            let mut buf = Vec::with_capacity(
+                1 + 8 + rows.len() * 8 + if has_scores { rows.len() * 4 } else { 0 },
+            );
+            buf.push(if has_scores { 1u8 } else { 0u8 });
+            buf.extend_from_slice(&count.to_le_bytes());
+            // All row ids first, then (optionally) all scores, in the same row order.
+            for (id, _) in &rows {
+                buf.extend_from_slice(&id.to_le_bytes()); // u64 row_id LE
+            }
+            if has_scores {
+                for (_, score) in &rows {
+                    let s = score.unwrap_or(0.0);
+                    buf.extend_from_slice(&s.to_le_bytes());
+                }
+            }
+            // SAFETY: `out` was checked non-null at the top; the caller
+            // guarantees it is valid for writes.
+            unsafe { *out = PaimonTantivyBuffer::from_vec(buf) };
+            PaimonTantivyStatus::Ok
+        }
+        Err(e) => {
+            // Errors arrive as strings; query-parse failures are recognised by
+            // known substrings of the message.
+            let parse_err = e.contains("RegexQuery from")
+                || e.contains("phrase query")
+                || e.contains("produced no tokens");
+            set_last_error(e);
+            if parse_err {
+                PaimonTantivyStatus::QueryParseError
+            } else {
+                PaimonTantivyStatus::InternalError
+            }
+        }
+    }
+}
+
+/// Destroy a reader handle. Safe on null.
+///
+/// # Safety
+/// `reader` must be null or a handle previously returned through the `out`
+/// parameter of a reader constructor in this module. Presumably it must not
+/// be used or freed again afterwards — confirm against `free_handle`.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_reader_free(reader: *mut PaimonTantivyReader) {
+    unsafe { free_handle(reader) };
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::callback_directory::test_support::build_directory_from_archive;
+    use crate::writer::PaimonTantivyWriter;
+    use std::path::PathBuf;
+
+    /// Dictionary directory for the tests: the value of
+    /// `PAIMON_JIEBA_DICT_DIR` when it is set (and valid Unicode), otherwise
+    /// the placeholder path `/tmp/nonexistent-dict`.
+    fn dict_dir() -> PathBuf {
+        match std::env::var("PAIMON_JIEBA_DICT_DIR") {
+            Ok(configured) => PathBuf::from(configured),
+            Err(_) => PathBuf::from("/tmp/nonexistent-dict"),
+        }
+    }
+
+    /// Writes `docs` through `PaimonTantivyWriter` (field "f0", mix mode, with
+    /// positions, `paimon_jieba` tokenizer), using each document's slice index
+    /// as its row id, and returns the second element of `finish()`'s result.
+    fn build(docs: &[&str]) -> Vec<u8> {
+        let dict = dict_dir();
+        let mut writer =
+            PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict, "paimon_jieba").unwrap();
+        docs.iter().enumerate().for_each(|(row, text)| {
+            writer.add(row as u64, text).unwrap();
+        });
+        let finished = writer.finish().unwrap();
+        finished.1
+    }
+
+    /// Opens a reader (mix mode, with positions) over a packed archive, going
+    /// through a callback directory built by `build_directory_from_archive`.
+    fn open(packed: &[u8]) -> PaimonTantivyReader {
+        // Simulate production flow: parse archive header → build layout →
+        // back PaimonCallbackDirectory with a mock pread that reads from the
+        // packed Vec. Once C++ `ParseArchiveHeader` (K3) is in place, prod
+        // uses the same PaimonCallbackDirectory path.
+        let (dir, _backend) = build_directory_from_archive(packed.to_vec());
+        PaimonTantivyReader::new(dir, TokenizeMode::Mix, true, &dict_dir()).unwrap()
+    }
+
+    #[test]
+    fn match_all_single_term() {
+        let bytes = build(&["hello world", "hello there", "world peace"]);
+        let r = open(&bytes);
+        let ids = r.search_all(SearchType::MatchAll, "hello").unwrap();
+        assert_eq!(ids, vec![0u64, 1]);
+    }
+
+    #[test]
+    fn match_all_two_terms_intersection() {
+        let bytes = build(&["hello world", "hello there", "world peace"]);
+        let r = open(&bytes);
+        let ids = r.search_all(SearchType::MatchAll, "hello world").unwrap();
+        assert_eq!(ids, vec![0u64]);
+    }
+
+    #[test]
+    fn match_any_two_terms_union() {
+        let bytes = build(&["hello world", "hello there", "world peace"]);
+        let r = open(&bytes);
+        let ids = r.search_all(SearchType::MatchAny, "hello peace").unwrap();
+        assert_eq!(ids, vec![0u64, 1, 2]);
+    }
+
+    #[test]
+    fn phrase_only_consecutive() {
+        let bytes = build(&["hello world there", "world hello there"]);
+        let r = open(&bytes);
+        let ids = r.search_all(SearchType::Phrase, "hello world").unwrap();
+        assert_eq!(ids, vec![0u64]);
+    }
+
+    #[test]
+    fn prefix_matches_indexed_terms() {
+        let bytes = build(&["unordered user-defined doc id"]);
+        let r = open(&bytes);
+        let ids = r.search_all(SearchType::Prefix, "unorder").unwrap();
+        assert_eq!(ids, vec![0u64]);
+    }
+
+    #[test]
+    fn wildcard_with_star() {
+        let bytes = build(&["unordered", "ordered", "border"]);
+        let r = open(&bytes);
+        let ids = r.search_all(SearchType::Wildcard, "*order*").unwrap();
+        assert_eq!(ids, vec![0u64, 1, 2]);
+    }
+
+    #[test]
+    fn empty_query_for_match_returns_query_parse_error() {
+        let bytes = build(&["hello"]);
+        let r = open(&bytes);
+        let err = r.search_all(SearchType::MatchAll, "").unwrap_err();
+        assert!(err.contains("no tokens"), "got: {err}");
+    }
+
+    #[test]
+    fn wildcard_helper_escapes_dots() {
+        assert_eq!(wildcard_to_regex("a*b"), "a.*b");
+        assert_eq!(wildcard_to_regex("a?b"), "a.b");
+        assert_eq!(wildcard_to_regex("a.b"), r"a\.b");
+        assert_eq!(wildcard_to_regex("*a*"), ".*a.*");
+    }
+
+    // ----- limit + pre_filter + scoring (B1: row_id-based) -----
+
+    #[test]
+    fn limit_returns_top_n_with_scores() {
+        let bytes = build(&[
+            "doc",                              // 0: low score (1 occurrence)
+            "doc doc doc doc doc",              // 1: high score (5 occurrences)
+            "doc doc",                          // 2: medium score
+        ]);
+        let r = open(&bytes);
+        let rows = r
+            .search_with_limit_and_filter(SearchType::MatchAll, "doc", true, Some(2), None, None)
+            .unwrap();
+        assert_eq!(rows.len(), 2);
+        // doc 1 has highest TF, expect first
+        assert_eq!(rows[0].0, 1u64);
+        assert!(rows[0].1.is_some());
+        assert!(rows[1].1.is_some());
+        // Scores monotonically decreasing
+        assert!(rows[0].1.unwrap() >= rows[1].1.unwrap());
+    }
+
+    #[test]
+    fn no_limit_returns_all_unscored() {
+        let bytes = build(&["hello world", "world hello", "world peace"]);
+        let r = open(&bytes);
+        let rows = r
+            .search_with_limit_and_filter(SearchType::MatchAll, "world", false, None, None, None)
+            .unwrap();
+        let ids: Vec<u64> = rows.iter().map(|(id, _)| *id).collect();
+        assert_eq!(ids, vec![0u64, 1, 2]);
+        assert!(rows.iter().all(|(_, s)| s.is_none()));
+    }
+
+    #[test]
+    fn pre_filter_no_limit_intersects() {
+        let bytes = build(&["alpha beta", "alpha gamma", "beta gamma"]);
+        let r = open(&bytes);
+        // pre_filter = {0, 2}; query "alpha" matches {0, 1}; expect intersection {0}
+        let mut tm = Treemap::new();
+        tm.add(0);
+        tm.add(2);
+        let rows = r
+            .search_with_limit_and_filter(SearchType::MatchAll, "alpha", false, None, Some(&tm), None)
+            .unwrap();
+        let ids: Vec<u64> = rows.iter().map(|(id, _)| *id).collect();
+        assert_eq!(ids, vec![0u64]);
+    }
+
+    #[test]
+    fn pre_filter_with_limit_filters_before_topn() {
+        // doc 0 has highest TF for "doc" but is NOT in pre_filter → must NOT
+        // be in result, even with limit=1.
+        let bytes = build(&[
+            "doc doc doc doc doc",    // 0: highest TF, but excluded
+            "doc doc",                // 1: medium TF, included
+            "doc",                    // 2: low TF, excluded
+        ]);
+        let r = open(&bytes);
+        let mut tm = Treemap::new();
+        tm.add(1);  // only doc 1 passes pre_filter
+        let rows = r
+            .search_with_limit_and_filter(SearchType::MatchAll, "doc", true, Some(10), Some(&tm), None)
+            .unwrap();
+        assert_eq!(rows.len(), 1);
+        assert_eq!(rows[0].0, 1u64);
+    }
+
+    #[test]
+    fn unscored_limit_with_pre_filter_applies_filter_before_truncate() {
+        // Regression (review finding #1): with_score=false + limit=N + pre_filter
+        // must apply the filter to the FULL match set before truncating to N.
+        // All three docs match "doc" but only row_id 2 (the LAST one) passes the
+        // pre_filter; a truncate-before-filter impl (LimitedDocSetCollector that
+        // stops at N raw matches, then filters) would collect doc 0, filter it
+        // out, and wrongly return empty instead of {2}.
+        let bytes = build(&["doc", "doc", "doc"]);
+        let r = open(&bytes);
+        let mut tm = Treemap::new();
+        tm.add(2); // only row_id 2 passes the pre_filter
+        let rows = r
+            .search_with_limit_and_filter(SearchType::MatchAll, "doc", false, Some(1), Some(&tm), None)
+            .unwrap();
+        let ids: Vec<u64> = rows.iter().map(|(id, _)| *id).collect();
+        assert_eq!(ids, vec![2u64], "pre_filter must be applied before LIMIT truncation");
+        assert!(rows.iter().all(|(_, s)| s.is_none()));
+    }
+
+    #[test]
+    fn empty_pre_filter_returns_empty() {
+        let bytes = build(&["alpha", "beta"]);
+        let r = open(&bytes);
+        let tm = Treemap::new();  // empty
+        let rows = r
+            .search_with_limit_and_filter(SearchType::MatchAll, "alpha", false, None, Some(&tm), None)
+            .unwrap();
+        assert!(rows.is_empty());
+    }
+
+    #[test]
+    fn limit_zero_returns_empty_without_running_query() {
+        let bytes = build(&["alpha", "beta"]);
+        let r = open(&bytes);
+        let rows = r
+            .search_with_limit_and_filter(SearchType::MatchAll, "alpha", true, Some(0), None, None)
+            .unwrap();
+        assert!(rows.is_empty());
+    }
+
+    // ----- B1: row_id is independent of doc_id -----
+
+    #[test]
+    fn pre_filter_uses_row_id_not_doc_id() {
+        // Build with non-contiguous row_ids so doc_id ≠ row_id. Then verify
+        // pre_filter operates on row_id values, not internal tantivy doc_ids.
+        let mut w = PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir(), "paimon_jieba").unwrap();
+        w.add(100, "alpha").unwrap();
+        w.add(200, "alpha").unwrap();
+        w.add(300, "alpha").unwrap();
+        let bytes = w.finish().unwrap().1;
+        let r = open(&bytes);
+
+        // pre_filter = {200} as row_id (doc_id would be 1)
+        let mut tm = Treemap::new();
+        tm.add(200);
+        let rows = r
+            .search_with_limit_and_filter(SearchType::MatchAll, "alpha", false, None, Some(&tm), None)
+            .unwrap();
+        let ids: Vec<u64> = rows.iter().map(|(id, _)| *id).collect();
+        assert_eq!(ids, vec![200u64], "pre_filter must operate on row_id, not doc_id");
+    }
+
+    #[test]
+    fn search_returns_caller_supplied_row_ids() {
+        // Same setup: row_ids 100/200/300, verify search_all returns those values.
+        let mut w = PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir(), "paimon_jieba").unwrap();
+        w.add(100, "doc").unwrap();
+        w.add(200, "doc").unwrap();
+        w.add(300, "doc").unwrap();
+        let bytes = w.finish().unwrap().1;
+        let r = open(&bytes);
+        let ids = r.search_all(SearchType::MatchAll, "doc").unwrap();
+        assert_eq!(ids, vec![100u64, 200, 300]);
+    }
+
+    #[test]
+    fn tokenizer_name_reflects_paimon_jieba_schema_for_cpp_written_index() {
+        // cpp-written index: PaimonTantivyWriter binds the text field to
+        // `paimon_jieba`. Reader must pick that up from meta.json (not hardcode).
+        let bytes = build(&["hello world"]);
+        let r = open(&bytes);
+        assert_eq!(r.tokenizer_name(), PAIMON_TOKENIZER_NAME);
+
+        // tokenize sanity: jieba mode="mix" picks `hello` + `world` from ASCII.
+        let q = r.tokenize_query("hello world");
+        assert_eq!(q, vec!["hello".to_string(), "world".to_string()]);
+    }
+
+    #[test]
+    fn tokenizer_name_reflects_default_schema_for_externally_written_index() {
+        // Simulate a paimon-java-shaped index: text field bound to the
+        // builtin `default` tokenizer (SimpleTokenizer + LowerCaser), not jieba.
+        // Build it directly via tantivy (bypassing PaimonTantivyWriter's jieba
+        // schema) so we can prove the reader auto-switches to the builtin.
+        use crate::callback_directory::test_support::build_mock_directory;
+        use tantivy::directory::Directory;
+        use tantivy::schema::{IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions};
+        use tantivy::{doc, Index};
+
+        // Build a minimal index with field "text" bound to "default".
+        let mut sb = Schema::builder();
+        let row_id_f = sb.add_u64_field(
+            "row_id",
+            NumericOptions::default().set_stored().set_indexed().set_fast(),
+        );
+        let text_opts = TextOptions::default().set_indexing_options(
+            TextFieldIndexing::default()
+                .set_tokenizer("default") // ← key: match paimon-java's TEXT default
+                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
+        );
+        let text_f = sb.add_text_field("text", text_opts);
+        let schema = sb.build();
+        let tmp = tempfile::Builder::new()
+            .prefix("paimon-tantivy-dyn-tk-")
+            .tempdir()
+            .unwrap();
+        let index = Index::create_in_dir(tmp.path(), schema).unwrap();
+        let mut writer = index.writer(15_000_000).unwrap();
+        writer
+            .add_document(doc!(row_id_f => 0u64, text_f => "Hello World"))
+            .unwrap();
+        writer
+            .add_document(doc!(row_id_f => 1u64, text_f => "Apple.Banana"))
+            .unwrap();
+        writer.commit().unwrap();
+        writer.wait_merging_threads().unwrap();
+
+        // Pack the index dir into our archive format so the callback directory
+        // can serve it. Reuse writer.rs's format by streaming entries manually.
+        let mut data = Vec::new();
+        let mut entries = Vec::<(String, u64, u64)>::new();
+        let dir_iter = std::fs::read_dir(tmp.path()).unwrap();
+        let mut files: Vec<_> = dir_iter
+            .filter_map(|e| e.ok())
+            .filter(|e| e.file_type().ok().map_or(false, |t| t.is_file()))
+            .filter(|e| !e.file_name().to_string_lossy().starts_with('.'))
+            .collect();
+        files.sort_by_key(|e| e.file_name());
+        data.extend_from_slice(&(files.len() as i32).to_be_bytes());
+        for e in &files {
+            let name = e.file_name().to_string_lossy().into_owned();
+            let bytes = std::fs::read(e.path()).unwrap();
+            data.extend_from_slice(&(name.len() as i32).to_be_bytes());
+            data.extend_from_slice(name.as_bytes());
+            data.extend_from_slice(&(bytes.len() as i64).to_be_bytes());
+            let off = data.len() as u64;
+            data.extend_from_slice(&bytes);
+            entries.push((name, off, bytes.len() as u64));
+        }
+
+        let (dir, _backend) = build_mock_directory(data, entries);
+        let r = PaimonTantivyReader::new(dir, TokenizeMode::Mix, true, &dict_dir()).unwrap();
+
+        // Reader must pick up `default` from schema, not hardcode `paimon_jieba`.
+        assert_eq!(r.tokenizer_name(), "default");
+
+        // Query tokenization now goes through tantivy's builtin default
+        // (SimpleTokenizer + LowerCaser):
+        //   "Apple.Banana" → ["apple", "banana"]  (dot is non-alnum, split)
+        //   "Hello World"  → ["hello", "world"]   (space split + lowercase)
+        let q1 = r.tokenize_query("Hello World");
+        assert_eq!(q1, vec!["hello".to_string(), "world".to_string()]);
+        let q2 = r.tokenize_query("Apple.Banana");
+        assert_eq!(q2, vec!["apple".to_string(), "banana".to_string()]);
+
+        // And the search path works across tokenizer:
+        let ids = r.search_all(SearchType::MatchAll, "hello").unwrap();
+        assert_eq!(ids, vec![0u64]);
+        let ids = r.search_all(SearchType::MatchAll, "apple").unwrap();
+        assert_eq!(ids, vec![1u64]);
+    }
+
+    #[test]
+    fn reader_aggregates_row_ids_across_segments() {
+        // Multi-thread default writer + many docs => may produce multiple
+        // segments before force-merge. After finish(), force-merge collapses
+        // to one segment, but this test still validates the row_id retrieval
+        // path works for ≥1 segment.
+        let mut w = PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir(), "paimon_jieba").unwrap();
+        for i in 0..200u64 {
+            w.add(i * 7, &format!("docmark_{i} apple")).unwrap();
+        }
+        let bytes = w.finish().unwrap().1;
+        let r = open(&bytes);
+        let ids = r.search_all(SearchType::MatchAll, "apple").unwrap();
+        assert_eq!(ids.len(), 200);
+        for i in 0..200u64 {
+            assert!(ids.contains(&(i * 7)), "missing row_id={}", i * 7);
+        }
+    }
+}
diff --git a/third_party/tantivy_ffi/src/tokenizer.rs b/third_party/tantivy_ffi/src/tokenizer.rs
new file mode 100644
index 000000000..2ab4c5e96
--- /dev/null
+++ b/third_party/tantivy_ffi/src/tokenizer.rs
@@ -0,0 +1,447 @@
+//! PaimonJiebaTokenizer: tantivy Tokenizer impl wrapping jieba-rs.
+//!
+//! Contract (see docs/dev/tantivy_ffi_design.md §4.2 and migration plan Stage 3):
+//! - Behavior-equivalent with `JiebaAnalyzer` in src/paimon/global_index/lucene/
+//! - 5 modes: mp / hmm / mix / full / query
+//!   - `hmm` is Unsupported (jieba-rs has no standalone HMM entry point)
+//!   - `mp` accepts cut(hmm=false) but does not replicate cppjieba's
+//!     max_word_len truncation (docs/dev/tantivy_ffi_design.md §9.3 entry)
+//! - Normalize: skip pure whitespace, skip stop_words, lowercase tokens made only of ASCII alphanumerics
+//! - Token offsets: byte offsets into the original UTF-8 string
+//! - `with_position=false`: all tokens emitted at `position=0` (disables PhraseQuery)
+//! - Dict dir (caller-supplied, e.g. `$PAIMON_JIEBA_DICT_DIR`): `jieba.dict.utf8`, or the
+//!   jieba-rs builtin dict if absent; `user.dict.utf8` / `stop_words.utf8` loaded if present
+
+use std::collections::HashSet;
+use std::ffi::{c_char, CStr};
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+use std::path::Path;
+use std::sync::Arc;
+
+use jieba_rs::Jieba;
+use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
+
+use crate::buffer::PaimonTantivyBuffer;
+use crate::error::{set_last_error, PaimonTantivyStatus};
+use crate::handle::{borrow_handle, free_handle, into_handle};
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum TokenizeMode { // jieba segmentation strategy; see `PaimonJiebaTokenizer::tokenize_raw`
+    Mp,    // `Jieba::cut(text, false)`
+    Hmm,   // parsed, but rejected by `PaimonJiebaTokenizer::new`
+    Mix,   // `Jieba::cut(text, true)`
+    Full,  // `Jieba::cut_all(text)`
+    Query, // `Jieba::cut_for_search(text, true)`
+}
+
+impl TokenizeMode {
+    /// Exact, case-sensitive lookup of a mode name; unknown names give `None`.
+    pub(crate) fn parse(s: &str) -> Option<Self> {
+        // Parallel tables: NAMES[i] is the spelling of MODES[i].
+        const NAMES: [&str; 5] = ["mp", "hmm", "mix", "full", "query"];
+        const MODES: [TokenizeMode; 5] = [
+            TokenizeMode::Mp, TokenizeMode::Hmm, TokenizeMode::Mix,
+            TokenizeMode::Full, TokenizeMode::Query,
+        ];
+        NAMES.iter().position(|name| *name == s).map(|idx| MODES[idx])
+    }
+}
+
+#[derive(Clone)]
+pub struct PaimonJiebaTokenizer { // jieba-rs segmenter + normalization settings; clones share the Arc'd data
+    jieba: Arc<Jieba>,                // dictionary-backed segmenter built by `load_jieba`
+    mode: TokenizeMode,               // which jieba cut variant `tokenize_raw` calls
+    with_position: bool,              // false => every emitted token gets position 0
+    stop_words: Arc<HashSet<String>>, // pieces dropped by `tokenize_raw` (exact match)
+}
+
+impl PaimonJiebaTokenizer {
+    /// Builds a tokenizer from the dictionaries found in `dict_dir`.
+    /// Returns `Err` for `TokenizeMode::Hmm` and when `load_jieba` fails; a missing
+    /// stop-word file is not an error.
+    pub fn new(
+        dict_dir: &Path,
+        mode: TokenizeMode,
+        with_position: bool,
+    ) -> Result<Self, String> {
+        if matches!(mode, TokenizeMode::Hmm) {
+            let msg =
+                "tokenize mode 'hmm' is not supported (jieba-rs does not expose standalone HMM)";
+            return Err(msg.into());
+        }
+        // The dictionary is loaded first so its failure is reported before stop words are read.
+        let jieba = Arc::new(load_jieba(dict_dir)?);
+        let stop_words = Arc::new(load_stop_words(dict_dir));
+        Ok(Self {
+            jieba,
+            mode,
+            with_position,
+            stop_words,
+        })
+    }
+
+    /// Segments `text` and returns one `(offset_start, offset_end, token_text)`
+    /// tuple per kept token; offsets are byte offsets into `text`.
+    /// Shared by the tantivy `Tokenizer` impl and the standalone `tokenize` FFI.
+    pub fn tokenize_raw(&self, text: &str) -> Vec<(usize, usize, String)> {
+        let pieces: Vec<&str> = match self.mode {
+            TokenizeMode::Mp => self.jieba.cut(text, false),
+            TokenizeMode::Mix => self.jieba.cut(text, true),
+            TokenizeMode::Full => self.jieba.cut_all(text),
+            TokenizeMode::Query => self.jieba.cut_for_search(text, true),
+            // `new()` rejects Hmm; this arm only keeps the match exhaustive.
+            TokenizeMode::Hmm => Vec::new(),
+        };
+
+        // Each piece is a sub-slice of `text`, so its byte offset is the distance
+        // between the two start addresses.
+        let base = text.as_ptr() as usize;
+        let mut tokens = Vec::with_capacity(pieces.len());
+        for piece in pieces {
+            let is_blank = piece.chars().all(char::is_whitespace);
+            // Stop words are matched against the piece as cut (original case).
+            if is_blank || self.stop_words.contains(piece) {
+                continue;
+            }
+            let start = piece.as_ptr() as usize - base;
+            // Lowercase only pure ASCII alphanumeric pieces (cppjieba Normalize behavior).
+            let token_text = if is_ascii_alnum(piece) {
+                piece.to_ascii_lowercase()
+            } else {
+                piece.to_owned()
+            };
+            tokens.push((start, start + piece.len(), token_text));
+        }
+        tokens
+    }
+}
+
+fn is_ascii_alnum(s: &str) -> bool { // non-empty and made only of ASCII letters/digits
+    !(s.is_empty() || s.bytes().any(|b| !b.is_ascii_alphanumeric()))
+}
+
+/// Builds the jieba segmenter for `dict_dir`.
+///
+/// Uses `jieba.dict.utf8` from `dict_dir` when it exists, otherwise the jieba-rs
+/// builtin dictionary; then merges the optional `user.dict.utf8`. Returns `Err`
+/// when a dictionary file exists but cannot be opened, parsed or read.
+fn load_jieba(dict_dir: &Path) -> Result<Jieba, String> {
+    let main_dict = dict_dir.join("jieba.dict.utf8");
+    let mut jieba = if main_dict.exists() {
+        let file = File::open(&main_dict)
+            .map_err(|e| format!("open {}: {e}", main_dict.display()))?;
+        let mut rdr = BufReader::new(file);
+        Jieba::with_dict(&mut rdr).map_err(|e| format!("load jieba dict: {e:?}"))?
+    } else {
+        Jieba::new() // no custom dict: jieba-rs builtin
+    };
+    // cppjieba's user.dict.utf8 is lenient (`word [freq] [tag]`, e.g. "蓝翔 nz"),
+    // while jieba-rs's load_dict requires an integer freq, so each line is
+    // parsed here and fed to `add_word`.
+    let user_dict = dict_dir.join("user.dict.utf8");
+    if user_dict.exists() {
+        let file = File::open(&user_dict)
+            .map_err(|e| format!("open {}: {e}", user_dict.display()))?;
+        for line_res in BufReader::new(file).lines() {
+            let line = match line_res {
+                Ok(l) => l,
+                // Invalid UTF-8 spoils only this line; the reader has moved past it.
+                Err(e) if e.kind() == std::io::ErrorKind::InvalidData => continue,
+                // Any other I/O error (e.g. the path is a directory) would be
+                // yielded again on every call, so `continue` would never terminate.
+                Err(e) => return Err(format!("read {}: {e}", user_dict.display())),
+            };
+            let mut it = line.split_whitespace();
+            // Blank lines yield no word; `#` starts a comment line.
+            let Some(word) = it.next().filter(|w| !w.starts_with('#')) else {
+                continue;
+            };
+            let next = it.next();
+            let freq = next.and_then(|s| s.parse::<usize>().ok());
+            // `<word> <freq> [tag]` when the 2nd field is an integer, else `<word> [tag]`.
+            let tag = if freq.is_some() { it.next() } else { next };
+            // The returned frequency is not needed.
+            let _ = jieba.add_word(word, freq, tag);
+        }
+    }
+    Ok(jieba)
+}
+
+fn load_stop_words(dict_dir: &Path) -> HashSet<String> {
+    // Best effort: a missing or unopenable `stop_words.utf8` means "no stop words".
+    let Ok(file) = File::open(dict_dir.join("stop_words.utf8")) else {
+        return HashSet::new();
+    };
+    // Reading stops at the first line that fails to decode or read.
+    BufReader::new(file)
+        .lines()
+        .map_while(Result::ok)
+        .map(|line| line.trim().to_owned())
+        .filter(|word| !word.is_empty())
+        .collect()
+}
+
+// ----------------- tantivy Tokenizer integration -----------------
+
+pub struct PaimonJiebaTokenStream { // pre-computed tokens plus a cursor, built by `token_stream`
+    tokens: Vec<Token>, // all tokens of the input, in emission order
+    index: usize,       // number of `advance()` calls so far; current token is `index - 1`
+}
+
+impl TokenStream for PaimonJiebaTokenStream {
+    fn advance(&mut self) -> bool {
+        self.index += 1; // 1-based cursor: 0 means "before the first token"
+        self.tokens.len() >= self.index
+    }
+    fn token(&self) -> &Token {
+        let current = self.index - 1; // only valid after `advance()` returned true
+        &self.tokens[current]
+    }
+    fn token_mut(&mut self) -> &mut Token {
+        let current = self.index - 1; // same cursor rule as `token()`
+        &mut self.tokens[current]
+    }
+}
+
+impl Tokenizer for PaimonJiebaTokenizer {
+    type TokenStream<'a> = PaimonJiebaTokenStream;
+
+    /// Tokenizes `text` eagerly and wraps the result in a stream.
+    ///
+    /// `position` is the token's ordinal, or 0 for every token when
+    /// `with_position` is off; `position_length` is always 1.
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        let keep_positions = self.with_position;
+        let mut tokens = Vec::new();
+        for (ordinal, (offset_from, offset_to, term)) in
+            self.tokenize_raw(text).into_iter().enumerate()
+        {
+            let position = if keep_positions { ordinal } else { 0 };
+            tokens.push(Token { offset_from, offset_to, position, text: term, position_length: 1 });
+        }
+        PaimonJiebaTokenStream { tokens, index: 0 }
+    }
+}
+
+// ----------------- FFI surface -----------------
+
+/// Creates a tokenizer handle and stores it in `*out`.
+///
+/// Returns `Ok` on success. On failure `*out` is left untouched, the message is
+/// recorded via `set_last_error`, and the status is `InvalidArgument` (null or
+/// non-UTF-8 argument, unknown mode), `Unsupported` (mode `hmm`) or
+/// `TokenizerError` (any other construction failure).
+///
+/// # Safety
+/// `mode_cstr` and `dict_dir_cstr` must be NUL-terminated UTF-8;
+/// `out` must be a valid non-null pointer.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_tokenizer_new(
+    mode_cstr: *const c_char,
+    with_position: bool,
+    dict_dir_cstr: *const c_char,
+    out: *mut *mut PaimonJiebaTokenizer,
+) -> PaimonTantivyStatus {
+    let invalid = |msg: String| {
+        set_last_error(msg);
+        PaimonTantivyStatus::InvalidArgument
+    };
+    if mode_cstr.is_null() || dict_dir_cstr.is_null() || out.is_null() {
+        return invalid("paimon_tantivy_tokenizer_new: null argument".to_owned());
+    }
+    // SAFETY (both conversions): non-null checked above; NUL termination is the caller's contract.
+    let mode_s = match unsafe { CStr::from_ptr(mode_cstr) }.to_str() {
+        Ok(s) => s,
+        Err(e) => return invalid(format!("mode not utf-8: {e}")),
+    };
+    let dict_s = match unsafe { CStr::from_ptr(dict_dir_cstr) }.to_str() {
+        Ok(s) => s,
+        Err(e) => return invalid(format!("dict_dir not utf-8: {e}")),
+    };
+    let Some(mode) = TokenizeMode::parse(mode_s) else {
+        return invalid(format!(
+            "unknown tokenize mode {mode_s:?}; expected one of mp/hmm/mix/full/query"
+        ));
+    };
+    let tokenizer = match PaimonJiebaTokenizer::new(Path::new(dict_s), mode, with_position) {
+        Ok(t) => t,
+        Err(e) => {
+            // The hmm rejection is recognised by its message text.
+            let status = if e.contains("'hmm' is not supported") {
+                PaimonTantivyStatus::Unsupported
+            } else {
+                PaimonTantivyStatus::TokenizerError
+            };
+            set_last_error(e);
+            return status;
+        }
+    };
+    // SAFETY: `out` was checked non-null; its validity is the caller's contract.
+    unsafe { *out = into_handle(tokenizer) };
+    PaimonTantivyStatus::Ok
+}
+
+/// Frees a tokenizer handle by passing it to `free_handle`. Documented as safe on null.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_tokenizer_free(tok: *mut PaimonJiebaTokenizer) {
+    unsafe { free_handle(tok) }; // NOTE(review): null handling lives in `free_handle` (not shown here) — confirm
+}
+
+/// Tokenizes `text` and writes the tokens to `*out` as one text line per token:
+///   `<offset_from>\t<offset_to>\t<position>\t<text>\n`
+/// (`position` is 0 for every token when the tokenizer has positions disabled).
+/// Used for Stage 3 golden-sample tests (easy to diff from C++).
+///
+/// # Safety
+/// `tok` must be a valid handle; `text` must point to `text_len` UTF-8 bytes;
+/// `out` must be non-null.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_tokenizer_tokenize(
+    tok: *const PaimonJiebaTokenizer,
+    text: *const c_char,
+    text_len: usize,
+    out: *mut PaimonTantivyBuffer,
+) -> PaimonTantivyStatus {
+    if out.is_null() {
+        set_last_error("paimon_tantivy_tokenizer_tokenize: out is null");
+        return PaimonTantivyStatus::InvalidArgument;
+    }
+    let Some(tokenizer) = (unsafe { borrow_handle::<PaimonJiebaTokenizer>(tok) }) else {
+        set_last_error("paimon_tantivy_tokenizer_tokenize: null tokenizer handle");
+        return PaimonTantivyStatus::InvalidArgument;
+    };
+    let input: &str = match (text.is_null(), text_len) {
+        (_, 0) => "",
+        (true, _) => {
+            set_last_error("text is null but len > 0");
+            return PaimonTantivyStatus::InvalidArgument;
+        }
+        (false, len) => {
+            // SAFETY: non-null, and the caller guarantees `len` readable bytes.
+            let bytes = unsafe { std::slice::from_raw_parts(text.cast::<u8>(), len) };
+            match std::str::from_utf8(bytes) {
+                Ok(s) => s,
+                Err(e) => {
+                    set_last_error(format!("text not utf-8: {e}"));
+                    return PaimonTantivyStatus::InvalidArgument;
+                }
+            }
+        }
+    };
+    let mut listing = String::new();
+    for (ordinal, (from, to, term)) in tokenizer.tokenize_raw(input).into_iter().enumerate() {
+        let pos = if tokenizer.with_position { ordinal } else { 0 };
+        listing.push_str(&format!("{from}\t{to}\t{pos}\t{term}\n"));
+    }
+    unsafe { *out = PaimonTantivyBuffer::from_vec(listing.into_bytes()) };
+    PaimonTantivyStatus::Ok
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::ffi::CString;
+
+    fn dict_dir_from_env() -> std::path::PathBuf { // dict dir from the env var, else a path that does not exist
+        std::env::var("PAIMON_JIEBA_DICT_DIR")
+            .map(std::path::PathBuf::from)
+            .unwrap_or_else(|_| std::path::PathBuf::from("/tmp/nonexistent-dict"))
+    }
+
+    #[test]
+    fn mode_parse() { // every documented mode name parses; anything else is rejected
+        for (s, m) in [
+            ("mp", TokenizeMode::Mp),
+            ("hmm", TokenizeMode::Hmm),
+            ("mix", TokenizeMode::Mix),
+            ("full", TokenizeMode::Full),
+            ("query", TokenizeMode::Query),
+        ] {
+            assert_eq!(TokenizeMode::parse(s), Some(m));
+        }
+        assert!(TokenizeMode::parse("bogus").is_none());
+    }
+
+    #[test]
+    fn hmm_mode_returns_unsupported() { // the constructor rejects Hmm with the message the FFI layer matches on
+        let tok = PaimonJiebaTokenizer::new(
+            &dict_dir_from_env(),
+            TokenizeMode::Hmm,
+            true,
+        );
+        match tok {
+            Err(e) => assert!(e.contains("'hmm' is not supported"), "got: {e}"),
+            Ok(_) => panic!("expected Err"),
+        }
+    }
+
+    #[test]
+    fn tokenize_mix_default_dict_smoke() {
+        // The dict dir does not exist, so the jieba-rs builtin dictionary is used.
+        let t = PaimonJiebaTokenizer::new(Path::new("/tmp/nonexistent-dict"), TokenizeMode::Mix, true)
+            .unwrap();
+        let raw = t.tokenize_raw("他来到了网易杭研大厦");
+        let texts: Vec<&str> = raw.iter().map(|(_, _, s)| s.as_str()).collect();
+        assert!(texts.contains(&"网易"));
+        assert!(texts.contains(&"大厦"));
+    }
+
+    #[test]
+    fn ascii_alnum_is_lowercased() { // ASCII words are lowercased, CJK text is passed through unchanged
+        let t = PaimonJiebaTokenizer::new(Path::new("/tmp/nx"), TokenizeMode::Mix, true).unwrap();
+        let raw = t.tokenize_raw("Hello World 中国");
+        let texts: Vec<&str> = raw.iter().map(|(_, _, s)| s.as_str()).collect();
+        assert!(texts.contains(&"hello"));
+        assert!(texts.contains(&"world"));
+        assert!(texts.contains(&"中国"));
+    }
+
+    #[test]
+    fn with_position_false_emits_zero_position() {
+        let t = PaimonJiebaTokenizer::new(Path::new("/tmp/nx"), TokenizeMode::Mix, false).unwrap();
+        let raw = t.tokenize_raw("中国人");
+        // Raw tuples carry no position, so positions are read from the tantivy token stream:
+        let mut t2 = t.clone();
+        let mut stream = <PaimonJiebaTokenizer as Tokenizer>::token_stream(&mut t2, "中国人");
+        let mut positions = Vec::new();
+        while stream.advance() {
+            positions.push(stream.token().position);
+        }
+        assert!(!raw.is_empty());
+        assert!(positions.iter().all(|&p| p == 0));
+    }
+
+    #[test]
+    fn ffi_roundtrip() { // create -> tokenize -> free the buffer -> free the handle, all through the C ABI
+        let dict = dict_dir_from_env();
+        let dict_str = dict.to_str().unwrap();
+        let mode = CString::new("mix").unwrap();
+        let dict_c = CString::new(dict_str).unwrap();
+        let mut handle: *mut PaimonJiebaTokenizer = std::ptr::null_mut();
+        unsafe {
+            let st = paimon_tantivy_tokenizer_new(
+                mode.as_ptr(),
+                true,
+                dict_c.as_ptr(),
+                &mut handle,
+            );
+            assert_eq!(st, PaimonTantivyStatus::Ok);
+            assert!(!handle.is_null());
+
+            let input = "Hello 中国";
+            let input_c = CString::new(input).unwrap();
+            let mut buf = PaimonTantivyBuffer::empty();
+            let st2 = paimon_tantivy_tokenizer_tokenize(
+                handle,
+                input_c.as_ptr(),
+                input.len(), // byte length, without the trailing NUL
+                &mut buf,
+            );
+            assert_eq!(st2, PaimonTantivyStatus::Ok);
+            assert!(buf.len > 0);
+            crate::buffer::paimon_tantivy_buffer_free(&mut buf);
+            paimon_tantivy_tokenizer_free(handle);
+        }
+    }
+}
diff --git a/third_party/tantivy_ffi/src/writer.rs b/third_party/tantivy_ffi/src/writer.rs
new file mode 100644
index 000000000..291408ef6
--- /dev/null
+++ b/third_party/tantivy_ffi/src/writer.rs
@@ -0,0 +1,769 @@
+//! PaimonTantivyWriter: Writer for tantivy-fulltext global index.
+//!
+//! Contract (see docs/dev/tantivy_java_compat_plan.md §2.5 + §5.1 J2):
+//! - `writer_new(field_name, mode, with_position, dict_dir, tokenizer, out)` —
+//!   create on a private tmp dir backed by MmapDirectory; PaimonJiebaTokenizer is
+//!   registered only when `tokenizer` is "paimon_jieba". `field_name` must be non-empty
+//!   but is otherwise **ignored** by the Rust schema (kept for FFI ABI compatibility); schema
+//!   field names are fixed (`row_id`, `text`) to match paimon-java `paimon-tantivy-jni/rust/src/lib.rs:55-66`.
+//! - `writer_add(writer, row_id, text, len)` — add a single document with the
+//!   caller-supplied `row_id` (u64) and a TEXT field
+//! - `writer_finish(writer, out_row_count, out_buf)` — commit + force-merge to
+//!   single segment + pack all on-disk index files into a Rust-allocated buffer
+//! - `writer_free(writer)` — destroy (RAII removes tmp dir)
+//!
+//! Packing format (big-endian, **cross-readable with paimon-java archive**;
+//! see `paimon-tantivy-index/README.md` §Archive File Format):
+//!   `[i32 BE file_count |
+//!     (i32 BE name_len | name_bytes | i64 BE file_len | file_bytes)*]`
+
+use std::ffi::{c_char, c_void, CStr};
+use std::fs::File;
+use std::io::Read;
+use std::path::{Path, PathBuf};
+
+use tantivy::schema::{
+    Field, IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions,
+};
+use tantivy::{doc, Index, IndexWriter, TantivyDocument};
+use tempfile::TempDir;
+
+use crate::error::{set_last_error, PaimonTantivyStatus};
+use crate::handle::{borrow_handle_mut, free_handle, into_handle};
+use crate::tokenizer::{PaimonJiebaTokenizer, TokenizeMode};
+
+/// Schema field names. Fixed to match paimon-java's tantivy schema so that
+/// indexes are cross-readable. Both fields are required.
+pub const PAIMON_ROW_ID_FIELD_NAME: &str = "row_id";
+pub const PAIMON_TEXT_FIELD_NAME: &str = "text";
+
+/// Name under which `PaimonTantivyWriter::new` registers the jieba tokenizer.
+/// Reader must register the same name to make stored term dictionaries readable.
+pub const PAIMON_TOKENIZER_NAME: &str = "paimon_jieba";
+
+/// Heap budget passed to `Index::writer` (50 MB; tantivy minimum is ~3 MB).
+/// Default multi-threaded writer (`Index::writer(heap)`) splits this budget
+/// across `min(num_cpus, MAX_NUM_THREAD=8)` worker threads.
+const WRITER_HEAP_SIZE: usize = 50_000_000;
+
+pub struct PaimonTantivyWriter { // single-use index writer: `add` rows, then finish once
+    /// Owned tmp dir; cleaned up when this struct drops.
+    tmpdir: TempDir,
+    /// `row_id` u64 field (stored + indexed + fast). Reader retrieves the
+    /// caller-supplied row_id via `fast_fields().u64("row_id").first(doc_id)`.
+    row_id_field: Field,
+    /// `text` TEXT field, tokenized with the tokenizer name chosen in `new`.
+    text_field: Field,
+    /// tantivy index instance, file-backed in `tmpdir`.
+    index: Index,
+    /// Active writer; taken (left `None`) by `commit_and_merge`.
+    writer: Option<IndexWriter>,
+    /// Number of successful `add` calls since construction.
+    row_count: i64,
+}
+
+impl PaimonTantivyWriter {
+    /// Creates a writer over a fresh private temp dir. An empty `tokenizer_name`
+    /// selects tantivy's "default" tokenizer. Fails if `field_name` is empty, the
+    /// tokenizer cannot be built or is not registered, or the temp dir, index or
+    /// index writer cannot be created.
+    pub fn new(
+        field_name: &str,
+        mode: TokenizeMode,
+        with_position: bool,
+        dict_dir: &Path,
+        tokenizer_name: &str,
+    ) -> Result<Self, String> {
+        // `field_name` is only validated: the schema is fixed to match paimon-java
+        // (decision B1) and the C++ side uses the name to pick the arrow column.
+        if field_name.is_empty() {
+            return Err("field_name must be non-empty".into());
+        }
+        let mut schema_builder = Schema::builder();
+        let row_id_field = schema_builder.add_u64_field(
+            PAIMON_ROW_ID_FIELD_NAME,
+            NumericOptions::default()
+                .set_stored()
+                .set_indexed()
+                .set_fast(),
+        );
+        let index_option = if with_position {
+            IndexRecordOption::WithFreqsAndPositions
+        } else {
+            IndexRecordOption::Basic
+        };
+        // Empty input falls back to tantivy's built-in "default" (SimpleTokenizer),
+        // matching the cpp-side default in `tantivy_defs.h::kDefaultTantivyWriteTokenizer`.
+        // Cross-read with paimon-java works out of the box; CJK callers must
+        // pass "paimon_jieba" explicitly.
+        let effective_tokenizer = if tokenizer_name.is_empty() {
+            "default"
+        } else {
+            tokenizer_name
+        };
+        let text_options = TextOptions::default().set_indexing_options(
+            TextFieldIndexing::default()
+                .set_tokenizer(effective_tokenizer)
+                .set_index_option(index_option),
+        );
+        let text_field = schema_builder.add_text_field(PAIMON_TEXT_FIELD_NAME, text_options);
+        let schema = schema_builder.build();
+
+        let tmpdir = tempfile::Builder::new()
+            .prefix("paimon-tantivy-")
+            .tempdir()
+            .map_err(|e| format!("create tmp dir: {e}"))?;
+
+        let index = Index::create_in_dir(tmpdir.path(), schema)
+            .map_err(|e| format!("create tantivy index: {e}"))?;
+        // "paimon_jieba" is built and registered here; tantivy built-ins ("default",
+        // "whitespace", "raw", "en_stem", ...) are already in the TokenizerManager, which
+        // keeps archives cross-readable by paimon-java's default TEXT tokenizer path.
+        if effective_tokenizer == PAIMON_TOKENIZER_NAME {
+            let tokenizer = PaimonJiebaTokenizer::new(dict_dir, mode, with_position)
+                .map_err(|e| format!("create tokenizer: {e}"))?;
+            index
+                .tokenizers()
+                .register(PAIMON_TOKENIZER_NAME, tokenizer);
+        }
+        // Reject names nobody registered now; otherwise the problem would only
+        // surface later, once tantivy resolves the field's tokenizer while indexing.
+        if index.tokenizers().get(effective_tokenizer).is_none() {
+            return Err(format!("unknown tokenizer {effective_tokenizer:?}"));
+        }
+        // Default multi-threaded writer (B1 schema stores row_id explicitly so
+        // we no longer need single-threaded ordering invariants).
+        let writer: IndexWriter = index
+            .writer(WRITER_HEAP_SIZE)
+            .map_err(|e| format!("create index writer: {e}"))?;
+
+        Ok(Self {
+            tmpdir,
+            row_id_field,
+            text_field,
+            index,
+            writer: Some(writer),
+            row_count: 0,
+        })
+    }
+
+    /// Queues one (`row_id`, `text`) document; fails once the writer is finished.
+    pub fn add(&mut self, row_id: u64, text: &str) -> Result<(), String> {
+        let Some(writer) = self.writer.as_mut() else {
+            return Err("writer already finished".to_string());
+        };
+        let document: TantivyDocument = doc!(
+            self.row_id_field => row_id,
+            self.text_field => text,
+        );
+        writer
+            .add_document(document)
+            .map_err(|e| format!("add document: {e}"))?;
+        self.row_count += 1;
+        Ok(())
+    }
+
+    /// Commit + force-merge + GC on-disk index. Takes the inner writer, so a
+    /// second call (or a later `add`) fails with "writer already finished".
+    fn commit_and_merge(&mut self) -> Result<(), String> {
+        let mut writer = self
+            .writer
+            .take()
+            .ok_or_else(|| "writer already finished".to_string())?;
+        writer.commit().map_err(|e| format!("commit: {e}"))?;
+
+        let segment_metas = self
+            .index
+            .searchable_segment_metas()
+            .map_err(|e| format!("list segments: {e}"))?;
+        if segment_metas.len() > 1 {
+            let segment_ids: Vec<_> = segment_metas.iter().map(|m| m.id()).collect();
+            writer
+                .merge(&segment_ids)
+                .wait()
+                .map_err(|e| format!("merge: {e}"))?;
+        }
+        writer
+            .garbage_collect_files()
+            .wait()
+            .map_err(|e| format!("garbage_collect_files: {e}"))?;
+        writer
+            .wait_merging_threads()
+            .map_err(|e| format!("wait_merging_threads: {e}"))?;
+        Ok(())
+    }
+
+    /// Streaming finish (W1 production path): commit + force-merge + push
+    /// archive bytes through the FFI callback in 64KB chunks. Returns the
+    /// number of rows added; a non-zero callback return aborts with `Err`.
+    pub fn finish_streaming(
+        &mut self,
+        cb: &PaimonWriteCallbacks,
+    ) -> Result<i64, String> {
+        self.commit_and_merge()?;
+        let ctx = cb.ctx;
+        let write_fn = cb.write;
+        pack_index_dir_stream(self.tmpdir.path(), |bytes| {
+            // Calling extern "C" fn pointer is safe; C++ side owns ctx validity.
+            let rc = (write_fn)(ctx, bytes.as_ptr(), bytes.len());
+            if rc != 0 {
+                return Err(format!("write callback rc={rc} len={}", bytes.len()));
+            }
+            Ok(())
+        })?;
+        Ok(self.row_count)
+    }
+
+    /// Test-only convenience: collect streaming output into a `Vec<u8>`.
+    /// Rust unit tests / integration tests use this; production path is
+    /// `finish_streaming`.
+    #[cfg(test)]
+    pub(crate) fn finish(&mut self) -> Result<(i64, Vec<u8>), String> {
+        self.commit_and_merge()?;
+        let mut out: Vec<u8> = Vec::new();
+        pack_index_dir_stream(self.tmpdir.path(), |bytes| {
+            out.extend_from_slice(bytes);
+            Ok(())
+        })?;
+        Ok((self.row_count, out))
+    }
+
+    #[cfg(test)]
+    pub(crate) fn tmpdir_path(&self) -> &Path {
+        self.tmpdir.path()
+    }
+}
+
+// =========================================================================
+// Streaming pack (W1)
+// =========================================================================
+
+/// Chunk size (64 KiB) for `pack_index_dir_stream`. Bigger than Java packIndex's 8KB for
+/// throughput, still far below any archive size we care about.
+const WRITER_STREAM_BUFFER_SIZE: usize = 64 * 1024;
+
+/// Callback table passed from C++ for streaming writer output (W1).
+///
+/// `ctx` is an opaque pointer to C++'s `WriteCtx` (holding a `paimon::OutputStream`).
+/// `write` is called in-order by Rust (not concurrently) to push bytes.
+#[repr(C)]
+pub struct PaimonWriteCallbacks {
+    pub ctx: *mut c_void, // handed back unchanged as the first argument of every `write` call
+    /// Returns 0 on success, non-zero to signal C++ side error (Rust aborts pack).
+    pub write: extern "C" fn(ctx: *mut c_void, data: *const u8, len: usize) -> i32,
+}
+
+/// Packs every file returned by `collect_dir_entries(dir)` into the
+/// Java-compatible archive and pushes it through `write_fn` piece by piece.
+/// Peak RAM = one 64KB stack buffer + a few KB of entry metadata. Mirrors Java
+/// `TantivyFullTextGlobalIndexWriter.packIndex` but with a bigger buffer.
+///
+/// Archive format (BE, no version): `[i32 file_count | (i32 name_len, name,
+/// i64 file_len, file_bytes)*]`. Entries are written in name order. Fails if a
+/// file cannot be opened/read, `write_fn` fails, or a file's size changed.
+fn pack_index_dir_stream<F>(dir: &Path, mut write_fn: F) -> Result<(), String>
+where
+    F: FnMut(&[u8]) -> Result<(), String>,
+{
+    let entries = collect_dir_entries(dir)?;
+    let file_count = entries.len() as i32;
+    write_fn(&file_count.to_be_bytes())?;
+    // One buffer, reused for every file.
+    let mut chunk = [0u8; WRITER_STREAM_BUFFER_SIZE];
+    for (name, path, expected_len) in entries.iter() {
+        // Entry header: i32 name_len, name bytes, i64 file_len (all big-endian).
+        let name_len = name.len() as i32;
+        write_fn(&name_len.to_be_bytes())?;
+        write_fn(name.as_bytes())?;
+        write_fn(&(*expected_len as i64).to_be_bytes())?;
+
+        let mut file = File::open(path)
+            .map_err(|e| format!("open {}: {e}", path.display()))?;
+        let mut streamed: u64 = 0;
+        loop {
+            match file.read(&mut chunk) {
+                Ok(0) => break,
+                Ok(n) => {
+                    write_fn(&chunk[..n])?;
+                    streamed += n as u64;
+                }
+                Err(e) => return Err(format!("read {}: {e}", path.display())),
+            }
+        }
+        // The length was already sent in the header, so a mismatch is fatal.
+        if streamed != *expected_len {
+            return Err(format!(
+                "file {} changed size during packing: header said {}, streamed {}",
+                name, expected_len, streamed
+            ));
+        }
+    }
+    Ok(())
+}
+
+/// Enumerate `dir`: name-sorted (name, path, len) for regular files, skipping `.`-prefixed and non-UTF-8 names.
+fn collect_dir_entries(dir: &Path) -> Result<Vec<(String, PathBuf, u64)>, String> {
+    let listing =
+        std::fs::read_dir(dir).map_err(|e| format!("read tmp dir {}: {e}", dir.display()))?;
+    let mut found: Vec<(String, PathBuf, u64)> = Vec::new();
+    for item in listing {
+        let item = item.map_err(|e| format!("read entry: {e}"))?;
+        // Names that are not valid UTF-8 are skipped.
+        let Ok(name) = item.file_name().into_string() else {
+            continue;
+        };
+        if name.starts_with('.') {
+            continue;
+        }
+        let path = item.path();
+        let kind = item
+            .file_type()
+            .map_err(|e| format!("file_type for {}: {e}", path.display()))?;
+        if !kind.is_file() {
+            continue;
+        }
+        let meta = item
+            .metadata()
+            .map_err(|e| format!("metadata for {}: {e}", path.display()))?;
+        found.push((name, path, meta.len()));
+    }
+    found.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
+    Ok(found)
+}
+
+// ============================ FFI surface ============================
+
+/// Creates a writer handle on a private tmp dir and stores it in `*out`.
+///
+/// The string arguments are validated as UTF-8, `mode` is parsed with
+/// `TokenizeMode::parse`, and everything is forwarded to `PaimonTantivyWriter::new`.
+///
+/// Status on failure (message recorded via `set_last_error`, `*out` untouched):
+/// - `InvalidArgument`: a null pointer, a non-UTF-8 string, or an unknown mode;
+/// - `Unsupported`: construction failed with the `'hmm' is not supported` message;
+/// - `InternalError`: any other construction failure.
+///
+/// # Safety
+/// All C-string args must be NUL-terminated UTF-8; `out` non-null.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_writer_new(
+    field_name_cstr: *const c_char,
+    mode_cstr: *const c_char,
+    with_position: bool,
+    dict_dir_cstr: *const c_char,
+    tokenizer_cstr: *const c_char,
+    out: *mut *mut PaimonTantivyWriter,
+) -> PaimonTantivyStatus {
+    // Records `msg` as the last error and yields the status to return.
+    let invalid = |msg: String| {
+        set_last_error(msg);
+        PaimonTantivyStatus::InvalidArgument
+    };
+    let any_null = field_name_cstr.is_null()
+        || mode_cstr.is_null()
+        || dict_dir_cstr.is_null()
+        || tokenizer_cstr.is_null()
+        || out.is_null();
+    if any_null {
+        return invalid("paimon_tantivy_writer_new: null argument".to_owned());
+    }
+    // SAFETY (all four conversions): the pointers were checked non-null above and
+    // NUL termination is the caller's contract.
+    let field_name = match unsafe { CStr::from_ptr(field_name_cstr) }.to_str() {
+        Ok(s) => s,
+        Err(e) => return invalid(format!("field_name not utf-8: {e}")),
+    };
+    let mode_str = match unsafe { CStr::from_ptr(mode_cstr) }.to_str() {
+        Ok(s) => s,
+        Err(e) => return invalid(format!("mode not utf-8: {e}")),
+    };
+    let dict_dir = match unsafe { CStr::from_ptr(dict_dir_cstr) }.to_str() {
+        Ok(s) => s,
+        Err(e) => return invalid(format!("dict_dir not utf-8: {e}")),
+    };
+    let tokenizer_name = match unsafe { CStr::from_ptr(tokenizer_cstr) }.to_str() {
+        Ok(s) => s,
+        Err(e) => return invalid(format!("tokenizer not utf-8: {e}")),
+    };
+    let Some(mode) = TokenizeMode::parse(mode_str) else {
+        return invalid(format!(
+            "unknown tokenize mode {mode_str:?}; expected one of mp/hmm/mix/full/query"
+        ));
+    };
+    let built = PaimonTantivyWriter::new(
+        field_name,
+        mode,
+        with_position,
+        Path::new(dict_dir),
+        tokenizer_name,
+    );
+    let writer = match built {
+        Ok(w) => w,
+        Err(e) => {
+            // hmm-mode rejection bubbles through tokenizer construction.
+            let status = if e.contains("'hmm' is not supported") {
+                PaimonTantivyStatus::Unsupported
+            } else {
+                PaimonTantivyStatus::InternalError
+            };
+            set_last_error(e);
+            return status;
+        }
+    };
+    // SAFETY: `out` was checked non-null; its validity is the caller's contract.
+    unsafe { *out = into_handle(writer) };
+    PaimonTantivyStatus::Ok
+}
+
+/// Append one document. `text` is `text_len` bytes of UTF-8 and need not be
+/// NUL-terminated; a zero `text_len` adds an empty-text doc (`text` may then be
+/// null). `row_id` is the caller-supplied paimon row id (u64), stored in a fast
+/// field for retrieval by the reader. Returns `InvalidArgument` for a null
+/// handle, null `text` with non-zero length, or invalid UTF-8, and
+/// `InternalError` when the underlying `add` fails.
+///
+/// SAFETY: `writer` must be a live handle from `writer_new`.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_writer_add(
+    writer: *mut PaimonTantivyWriter,
+    row_id: u64,
+    text: *const c_char,
+    text_len: usize,
+) -> PaimonTantivyStatus {
+    let handle = unsafe { borrow_handle_mut::<PaimonTantivyWriter>(writer) };
+    let Some(w) = handle else {
+        set_last_error("paimon_tantivy_writer_add: null writer handle");
+        return PaimonTantivyStatus::InvalidArgument;
+    };
+    let bytes: &[u8] = match (text_len, text.is_null()) {
+        (0, _) => &[],
+        (_, true) => {
+            set_last_error("text is null but len > 0");
+            return PaimonTantivyStatus::InvalidArgument;
+        }
+        (len, false) => unsafe { std::slice::from_raw_parts(text as *const u8, len) },
+    };
+    let text_str = match std::str::from_utf8(bytes) {
+        Ok(s) => s,
+        Err(e) => {
+            set_last_error(format!("text not utf-8: {e}"));
+            return PaimonTantivyStatus::InvalidArgument;
+        }
+    };
+    if let Err(e) = w.add(row_id, text_str) {
+        set_last_error(e);
+        return PaimonTantivyStatus::InternalError;
+    }
+    PaimonTantivyStatus::Ok
+}
+
+/// Finish the writer: commit, force-merge, then stream the archive bytes to
+/// `callbacks.write` in 64KB chunks (W1). Only the first call can succeed; any
+/// later call returns InvalidArgument with last_error="writer already finished".
+/// Peak Rust RAM ≈ 64KB + entry metadata (independent of archive size).
+///
+/// The callback is invoked **serially** (never concurrently) during this call,
+/// so the C++ side can write straight to a paimon OutputStream without locking.
+/// Failures whose message starts with `write callback rc=`, `open ` or `read `
+/// map to IoError; every other failure maps to InternalError.
+///
+/// SAFETY: `writer` must be a live handle; `out_row_count` non-null.
+/// `callbacks.write` / `callbacks.ctx` must remain valid for the duration of
+/// the call (callback is consumed in-place, not retained).
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_writer_finish_streaming(
+    writer: *mut PaimonTantivyWriter,
+    callbacks: PaimonWriteCallbacks,
+    out_row_count: *mut i64,
+) -> PaimonTantivyStatus {
+    if out_row_count.is_null() {
+        set_last_error("paimon_tantivy_writer_finish_streaming: null out_row_count");
+        return PaimonTantivyStatus::InvalidArgument;
+    }
+    let handle = unsafe { borrow_handle_mut::<PaimonTantivyWriter>(writer) };
+    let Some(w) = handle else {
+        set_last_error("paimon_tantivy_writer_finish_streaming: null writer handle");
+        return PaimonTantivyStatus::InvalidArgument;
+    };
+    let err = match w.finish_streaming(&callbacks) {
+        Ok(rows) => {
+            unsafe { *out_row_count = rows };
+            return PaimonTantivyStatus::Ok;
+        }
+        Err(err) => err,
+    };
+    const IO_PREFIXES: [&str; 3] = ["write callback rc=", "open ", "read "];
+    let status = if err == "writer already finished" {
+        PaimonTantivyStatus::InvalidArgument
+    } else if IO_PREFIXES.iter().any(|prefix| err.starts_with(*prefix)) {
+        PaimonTantivyStatus::IoError
+    } else {
+        PaimonTantivyStatus::InternalError
+    };
+    set_last_error(err);
+    status
+}
+
+/// Destroy a writer handle by passing it to `free_handle`. Safe on null. Tmp dir is removed via Drop.
+#[no_mangle]
+pub unsafe extern "C" fn paimon_tantivy_writer_free(writer: *mut PaimonTantivyWriter) {
+    unsafe { free_handle(writer) }; // NOTE(review): presumably the handle must not be used or freed again afterwards
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::ffi::CString;
+
+    /// Jieba dict dir for tests: `PAIMON_JIEBA_DICT_DIR` when set, otherwise the placeholder
+    /// `/tmp/nonexistent-dict` (intended to make jieba-rs fall back to its built-in dict).
+    fn dict_dir_from_env() -> std::path::PathBuf {
+        std::env::var("PAIMON_JIEBA_DICT_DIR")
+            .map(std::path::PathBuf::from)
+            .unwrap_or_else(|_| std::path::PathBuf::from("/tmp/nonexistent-dict"))
+    }
+
+    #[test]
+    fn empty_field_name_rejected() {
+        let err = PaimonTantivyWriter::new("", TokenizeMode::Mix, true, Path::new("/tmp/nx"), "paimon_jieba")
+            .err()
+            .unwrap();
+        assert!(err.contains("field_name"), "got: {err}"); // error text must name the offending argument
+    }
+
+    #[test]
+    fn hmm_mode_rejected() {
+        let err =
+            PaimonTantivyWriter::new("f0", TokenizeMode::Hmm, true, Path::new("/tmp/nx"), "paimon_jieba")
+                .err()
+                .unwrap();
+        assert!(err.contains("'hmm' is not supported"), "got: {err}"); // substring the FFI layer maps to Unsupported
+    }
+
+    #[test]
+    fn create_add_finish_roundtrip() {
+        let mut w =
+            PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap();
+        w.add(0, "hello world").unwrap();
+        w.add(1, "中国人民").unwrap();
+        w.add(2, "").unwrap(); // empty doc
+        let (rows, bytes) = w.finish().unwrap();
+        assert_eq!(rows, 3);
+        assert!(bytes.len() > 4);
+
+        // Header check (Java-compatible layout): big-endian int32 file_count, no version field.
+        let file_count = i32::from_be_bytes(bytes[0..4].try_into().unwrap());
+        assert!(file_count > 0, "expected >0 packed files");
+
+        // Walk the entries: BE i32 name length, name bytes, BE i64 payload length, payload bytes.
+        let mut off: usize = 4;
+        let mut names = Vec::new();
+        for _ in 0..file_count {
+            let nlen = i32::from_be_bytes(bytes[off..off + 4].try_into().unwrap()) as usize;
+            off += 4;
+            let name = std::str::from_utf8(&bytes[off..off + nlen]).unwrap().to_owned();
+            off += nlen;
+            let flen = i64::from_be_bytes(bytes[off..off + 8].try_into().unwrap()) as usize;
+            off += 8;
+            assert!(off + flen <= bytes.len(), "file {name} extends past buffer");
+            off += flen;
+            names.push(name);
+        }
+        assert_eq!(off, bytes.len(), "trailing bytes after pack");
+        // tantivy must produce at least meta.json
+        assert!(names.iter().any(|n| n == "meta.json"), "names={names:?}");
+    }
+
+    #[test]
+    fn schema_field_names_are_fixed() {
+        // Schema must be `row_id` (u64) + `text` (TEXT) regardless of caller's
+        // field_name argument — matches paimon-java for cross-readability.
+        let w =
+            PaimonTantivyWriter::new("ignored_name", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba")
+                .unwrap();
+        let schema = w.index.schema();
+        assert!(schema.get_field(PAIMON_ROW_ID_FIELD_NAME).is_ok(),
+                "schema must have row_id field");
+        assert!(schema.get_field(PAIMON_TEXT_FIELD_NAME).is_ok(),
+                "schema must have text field");
+        // Caller-supplied name must NOT appear
+        assert!(schema.get_field("ignored_name").is_err(),
+                "caller-supplied field_name must be ignored");
+    }
+
+    #[test]
+    fn archive_uses_big_endian_no_version_header() {
+        // Strong guard: header must be BE int32 file_count, NOT LE int32
+        // version=1 + LE int32 file_count. Any regression to LE/version-header
+        // would silently break paimon-java cross-read.
+        let mut w =
+            PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap();
+        w.add(0, "hello").unwrap();
+        let (_, bytes) = w.finish().unwrap();
+        let header_be = i32::from_be_bytes(bytes[0..4].try_into().unwrap());
+        let header_le = i32::from_le_bytes(bytes[0..4].try_into().unwrap());
+        // BE file_count is small (single-segment force-merge: ~6-7 files)
+        assert!(header_be > 0 && header_be < 100,
+                "expected sensible BE file_count, got BE={header_be} LE={header_le}");
+        // LE-decoded header would be a huge number (e.g. 0x06000000), ensuring
+        // we did NOT regress to the old LE+version layout.
+        assert_ne!(header_be, header_le, "buffer must be BE-encoded");
+    }
+
+    #[test]
+    fn multi_thread_writer_default() {
+        // B1 schema stores row_id explicitly so we no longer enforce a
+        // single-threaded writer. All 200 docs are added from this one test thread
+        // (any parallelism would be inside the writer); check row and file counts.
+        let mut w =
+            PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap();
+        for i in 0..200u64 {
+            w.add(i, &format!("row {i} apple banana")).unwrap();
+        }
+        let (rows, bytes) = w.finish().unwrap();
+        assert_eq!(rows, 200);
+        assert!(bytes.len() > 4);
+        // Loose lower bound only: meta.json plus at least one segment file are expected.
+        let file_count = i32::from_be_bytes(bytes[0..4].try_into().unwrap());
+        assert!(file_count >= 2, "force-merged single segment needs ≥ 2 files (meta + segment), got {file_count}");
+    }
+
+    #[test]
+    fn finish_twice_errors() {
+        let mut w =
+            PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap();
+        w.add(0, "hi").unwrap();
+        let _ = w.finish().unwrap();
+        let err = w.finish().err().unwrap();
+        assert!(err.contains("already finished"), "got: {err}"); // second finish() on the same writer must fail
+    }
+
+    /// Mock collector for FFI streaming tests: appends each chunk to the `Vec<u8>`
+    /// that `ctx` points to. (No Arc / atomic needed — test is single-threaded.)
+    extern "C" fn mock_write_collect(ctx: *mut c_void, data: *const u8, len: usize) -> i32 {
+        let vec = unsafe { &mut *(ctx as *mut Vec<u8>) };
+        let slice = unsafe { std::slice::from_raw_parts(data, len) };
+        vec.extend_from_slice(slice);
+        0
+    }
+
+    /// Mock that records the largest single `write` call in the `usize` behind `ctx` —
+    /// sanity check that Rust streams with small chunks (≤ 64KB buffer + header fields).
+    extern "C" fn mock_write_max_chunk(
+        ctx: *mut c_void,
+        _data: *const u8,
+        len: usize,
+    ) -> i32 {
+        let max = unsafe { &mut *(ctx as *mut usize) };
+        if len > *max {
+            *max = len;
+        }
+        0
+    }
+
+    #[test]
+    fn ffi_full_path_streaming() {
+        unsafe {
+            let field = CString::new("f0").unwrap();
+            let mode = CString::new("mix").unwrap();
+            let dict = CString::new(dict_dir_from_env().to_str().unwrap()).unwrap();
+            let tokenizer = CString::new("paimon_jieba").unwrap();
+            let mut handle: *mut PaimonTantivyWriter = std::ptr::null_mut();
+            let st = paimon_tantivy_writer_new(
+                field.as_ptr(),
+                mode.as_ptr(),
+                true,
+                dict.as_ptr(),
+                tokenizer.as_ptr(),
+                &mut handle,
+            );
+            assert_eq!(st, PaimonTantivyStatus::Ok);
+            assert!(!handle.is_null());
+
+            let txt = "hello world";
+            let st =
+                paimon_tantivy_writer_add(handle, 42u64, txt.as_ptr() as *const c_char, txt.len());
+            assert_eq!(st, PaimonTantivyStatus::Ok);
+
+            // Streaming finish: collect bytes into a Vec<u8> via FFI callback
+            let mut out: Vec<u8> = Vec::new();
+            let cb = PaimonWriteCallbacks {
+                ctx: &mut out as *mut _ as *mut c_void,
+                write: mock_write_collect,
+            };
+            let mut rows: i64 = 0;
+            let st = paimon_tantivy_writer_finish_streaming(handle, cb, &mut rows);
+            assert_eq!(st, PaimonTantivyStatus::Ok);
+            assert_eq!(rows, 1);
+            // BE file_count at byte 0 must be > 0
+            let file_count = i32::from_be_bytes(out[0..4].try_into().unwrap());
+            assert!(file_count > 0);
+
+            // double finish must error
+            let mut out2: Vec<u8> = Vec::new();
+            let cb2 = PaimonWriteCallbacks {
+                ctx: &mut out2 as *mut _ as *mut c_void,
+                write: mock_write_collect,
+            };
+            let mut rows2: i64 = 0;
+            let st = paimon_tantivy_writer_finish_streaming(handle, cb2, &mut rows2);
+            assert_eq!(st, PaimonTantivyStatus::InvalidArgument); // "writer already finished" maps to InvalidArgument
+
+            paimon_tantivy_writer_free(handle);
+        }
+    }
+
+    #[test]
+    fn streaming_chunk_size_bounded_by_buffer() {
+        // After force-merge, a 200-doc index still streams in chunks ≤ 64KB
+        // (payload) / or small header-field chunks. Peak chunk ≤ 64KB.
+        let mut w =
+            PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap();
+        for i in 0..200u64 {
+            w.add(i, &format!("row {i} apple banana")).unwrap();
+        }
+        let mut max_chunk: usize = 0;
+        let cb = PaimonWriteCallbacks {
+            ctx: &mut max_chunk as *mut _ as *mut c_void,
+            write: mock_write_max_chunk,
+        };
+        let rows = w.finish_streaming(&cb).unwrap();
+        assert_eq!(rows, 200);
+        assert!(
+            max_chunk <= WRITER_STREAM_BUFFER_SIZE,
+            "streaming chunk size {} exceeded buffer {}",
+            max_chunk,
+            WRITER_STREAM_BUFFER_SIZE
+        );
+    }
+
+    #[test]
+    fn streaming_write_callback_error_propagates() {
+        extern "C" fn always_fail(_ctx: *mut c_void, _data: *const u8, _len: usize) -> i32 {
+            7
+        }
+        let mut w =
+            PaimonTantivyWriter::new("f0", TokenizeMode::Mix, true, &dict_dir_from_env(), "paimon_jieba").unwrap();
+        w.add(0, "hello").unwrap();
+        let cb = PaimonWriteCallbacks {
+            ctx: std::ptr::null_mut(),
+            write: always_fail,
+        };
+        let err = w.finish_streaming(&cb).unwrap_err();
+        assert!(err.contains("write callback rc=7"), "got: {err}"); // the callback's return code (7) must surface in the error
+    }
+
+    #[test]
+    fn ffi_null_writer_invalid() {
+        unsafe {
+            let txt = "x";
+            let st = paimon_tantivy_writer_add(
+                std::ptr::null_mut(),
+                0u64,
+                txt.as_ptr() as *const c_char,
+                txt.len(),
+            );
+            assert_eq!(st, PaimonTantivyStatus::InvalidArgument); // a null handle must be rejected with InvalidArgument
+        }
+    }
+}