diff --git a/benchmarks/duckdb-bench/src/lib.rs b/benchmarks/duckdb-bench/src/lib.rs
index fed9f82b004..308553852c0 100644
--- a/benchmarks/duckdb-bench/src/lib.rs
+++ b/benchmarks/duckdb-bench/src/lib.rs
@@ -90,7 +90,7 @@ impl DuckClient {
         let connection = db.connect()?;
         vortex_duckdb::initialize(&db)?;
 
-        // Enable Parquet metadata cache for all benchmark runs.
+        // Enable metadata caches for all benchmark runs.
         //
         // `parquet_metadata_cache` is an extension-specific option that's
         // only available after the Parquet extension is loaded. The Parquet
@@ -100,6 +100,7 @@ impl DuckClient {
         // "Invalid Input Error: The following options were not recognized:
         // parquet_metadata_cache" when running DuckDB in debug mode.
         connection.query("SET parquet_metadata_cache = true")?;
+        connection.query("SET vortex_metadata_cache = true")?;
 
         Ok((db, connection))
     }
diff --git a/vortex-duckdb/build.rs b/vortex-duckdb/build.rs
index e435e9ba063..e1ef334334f 100644
--- a/vortex-duckdb/build.rs
+++ b/vortex-duckdb/build.rs
@@ -20,7 +20,7 @@ const DUCKDB_SOURCE_COMMIT_URL: &str = "https://github.com/duckdb/duckdb/archive
 
 const BUILD_ARTIFACTS: [&str; 3] = ["libduckdb.dylib", "libduckdb.so", "libduckdb_static.a"];
 
-const SOURCE_FILES: [&str; 17] = [
+const SOURCE_FILES: [&str; 19] = [
     "cpp/client_context.cpp",
     "cpp/config.cpp",
     "cpp/copy_function.cpp",
@@ -30,6 +30,8 @@ const SOURCE_FILES: [&str; 17] = [
     "cpp/expr.cpp",
     "cpp/file_system.cpp",
     "cpp/logical_type.cpp",
+    "cpp/multi_file_function.cpp",
+    "cpp/object_cache.cpp",
     "cpp/replacement_scan.cpp",
     "cpp/reusable_dict.cpp",
     "cpp/scalar_function.cpp",
diff --git a/vortex-duckdb/cpp/file_system.cpp b/vortex-duckdb/cpp/file_system.cpp
index 11083ad6b86..562a12b0855 100644
--- a/vortex-duckdb/cpp/file_system.cpp
+++ b/vortex-duckdb/cpp/file_system.cpp
@@ -12,11 +12,21 @@ DUCKDB_INCLUDES_BEGIN
 #include <duckdb/main/client_context.hpp>
 DUCKDB_INCLUDES_END
 
+#include <memory>
 #include <utility>
 
 using namespace duckdb;
 using vortex::SetError;
 
+struct duckdb_vx_file_handle_ {
+    explicit duckdb_vx_file_handle_(shared_ptr<ClientContext> context_p, unique_ptr<FileHandle> handle_p)
+        : context(std::move(context_p)), handle(std::move(handle_p)) {
+    }
+    // Members are destroyed in reverse declaration order: `handle` first, then `context`.
+    shared_ptr<ClientContext> context;
+    unique_ptr<FileHandle> handle;
+};
+
 extern "C" duckdb_vx_file_handle
 duckdb_vx_fs_open(duckdb_client_context ctx, const char *path, duckdb_vx_error *error_out) {
     if (!ctx || !path) {
@@ -29,7 +39,7 @@ duckdb_vx_fs_open(duckdb_client_context ctx, const char *path, duckdb_vx_error *
     try {
         auto &fs = FileSystem::GetFileSystem(*client_context);
         auto handle = fs.OpenFile(path, FileFlags::FILE_FLAGS_READ | FileFlags::FILE_FLAGS_PARALLEL_ACCESS);
-        return reinterpret_cast<duckdb_vx_file_handle>(handle.release());
+        return new duckdb_vx_file_handle_(client_context->shared_from_this(), std::move(handle));
     } catch (const std::exception &e) {
         SetError(error_out, e.what());
         return nullptr;
@@ -50,7 +60,7 @@ duckdb_vx_fs_create(duckdb_client_context ctx, const char *path, duckdb_vx_error
     try {
         auto &fs = FileSystem::GetFileSystem(*client_context);
         auto handle = fs.OpenFile(path, flags);
-        return reinterpret_cast<duckdb_vx_file_handle>(handle.release());
+        return new duckdb_vx_file_handle_(client_context->shared_from_this(), std::move(handle));
     } catch (const std::exception &e) {
         SetError(error_out, e.what());
         return nullptr;
@@ -59,7 +69,7 @@ duckdb_vx_fs_create(duckdb_client_context ctx, const char *path, duckdb_vx_error
 
 extern "C" void duckdb_vx_fs_close(duckdb_vx_file_handle *handle) {
     if (handle && *handle) {
-        delete reinterpret_cast<FileHandle *>(std::exchange(*handle, nullptr));
+        delete std::exchange(*handle, nullptr);
     }
 }
 
@@ -70,7 +80,7 @@ duckdb_vx_fs_get_size(duckdb_vx_file_handle handle, idx_t *size_out, duckdb_vx_e
     }
 
     try {
-        *size_out = reinterpret_cast<FileHandle *>(handle)->GetFileSize();
+        *size_out = handle->handle->GetFileSize();
     } catch (const std::exception &e) {
         return SetError(error_out, e.what());
     }
@@ -88,7 +98,7 @@ extern "C" duckdb_state duckdb_vx_fs_read(duckdb_vx_file_handle handle,
     }
 
     try {
-        reinterpret_cast<FileHandle *>(handle)->Read(buffer, len, offset);
+        handle->handle->Read(buffer, len, offset);
         *out_len = len;
     } catch (const std::exception &e) {
         return SetError(error_out, e.what());
@@ -107,7 +117,7 @@ extern "C" duckdb_state duckdb_vx_fs_write(duckdb_vx_file_handle handle,
     }
 
     try {
-        reinterpret_cast<FileHandle *>(handle)->Write(QueryContext(), buffer, len, offset);
+        handle->handle->Write(QueryContext(), buffer, len, offset);
         *out_len = len;
     } catch (const std::exception &e) {
         return SetError(error_out, e.what());
@@ -144,7 +154,7 @@ extern "C" duckdb_state duckdb_vx_fs_sync(duckdb_vx_file_handle handle, duckdb_v
     }
 
     try {
-        reinterpret_cast<FileHandle *>(handle)->Sync();
+        handle->handle->Sync();
     } catch (const std::exception &e) {
         return SetError(error_out, e.what());
     }
diff --git a/vortex-duckdb/cpp/include/duckdb_vx.h b/vortex-duckdb/cpp/include/duckdb_vx.h
index dcad0ae1487..afe56803b2f 100644
--- a/vortex-duckdb/cpp/include/duckdb_vx.h
+++ b/vortex-duckdb/cpp/include/duckdb_vx.h
@@ -12,6 +12,8 @@
 #include "duckdb_vx/expr.h"
 #include "duckdb_vx/file_system.h"
 #include "duckdb_vx/logical_type.h"
+#include "duckdb_vx/multi_file_function.h"
+#include "duckdb_vx/object_cache.h"
 #include "duckdb_vx/reusable_dict.h"
 #include "duckdb_vx/replacement_scan.h"
 #include "duckdb_vx/scalar_function.h"
diff --git a/vortex-duckdb/cpp/include/duckdb_vx/multi_file_function.h b/vortex-duckdb/cpp/include/duckdb_vx/multi_file_function.h
new file mode 100644
index 00000000000..dd7350c0e98
--- /dev/null
+++ b/vortex-duckdb/cpp/include/duckdb_vx/multi_file_function.h
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+/**
+ * C ABI for registering a DuckDB MultiFileFunction-backed table function.
+ *
+ * Unlike duckdb_vx_tfunc_register (which wraps a single TableFunction), this exposes
+ * DuckDB's templated MultiFileFunction<OP> machinery: file globbing, per-file readers,
+ * hive partitioning, virtual columns, etc. are all driven by DuckDB itself; the
+ * extension only supplies a per-format reader.
+ *
+ * Lifecycle, mirroring DuckDB's Parquet reader:
+ *   1. create_options / initialize_bind_data / bind_reader collect bind-time options,
+ *      metadata, and schema.
+ *   2. init_global / init_local create per-query and per-worker state.
+ *   3. create_reader opens one file. DuckDB has dropped the global multi-file scheduling
+ *      mutex before this call and holds a per-file mutex for this reader.
+ *   4. prepare_reader maps the projection and filters onto the opened reader.
+ *   5. try_initialize_scan is called with DuckDB's global multi-file scheduling mutex held.
+ *      It must only claim one cheap unit of scan work into local state.
+ *   6. prepare_scan runs outside that scheduling mutex and initializes local scan state
+ *      for the work claimed by try_initialize_scan.
+ *   7. scan drains the local state prepared by prepare_scan into DuckDB chunks.
+ *
+ * Owned-pointer convention: every non-null pointer the extension returns is owned by
+ * DuckDB and must be released by the corresponding free_* callback. Borrowed pointers
+ * (passed in to callbacks) must not be freed.
+ */
+#pragma once
+
+#include "duckdb_vx/data.h"
+#include "error.h"
+#include "table_function.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque handles defined by the extension; each is released through the matching free_* callback.
+typedef struct duckdb_vx_mff_options_ *duckdb_vx_mff_options;     // create_options / free_options
+typedef struct duckdb_vx_mff_bind_data_ *duckdb_vx_mff_bind_data; // initialize_bind_data / free_bind_data
+typedef struct duckdb_vx_mff_global_ *duckdb_vx_mff_global;       // init_global / free_global
+typedef struct duckdb_vx_mff_local_ *duckdb_vx_mff_local;         // init_local / free_local
+typedef struct duckdb_vx_mff_reader_ *duckdb_vx_mff_reader;       // create_reader / free_reader
+
+// Opaque schema writer handed to bind_reader; filled via duckdb_vx_mff_schema_writer_add_column.
+typedef struct duckdb_vx_mff_schema_writer_ *duckdb_vx_mff_schema_writer;
+
+// A single scan column passed to prepare_reader. `name` is borrowed for the
+// duration of the call. `is_projected` distinguishes final output columns from
+// filter-only scan columns.
+typedef struct {
+    const char *name;   // column name; borrowed, see above
+    size_t name_len;    // length of `name` in bytes
+    uint64_t column_id; // virtual column id when `is_virtual`, else the reader-local column index
+    bool is_virtual;    // true when the column was announced to the reader as a virtual column
+    bool is_projected;  // false for columns needed only by pushed-down filters
+} duckdb_vx_mff_column;
+
+// Exact per-file partition statistics for DuckDB's aggregate/statistics
+// optimizer. Currently only row counts are exposed.
+typedef struct {
+    uint64_t row_count; // exact number of rows in the file
+} duckdb_vx_mff_partition_stats;
+
+// Writer for EXPLAIN/to_string output. This is a plain alias of
+// duckdb_vx_string_map (not a distinct type); the name only documents intent.
+typedef duckdb_vx_string_map duckdb_vx_mff_string_map;
+
+/**
+ * Append a column named `name` (`name_len` bytes) with logical type `type` to the bind
+ * schema. The name is copied and the type cloned, so both stay owned by the caller.
+ */
+void duckdb_vx_mff_schema_writer_add_column(duckdb_vx_mff_schema_writer writer,
+                                            const char *name,
+                                            size_t name_len,
+                                            duckdb_logical_type type);
+
+// vtable mirroring the subset of MultiFileReaderInterface + BaseFileReader we expose.
+// All callbacks are required and must be non-null.
+typedef struct {
+    /** Function name, e.g. "read_vortex". Must outlive the registered function. */
+    const char *name;
+
+    /** Whether DuckDB may pass pushed table filters to prepare_reader. */
+    bool filter_pushdown;
+
+    /** Whether DuckDB may omit filter-only columns from final table-scan output. */
+    bool filter_prune;
+
+    /**
+     * Try to push a complex filter expression into bind data. Returns true when
+     * the filter is handled exactly and DuckDB may remove the standalone filter.
+     */
+    bool (*pushdown_complex_filter)(duckdb_vx_mff_bind_data bind_data,
+                                    duckdb_vx_expr expr,
+                                    duckdb_vx_error *error_out);
+
+    // ---------------------------------------------------------------------
+    // Options lifecycle
+    // ---------------------------------------------------------------------
+
+    /** Create a fresh, default options object. Called once per bind. */
+    duckdb_vx_mff_options (*create_options)(duckdb_client_context ctx, duckdb_vx_error *error);
+    /** Release options created by create_options. Must accept null. */
+    void (*free_options)(duckdb_vx_mff_options options);
+
+    // ---------------------------------------------------------------------
+    // Bind lifecycle
+    // ---------------------------------------------------------------------
+
+    /**
+     * Initialize bind data from options. Called once per bind, after options.
+     * Takes ownership of `options` (must be freed via free_options if the
+     * extension does not retain it).
+     */
+    duckdb_vx_mff_bind_data (*initialize_bind_data)(duckdb_vx_mff_options options,
+                                                    duckdb_vx_error *error);
+    /** Clone bind data. Used when DuckDB rewrites plans, e.g. late materialization. */
+    duckdb_vx_mff_bind_data (*clone_bind_data)(duckdb_vx_mff_bind_data bind_data,
+                                               duckdb_vx_error *error);
+    /** Release bind data. Must accept null. */
+    void (*free_bind_data)(duckdb_vx_mff_bind_data bind_data);
+
+    /**
+     * Bind the reader's schema. Called by DuckDB after the first file in the
+     * file list is known. The extension should open the file (or a metadata-
+     * only handle) and append result columns via the schema_writer.
+     *
+     * `first_file_path` is borrowed (not nul-terminated, length given).
+     */
+    void (*bind_reader)(duckdb_client_context ctx,
+                        duckdb_vx_mff_bind_data bind_data,
+                        const char *first_file_path,
+                        size_t path_len,
+                        duckdb_vx_mff_schema_writer schema_out,
+                        duckdb_vx_error *error);
+
+    // ---------------------------------------------------------------------
+    // Per-query state lifecycle (global: per query; local: per worker)
+    // ---------------------------------------------------------------------
+
+    duckdb_vx_mff_global (*init_global)(duckdb_client_context ctx,
+                                        duckdb_vx_mff_bind_data bind_data,
+                                        duckdb_vx_error *error);
+    void (*free_global)(duckdb_vx_mff_global global); // releases init_global's result
+
+    duckdb_vx_mff_local (*init_local)(duckdb_vx_mff_global global); // note: no error out-parameter
+    void (*free_local)(duckdb_vx_mff_local local);                  // releases init_local's result
+
+    // ---------------------------------------------------------------------
+    // Per-file reader lifecycle
+    // ---------------------------------------------------------------------
+
+    /**
+     * Open a per-file reader. Called once per file when DuckDB first opens
+     * that file for scanning. This may open file metadata, but should not do
+     * per-scan work because projection/filter state has not been prepared yet.
+     */
+    duckdb_vx_mff_reader (*create_reader)(duckdb_client_context ctx,
+                                          duckdb_vx_mff_global global,
+                                          duckdb_vx_mff_bind_data bind_data,
+                                          const char *file_path,
+                                          size_t path_len,
+                                          size_t file_idx,
+                                          duckdb_vx_error *error);
+    void (*free_reader)(duckdb_vx_mff_reader reader); // releases create_reader's result
+
+    /**
+     * Configure the reader with the columns it should produce and any filters
+     * pushed down by DuckDB. Called once per (reader, scan) pair before any
+     * try_initialize_scan / scan calls. `projection` is the ordered list of
+     * intermediate scan columns DuckDB needs the chunks to contain. Columns
+     * marked `is_projected=false` are only needed for pushed filters and are
+     * not referenced by DuckDB's final output expressions. `filters` may be
+     * null when no filters were pushed down.
+     */
+    void (*prepare_reader)(duckdb_vx_mff_reader reader,
+                           const duckdb_vx_mff_column *projection,
+                           size_t projection_count,
+                           duckdb_vx_table_filter_set filters,
+                           duckdb_vx_error *error);
+
+    /**
+     * Try to initialize a scan over `reader`. Returns true if a scan can begin,
+     * false if the reader is exhausted. Called with DuckDB's multi-file global
+     * scheduling mutex held; must not block on I/O, run async work, or build
+     * expensive scan pipelines. Store only the claimed work descriptor in `local`.
+     */
+    bool (*try_initialize_scan)(duckdb_vx_mff_reader reader,
+                                duckdb_vx_mff_global global,
+                                duckdb_vx_mff_local local,
+                                duckdb_vx_error *error);
+
+    /**
+     * Prepare local scan state for the work claimed by try_initialize_scan.
+     * Called outside DuckDB's multi-file global scheduling mutex, mirroring
+     * DuckDB's BaseFileReader::PrepareScan hook.
+     */
+    void (*prepare_scan)(duckdb_vx_mff_reader reader,
+                         duckdb_vx_mff_global global,
+                         duckdb_vx_mff_local local,
+                         duckdb_vx_error *error);
+
+    /**
+     * Produce the next batch of data into `chunk_out`. Called outside DuckDB's
+     * multi-file global scheduling mutex after prepare_scan. Returns:
+     *   - true with chunk size > 0  : more data may follow.
+     *   - true with chunk size == 0 : reader is exhausted; DuckDB will move on.
+     *   - false                     : an error occurred (see `error`).
+     */
+    bool (*scan)(duckdb_vx_mff_reader reader,
+                 duckdb_vx_mff_global global,
+                 duckdb_vx_mff_local local,
+                 duckdb_data_chunk chunk_out,
+                 duckdb_vx_error *error);
+
+    /**
+     * Get bind-time per-column statistics by name. Used when DuckDB asks for
+     * scan stats after copying bind data, before a per-file reader exists.
+     * Returns false if no stats are available.
+     */
+    bool (*statistics)(duckdb_vx_mff_bind_data bind_data,
+                       const char *col_name,
+                       size_t name_len,
+                       duckdb_column_statistics *stats_out);
+
+    /**
+     * Get per-column statistics by name. Returns false if no stats are
+     * available. Same convention as duckdb_vx_tfunc_vtab_t::statistics.
+     */
+    bool (*get_statistics)(duckdb_vx_mff_reader reader,
+                           const char *col_name,
+                           size_t name_len,
+                           duckdb_column_statistics *stats_out);
+
+    /** Scan progress within a file in [0.0, 100.0]. */
+    double (*progress_in_file)(duckdb_vx_mff_reader reader);
+
+    /**
+     * Estimated cardinality across `file_count` files. Returning false leaves
+     * cardinality unknown (DuckDB falls back to its own heuristic).
+     */
+    bool (*cardinality)(duckdb_vx_mff_bind_data bind_data,
+                        size_t file_count,
+                        duckdb_vx_node_statistics *out);
+
+    /**
+     * Get exact row count statistics for one file. Returning false means the
+     * stats are not currently available; DuckDB will skip statistics-based
+     * aggregate rewrites unless every file returns exact stats.
+     */
+    bool (*partition_stats)(duckdb_client_context ctx,
+                            duckdb_vx_mff_bind_data bind_data,
+                            const char *file_path,
+                            size_t path_len,
+                            duckdb_vx_mff_partition_stats *out,
+                            duckdb_vx_error *error);
+
+    /**
+     * Populate the bind-time EXPLAIN map with key/value pairs (e.g. "Filters",
+     * "Projection"). Called whenever DuckDB renders the table function in an
+     * EXPLAIN output.
+     */
+    void (*to_string)(duckdb_vx_mff_bind_data bind_data, duckdb_vx_mff_string_map map);
+} duckdb_vx_mff_vtab_t;
+
+/**
+ * Register the multi-file function described by `vtab` against `ffi_db`. The
+ * vtab struct is copied by value into a catalog-owned TableFunctionInfo, so the
+ * caller may free it after this returns; `vtab->name` must still outlive the function.
+ */
+duckdb_state duckdb_vx_mff_register(duckdb_database ffi_db, const duckdb_vx_mff_vtab_t *vtab);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/vortex-duckdb/cpp/include/duckdb_vx/object_cache.h b/vortex-duckdb/cpp/include/duckdb_vx/object_cache.h
new file mode 100644
index 00000000000..3ac4ddeef85
--- /dev/null
+++ b/vortex-duckdb/cpp/include/duckdb_vx/object_cache.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#pragma once
+
+#include "duckdb_vx/duckdb_diagnostics.h"
+
+DUCKDB_INCLUDES_BEGIN
+#include <duckdb.h>
+DUCKDB_INCLUDES_END
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct duckdb_vx_object_cache_entry_ *duckdb_vx_object_cache_entry;
+// Look up `key` (`key_len` bytes) for `object_type`. NOTE(review): presumably null on a miss — confirm.
+duckdb_vx_object_cache_entry duckdb_vx_object_cache_get(duckdb_client_context ctx,
+                                                         const char *key,
+                                                         size_t key_len,
+                                                         const char *object_type);
+// Access the payload of `entry`. NOTE(review): presumably the `data` pointer given to put — confirm.
+void *duckdb_vx_object_cache_entry_get_data(duckdb_vx_object_cache_entry entry);
+// Release an entry handle; takes a pointer to the handle. NOTE(review): presumably nulls it — confirm.
+void duckdb_vx_object_cache_entry_free(duckdb_vx_object_cache_entry *entry);
+// Store `data` under `key`. NOTE(review): `delete_callback` presumably destroys `data` on eviction — confirm.
+duckdb_state duckdb_vx_object_cache_put(duckdb_client_context ctx,
+                                        const char *key,
+                                        size_t key_len,
+                                        const char *object_type,
+                                        idx_t estimated_memory,
+                                        void *data,
+                                        duckdb_delete_callback_t delete_callback);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/vortex-duckdb/cpp/multi_file_function.cpp b/vortex-duckdb/cpp/multi_file_function.cpp
new file mode 100644
index 00000000000..e81d14bd82c
--- /dev/null
+++ b/vortex-duckdb/cpp/multi_file_function.cpp
@@ -0,0 +1,705 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+/**
+ * C++ adapters that bridge a duckdb_vx_mff_vtab_t to DuckDB's MultiFileFunction<OP>.
+ *
+ * Layered design:
+ *   - VortexBaseFileReaderOptions : BaseFileReaderOptions   - opaque options handle
+ *   - VortexFileReader            : BaseFileReader          - per-file scan adapter
+ *   - VortexMultiFileReaderInterface : MultiFileReaderInterface - cross-file orchestrator
+ *   - VortexMultiFileFunctionOp                              - OP type for MultiFileFunction<OP>
+ *
+ * Each adapter holds a non-owning pointer to the registered vtab and an
+ * extension-owned FFI handle. The FFI handle is freed via the vtab's free_*
+ * callback in the destructor.
+ */
+
+#include "duckdb_vx/data.hpp"
+#include "duckdb_vx/duckdb_diagnostics.h"
+#include "duckdb_vx/error.hpp"
+#include "duckdb_vx/multi_file_function.h"
+
+#include <cstring>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+DUCKDB_INCLUDES_BEGIN
+#include "duckdb.h"
+#include "duckdb/catalog/catalog.hpp"
+#include "duckdb/common/multi_file/base_file_reader.hpp"
+#include "duckdb/common/multi_file/multi_file_data.hpp"
+#include "duckdb/common/multi_file/multi_file_function.hpp"
+#include "duckdb/common/multi_file/multi_file_reader.hpp"
+#include "duckdb/common/multi_file/multi_file_states.hpp"
+#include "duckdb/function/partition_stats.hpp"
+#include "duckdb/main/capi/capi_internal.hpp"
+#include "duckdb/parser/parsed_data/create_table_function_info.hpp"
+DUCKDB_INCLUDES_END
+
+using namespace duckdb;
+using vortex::IntoErrString;
+constexpr column_t COLUMN_IDENTIFIER_FILE_INDEX = MultiFileReader::COLUMN_IDENTIFIER_FILE_INDEX; // local alias
+constexpr column_t COLUMN_IDENTIFIER_FILE_ROW_NUMBER = MultiFileReader::COLUMN_IDENTIFIER_FILE_ROW_NUMBER; // ditto
+
+namespace {
+
+/**
+ * TableFunctionInfo attached to the catalog-owned function. It stores a private
+ * copy of the vtable so per-bind/per-file adapters can reach it without a registry.
+ */
+struct VortexMultiFileFunctionInfo : TableFunctionInfo {
+    explicit VortexMultiFileFunctionInfo(const duckdb_vx_mff_vtab_t &source_vtab) : vtab(source_vtab) {
+    }
+
+    const duckdb_vx_mff_vtab_t vtab;
+};
+
+class VortexBaseFileReaderOptions : public BaseFileReaderOptions {
+public:
+    VortexBaseFileReaderOptions(const duckdb_vx_mff_vtab_t &vtab, duckdb_vx_mff_options handle)
+        : vtab(vtab), handle(handle) {
+    }
+    /** Non-copyable: a copy would hand the same handle to free_options twice. */
+    VortexBaseFileReaderOptions(const VortexBaseFileReaderOptions &) = delete;
+    ~VortexBaseFileReaderOptions() override {
+        if (handle) {
+            vtab.free_options(handle);
+        }
+    }
+
+    /** Give the FFI handle to the caller; afterwards the destructor frees nothing. */
+    duckdb_vx_mff_options Release() {
+        return std::exchange(handle, nullptr);
+    }
+
+    const duckdb_vx_mff_vtab_t &vtab;
+
+private:
+    duckdb_vx_mff_options handle;
+};
+
+/**
+ * Function data wrapping the extension's bind-data handle. The handle is freed
+ * through free_bind_data on destruction and duplicated through clone_bind_data.
+ */
+struct VortexMultiFileBindData : public TableFunctionData {
+    VortexMultiFileBindData(const duckdb_vx_mff_vtab_t &vtab_p, duckdb_vx_mff_bind_data handle_p)
+        : vtab(vtab_p), handle(handle_p) {
+    }
+    ~VortexMultiFileBindData() override {
+        if (handle != nullptr) {
+            vtab.free_bind_data(handle);
+        }
+    }
+
+    bool SupportStatementCache() const override {
+        return false;
+    }
+
+    unique_ptr<FunctionData> Copy() const override {
+        duckdb_vx_error clone_error = nullptr;
+        const auto cloned_handle = vtab.clone_bind_data(handle, &clone_error);
+        if (clone_error != nullptr) {
+            throw InternalException(IntoErrString(clone_error));
+        }
+        return make_uniq<VortexMultiFileBindData>(vtab, cloned_handle);
+    }
+
+    const duckdb_vx_mff_vtab_t &vtab;
+    duckdb_vx_mff_bind_data handle;
+};
+
+/**
+ * Interface-owned global state for one multi-file scan. This is separate from
+ * DuckDB's MultiFileGlobalState, which is only borrowed here through a pointer.
+ */
+class VortexInterfaceGlobalState : public GlobalTableFunctionState {
+public:
+    VortexInterfaceGlobalState(const duckdb_vx_mff_vtab_t &vtab_p,
+                               duckdb_vx_mff_global handle_p,
+                               const MultiFileGlobalState &multi_file_state_p)
+        : vtab(vtab_p), handle(handle_p), multi_file_state(&multi_file_state_p) {
+    }
+    ~VortexInterfaceGlobalState() override {
+        if (handle != nullptr) {
+            vtab.free_global(handle);
+        }
+    }
+
+    const duckdb_vx_mff_vtab_t &vtab;
+    duckdb_vx_mff_global handle;
+    const MultiFileGlobalState *multi_file_state;
+};
+
+class VortexInterfaceLocalState : public LocalTableFunctionState {
+public:
+    VortexInterfaceLocalState(const duckdb_vx_mff_vtab_t &vtab_p, duckdb_vx_mff_local handle_p)
+        : vtab(vtab_p), handle(handle_p) {
+    }
+    ~VortexInterfaceLocalState() override {
+        if (handle != nullptr) {
+            vtab.free_local(handle); // hand the local state back to the extension
+        }
+    }
+
+    const duckdb_vx_mff_vtab_t &vtab;
+    duckdb_vx_mff_local handle;
+};
+
+static Value &UnwrapValue(duckdb_value value) { // reinterprets the C handle as the Value it is assumed to wrap
+    return *reinterpret_cast<Value *>(value);
+}
+
+void DestroyValues(duckdb_column_statistics &stats) {
+    // Release whichever of the min/max bounds is set.
+    for (duckdb_value *bound : {&stats.min, &stats.max}) {
+        if (*bound) {
+            duckdb_destroy_value(bound);
+        }
+    }
+}
+
+unique_ptr<BaseStatistics> NumericStatsFrom(duckdb_column_statistics &stats, const LogicalType &type) {
+    auto numeric = BaseStatistics::CreateUnknown(type);
+    if (stats.min != nullptr) { // copy the bound into the stats, then release the C value
+        NumericStats::SetMin(numeric, UnwrapValue(stats.min));
+        duckdb_destroy_value(&stats.min);
+    }
+    if (stats.max != nullptr) {
+        NumericStats::SetMax(numeric, UnwrapValue(stats.max));
+        duckdb_destroy_value(&stats.max);
+    }
+    if (!stats.has_null) {
+        numeric.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
+    }
+    return numeric.ToUnique();
+}
+
+unique_ptr<BaseStatistics> StringStatsFrom(duckdb_column_statistics &stats, const LogicalType &type) {
+    auto textual = BaseStatistics::CreateUnknown(type);
+    if (stats.min != nullptr) { // copy the bound into the stats, then release the C value
+        StringStats::SetMin(textual, StringValue::Get(UnwrapValue(stats.min)));
+        duckdb_destroy_value(&stats.min);
+    }
+    if (stats.max != nullptr) {
+        StringStats::SetMax(textual, StringValue::Get(UnwrapValue(stats.max)));
+        duckdb_destroy_value(&stats.max);
+    }
+    if ((stats.max_string_length >> 63) != 0) { // NOTE(review): gated on bit 63 only — confirm the encoding
+        StringStats::SetMaxStringLength(textual, static_cast<uint32_t>(stats.max_string_length));
+    }
+    if (!stats.has_null) {
+        textual.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
+    }
+    return textual.ToUnique();
+}
+
+unique_ptr<BaseStatistics> BaseStatsFrom(duckdb_column_statistics &stats, const LogicalType &type) {
+    auto generic = BaseStatistics::CreateUnknown(type);
+    DestroyValues(stats); // min/max are not used for this kind of stats; release them
+    if (!stats.has_null) {
+        generic.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
+    }
+    return generic.ToUnique();
+}
+
+unique_ptr<BaseStatistics> ColumnStatsFrom(duckdb_column_statistics &stats, const LogicalType &type) {
+    switch (type.id()) {
+    case LogicalTypeId::BOOLEAN: // boolean, integer and floating-point types share the numeric path
+    case LogicalTypeId::TINYINT:
+    case LogicalTypeId::UTINYINT:
+    case LogicalTypeId::SMALLINT:
+    case LogicalTypeId::USMALLINT:
+    case LogicalTypeId::INTEGER:
+    case LogicalTypeId::UINTEGER:
+    case LogicalTypeId::BIGINT:
+    case LogicalTypeId::UBIGINT:
+    case LogicalTypeId::HUGEINT:
+    case LogicalTypeId::UHUGEINT:
+    case LogicalTypeId::FLOAT:
+    case LogicalTypeId::DOUBLE:
+        return NumericStatsFrom(stats, type);
+    case LogicalTypeId::VARCHAR: // VARCHAR and BLOB share the string path
+    case LogicalTypeId::BLOB:
+        return StringStatsFrom(stats, type);
+    case LogicalTypeId::STRUCT: // no statistics are produced for structs; only release the bounds
+        DestroyValues(stats);
+        return nullptr;
+    default: // every other type only reports whether nulls can occur
+        return BaseStatsFrom(stats, type);
+    }
+}
+
+/**
+ * Per-file reader adapter. DuckDB's MultiFileFunction<OP> drives one of these
+ * per opened file; each Scan call asks the extension for the next chunk.
+ */
+class VortexFileReader : public BaseFileReader {
+public:
+    VortexFileReader(OpenFileInfo open_file,
+                     const duckdb_vx_mff_vtab_t &vtable,
+                     duckdb_vx_mff_reader reader_handle)
+        : BaseFileReader(std::move(open_file)), vtab(vtable), handle(reader_handle) {
+    }
+    ~VortexFileReader() override {
+        if (handle != nullptr) {
+            vtab.free_reader(handle); // hand the reader back to the extension
+        }
+    }
+
+    string GetReaderType() const override { // constant reader type label
+        return "vortex";
+    }
+
+    void AddVirtualColumn(column_t virtual_column_id) override {
+        if (columns.empty()) { // there is no last column to attach the virtual id to
+            throw InternalException("Vortex reader received virtual column before column registration");
+        }
+        virtual_column_ids[columns.size() - 1] = virtual_column_id; // keyed by the index of the last column
+    }
+
+    void PrepareReader(ClientContext &, GlobalTableFunctionState &gstate) override {
+        // 1. Collect the ids of the columns that appear in the final projection
+        //    (an empty set means every scan column counts as projected).
+        // 2. Describe each scan column for the extension.
+        // 3. Pass DuckDB's TableFilterSet through as a borrowed pointer.
+        auto &global = gstate.Cast<VortexInterfaceGlobalState>();
+        const MultiFileGlobalState *mf_state = global.multi_file_state;
+        std::unordered_set<column_t> projected_column_ids;
+        if (mf_state && !mf_state->projection_ids.empty()) {
+            projected_column_ids.reserve(mf_state->projection_ids.size());
+            for (const auto &projection_id : mf_state->projection_ids) {
+                if (projection_id >= mf_state->column_indexes.size()) {
+                    throw InternalException("Vortex projection id out of range");
+                }
+                projected_column_ids.insert(mf_state->column_indexes[projection_id].GetPrimaryIndex());
+            }
+        }
+
+        std::vector<duckdb_vx_mff_column> ffi_cols;
+        ffi_cols.reserve(column_ids.size());
+        for (idx_t slot = 0; slot < column_ids.size(); ++slot) {
+            // The local id indexes this reader's `columns`. A column is virtual
+            // when AddVirtualColumn recorded an id under that index; the recorded
+            // id is then reported to the extension instead of the local index.
+            auto local_id = column_ids[MultiFileLocalIndex(slot)];
+            const auto &entry = columns[local_id];
+            auto virtual_entry = virtual_column_ids.find(local_id.GetId());
+            const bool is_virtual = virtual_entry != virtual_column_ids.end();
+            const auto column_id = is_virtual ? virtual_entry->second : local_id.GetId();
+            const bool is_projected =
+                projected_column_ids.empty() || projected_column_ids.count(column_id) != 0;
+            ffi_cols.push_back({entry.name.c_str(), entry.name.size(), column_id, is_virtual, is_projected});
+        }
+        duckdb_vx_error error_out = nullptr;
+        auto borrowed_filters = reinterpret_cast<duckdb_vx_table_filter_set>(filters.get());
+        vtab.prepare_reader(handle, ffi_cols.data(), ffi_cols.size(), borrowed_filters, &error_out);
+        if (error_out) {
+            throw IOException(IntoErrString(error_out));
+        }
+    }
+
+    /**
+     * Scan-assignment hook: forwards this reader's handle together with the
+     * global and local state handles to the vtab's `try_initialize_scan`
+     * callback and returns the callback's boolean result unchanged.
+     *
+     * Throws IOException carrying the FFI error message when the callback
+     * reports an error.
+     */
+    bool TryInitializeScan(ClientContext &,
+                           GlobalTableFunctionState &gstate,
+                           LocalTableFunctionState &lstate) override {
+        auto &global_state = gstate.Cast<VortexInterfaceGlobalState>();
+        auto &local_state = lstate.Cast<VortexInterfaceLocalState>();
+
+        duckdb_vx_error ffi_error = nullptr;
+        const bool claimed =
+            vtab.try_initialize_scan(handle, global_state.handle, local_state.handle, &ffi_error);
+        if (ffi_error != nullptr) {
+            throw IOException(IntoErrString(ffi_error));
+        }
+        return claimed;
+    }
+
+    /**
+     * Forwards to the vtab's `prepare_scan` callback with this reader's handle
+     * and the global/local state handles.
+     *
+     * Throws IOException carrying the FFI error message when the callback
+     * reports an error.
+     */
+    void PrepareScan(ClientContext &,
+                     GlobalTableFunctionState &gstate,
+                     LocalTableFunctionState &lstate) override {
+        auto &global_state = gstate.Cast<VortexInterfaceGlobalState>();
+        auto &local_state = lstate.Cast<VortexInterfaceLocalState>();
+
+        duckdb_vx_error ffi_error = nullptr;
+        vtab.prepare_scan(handle, global_state.handle, local_state.handle, &ffi_error);
+        if (ffi_error != nullptr) {
+            throw IOException(IntoErrString(ffi_error));
+        }
+    }
+
+    /**
+     * Fills `chunk` through the vtab's `scan` callback.
+     *
+     * Returns FINISHED when the callback left the chunk empty, so the
+     * multi-file scanner advances to the next file, and HAVE_MORE_OUTPUT
+     * otherwise.
+     *
+     * Throws IOException when the callback reports an error or returns false.
+     */
+    AsyncResult Scan(ClientContext &,
+                     GlobalTableFunctionState &gstate,
+                     LocalTableFunctionState &lstate,
+                     DataChunk &chunk) override {
+        auto &g = gstate.Cast<VortexInterfaceGlobalState>();
+        auto &l = lstate.Cast<VortexInterfaceLocalState>();
+        duckdb_vx_error error_out = nullptr;
+        auto chunk_handle = reinterpret_cast<duckdb_data_chunk>(&chunk);
+        const bool ok = vtab.scan(handle, g.handle, l.handle, chunk_handle, &error_out);
+        if (error_out) {
+            throw IOException(IntoErrString(error_out));
+        }
+        if (!ok) {
+            // The callback signalled failure without filling in an error. Every
+            // other call site only hands a non-null error to IntoErrString, so
+            // report a fixed message here instead of converting a null handle.
+            throw IOException("Vortex scan failed without reporting an error");
+        }
+        // Translate "0 rows" into FINISHED so the multi-file scanner advances
+        // to the next file. Otherwise, signal we may have more.
+        return chunk.size() == 0 ? AsyncResult(SourceResultType::FINISHED)
+                                 : AsyncResult(SourceResultType::HAVE_MORE_OUTPUT);
+    }
+
+    /**
+     * Looks up `name` in this reader's `columns` and, for the first match,
+     * asks the vtab's `get_statistics` callback for that column's statistics.
+     *
+     * Returns nullptr when no column has that name or when the callback
+     * reports that no statistics are available.
+     */
+    unique_ptr<BaseStatistics> GetStatistics(ClientContext &, const string &name) override {
+        for (const auto &column : columns) {
+            if (column.name == name) {
+                duckdb_column_statistics ffi_stats = {};
+                if (vtab.get_statistics(handle, name.c_str(), name.size(), &ffi_stats)) {
+                    return ColumnStatsFrom(ffi_stats, column.type);
+                }
+                // Only the first column with a matching name is consulted.
+                return nullptr;
+            }
+        }
+        return nullptr;
+    }
+
+    /** Returns the value reported by the vtab's `progress_in_file` callback for this reader. */
+    double GetProgressInFile(ClientContext &) override {
+        const double progress = vtab.progress_in_file(handle);
+        return progress;
+    }
+
+private:
+    const duckdb_vx_mff_vtab_t &vtab;
+    duckdb_vx_mff_reader handle;
+    std::unordered_map<idx_t, column_t> virtual_column_ids;
+};
+
+/**
+ * Bridges DuckDB's MultiFileReaderInterface to the Vortex FFI vtab.
+ *
+ * Each override forwards to the matching vtab callback and turns a reported
+ * FFI error into a DuckDB exception. Copy options and extra named options are
+ * declined, the UNION BY NAME reader path is rejected, and `file_row_number`
+ * is the only virtual column advertised here. The vtab pointer is resolved
+ * from the TableFunctionInfo in InitializeOptions and carried over by Copy().
+ */
+class VortexMultiFileReaderInterface : public MultiFileReaderInterface {
+public:
+    VortexMultiFileReaderInterface() = default;
+
+    /** Resolves the vtab from `info`, then asks it to create the reader options handle. */
+    unique_ptr<BaseFileReaderOptions> InitializeOptions(ClientContext &context,
+                                                        optional_ptr<TableFunctionInfo> info) override {
+        if (!info) {
+            throw BinderException("Vortex multi-file function requires TableFunctionInfo");
+        }
+        vtab = &info->Cast<VortexMultiFileFunctionInfo>().vtab;
+
+        const auto &callbacks = Vtab();
+        duckdb_vx_error ffi_error = nullptr;
+        auto options_handle = callbacks.create_options(AsFfiContext(context), &ffi_error);
+        ThrowOnError<BinderException>(ffi_error);
+        return make_uniq<VortexBaseFileReaderOptions>(callbacks, options_handle);
+    }
+
+    /** No COPY options are recognised; always reports "not handled". */
+    bool ParseCopyOption(ClientContext &, const string &, const vector<Value> &,
+                         BaseFileReaderOptions &, vector<string> &, vector<LogicalType> &) override {
+        return false;
+    }
+
+    /** No format-specific named options are recognised; always reports "not handled". */
+    bool ParseOption(ClientContext &, const string &, const Value &, MultiFileOptions &,
+                     BaseFileReaderOptions &) override {
+        return false;
+    }
+
+    /** Converts the options object into bind data via the vtab's `initialize_bind_data`. */
+    unique_ptr<TableFunctionData> InitializeBindData(MultiFileBindData &,
+                                                     unique_ptr<BaseFileReaderOptions> options) override {
+        const auto &callbacks = Vtab();
+        auto &reader_options = options->Cast<VortexBaseFileReaderOptions>();
+
+        // The options handle is released from its wrapper and handed to the FFI.
+        duckdb_vx_error ffi_error = nullptr;
+        auto bind_handle = callbacks.initialize_bind_data(reader_options.Release(), &ffi_error);
+        ThrowOnError<BinderException>(ffi_error);
+        return make_uniq<VortexMultiFileBindData>(callbacks, bind_handle);
+    }
+
+    /** Lets the vtab's `bind_reader` append the schema of the first file to `names`/`return_types`. */
+    void BindReader(ClientContext &context, vector<LogicalType> &return_types, vector<string> &names,
+                    MultiFileBindData &bind_data) override {
+        const auto &callbacks = Vtab();
+        auto first_file = bind_data.file_list->GetFirstFile();
+        auto &vortex_bind = bind_data.bind_data->Cast<VortexMultiFileBindData>();
+
+        // Pair of output vectors handed to the FFI as an opaque schema writer;
+        // duckdb_vx_mff_schema_writer_add_column appends to them.
+        struct SchemaWriter {
+            vector<string> &names;
+            vector<LogicalType> &types;
+        };
+        SchemaWriter schema_sink {names, return_types};
+        auto ffi_writer = reinterpret_cast<duckdb_vx_mff_schema_writer>(&schema_sink);
+
+        duckdb_vx_error ffi_error = nullptr;
+        callbacks.bind_reader(AsFfiContext(context), vortex_bind.handle, first_file.path.c_str(),
+                              first_file.path.size(), ffi_writer, &ffi_error);
+        ThrowOnError<BinderException>(ffi_error);
+    }
+
+    /** Creates the global scan state through the vtab's `init_global`. */
+    unique_ptr<GlobalTableFunctionState> InitializeGlobalState(ClientContext &context,
+                                                                MultiFileBindData &bind_data,
+                                                                MultiFileGlobalState &multi_file_state) override {
+        const auto &callbacks = Vtab();
+        auto &vortex_bind = bind_data.bind_data->Cast<VortexMultiFileBindData>();
+
+        duckdb_vx_error ffi_error = nullptr;
+        auto global_handle = callbacks.init_global(AsFfiContext(context), vortex_bind.handle, &ffi_error);
+        ThrowOnError<BinderException>(ffi_error);
+        return make_uniq<VortexInterfaceGlobalState>(callbacks, global_handle, multi_file_state);
+    }
+
+    /** Creates the local scan state through the vtab's `init_local`. */
+    unique_ptr<LocalTableFunctionState> InitializeLocalState(ExecutionContext &,
+                                                              GlobalTableFunctionState &gstate) override {
+        const auto &callbacks = Vtab();
+        auto &global_state = gstate.Cast<VortexInterfaceGlobalState>();
+        auto local_handle = callbacks.init_local(global_state.handle);
+        return make_uniq<VortexInterfaceLocalState>(callbacks, local_handle);
+    }
+
+    /** Advertises `file_row_number` (BIGINT) as a virtual column. */
+    void GetVirtualColumns(ClientContext &, MultiFileBindData &, virtual_column_map_t &result) override {
+        TableColumn file_row_number("file_row_number", LogicalType::BIGINT);
+        result.insert(make_pair(COLUMN_IDENTIFIER_FILE_ROW_NUMBER, std::move(file_row_number)));
+    }
+
+    /** Reader creation from union data (the UNION BY NAME path) is not supported yet. */
+    shared_ptr<BaseFileReader> CreateReader(ClientContext &, GlobalTableFunctionState &, BaseUnionData &,
+                                            const MultiFileBindData &) override {
+        throw NotImplementedException("UNION BY NAME is not yet supported by the Vortex multi-file function");
+    }
+
+    /** Opens `file` through the vtab's `create_reader` and wraps the handle in a VortexFileReader. */
+    shared_ptr<BaseFileReader> CreateReader(ClientContext &context, GlobalTableFunctionState &gstate,
+                                            const OpenFileInfo &file, idx_t file_idx,
+                                            const MultiFileBindData &bind_data) override {
+        const auto &callbacks = Vtab();
+        auto &vortex_bind = bind_data.bind_data->Cast<VortexMultiFileBindData>();
+        auto &global_state = gstate.Cast<VortexInterfaceGlobalState>();
+
+        duckdb_vx_error ffi_error = nullptr;
+        auto reader_handle = callbacks.create_reader(AsFfiContext(context), global_state.handle, vortex_bind.handle,
+                                                     file.path.c_str(), file.path.size(), file_idx, &ffi_error);
+        ThrowOnError<IOException>(ffi_error);
+
+        auto file_reader = make_shared_ptr<VortexFileReader>(file, callbacks, reader_handle);
+        // `columns` is the file-local schema the multi-file reader uses to
+        // build the global<->local column mapping. Per-file schema variation
+        // is not supported yet, so the bind-time global schema is reused.
+        file_reader->columns = bind_data.columns;
+        return file_reader;
+    }
+
+    /** Uses the vtab's `cardinality` estimate, falling back to the base class when it declines. */
+    unique_ptr<NodeStatistics> GetCardinality(ClientContext &context, const MultiFileBindData &data,
+                                              idx_t file_count) override {
+        const auto &callbacks = Vtab();
+        auto &vortex_bind = data.bind_data->Cast<VortexMultiFileBindData>();
+
+        duckdb_vx_node_statistics ffi_stats = {};
+        const bool estimated = callbacks.cardinality(vortex_bind.handle, file_count, &ffi_stats);
+        if (!estimated) {
+            return MultiFileReaderInterface::GetCardinality(context, data, file_count);
+        }
+
+        auto node_stats = make_uniq<NodeStatistics>();
+        node_stats->has_estimated_cardinality = ffi_stats.has_estimated_cardinality;
+        node_stats->estimated_cardinality = ffi_stats.estimated_cardinality;
+        node_stats->has_max_cardinality = ffi_stats.has_max_cardinality;
+        node_stats->max_cardinality = ffi_stats.max_cardinality;
+        return node_stats;
+    }
+
+    /** Produces a fresh interface that shares this instance's vtab pointer. */
+    unique_ptr<MultiFileReaderInterface> Copy() override {
+        auto clone = make_uniq<VortexMultiFileReaderInterface>();
+        clone->vtab = vtab;
+        return clone;
+    }
+
+private:
+    /** Views a DuckDB client context as the opaque FFI context handle. */
+    static duckdb_client_context AsFfiContext(ClientContext &context) {
+        return reinterpret_cast<duckdb_client_context>(&context);
+    }
+
+    /** Throws EXCEPTION_TYPE with the FFI error's message when `ffi_error` is set. */
+    template <class EXCEPTION_TYPE>
+    static void ThrowOnError(duckdb_vx_error ffi_error) {
+        if (ffi_error) {
+            throw EXCEPTION_TYPE(IntoErrString(ffi_error));
+        }
+    }
+
+    /** Returns the resolved vtab; throws InternalException if InitializeOptions has not run. */
+    const duckdb_vx_mff_vtab_t &Vtab() const {
+        if (vtab == nullptr) {
+            throw InternalException("VortexMultiFileReaderInterface used before InitializeOptions");
+        }
+        return *vtab;
+    }
+
+    // Set by InitializeOptions and copied by Copy(); null until then.
+    const duckdb_vx_mff_vtab_t *vtab = nullptr;
+};
+
+/**
+ * The OP type required by MultiFileFunction<OP>. It carries no state of its
+ * own: CreateInterface returns a default-constructed
+ * VortexMultiFileReaderInterface, which picks up its vtab later from the
+ * TableFunctionInfo passed to InitializeOptions.
+ */
+struct VortexMultiFileFunctionOp {
+    static unique_ptr<MultiFileReaderInterface> CreateInterface(ClientContext &) {
+        return make_uniq<VortexMultiFileReaderInterface>();
+    }
+};
+
+/**
+ * Complex-filter pushdown callback for the Vortex multi-file function.
+ *
+ * First lets the multi-file reader prune the file list against `filters`,
+ * adopting the new list (and pruning cached readers) when one is returned.
+ * Then offers every remaining filter expression to the vtab's
+ * `pushdown_complex_filter`; expressions the vtab accepts are removed from
+ * `filters`, the rest are left in place.
+ *
+ * Throws BinderException carrying the FFI error message when the vtab reports
+ * an error.
+ */
+void mff_pushdown_complex_filter(ClientContext &context,
+                                 LogicalGet &get,
+                                 FunctionData *bind_data_p,
+                                 vector<unique_ptr<Expression>> &filters) {
+    auto &multi_file_bind = bind_data_p->Cast<MultiFileBindData>();
+
+    MultiFilePushdownInfo pushdown_info(get);
+    auto pruned_files = multi_file_bind.multi_file_reader->ComplexFilterPushdown(
+        context, *multi_file_bind.file_list, multi_file_bind.file_options, pushdown_info, filters);
+    if (pruned_files) {
+        multi_file_bind.file_list = std::move(pruned_files);
+        MultiFileReader::PruneReaders(multi_file_bind, *multi_file_bind.file_list);
+    }
+
+    auto &vortex_bind = multi_file_bind.bind_data->Cast<VortexMultiFileBindData>();
+    duckdb_vx_error ffi_error = nullptr;
+    auto cursor = filters.begin();
+    while (cursor != filters.end()) {
+        auto ffi_expr = reinterpret_cast<duckdb_vx_expr>(cursor->get());
+        const bool accepted = vortex_bind.vtab.pushdown_complex_filter(vortex_bind.handle, ffi_expr, &ffi_error);
+        if (ffi_error) {
+            throw BinderException(IntoErrString(ffi_error));
+        }
+        if (accepted) {
+            // erase() yields the iterator following the removed expression.
+            cursor = filters.erase(cursor);
+        } else {
+            ++cursor;
+        }
+    }
+}
+
+/**
+ * Column statistics callback for the Vortex multi-file function.
+ *
+ * Prefers whatever MultiFileFunction's generic MultiFileScanStats returns.
+ * When that yields nothing, falls back to the vtab's optional `statistics`
+ * callback, but only for a non-virtual, in-range column and only when the
+ * file list does not expand to multiple files. Returns nullptr in every other
+ * case, including when the callback reports no statistics.
+ */
+unique_ptr<BaseStatistics> mff_statistics(ClientContext &context, const FunctionData *bind_data_p,
+                                          column_t column_index) {
+    auto generic_stats =
+        MultiFileFunction<VortexMultiFileFunctionOp>::MultiFileScanStats(context, bind_data_p, column_index);
+    if (generic_stats) {
+        return generic_stats;
+    }
+
+    auto &bind = bind_data_p->Cast<MultiFileBindData>();
+    if (IsVirtualColumn(column_index)) {
+        return nullptr;
+    }
+    if (!bind.bind_data || !bind.file_list) {
+        return nullptr;
+    }
+    if (bind.file_list->GetExpandResult() == FileExpandResult::MULTIPLE_FILES) {
+        return nullptr;
+    }
+    const bool in_schema = column_index < bind.names.size() && column_index < bind.types.size();
+    if (!in_schema) {
+        return nullptr;
+    }
+
+    auto &vortex_bind = bind.bind_data->Cast<VortexMultiFileBindData>();
+    if (!vortex_bind.vtab.statistics) {
+        // The callback is optional in the vtab.
+        return nullptr;
+    }
+
+    const auto &column_name = bind.names[column_index];
+    duckdb_column_statistics ffi_stats = {};
+    const bool available =
+        vortex_bind.vtab.statistics(vortex_bind.handle, column_name.c_str(), column_name.size(), &ffi_stats);
+    if (!available) {
+        return nullptr;
+    }
+    return ColumnStatsFrom(ffi_stats, bind.types[column_index]);
+}
+
+/**
+ * Partition statistics callback: one exact-count partition per file.
+ *
+ * Queries the vtab's optional `partition_stats` callback for every file in
+ * the bound file list. Each partition starts where the previous one ended, so
+ * `row_start` is the running sum of the preceding files' row counts.
+ *
+ * Returns an empty vector when bind data, the file list or the callback is
+ * missing, and also when the callback has no answer for any single file (the
+ * partitions gathered so far are discarded in that case).
+ *
+ * Throws IOException carrying the FFI error message when the callback reports
+ * an error.
+ */
+vector<PartitionStatistics> mff_get_partition_stats(ClientContext &context, GetPartitionStatsInput &input) {
+    vector<PartitionStatistics> partitions;
+    if (!input.bind_data) {
+        return partitions;
+    }
+
+    auto &bind = input.bind_data->Cast<MultiFileBindData>();
+    if (!bind.bind_data || !bind.file_list) {
+        return partitions;
+    }
+
+    auto &vortex_bind = bind.bind_data->Cast<VortexMultiFileBindData>();
+    if (!vortex_bind.vtab.partition_stats) {
+        return partitions;
+    }
+
+    auto ffi_context = reinterpret_cast<duckdb_client_context>(&context);
+    idx_t next_row_start = 0;
+    for (const auto &file : bind.file_list->Files()) {
+        duckdb_vx_mff_partition_stats ffi_stats = {};
+        duckdb_vx_error ffi_error = nullptr;
+        const bool known = vortex_bind.vtab.partition_stats(ffi_context, vortex_bind.handle, file.path.c_str(),
+                                                            file.path.size(), &ffi_stats, &ffi_error);
+        if (ffi_error) {
+            throw IOException(IntoErrString(ffi_error));
+        }
+        if (!known) {
+            // A single unknown file invalidates the whole answer.
+            return {};
+        }
+
+        const auto file_rows = static_cast<idx_t>(ffi_stats.row_count);
+        PartitionStatistics partition;
+        partition.row_start = optional_idx(next_row_start);
+        partition.count = file_rows;
+        partition.count_type = CountType::COUNT_EXACT;
+        partitions.push_back(std::move(partition));
+        next_row_start += file_rows;
+    }
+    return partitions;
+}
+
+} // namespace
+
+/**
+ * Appends one column (name and logical type) to the schema being collected
+ * behind the opaque `writer` handle.
+ *
+ * `name` is copied using the explicit length `name_len`, and the LogicalType
+ * behind `type` is copied into the type vector.
+ */
+extern "C" void duckdb_vx_mff_schema_writer_add_column(duckdb_vx_mff_schema_writer writer,
+                                                       const char *name,
+                                                       size_t name_len,
+                                                       duckdb_logical_type type) {
+    // Must stay layout-identical to the SchemaWriter that BindReader passes
+    // through the FFI: the handle is reinterpreted as this struct.
+    struct SchemaWriter {
+        vector<string> &names;
+        vector<LogicalType> &types;
+    };
+    auto *sink = reinterpret_cast<SchemaWriter *>(writer);
+    auto *column_type = reinterpret_cast<LogicalType *>(type);
+    sink->names.emplace_back(name, name_len);
+    sink->types.emplace_back(*column_type);
+}
+
+/**
+ * Registers the Vortex multi-file table function described by `vtab` in the
+ * system catalog of the database behind `ffi_db`.
+ *
+ * A copy of the vtab is stored in the function's TableFunctionInfo; each bind
+ * resolves it from there in InitializeOptions.
+ *
+ * Returns DuckDBSuccess on success. Returns DuckDBError when either argument
+ * is null or when building/registering the function throws a std::exception
+ * (the failure is logged against the database).
+ */
+extern "C" duckdb_state duckdb_vx_mff_register(duckdb_database ffi_db, const duckdb_vx_mff_vtab_t *vtab) {
+    // Checked at runtime: this is a C entry point, so a null argument must
+    // surface as an error code rather than depend on a build-mode assertion.
+    if (!ffi_db || !vtab) {
+        return DuckDBError;
+    }
+
+    const auto &wrapper = *reinterpret_cast<DatabaseWrapper *>(ffi_db);
+    auto &db = *wrapper.database->instance;
+
+    // Everything that can throw lives inside the try block so that no
+    // std::exception unwinds through the extern "C" boundary.
+    try {
+        // The catalog-owned TableFunctionInfo carries the vtab copy that each bind
+        // resolves through InitializeOptions. Keeping it there avoids a shared
+        // global pointer across databases/tests.
+        auto info = make_shared_ptr<VortexMultiFileFunctionInfo>(*vtab);
+
+        MultiFileFunction<VortexMultiFileFunctionOp> mff(vtab->name);
+        mff.function_info = info;
+        mff.statistics = mff_statistics;
+        mff.filter_pushdown = vtab->filter_pushdown;
+        mff.filter_prune = vtab->filter_prune;
+        mff.pushdown_complex_filter = mff_pushdown_complex_filter;
+        mff.get_partition_stats = mff_get_partition_stats;
+        mff.late_materialization = true;
+        mff.get_row_id_columns = [](ClientContext &, optional_ptr<FunctionData>) -> vector<column_t> {
+            return {COLUMN_IDENTIFIER_FILE_INDEX, COLUMN_IDENTIFIER_FILE_ROW_NUMBER};
+        };
+
+        // Bind-time EXPLAIN output. Adds keys like "Function", "Files",
+        // "Projection", "Filters". MultiFileFunction also installs a
+        // dynamic_to_string that lists files at scan time; we leave that as-is.
+        mff.to_string = [](TableFunctionToStringInput &input) {
+            InsertionOrderPreservingMap<string> result;
+            const auto &bind = input.bind_data->Cast<MultiFileBindData>();
+            const auto &vortex_bind = bind.bind_data->Cast<VortexMultiFileBindData>();
+            auto map = reinterpret_cast<duckdb_vx_mff_string_map>(&result);
+            vortex_bind.vtab.to_string(vortex_bind.handle, map);
+            return result;
+        };
+
+        // CreateFunctionSet returns a TableFunctionSet that bundles both the
+        // single-VARCHAR and LIST(VARCHAR) overloads (matching read_parquet's
+        // shape). This is what enables `read_vortex_v2(['a.vortex','b.vortex'])`.
+        auto function_set = MultiFileReader::CreateFunctionSet(mff);
+        auto &system_catalog = Catalog::GetSystemCatalog(db);
+        auto data = CatalogTransaction::GetSystemTransaction(db);
+        CreateTableFunctionInfo tf_info(function_set);
+        tf_info.on_conflict = OnCreateConflict::ALTER_ON_CONFLICT;
+        system_catalog.CreateFunction(data, tf_info);
+    } catch (const std::exception &e) {
+        ErrorData err(e);
+        DUCKDB_LOG_ERROR(db, "Failed to create Vortex multi-file function:\t" + err.Message());
+        return DuckDBError;
+    }
+    return DuckDBSuccess;
+}
diff --git a/vortex-duckdb/cpp/object_cache.cpp b/vortex-duckdb/cpp/object_cache.cpp
new file mode 100644
index 00000000000..953a1b63ca1
--- /dev/null
+++ b/vortex-duckdb/cpp/object_cache.cpp
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#include "duckdb_vx/object_cache.h"
+
+DUCKDB_INCLUDES_BEGIN
+#include <duckdb/common/shared_ptr.hpp>
+#include <duckdb/main/client_context.hpp>
+#include <duckdb/storage/object_cache.hpp>
+DUCKDB_INCLUDES_END
+
+#include <string>
+#include <utility>
+
+namespace {
+
+/**
+ * Object-cache entry that owns an opaque payload supplied through the FFI.
+ *
+ * The entry stores the payload pointer together with the caller's delete
+ * callback and invokes that callback on the payload when the entry is
+ * destroyed. It also records the object-type tag and memory estimate that are
+ * reported back through the ObjectCacheEntry interface.
+ */
+class VortexObjectCacheEntry final : public duckdb::ObjectCacheEntry {
+public:
+    VortexObjectCacheEntry(std::string object_type_p,
+                           duckdb::idx_t estimated_memory_p,
+                           void *data_p,
+                           duckdb_delete_callback_t delete_callback_p)
+        : object_type(std::move(object_type_p)), estimated_memory(estimated_memory_p), data(data_p),
+          delete_callback(delete_callback_p) {
+    }
+
+    // The destructor releases `data`, so a copied entry would run the delete
+    // callback twice on the same pointer. Forbid copying (which also
+    // suppresses the implicit move operations).
+    VortexObjectCacheEntry(const VortexObjectCacheEntry &) = delete;
+    VortexObjectCacheEntry &operator=(const VortexObjectCacheEntry &) = delete;
+
+    /** Releases the payload through the delete callback, when both are set. */
+    ~VortexObjectCacheEntry() override {
+        if (delete_callback && data) {
+            delete_callback(data);
+        }
+    }
+
+    /** Type tag given at construction; the cache lookup compares against it. */
+    std::string GetObjectType() override {
+        return object_type;
+    }
+
+    /** Memory estimate given at construction. */
+    duckdb::optional_idx GetEstimatedCacheMemory() const override {
+        return estimated_memory;
+    }
+
+    /** Borrowed pointer to the payload; ownership stays with this entry. */
+    void *GetData() const {
+        return data;
+    }
+
+private:
+    std::string object_type;
+    duckdb::idx_t estimated_memory;
+    void *data;
+    duckdb_delete_callback_t delete_callback;
+};
+
+} // namespace
+
+/**
+ * Handle type behind `duckdb_vx_object_cache_entry`: holds a shared reference
+ * to a cache entry for as long as the handle exists.
+ */
+struct duckdb_vx_object_cache_entry_ {
+    duckdb::shared_ptr<duckdb::ObjectCacheEntry> entry;
+
+    explicit duckdb_vx_object_cache_entry_(duckdb::shared_ptr<duckdb::ObjectCacheEntry> cached)
+        : entry(std::move(cached)) {
+    }
+};
+
+/**
+ * Looks up `key` (with explicit length `key_len`) in the object cache of the
+ * given client context.
+ *
+ * Returns a newly allocated handle when an entry exists, its object type
+ * equals `object_type`, and it is a VortexObjectCacheEntry. Returns nullptr
+ * for null arguments, for a miss or mismatch, and when anything throws.
+ * A returned handle is released with duckdb_vx_object_cache_entry_free.
+ */
+extern "C" duckdb_vx_object_cache_entry duckdb_vx_object_cache_get(duckdb_client_context ctx,
+                                                                   const char *key,
+                                                                   size_t key_len,
+                                                                   const char *object_type) {
+    if (ctx == nullptr || key == nullptr || object_type == nullptr) {
+        return nullptr;
+    }
+
+    try {
+        auto &context = *reinterpret_cast<duckdb::ClientContext *>(ctx);
+        const std::string cache_key(key, key_len);
+        auto cached = duckdb::ObjectCache::GetObjectCache(context).GetObject(cache_key);
+
+        // All three must hold: present, tagged with the requested type, and
+        // actually one of our entries.
+        const bool usable = cached && cached->GetObjectType() == object_type &&
+                            dynamic_cast<VortexObjectCacheEntry *>(cached.get()) != nullptr;
+        if (!usable) {
+            return nullptr;
+        }
+        return new duckdb_vx_object_cache_entry_(std::move(cached));
+    } catch (...) {
+        // Any failure is reported to the caller as a cache miss.
+        return nullptr;
+    }
+}
+
+/**
+ * Returns the payload pointer stored in the cache entry behind `entry`, or
+ * nullptr when the handle is null or does not wrap a VortexObjectCacheEntry.
+ */
+extern "C" void *duckdb_vx_object_cache_entry_get_data(duckdb_vx_object_cache_entry entry) {
+    if (entry == nullptr) {
+        return nullptr;
+    }
+
+    if (auto *vortex_entry = dynamic_cast<VortexObjectCacheEntry *>(entry->entry.get())) {
+        return vortex_entry->GetData();
+    }
+    return nullptr;
+}
+
+/**
+ * Destroys the handle that `entry` points to and resets the caller's pointer
+ * to nullptr. Does nothing when `entry` or `*entry` is null.
+ */
+extern "C" void duckdb_vx_object_cache_entry_free(duckdb_vx_object_cache_entry *entry) {
+    if (entry != nullptr && *entry != nullptr) {
+        delete *entry;
+        *entry = nullptr;
+    }
+}
+
+/**
+ * Stores `data` in the object cache of the given client context under `key`
+ * (with explicit length `key_len`), tagged with `object_type` and
+ * `estimated_memory`.
+ *
+ * Once the cache entry object has been created it owns `data` and releases it
+ * through `delete_callback` when destroyed. If the call fails before that
+ * point (invalid arguments or an exception), `delete_callback` is invoked on
+ * `data` here instead, provided both are non-null.
+ *
+ * Returns DuckDBSuccess when the entry was put into the cache, DuckDBError
+ * otherwise.
+ */
+extern "C" duckdb_state duckdb_vx_object_cache_put(duckdb_client_context ctx,
+                                                   const char *key,
+                                                   size_t key_len,
+                                                   const char *object_type,
+                                                   idx_t estimated_memory,
+                                                   void *data,
+                                                   duckdb_delete_callback_t delete_callback) {
+    // Runs the caller's deleter on the payload when both were supplied.
+    const auto release_payload = [&]() {
+        if (delete_callback && data) {
+            delete_callback(data);
+        }
+    };
+
+    // Flips to true as soon as the cache entry exists and owns the payload.
+    bool owned_by_entry = false;
+    try {
+        const bool valid_args = ctx && key && object_type && data;
+        if (!valid_args) {
+            release_payload();
+            return DuckDBError;
+        }
+
+        auto &context = *reinterpret_cast<duckdb::ClientContext *>(ctx);
+        auto cache_entry = duckdb::make_shared_ptr<VortexObjectCacheEntry>(
+            object_type, estimated_memory, data, delete_callback);
+        owned_by_entry = true;
+        duckdb::ObjectCache::GetObjectCache(context).Put(std::string(key, key_len), std::move(cache_entry));
+        return DuckDBSuccess;
+    } catch (...) {
+        // After ownership moved to the entry, its destructor does the release.
+        if (!owned_by_entry) {
+            release_payload();
+        }
+        return DuckDBError;
+    }
+}
diff --git a/vortex-duckdb/src/datasource.rs b/vortex-duckdb/src/datasource.rs
index 1f07155e1f6..8921466bf40 100644
--- a/vortex-duckdb/src/datasource.rs
+++ b/vortex-duckdb/src/datasource.rs
@@ -63,7 +63,6 @@ use vortex_utils::parallelism::get_available_parallelism;
 use crate::RUNTIME;
 use crate::SESSION;
 use crate::convert::ToDuckDBScalar;
-use crate::convert::try_from_bound_expression;
 use crate::convert::try_from_table_filter;
 use crate::convert::try_from_virtual_column_filter;
 use crate::duckdb::BindInputRef;
@@ -497,23 +496,13 @@ impl<T: DataSourceTableFunction> TableFunction for T {
     }
 
     fn pushdown_complex_filter(
-        bind_data: &mut Self::BindData,
+        _bind_data: &mut Self::BindData,
         expr: &ExpressionRef,
     ) -> VortexResult<bool> {
         tracing::debug!("Attempting to push down filter expression: {expr}");
-        let Some(expr) = try_from_bound_expression(expr)? else {
-            return Ok(false);
-        };
-        bind_data.filter_exprs.push(expr);
-
-        // NOTE(ngates): Vortex does indeed run exact filters, so in theory we should return `true`
-        //  here to tell DuckDB we've handled the filter. However, DuckDB applies some crude
-        //  cardinality estimation heuristics (e.g. an equality filter => 20% selectivity) that
-        //  means by returning false, DuckDB runs an additional filter (a little bit of overhead)
-        //  but tends to end up with a better query plan.
-        //  If we plumb row count estimation into the layout tree, perhaps we could use zone maps
-        //  etc. to return estimates. But this function is probably called too late anyway. Maybe
-        //  we need our own cardinality heuristics.
+        // Returning false keeps DuckDB's cardinality heuristics in the plan. Since DuckDB will
+        // still evaluate the filter, do not also push it into Vortex: that only evaluates the same
+        // predicate twice.
         Ok(false)
     }
 
diff --git a/vortex-duckdb/src/duckdb/mod.rs b/vortex-duckdb/src/duckdb/mod.rs
index c42fbdaf1e4..5fb36350136 100644
--- a/vortex-duckdb/src/duckdb/mod.rs
+++ b/vortex-duckdb/src/duckdb/mod.rs
@@ -13,6 +13,8 @@ mod expr;
 mod file_system;
 mod logical_type;
 mod macro_;
+mod multi_file_function;
+mod object_cache;
 mod query_result;
 mod reusable_dict;
 mod scalar_function;
@@ -37,6 +39,7 @@ pub use ddb_string::*;
 pub use expr::*;
 pub use file_system::*;
 pub use logical_type::*;
+pub use multi_file_function::*;
 pub use query_result::*;
 pub use reusable_dict::*;
 pub use scalar_function::*;
diff --git a/vortex-duckdb/src/duckdb/multi_file_function.rs b/vortex-duckdb/src/duckdb/multi_file_function.rs
new file mode 100644
index 00000000000..8898f00e7a5
--- /dev/null
+++ b/vortex-duckdb/src/duckdb/multi_file_function.rs
@@ -0,0 +1,702 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Rust-side wrapper for DuckDB's `MultiFileFunction<OP>` template.
+//!
+//! Lets a table-function-like type plug into DuckDB's native multi-file machinery
+//! (file globbing, virtual columns, hive partitioning, COPY support, etc.) by
+//! supplying only what's format-specific: how to open a file, how to read its
+//! schema, and how to scan a chunk. The cross-file orchestration is owned by
+//! DuckDB.
+//!
+//! The trait pair mirrors the DuckDB C++ surface:
+//!   - [`MultiFileFunction`] ↔ `MultiFileReaderInterface`
+//!   - [`BaseFileReader`]    ↔ `BaseFileReader`
+//!
+//! The wrapper is generic over one [`MultiFileFunction`] implementation so each
+//! registered function gets statically-monomorphised callbacks (no per-call dyn
+//! dispatch).
+//!
+//! Callback lifecycle, matching DuckDB's `MultiFileFunction` / Parquet reader
+//! model:
+//!
+//! 1. Bind: [`MultiFileFunction::create_options`],
+//!    [`MultiFileFunction::initialize_bind_data`], and
+//!    [`MultiFileFunction::bind_reader`] run once to collect options, bind-time
+//!    state, and schema.
+//! 2. Query init: [`MultiFileFunction::init_global`] and
+//!    [`MultiFileFunction::init_local`] create per-query and per-worker state.
+//! 3. File open: [`MultiFileFunction::create_reader`] is called when DuckDB
+//!    decides to open a file. DuckDB does not hold the global multi-file
+//!    scheduling mutex while opening; it switches to a per-file mutex so other
+//!    workers can wait for that specific reader.
+//! 4. Reader preparation: [`BaseFileReader::prepare_reader`] maps projection
+//!    and filters onto the opened reader. It happens once per reader before any
+//!    scan assignment for that reader.
+//! 5. Scan assignment: [`BaseFileReader::try_initialize_scan`] is called while
+//!    DuckDB holds its global multi-file scheduling mutex. This must be a cheap
+//!    claim of one independent unit of work, e.g. a row group or row range. It
+//!    must not perform I/O, block on async work, or construct expensive scan
+//!    pipelines.
+//! 6. Scan execution: DuckDB releases the scheduling mutex before calling its
+//!    `PrepareScan` hook, exposed here as [`BaseFileReader::prepare_scan`].
+//!    Reader implementations should build per-assignment local scan state
+//!    there, then [`BaseFileReader::scan`] drains that local state into chunks.
+
+use std::ffi::CStr;
+use std::ffi::CString;
+use std::fmt::Debug;
+use std::ptr;
+use std::slice;
+
+use vortex::error::VortexExpect;
+use vortex::error::VortexResult;
+use vortex::error::vortex_err;
+
+use crate::cpp;
+use crate::duckdb::Cardinality;
+use crate::duckdb::ClientContext;
+use crate::duckdb::ClientContextRef;
+use crate::duckdb::ColumnStatistics;
+use crate::duckdb::DataChunk;
+use crate::duckdb::DataChunkRef;
+use crate::duckdb::DatabaseRef;
+use crate::duckdb::DuckdbStringMap;
+use crate::duckdb::DuckdbStringMapRef;
+use crate::duckdb::ExpressionRef;
+use crate::duckdb::LogicalTypeRef;
+use crate::duckdb::TableFilterSet;
+use crate::duckdb::TableFilterSetRef;
+use crate::duckdb::try_or;
+use crate::duckdb_try;
+
+/// A table function backed by DuckDB's `MultiFileFunction<OP>` template.
+///
+/// Implementors describe a per-format reader; DuckDB owns the cross-file
+/// orchestration (globbing, parallelism, virtual columns).
+pub trait MultiFileFunction: Sized + Debug {
+    /// Per-format options collected from the `TABLE(...)` named parameters.
+    /// For minimal implementations this can be a unit struct.
+    type ReaderOptions: Send + Sync;
+
+    /// Bind-time data, populated from options and (the schema of) the first
+    /// file. Must be `Send` because DuckDB may move it across threads.
+    type BindData: Clone + Send;
+
+    /// Global state for one query invocation. Shared across worker threads.
+    type GlobalState: Send + Sync;
+
+    /// Per-thread local state, produced by [`Self::init_local`].
+    type LocalState;
+
+    /// Per-file reader. Created when DuckDB first opens a file, dropped when
+    /// scanning of that file finishes. DuckDB stores it in a shared pointer and
+    /// may call read-only scan callbacks from multiple workers, so shared
+    /// callbacks must be thread-safe.
+    type Reader: BaseFileReader<Self::GlobalState, Self::LocalState> + Sync;
+
+    /// Whether DuckDB may pass pushed table filters to
+    /// [`BaseFileReader::prepare_reader`].
+    const FILTER_PUSHDOWN: bool = false;
+
+    /// Whether DuckDB may omit filter-only columns from final table-scan
+    /// output.
+    ///
+    /// Only meaningful when [`Self::FILTER_PUSHDOWN`] is true.
+    const FILTER_PRUNE: bool = false;
+
+    /// Construct default options. Called once per bind.
+    fn create_options(ctx: &ClientContextRef) -> VortexResult<Self::ReaderOptions>;
+
+    /// Push a complex filter expression into bind data.
+    ///
+    /// Returning `true` tells DuckDB the filter is handled exactly and may be
+    /// removed from the remaining plan. Returning `false` leaves it for DuckDB
+    /// to apply above the scan or turn into a regular table filter.
+    fn pushdown_complex_filter(
+        bind_data: &mut Self::BindData,
+        expr: &ExpressionRef,
+    ) -> VortexResult<bool> {
+        let _ = (bind_data, expr); // the default ignores both and pushes nothing down
+        Ok(false)
+    }
+
+    /// Build bind data from options. Takes ownership of the options struct.
+    fn initialize_bind_data(options: Self::ReaderOptions) -> VortexResult<Self::BindData>;
+
+    /// Populate the result schema. DuckDB picks the first file in the file list
+    /// to bind against; the implementation should open it (cheaply, metadata-
+    /// only if possible), record any bind-time metadata it needs, and append
+    /// columns to `schema`.
+    fn bind_reader(
+        ctx: &ClientContextRef,
+        bind_data: &mut Self::BindData,
+        first_file: &str,
+        schema: &mut SchemaBuilder,
+    ) -> VortexResult<()>;
+
+    /// Initialize global state for one query.
+    fn init_global(
+        ctx: &ClientContextRef,
+        bind_data: &Self::BindData,
+    ) -> VortexResult<Self::GlobalState>;
+
+    /// Initialize per-thread state from the shared global state. This hook cannot fail.
+    fn init_local(global: &Self::GlobalState) -> Self::LocalState;
+
+    /// Open a per-file reader. Called once per file, on the thread that won the
+    /// race to open it. DuckDB has dropped the global multi-file scheduling
+    /// mutex before this call, but holds a per-file mutex for this reader. It is
+    /// reasonable to open file metadata here; do not do per-scan or per-split
+    /// work here because projection/filter state is not fully prepared yet.
+    fn create_reader(
+        ctx: &ClientContextRef,
+        global: &Self::GlobalState,
+        bind_data: &Self::BindData,
+        file_path: &str,
+        file_idx: usize,
+    ) -> VortexResult<Self::Reader>;
+
+    /// Estimated cardinality across `file_count` files. Default returns
+    /// [`Cardinality::Unknown`] (DuckDB falls back to its own heuristic).
+    fn cardinality(_bind_data: &Self::BindData, _file_count: usize) -> Cardinality {
+        Cardinality::Unknown
+    }
+
+    /// Exact partition statistics for a file, if already available cheaply.
+    ///
+    /// DuckDB uses these to fold aggregates such as `COUNT(*)` during
+    /// optimization. Returning `None` leaves the scan plan unchanged.
+    fn partition_stats(
+        _ctx: &ClientContextRef,
+        _bind_data: &Self::BindData,
+        _file_path: &str,
+    ) -> VortexResult<Option<PartitionStats>> {
+        Ok(None)
+    }
+
+    /// Per-column statistics available from bind-time metadata. Default
+    /// returns `None`.
+    fn statistics(_bind_data: &Self::BindData, _name: &str) -> Option<ColumnStatistics> {
+        None
+    }
+
+    /// Populate the bind-time EXPLAIN map with key/value pairs (typical keys:
+    /// `Function`, `Files`, `Projection`, `Filters`). Default no-op.
+    fn to_string(_bind_data: &Self::BindData, _map: &mut DuckdbStringMapRef) {}
+}
+
+/// Exact per-file partition statistics, as returned by [`MultiFileFunction::partition_stats`].
+#[derive(Clone, Copy, Debug)]
+pub struct PartitionStats {
+    /// Exact number of rows in this file.
+    pub row_count: u64,
+}
+
+/// A column DuckDB asks a [`BaseFileReader`] to produce in the intermediate
+/// scan chunk.
+#[derive(Clone, Copy, Debug)]
+pub struct ProjectedColumn<'a> {
+    /// Column name in the file-local scan chunk.
+    pub name: &'a str,
+    /// DuckDB column id. Physical columns use a file-local id; virtual columns
+    /// use DuckDB's global virtual column id.
+    pub column_id: u64,
+    /// True when this column is one of DuckDB's virtual columns.
+    pub is_virtual: bool,
+    /// True when DuckDB's final output expressions reference this column.
+    ///
+    /// When false the column is filter-only: the reader may use it for pushed filter
+    /// evaluation, but does not need to materialize it into the scan chunk.
+    pub is_projected: bool,
+}
+
+/// Per-file reader contract. Implementations are owned by DuckDB once handed
+/// off via [`MultiFileFunction::create_reader`] and dropped when scanning of
+/// that file completes.
+///
+/// DuckDB calls [`Self::try_initialize_scan`] while holding its global
+/// multi-file lock. That method should claim one independent unit of scan work
+/// and store only its descriptor in `LocalState`. [`Self::prepare_scan`] then
+/// initializes actual per-worker state outside that lock. [`Self::scan`] drains
+/// only that local state and may overlap with later
+/// [`Self::try_initialize_scan`] calls on the same reader.
+pub trait BaseFileReader<GlobalState, LocalState> {
+    /// Configure projection and filter pushdown. Called once after the reader
+    /// is created and before any [`Self::try_initialize_scan`] call.
+    /// `projection` is the ordered list of intermediate scan columns DuckDB
+    /// allocated for this reader. Filter-only columns have
+    /// [`ProjectedColumn::is_projected`] set to false. `filters` carries any
+    /// filters DuckDB pushed down for this scan.
+    ///
+    /// Default: no-op (reader scans all columns, no filter pushdown).
+    fn prepare_reader(
+        &mut self,
+        projection: &[ProjectedColumn<'_>],
+        filters: Option<&TableFilterSetRef>,
+    ) -> VortexResult<()> {
+        let _ = (projection, filters); // the default ignores both arguments
+        Ok(())
+    }
+
+    /// Set up scan state for the next batch. Called under DuckDB's global
+    /// multi-file scheduling lock; this should only claim work into `local`.
+    /// Do not open readers, call `block_on`, or construct scan iterators here.
+    /// Return `false` once exhausted.
+    fn try_initialize_scan(
+        &self,
+        global: &GlobalState,
+        local: &mut LocalState,
+    ) -> VortexResult<bool>;
+
+    /// Initialize local scan state for the work claimed by
+    /// [`Self::try_initialize_scan`]. DuckDB calls this outside its global
+    /// multi-file scheduling lock, so implementations may open per-split
+    /// iterators, block on async setup, or build scan pipelines here.
+    ///
+    /// Default: no-op.
+    fn prepare_scan(&self, global: &GlobalState, local: &mut LocalState) -> VortexResult<()> {
+        let _ = (global, local); // the default ignores both arguments
+        Ok(())
+    }
+
+    /// Produce the next batch into `chunk`. A `chunk` of size 0 signals
+    /// end-of-assignment; a non-empty chunk means more may follow. This is
+    /// called outside DuckDB's global multi-file scheduling lock after
+    /// [`Self::prepare_scan`].
+    fn scan(
+        &self,
+        global: &GlobalState,
+        local: &mut LocalState,
+        chunk: &mut DataChunkRef,
+    ) -> VortexResult<()>;
+
+    /// Per-column statistics by name. Default returns `None`.
+    fn get_statistics(&self, _name: &str) -> Option<ColumnStatistics> {
+        None
+    }
+
+    /// Scan progress within this file in `[0.0, 100.0]`. Default `0.0`.
+    fn progress_in_file(&self) -> f64 {
+        0.0
+    }
+}
+
+/// Append-only schema builder passed to [`MultiFileFunction::bind_reader`].
+///
+/// Wraps the C++ `vector<string>` / `vector<LogicalType>` pair via
+/// `duckdb_vx_mff_schema_writer_add_column`.
+pub struct SchemaBuilder {
+    raw: cpp::duckdb_vx_mff_schema_writer, // handle received by the `bind_reader` FFI shim
+}
+
+impl SchemaBuilder {
+    /// Add a column called `name` with type `logical_type` to the result
+    /// schema.
+    pub fn add_column(&mut self, name: &str, logical_type: &LogicalTypeRef) {
+        // `name` crosses the FFI boundary as pointer + byte length, not as a
+        // NUL-terminated string.
+        let data = name.as_ptr().cast();
+        let len = name.len();
+        let ty = logical_type.as_ptr();
+        let writer = self.raw;
+        unsafe { cpp::duckdb_vx_mff_schema_writer_add_column(writer, data, len, ty) };
+    }
+}
+
+impl DatabaseRef {
+    /// Register `T` as a multi-file table function on this database under
+    /// `name`.
+    ///
+    /// The vtable is statically derived from `T` and copied into a C++
+    /// `TableFunctionInfo` owned by the catalog; `T` itself is never instantiated.
+    pub fn register_multi_file_function<T: MultiFileFunction>(
+        &self,
+        name: &CStr,
+    ) -> VortexResult<()> {
+        let vtab = cpp::duckdb_vx_mff_vtab_t {
+            name: name.as_ptr(), // points into the caller's `name`; `vtab` lives on this stack frame
+            filter_pushdown: T::FILTER_PUSHDOWN,
+            filter_prune: T::FILTER_PRUNE,
+            pushdown_complex_filter: Some(pushdown_complex_filter::<T>),
+            create_options: Some(create_options::<T>),
+            free_options: Some(free_options::<T>),
+            initialize_bind_data: Some(initialize_bind_data::<T>),
+            clone_bind_data: Some(clone_bind_data::<T>),
+            free_bind_data: Some(free_bind_data::<T>),
+            bind_reader: Some(bind_reader::<T>),
+            init_global: Some(init_global::<T>),
+            free_global: Some(free_global::<T>),
+            init_local: Some(init_local::<T>),
+            free_local: Some(free_local::<T>),
+            create_reader: Some(create_reader::<T>),
+            free_reader: Some(free_reader::<T>),
+            prepare_reader: Some(prepare_reader::<T>),
+            try_initialize_scan: Some(try_initialize_scan::<T>),
+            prepare_scan: Some(prepare_scan::<T>),
+            scan: Some(scan::<T>),
+            statistics: Some(statistics::<T>),
+            get_statistics: Some(get_statistics::<T>),
+            progress_in_file: Some(progress_in_file::<T>),
+            cardinality: Some(cardinality::<T>),
+            partition_stats: Some(partition_stats::<T>),
+            to_string: Some(to_string::<T>),
+        };
+
+        duckdb_try!(
+            unsafe { cpp::duckdb_vx_mff_register(self.as_ptr(), &raw const vtab) },
+            "Failed to register multi-file function '{}'",
+            name.to_string_lossy()
+        );
+
+        Ok(())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// FFI shim: each callback boxes/unboxes the trait's associated type and
+// dispatches to the corresponding trait method.
+// ---------------------------------------------------------------------------
+
+unsafe extern "C-unwind" fn create_options<T: MultiFileFunction>(
+    ctx: cpp::duckdb_client_context,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> cpp::duckdb_vx_mff_options {
+    // On success the options are boxed and leaked into a raw handle returned through the C ABI.
+    let client = unsafe { ClientContext::borrow(ctx) };
+    try_or(error_out, || {
+        T::create_options(client).map(|options| Box::into_raw(Box::new(options)).cast())
+    })
+}
+
+unsafe extern "C-unwind" fn free_options<T: MultiFileFunction>(opts: cpp::duckdb_vx_mff_options) {
+    // A null handle owns nothing; otherwise rebuild the box so dropping it frees the options.
+    let owned = ptr::NonNull::new(opts.cast::<T::ReaderOptions>());
+    drop(owned.map(|handle| unsafe { Box::from_raw(handle.as_ptr()) }));
+}
+
+unsafe extern "C-unwind" fn initialize_bind_data<T: MultiFileFunction>(
+    opts: cpp::duckdb_vx_mff_options,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> cpp::duckdb_vx_mff_bind_data {
+    // Takes ownership of `opts`. `Box::from_raw` on a null pointer is undefined behaviour, so
+    // reject null up front, matching the null checks done by the other callbacks in this file.
+    let opts = ptr::NonNull::new(opts.cast::<T::ReaderOptions>()).vortex_expect("options null");
+    let opts = unsafe { Box::from_raw(opts.as_ptr()) };
+    try_or(error_out, || Ok(Box::into_raw(Box::new(T::initialize_bind_data(*opts)?)).cast()))
+}
+
+unsafe extern "C-unwind" fn free_bind_data<T: MultiFileFunction>(
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+) {
+    // A null handle owns nothing; otherwise rebuild the box so dropping it frees the bind data.
+    let owned = ptr::NonNull::new(bind_data.cast::<T::BindData>());
+    drop(owned.map(|handle| unsafe { Box::from_raw(handle.as_ptr()) }));
+}
+
+unsafe extern "C-unwind" fn clone_bind_data<T: MultiFileFunction>(
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> cpp::duckdb_vx_mff_bind_data {
+    // The clone is boxed on its own, so the returned handle is independent of the input
+    // handle. A null input panics before any error reporting is attempted.
+    let source = unsafe { bind_data.cast::<T::BindData>().as_ref() };
+    let source = source.vortex_expect("bind_data null");
+    try_or(error_out, || Ok(Box::into_raw(Box::new(source.clone())).cast()))
+}
+
+unsafe extern "C-unwind" fn pushdown_complex_filter<T: MultiFileFunction>(
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    expr: cpp::duckdb_vx_expr,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> bool {
+    // Bind data is borrowed mutably so the hook can record filters it accepts.
+    let bind = unsafe { bind_data.cast::<T::BindData>().as_mut() }.vortex_expect("bind_data null");
+    let filter = unsafe { crate::duckdb::Expression::borrow(expr) };
+    try_or(error_out, || T::pushdown_complex_filter(bind, filter))
+}
+
+unsafe extern "C-unwind" fn bind_reader<T: MultiFileFunction>(
+    ctx: cpp::duckdb_client_context,
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    file_path: *const std::os::raw::c_char,
+    path_len: usize,
+    schema_writer: cpp::duckdb_vx_mff_schema_writer,
+    error_out: *mut cpp::duckdb_vx_error,
+) {
+    let client = unsafe { ClientContext::borrow(ctx) };
+    let bind = unsafe { bind_data.cast::<T::BindData>().as_mut() }.vortex_expect("bind_data null");
+    let mut schema = SchemaBuilder { raw: schema_writer };
+    try_or(error_out, || {
+        // The path arrives as pointer + length and must be valid UTF-8 to be used as `&str`.
+        let raw_path = unsafe { slice::from_raw_parts(file_path.cast::<u8>(), path_len) };
+        let first_file = std::str::from_utf8(raw_path)
+            .map_err(|e| vortex_err!("file path is not UTF-8: {e}"))?;
+        T::bind_reader(client, bind, first_file, &mut schema)
+    })
+}
+
+unsafe extern "C-unwind" fn init_global<T: MultiFileFunction>(
+    ctx: cpp::duckdb_client_context,
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> cpp::duckdb_vx_mff_global {
+    let client = unsafe { ClientContext::borrow(ctx) };
+    let bind = unsafe { bind_data.cast::<T::BindData>().as_ref() }.vortex_expect("bind_data null");
+    try_or(error_out, || {
+        // Box the new state and leak it into a raw handle returned through the C ABI.
+        let state = Box::new(T::init_global(client, bind)?);
+        Ok(Box::into_raw(state).cast())
+    })
+}
+
+unsafe extern "C-unwind" fn free_global<T: MultiFileFunction>(global: cpp::duckdb_vx_mff_global) {
+    // A null handle owns nothing; otherwise rebuild the box so dropping it frees the state.
+    let owned = ptr::NonNull::new(global.cast::<T::GlobalState>());
+    drop(owned.map(|handle| unsafe { Box::from_raw(handle.as_ptr()) }));
+}
+
+unsafe extern "C-unwind" fn init_local<T: MultiFileFunction>(
+    global: cpp::duckdb_vx_mff_global,
+) -> cpp::duckdb_vx_mff_local {
+    // `T::init_local` returns no `Result`, so this shim takes no error out-parameter.
+    let shared = unsafe { global.cast::<T::GlobalState>().as_ref() }.vortex_expect("global null");
+    Box::into_raw(Box::new(T::init_local(shared))).cast()
+}
+
+unsafe extern "C-unwind" fn free_local<T: MultiFileFunction>(local: cpp::duckdb_vx_mff_local) {
+    // A null handle owns nothing; otherwise rebuild the box so dropping it frees the state.
+    let owned = ptr::NonNull::new(local.cast::<T::LocalState>());
+    drop(owned.map(|handle| unsafe { Box::from_raw(handle.as_ptr()) }));
+}
+
+#[allow(clippy::too_many_arguments)]
+unsafe extern "C-unwind" fn create_reader<T: MultiFileFunction>(
+    ctx: cpp::duckdb_client_context,
+    global: cpp::duckdb_vx_mff_global,
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    file_path: *const std::os::raw::c_char,
+    path_len: usize,
+    file_idx: usize,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> cpp::duckdb_vx_mff_reader {
+    let client = unsafe { ClientContext::borrow(ctx) };
+    let shared = unsafe { global.cast::<T::GlobalState>().as_ref() }.vortex_expect("global null");
+    let bind = unsafe { bind_data.cast::<T::BindData>().as_ref() }.vortex_expect("bind_data null");
+    try_or(error_out, || {
+        // The path arrives as pointer + length and must be valid UTF-8 to be used as `&str`.
+        let raw_path = unsafe { slice::from_raw_parts(file_path.cast::<u8>(), path_len) };
+        let path = std::str::from_utf8(raw_path)
+            .map_err(|e| vortex_err!("file path is not UTF-8: {e}"))?;
+        let opened = Box::new(T::create_reader(client, shared, bind, path, file_idx)?);
+        Ok(Box::into_raw(opened).cast())
+    })
+}
+
+unsafe extern "C-unwind" fn free_reader<T: MultiFileFunction>(reader: cpp::duckdb_vx_mff_reader) {
+    // A null handle owns nothing; otherwise rebuild the box so dropping it frees the reader.
+    let owned = ptr::NonNull::new(reader.cast::<T::Reader>());
+    drop(owned.map(|handle| unsafe { Box::from_raw(handle.as_ptr()) }));
+}
+
+unsafe extern "C-unwind" fn prepare_reader<T: MultiFileFunction>(
+    reader: cpp::duckdb_vx_mff_reader,
+    projection: *const cpp::duckdb_vx_mff_column,
+    projection_count: usize,
+    filters: cpp::duckdb_vx_table_filter_set,
+    error_out: *mut cpp::duckdb_vx_error,
+) {
+    let file_reader = unsafe { reader.cast::<T::Reader>().as_mut() }.vortex_expect("reader null");
+    // A null filter set is passed on to the reader as `None`.
+    let pushed_filters = (!filters.is_null()).then(|| unsafe { TableFilterSet::borrow(filters) });
+    try_or(error_out, || {
+        // Rebuild the projection as Rust values. The names are borrowed straight from the
+        // C array, so they are only valid for the duration of this call. The first name
+        // that is not valid UTF-8 aborts the conversion with an error.
+        let columns = (0..projection_count)
+            .map(|idx| {
+                let col = unsafe { &*projection.add(idx) };
+                let bytes = unsafe { slice::from_raw_parts(col.name.cast::<u8>(), col.name_len) };
+                let name = std::str::from_utf8(bytes)
+                    .map_err(|e| vortex_err!("projection column name not UTF-8: {e}"))?;
+                Ok(ProjectedColumn {
+                    name,
+                    column_id: col.column_id,
+                    is_virtual: col.is_virtual,
+                    is_projected: col.is_projected,
+                })
+            })
+            .collect::<VortexResult<Vec<_>>>()?;
+        file_reader.prepare_reader(&columns, pushed_filters)
+    });
+}
+
+unsafe extern "C-unwind" fn try_initialize_scan<T: MultiFileFunction>(
+    reader: cpp::duckdb_vx_mff_reader,
+    global: cpp::duckdb_vx_mff_global,
+    local: cpp::duckdb_vx_mff_local,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> bool {
+    let file = unsafe { reader.cast::<T::Reader>().as_ref() }.vortex_expect("reader null");
+    let shared = unsafe { global.cast::<T::GlobalState>().as_ref() }.vortex_expect("global null");
+    let worker = unsafe { local.cast::<T::LocalState>().as_mut() }.vortex_expect("local null");
+    try_or(error_out, || file.try_initialize_scan(shared, worker))
+}
+
+unsafe extern "C-unwind" fn prepare_scan<T: MultiFileFunction>(
+    reader: cpp::duckdb_vx_mff_reader,
+    global: cpp::duckdb_vx_mff_global,
+    local: cpp::duckdb_vx_mff_local,
+    error_out: *mut cpp::duckdb_vx_error,
+) {
+    let file = unsafe { reader.cast::<T::Reader>().as_ref() }.vortex_expect("reader null");
+    let shared = unsafe { global.cast::<T::GlobalState>().as_ref() }.vortex_expect("global null");
+    let worker = unsafe { local.cast::<T::LocalState>().as_mut() }.vortex_expect("local null");
+    try_or(error_out, || file.prepare_scan(shared, worker))
+}
+
+unsafe extern "C-unwind" fn scan<T: MultiFileFunction>(
+    reader: cpp::duckdb_vx_mff_reader,
+    global: cpp::duckdb_vx_mff_global,
+    local: cpp::duckdb_vx_mff_local,
+    chunk: cpp::duckdb_data_chunk,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> bool {
+    let file = unsafe { reader.cast::<T::Reader>().as_ref() }.vortex_expect("reader null");
+    let shared = unsafe { global.cast::<T::GlobalState>().as_ref() }.vortex_expect("global null");
+    let worker = unsafe { local.cast::<T::LocalState>().as_mut() }.vortex_expect("local null");
+    let output = unsafe { DataChunk::borrow_mut(chunk) };
+    // The error slot is always written: null on success, a freshly created error otherwise.
+    let outcome = file.scan(shared, worker, output);
+    let error = match &outcome {
+        Ok(()) => ptr::null_mut(),
+        Err(err) => {
+            let msg = err.to_string();
+            unsafe { cpp::duckdb_vx_error_create(msg.as_ptr().cast(), msg.len()) }
+        }
+    };
+    unsafe { error_out.write(error) };
+    outcome.is_ok()
+}
+
+unsafe extern "C-unwind" fn get_statistics<T: MultiFileFunction>(
+    reader: cpp::duckdb_vx_mff_reader,
+    name: *const std::os::raw::c_char,
+    name_len: usize,
+    stats_out: *mut cpp::duckdb_column_statistics,
+) -> bool {
+    let file = unsafe { reader.cast::<T::Reader>().as_ref() }.vortex_expect("reader null");
+    let bytes = unsafe { slice::from_raw_parts(name.cast::<u8>(), name_len) };
+    // This callback has no error out-parameter, so a name that is not valid UTF-8
+    // is reported like a missing statistic (`false`). `stats_out` is written only
+    // when `true` is returned.
+    let Some(stats) = std::str::from_utf8(bytes).ok().and_then(|n| file.get_statistics(n)) else {
+        return false;
+    };
+    write_column_statistics(stats_out, stats);
+    true
+}
+
+unsafe extern "C-unwind" fn statistics<T: MultiFileFunction>(
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    name: *const std::os::raw::c_char,
+    name_len: usize,
+    stats_out: *mut cpp::duckdb_column_statistics,
+) -> bool {
+    let bind = unsafe { bind_data.cast::<T::BindData>().as_ref() }.vortex_expect("bind_data null");
+    let bytes = unsafe { slice::from_raw_parts(name.cast::<u8>(), name_len) };
+    // A name that is not valid UTF-8 is treated as having no statistics.
+    let looked_up = std::str::from_utf8(bytes).ok().and_then(|n| T::statistics(bind, n));
+    match looked_up {
+        Some(stats) => {
+            write_column_statistics(stats_out, stats);
+            true
+        }
+        None => false,
+    }
+}
+
+fn write_column_statistics(stats_out: *mut cpp::duckdb_column_statistics, stats: ColumnStatistics) {
+    let out = unsafe { &mut *stats_out }; // unchecked deref: callers must pass a valid pointer
+    out.min = stats.min.map_or(ptr::null_mut(), |v| v.into_ptr()); // null when there is no minimum
+    out.max = stats.max.map_or(ptr::null_mut(), |v| v.into_ptr()); // null when there is no maximum
+    out.max_string_length = stats.max_string_length;
+    out.has_null = stats.has_null;
+}
+
+unsafe extern "C-unwind" fn progress_in_file<T: MultiFileFunction>(
+    reader: cpp::duckdb_vx_mff_reader,
+) -> f64 {
+    let file = unsafe { reader.cast::<T::Reader>().as_ref() };
+    file.vortex_expect("reader null").progress_in_file()
+}
+
+unsafe extern "C-unwind" fn cardinality<T: MultiFileFunction>(
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    file_count: usize,
+    out: *mut cpp::duckdb_vx_node_statistics,
+) -> bool {
+    let bind = unsafe { bind_data.cast::<T::BindData>().as_ref() }.vortex_expect("bind_data null");
+    let stats = unsafe { &mut *out };
+    // Returns `false` without touching `out` when the cardinality is unknown.
+    // An estimate sets only the estimated fields; a maximum sets the maximum
+    // fields and also reuses the same count as the estimate.
+    let (estimate, maximum) = match T::cardinality(bind, file_count) {
+        Cardinality::Unknown => return false,
+        Cardinality::Estimate(rows) => (rows, None),
+        Cardinality::Maximum(rows) => (rows, Some(rows)),
+    };
+    // The maximum fields are written first, then the estimate fields.
+    if let Some(rows) = maximum {
+        stats.has_max_cardinality = true;
+        stats.max_cardinality = rows;
+    }
+    stats.has_estimated_cardinality = true;
+    stats.estimated_cardinality = estimate;
+    true
+}
+
+unsafe extern "C-unwind" fn partition_stats<T: MultiFileFunction>(
+    ctx: cpp::duckdb_client_context,
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    file_path: *const std::os::raw::c_char,
+    path_len: usize,
+    out: *mut cpp::duckdb_vx_mff_partition_stats,
+    error_out: *mut cpp::duckdb_vx_error,
+) -> bool {
+    let client = unsafe { ClientContext::borrow(ctx) };
+    let bind = unsafe { bind_data.cast::<T::BindData>().as_ref() }.vortex_expect("bind_data null");
+    try_or(error_out, || {
+        let raw_path = unsafe { slice::from_raw_parts(file_path.cast::<u8>(), path_len) };
+        let path = std::str::from_utf8(raw_path)
+            .map_err(|e| vortex_err!("file path is not UTF-8: {e}"))?;
+        match T::partition_stats(client, bind, path)? {
+            Some(stats) => {
+                unsafe { (*out).row_count = stats.row_count };
+                Ok(true)
+            }
+            None => Ok(false), // no statistics available: `out` is left untouched
+        }
+    })
+}
+
+unsafe extern "C-unwind" fn to_string<T: MultiFileFunction>(
+    bind_data: cpp::duckdb_vx_mff_bind_data,
+    map: cpp::duckdb_vx_string_map,
+) {
+    // Forward the EXPLAIN key/value map to the trait hook; nothing is returned.
+    let bind = unsafe { bind_data.cast::<T::BindData>().as_ref() }.vortex_expect("bind_data null");
+    let entries = unsafe { DuckdbStringMap::borrow_mut(map) };
+    T::to_string(bind, entries);
+}
+
+// ---------------------------------------------------------------------------
+// Helpers for concrete `MultiFileFunction` implementations.
+// ---------------------------------------------------------------------------
+
+/// Convert `s` into an owned `CString`, e.g. for type names handed to
+/// `LogicalType` / DuckDB FFI. Input with an interior NUL yields an empty string.
+#[allow(dead_code)]
+pub(crate) fn cstring(s: &str) -> CString {
+    CString::new(s).unwrap_or_default()
+}
diff --git a/vortex-duckdb/src/duckdb/object_cache.rs b/vortex-duckdb/src/duckdb/object_cache.rs
new file mode 100644
index 00000000000..67b7840836b
--- /dev/null
+++ b/vortex-duckdb/src/duckdb/object_cache.rs
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::ffi::CStr;
+use std::ffi::CString;
+
+use vortex::error::VortexResult;
+use vortex::error::vortex_bail;
+use vortex::error::vortex_err;
+
+use crate::cpp;
+use crate::duckdb::ClientContextRef;
+use crate::duckdb::drop_boxed;
+use crate::lifetime_wrapper;
+
+lifetime_wrapper!(
+    /// A DuckDB object-cache entry handle, paired with its C++ free function.
+    ObjectCacheEntry,
+    cpp::duckdb_vx_object_cache_entry,
+    cpp::duckdb_vx_object_cache_entry_free
+);
+
+impl ObjectCacheEntryRef {
+    fn data_ptr(&self) -> *mut std::ffi::c_void {
+        unsafe { cpp::duckdb_vx_object_cache_entry_get_data(self.as_ptr()) } // type-erased payload
+    }
+}
+
+impl ClientContextRef {
+    /// Look up a value in DuckDB's per-database object cache and return a clone
+    /// of it.
+    ///
+    /// Returns `Ok(None)` when no entry exists for `key` under `object_type`, or
+    /// when the entry carries no data. `object_type` must be the same type
+    /// string that was used when the value was stored.
+    ///
+    /// # Safety
+    ///
+    /// Every object stored under the `(key, object_type)` pair must have been
+    /// inserted as the same Rust type `T`; the payload is reinterpreted as `T`.
+    pub unsafe fn object_cache_get_cloned<T: Clone>(
+        &self,
+        key: &str,
+        object_type: &CStr,
+    ) -> VortexResult<Option<T>> {
+        let c_key = cache_key(key)?;
+        let (key_ptr, key_len) = (c_key.as_ptr(), c_key.as_bytes().len());
+        let raw_entry = unsafe {
+            cpp::duckdb_vx_object_cache_get(self.as_ptr(), key_ptr, key_len, object_type.as_ptr())
+        };
+        if raw_entry.is_null() {
+            // No entry was returned for this `(key, object_type)` pair.
+            return Ok(None);
+        }
+
+        // Wrap the handle in its owned form for the remainder of this call.
+        let entry = unsafe { ObjectCacheEntry::own(raw_entry) };
+        let payload = entry.data_ptr().cast::<T>();
+
+        // A null payload maps to `None`; otherwise the value behind the pointer
+        // is cloned while `entry` is still alive.
+        Ok(unsafe { payload.as_ref() }.cloned())
+    }
+
+    /// Insert `value` into DuckDB's per-database object cache under `key` and
+    /// `object_type`. `estimated_memory` is the size in bytes reported to the
+    /// cache for eviction accounting. Fails if `key` contains a NUL byte, the
+    /// estimate does not fit `idx_t`, or DuckDB rejects the insertion.
+    pub fn object_cache_put<T: Send + Sync + 'static>(
+        &self,
+        key: &str,
+        object_type: &CStr,
+        estimated_memory: usize,
+        value: T,
+    ) -> VortexResult<()> {
+        let c_key = cache_key(key)?;
+        let reported_bytes = cpp::idx_t::try_from(estimated_memory)
+            .map_err(|_| vortex_err!("object cache memory estimate does not fit idx_t"))?;
+        // The value is boxed and passed as a raw pointer together with `drop_boxed::<T>`.
+        let status = unsafe {
+            cpp::duckdb_vx_object_cache_put(
+                self.as_ptr(),
+                c_key.as_ptr(),
+                c_key.as_bytes().len(),
+                object_type.as_ptr(),
+                reported_bytes,
+                Box::into_raw(Box::new(value)).cast(),
+                Some(drop_boxed::<T>),
+            )
+        };
+        if status != cpp::duckdb_state::DuckDBSuccess {
+            vortex_bail!("failed to store object in DuckDB object cache");
+        }
+        Ok(())
+    }
+}
+
+fn cache_key(key: &str) -> VortexResult<CString> { // C-string copy of `key`, or an error
+    CString::new(key).map_err(|_| vortex_err!("object cache key contains an interior NUL byte"))
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex::error::VortexResult;
+
+    use crate::duckdb::Database;
+
+    #[test]
+    fn object_cache_round_trip_clones_stored_value() -> VortexResult<()> {
+        // Every lookup below goes through the same client context and cache key.
+        let db = Database::open_in_memory()?;
+        let conn = db.connect()?;
+        let ctx = conn.client_context()?;
+        let key = "cache-key";
+
+        // Nothing has been stored yet, so the first lookup misses.
+        let before = unsafe { ctx.object_cache_get_cloned::<String>(key, c"vortex_test") }?;
+        assert_eq!(before, None);
+
+        ctx.object_cache_put(key, c"vortex_test", 5, String::from("value"))?;
+
+        // The same key and type string return a clone of the stored value.
+        let hit = unsafe { ctx.object_cache_get_cloned::<String>(key, c"vortex_test") }?;
+        assert_eq!(hit, Some(String::from("value")));
+
+        // The same key under a different type string does not match.
+        let miss = unsafe { ctx.object_cache_get_cloned::<String>(key, c"other_type") }?;
+        assert_eq!(miss, None);
+
+        Ok(())
+    }
+}
diff --git a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs
index 8e65d26ed6f..2c0b2c16481 100644
--- a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs
+++ b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs
@@ -3,7 +3,6 @@
 
 //! This module contains tests for the `vortex_scan` table function.
 
-use std::ffi::CStr;
 use std::io::Write;
 use std::net::TcpListener;
 use std::path::Path;
@@ -185,8 +184,7 @@ fn test_scan_function_registration() {
     let chunk = result.into_iter().next().unwrap();
     let vec = chunk.get_vector(0);
     let mut result = vec.as_slice_with_len::<duckdb_string_t>(chunk.len().as_())[0];
-    let string =
-        unsafe { CStr::from_ptr(cpp::duckdb_string_t_data(&raw mut result)).to_string_lossy() };
+    let string = String::from_duckdb_value(&mut result);
 
     assert_eq!(string, "vortex_scan");
 }
@@ -993,3 +991,407 @@ fn test_vortex_encodings_roundtrip() {
     let fixed_child_values = fixed_child.as_slice_with_len::<i32>(10); // 10 total child elements
     assert_eq!(fixed_child_values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
 }
+
+/// Smoke test: `read_vortex_v2` scans a single-column file and DuckDB can
+/// aggregate over the rows it produces.
+#[test]
+fn test_read_vortex_v2_basic() {
+    let file = RUNTIME.block_on(async {
+        write_single_column_vortex_file("number", buffer![1i32, 2, 3, 4, 5]).await
+    });
+    let file_path = file.path().to_string_lossy();
+    let sql = format!("SELECT SUM(number) FROM read_vortex_v2('{file_path}')");
+
+    let connection = database_connection();
+    let first_chunk = connection.query(&sql).unwrap().into_iter().next().unwrap();
+    let row_count = first_chunk.len().as_();
+    let sum = first_chunk.get_vector(0).as_slice_with_len::<i64>(row_count)[0];
+
+    // 1 + 2 + 3 + 4 + 5
+    assert_eq!(sum, 15);
+}
+
+/// String columns survive the `read_vortex_v2` scan: the three values come
+/// back through `string_agg` in the order the test expects.
+#[test]
+fn test_read_vortex_v2_strings() {
+    let file = RUNTIME.block_on(async {
+        let words = VarBinArray::from(vec!["alpha", "beta", "gamma"]);
+        write_single_column_vortex_file("s", words).await
+    });
+    let file_path = file.path().to_string_lossy();
+    let sql = format!("SELECT string_agg(s, ',') FROM read_vortex_v2('{file_path}')");
+
+    let connection = database_connection();
+    let mut first_chunk = connection.query(&sql).unwrap().into_iter().next().unwrap();
+    let row_count = first_chunk.len().as_();
+    let column = first_chunk.get_vector_mut(0);
+    // Copy the string handle out of the vector so it can be borrowed mutably.
+    let mut handle = unsafe { column.as_slice_mut::<duckdb_string_t>(row_count) }[0];
+
+    assert_eq!(String::from_duckdb_value(&mut handle), "alpha,beta,gamma");
+}
+
+/// A glob matching two files scans both: the sum covers all six values.
+#[test]
+fn test_read_vortex_v2_multiple_files() {
+    // Binding the returned handles keeps them alive until the end of the test.
+    let (dir, _first, _second) = RUNTIME.block_on(async {
+        let dir = tempfile::tempdir().unwrap();
+        let first = write_vortex_file_to_dir(dir.path(), "numbers", buffer![10i32, 20, 30]).await;
+        let second = write_vortex_file_to_dir(dir.path(), "numbers", buffer![40i32, 50, 60]).await;
+        (dir, first, second)
+    });
+    let glob_pattern = format!("{}/*.vortex", dir.path().display());
+    let sql = format!("SELECT SUM(numbers) FROM read_vortex_v2('{glob_pattern}')");
+
+    let connection = database_connection();
+    let first_chunk = connection.query(&sql).unwrap().into_iter().next().unwrap();
+    let row_count = first_chunk.len().as_();
+    let total = first_chunk.get_vector(0).as_slice_with_len::<i64>(row_count)[0];
+
+    // 10 + 20 + 30 + 40 + 50 + 60
+    assert_eq!(total, 210);
+}
+
+/// Filtering on a column that is not part of the projection still restricts
+/// the rows: only `payload` values whose `filter_key` is <= 2 are summed.
+#[test]
+fn test_read_vortex_v2_filters_on_unprojected_column() {
+    let file = RUNTIME.block_on(async {
+        let columns = [
+            (
+                "payload",
+                PrimitiveArray::from_iter([10i32, 20, 30, 40, 50]),
+            ),
+            // Never referenced by the query.
+            ("unused_a", PrimitiveArray::from_iter([1i32, 1, 1, 1, 1])),
+            ("unused_b", PrimitiveArray::from_iter([2i32, 2, 2, 2, 2])),
+            // Referenced only by the WHERE clause.
+            ("filter_key", PrimitiveArray::from_iter([1i32, 2, 3, 4, 5])),
+        ];
+        write_vortex_file(columns.into_iter()).await
+    });
+    let file_path = file.path().to_string_lossy();
+    let sql =
+        format!("SELECT SUM(payload) FROM read_vortex_v2('{file_path}') WHERE filter_key <= 2");
+
+    let connection = database_connection();
+    let first_chunk = connection.query(&sql).unwrap().into_iter().next().unwrap();
+    let row_count = first_chunk.len().as_();
+    let total = first_chunk.get_vector(0).as_slice_with_len::<i64>(row_count)[0];
+
+    // filter_key <= 2 keeps the first two rows: 10 + 20.
+    assert_eq!(total, 30);
+}
+
+/// A top-N query over `read_vortex_v2` must show a "Dynamic Filter" in its
+/// EXPLAIN output.
+#[test]
+fn test_read_vortex_v2_exposes_dynamic_filter_pushdown() {
+    let file = RUNTIME.block_on(async {
+        write_single_column_vortex_file("number", PrimitiveArray::from_iter(0i32..10_000)).await
+    });
+    let file_path = file.path().to_string_lossy();
+    let sql =
+        format!("EXPLAIN SELECT number FROM read_vortex_v2('{file_path}') ORDER BY number LIMIT 5");
+
+    let connection = database_connection();
+    let plan = connection.query(&sql).unwrap();
+
+    // Flatten every cell of the EXPLAIN result into one newline-separated string.
+    let mut explain = String::new();
+    for mut chunk in plan {
+        let rows = chunk.len().as_();
+        for column in 0..chunk.column_count() {
+            let cells = chunk.get_vector_mut(column);
+            for cell in unsafe { cells.as_slice_mut::<duckdb_string_t>(rows) } {
+                explain.push_str(&String::from_duckdb_value(cell));
+                explain.push('\n');
+            }
+        }
+    }
+
+    assert!(
+        explain.contains("Dynamic Filter"),
+        "expected read_vortex_v2 EXPLAIN to include a pushed dynamic filter, got:\n{explain}"
+    );
+}
+
+/// A `LIKE '%google%'` predicate over `read_vortex_v2` returns the right rows
+/// and is consumed by the scan itself: the EXPLAIN output is expected to
+/// contain neither a standalone DuckDB FILTER operator nor the rewritten
+/// `contains(URL, 'google')` expression.
+#[test]
+fn test_read_vortex_v2_pushes_complex_contains_filter() {
+    let file = RUNTIME.block_on(async {
+        let urls = VarBinArray::from(vec![
+            "https://example.com",
+            "https://www.google.com/search",
+            "https://mail.google.com",
+            "https://vortex.dev",
+        ]);
+        let event_times = PrimitiveArray::from_iter([40i32, 30, 20, 10]);
+        let columns = [
+            ("URL", urls.into_array()),
+            ("EventTime", event_times.into_array()),
+        ];
+        write_vortex_file(columns.into_iter()).await
+    });
+    let file_path = file.path().to_string_lossy();
+    let conn = database_connection();
+
+    // Correctness first: two of the four URLs contain "google".
+    let count_sql =
+        format!("SELECT COUNT(*) FROM read_vortex_v2('{file_path}') WHERE URL LIKE '%google%'");
+    let count = conn
+        .query(&count_sql)
+        .unwrap()
+        .into_iter()
+        .next()
+        .unwrap()
+        .get_vector(0)
+        .as_slice_with_len::<i64>(1)[0];
+    assert_eq!(count, 2);
+
+    // Then the plan for the same predicate underneath a top-N.
+    let explain_sql = format!(
+        "EXPLAIN SELECT * FROM read_vortex_v2('{file_path}') WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10"
+    );
+    let plan = conn.query(&explain_sql).unwrap();
+
+    // Flatten every cell of the EXPLAIN result into one newline-separated string.
+    let mut explain = String::new();
+    for mut chunk in plan {
+        let rows = chunk.len().as_();
+        for column in 0..chunk.column_count() {
+            let cells = chunk.get_vector_mut(column);
+            for cell in unsafe { cells.as_slice_mut::<duckdb_string_t>(rows) } {
+                explain.push_str(&String::from_duckdb_value(cell));
+                explain.push('\n');
+            }
+        }
+    }
+
+    // A FILTER operator is recognized by its box label in the rendered plan.
+    assert!(
+        !explain.contains("│           FILTER          │"),
+        "expected the URL contains filter to be removed from the standalone DuckDB FILTER, got:\n{explain}"
+    );
+    // The predicate text must not appear anywhere else in the plan either.
+    assert!(
+        !explain.contains("contains(URL, 'google')"),
+        "expected the URL contains filter to be fully pushed into read_vortex_v2, got:\n{explain}"
+    );
+}
+
+/// An `OR` across two columns is not consumed by the scan: the query still
+/// returns the right count, and EXPLAIN keeps a standalone DuckDB FILTER while
+/// showing no `$.a` / `$.b` expressions.
+#[test]
+fn test_read_vortex_v2_leaves_non_consumed_complex_or_filter_to_duckdb() {
+    let file = RUNTIME.block_on(async {
+        let columns = [
+            ("a", PrimitiveArray::from_iter([1i32, 2, 3, 4]).into_array()),
+            ("b", PrimitiveArray::from_iter([10i32, 20, 20, 40]).into_array()),
+        ];
+        write_vortex_file(columns.into_iter()).await
+    });
+    let file_path = file.path().to_string_lossy();
+    let conn = database_connection();
+
+    // Row 0 matches `a = 1`; rows 1 and 2 match `b = 20`.
+    let count_sql =
+        format!("SELECT COUNT(*) FROM read_vortex_v2('{file_path}') WHERE a = 1 OR b = 20");
+    let count = conn
+        .query(&count_sql)
+        .unwrap()
+        .into_iter()
+        .next()
+        .unwrap()
+        .get_vector(0)
+        .as_slice_with_len::<i64>(1)[0];
+    assert_eq!(count, 3);
+
+    // Same predicate, this time inspecting the plan.
+    let explain_sql =
+        format!("EXPLAIN SELECT * FROM read_vortex_v2('{file_path}') WHERE a = 1 OR b = 20");
+    let plan = conn.query(&explain_sql).unwrap();
+
+    // Flatten every cell of the EXPLAIN result into one newline-separated string.
+    let mut explain = String::new();
+    for mut chunk in plan {
+        let rows = chunk.len().as_();
+        for column in 0..chunk.column_count() {
+            let cells = chunk.get_vector_mut(column);
+            for cell in unsafe { cells.as_slice_mut::<duckdb_string_t>(rows) } {
+                explain.push_str(&String::from_duckdb_value(cell));
+                explain.push('\n');
+            }
+        }
+    }
+
+    // Neither column may show up as a `$.`-prefixed expression in the plan.
+    assert!(
+        !explain.contains("$.a") && !explain.contains("$.b"),
+        "expected the non-consumed OR filter to stay out of read_vortex_v2, got:\n{explain}"
+    );
+    // A FILTER operator is recognized by its box label in the rendered plan.
+    assert!(
+        explain.contains("│           FILTER          │"),
+        "expected DuckDB to keep its standalone OR filter for planning, got:\n{explain}"
+    );
+}
+
+/// Joins a 1000-row fact file with an 11-row dimension file on `k` and expects
+/// EXPLAIN to show a `k>=10` / `k<=20` range filter, i.e. the bounds of the
+/// dimension file's keys.
+#[test]
+fn test_read_vortex_v2_uses_file_stats_for_join_filter_pushdown() {
+    let fact = RUNTIME.block_on(async {
+        write_single_column_vortex_file("k", PrimitiveArray::from_iter(0i32..1000)).await
+    });
+    let dim = RUNTIME.block_on(async {
+        write_single_column_vortex_file("k", PrimitiveArray::from_iter(10i32..21)).await
+    });
+    let fact_path = fact.path().to_string_lossy();
+    let dim_path = dim.path().to_string_lossy();
+    let conn = database_connection();
+
+    // Every dimension key 10..=20 also exists in the fact file.
+    let count_sql = format!(
+        "SELECT COUNT(*) FROM read_vortex_v2('{fact_path}') fact JOIN read_vortex_v2('{dim_path}') dim USING (k)"
+    );
+    let count_chunk = conn.query(&count_sql).unwrap().into_iter().next().unwrap();
+    let count = count_chunk.get_vector(0).as_slice_with_len::<i64>(1)[0];
+    assert_eq!(count, 11);
+
+    let explain_sql = format!(
+        "EXPLAIN SELECT COUNT(*) FROM read_vortex_v2('{fact_path}') fact JOIN read_vortex_v2('{dim_path}') dim USING (k)"
+    );
+    let plan = conn.query(&explain_sql).unwrap();
+
+    // Flatten every cell of the EXPLAIN result into one newline-separated string.
+    let mut explain = String::new();
+    for mut chunk in plan {
+        let rows = chunk.len().as_();
+        for column in 0..chunk.column_count() {
+            let cells = chunk.get_vector_mut(column);
+            for cell in unsafe { cells.as_slice_mut::<duckdb_string_t>(rows) } {
+                explain.push_str(&String::from_duckdb_value(cell));
+                explain.push('\n');
+            }
+        }
+    }
+
+    // Both bounds of the dimension key range must be visible in the plan.
+    assert!(
+        explain.contains("k>=10") && explain.contains("k<=20"),
+        "expected file statistics to produce a join-derived scan filter, got:\n{explain}"
+    );
+}
+
+/// Top-N over a filtered `read_vortex_v2` scan: the two earliest "google" rows
+/// come back in `EventTime` order, and EXPLAIN shows the late-materialization
+/// shape the test expects, i.e. a `HASH_JOIN` of type `SEMI` together with a
+/// projected `file_row_number` column.
+#[test]
+fn test_read_vortex_v2_uses_late_materialization_for_top_n() {
+    let file = RUNTIME.block_on(async {
+        let urls = VarBinArray::from(vec![
+            "https://example.com",
+            "https://www.google.com/search",
+            "https://mail.google.com",
+            "https://vortex.dev",
+        ]);
+        let event_times = PrimitiveArray::from_iter([40i32, 30, 20, 10]);
+        let watch_ids = PrimitiveArray::from_iter([100i64, 200, 300, 400]);
+        let java_enabled = PrimitiveArray::from_iter([1i8, 0, 1, 0]);
+
+        let columns = [
+            ("URL", urls.into_array()),
+            ("EventTime", event_times.into_array()),
+            ("WatchID", watch_ids.into_array()),
+            ("JavaEnable", java_enabled.into_array()),
+        ];
+        write_vortex_file(columns.into_iter()).await
+    });
+
+    let file_path = file.path().to_string_lossy();
+    let conn = database_connection();
+
+    // The google rows carry EventTime 30 (www) and 20 (mail), so ascending
+    // order lists the mail URL first.
+    let top_n_sql = format!(
+        "SELECT string_agg(URL, ',') FROM (SELECT * FROM read_vortex_v2('{file_path}') WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 2)"
+    );
+    let top_n = conn.query(&top_n_sql).unwrap();
+    let mut chunk = top_n.into_iter().next().unwrap();
+    let len = chunk.len().as_();
+    let vector = chunk.get_vector_mut(0);
+    // Copy the string handle out of the vector so it can be borrowed mutably.
+    let mut value = unsafe { vector.as_slice_mut::<duckdb_string_t>(len) }[0];
+
+    assert_eq!(
+        String::from_duckdb_value(&mut value),
+        "https://mail.google.com,https://www.google.com/search"
+    );
+
+    // Now inspect the plan of the inner top-N query.
+    let explain_sql = format!(
+        "EXPLAIN SELECT * FROM read_vortex_v2('{file_path}') WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 2"
+    );
+    let plan = conn.query(&explain_sql).unwrap();
+
+    // Flatten every cell of the EXPLAIN result into one newline-separated string.
+    let mut explain = String::new();
+    for mut chunk in plan {
+        let rows = chunk.len().as_();
+        for column in 0..chunk.column_count() {
+            let cells = chunk.get_vector_mut(column);
+            for cell in unsafe { cells.as_slice_mut::<duckdb_string_t>(rows) } {
+                explain.push_str(&String::from_duckdb_value(cell));
+                explain.push('\n');
+            }
+        }
+    }
+
+    // The semi join is the operator the test treats as late materialization.
+    assert!(
+        explain.contains("HASH_JOIN") && explain.contains("SEMI"),
+        "expected read_vortex_v2 TopN plan to use a late-materialization semi join, got:\n{explain}"
+    );
+    // The row id used for that join is expected to be file_row_number.
+    assert!(
+        explain.contains("file_row_number"),
+        "expected late materialization to project file_row_number as a row id, got:\n{explain}"
+    );
+}
+
+/// Scans 32 files of 10,000 rows each with DuckDB set to eight threads and
+/// checks that the sum over the glob covers every row.
+#[test]
+fn test_read_vortex_v2_many_large_files_parallel_scan() {
+    let (tempdir, _files) = RUNTIME.block_on(async {
+        let tempdir = tempfile::tempdir().unwrap();
+        let mut files = Vec::new();
+        for file_idx in 0..32 {
+            // File `i` holds the consecutive range [i * 10_000, (i + 1) * 10_000).
+            let start = file_idx * 10_000;
+            let numbers = PrimitiveArray::from_iter(start..start + 10_000);
+            files.push(write_vortex_file_to_dir(tempdir.path(), "number", numbers).await);
+        }
+        (tempdir, files)
+    });
+    let glob_pattern = format!("{}/*.vortex", tempdir.path().display());
+    let sql = format!("SELECT SUM(number) FROM read_vortex_v2('{glob_pattern}')");
+
+    let conn = database_connection();
+    conn.query("SET threads = 8").unwrap();
+    let first_chunk = conn.query(&sql).unwrap().into_iter().next().unwrap();
+    let row_count = first_chunk.len().as_();
+    let total = first_chunk.get_vector(0).as_slice_with_len::<i64>(row_count)[0];
+
+    // Sum of 0..320_000, i.e. 319_999 * 320_000 / 2.
+    assert_eq!(total, 51_199_840_000);
+}
diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs
index 517776f5521..7c8f2943be4 100644
--- a/vortex-duckdb/src/exporter/mod.rs
+++ b/vortex-duckdb/src/exporter/mod.rs
@@ -46,6 +46,11 @@ pub struct ArrayExporter {
     /// Columns DuckDB requested to read from file. If empty, it's a zero-column
     /// projection and should be handled accordingly, see ArrayExporter::export.
     fields: Vec<Box<dyn ColumnExporter>>,
+    /// Optional sparse mapping from Vortex struct fields to DuckDB chunk
+    /// vectors. Used by DuckDB's multi-file scan when filter-only columns are
+    /// present in the intermediate chunk but not materialized by Vortex.
+    field_positions: Option<Vec<usize>>,
+    chunk_column_count: Option<usize>,
     array_len: usize,
     remaining: usize,
 }
@@ -67,6 +72,41 @@ impl ArrayExporter {
         Ok(Self {
             ctx,
             fields,
+            field_positions: None,
+            chunk_column_count: None,
+            array_len: array.len(),
+            remaining: array.len(),
+        })
+    }
+
+    /// Builds an exporter that writes field `i` of `array` into the chunk vector at
+    /// `field_positions[i]`, for output chunks holding `chunk_column_count` columns.
+    pub fn try_new_with_positions(
+        array: &StructArray,
+        cache: &ConversionCache,
+        mut ctx: ExecutionCtx,
+        field_positions: Vec<usize>,
+        chunk_column_count: usize,
+    ) -> VortexResult<Self> {
+        let validity = array.validity()?.execute_mask(array.len(), &mut ctx)?;
+        assert!(validity.all_true()); // panics if the struct itself has invalid rows
+        let fields = array
+            .iter_unmasked_fields()
+            .map(|field| new_array_exporter(field.clone(), cache, &mut ctx))
+            .collect::<VortexResult<Vec<_>>>()?;
+
+        if fields.len() != field_positions.len() {
+            vortex_bail!(
+                "Expected {} output positions (one per field), got {}",
+                fields.len(),
+                field_positions.len()
+            );
+        }
+        Ok(Self {
+            ctx,
+            fields,
+            field_positions: Some(field_positions),
+            chunk_column_count: Some(chunk_column_count),
             array_len: array.len(),
             remaining: array.len(),
         })
@@ -88,6 +128,32 @@ impl ArrayExporter {
 
         let zero_projection = self.fields.is_empty();
 
+        if let Some(field_positions) = &self.field_positions {
+            let expected_cols = self
+                .chunk_column_count
+                .vortex_expect("sparse exporter missing chunk column count");
+            let chunk_cols = chunk.column_count();
+            if chunk_cols != expected_cols {
+                vortex_bail!("Expected {expected_cols} columns in output chunk, got {chunk_cols}");
+            }
+
+            let chunk_len = duckdb_vector_size().min(self.remaining);
+            let position = self.array_len - self.remaining;
+            self.remaining -= chunk_len;
+            chunk.set_len(chunk_len);
+
+            for (field, pos) in self.fields.iter().zip(field_positions.iter().copied()) {
+                field.export(
+                    position,
+                    chunk_len,
+                    chunk.get_vector_mut(pos),
+                    &mut self.ctx,
+                )?;
+            }
+
+            return Ok(true);
+        }
+
         // file_row_number column is already populated in scan construction
         let expected_cols = self.fields.len() + file_index_column_pos.is_some() as usize;
         let chunk_cols = chunk.column_count();
diff --git a/vortex-duckdb/src/lib.rs b/vortex-duckdb/src/lib.rs
index 413a71c611e..902c501e3a0 100644
--- a/vortex-duckdb/src/lib.rs
+++ b/vortex-duckdb/src/lib.rs
@@ -22,6 +22,7 @@ use crate::duckdb::LogicalType;
 use crate::duckdb::Value;
 use crate::multi_file::VortexMultiFileScan;
 use crate::multi_file::VortexMultiFileScanList;
+use crate::multi_file_function::VortexMultiFileFunction;
 
 mod convert;
 mod datasource;
@@ -29,6 +30,7 @@ pub mod duckdb;
 mod exporter;
 mod filesystem;
 mod multi_file;
+mod multi_file_function;
 
 #[rustfmt::skip]
 #[path = "./cpp.rs"]
@@ -45,6 +47,32 @@ static RUNTIME: LazyLock<CurrentThreadRuntime> = LazyLock::new(CurrentThreadRunt
 static SESSION: LazyLock<VortexSession> =
     LazyLock::new(|| VortexSession::default().with_handle(RUNTIME.handle()));
 
+/// Returns true if the user has opted into the experimental MultiFileFunction-
+/// backed scan path via `VX_DUCKDB_MULTI_FILE_FUNCTION=1` (or `=true`, matched
+/// ASCII case-insensitively so `True` and `TRUE` are accepted as well).
+///
+/// Used to switch between the existing TableFunction-driven `read_vortex` and
+/// the new `MultiFileFunction<OP>`-driven path during benchmarking. Defaults
+/// to off so the existing scan remains the path of record.
+///
+/// Known gaps in the v2 path (compared to v1) at time of writing:
+/// - No batch parallelism within a file (`TryInitializeScan` is one-shot, so
+///   each Vortex file is scanned by a single worker).
+/// - No `union_by_name`, hive partitioning columns, or `filename` virtual
+///   column wired through.
+/// - No support for the named parameters DuckDB's `MultiFileReader` adds
+///   (`union_by_name`, `hive_partitioning`, …) — `ParseOption` returns false.
+/// - No `COPY ... FROM 'x.vortex'` via this path.
+///
+/// These are tracked as follow-up work; for now `read_vortex_v2` exists
+/// alongside `read_vortex` so orchestration paths can be benchmarked
+/// side-by-side.
+fn use_multi_file_function() -> bool {
+    // An unset or non-Unicode variable makes `var` fail, which keeps the v1 path.
+    std::env::var("VX_DUCKDB_MULTI_FILE_FUNCTION")
+        .is_ok_and(|value| value == "1" || value.eq_ignore_ascii_case("true"))
+}
+
 /// Initialize the Vortex extension by registering the extension functions.
 /// Note: This also registers extension options. If you want to register options
 /// separately (e.g., before creating connections), call `register_extension_options` first.
@@ -55,11 +83,29 @@ pub fn initialize(db: &DatabaseRef) -> VortexResult<()> {
         LogicalType::varchar(),
         Value::from("vortex"),
     )?;
-    db.register_table_function::<VortexMultiFileScan>(c"vortex_scan")?;
-    db.register_table_function::<VortexMultiFileScan>(c"read_vortex")?;
-    // Register list overloads for multi-glob scanning (e.g., read_vortex(['a.vortex', 'b.vortex']))
-    db.register_table_function::<VortexMultiFileScanList>(c"vortex_scan")?;
-    db.register_table_function::<VortexMultiFileScanList>(c"read_vortex")?;
+    db.config().add_extension_options(
+        "vortex_metadata_cache",
+        "Cache Vortex file metadata - useful when reading the same files multiple times.",
+        LogicalType::bool(),
+        Value::from(false),
+    )?;
+    if use_multi_file_function() {
+        // Replace the table-function-based scan with the MultiFileFunction<OP>
+        // path under the canonical names. Also expose under v2 names so an A/B
+        // test can run both registrations side-by-side.
+        db.register_multi_file_function::<VortexMultiFileFunction>(c"vortex_scan")?;
+        db.register_multi_file_function::<VortexMultiFileFunction>(c"read_vortex")?;
+    } else {
+        db.register_table_function::<VortexMultiFileScan>(c"vortex_scan")?;
+        db.register_table_function::<VortexMultiFileScan>(c"read_vortex")?;
+        // Register list overloads for multi-glob scanning (e.g., read_vortex(['a.vortex', 'b.vortex']))
+        db.register_table_function::<VortexMultiFileScanList>(c"vortex_scan")?;
+        db.register_table_function::<VortexMultiFileScanList>(c"read_vortex")?;
+    }
+    // Always expose the v2 path under its own name so it can be invoked
+    // explicitly without flipping the env var (useful for A/B testing within
+    // a single process).
+    db.register_multi_file_function::<VortexMultiFileFunction>(c"read_vortex_v2")?;
     db.register_copy_function::<VortexCopyFunction>(c"vortex", c"vortex")
 }
 
diff --git a/vortex-duckdb/src/multi_file.rs b/vortex-duckdb/src/multi_file.rs
index 3f99a854a22..2c4646aa08c 100644
--- a/vortex-duckdb/src/multi_file.rs
+++ b/vortex-duckdb/src/multi_file.rs
@@ -30,7 +30,7 @@ use crate::filesystem::resolve_filesystem;
 /// Accepts full URLs (e.g. `s3://bucket/prefix/*.vortex`, `file:///data/*.vortex`) as well as
 /// bare file paths. For bare paths, the path is made absolute (without requiring it to exist)
 /// so that relative paths such as `./data/*.vortex` or `../data/*.vortex` are resolved correctly.
-fn parse_glob_url(glob_url_str: &str) -> VortexResult<Url> {
+pub(crate) fn parse_glob_url(glob_url_str: &str) -> VortexResult<Url> {
     Url::parse(glob_url_str).or_else(|_| {
         let path = absolute(Path::new(glob_url_str))
             .map_err(|e| vortex_err!("Failed making {glob_url_str} absolute: {e}"))?;
diff --git a/vortex-duckdb/src/multi_file_function.rs b/vortex-duckdb/src/multi_file_function.rs
new file mode 100644
index 00000000000..bee8734aadb
--- /dev/null
+++ b/vortex-duckdb/src/multi_file_function.rs
@@ -0,0 +1,1423 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Vortex implementation of [`MultiFileFunction`].
+//!
+//! Plugs Vortex into DuckDB's `MultiFileFunction<OP>` template: cross-file
+//! orchestration (globbing, parallelism, virtual columns, hive partitioning)
+//! is handled by DuckDB. This module supplies the per-file reader using
+//! [`VortexFile`] directly so file-level statistics, dtype, and pruning are
+//! available without going through `MultiLayoutDataSource`.
+
+use std::collections::VecDeque;
+use std::ffi::CStr;
+use std::fmt::Debug;
+use std::ops::Range;
+use std::sync::atomic::AtomicBool;
+use std::sync::atomic::AtomicU64;
+use std::sync::atomic::Ordering;
+
+use itertools::Itertools;
+use parking_lot::Mutex;
+use vortex::array::ArrayRef;
+use vortex::array::Canonical;
+use vortex::array::VortexSessionExecute;
+use vortex::array::arrays::ScalarFn;
+use vortex::array::arrays::Struct;
+use vortex::array::arrays::StructArray;
+use vortex::array::arrays::scalar_fn::ScalarFnArrayExt;
+use vortex::buffer::Buffer;
+use vortex::dtype::DType;
+use vortex::dtype::FieldName;
+use vortex::dtype::FieldNames;
+use vortex::dtype::PType;
+use vortex::error::VortexResult;
+use vortex::error::vortex_err;
+use vortex::expr::Expression;
+use vortex::expr::VortexExprExt;
+use vortex::expr::and_collect;
+use vortex::expr::cast;
+use vortex::expr::col;
+use vortex::expr::merge;
+use vortex::expr::pack;
+use vortex::expr::root;
+use vortex::expr::select;
+use vortex::file::Footer;
+use vortex::file::OpenOptionsSessionExt;
+use vortex::file::VortexFile;
+use vortex::io::runtime::BlockingRuntime;
+use vortex::io::runtime::Task;
+use vortex::layout::layouts::row_idx::row_idx;
+use vortex::scalar_fn::fns::pack::Pack;
+use vortex::scan::selection::Selection;
+
+use crate::RUNTIME;
+use crate::SESSION;
+use crate::convert::try_from_bound_expression;
+use crate::convert::try_from_table_filter;
+use crate::convert::try_from_virtual_column_filter;
+use crate::cpp::DUCKDB_VX_EXPR_TYPE;
+use crate::duckdb::BaseFileReader;
+use crate::duckdb::Cardinality;
+use crate::duckdb::ClientContextRef;
+use crate::duckdb::ColumnStatistics;
+use crate::duckdb::DataChunkRef;
+use crate::duckdb::DuckdbStringMapRef;
+use crate::duckdb::ExpressionClass;
+use crate::duckdb::ExpressionRef;
+use crate::duckdb::ExtractedValue;
+use crate::duckdb::LogicalType;
+use crate::duckdb::MultiFileFunction;
+use crate::duckdb::PartitionStats;
+use crate::duckdb::ProjectedColumn;
+use crate::duckdb::SchemaBuilder;
+use crate::duckdb::TableFilterSetRef;
+use crate::duckdb::duckdb_vector_size;
+use crate::exporter::ArrayExporter;
+use crate::exporter::ConversionCache;
+use crate::filesystem::resolve_filesystem;
+use crate::multi_file::parse_glob_url;
+
+type ScanTask = Task<VortexResult<Option<ArrayRef>>>; // task type held in `VortexLocal::task` and `VortexFileReader::tasks`
+
+const VORTEX_METADATA_CACHE_SETTING: &CStr = c"vortex_metadata_cache"; // name of the metadata-cache setting
+const VORTEX_FOOTER_CACHE_TYPE: &CStr = c"vortex_footer"; // object-cache type under which `Footer` values are stored
+const DEFAULT_FOOTER_CACHE_BYTES: usize = 10 * 1024; // 10 KiB; presumably a fallback size estimate — confirm at its use site
+const FILE_ROW_NUMBER_COLUMN_ID: u64 = 9223372036854775809; // 2^63 + 1; presumably DuckDB's id for the file_row_number virtual column — confirm
+const FILE_INDEX_COLUMN_ID: u64 = 9223372036854775810; // 2^63 + 2; presumably DuckDB's id for the file_index virtual column — confirm
+
+/// Opens the single file at `path` as a [`VortexFile`]. DuckDB has already
+/// expanded any glob, so only the reader for the URL scheme is resolved here,
+/// via the filesystem chosen with the `vortex_filesystem` extension option
+/// (which lets HTTP/S3/etc. go through DuckDB's `httpfs` under `'duckdb'`).
+/// With the metadata cache enabled, a cached footer is handed to the open
+/// options and a footer that was not cached yet is stored after opening.
+fn open_vortex_file(ctx: &ClientContextRef, path: &str) -> VortexResult<VortexFile> {
+    let use_cache = vortex_metadata_cache_enabled(ctx);
+    let mut known_footer = None;
+    if use_cache {
+        // SAFETY: `vortex_footer` entries are written only by this module, and
+        // always as `Footer` values.
+        known_footer = unsafe { ctx.object_cache_get_cloned::<Footer>(path, VORTEX_FOOTER_CACHE_TYPE) }?;
+    }
+    // Only a lookup that ran and missed needs to publish the footer afterwards.
+    let store_footer = use_cache && known_footer.is_none();
+
+    let url = parse_glob_url(path)?;
+    let mut fs_root = url.clone();
+    fs_root.set_path("");
+    let fs = resolve_filesystem(&fs_root, ctx)?;
+    let options = match known_footer {
+        Some(footer) => SESSION.open_options().with_footer(footer),
+        None => SESSION.open_options(),
+    };
+
+    let file = RUNTIME.block_on(async move {
+        let reader = fs.open_read(url.path()).await?;
+        options.open(reader).await
+    })?;
+
+    if store_footer {
+        ctx.object_cache_put(
+            path,
+            VORTEX_FOOTER_CACHE_TYPE,
+            footer_cache_memory(file.footer()),
+            file.footer().clone(),
+        )?;
+    }
+    Ok(file)
+}
+
+/// Multi-file Vortex scan registered via `MultiFileFunction<OP>`.
+///
+/// Compared to [`crate::multi_file::VortexMultiFileScan`] (the table-function
+/// path), this delegates file globbing, virtual columns, and hive partitioning
+/// to DuckDB's native machinery, and reads each file via [`VortexFile`].
+#[derive(Debug)]
+pub struct VortexMultiFileFunction; // unit type; its behavior comes from the `MultiFileFunction` impl
+
+#[derive(Default)]
+pub struct VortexReaderOptions; // unit type; `create_options` builds it without reading any setting
+
+/// Bind-time data shared across all per-file readers in a query.
+#[derive(Clone, Default)]
+pub struct VortexBindData { // used as `MultiFileFunction::BindData`
+    /// Metadata and open handle for the file DuckDB selected for binding.
+    first_file: Option<BoundFirstFile>, // `None` in the derived `Default` value
+    /// Exact complex filters consumed at optimizer time. These are copied into
+    /// every per-file reader before scan planning.
+    ///
+    /// Non-consumed complex filters remain DuckDB-owned so DuckDB's cardinality
+    /// heuristics still guide planning and Vortex does not evaluate the same
+    /// predicate again inside the scan.
+    complex_filter_exprs: Vec<Expression>,
+    /// True when pushed complex filters are removed from DuckDB's plan, so
+    /// file-level row counts are no longer exact output cardinality.
+    complex_filters_change_cardinality: bool, // `false` in the derived `Default` value
+}
+
+#[derive(Clone)]
+struct BoundFirstFile { // stored in `VortexBindData::first_file`
+    path: String, // presumably the path this file was opened from — confirm at the bind site
+    file: VortexFile, // the open file handle
+    column_dtypes: Vec<(String, DType)>, // presumably (column name, dtype) pairs — confirm at the bind site
+}
+
+#[derive(Debug)]
+pub struct VortexGlobal; // unit type used as `MultiFileFunction::GlobalState`; carries no data
+
+#[derive(Default)]
+pub struct VortexLocal { // used as `MultiFileFunction::LocalState`; the derived `Default` has nothing claimed
+    /// Row range claimed under DuckDB's multi-file scheduling lock.
+    row_range: Option<Range<u64>>,
+    /// Remaining metadata-only rows for zero-projection, no-filter scans.
+    remaining_rows: u64,
+    /// Split task claimed under DuckDB's multi-file scheduling lock.
+    task: Option<ScanTask>,
+    /// Export batches being drained for this local scan assignment.
+    exporters: VecDeque<ArrayExporter>,
+}
+
+/// Per-file scan state. Holds the open [`VortexFile`] plus immutable scan
+/// configuration shared by DuckDB workers. The scan is built once after
+/// projection/filter preparation into a queue of split tasks; workers then
+/// claim those tasks into [`VortexLocal`] under DuckDB's scheduling lock.
+pub struct VortexFileReader {
+    file: VortexFile, // the open file this reader scans
+    file_idx: usize, // presumably this file's index within the multi-file scan — confirm where the reader is built
+    cache: ConversionCache, // NOTE(review): presumably handed to `ArrayExporter` construction — confirm
+    /// Projection set by [`Self::prepare_reader`]. `None` means prepare was
+    /// never called (defensive — scan all columns). `Some(empty)` is the
+    /// explicit zero-projection case (e.g. `SELECT count(*)`); the scan
+    /// produces struct arrays with no fields, and `ArrayExporter` short-
+    /// circuits on the empty fields list.
+    projection: Option<Vec<FieldName>>,
+    /// Positions in DuckDB's intermediate scan chunk for fields materialized by
+    /// the Vortex projection.
+    field_positions: Vec<usize>,
+    /// Number of columns DuckDB allocated in the intermediate scan chunk.
+    scan_column_count: usize,
+    /// Position of DuckDB's file_index virtual column in the scan chunk, if
+    /// DuckDB asks the reader to materialize it instead of filling it as a
+    /// per-file constant.
+    file_index_column_pos: Option<usize>,
+    /// Position of DuckDB's file_row_number virtual column in the scan chunk.
+    file_row_number_column_pos: Option<usize>,
+    /// File-relative row indices selected by a pushed file_row_number filter.
+    row_selection: Selection,
+    /// File-relative row range selected by a pushed file_row_number filter.
+    row_range: Option<Range<u64>>,
+    /// Filter expression set by [`Self::prepare_reader`]. None when no filters
+    /// were pushed down or when conversion failed.
+    filter: Option<Expression>,
+    /// Complex filters accepted at bind/optimizer time and applied by every
+    /// per-file scan.
+    complex_filter_exprs: Vec<Expression>,
+    /// Set when a filter has been pushed down and file-level statistics prove
+    /// the file can be skipped. Causes [`Self::try_initialize_scan`] to return
+    /// false without opening a scan iterator.
+    file_pruned: bool,
+    /// Split tasks prepared from one scan builder. DuckDB serializes
+    /// TryInitializeScan with its global lock, but the reader itself is shared,
+    /// so the queue still needs interior mutability on the Rust side.
+    tasks: Mutex<VecDeque<ScanTask>>,
+    /// Whether the single metadata-only assignment has been claimed.
+    metadata_only_claimed: AtomicBool,
+    /// Total rows in the file, cached for [`Self::progress_in_file`].
+    total_rows: u64,
+    /// Rows produced so far. Bumped after each chunk in [`Self::scan`].
+    rows_scanned: AtomicU64,
+}
+
+impl Debug for VortexFileReader {
+    /// Prints a compact summary (file index, row count, queued split tasks)
+    /// and marks the remaining fields as elided.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let row_count = self.file.row_count();
+        // The queue lock is held only long enough to read its length.
+        let remaining_tasks = self.tasks.lock().len();
+
+        let mut summary = f.debug_struct("VortexFileReader");
+        summary.field("file_idx", &self.file_idx);
+        summary.field("row_count", &row_count);
+        summary.field("remaining_tasks", &remaining_tasks);
+        summary.finish_non_exhaustive()
+    }
+}
+
+impl MultiFileFunction for VortexMultiFileFunction {
+    type ReaderOptions = VortexReaderOptions;
+    type BindData = VortexBindData;
+    type GlobalState = VortexGlobal;
+    type LocalState = VortexLocal;
+    type Reader = VortexFileReader;
+
+    const FILTER_PUSHDOWN: bool = true;
+    const FILTER_PRUNE: bool = true;
+
+    /// Creates the reader options for a scan. `VortexReaderOptions` is built
+    /// as a plain marker value, so the client context is not consulted.
+    fn create_options(_ctx: &ClientContextRef) -> VortexResult<Self::ReaderOptions> {
+        Ok(VortexReaderOptions)
+    }
+
+    /// Creates empty bind data; the reader options are consumed but carry
+    /// nothing that needs to be copied over.
+    fn initialize_bind_data(_options: Self::ReaderOptions) -> VortexResult<Self::BindData> {
+        Ok(VortexBindData::default())
+    }
+
+    /// Offers a bound DuckDB expression for pushdown into the Vortex scan.
+    ///
+    /// Only expressions that contain a string `contains` call are considered.
+    /// Returns `Ok(true)` when the expression was converted and recorded in
+    /// `bind_data`, `Ok(false)` when it was left for DuckDB to evaluate.
+    /// Conversion errors are propagated.
+    fn pushdown_complex_filter(
+        bind_data: &mut Self::BindData,
+        expr: &ExpressionRef,
+    ) -> VortexResult<bool> {
+        if contains_string_filter(expr)
+            && let Some(converted) = try_from_bound_expression(expr)?
+        {
+            // An accepted filter means row counts can no longer be derived
+            // from file metadata alone.
+            bind_data.complex_filters_change_cardinality = true;
+            bind_data.complex_filter_exprs.push(converted);
+            return Ok(true);
+        }
+        Ok(false)
+    }
+
+    /// Discovers the scan schema from the first file and remembers that file
+    /// in `bind_data` so it can be reused later instead of being reopened.
+    ///
+    /// Fails when the file cannot be opened, when its top-level dtype is not a
+    /// struct, or when a field dtype has no DuckDB logical type.
+    fn bind_reader(
+        ctx: &ClientContextRef,
+        bind_data: &mut Self::BindData,
+        first_file: &str,
+        schema: &mut SchemaBuilder,
+    ) -> VortexResult<()> {
+        // The file is opened through whichever filesystem the user picked via
+        // the `vortex_filesystem` extension option.
+        let file = open_vortex_file(ctx, first_file)?;
+        let Some(fields) = file.dtype().as_struct_fields_opt() else {
+            return Err(vortex_err!(
+                "Vortex file must contain a struct array at the top level"
+            ));
+        };
+        for (name, field_dtype) in fields.names().iter().zip(fields.fields()) {
+            schema.add_column(name.as_ref(), &LogicalType::try_from(&field_dtype)?);
+        }
+
+        let column_dtypes = column_dtypes(&file);
+        bind_data.first_file = Some(BoundFirstFile {
+            path: first_file.to_owned(),
+            column_dtypes,
+            file,
+        });
+        Ok(())
+    }
+
+    /// Creates the global scan state. `VortexGlobal` is built as a plain
+    /// marker value; neither the context nor the bind data is read.
+    fn init_global(
+        _ctx: &ClientContextRef,
+        _bind_data: &Self::BindData,
+    ) -> VortexResult<Self::GlobalState> {
+        Ok(VortexGlobal)
+    }
+
+    /// Creates an empty per-worker scan state; work is assigned to it later by
+    /// `try_initialize_scan`.
+    fn init_local(_global: &Self::GlobalState) -> Self::LocalState {
+        VortexLocal::default()
+    }
+
+    /// Opens `file_path` (reusing the file bound at bind time when the path
+    /// matches) and wraps it in an unprepared [`VortexFileReader`].
+    ///
+    /// Projection, filters and split tasks stay at their empty defaults until
+    /// `prepare_reader` runs; only the complex filters accepted at bind time
+    /// are copied in here.
+    fn create_reader(
+        ctx: &ClientContextRef,
+        _global: &Self::GlobalState,
+        bind_data: &Self::BindData,
+        file_path: &str,
+        file_idx: usize,
+    ) -> VortexResult<Self::Reader> {
+        let (file, _column_dtypes) = open_or_reuse_vortex_file(ctx, bind_data, file_path)?;
+        let cache = ConversionCache {
+            file_index: file_idx,
+            ..Default::default()
+        };
+        // Read the row count before `file` is moved into the reader.
+        let total_rows = file.row_count();
+
+        Ok(VortexFileReader {
+            // Identity of the file being scanned.
+            file,
+            file_idx,
+            cache,
+            total_rows,
+            // Scan shape; filled in by `prepare_reader`.
+            projection: None,
+            field_positions: Vec::new(),
+            scan_column_count: 0,
+            file_index_column_pos: None,
+            file_row_number_column_pos: None,
+            // Row and predicate filters.
+            row_selection: Selection::All,
+            row_range: None,
+            filter: None,
+            complex_filter_exprs: bind_data.complex_filter_exprs.clone(),
+            file_pruned: false,
+            // Work queue and progress tracking.
+            tasks: Mutex::new(VecDeque::new()),
+            metadata_only_claimed: AtomicBool::new(false),
+            rows_scanned: AtomicU64::new(0),
+        })
+    }
+
+    /// Estimates the scan's row count from the first file's row count.
+    ///
+    /// Unknown when a complex filter was pushed down, when there are no files,
+    /// or when no first file was bound. With exactly one file the first file's
+    /// row count is reported as a maximum; otherwise it is multiplied by the
+    /// file count (saturating) and reported as an estimate.
+    fn cardinality(bind_data: &Self::BindData, file_count: usize) -> Cardinality {
+        if bind_data.complex_filters_change_cardinality {
+            return Cardinality::Unknown;
+        }
+        let Some(first) = bind_data.first_file.as_ref() else {
+            return Cardinality::Unknown;
+        };
+        let rows_per_file = first.file.row_count();
+        match file_count {
+            0 => Cardinality::Unknown,
+            1 => Cardinality::Maximum(rows_per_file),
+            many => Cardinality::Estimate(rows_per_file.saturating_mul(many as u64)),
+        }
+    }
+
+    /// Reports the row count of `file_path` without opening it, when known.
+    ///
+    /// Returns `Ok(None)` when a complex filter was pushed down, when the
+    /// metadata cache setting is off, or when no footer is cached for the
+    /// path. The file bound at bind time is answered directly from bind data.
+    fn partition_stats(
+        ctx: &ClientContextRef,
+        bind_data: &Self::BindData,
+        file_path: &str,
+    ) -> VortexResult<Option<PartitionStats>> {
+        if bind_data.complex_filters_change_cardinality {
+            return Ok(None);
+        }
+
+        let bound_first = bind_data
+            .first_file
+            .as_ref()
+            .filter(|first| first.path == file_path);
+        if let Some(first) = bound_first {
+            let row_count = first.file.row_count();
+            return Ok(Some(PartitionStats { row_count }));
+        }
+
+        if !vortex_metadata_cache_enabled(ctx) {
+            return Ok(None);
+        }
+
+        // SAFETY: this module is the only writer for `vortex_footer` entries,
+        // and it stores exactly `Footer` values for this object type.
+        let cached_footer =
+            unsafe { ctx.object_cache_get_cloned::<Footer>(file_path, VORTEX_FOOTER_CACHE_TYPE) }?;
+        Ok(cached_footer.map(|footer| PartitionStats {
+            row_count: footer.row_count(),
+        }))
+    }
+
+    /// Returns column statistics for `name` taken from the file bound at bind
+    /// time, or `None` when no file was bound or it has no such statistics.
+    fn statistics(bind_data: &Self::BindData, name: &str) -> Option<ColumnStatistics> {
+        bind_data
+            .first_file
+            .as_ref()
+            .and_then(|first| column_statistics_for_file(&first.file, name))
+    }
+
+    /// Populates the key/value description of this scan: the function name,
+    /// plus the pushed complex filters when there are any.
+    fn to_string(bind_data: &Self::BindData, map: &mut DuckdbStringMapRef) {
+        map.push("Function", "Vortex Multi-File Scan");
+        if bind_data.complex_filter_exprs.is_empty() {
+            return;
+        }
+        // One filter per line, each line ending in ` /\`.
+        let rendered = bind_data
+            .complex_filter_exprs
+            .iter()
+            .map(|expr| format!("{expr}"))
+            .join(" /\\\n");
+        map.push("Filters", &rendered);
+    }
+}
+
+impl BaseFileReader<VortexGlobal, VortexLocal> for VortexFileReader {
+    /// Records the projection and pushed filters for this file and prepares
+    /// the work queue that workers later claim from.
+    ///
+    /// `projection` is the ordered column list DuckDB uses for the scan chunk;
+    /// `filters` is keyed by position in that list. All derived state,
+    /// including the pruning decision, is recomputed from scratch on every
+    /// call so that a repeated preparation never inherits state from an
+    /// earlier one.
+    ///
+    /// Returns an error when a filter index does not fit `usize`, when a
+    /// filter cannot be converted, or when pruning / scan construction fails.
+    fn prepare_reader(
+        &mut self,
+        projection: &[ProjectedColumn<'_>],
+        filters: Option<&TableFilterSetRef>,
+    ) -> VortexResult<()> {
+        // Capture the physical projection in scan chunk order, excluding
+        // DuckDB virtual columns that Vortex must synthesize separately.
+        // `Some(empty)` is the explicit zero-projection case (e.g. SELECT
+        // count(*)) and is handled when the scan starts.
+        let mut proj = Vec::new();
+        let mut physical_field_positions = Vec::new();
+        let mut physical_by_projection = Vec::with_capacity(projection.len());
+        let mut file_index_column_pos = None;
+        let mut file_row_number_column_pos = None;
+        for (column_pos, column) in projection.iter().enumerate() {
+            if column.is_virtual {
+                if column.is_projected {
+                    match column.column_id {
+                        FILE_INDEX_COLUMN_ID => file_index_column_pos = Some(column_pos),
+                        FILE_ROW_NUMBER_COLUMN_ID => file_row_number_column_pos = Some(column_pos),
+                        _ => {}
+                    }
+                }
+                physical_by_projection.push(None);
+            } else {
+                let name = FieldName::from(column.name);
+                physical_by_projection.push(Some(name.clone()));
+                if column.is_projected {
+                    physical_field_positions.push(column_pos);
+                    proj.push(name);
+                }
+            }
+        }
+        // The synthesized file_row_number field is merged in front of the
+        // selected fields by the scan projection, so its position comes first.
+        let mut field_positions = Vec::with_capacity(
+            physical_field_positions.len() + file_row_number_column_pos.is_some() as usize,
+        );
+        if let Some(pos) = file_row_number_column_pos {
+            field_positions.push(pos);
+        }
+        field_positions.extend(physical_field_positions);
+        // Build a Vortex filter expression from DuckDB's table filters and
+        // complex filters. DuckDB keys multi-file filters by position in
+        // BaseFileReader::column_ids; the C++ adapter passes that same ordered
+        // list here as `projection`.
+        let mut pieces = self.complex_filter_exprs.clone();
+        let mut row_selection = Selection::All;
+        let mut row_range = None;
+        // Tracked locally and stored only after every filter converted, so a
+        // stale flag from an earlier call (or a failed call) cannot leak in.
+        let mut file_pruned = false;
+        if let Some(filters) = filters {
+            let dtype = self.file.dtype();
+            for (idx, filter) in filters.into_iter() {
+                let idx = usize::try_from(idx)
+                    .map_err(|_| vortex_err!("filter column index does not fit usize"))?;
+                let Some(column) = projection.get(idx) else {
+                    continue;
+                };
+                if column.is_virtual {
+                    match column.column_id {
+                        FILE_ROW_NUMBER_COLUMN_ID => {
+                            let (selection, range) = try_from_virtual_column_filter(filter)?;
+                            row_selection = selection;
+                            row_range = range;
+                        }
+                        FILE_INDEX_COLUMN_ID => {
+                            if !file_filter_matches(filter, self.file_idx)? {
+                                file_pruned = true;
+                            }
+                        }
+                        _ => {}
+                    }
+                    continue;
+                }
+                let Some(name) = physical_by_projection.get(idx).and_then(Option::as_ref) else {
+                    continue;
+                };
+                if let Some(expr) = try_from_table_filter(filter, &col(name.as_ref()), dtype)? {
+                    pieces.push(expr);
+                }
+            }
+        }
+        normalize_row_filter(&mut row_selection, &mut row_range);
+        let filter = and_collect(pieces);
+        // A filtered scan that projects nothing still has to read the columns
+        // the filter references.
+        if proj.is_empty()
+            && file_row_number_column_pos.is_none()
+            && let Some(filter) = &filter
+        {
+            proj = filter_field_projection(filter, self.file.dtype());
+        }
+
+        self.file_index_column_pos = file_index_column_pos;
+        self.file_row_number_column_pos = file_row_number_column_pos;
+        self.field_positions = field_positions;
+        self.scan_column_count = projection.len();
+        self.row_selection = row_selection;
+        self.row_range = row_range;
+        self.filter = filter;
+        self.projection = Some(proj);
+        self.file_pruned = file_pruned;
+
+        // File-level pruning: if the filter combined with the file's stored
+        // statistics proves no row can match, skip this file entirely.
+        if !self.file_pruned
+            && let Some(filter) = &self.filter
+            && self.file.can_prune(filter)?
+        {
+            self.file_pruned = true;
+        }
+        self.metadata_only_claimed.store(false, Ordering::Release);
+        let tasks = if self.file_pruned || self.metadata_only_count() {
+            VecDeque::new()
+        } else {
+            self.build_scan_tasks()?
+        };
+        // A real scan that yields no split tasks has nothing to read either.
+        if !self.file_pruned && !self.metadata_only_count() && tasks.is_empty() {
+            self.file_pruned = true;
+        }
+        self.tasks = Mutex::new(tasks);
+        Ok(())
+    }
+
+    /// Tries to hand the calling worker its next unit of work for this file.
+    ///
+    /// Returns `Ok(true)` when `local` now holds an assignment: either the
+    /// single metadata-only row range or one split task. Returns `Ok(false)`
+    /// when the file is pruned or no work is left.
+    fn try_initialize_scan(
+        &self,
+        _global: &VortexGlobal,
+        local: &mut VortexLocal,
+    ) -> VortexResult<bool> {
+        if self.file_pruned {
+            return Ok(false);
+        }
+
+        // Drop whatever was left over from the previous assignment.
+        local.exporters.clear();
+        local.task = None;
+        local.remaining_rows = 0;
+
+        if !self.metadata_only_count() {
+            let claimed = self.tasks.lock().pop_front();
+            return Ok(match claimed {
+                Some(task) => {
+                    local.row_range = None;
+                    local.task = Some(task);
+                    true
+                }
+                None => false,
+            });
+        }
+
+        // Metadata-only scans have exactly one assignment per file; the swap
+        // lets only the first caller claim it.
+        let already_claimed = self.metadata_only_claimed.swap(true, Ordering::AcqRel);
+        if already_claimed {
+            return Ok(false);
+        }
+        local.row_range = Some(0..self.total_rows);
+        Ok(true)
+    }
+
+    /// Clears per-assignment drain state (queued exporters and the pending
+    /// row counter) before scanning. The claimed task and row range are left
+    /// untouched.
+    fn prepare_scan(&self, _global: &VortexGlobal, local: &mut VortexLocal) -> VortexResult<()> {
+        local.exporters.clear();
+        local.remaining_rows = 0;
+        Ok(())
+    }
+
+    /// Fills `chunk` with the next batch of rows for this worker's assignment.
+    ///
+    /// Metadata-only and row-count-only scans are delegated to their dedicated
+    /// helpers. Otherwise the queued exporters are drained, and the claimed
+    /// split task is materialized into new exporters once the queue runs dry.
+    /// The chunk length is set to 0 when neither exporters nor a task remain.
+    fn scan(
+        &self,
+        _global: &VortexGlobal,
+        local: &mut VortexLocal,
+        chunk: &mut DataChunkRef,
+    ) -> VortexResult<()> {
+        if self.metadata_only_count() {
+            return self.scan_metadata_only(local, chunk);
+        }
+        if self.row_count_only() {
+            // Resolve the claimed task into a row count once, then emit
+            // empty-column chunks until that count is used up.
+            if local.remaining_rows == 0 && local.task.is_some() {
+                self.prepare_count_rows(local)?;
+            }
+            return self.scan_remaining_rows(local, chunk);
+        }
+
+        loop {
+            // Drain the in-flight split arrays if we have any.
+            while let Some(exporter) = local.exporters.front_mut() {
+                let has_more = exporter.export(
+                    chunk,
+                    self.file_index_column_pos,
+                    self.file_row_number_column_pos,
+                )?;
+                if has_more {
+                    // file_index is constant for the whole file, so it is set
+                    // here rather than by the exporter.
+                    if let Some(pos) = self.file_index_column_pos {
+                        chunk
+                            .get_vector_mut(pos)
+                            .reference_value(&crate::duckdb::Value::from(self.file_idx as u64));
+                    }
+                    self.rows_scanned.fetch_add(chunk.len(), Ordering::Relaxed);
+                    return Ok(());
+                }
+                // This exporter reported no more data; move on to the next.
+                local.exporters.pop_front();
+            }
+
+            // Exporters are exhausted: materialize the claimed task (if any)
+            // and go around again to drain what it produced.
+            if local.task.is_some() {
+                self.prepare_exporters(local)?;
+                continue;
+            }
+
+            // Nothing left for this assignment.
+            chunk.set_len(0);
+            return Ok(());
+        }
+    }
+
+    /// Returns file-level statistics for column `name` of this reader's file,
+    /// or `None` when they are unavailable.
+    fn get_statistics(&self, name: &str) -> Option<ColumnStatistics> {
+        column_statistics_for_file(&self.file, name)
+    }
+
+    /// Reports scan progress for this file as a percentage in `[0, 100]`,
+    /// based on rows produced so far. An empty file counts as complete.
+    fn progress_in_file(&self) -> f64 {
+        match self.total_rows {
+            0 => 100.0,
+            total_rows => {
+                let rows_scanned = self.rows_scanned.load(Ordering::Relaxed);
+                let fraction = rows_scanned as f64 / total_rows as f64;
+                (fraction * 100.0).clamp(0.0, 100.0)
+            }
+        }
+    }
+}
+
+/// Looks up the file-level statistics of column `name` in `file` and converts
+/// them to [`ColumnStatistics`]. Returns `None` when the file has no
+/// statistics or no entry for that column.
+fn column_statistics_for_file(file: &VortexFile, name: &str) -> Option<ColumnStatistics> {
+    let file_stats = file.file_stats()?;
+    file_stats
+        .get_by_name(file.dtype(), name)
+        .map(|(stats_set, dtype)| make_column_statistics(stats_set, dtype))
+}
+
+/// Returns true when `expr` is a `contains` function call, or a NOT operator
+/// or conjunction with at least one child for which this holds (checked
+/// recursively). Every other expression class yields false.
+fn contains_string_filter(expr: &ExpressionRef) -> bool {
+    let Some(class) = expr.as_class() else {
+        return false;
+    };
+    match class {
+        ExpressionClass::BoundFunction(func) => func.scalar_function.name() == "contains",
+        ExpressionClass::BoundOperator(op) => {
+            // Only NOT is looked through; other operators are rejected.
+            op.op == DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_OPERATOR_NOT
+                && op.children().any(contains_string_filter)
+        }
+        ExpressionClass::BoundConjunction(conj) => conj.children().any(contains_string_filter),
+        _ => false,
+    }
+}
+
+/// Evaluates a pushed `file_index` filter against a single file index.
+///
+/// The filter is converted to a selection plus an optional range; the file
+/// matches only when it satisfies both. Errors come from the index conversion
+/// or from converting the filter.
+fn file_filter_matches(
+    filter: &crate::duckdb::TableFilterRef,
+    file_idx: usize,
+) -> VortexResult<bool> {
+    let file_idx =
+        u64::try_from(file_idx).map_err(|_| vortex_err!("file index does not fit u64"))?;
+    let (selection, range) = try_from_virtual_column_filter(filter)?;
+
+    if let Some(range) = range.as_ref()
+        && !range.contains(&file_idx)
+    {
+        return Ok(false);
+    }
+
+    Ok(match selection {
+        Selection::All => true,
+        // The index-based variants are probed with a binary search.
+        Selection::IncludeByIndex(indices) => indices.as_slice().binary_search(&file_idx).is_ok(),
+        Selection::ExcludeByIndex(indices) => indices.as_slice().binary_search(&file_idx).is_err(),
+        Selection::IncludeRoaring(indices) => indices.contains(file_idx),
+        Selection::ExcludeRoaring(indices) => !indices.contains(file_idx),
+    })
+}
+
+/// Folds a row range into an include-by-index selection.
+///
+/// When both a range and an `IncludeByIndex` selection are present, the
+/// selection is narrowed to the indices inside the range and the range is
+/// cleared. Any other combination is left untouched.
+fn normalize_row_filter(selection: &mut Selection, range: &mut Option<Range<u64>>) {
+    let (Some(bounds), Selection::IncludeByIndex(indices)) = (range.clone(), &*selection) else {
+        return;
+    };
+    let kept: Buffer<u64> = indices
+        .iter()
+        .copied()
+        .filter(|idx| bounds.contains(idx))
+        .collect();
+    *selection = Selection::IncludeByIndex(kept);
+    *range = None;
+}
+
+/// Lists the top-level fields of `dtype` that `filter` references, in the
+/// dtype's own field order. Returns an empty list when `dtype` is not a
+/// struct.
+fn filter_field_projection(filter: &Expression, dtype: &DType) -> Vec<FieldName> {
+    let referenced = filter.field_references();
+    let Some(fields) = dtype.as_struct_fields_opt() else {
+        return Vec::new();
+    };
+    let mut projected = Vec::new();
+    for name in fields.names().iter() {
+        if referenced.contains(name) {
+            projected.push(name.clone());
+        }
+    }
+    projected
+}
+
+impl VortexFileReader {
+    /// True when the scan can be answered from the row count alone: an
+    /// explicit empty projection, no file_row_number column and no filter.
+    fn metadata_only_count(&self) -> bool {
+        let projects_nothing = matches!(self.projection.as_deref(), Some([]));
+        projects_nothing && self.file_row_number_column_pos.is_none() && self.filter.is_none()
+    }
+
+    /// True when DuckDB wants no output columns but a filter is present, so
+    /// only the number of surviving rows matters. The projection is non-empty
+    /// in this case because it holds the fields the filter needs.
+    fn row_count_only(&self) -> bool {
+        let projects_some_fields = matches!(self.projection.as_deref(), Some([_, ..]));
+        self.scan_column_count == 0
+            && self.file_row_number_column_pos.is_none()
+            && self.field_positions.is_empty()
+            && self.filter.is_some()
+            && projects_some_fields
+    }
+
+    /// Emits the next column-less chunk for a row-count-only scan.
+    ///
+    /// The chunk length is the smaller of the DuckDB vector size and the rows
+    /// still pending in `local`; a length of 0 signals that nothing is left.
+    fn scan_remaining_rows(
+        &self,
+        local: &mut VortexLocal,
+        chunk: &mut DataChunkRef,
+    ) -> VortexResult<()> {
+        let pending = local.remaining_rows;
+        if pending == 0 {
+            chunk.set_len(0);
+            return Ok(());
+        }
+
+        // A pending count too large for usize is simply capped by the vector
+        // size below.
+        let chunk_len = usize::try_from(pending)
+            .unwrap_or(usize::MAX)
+            .min(duckdb_vector_size());
+        chunk.reset();
+        chunk.set_len(chunk_len);
+
+        let emitted = chunk_len as u64;
+        local.remaining_rows = pending - emitted;
+        self.rows_scanned.fetch_add(emitted, Ordering::Relaxed);
+        Ok(())
+    }
+
+    /// Resolves the claimed split task into a pending row count.
+    ///
+    /// Blocks on the task and stores the length of the array it produced, or
+    /// 0 when it produced none. Does nothing when no task is claimed.
+    fn prepare_count_rows(&self, local: &mut VortexLocal) -> VortexResult<()> {
+        if let Some(task) = local.task.take() {
+            let produced = RUNTIME.block_on(task)?;
+            local.remaining_rows = produced.map_or(0, |array| array.len() as u64);
+        }
+        Ok(())
+    }
+
+    /// Resolves the claimed split task into the exporter queue for `local`.
+    ///
+    /// Blocks on the task; a task that yields no array leaves the queue
+    /// empty. Does nothing when no task is claimed.
+    fn prepare_exporters(&self, local: &mut VortexLocal) -> VortexResult<()> {
+        let task = match local.task.take() {
+            Some(task) => task,
+            None => return Ok(()),
+        };
+        local.exporters = match RUNTIME.block_on(task)? {
+            Some(array) => make_exporters(
+                array,
+                &self.cache,
+                self.field_positions.clone(),
+                self.scan_column_count,
+            )?,
+            None => VecDeque::new(),
+        };
+        Ok(())
+    }
+
+    /// Emits the next chunk of a metadata-only scan.
+    ///
+    /// The claimed row range is converted into a pending row count on first
+    /// use, then handed out in vector-sized chunks. When DuckDB asked for the
+    /// file_index column it is filled with this file's index. A chunk length
+    /// of 0 signals that the assignment is exhausted.
+    fn scan_metadata_only(
+        &self,
+        local: &mut VortexLocal,
+        chunk: &mut DataChunkRef,
+    ) -> VortexResult<()> {
+        if local.remaining_rows == 0 {
+            match local.row_range.take() {
+                Some(claimed) => {
+                    local.remaining_rows = claimed.end.saturating_sub(claimed.start);
+                }
+                None => {
+                    chunk.set_len(0);
+                    return Ok(());
+                }
+            }
+        }
+
+        let pending = local.remaining_rows;
+        if pending == 0 {
+            chunk.set_len(0);
+            return Ok(());
+        }
+
+        let chunk_len = usize::try_from(pending)
+            .unwrap_or(usize::MAX)
+            .min(duckdb_vector_size());
+        chunk.reset();
+        chunk.set_len(chunk_len);
+
+        let emitted = chunk_len as u64;
+        local.remaining_rows = pending - emitted;
+        self.rows_scanned.fetch_add(emitted, Ordering::Relaxed);
+
+        if let Some(pos) = self.file_index_column_pos {
+            let file_index = crate::duckdb::Value::from(self.file_idx as u64);
+            chunk.get_vector_mut(pos).reference_value(&file_index);
+        }
+        Ok(())
+    }
+
+    /// Builds the scan for this file from the prepared projection, row range,
+    /// selection and filter, and spawns every resulting split on the runtime.
+    ///
+    /// Returns the spawned tasks in the order the scan produced them.
+    fn build_scan_tasks(&self) -> VortexResult<VecDeque<ScanTask>> {
+        let mut builder = self.file.scan()?;
+
+        // With no projection recorded (prepare not called) the builder keeps
+        // its default of all columns. Otherwise an explicit `select` is
+        // applied, including the empty case for SELECT count(*), so the
+        // resulting struct arrays contain exactly the columns DuckDB expects.
+        if let Some(names) = &self.projection {
+            let selected = select(FieldNames::from_iter(names.iter().cloned()), root());
+            let projection = match self.file_row_number_column_pos {
+                Some(_) => {
+                    // Prepend the row index, cast to non-nullable i64, as a
+                    // `file_row_number` field.
+                    let row_number = cast(row_idx(), DType::Primitive(PType::I64, false.into()));
+                    let row_number_struct = pack([("file_row_number", row_number)], false.into());
+                    merge([row_number_struct, selected])
+                }
+                None => selected,
+            };
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(row_range) = &self.row_range {
+            builder = builder.with_row_range(row_range.clone());
+        }
+        builder = builder.with_selection(self.row_selection.clone());
+        if let Some(filter) = &self.filter {
+            builder = builder.with_filter(filter.clone());
+        }
+
+        let handle = RUNTIME.handle();
+        let splits = builder.build()?;
+        Ok(splits
+            .into_iter()
+            .map(|split| handle.spawn(split))
+            .collect())
+    }
+}
+
+/// Convert the next array off the scan stream into a [`StructArray`] suitable
+/// for [`ArrayExporter`].
+///
+/// Three shapes are handled: an array that already is a struct, a scalar
+/// function array whose function is `Pack` (rebuilt into a struct from its
+/// children without executing it), and anything else, which is executed to
+/// canonical form and then converted into a struct.
+fn make_exporter(
+    array: ArrayRef,
+    cache: &ConversionCache,
+    field_positions: Vec<usize>,
+    scan_column_count: usize,
+) -> VortexResult<ArrayExporter> {
+    let mut ctx = SESSION.create_execution_ctx();
+    let struct_array: StructArray = if let Some(s) = array.as_opt::<Struct>() {
+        s.into_owned()
+    } else if let Some(array) = array.as_opt::<ScalarFn>()
+        && let Some(pack_options) = array.scalar_fn().as_opt::<Pack>()
+    {
+        // A pack expression already lists the field names and child arrays,
+        // so the struct can be assembled directly.
+        StructArray::new(
+            pack_options.names.clone(),
+            array.children(),
+            array.len(),
+            pack_options.nullability.into(),
+        )
+    } else {
+        array.execute::<Canonical>(&mut ctx)?.into_struct()
+    };
+    // The execution context is handed over to the exporter.
+    ArrayExporter::try_new_with_positions(
+        &struct_array,
+        cache,
+        ctx,
+        field_positions,
+        scan_column_count,
+    )
+}
+
+/// Builds one [`ArrayExporter`] per array yielded by `array`'s iterator,
+/// preserving order. Stops at the first iteration or construction error.
+fn make_exporters(
+    array: ArrayRef,
+    cache: &ConversionCache,
+    field_positions: Vec<usize>,
+    scan_column_count: usize,
+) -> VortexResult<VecDeque<ArrayExporter>> {
+    let mut exporters = VecDeque::new();
+    for part in array.to_array_iterator() {
+        let exporter = make_exporter(part?, cache, field_positions.clone(), scan_column_count)?;
+        exporters.push_back(exporter);
+    }
+    Ok(exporters)
+}
+
+/// Returns the `(name, dtype)` pairs of the file's top-level struct fields,
+/// in field order. Empty when the file's dtype is not a struct.
+fn column_dtypes(file: &VortexFile) -> Vec<(String, DType)> {
+    let Some(fields) = file.dtype().as_struct_fields_opt() else {
+        return Vec::new();
+    };
+    let mut pairs = Vec::new();
+    for (name, dtype) in fields.names().iter().zip(fields.fields()) {
+        pairs.push((name.to_string(), dtype));
+    }
+    pairs
+}
+
+/// Returns the file for `file_path` together with its column dtypes, reusing
+/// the file bound at bind time when the path matches and opening it through
+/// `ctx` otherwise.
+fn open_or_reuse_vortex_file(
+    ctx: &ClientContextRef,
+    bind_data: &VortexBindData,
+    file_path: &str,
+) -> VortexResult<(VortexFile, Vec<(String, DType)>)> {
+    match cached_first_file(bind_data, file_path) {
+        Some(reused) => Ok(reused),
+        None => {
+            let file = open_vortex_file(ctx, file_path)?;
+            let dtypes = column_dtypes(&file);
+            Ok((file, dtypes))
+        }
+    }
+}
+
+/// Returns clones of the bound first file and its column dtypes when
+/// `file_path` is exactly the path recorded at bind time; `None` otherwise.
+fn cached_first_file(
+    bind_data: &VortexBindData,
+    file_path: &str,
+) -> Option<(VortexFile, Vec<(String, DType)>)> {
+    match bind_data.first_file.as_ref() {
+        Some(first) if first.path == file_path => {
+            Some((first.file.clone(), first.column_dtypes.clone()))
+        }
+        _ => None,
+    }
+}
+
+/// True only when the metadata cache setting is present and holds boolean
+/// `true`. A missing setting or any other value counts as disabled.
+fn vortex_metadata_cache_enabled(ctx: &ClientContextRef) -> bool {
+    let Some(value) = ctx.try_get_current_setting(VORTEX_METADATA_CACHE_SETTING) else {
+        return false;
+    };
+    matches!(value.extract(), ExtractedValue::Boolean(true))
+}
+
+/// Returns the footer's approximate size in bytes, falling back to
+/// `DEFAULT_FOOTER_CACHE_BYTES` when the footer cannot report one.
+/// NOTE(review): presumably used as the memory estimate for footer cache
+/// entries; confirm against the caller.
+fn footer_cache_memory(footer: &Footer) -> usize {
+    footer
+        .approx_byte_size()
+        .unwrap_or(DEFAULT_FOOTER_CACHE_BYTES)
+}
+
+/// Build a [`ColumnStatistics`] from a Vortex `StatsSet`. Handles the shared
+/// shape (min/max/has_null/max_string_length); same logic as the existing
+/// `datasource.rs` path.
+///
+/// Only exact statistics are used. Inexact or missing min/max become `None`,
+/// a missing size becomes 0, and a missing null count is treated as "may
+/// contain nulls" (still limited by the dtype's nullability).
+fn make_column_statistics(
+    stats_set: &vortex::array::stats::StatsSet,
+    dtype: &DType,
+) -> ColumnStatistics {
+    use vortex::expr::stats::Precision;
+    use vortex::expr::stats::Stat;
+    use vortex::scalar::Scalar;
+
+    use crate::convert::ToDuckDBScalar;
+
+    // Converts an exact min/max statistic to a DuckDB scalar; any failure
+    // along the way simply drops the bound.
+    let exact_bound = |stat: Stat| match stats_set.get(stat) {
+        Some(Precision::Exact(value)) => Scalar::try_new(dtype.clone(), Some(value))
+            .ok()
+            .and_then(|scalar| scalar.try_to_duckdb_scalar().ok()),
+        _ => None,
+    };
+    let min = exact_bound(Stat::Min);
+    let max = exact_bound(Stat::Max);
+
+    // NOTE(review): the top bit is OR-ed into the exact size; presumably a
+    // flag understood by the consumer of `max_string_length` — confirm.
+    let max_string_length =
+        if let Some(Precision::Exact(size)) = stats_set.get(Stat::UncompressedSizeInBytes) {
+            size.as_primitive()
+                .as_u64()
+                .map_or(0, |bytes| (1u64 << 63) | bytes)
+        } else {
+            0
+        };
+
+    let may_contain_null = match stats_set.get(Stat::NullCount) {
+        Some(Precision::Exact(count)) => count
+            .as_primitive()
+            .as_u64()
+            .map_or(true, |nulls| nulls > 0),
+        _ => true,
+    };
+    let has_null = may_contain_null && dtype.is_nullable();
+
+    ColumnStatistics {
+        min,
+        max,
+        max_string_length,
+        has_null,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use vortex::array::IntoArray;
+    use vortex::array::arrays::ChunkedArray;
+    use vortex::array::arrays::DictArray;
+    use vortex::array::arrays::PrimitiveArray;
+    use vortex::array::arrays::StructArray;
+    use vortex::array::arrays::VarBinViewArray;
+    use vortex::array::stream::ArrayStreamAdapter;
+    use vortex::expr::lit;
+    use vortex::expr::lt_eq;
+    use vortex::file::WriteOptionsSessionExt;
+    use vortex::layout::layouts::chunked::writer::ChunkedLayoutStrategy;
+    use vortex::layout::layouts::flat::writer::FlatLayoutStrategy;
+
+    use super::*;
+    use crate::cpp::DUCKDB_TYPE;
+    use crate::duckdb::DataChunk;
+
+    /// Two workers claiming from the same reader must each receive their own
+    /// split task, and each must be able to scan rows from it independently.
+    #[test]
+    fn try_initialize_scan_assigns_independent_splits() -> VortexResult<()> {
+        let (_temp_file, file) = write_chunked_struct_file(4, 16)?;
+        let total_rows = file.row_count();
+        // Start from the same unprepared state `create_reader` produces.
+        let mut reader = VortexFileReader {
+            file,
+            file_idx: 0,
+            cache: ConversionCache::default(),
+            projection: None,
+            field_positions: vec![],
+            scan_column_count: 0,
+            file_index_column_pos: None,
+            file_row_number_column_pos: None,
+            row_selection: Selection::All,
+            row_range: None,
+            filter: None,
+            complex_filter_exprs: vec![],
+            file_pruned: false,
+            tasks: Mutex::new(VecDeque::new()),
+            metadata_only_claimed: AtomicBool::new(false),
+            total_rows,
+            rows_scanned: AtomicU64::new(0),
+        };
+        let projection = [ProjectedColumn {
+            name: "number",
+            column_id: 0,
+            is_virtual: false,
+            is_projected: true,
+        }];
+        reader.prepare_reader(&projection, None)?;
+        let task_count = reader.tasks.lock().len();
+        assert!(
+            task_count > 1,
+            "test file should expose multiple layout splits"
+        );
+
+        // Each claim must pop exactly one task off the shared queue.
+        let global = VortexGlobal;
+        let mut first_local = VortexLocal::default();
+        let mut second_local = VortexLocal::default();
+        assert!(reader.try_initialize_scan(&global, &mut first_local)?);
+        assert!(reader.try_initialize_scan(&global, &mut second_local)?);
+        assert!(first_local.task.is_some());
+        assert!(second_local.task.is_some());
+        assert_eq!(reader.tasks.lock().len(), task_count - 2);
+        // prepare_scan must leave the claimed task in place and not build
+        // any exporters yet.
+        reader.prepare_scan(&global, &mut first_local)?;
+        reader.prepare_scan(&global, &mut second_local)?;
+        assert!(first_local.task.is_some());
+        assert!(second_local.task.is_some());
+        assert!(first_local.exporters.is_empty());
+        assert!(second_local.exporters.is_empty());
+
+        // The first scan call consumes the task and produces rows.
+        let mut chunk = DataChunk::new([LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER)]);
+        reader.scan(&global, &mut first_local, &mut chunk)?;
+        assert!(first_local.task.is_none());
+        assert!(!chunk.is_empty());
+        reader.scan(&global, &mut second_local, &mut chunk)?;
+        assert!(second_local.task.is_none());
+        assert!(!chunk.is_empty());
+
+        Ok(())
+    }
+
+    /// A column listed for the scan but not marked as projected must not be
+    /// read by the Vortex projection, while still counting toward the scan
+    /// chunk's column count.
+    #[test]
+    fn prepare_reader_does_not_project_filter_only_columns() -> VortexResult<()> {
+        let (_temp_file, file) = write_two_column_struct_file(64)?;
+        let total_rows = file.row_count();
+        // Start from the same unprepared state `create_reader` produces.
+        let mut reader = VortexFileReader {
+            file,
+            file_idx: 0,
+            cache: ConversionCache::default(),
+            projection: None,
+            field_positions: vec![],
+            scan_column_count: 0,
+            file_index_column_pos: None,
+            file_row_number_column_pos: None,
+            row_selection: Selection::All,
+            row_range: None,
+            filter: None,
+            complex_filter_exprs: vec![],
+            file_pruned: false,
+            tasks: Mutex::new(VecDeque::new()),
+            metadata_only_claimed: AtomicBool::new(false),
+            total_rows,
+            rows_scanned: AtomicU64::new(0),
+        };
+        // `filter_key` is present in the column list but not projected;
+        // `payload` is the only column DuckDB wants materialized.
+        let projection = [
+            ProjectedColumn {
+                name: "filter_key",
+                column_id: 0,
+                is_virtual: false,
+                is_projected: false,
+            },
+            ProjectedColumn {
+                name: "payload",
+                column_id: 1,
+                is_virtual: false,
+                is_projected: true,
+            },
+        ];
+
+        reader.prepare_reader(&projection, None)?;
+
+        let projected = reader
+            .projection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .map(|name| name.as_ref())
+            .collect::<Vec<_>>();
+        // Only `payload` is read, and it lands at its original chunk position.
+        assert_eq!(projected, ["payload"]);
+        assert_eq!(reader.field_positions, [1]);
+        assert_eq!(reader.scan_column_count, 2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn filtered_count_star_projects_filter_fields_for_row_counts() -> VortexResult<()> { // a zero-column scan with a filter must still read the filter's column
+        let (_temp_file, file) = write_two_column_struct_file(64)?;
+        let total_rows = file.row_count();
+        let mut reader = VortexFileReader {
+            file,
+            file_idx: 0,
+            cache: ConversionCache::default(),
+            projection: None,
+            field_positions: vec![],
+            scan_column_count: 0,
+            file_index_column_pos: None,
+            file_row_number_column_pos: None,
+            row_selection: Selection::All,
+            row_range: None,
+            filter: None,
+            complex_filter_exprs: vec![lt_eq(col("filter_key"), lit(1i32))], // filter_key is 0..64, so only 0 and 1 pass
+            file_pruned: false,
+            tasks: Mutex::new(VecDeque::new()),
+            metadata_only_claimed: AtomicBool::new(false),
+            total_rows,
+            rows_scanned: AtomicU64::new(0),
+        };
+        let projection: [ProjectedColumn<'_>; 0] = []; // no output columns, as in COUNT(*)
+
+        reader.prepare_reader(&projection, None)?;
+
+        let projected = reader
+            .projection
+            .as_ref()
+            .unwrap()
+            .iter()
+            .map(|name| name.as_ref())
+            .collect::<Vec<_>>();
+        assert_eq!(projected, ["filter_key"]); // the filter's column is still read
+        assert!(reader.field_positions.is_empty());
+        assert_eq!(reader.scan_column_count, 0);
+
+        let global = VortexGlobal;
+        let mut rows_seen = 0;
+        loop { // repeat until the reader reports no further scan to initialize
+            let mut local = VortexLocal::default();
+            if !reader.try_initialize_scan(&global, &mut local)? {
+                break;
+            }
+            reader.prepare_scan(&global, &mut local)?;
+            let mut chunk = DataChunk::new([]);
+            loop {
+                reader.scan(&global, &mut local, &mut chunk)?;
+                if chunk.is_empty() {
+                    break;
+                }
+                rows_seen += chunk.len();
+            }
+        }
+        assert_eq!(rows_seen, 2); // rows with filter_key 0 and 1
+
+        Ok(())
+    }
+
+    #[test]
+    fn count_star_uses_one_metadata_only_assignment() -> VortexResult<()> { // an unfiltered zero-column scan should be claimed exactly once
+        let (_temp_file, file) = write_chunked_struct_file(4, 16_384)?; // 4 chunks of 16_384 rows each
+        let total_rows = file.row_count();
+        let mut reader = VortexFileReader {
+            file,
+            file_idx: 0,
+            cache: ConversionCache::default(),
+            projection: None,
+            field_positions: vec![],
+            scan_column_count: 0,
+            file_index_column_pos: None,
+            file_row_number_column_pos: None,
+            row_selection: Selection::All,
+            row_range: None,
+            filter: None,
+            complex_filter_exprs: vec![],
+            file_pruned: false,
+            tasks: Mutex::new(VecDeque::new()),
+            metadata_only_claimed: AtomicBool::new(false),
+            total_rows,
+            rows_scanned: AtomicU64::new(0),
+        };
+        let projection: [ProjectedColumn<'_>; 0] = []; // no output columns and no filter
+        reader.prepare_reader(&projection, None)?;
+        assert!(reader.tasks.lock().is_empty()); // no per-split tasks are expected to be queued
+
+        let global = VortexGlobal;
+        let mut local = VortexLocal::default();
+        assert!(reader.try_initialize_scan(&global, &mut local)?);
+        reader.prepare_scan(&global, &mut local)?;
+        assert!(local.task.is_none());
+        assert!(
+            !reader.try_initialize_scan(&global, &mut VortexLocal::default())?, // a second claim must be refused
+            "COUNT(*) should not claim one assignment per layout split"
+        );
+
+        let mut chunk = DataChunk::new([]);
+        let mut rows_seen = 0;
+        loop {
+            reader.scan(&global, &mut local, &mut chunk)?;
+            if chunk.is_empty() {
+                break;
+            }
+            rows_seen += chunk.len();
+            assert!(local.task.is_none());
+        }
+        assert_eq!(rows_seen, total_rows); // the single assignment still reports every row
+
+        Ok(())
+    }
+
+    #[test]
+    fn cardinality_uses_first_file_row_count_from_bind_data() -> VortexResult<()> { // cardinality is derived from the first bound file's row count
+        let (_temp_file, file) = write_chunked_struct_file(1, 42)?; // one chunk, 42 rows
+        let bind_data = VortexBindData {
+            first_file: Some(BoundFirstFile {
+                path: "first.vortex".to_string(),
+                column_dtypes: column_dtypes(&file),
+                file,
+            }),
+            complex_filter_exprs: vec![],
+            complex_filters_change_cardinality: false,
+        };
+
+        let Cardinality::Maximum(42) = VortexMultiFileFunction::cardinality(&bind_data, 1) else {
+            panic!("single-file cardinality should be exact maximum");
+        };
+        let Cardinality::Estimate(126) = VortexMultiFileFunction::cardinality(&bind_data, 3) else { // 3 files x 42 rows
+            panic!("multi-file cardinality should estimate from first file row count");
+        };
+
+        let bind_data = VortexBindData::default(); // presumably has no `first_file` recorded
+        let Cardinality::Unknown = VortexMultiFileFunction::cardinality(&bind_data, 3) else {
+            panic!("cardinality should be unknown before bind_reader records row count");
+        };
+
+        Ok(())
+    }
+
+    #[test]
+    fn cardinality_is_unknown_with_pushed_complex_filters() -> VortexResult<()> { // the result depends on the `complex_filters_change_cardinality` flag
+        let (_temp_file, file) = write_chunked_struct_file(1, 42)?;
+        let bind_data = VortexBindData {
+            first_file: Some(BoundFirstFile {
+                path: "first.vortex".to_string(),
+                column_dtypes: column_dtypes(&file),
+                file,
+            }),
+            complex_filter_exprs: vec![root()],
+            complex_filters_change_cardinality: false, // a filter is present but flagged as not changing cardinality
+        };
+
+        let Cardinality::Maximum(42) = VortexMultiFileFunction::cardinality(&bind_data, 1) else {
+            panic!("duplicate pushed filters should preserve exact file cardinality");
+        };
+
+        let bind_data = VortexBindData {
+            first_file: bind_data.first_file,
+            complex_filter_exprs: bind_data.complex_filter_exprs,
+            complex_filters_change_cardinality: true, // same file and filters, only the flag differs
+        };
+
+        let Cardinality::Unknown = VortexMultiFileFunction::cardinality(&bind_data, 1) else {
+            panic!("cardinality should be unknown after consuming a complex filter in Vortex");
+        };
+
+        Ok(())
+    }
+
+    #[test]
+    fn partition_stats_are_disabled_with_pushed_complex_filters() -> VortexResult<()> { // no partition stats once a cardinality-changing filter is pushed
+        let (_temp_file, file) = write_chunked_struct_file(1, 42)?;
+        let db = crate::duckdb::Database::open_in_memory()?;
+        let conn = db.connect()?;
+        let ctx = conn.client_context()?;
+        let bind_data = VortexBindData {
+            first_file: Some(BoundFirstFile {
+                path: "first.vortex".to_string(),
+                column_dtypes: column_dtypes(&file),
+                file,
+            }),
+            complex_filter_exprs: vec![root()],
+            complex_filters_change_cardinality: true,
+        };
+
+        assert!(
+            VortexMultiFileFunction::partition_stats(ctx, &bind_data, "first.vortex")?.is_none(), // queried with the same path as the bound first file
+            "physical file row counts are pre-filter and cannot be exact scan output stats"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn cached_first_file_matches_only_bind_path() -> VortexResult<()> { // the bound first file is reused only for its own path
+        let (_temp_file, file) = write_chunked_struct_file(1, 16)?; // one chunk, 16 rows
+        let bind_data = VortexBindData {
+            first_file: Some(BoundFirstFile {
+                path: "first.vortex".to_string(),
+                column_dtypes: column_dtypes(&file),
+                file,
+            }),
+            complex_filter_exprs: vec![],
+            complex_filters_change_cardinality: false,
+        };
+
+        let Some((cached, cached_dtypes)) = cached_first_file(&bind_data, "first.vortex") else {
+            panic!("expected first file to be reused for matching path");
+        };
+        assert_eq!(cached.row_count(), 16);
+        assert_eq!(cached_dtypes.len(), 1); // the helper writes a single "number" column
+        assert!(cached_first_file(&bind_data, "other.vortex").is_none()); // any other path gets no cached file
+
+        Ok(())
+    }
+
+    #[test]
+    fn make_exporters_unwraps_chunked_struct_batches_before_export() -> VortexResult<()> { // each chunk gets its own exporter and keeps its dictionary encoding
+        let first = struct_with_dict_strings(["a", "b"], [0u32, 1, 0])?; // dictionary values, then codes
+        let second = struct_with_dict_strings(["c", "d"], [1u32, 0, 1])?;
+        let dtype = first.dtype().clone();
+        let array = ChunkedArray::try_new(vec![first.into_array(), second.into_array()], dtype)?
+            .into_array();
+
+        let mut exporters = make_exporters(array, &ConversionCache::default(), vec![0], 1)?;
+
+        assert_eq!(exporters.len(), 2); // one exporter per chunk
+
+        let mut first_exporter = exporters.pop_front().unwrap();
+        let mut chunk = DataChunk::new([LogicalType::varchar()]);
+        assert!(first_exporter.export(&mut chunk, None, None)?);
+        let display = String::try_from(&*chunk)?;
+
+        assert!(
+            display.contains("DICTIONARY VARCHAR"), // checked through the chunk's textual rendering
+            "expected dictionary export, got:\n{display}"
+        );
+
+        Ok(())
+    }
+
+    fn write_chunked_struct_file( // test helper: writes `chunk_count` chunks of one "number" column to a temp .vortex file and reopens it
+        chunk_count: usize,
+        rows_per_chunk: i32,
+    ) -> VortexResult<(tempfile::NamedTempFile, VortexFile)> {
+        RUNTIME.block_on(async {
+            let temp_file = tempfile::Builder::new().suffix(".vortex").tempfile()?;
+            let chunks = (0..chunk_count)
+                .map(|chunk_idx| {
+                    let chunk_idx = i32::try_from(chunk_idx)
+                        .map_err(|_| vortex_err!("chunk index does not fit i32"))?;
+                    let start = chunk_idx * rows_per_chunk; // chunks hold consecutive, non-overlapping value ranges
+                    let numbers = PrimitiveArray::from_iter(start..start + rows_per_chunk);
+                    StructArray::from_fields(&[("number", numbers.into_array())])
+                        .map(IntoArray::into_array)
+                })
+                .collect::<VortexResult<Vec<_>>>()?;
+            let dtype = chunks[0].dtype().clone(); // NOTE(review): panics when chunk_count == 0
+            let stream = futures::stream::iter(chunks.into_iter().map(Ok));
+            let stream = ArrayStreamAdapter::new(dtype, stream);
+
+            let mut writer = async_fs::File::create(&temp_file).await?;
+            SESSION
+                .write_options()
+                .with_strategy(Arc::new(ChunkedLayoutStrategy::new(
+                    FlatLayoutStrategy::default(),
+                )))
+                .write(&mut writer, stream)
+                .await?;
+            drop(writer); // release the write handle before the file is reopened below
+
+            let file = SESSION.open_options().open_path(temp_file.path()).await?;
+            Ok((temp_file, file)) // the temp file handle is returned alongside the opened file
+        })
+    }
+
+    fn write_two_column_struct_file( // test helper: writes one chunk with columns `filter_key` and `payload` to a temp .vortex file and reopens it
+        rows: i32,
+    ) -> VortexResult<(tempfile::NamedTempFile, VortexFile)> {
+        RUNTIME.block_on(async {
+            let temp_file = tempfile::Builder::new().suffix(".vortex").tempfile()?;
+            let filter_key = PrimitiveArray::from_iter(0..rows); // 0, 1, ..., rows - 1
+            let payload = PrimitiveArray::from_iter((0..rows).map(|value| value * 10)); // ten times the matching filter_key
+            let chunk = StructArray::from_fields(&[
+                ("filter_key", filter_key.into_array()),
+                ("payload", payload.into_array()),
+            ])?
+            .into_array();
+            let dtype = chunk.dtype().clone();
+            let stream = futures::stream::iter([Ok(chunk)]);
+            let stream = ArrayStreamAdapter::new(dtype, stream);
+
+            let mut writer = async_fs::File::create(&temp_file).await?;
+            SESSION
+                .write_options()
+                .with_strategy(Arc::new(ChunkedLayoutStrategy::new(
+                    FlatLayoutStrategy::default(),
+                )))
+                .write(&mut writer, stream)
+                .await?;
+            drop(writer); // release the write handle before the file is reopened below
+
+            let file = SESSION.open_options().open_path(temp_file.path()).await?;
+            Ok((temp_file, file)) // the temp file handle is returned alongside the opened file
+        })
+    }
+
+    fn struct_with_dict_strings<const N: usize>( // test helper: one-field struct ("s") backed by a DictArray of strings
+        values: [&str; 2],
+        codes: [u32; N],
+    ) -> VortexResult<StructArray> {
+        let values = VarBinViewArray::from_iter_str(values).into_array(); // the two dictionary entries
+        let codes = PrimitiveArray::from_iter(codes).into_array(); // one code per row
+        let strings = DictArray::new(codes, values).into_array();
+        StructArray::from_fields(&[("s", strings)])
+    }
+}
diff --git a/vortex-file/public-api.lock b/vortex-file/public-api.lock
index a20ab092751..a96459bf37d 100644
--- a/vortex-file/public-api.lock
+++ b/vortex-file/public-api.lock
@@ -140,6 +140,8 @@ pub fn vortex_file::FileStatistics::from_flatbuffer<'a>(&vortex_flatbuffers::foo
 
 pub fn vortex_file::FileStatistics::get(&self, usize) -> (&vortex_array::stats::stats_set::StatsSet, &vortex_array::dtype::DType)
 
+pub fn vortex_file::FileStatistics::get_by_name(&self, &vortex_array::dtype::DType, &str) -> core::option::Option<(&vortex_array::stats::stats_set::StatsSet, &vortex_array::dtype::DType)>
+
 pub fn vortex_file::FileStatistics::new(alloc::sync::Arc<[vortex_array::stats::stats_set::StatsSet]>, alloc::sync::Arc<[vortex_array::dtype::DType]>) -> Self
 
 pub fn vortex_file::FileStatistics::new_with_dtype(alloc::sync::Arc<[vortex_array::stats::stats_set::StatsSet]>, &vortex_array::dtype::DType) -> Self
diff --git a/vortex-file/src/footer/file_statistics.rs b/vortex-file/src/footer/file_statistics.rs
index 4fac3ad8482..5abd80339d6 100644
--- a/vortex-file/src/footer/file_statistics.rs
+++ b/vortex-file/src/footer/file_statistics.rs
@@ -146,6 +146,20 @@ impl FileStatistics {
     pub fn get(&self, field_idx: usize) -> (&StatsSet, &DType) {
         (&self.stats[field_idx], &self.dtypes[field_idx])
     }
+
+    /// Returns the statistics and data type for a struct field, looked up by name.
+    ///
+    /// This is a convenience for callers that key columns by name (e.g. DuckDB's
+    /// `BaseFileReader::GetStatistics`). Requires `file_dtype` to be a struct so
+    /// that field names can be matched against the stats indices.
+    ///
+    /// Returns `None` if `file_dtype` is not a struct, if `name` does not match a
+    /// field, or if no statistics are stored at the matched field's index.
+    pub fn get_by_name(&self, file_dtype: &DType, name: &str) -> Option<(&StatsSet, &DType)> {
+        let fields = file_dtype.as_struct_fields_opt()?;
+        let idx = fields.names().iter().position(|n| n.as_ref() == name)?;
+        Some((self.stats.get(idx)?, self.dtypes.get(idx)?))
+    }
 }
 
 impl<'a> IntoIterator for &'a FileStatistics {
diff --git a/vortex-layout/public-api.lock b/vortex-layout/public-api.lock
index c0f3acec787..e527b895861 100644
--- a/vortex-layout/public-api.lock
+++ b/vortex-layout/public-api.lock
@@ -1120,6 +1120,8 @@ pub vortex_layout::scan::split_by::SplitBy::RowCount(usize)
 
 impl vortex_layout::scan::split_by::SplitBy
 
+pub const fn vortex_layout::scan::split_by::SplitBy::layout() -> Self
+
 pub fn vortex_layout::scan::split_by::SplitBy::splits(&self, &dyn vortex_layout::LayoutReader, &core::ops::range::Range<u64>, &[vortex_array::dtype::field_mask::FieldMask]) -> vortex_error::VortexResult<alloc::collections::btree::set::BTreeSet<u64>>
 
 impl core::clone::Clone for vortex_layout::scan::split_by::SplitBy
diff --git a/vortex-layout/src/layouts/row_idx/mod.rs b/vortex-layout/src/layouts/row_idx/mod.rs
index 08700f89a41..a5793c12678 100644
--- a/vortex-layout/src/layouts/row_idx/mod.rs
+++ b/vortex-layout/src/layouts/row_idx/mod.rs
@@ -27,15 +27,24 @@ use vortex_array::dtype::Nullability;
 use vortex_array::dtype::PType;
 use vortex_array::expr::ExactExpr;
 use vortex_array::expr::Expression;
+use vortex_array::expr::and_collect;
+use vortex_array::expr::forms::conjuncts;
 use vortex_array::expr::is_root;
 use vortex_array::expr::root;
 use vortex_array::expr::transform::PartitionedExpr;
 use vortex_array::expr::transform::partition;
 use vortex_array::expr::transform::replace;
 use vortex_array::scalar::PValue;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::fns::binary::Binary;
+use vortex_array::scalar_fn::fns::list_contains::ListContains;
+use vortex_array::scalar_fn::fns::literal::Literal;
+use vortex_array::scalar_fn::fns::operators::Operator;
+use vortex_buffer::Buffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_mask::Mask;
+use vortex_scan::selection::Selection;
 use vortex_sequence::Sequence;
 use vortex_sequence::SequenceArray;
 use vortex_session::VortexSession;
@@ -117,6 +126,227 @@ impl RowIdxLayoutReader {
     }
 }
 
+pub(crate) struct ExtractedRowIdxFilter { // output of `extract_row_idx_filter`
+    pub(crate) filter: Option<Expression>, // conjuncts that were not converted, recombined with `and_collect`
+    pub(crate) selection: Selection, // row-index constraints from equality / list-membership conjuncts
+    pub(crate) row_range: Option<Range<u64>>, // row-index bounds from `<`, `<=`, `>`, `>=` conjuncts
+}
+
+pub(crate) fn extract_row_idx_filter( // splits `filter` into row-index constraints plus a residual filter
+    filter: &Expression,
+    row_offset: u64,
+    row_count: u64,
+) -> ExtractedRowIdxFilter {
+    let mut remaining = Vec::new();
+    let mut selection = Selection::All;
+    let mut row_range = None;
+
+    for conjunct in conjuncts(filter) {
+        match extract_row_idx_conjunct(&conjunct, row_offset, row_count) {
+            Some(RowIdxFilterPart::Selection(indices)) => {
+                intersect_selection(&mut selection, indices);
+            }
+            Some(RowIdxFilterPart::Range(range)) => {
+                intersect_row_range(&mut row_range, range);
+            }
+            None => remaining.push(conjunct), // not a recognised row_idx predicate: keep it as a filter
+        }
+    }
+
+    normalize_selection_and_range(&mut selection, &mut row_range); // fold the range into the index selection where possible
+
+    ExtractedRowIdxFilter {
+        filter: and_collect(remaining),
+        selection,
+        row_range,
+    }
+}
+
+enum RowIdxFilterPart { // what a single row_idx conjunct translates to
+    Selection(Buffer<u64>), // explicit row indices, relative to `row_offset`
+    Range(Range<u64>), // half-open row range, relative to `row_offset`
+}
+
+fn extract_row_idx_conjunct( // tries the binary-comparison form first, then the list-membership form
+    expr: &Expression,
+    row_offset: u64,
+    row_count: u64,
+) -> Option<RowIdxFilterPart> {
+    extract_row_idx_binary(expr, row_offset, row_count)
+        .or_else(|| extract_row_idx_in_list(expr, row_offset, row_count)) // None if neither form matches
+}
+
+fn extract_row_idx_binary( // converts a comparison between row_idx and a literal into a selection or range
+    expr: &Expression,
+    row_offset: u64,
+    row_count: u64,
+) -> Option<RowIdxFilterPart> {
+    let operator = *expr.as_opt::<Binary>()?; // not a binary expression: no match
+    let (operator, scalar) = if expr.child(0).is::<RowIdx>() { // row_idx <op> literal
+        (operator, expr.child(1).as_opt::<Literal>()?)
+    } else if expr.child(1).is::<RowIdx>() { // literal <op> row_idx: mirror the operator
+        (swap_operator(operator)?, expr.child(0).as_opt::<Literal>()?)
+    } else {
+        return None;
+    };
+
+    let Some(value) = literal_to_u64(scalar)? else { // inner None (presumably a null literal): select nothing
+        return Some(RowIdxFilterPart::Selection(empty_indices()));
+    };
+
+    match operator {
+        Operator::Eq => Some(RowIdxFilterPart::Selection(Buffer::from_iter(
+            relative_index(value, row_offset, row_count), // zero or one index
+        ))),
+        Operator::Gt => Some(RowIdxFilterPart::Range(relative_range(
+            value.saturating_add(1)..u64::MAX, // strictly greater: start one past `value`
+            row_offset,
+            row_count,
+        ))),
+        Operator::Gte => Some(RowIdxFilterPart::Range(relative_range(
+            value..u64::MAX,
+            row_offset,
+            row_count,
+        ))),
+        Operator::Lt => Some(RowIdxFilterPart::Range(relative_range(
+            0..value,
+            row_offset,
+            row_count,
+        ))),
+        Operator::Lte => Some(RowIdxFilterPart::Range(relative_range(
+            0..value.saturating_add(1), // inclusive bound expressed as a half-open range
+            row_offset,
+            row_count,
+        ))),
+        _ => None, // other operators are not converted
+    }
+}
+
+fn extract_row_idx_in_list( // converts `ListContains(<literal list>, row_idx)` into an index selection
+    expr: &Expression,
+    row_offset: u64,
+    row_count: u64,
+) -> Option<RowIdxFilterPart> {
+    expr.as_opt::<ListContains>()?;
+
+    if !expr.child(1).is::<RowIdx>() { // child 1 must be row_idx; child 0 is read as the literal list
+        return None;
+    }
+
+    let list = expr.child(0).as_opt::<Literal>()?.as_list_opt()?;
+    let mut indices = Vec::new();
+    for scalar in list.elements()? {
+        let Some(value) = literal_to_u64(&scalar)? else { // outer None abandons the conversion entirely
+            continue; // inner None: this element contributes no index
+        };
+        indices.extend(relative_index(value, row_offset, row_count)); // values outside the window are dropped
+    }
+    indices.sort_unstable(); // ascending and deduplicated, as `intersect_sorted` assumes
+    indices.dedup();
+
+    Some(RowIdxFilterPart::Selection(Buffer::from_iter(indices)))
+}
+
+fn swap_operator(operator: Operator) -> Option<Operator> { // mirrors a comparison so `lit <op> row_idx` reads as `row_idx <op'> lit`
+    Some(match operator {
+        Operator::Eq => Operator::Eq, // equality is symmetric
+        Operator::Gt => Operator::Lt,
+        Operator::Gte => Operator::Lte,
+        Operator::Lt => Operator::Gt,
+        Operator::Lte => Operator::Gte,
+        _ => return None, // every other operator is unsupported here
+    })
+}
+
+fn literal_to_u64(scalar: &Scalar) -> Option<Option<u64>> { // outer None: `scalar` is not a primitive, or `as_opt` yields None
+    scalar.as_primitive_opt()?.as_opt::<u64>() // NOTE(review): inner None presumably means a null value; confirm against `as_opt`
+}
+
+/// Rebases absolute row `value` onto `row_offset`; `None` when it lies outside the window.
+fn relative_index(value: u64, row_offset: u64, row_count: u64) -> Option<u64> {
+    let window_end = row_offset.saturating_add(row_count);
+    let in_window = row_offset <= value && value < window_end;
+    in_window.then(|| value - row_offset)
+}
+
+/// Clamps an absolute row range to the window `[row_offset, row_offset + row_count)`
+/// and rebases it onto `row_offset`; an empty intersection is returned as `0..0`.
+fn relative_range(range: Range<u64>, row_offset: u64, row_count: u64) -> Range<u64> {
+    let window_end = row_offset.saturating_add(row_count);
+    let lo = std::cmp::max(range.start, row_offset);
+    let hi = std::cmp::min(range.end, window_end);
+    if lo < hi {
+        return (lo - row_offset)..(hi - row_offset);
+    }
+    0..0
+}
+
+fn empty_indices() -> Buffer<u64> { // zero-length index buffer
+    Buffer::from_iter(std::iter::empty::<u64>())
+}
+
+fn intersect_selection(selection: &mut Selection, indices: Buffer<u64>) { // narrows `selection` to rows also listed in `indices`
+    match selection {
+        Selection::All => {
+            *selection = Selection::IncludeByIndex(indices); // first constraint: adopt it as-is
+        }
+        Selection::IncludeByIndex(existing) => {
+            *selection = Selection::IncludeByIndex(Buffer::from_iter(intersect_sorted(
+                existing.as_slice(),
+                indices.as_slice(),
+            )));
+        }
+        Selection::ExcludeByIndex(_)
+        | Selection::IncludeRoaring(_)
+        | Selection::ExcludeRoaring(_) => {} // NOTE(review): `indices` is not applied here; `extract_row_idx_filter` only passes All/IncludeByIndex
+    }
+}
+
+/// Intersects two ascending slices with a two-cursor merge; the result stays ascending.
+fn intersect_sorted(left: &[u64], right: &[u64]) -> Vec<u64> {
+    let (mut l, mut r, mut common) = (0, 0, Vec::new());
+    while let (Some(&a), Some(&b)) = (left.get(l), right.get(r)) {
+        match a.cmp(&b) {
+            std::cmp::Ordering::Less => l += 1,
+            std::cmp::Ordering::Greater => r += 1,
+            std::cmp::Ordering::Equal => {
+                common.push(a);
+                l += 1;
+                r += 1;
+            }
+        }
+    }
+    common
+}
+
+/// Narrows `row_range` to its overlap with `next`; an unset range simply becomes `next`.
+/// The stored range may end up with `start >= end`, i.e. empty.
+fn intersect_row_range(row_range: &mut Option<Range<u64>>, next: Range<u64>) {
+    let (lo, hi) = row_range.take().map_or((0, u64::MAX), |cur| (cur.start, cur.end));
+    *row_range = Some(lo.max(next.start)..hi.min(next.end));
+}
+
+fn normalize_selection_and_range(selection: &mut Selection, row_range: &mut Option<Range<u64>>) { // merges `row_range` into an include-by-index selection when it can
+    if row_range.as_ref().is_some_and(|range| range.is_empty()) { // empty range: no row can match
+        *selection = Selection::IncludeByIndex(empty_indices());
+        *row_range = None;
+        return;
+    }
+
+    if !matches!(selection, Selection::IncludeByIndex(_)) { // other selection kinds leave `row_range` untouched
+        return;
+    }
+
+    let Some(range) = row_range.take() else { // from here on `row_range` is None
+        return;
+    };
+
+    let Selection::IncludeByIndex(indices) = selection else {
+        unreachable!("row range only removed for include-by-index selection");
+    };
+    *indices = Buffer::from_iter(indices.iter().copied().filter(|idx| range.contains(idx))); // keep only indices inside the range
+}
+
 #[derive(Clone)]
 enum Partitioning {
     // An expression that only references the row index (e.g., `row_idx == 5`).
diff --git a/vortex-layout/src/scan/scan_builder.rs b/vortex-layout/src/scan/scan_builder.rs
index bdf5d0bfb11..ebbba89b7ed 100644
--- a/vortex-layout/src/scan/scan_builder.rs
+++ b/vortex-layout/src/scan/scan_builder.rs
@@ -43,6 +43,7 @@ use vortex_utils::parallelism::get_available_parallelism;
 use crate::LayoutReader;
 use crate::LayoutReaderRef;
 use crate::layouts::row_idx::RowIdxLayoutReader;
+use crate::layouts::row_idx::extract_row_idx_filter;
 use crate::scan::repeated_scan::RepeatedScan;
 use crate::scan::split_by::SplitBy;
 use crate::scan::splits::Splits;
@@ -261,30 +262,39 @@ impl<A: 'static + Send> ScanBuilder<A> {
         // Normalize and simplify the expressions.
         let projection = self.projection.optimize_recursive(layout_reader.dtype())?;
 
-        let filter = self
+        let mut filter = self
             .filter
             .map(|f| f.optimize_recursive(layout_reader.dtype()))
             .transpose()?;
+        let mut row_range = self.row_range.clone();
+        let mut selection = self.selection.clone();
+
+        if let (Selection::All, Some(filter_expr)) = (&selection, filter.as_ref()) {
+            let extracted =
+                extract_row_idx_filter(filter_expr, self.row_offset, layout_reader.row_count());
+            filter = extracted.filter;
+            selection = extracted.selection;
+            row_range = intersect_optional_ranges(row_range, extracted.row_range);
+            normalize_selection_and_range(&mut selection, &mut row_range);
+        }
 
         // Construct field masks and compute the row splits of the scan.
         let (filter_mask, projection_mask) =
             filter_and_projection_masks(&projection, filter.as_ref(), layout_reader.dtype())?;
         let field_mask: Vec<_> = [filter_mask, projection_mask].concat();
 
-        let splits =
-            if let Some(ranges) = attempt_split_ranges(&self.selection, self.row_range.as_ref()) {
-                Splits::Ranges(ranges)
-            } else {
-                let split_range = self
-                    .row_range
-                    .clone()
-                    .unwrap_or_else(|| 0..layout_reader.row_count());
-                Splits::Natural(self.split_by.splits(
-                    layout_reader.as_ref(),
-                    &split_range,
-                    &field_mask,
-                )?)
-            };
+        let splits = if let Some(ranges) = attempt_split_ranges(&selection, row_range.as_ref()) {
+            Splits::Ranges(ranges)
+        } else {
+            let split_range = row_range
+                .clone()
+                .unwrap_or_else(|| 0..layout_reader.row_count());
+            Splits::Natural(self.split_by.splits(
+                layout_reader.as_ref(),
+                &split_range,
+                &field_mask,
+            )?)
+        };
 
         Ok(RepeatedScan::new(
             self.session.clone(),
@@ -292,8 +302,8 @@ impl<A: 'static + Send> ScanBuilder<A> {
             projection,
             filter,
             self.ordered,
-            self.row_range,
-            self.selection,
+            row_range,
+            selection,
             splits,
             self.concurrency,
             self.map_fn,
@@ -329,6 +339,38 @@ impl<A: 'static + Send> ScanBuilder<A> {
     }
 }
 
+/// Intersects two optional row ranges, treating `None` as "no restriction".
+/// When both are set the result may have `start >= end`, i.e. be empty.
+fn intersect_optional_ranges(
+    left: Option<Range<u64>>,
+    right: Option<Range<u64>>,
+) -> Option<Range<u64>> {
+    let Some(a) = left else { return right };
+    let Some(b) = right else { return Some(a) };
+    Some(a.start.max(b.start)..a.end.min(b.end))
+}
+
+fn normalize_selection_and_range(selection: &mut Selection, row_range: &mut Option<Range<u64>>) { // NOTE(review): same logic as the helper of this name in layouts/row_idx/mod.rs
+    if row_range.as_ref().is_some_and(|range| range.is_empty()) { // empty range: no row can match
+        *selection = Selection::IncludeByIndex(Buffer::from_iter(std::iter::empty::<u64>()));
+        *row_range = None;
+        return;
+    }
+
+    if !matches!(selection, Selection::IncludeByIndex(_)) { // other selection kinds leave `row_range` untouched
+        return;
+    }
+
+    let Some(range) = row_range.take() else { // from here on `row_range` is None
+        return;
+    };
+
+    let Selection::IncludeByIndex(indices) = selection else {
+        unreachable!("checked selection kind before taking row range");
+    };
+    *indices = Buffer::from_iter(indices.iter().copied().filter(|idx| range.contains(idx))); // keep only indices inside the range
+}
+
 enum LazyScanState<A: 'static + Send> {
     Builder(Option<Box<ScanBuilder<A>>>),
     Preparing(PreparingScan<A>),
@@ -473,6 +515,8 @@ mod test {
     use vortex_array::dtype::Nullability;
     use vortex_array::dtype::PType;
     use vortex_array::expr::Expression;
+    use vortex_array::expr::eq;
+    use vortex_array::expr::lit;
     use vortex_error::VortexResult;
     use vortex_error::vortex_err;
     use vortex_io::runtime::BlockingRuntime;
@@ -482,6 +526,7 @@ mod test {
     use super::ScanBuilder;
     use crate::ArrayFuture;
     use crate::LayoutReader;
+    use crate::layouts::row_idx::row_idx;
 
     #[derive(Debug)]
     struct CountingLayoutReader {
@@ -683,6 +728,63 @@ mod test {
         Ok(())
     }
 
+    #[test]
+    fn row_idx_equality_filter_uses_exact_selection_before_splitting() -> VortexResult<()> { // `row_idx == k` should bypass natural split planning
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let calls = Arc::new(AtomicUsize::new(0)); // shared with the reader; asserted to stay 0 below
+        let reader = Arc::new(SplittingLayoutReader::new(Arc::clone(&calls)));
+
+        let runtime = SingleThreadRuntime::default();
+        let session = crate::scan::test::session_with_handle(runtime.handle());
+
+        let stream = ScanBuilder::new(session, reader)
+            .with_filter(eq(row_idx(), lit(2u64))) // the whole filter is a row_idx predicate
+            .into_stream()?;
+        let mut iter = runtime.block_on_stream(stream);
+
+        let mut values = Vec::new();
+        for chunk in &mut iter {
+            let prim = chunk?.execute::<PrimitiveArray>(&mut ctx)?;
+            values.extend(prim.into_buffer::<i32>());
+        }
+
+        assert_eq!(
+            calls.load(Ordering::Relaxed),
+            0,
+            "row_idx equality should be converted to an exact row selection before natural split planning"
+        );
+        assert_eq!(values.as_ref(), [2]); // exactly one row comes back
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_idx_equality_filter_accounts_for_row_offset() -> VortexResult<()> { // the literal is compared against offset-adjusted row indices
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let calls = Arc::new(AtomicUsize::new(0));
+        let reader = Arc::new(SplittingLayoutReader::new(Arc::clone(&calls)));
+
+        let runtime = SingleThreadRuntime::default();
+        let session = crate::scan::test::session_with_handle(runtime.handle());
+
+        let stream = ScanBuilder::new(session, reader)
+            .with_row_offset(10)
+            .with_filter(eq(row_idx(), lit(12u64))) // 12 - 10 = local row 2
+            .into_stream()?;
+        let mut iter = runtime.block_on_stream(stream);
+
+        let mut values = Vec::new();
+        for chunk in &mut iter {
+            let prim = chunk?.execute::<PrimitiveArray>(&mut ctx)?;
+            values.extend(prim.into_buffer::<i32>());
+        }
+
+        assert_eq!(calls.load(Ordering::Relaxed), 0); // still no natural split planning
+        assert_eq!(values.as_ref(), [2]); // same row as the offset-free test selecting row_idx == 2
+
+        Ok(())
+    }
+
     #[derive(Debug)]
     struct BlockingSplitsLayoutReader {
         name: Arc<str>,
diff --git a/vortex-layout/src/scan/split_by.rs b/vortex-layout/src/scan/split_by.rs
index 6ddc68c2568..84cafa0a22b 100644
--- a/vortex-layout/src/scan/split_by.rs
+++ b/vortex-layout/src/scan/split_by.rs
@@ -7,6 +7,7 @@ use std::ops::Range;
 
 use vortex_array::dtype::FieldMask;
 use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
 
 use crate::LayoutReader;
 
@@ -24,6 +25,11 @@ pub enum SplitBy {
 }
 
 impl SplitBy {
+    /// Returns [`SplitBy::Layout`], which splits at every chunk boundary in the file.
+    pub const fn layout() -> Self {
+        Self::Layout
+    }
+
     /// Compute the splits for the given layout.
     // TODO(ngates): remove this once layout readers are stream based.
     pub fn splits(
@@ -42,11 +48,16 @@ impl SplitBy {
                 layout_reader.register_splits(field_mask, row_range, &mut row_splits)?;
                 row_splits
             }
-            SplitBy::RowCount(n) => row_range
-                .clone()
-                .step_by(n)
-                .chain(once(row_range.end))
-                .collect(),
+            SplitBy::RowCount(n) => {
+                if n == 0 { // `step_by` panics on a step of 0; bail first
+                    vortex_bail!("row count split size must be greater than zero");
+                }
+                row_range
+                    .clone()
+                    .step_by(n)
+                    .chain(once(row_range.end))
+                    .collect()
+            }
         })
     }
 }
@@ -124,4 +135,19 @@ mod test {
             .unwrap();
         assert_eq!(splits, [0, 3, 6, 9, 10].into_iter().collect());
     }
+
+    #[test]
+    fn test_invalid_row_count_split() {
+        let reader = reader();
+        // A split size of 0 must make `splits` return an `Err`.
+        assert!(
+            SplitBy::RowCount(0)
+                .splits(
+                    reader.as_ref(),
+                    &(0..10),
+                    &[FieldMask::Exact(FieldPath::root())],
+                )
+                .is_err()
+        );
+    }
 }
diff --git a/vortex-python/src/dataset.rs b/vortex-python/src/dataset.rs
index bb1a99b8b48..d97a5b46eb8 100644
--- a/vortex-python/src/dataset.rs
+++ b/vortex-python/src/dataset.rs
@@ -177,7 +177,11 @@ impl PyVortexDataset {
             .scan()?
             .with_projection(projection_from_python(columns)?)
             .with_some_filter(filter_from_python(row_filter))
-            .with_split_by(split_by.map(SplitBy::RowCount).unwrap_or(SplitBy::Layout));
+            .with_split_by(
+                split_by
+                    .map(SplitBy::RowCount)
+                    .unwrap_or_else(SplitBy::layout),
+            );
         if let Some((l, r)) = row_range {
             scan = scan.with_row_range(l..r);
         }
@@ -213,7 +217,11 @@ impl PyVortexDataset {
             .scan()?
             .with_projection(select(FieldNames::empty(), root()))
             .with_some_filter(filter_from_python(row_filter))
-            .with_split_by(split_by.map(SplitBy::RowCount).unwrap_or(SplitBy::Layout));
+            .with_split_by(
+                split_by
+                    .map(SplitBy::RowCount)
+                    .unwrap_or_else(SplitBy::layout),
+            );
         if let Some((l, r)) = row_range {
             scan = scan.with_row_range(l..r);
         }