diff --git a/.agents/skills/bench-performance/SKILL.md b/.agents/skills/bench-performance/SKILL.md index 821f0e022ee..99ef33fa5e9 100644 --- a/.agents/skills/bench-performance/SKILL.md +++ b/.agents/skills/bench-performance/SKILL.md @@ -45,6 +45,14 @@ Do not wait for a deep code read before showing benchmark comparisons or first s - engine/format target(s), for example `datafusion:vortex` versus `datafusion:parquet`; - runtime environment toggles, if the branch exposes any. + If the checkout is an agent worktree, keep benchmark data in the canonical checkout cache rather + than downloading or generating it inside the worktree. Prefer a `file://` data URL that points at + `/Users/ngates/git/vortex/vortex-bench/data/...` (or the user's main checkout equivalent), for + example `--opt remote-data-dir=file:///Users/ngates/git/vortex/vortex-bench/data/clickbench_partitioned/` + when the benchmark supports `remote-data-dir`. For local-only suites such as `statpopgen`, run + from the main checkout or arrange the suite's `vortex-bench/data/<suite>/...` path to reuse that + canonical cache before generating data. + 3. Run a small comparable benchmark through `vx-bench`: ```bash diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 1fa6ceb7258..a141d2f11e2 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -10,6 +10,11 @@ on: required: false type: string default: i7i.metal-24xl + vortex_scan_impl: + required: false + type: string + default: "" + description: "Optional VORTEX_SCAN_IMPL override for Vortex file scans, e.g. 
v1 for legacy scans" benchmark_matrix: required: false type: string @@ -511,6 +516,7 @@ jobs: bench: timeout-minutes: 120 env: + VORTEX_SCAN_IMPL: ${{ inputs.vortex_scan_impl }} VORTEX_EXPERIMENTAL_PATCHED_ARRAY: "1" FLAT_LAYOUT_INLINE_ARRAY_NODE: "1" # Makes python output nicer diff --git a/.gitignore b/.gitignore index bcc8ef746ee..bb0c2d4b5bd 100644 --- a/.gitignore +++ b/.gitignore @@ -245,3 +245,6 @@ vortex-python/.benchmarks/ # For local benchmarks website server and things like the WAL **.duckdb* .bench-env + +.agents/worktrees/ +.claude/worktrees/ diff --git a/AGENTS.md b/AGENTS.md index e5c3d0cc13b..3453e7da0dd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -113,6 +113,19 @@ cargo +nightly fmt --all cargo clippy --all-targets --all-features ``` +Before pushing Rust changes, compile the relevant test targets, not only library targets. At +minimum, run `cargo test -p <crate> --all-features --no-run` for every touched Rust crate that +has tests. For cross-crate scan, layout, file, Arrow export, or execution-context changes, include +the crates that can compile hidden or feature-gated tests, for example: + +```bash +cargo test -p vortex-array --all-features --no-run +cargo check -p vortex-layout -p vortex-file -p vortex-duckdb -p vortex-datafusion --all-features +``` + +Do not push after merge conflict resolution until the post-merge test-target build succeeds for the +affected crates. 
+ Notes: - For `.github/` changes, follow `.github/AGENTS.md` and run diff --git a/Cargo.lock b/Cargo.lock index 66457601e06..535af96b257 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9902,6 +9902,7 @@ dependencies = [ "reqwest 0.13.4", "rstest", "static_assertions", + "temp-env", "tempfile", "tracing", "tracing-subscriber", @@ -10391,13 +10392,17 @@ version = "0.1.0" dependencies = [ "async-trait", "futures", + "parking_lot", "roaring", + "rustc-hash", "tracing", "vortex-array", "vortex-buffer", "vortex-error", + "vortex-io", "vortex-mask", "vortex-session", + "vortex-utils", ] [[package]] diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index fdda58188e7..5c38b9f2b1d 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -24,6 +24,10 @@ use object_store::aws::AmazonS3Builder; use object_store::gcp::GoogleCloudStorageBuilder; use object_store::local::LocalFileSystem; use url::Url; +use vortex::scan::ScanScheduler; +use vortex::scan::ScanSchedulerConfig; +use vortex::scan::ScanSchedulerSessionExt; +use vortex::session::VortexSession; use vortex_bench::Format; use vortex_bench::SESSION; use vortex_datafusion::VortexFormat; @@ -45,7 +49,11 @@ pub fn get_session_context() -> SessionContext { .build_arc() .expect("could not build runtime environment"); - let factory = VortexFormatFactory::new().with_options(vortex_table_options()); + let factory = VortexFormatFactory::new() + .with_session( + vortex_session_from_env().expect("invalid Vortex benchmark scan scheduler env"), + ) + .with_options(vortex_table_options()); let mut session_state_builder = SessionStateBuilder::new() .with_config(SessionConfig::from_env().expect("shouldn't fail")) @@ -106,19 +114,51 @@ pub fn make_object_store( } } -pub fn format_to_df_format(format: Format) -> Arc { - match format { +pub fn format_to_df_format(format: Format) -> anyhow::Result> { + Ok(match format { Format::Csv => 
Arc::new(CsvFormat::default()) as _, Format::Arrow => Arc::new(ArrowFormat), Format::Parquet => Arc::new(ParquetFormat::new()), Format::OnDiskVortex | Format::VortexCompact => Arc::new(VortexFormat::new_with_options( - SESSION.clone(), + vortex_session_from_env()?, vortex_table_options(), )), Format::OnDiskDuckDB | Format::Lance => { - unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`") + anyhow::bail!("Format {format} cannot be turned into a DataFusion `FileFormat`") } - } + }) +} + +fn vortex_session_from_env() -> anyhow::Result { + let session = SESSION.clone(); + let Ok(mode) = std::env::var("VORTEX_SCAN_SCHEDULER") else { + return Ok(session); + }; + let config = scan_scheduler_config_from_env()?; + Ok(match mode.as_str() { + "unbounded" => session.with_unbounded_scan_scheduler(), + "shared" | "global" => session.with_scan_scheduler(Arc::new(ScanScheduler::new(config))), + "per-query" | "per-scan" => session.with_new_scan_scheduler_per_scan(config), + other => anyhow::bail!( + "Invalid VORTEX_SCAN_SCHEDULER={other}; expected unbounded, shared, or per-query" + ), + }) +} + +fn scan_scheduler_config_from_env() -> anyhow::Result { + let read_byte_budget = std::env::var("VORTEX_SCAN_MAX_READ_BYTES") + .ok() + .map(|value| { + value.parse::().map_err(|e| { + anyhow::anyhow!("invalid scan scheduler read byte budget {value}: {e}") + }) + }) + .transpose()?; + + Ok(match read_byte_budget { + Some(bytes) => ScanSchedulerConfig::default().with_read_byte_budget(Some(bytes)), + None => ScanSchedulerConfig::default(), + }) } fn vortex_table_options() -> VortexTableOptions { diff --git a/benchmarks/datafusion-bench/src/main.rs b/benchmarks/datafusion-bench/src/main.rs index b8f9ac42df6..043fe26e494 100644 --- a/benchmarks/datafusion-bench/src/main.rs +++ b/benchmarks/datafusion-bench/src/main.rs @@ -27,8 +27,6 @@ use datafusion_physical_plan::collect; use futures::StreamExt; use parking_lot::Mutex; use tokio::fs::File; -use 
vortex::io::filesystem::FileSystemRef; -use vortex::scan::DataSourceRef; use vortex_bench::Benchmark; use vortex_bench::BenchmarkArg; use vortex_bench::CompactionStrategy; @@ -36,7 +34,6 @@ use vortex_bench::Engine; use vortex_bench::Format; use vortex_bench::Opt; use vortex_bench::Opts; -use vortex_bench::SESSION; use vortex_bench::conversions::convert_parquet_directory_to_vortex; use vortex_bench::create_benchmark; use vortex_bench::create_output_writer; @@ -190,7 +187,7 @@ async fn main() -> anyhow::Result<()> { async move { let session = datafusion_bench::get_session_context(); datafusion_bench::make_object_store(&session, benchmark.data_url())?; - register_benchmark_tables(&session, benchmark, format).await?; + register_benchmark_tables(&session, benchmark, format, show_metrics).await?; Ok((session, format)) } }, @@ -246,99 +243,42 @@ async fn main() -> anyhow::Result<()> { Ok(()) } -fn use_scan_api() -> bool { - std::env::var("VORTEX_USE_SCAN_API").is_ok_and(|v| v == "1") -} - async fn register_benchmark_tables( session: &SessionContext, benchmark: &B, format: Format, + _show_metrics: bool, ) -> anyhow::Result<()> { - match format { - Format::Arrow => register_arrow_tables(session, benchmark).await, - _ if use_scan_api() && matches!(format, Format::OnDiskVortex | Format::VortexCompact) => { - register_v2_tables(session, benchmark, format).await - } - _ => { - let benchmark_base = benchmark.data_url().join(&format!("{}/", format.name()))?; - let file_format = format_to_df_format(format); - - for table in benchmark.table_specs().iter() { - let pattern = benchmark.pattern(table.name, format); - let table_url = ListingTableUrl::try_new(benchmark_base.clone(), pattern)?; - - let listing_options = ListingOptions::new(Arc::clone(&file_format)) - .with_session_config_options(session.state().config()); - let mut config = - ListingTableConfig::new(table_url).with_listing_options(listing_options); - - config = match table.schema.as_ref() { - Some(schema) => 
config.with_schema(Arc::new(schema.clone())), - None => config.infer_schema(&session.state()).await?, - }; - - let listing_table = Arc::new( - ListingTable::try_new(config)?.with_cache( - session - .runtime_env() - .cache_manager - .get_file_statistic_cache(), - ), - ); - - session.register_table(table.name, listing_table)?; - } - - Ok(()) - } + if matches!(format, Format::Arrow) { + return register_arrow_tables(session, benchmark).await; } -} - -/// Register tables using the V2 `VortexTable` + `MultiFileDataSource` path. -async fn register_v2_tables( - session: &SessionContext, - benchmark: &B, - format: Format, -) -> anyhow::Result<()> { - use vortex::file::multi::MultiFileDataSource; - use vortex::io::object_store::ObjectStoreFileSystem; - use vortex::io::session::RuntimeSessionExt; - use vortex::scan::DataSource as _; - use vortex_datafusion::v2::VortexTable; let benchmark_base = benchmark.data_url().join(&format!("{}/", format.name()))?; + let file_format = format_to_df_format(format)?; for table in benchmark.table_specs().iter() { let pattern = benchmark.pattern(table.name, format); - let table_url = ListingTableUrl::try_new(benchmark_base.clone(), pattern.clone())?; - let store = session - .state() - .runtime_env() - .object_store(table_url.object_store())?; - - let fs: FileSystemRef = Arc::new(ObjectStoreFileSystem::new( - Arc::clone(&store), - SESSION.handle(), - )); - let base_prefix = benchmark_base.path().trim_start_matches('/').to_string(); - let fs = fs.with_prefix(base_prefix); - - let glob_pattern = match &pattern { - Some(p) => p.as_str().to_string(), - None => format!("*.{}", format.ext()), - }; + let table_url = ListingTableUrl::try_new(benchmark_base.clone(), pattern)?; - let multi_ds = MultiFileDataSource::new(SESSION.clone()) - .with_glob(glob_pattern, Some(fs)) - .build() - .await?; + let listing_options = ListingOptions::new(Arc::clone(&file_format)) + .with_session_config_options(session.state().config()); + let mut config = 
ListingTableConfig::new(table_url).with_listing_options(listing_options); - let arrow_schema = Arc::new(multi_ds.dtype().to_arrow_schema()?); - let data_source: DataSourceRef = Arc::new(multi_ds); + config = match table.schema.as_ref() { + Some(schema) => config.with_schema(Arc::new(schema.clone())), + None => config.infer_schema(&session.state()).await?, + }; + + let listing_table = Arc::new( + ListingTable::try_new(config)?.with_cache( + session + .runtime_env() + .cache_manager + .get_file_statistic_cache(), + ), + ); - let table_provider = Arc::new(VortexTable::new(data_source, SESSION.clone(), arrow_schema)); - session.register_table(table.name, table_provider)?; + session.register_table(table.name, listing_table)?; } Ok(()) @@ -439,7 +379,21 @@ pub async fn execute_query( /// Print Vortex metrics from execution plans. fn print_metrics(plans: &[(usize, Format, Arc)]) { + // VORTEX_BENCH_FULL_PLAN=1 dumps the full per-operator annotated plan (DataFusion + // EXPLAIN ANALYZE-style: elapsed_compute / output_rows per operator), to localize where + // wall time goes (scan vs HashJoin build/probe vs aggregate). + let full_plan = std::env::var_os("VORTEX_BENCH_FULL_PLAN").is_some(); for (query_idx, format, plan) in plans { + if full_plan { + eprintln!("=== annotated plan query={query_idx}, {format} ==="); + eprintln!( + "{}", + datafusion_physical_plan::display::DisplayableExecutionPlan::with_metrics( + plan.as_ref() + ) + .indent(true) + ); + } let metric_sets = VortexMetricsFinder::find_all(plan.as_ref()); if metric_sets.is_empty() { continue; diff --git a/docs/concepts/file-format.md b/docs/concepts/file-format.md index 36a6fe0a935..f97276840fc 100644 --- a/docs/concepts/file-format.md +++ b/docs/concepts/file-format.md @@ -5,7 +5,7 @@ The writer accepts a stream of Vortex arrays, applies a layout strategy to organ and serializes the layout and its segments into a single file. 
The bulk of the file format specification describes the representation of the footer bytes such that the -layout tree can be reconstructed for scans. +layout tree can be reconstructed and expanded into scan plans. See the [Vortex File Format Specification](../specs/file-format.md) for full details. diff --git a/docs/concepts/layouts.md b/docs/concepts/layouts.md index f6503b3330e..9399a46670c 100644 --- a/docs/concepts/layouts.md +++ b/docs/concepts/layouts.md @@ -1,48 +1,79 @@ # Layouts -Layouts are the out-of-memory equivalent of [Vortex arrays](/concepts/arrays). They are similarly hierarchical, -with an associated vtable, metadata, dtype, children, and lazy buffers known as "segments". +Layouts are the out-of-memory equivalent of [Vortex arrays](/concepts/arrays). A layout describes +how a logical array is organized across children and file segments so that scans can read only the +data they need. -The tree-structure of a layout can be serialized and persisted. During deserialization, the layout is bound to a -segment source that can lazily fetch the data buffers as needed. This abstraction allows Vortex to implement highly -efficient columnar scans over any block storage including local disk, object stores, remote caches like Redis, -Postgres block storage, and more. +The serialized layout tree is stored in a file footer. During deserialization, Vortex resolves each +layout encoding ID through the session's layout registry and constructs a typed `Layout<V>`: -In fact, the [Vortex file format](/concepts/file-format) is just a serialized layout tree with the data segments -stored in the same file. +- common fields are hoisted into `Layout<V>`: dtype, row count, child access, and segment IDs; +- layout-specific metadata lives in `V::LayoutData`; +- the erased `LayoutRef` lets heterogeneous layout nodes live in one tree; and +- child layouts are materialized lazily from the footer FlatBuffer when a scan route asks for them. + +A layout does not execute a scan directly. 
Its vtable expands the typed layout into a +[`ScanPlan`](scanning.md), and the scan runtime prepares evidence, predicate, projection, +statistics, and aggregate work from that node tree. ## Built-in Layouts -As with arrays, Vortex provides a number of built-in layouts, and users can define their own custom layouts. +As with arrays, Vortex provides a number of built-in layouts, and users can define their own custom +layouts. + +| Name | Description | +|--------------------|--------------------------------------------------------------------------------------------------------| +| `FlatLayout` | Stores one serialized Vortex array in one segment. | +| `StructLayout` | Stores named child layouts corresponding to fields of a struct dtype. | +| `ChunkedLayout` | Stores row-wise partitioned child layouts and exposes chunk boundaries as natural scan splits. | +| `DictionaryLayout` | Stores dictionary values in one child and row-domain codes in another child. | +| `ZonedLayout` | Stores a data child plus zone statistics that can produce predicate evidence before reading row data. | + +## Layout Children + +Child relationships are part of the layout contract. A child can be: + +- a field child, such as one column of a struct; +- a chunk child, covering a row range of the parent; +- a transparent child, such as the data child of a zoned wrapper; or +- an auxiliary child, such as a validity bitmap, dictionary values, or zone statistics. + +The parent vtable defines each child's expected dtype and relationship. This lets Vortex validate +lazy child access without deserializing the entire tree up front. + +## Layouts and Segments + +Layouts refer to data buffers by `SegmentId`. A segment source, such as a Vortex file or an +in-memory buffer, maps those logical segment IDs to bytes. This indirection keeps the layout tree +independent of where the bytes live: local disk, object storage, an embedded buffer, or a remote +cache can all back the same logical layout structure. 
-| Name | Description | -|--------------------|---------------------------------------------------------------------------------------------------------| -| `FlatLayout` | A layout that holds a single serialized Vortex array. | -| `StructLayout` | A layout that holds a collection of named child layouts, corresponding to an associated `StructDType`. | -| `ChunkedLayout ` | A layout that holds a collection of row-wise partitioned child layouts. | -| `DictionaryLayout` | A layout that shares a single dictionary of values with a child layout holding indices. | -| `ZonedLayout` | A layout that stores a zone-map of statistics to perform filter pruning. | +The scan path asks prepared reads and prepared evidence handles for segment requests when the +requests are known exactly. The segment source handles caching, coalescing, and in-flight +deduplication. +## Example: Parquet Row Groups -### Example: Parquet Row Groups +Layouts can be composed together in arbitrary hierarchical structures. This allows writers to model +the performance characteristics of other file formats or storage systems. -Layouts can be composed together in arbitrary hierarchical structures. This allows users of Vortex to configure -writers that model the performance characteristics of other file formats or storage systems. +As an example, a Parquet-like layout could use: -As an example, suppose we want to replicate the behavior of Parquet row groups in Vortex. We would define a layout that -looked something like: +- `ChunkedLayout(ChunkBy::RowCount(100_000))` at the top level for row groups. +- `StructLayout` inside each row group to split data by column. +- `ChunkedLayout(ChunkBy::CompressedSize(64k))` inside each column for page-like pieces. +- `FlatLayout` leaves that store serialized array chunks. -* `ChunkedLayout(ChunkBy::RowCount(100_000))` - at the top-level, we define row-groups of at most 100k rows. 
- * `StructLayout` - Parquet then splits the row group into individual columns known as column chunks. - * `ChunkedLayout(ChunkBy::CompressedSize(64k))` - finally, each column chunk is split into pages by compressed - size. +The scan runtime would still see one `ScanPlan` tree. Column projections route through the struct +node, row-range work routes through chunked nodes, and leaf reads touch only the flat segments needed +for the current morsel. ## Layout Strategies -A `LayoutStrategy` defines how to construct a layout tree from a stream of Vortex arrays. These strategies can -partition arrays by column, by row-groups, or by any other arbitrary scheme. Some strategies compute pruning stats, -others apply compression to the data. +A `LayoutStrategy` defines how to construct a layout tree from a stream of Vortex arrays. Strategies +can partition arrays by column, by row range, by size, or by any other scheme. Some strategies +compute pruning statistics, and others choose compression or buffering policies for leaf data. -For segment sinks that are locality-aware, such as a Vortex file, layout strategies can make use of sequence IDs. -These are powerful logical clocks that allow layouts to parallelize writes and compression tasks while maintaining -full control and determinism over where segments are written into the file. +For segment sinks that are locality-aware, such as a Vortex file, layout strategies can use sequence +IDs. These logical clocks let layouts parallelize writes and compression tasks while retaining +deterministic control over where segments are written. diff --git a/docs/concepts/scanning.md b/docs/concepts/scanning.md index 393b7d2d83c..890bc315157 100644 --- a/docs/concepts/scanning.md +++ b/docs/concepts/scanning.md @@ -1,93 +1,131 @@ -# Scan API +# Scanning + +Vortex scans are built around the layout tree stored in a file footer. 
A scan opens the file, +deserializes the root layout, expands that layout into a `ScanPlan` tree, and prepares executable +runtime handles for predicates, projections, statistics, and aggregates. + +The query engine sees a standard scan request: a projection, an optional filter, ordering +requirements, limits, and split preferences. The layout and scan layers decide how to satisfy that +request with the least data movement. + +```text +footer layout bytes + | + v +LayoutRef / Layout + | + v +ScanPlan tree + | + +-- push expressions into layout-local row domains + +-- prepare predicate evidence + +-- prepare residual predicate reads + +-- prepare projection reads + +-- prepare statistics and aggregate answers + | + v +morsel execution -> array batches +``` -:::{note} -The Scan API is on the roadmap and under active development. The core `Source` trait and scan pipeline -are functional, but the full API surface is not yet fully defined or implemented. -::: +## Layout Expansion -The Vortex Scan API defines a standard interface between data storage and query engines. It solves the -N x M problem of having N different storage backends and M different query engines by providing a common -interface that both sides can implement against. +Each layout encoding has a layout vtable. The serialized form stores common fields such as dtype, +row count, child layouts, and segment IDs. Deserialization hoists those common fields into +`Layout` and leaves only layout-specific metadata in `V::LayoutData`. -``` - Storage Query Engines - ─────── ───────────── +The layout vtable's scan hook expands a `Layout` into a `ScanPlan`. This keeps serialized layout +concerns separate from runtime execution: layouts describe the physical organization of data, +whereas `ScanPlan`s expose what that organization can do during a scan. 
- Vortex Files ──► ┌──────────────┐ ──► DuckDB - Parquet Files ──► │ Scan API │ ──► DataFusion - Iceberg Tables ──► └──────────────┘ ──► Spark -``` +Layout children are lazy. Accessing a child validates the dtype expected by the parent and +materializes that child from the same footer FlatBuffer only when a scan route actually needs it. +For example, a struct layout does not need to deserialize every column child when the query reads +only a few fields. + +## Scan Plans -Storage backends implement the `Source` trait for reads. Query engines issue a scan request -describing the filter and projection to push down, and the source returns a stream of -independently-executable splits that can be run concurrently to produce result arrays. An -equivalent `Sink` trait exists for the write path, accepting an array stream and writing it to -the underlying storage. +A `ScanPlan` is an immutable runtime view of a layout. It can: -## Motivation +- push an expression into the plan's row domain; +- prepare value reads for the plan's root value; +- prepare predicate evidence; +- provide natural split hints; +- answer statistics or partial aggregates from metadata; and +- release cached state behind a completed row frontier. -Traditional data integrations require each storage backend and query engine to agree on a common -interchange format, typically Apache Arrow. This means the storage backend must fully decompress its -data into Arrow arrays, even if the query engine could operate on the compressed representation -directly. +Pushing an expression returns another `ScanPlan` whose `root()` value is that expression. A struct +plan can route `field("a")` to the child for column `a`; a dictionary plan can apply some +expressions once over dictionary values and reuse the result with per-row codes; a generic apply +plan handles expressions that cannot be pushed into a specialized layout. 
-The Vortex Scan API avoids this by allowing data to flow between storage and query engines in its -native compressed encoding. For example, the DuckDB integration can receive FSST-encoded string -arrays directly from a Vortex file and pass them into DuckDB's own internal FSST format without -any decompression step. +## Prepared Runtime Handles -## Source +Planning a scan creates prepared handles from the `ScanPlan` tree: -A **Source** represents any scannable tabular data. It accepts a scan request (filter, projection, -limit) and returns a stream of independently-executable splits. An equivalent **Sink** interface -exists for the write path, allowing query engines to both read from and write to any storage -backend through a single pair of interfaces. +- `PreparedEvidence` produces evidence fragments for one predicate expression. +- `PreparedRead` reads one pushed projection or residual predicate expression. +- `PreparedStats` and `PreparedAggregate` answer metadata-backed statistics and aggregates. +- `PreparedSplit` reports row ranges that are natural units of scan work. -### Splits +Prepared handles are scan-level runtime objects. They can hold child prepared handles and shared +state, but they do not choose the next row range themselves. The scan driver chooses explicit +morsel ranges and asks prepared handles to work on those ranges. -A source produces splits, each representing an independent unit of work that can be executed in -parallel. A split typically corresponds to a range of rows in a layout, such as a chunk or a set -of row-group partitions. +Each morsel carries a `RowScope`: -Each split carries size and row count estimates that query engines use for scheduling decisions. -Splits can also be serialized for distributed execution across remote workers. +- `selection` says which rows in the requested range remain live. +- `demand` says which selected rows need meaningful values from this operation. 
-### Remote Sources +This lets a projection skip data that no longer affects output, while still preserving output +cardinality for selected rows. -A source may front remote storage rather than local files. In this case, the split's execution -issues a remote call and receives the result over the network. The -[Vortex IPC format](../specs/ipc-format.md) can be used as the wire protocol for these calls, allowing -compressed arrays to be transferred without decompression. This gives remote sources the same -zero-decompression benefits as local scans -- the data stays in its compressed encoding end-to-end, -from remote storage through the network and into the query engine. +## Predicate Evidence -## Filter Pushdown +Predicates are decomposed into independent expressions. Before reading row data for a predicate, +the scan asks available prepared evidence handles whether metadata can prove something about the +requested rows. -Filter expressions are decomposed into individual conjuncts (AND-separated terms) and evaluated -independently. The scan tracks the selectivity of each conjunct using a probabilistic sketch -and dynamically reorders them so that the most selective predicates are evaluated first. This -means that as a scan progresses, it learns the most efficient evaluation order for the filter. +Evidence is a statement over the row domain. A zone map can prove that a range cannot match a +predicate; file or layout statistics can prove that a predicate is already satisfied; other +evidence sources can leave a range unknown. Unknown rows continue to residual predicate reads, +which materialize only the columns needed to compute the predicate exactly. -Filters are evaluated in two stages. First, pruning evaluation uses statistics stored in a -`ZonedLayout` auxiliary `zones` child to eliminate entire row zones without reading the underlying -data child. 
These pruning predicates are falsification checks derived from the original filter, for -example by comparing a zone's min/max values against the requested predicate. Second, filter -evaluation materializes only the filter-referenced columns and computes a row mask of matching -rows. +Prepared evidence handles are expected to be cheap relative to projection reads. They should use +layout metadata, statistics, indexes, or already-prepared shared state rather than speculatively +reading large data columns. Cheap evidence can also opt into a final `recheck_before_projection` +pass, which is useful when dynamic filters change while a morsel is in flight. ## Projection Pushdown -Projection expressions describe the output schema of the scan. The scan analyzes the projection -and filter expressions to compute two field masks: which columns are needed for filtering, and -which are needed for the final output. Only the union of these columns is read from storage. +Projection pushdown is expression pushdown through the `ScanPlan` tree. The scan prepares reads only +for the requested output expressions, and each layout decides how much of its child tree those reads +need. + +For a struct layout, field access routes to the named child and avoids unrelated columns. For a +chunked layout, the read is sliced by chunk and only overlapping chunks with demanded rows are +visited. For a dictionary layout, values can be shared across row ranges while codes are read for +the requested rows. + +## State and Caches + +The scan path uses several layers of state: + +- The segment source owns physical I/O, coalescing, segment caching, and in-flight segment + deduplication. +- The expanded `ScanPlan` tree is immutable and safe to share. +- `PrepareCtx` owns a prepared-state cache for scan/file-level state shared by prepared reads, + evidence, aggregate, and stats handles. 
+- A layout plan can create child-local prepared-state caches so repeated pushes into the same child + share decoded dictionaries, zone tables, or other expensive setup without leaking state across + unrelated row domains. +- Morsel tasks own only the row range and masks needed for that operation. -Columns needed exclusively for filtering are discarded after the filter mask is computed, so they -never appear in the output stream. This separation ensures minimal data movement throughout the -pipeline. +When ordered scans advance, prepared reads and scan plans receive a release frontier. Layouts use +that frontier to drop caches that only cover rows that cannot be read again. ## Query Engine Integration -Query engines integrate with the Scan API by translating their internal plan representations into -scan requests and consuming the resulting array stream in their preferred format. Integrations -exist for DuckDB, DataFusion, Spark, and Trino, with each engine converting its native filter -and projection representations into Vortex [expressions](expressions.md). +Query engines translate their native expressions into Vortex expressions and submit a scan request. +Vortex handles layout expansion, evidence, residual predicates, projection reads, and array +production. Integrations then export the produced Vortex arrays to the engine's preferred batch +format, such as Arrow `RecordBatch`es for DataFusion or DuckDB `DataChunk`s for DuckDB. diff --git a/docs/developer-guide/extending/index.md b/docs/developer-guide/extending/index.md index ff004a69bc5..315f9170cfe 100644 --- a/docs/developer-guide/extending/index.md +++ b/docs/developer-guide/extending/index.md @@ -13,8 +13,8 @@ The following topics are planned for this section: and Arrow interoperability. - **Writing an Encoding** -- implementing a custom array encoding with compression and decompression logic. 
-- **Writing a Layout** -- implementing the LayoutReader and LayoutWriter traits for custom - on-disk data organizations. +- **Writing a Layout** -- implementing a layout vtable, lazy child contracts, and ScanPlan + expansion for custom on-disk data organizations. - **Writing a Compute Function** -- the dispatch model, implementing kernels, vtable registration, and testing. diff --git a/docs/developer-guide/extending/writing-a-layout.md b/docs/developer-guide/extending/writing-a-layout.md index 112f7c6a80d..738061a7e8c 100644 --- a/docs/developer-guide/extending/writing-a-layout.md +++ b/docs/developer-guide/extending/writing-a-layout.md @@ -1,13 +1,160 @@ # Writing a Layout -:::{warning} -This page is under construction. -::: +A Vortex layout plugin describes serialized layout metadata and how that layout expands into the +scan runtime. Layout plugins do not implement a separate reader trait. Instead, they implement the +layout vtable, deserialize layout-specific data into `Layout<V>`, and return a `ScanPlan` for +runtime reads. -Planned content: +## Layout Vtable -- What a layout is and when you need a custom one -- Implementing the LayoutReader trait -- Implementing the LayoutWriter trait -- Registering layouts with a session -- Testing layout implementations +The layout vtable lives in `vortex_layout::layout_v2`. Its shape follows the same plugin pattern as +the other Vortex vtables: + +- `Layout<V>` is the typed layout handle. +- `LayoutRef` is the type-erased layout handle. +- `LayoutParts<V>` constructs typed layouts from common fields plus `V::LayoutData`. +- `DynLayout` is private erased dispatch plumbing. +- `LayoutVTablePlugin` is the registry object used for ID-based deserialization. + +Common fields are hoisted out of the plugin-specific data. The vtable receives dtype, row count, +segment IDs, lazy child access, and layout metadata during deserialization, but returns only the +layout-specific `LayoutData`. 
+ +```rust +use vortex_layout::layout_v2; +use vortex_layout::{LayoutChildType, LayoutId}; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; +use vortex_session::VortexSession; + +#[derive(Clone, Debug)] +pub struct MyLayout; + +#[derive(Clone, Debug)] +pub struct MyLayoutData { + // layout-specific metadata +} + +impl layout_v2::VTable for MyLayout { + type LayoutData = MyLayoutData; + + fn id(&self) -> LayoutId { + LayoutId::new("example.my_layout") + } + + fn deserialize( + &self, + args: &layout_v2::LayoutDeserializeArgs<'_>, + ) -> vortex_error::VortexResult { + // Parse args.metadata and validate args.segment_ids / args.children. + Ok(MyLayoutData {}) + } + + fn child_dtype( + layout: layout_v2::Layout, + idx: usize, + ) -> vortex_error::VortexResult { + // Return the dtype expected for child `idx`. + Ok(layout.dtype().clone()) + } + + fn child_type( + _layout: layout_v2::Layout, + idx: usize, + ) -> vortex_error::VortexResult { + Ok(LayoutChildType::Transparent(format!("child-{idx}").into())) + } + + fn new_scan_plan( + layout: layout_v2::Layout, + req: &mut ScanRequest, + session: &VortexSession, + ) -> vortex_error::VortexResult { + // Expand the layout into a runtime ScanPlan. + todo!() + } +} +``` + +## Deserialization + +`LayoutDeserializeArgs` contains the common serialized fields: + +- `dtype`: the logical dtype of this layout; +- `row_count`: the number of rows in this layout's row domain; +- `metadata`: plugin-specific metadata bytes; +- `segment_ids`: logical segments referenced directly by this layout; +- `children`: lazy child access; +- `array_ctx`: the array read context captured from the file footer. + +Use `deserialize` to validate invariants that are local to the layout. For example, a flat layout +requires exactly one segment ID, and a chunked layout verifies that child row counts add up to the +parent row count. + +Do not eagerly deserialize children unless the layout metadata itself requires it. 
Child access is +intentionally lazy so projection and predicate pushdown can avoid unrelated branches of a wide +layout tree. + +## Child Contracts + +`child_dtype` and `child_type` define the contract between a parent layout and its children. The +scan path calls `layout.child(idx)`, which asks the parent for the expected dtype and then +materializes that child from the footer. + +Use `LayoutChildType` to describe how child rows relate to parent rows: + +- `Field(name)` for struct fields; +- `Chunk((idx, offset))` for row-range chunks; +- `Transparent(name)` for wrappers whose data child shares the parent row domain; +- `Auxiliary(name)` for metadata or support children such as validity, dictionary values, or zone + statistics. + +These relationships are used by debugging tools, split planning, and scan expansion. + +## ScanPlan Expansion + +`new_scan_plan` turns a typed layout into an immutable runtime `ScanPlan`. The plan should hold +layout metadata and child plan references, not per-morsel state. Runtime state belongs in prepared +handles or state caches created during preparation. + +A `ScanPlan` implementation can specialize: + +- `try_push_expr` to route expressions into children or rewrite them into a cheaper row domain; +- `prepare_read` to produce a `PreparedRead` for the plan's root value; +- `prepare_evidence` to produce cheap predicate evidence from metadata or indexes; +- `prepare_stats` and `prepare_aggregate_partial` for metadata-backed answers; +- `split_hints` to expose natural morsel boundaries; and +- `release` to drop caches behind the completed-row frontier. + +The layout vtable expands child layouts by calling `child.new_scan_plan(req, session)`. Pass the +same `ScanRequest` through for children in the same row domain, and use a fresh +`ScanRequest::empty()` for children in independent row domains such as dictionary values or zone +statistics. 
This keeps the layout plugin responsible for its local structure while the scan runtime +owns predicate ordering, morsel execution, and output assembly. + +## State Placement + +Keep state at the narrowest level that can safely reuse it: + +- `ScanPlan` stores immutable structure only. +- `PrepareCtx::shared_state` stores scan/file-level prepared state shared across prepared reads, + evidence, statistics, and aggregate handles. +- Layouts with independent child row domains can create child-local prepared-state caches so one + child shares dictionaries, zone tables, or decoded setup without colliding with another child. +- `ReadTask` and `EvidenceTask` own only one morsel's range and masks. +- Segment bytes belong to the segment source and segment cache, not to layout plans. + +This separation lets a scan clone and prepare many pushed expressions while still sharing expensive +setup where the row domain is the same. + +## Registration + +Register layout vtables through the session's layout registry: + +```rust +use vortex_layout::LayoutSessionExt; + +session.layouts().register_v2(MyLayout); +``` + +The session resolves serialized layout IDs through this registry when opening a Vortex file. 
diff --git a/docs/developer-guide/index.md b/docs/developer-guide/index.md index e6eec4bfb70..6c14f35bb93 100644 --- a/docs/developer-guide/index.md +++ b/docs/developer-guide/index.md @@ -22,6 +22,7 @@ caption: Internals internals/architecture internals/session internals/async-runtime +internals/scan-scheduler internals/vtables internals/execution internals/stats-pruning diff --git a/docs/developer-guide/integrations/datafusion.md b/docs/developer-guide/integrations/datafusion.md index 93f45be0a8d..52d421ecb91 100644 --- a/docs/developer-guide/integrations/datafusion.md +++ b/docs/developer-guide/integrations/datafusion.md @@ -21,9 +21,9 @@ discovered file becomes a `PartitionedFile` that DataFusion assigns to execution Vortex implements the `FileOpener` trait to open individual files on demand as DataFusion's executor schedules them. -Layout readers are cached across partitions using a shared concurrent map keyed by file path. -This avoids redundant footer parsing when the same file is accessed by multiple partitions or -repeated queries. +Opened file metadata and scan preparation state are shared where possible across partitions keyed +by file path. This avoids redundant footer parsing and repeated layout expansion when the same file +is accessed by multiple partitions or repeated queries. ## Threading Model @@ -33,24 +33,25 @@ runtime handle. All I/O -- file opens, segment reads, object store fetches -- is Tokio tasks and scheduled across Tokio's multi-threaded executor. DataFusion's physical executor manages parallelism by assigning partitions to its own task pool. -Each partition opens its files and drives a `ScanBuilder` that returns an async stream of -record batches. Multiple partitions execute concurrently, with DataFusion controlling the degree -of parallelism. +Each partition opens its files and drives a Vortex file scan backed by layout expansion and +`ScanPlan` prepared reads. The scan returns an async stream of record batches. 
Multiple partitions +execute concurrently, with DataFusion controlling the degree of parallelism. ## Filter and Projection Pushdown The integration converts DataFusion physical expressions into Vortex expressions using an `ExpressionConvertor` trait. Supported predicates (comparisons, LIKE, IS NULL, IN lists, casts) -are pushed into the Vortex scan where they participate in pruning and filter evaluation at the -layout level. Unsupported predicates remain in the DataFusion plan and are evaluated after the -scan. +are pushed into the Vortex scan where they participate in layout-level evidence, pruning, and +residual filter evaluation. Unsupported predicates remain in the DataFusion plan and are evaluated +after the scan. Filter pushdown operates at two levels. The full predicate is used to prune entire files before they are opened, using file-level statistics. The subset of predicates that Vortex can evaluate efficiently is pushed into the per-file scan for row-level filtering. Projection pushdown maps DataFusion's requested column indices to Vortex field names and passes -them as a projection expression to the scan. Only the requested columns are read from storage. +them as projection expressions to the scan. Struct layouts route those expressions to the requested +field children, so only the requested columns are read from storage. The integration supports pluggable expression conversion via a custom `ExpressionConvertor`, allowing engine-specific rewrites or schema adaptation when file schemas diverge from the table @@ -61,15 +62,9 @@ schema. Vortex arrays produced by the scan are converted to Arrow `RecordBatch`es for consumption by DataFusion. Batches are sliced to respect DataFusion's configured batch size preference. -## Future Work +## Dynamic Filters -The current integration builds directly on the `ScanBuilder` and layout reader APIs. 
Future work -will migrate it to use the [Scan API](/concepts/scanning) `Source` trait, which will simplify -the integration by providing a standard interface for file discovery, partitioning, and pushdown -that is shared across all engine integrations. - -Other planned improvements include projection expression pushdown, which would allow DataFusion -to push complex projection expressions (such as extracting nested struct fields) into the Vortex -scan rather than materializing entire columns and projecting afterwards. Additionally, better -support for dynamic expressions would enable use-cases like top-k queries, where the scan's -filter expression is updated during execution as the query engine discovers tighter bounds. +Dynamic expressions support use-cases like top-k queries, where the query engine discovers tighter +bounds during execution. When a dynamic predicate version changes, cheap prepared evidence handles can recheck +in-flight morsels before projection so the scan avoids reading output rows that are no longer +needed. diff --git a/docs/developer-guide/integrations/duckdb.md b/docs/developer-guide/integrations/duckdb.md index 060f5fca973..77501bebd0a 100644 --- a/docs/developer-guide/integrations/duckdb.md +++ b/docs/developer-guide/integrations/duckdb.md @@ -42,13 +42,15 @@ the [runtime documentation](../internals/async-runtime.md) for more on this trad DuckDB's planner pushes filter predicates into the scan via the `pushdown_complex_filter` callback. These are converted from DuckDB's bound expression representation into Vortex expressions and stored alongside any table filter expressions. During scanning, the combined -filter is applied to the `ScanBuilder` for each file. +filter is pushed into the Vortex file scan for each file. Files can be pruned entirely before opening if their statistics prove that no rows can match -the filter. +the filter. 
For opened files, layout-level evidence can prune row ranges before residual +predicate reads materialize row data. Projection pushdown maps DuckDB's requested column indices to Vortex field names and passes -them as a projection expression to the scan. +them as projection expressions to the scan. Struct layouts route those expressions to field +children, so unrelated columns are not read. ## Data Export @@ -61,9 +63,8 @@ canonical (Arrow-compatible) conversion before export. Results are exported in chunks matching DuckDB's standard vector size to align with its vectorized execution model. -## Future Work +## Scan Runtime -The current integration builds directly on the `ScanBuilder`, layout reader, and file APIs. -Future work will migrate it to use the [Scan API](/concepts/scanning) `Source` trait, unifying -file discovery, multi-file coordination, and pushdown behind a single interface shared across -all engine integrations. +DuckDB workers consume chunks from a shared scan stream. Vortex opens files, expands layouts into +ScanPlan trees, prepares evidence and projection reads, and exports produced arrays into DuckDB's +native vector format. diff --git a/docs/developer-guide/internals/io.md b/docs/developer-guide/internals/io.md index 9143d203fb2..9871093af46 100644 --- a/docs/developer-guide/internals/io.md +++ b/docs/developer-guide/internals/io.md @@ -78,6 +78,12 @@ A `SharedSegmentSource` deduplicates concurrent requests for the same segment us shared futures, ensuring that only one underlying I/O request is issued regardless of how many callers request the same segment simultaneously. +`SegmentId` is scoped to one file's footer segment map. A shared cache used across several opened +files must include file/source identity in its effective key to avoid collisions between, for +example, `SegmentId(0)` in two different files. 
The `SegmentSource` adapter is installed per opened +file, and scheduler-aware segment requests carry source identity when scan work is coordinated +across sources. + ## Backend Adaptation Each `VortexReadAt` implementation provides its own concurrency and coalescing parameters, @@ -91,3 +97,16 @@ allowing the I/O scheduler to adapt automatically: Local file reads are dispatched via `spawn_blocking` to avoid blocking the async executor. Object store reads are natively async and wrapped with `async_compat` for runtime compatibility. + +## Scan Scheduler Integration + +The ScanPlan scheduler keeps `VortexReadAt` as the common adapter for positional byte sources, but +makes segment requests visible to scan planning. A prepared `VortexFile` binds layout `SegmentId`s +to a segment source, and morsel tasks can report the segment requests they need before execution. +The scheduler sees the source ID, segment ID, byte size, and priority metadata, but not physical +byte locations. Cacheable segment reads carry a source-scoped segment cache key. + +Prepared reads and evidence tasks request segments through the segment source. The source remains +responsible for segment-cache lookup, backend-specific physical coalescing, in-flight +deduplication, and submission. See [Scan Scheduler](scan-scheduler.md) for scheduler resource +coordination. diff --git a/docs/developer-guide/internals/scan-scheduler.md b/docs/developer-guide/internals/scan-scheduler.md new file mode 100644 index 00000000000..9b8ce751ce8 --- /dev/null +++ b/docs/developer-guide/internals/scan-scheduler.md @@ -0,0 +1,322 @@ +# Scan Scheduler + +This document describes the current ScanPlan-backed scheduler and I/O pipeline. +It is an implementation guide, not a design sketch. + +The scheduler is split across the following layers: + +- `vortex-scan::scheduler` owns the process/query-level scheduler object, + scheduler provider, and read-byte budget configuration.
+- `vortex-layout::scan::plan` owns the ScanPlan runtime interfaces and + layout-backed implementations. Deserialized layouts construct concrete plans + with the file-provided segment source. +- `vortex-file::multi::scan_v2` wires files into that runtime. It builds the + root plan for a file, plans morsels, queues evidence/predicate/projection + work, and decides which queued task is useful next. +- `vortex-file::segments` and `vortex-file::read` own segment future + registration, logical read deduplication, physical range coalescing, and + backend request concurrency. + +The global scheduler is deliberately not a central work queue. It does not know +about predicates, layouts, row masks, or query semantics. The scan runtime makes +those decisions locally, then uses scheduler-visible read bytes and task lanes to +control how much work is launched. + +## Execution Shape + +The normal DataFusion ScanPlan path is: + +```text +DataFusion DataSource::open(partition) + | + v +VortexDataSource builds ScanRequest + | + v +DataSourceRef::plan_morsel_partitions or DataSourceRef::scan + | + v +ScanSchedulerProvider::scheduler_for_scan + | + v +partition_work_stream + | + +-- plan morsels into task queues + +-- create task steps and register segment reads synchronously + +-- admit tasks by lane/frontier/read bytes + +-- poll task futures on the Vortex runtime + +-- emit arrays in ordered or unordered mode +``` + +`DataSource::plan_morsel_partitions` is used when the engine can consume many +output partitions. It opens files, asks each prepared file for split ranges, and +round-robins planned morsels across the engine-requested partition count. Each +partition then runs its own `partition_work_stream`, but planned morsels from the +same file share the same `ScanExecution`, `SegmentFutureCache`, and +`FileSegmentSource`. + +`DataSource::scan` is the fallback path. It yields file partitions and each file +partition creates its own `partition_work_stream`. 
+ +Limited scans force a morsel planning window of one because limit accounting is +owned by the scan runtime and must not speculatively consume rows far ahead of +the output frontier. + +## Scheduler Objects + +`ScanSchedulerConfig` currently has one enforced field: + +- `read_byte_budget`: optional per-partition active logical segment-byte budget. + +`ScanSchedulerProvider` chooses scheduler ownership: + +- `Unbounded`: create an unbounded scheduler for the scan. +- `Shared`: reuse one `Arc`. +- `PerScan`: create a fresh scheduler from the config for each logical scan. + +The default `VortexSession` provider is `Unbounded`. DuckDB installs a shared +default scheduler in the extension session. The DataFusion benchmark only +installs a scheduler when `VORTEX_SCAN_SCHEDULER` is set. + +There is no scheduler permit API in the ScanPlan runtime. Task launch is admitted by +the per-partition `ScanTaskQueue` using active logical read bytes. Limited scans +still plan one active morsel at a time internally because limit accounting must +not consume rows far ahead of the output frontier, but that is not a public +tuning knob. + +## Planning Morsels + +`partition_work_stream` owns a `PartitionWorkSchedulerState`: + +- `pending`: planned morsel ranges not yet converted to runtime state. +- `morsels`: active morsel states indexed by morsel id. +- `task_queue`: queued evidence, predicate, projection, and aggregate tasks. +- `in_flight`: launched task futures. +- `completed_morsels`: ordered-output buffer. +- `plan_window`: internal active planned-morsel cap. This is unbounded for + normal scans and one for limited scans. + +On each stream poll, the runtime: + +1. Emits already-completed output if possible. +2. Plans more morsels while `active_morsels < plan_window`. +3. Launches admissible queued tasks until the task queue refuses more work. +4. Waits for one launched task to complete. +5. Updates evidence, predicate masks, projection state, and read accounting. 
+ +Planning a morsel is synchronous. It creates initial evidence work and then calls +`enqueue_ready_work`. For scans without predicates, projection work is queued +immediately. For filtered scans, the runtime queues evidence first, then residual +predicate reads, then projection once all predicates are proven for the morsel. + +## Task Lanes + +`ScanTaskQueue` groups queued work into lanes: + +- `ScanEvidence`: scan-domain evidence shared by all morsels for one predicate. +- `Evidence`: morsel-local evidence for one predicate. +- `Predicate`: exact residual predicate evaluation. +- `Projection`: final projected values. +- `Aggregate`: aggregate reads, grouped with projection. + +Admission is not FIFO across all work. The queue tries groups in this order: + +1. Evidence within its byte target. +2. Predicate within its byte target. +3. Projection within its byte target. +4. Predicate ignoring group target. +5. Projection ignoring group target. +6. Evidence ignoring group target. + +All groups still obey the total read-byte budget unless the task contributes no +new bytes or the runtime has no launched work at all. The empty-in-flight escape +hatch prevents deadlock when one task is larger than the configured budget. + +Within a group, lower priority wins, then lower incremental read bytes, then +lower total read bytes, then lower morsel id. The incremental byte score is +important because tasks reading the same active segment can be admitted without +increasing active physical-read pressure. + +There is no fixed morsel read-ahead frontier. Morsels can vary substantially in +byte size and can overlap in their segment requests, so run-ahead is governed by +incremental active read bytes rather than by a count of morsels. A later morsel +with small or already-active reads may be admitted ahead of an earlier morsel +whose reads would exceed the active byte budget. 
+ +For dynamic-predicate scans there is one extra gate: speculative projection is +suppressed while completed output is backlogged, except when there are no +launched tasks and one projection is needed to keep an ordered stream moving. +Evidence and predicate tasks are still admissible while projection is gated. This +favors avoiding wasted projection I/O over maximizing object-store request depth. + +## Read-Byte Budget + +`read_byte_budget` is per partition stream. It counts active logical segment +bytes for admitted tasks, deduped by `SegmentRequestKey`. If two launched tasks +await the same segment, only the first contributes bytes; the active entry keeps +a reference count until both tasks complete. + +When the budget is finite, the queue divides target bytes by group: + +```text +predicate: 6/8 of budget +projection: 1/8 of budget +evidence: 1/8 of budget +``` + +These are soft group targets. The second pass can use any remaining total budget +for predicate, projection, or evidence, but no task can exceed the total budget +unless it is the only way to make progress. + +The default bounded config uses: + +```text +DEFAULT_READ_BYTE_BUDGET = 256 MiB +``` + +`ScanSchedulerConfig::unbounded()` leaves this unset, which becomes `u64::MAX` +inside `partition_work_stream`. + +## Segment Reads + +Prepared reads and evidence providers create tasks. When a task is converted +into a scheduler-visible step, the concrete `ScanPlan` implementation turns any +needed layout segments into `ScanRead` values through its scan-local +`SegmentFutureCache`: + +```rust +cache.register(source, requests) +``` + +This call is synchronous. For cache misses, it calls the underlying +`SegmentSource::request(segment)` immediately and stores a shared future in the +scan-local cache. That means creating a read step registers the logical reads +with the file segment source before the task continuation is run. + +The cache key is currently the logical `SegmentId`. 
That is sufficient inside a +file-bound `SegmentFutureCache` because the concrete plans using that cache are +bound to the same file segment source. It is not a cross-file cache key. + +`SegmentInfo` contains only logical payload `bytes`, which the task scheduler +uses for read-budget admission. Segment-cache policy is owned by the +`SegmentCacheSourceAdapter`; it is not expressed through scheduler-visible +segment metadata. + +## Physical I/O + +`FileSegmentSource` bridges logical segment requests to a `VortexReadAt` backend. +It has an internal event stream with these request states: + +- registered: a segment future exists, but has not been polled; +- requested: the segment future has been polled; +- in-flight: the physical backend read has been submitted; +- resolved: the future has completed. + +Registered but unpolled requests are still visible to coalescing. When one +request is polled, `IoRequestStream` picks the earliest polled request and may +coalesce nearby registered or polled requests by physical offset. + +Physical coalescing is controlled by `VortexReadAt::coalesce_config()`: + +```text +in-memory: 8 KiB distance, 8 KiB max +local file: 1 MiB distance, 4 MiB max +object storage: 1 MiB distance, 16 MiB max +``` + +Physical request concurrency is controlled by `VortexReadAt::concurrency()`: + +```text +ObjectStoreReadAt default concurrency = 192 +``` + +This concurrency is below the scan task queue. The object-store layer can only +use that depth if the scan runtime has registered and polled enough segment +futures. + +## Object Store Behavior + +The current object-store path has good physical defaults but no automatic scan +scheduler preset: + +- `ObjectStoreReadAt` uses object-store coalescing and high physical request + concurrency. +- DataFusion remote benchmarks create the `VortexSession` before registering the + object store URL, so the Vortex scheduler provider cannot infer S3/GCS from + the source URL. 
+- DuckDB uses a shared scheduler with the default active read-byte budget. +- DataFusion uses an unbounded scheduler unless benchmark environment variables + opt into a scheduler. + +For object stores, the main risk is not the `ObjectStoreReadAt` queue depth. It +is failing to expose enough useful segment futures early enough, or exposing far +too many tiny/sparse reads without a workload-specific budget. The important +knobs are: + +- `read_byte_budget`: how many active logical segment bytes may be polled; +- physical coalescing distance/max size on the object-store reader; +- physical object-store request concurrency; +- DataFusion output partition count, which controls how many partition streams + run at once. + +## Benchmark Knobs + +The DataFusion benchmark supports: + +```text +VORTEX_SCAN_SCHEDULER=unbounded|shared|per-query +VORTEX_SCAN_MAX_READ_BYTES=... +``` + +Useful S3 sweeps should compare: + +```text +# Default unbounded behavior. +VORTEX_SCAN_SCHEDULER=unbounded + +# Bounded read pressure, one scheduler per query. +VORTEX_SCAN_SCHEDULER=per-query +VORTEX_SCAN_MAX_READ_BYTES=268435456 + +# Larger remote-storage byte window. +VORTEX_SCAN_SCHEDULER=per-query +VORTEX_SCAN_MAX_READ_BYTES=1073741824 +``` + +An active-logical-read target was tested as an I/O-depth proxy and rejected: it +improved some FineWeb cases, but regressed local PolarSignals enough that it was +too indirect to use as a scheduler knob. + +## Tuning Guidance + +For local NVMe, keep the read budget moderate and rely on local filesystem +coalescing. Excessive read-ahead can increase memory pressure without hiding much +latency. + +For S3/GCS, prefer a larger byte budget so the file segment source can keep more +useful logical reads active and coalesce adjacent registered requests. +If a query is highly selective and projection reads are sparse, validate the +coalesced-byte metrics before increasing the object-store coalescing max size. 
+If dynamic predicates are active, also compare projection-gated behavior against +object-store request depth: the gate is intended to avoid wasted projection I/O, +but it can reduce S3 latency hiding for projection-light queries. + +Use scan metrics to separate three failure modes: + +- low object-store request concurrency: not enough futures are being polled; +- low coalescing: not enough adjacent futures are registered before polling; +- excessive over-read: coalesced requests are much larger than useful projected + segment bytes. + +The scheduler today cannot distinguish those automatically. The next practical +tuning step is to expose byte-based controls for physical object-store +coalescing/request pressure if logical read-byte budgeting is not enough. + +## Known Gaps + +- The benchmark can configure scheduler mode and read-byte budget, but not + physical object-store coalescing or request concurrency. +- There is no automatic object-store scheduler preset. +- The scan runtime accounts logical segment bytes, not physical coalesced bytes. +- Output Arrow conversion is outside the scan task queue and has separate + buffering in the DataFusion adapter. diff --git a/docs/developer-guide/internals/serialization.md b/docs/developer-guide/internals/serialization.md index 11cfe720c47..7f5628d3be1 100644 --- a/docs/developer-guide/internals/serialization.md +++ b/docs/developer-guide/internals/serialization.md @@ -88,8 +88,9 @@ The postscript locates four regions by offset and length: configs. The layout FlatBuffer is a tree of `Layout` nodes, each containing an encoding ID, row count, -metadata, child layouts, and segment indices. This tree is deserialized and bound to a segment -source to create a `LayoutReader` that can lazily fetch data on demand. +metadata, child layouts, and segment indices. This tree is deserialized into lazy `LayoutRef` +nodes. 
During a scan, layout vtables expand those nodes into a `ScanPlan` tree that requests +segments from the bound segment source on demand. ## FlatBuffers diff --git a/docs/developer-guide/internals/session.md b/docs/developer-guide/internals/session.md index 823183ae7cc..8c8efdb0938 100644 --- a/docs/developer-guide/internals/session.md +++ b/docs/developer-guide/internals/session.md @@ -30,7 +30,7 @@ Each Vortex crate defines a session variable that holds a registry for its exten | `DTypeSession` | `vortex-array` | Extension dtype vtables (Date, Time, ...) | | `ArraySession` | `vortex-array` | Array encoding vtables (ALP, FSST, ...) | | `ScalarFnSession` | `vortex-array` | Scalar function vtables | -| `LayoutSession` | `vortex-layout` | Layout encoding vtables (Flat, Chunked, ...) | +| `LayoutSession` | `vortex-layout` | Layout vtable plugins (Flat, Chunked, ...) | | `RuntimeSession` | `vortex-io` | Async runtime handle | | `CudaSession` | `vortex-cuda` | CUDA context, kernels, and stream pool | @@ -47,8 +47,8 @@ Plugins register with the session by accessing the relevant component and callin // Register a custom array encoding session.arrays().register(MyEncoding); -// Register a custom layout -session.layouts().register(MyLayout::encoding()); +// Register a custom layout vtable +session.layouts().register_v2(MyLayout); // Register a custom scalar function session.scalar_fns().register(MyScalarFnVTable); @@ -61,7 +61,7 @@ to register all built-in encodings. ## Explicit Passing Sessions are passed explicitly through constructors and method arguments. This means every API -that needs access to registries -- file readers, writers, scan builders, layout readers -- receives +that needs access to registries -- file readers, writers, scan sources, layout vtables -- receives the session directly rather than reaching for global state. 
```rust @@ -75,8 +75,11 @@ session.write_options() .write(&mut file, array_stream) .await?; -// Scanning a layout -ScanBuilder::new(session.clone(), layout_reader) +// Scanning a file +let stream = session.open_options() + .open_path("data.vortex") + .await? + .scan()? .with_filter(expr) .into_array_stream()?; ``` diff --git a/docs/developer-guide/internals/vtables.md b/docs/developer-guide/internals/vtables.md index 77283ea012d..0e73bca2fda 100644 --- a/docs/developer-guide/internals/vtables.md +++ b/docs/developer-guide/internals/vtables.md @@ -206,11 +206,21 @@ Currently uses `VTable` (unqualified), `VTableAdapter`, `DynExprVTable` (sealed and `ExprVTable` (confusingly, the erased ref). Needs renaming to `ExprVTable`, `DynExpr`, `ExprRef`. Introduce `Expr` data struct, remove `VTableAdapter`. -### Layout -- Not started +### Layout -- Implemented for serialized scan layouts -Currently uses `VTable` (unqualified), `LayoutAdapter`, and `Layout` (sealed trait doubling -as public API). Needs renaming to `LayoutVTable`, `DynLayout`, `LayoutRef`. Introduce -`Layout` data struct, remove `LayoutAdapter`. +The scan layout path follows this pattern in `vortex_layout::layout_v2`: + +- `layout_v2::VTable` is the layout vtable implemented by layout plugins. +- `Layout<V>` is the typed layout handle with common fields hoisted: dtype, row count, segment IDs, + and lazy child access. +- `V::LayoutData` stores only layout-specific metadata. +- `LayoutRef` is the public type-erased layout handle. +- `DynLayout` is private erased dispatch plumbing. +- `LayoutVTablePlugin` is the registry object used for ID-based footer deserialization. + +The layout vtable also owns scan expansion through `new_scan_plan`. This keeps serialized layout +metadata and runtime scan behavior registered at the same plugin point: deserializing a layout +produces `Layout<V>`, and scanning it expands that typed layout into a `ScanPlan`.
### Array -- Not started diff --git a/docs/specs/file-format.md b/docs/specs/file-format.md index 425294f2d99..8704a510e40 100644 --- a/docs/specs/file-format.md +++ b/docs/specs/file-format.md @@ -75,8 +75,9 @@ valid to store a `Float64` array, a `Boolean` array, or any other root data type ## Footer The footer is a flat buffer serialized `Footer` object. This object contains all the information required to -load the root `Layout` object into a usable `LayoutReader`). +deserialize the root `Layout` object into a `LayoutRef` and bind its segment IDs to file byte ranges. For example, it contains the locations, compression schemes, encryption schemes, and required alignment of all segments in the file. +The scan runtime expands that root layout into a `ScanPlan` tree when a query is executed. :::{literalinclude} ../../vortex-flatbuffers/flatbuffers/vortex-file/footer.fbs :start-after: [footer] diff --git a/vortex-array/src/arrays/chunked/compute/kernel.rs b/vortex-array/src/arrays/chunked/compute/kernel.rs index db0042105cd..4d897afb067 100644 --- a/vortex-array/src/arrays/chunked/compute/kernel.rs +++ b/vortex-array/src/arrays/chunked/compute/kernel.rs @@ -5,10 +5,8 @@ use vortex_session::VortexSession; use crate::ArrayVTable; use crate::arrays::Chunked; -use crate::arrays::Dict; use crate::arrays::Filter; use crate::arrays::Slice; -use crate::arrays::dict::TakeExecuteAdaptor; use crate::arrays::filter::FilterExecuteAdaptor; use crate::arrays::slice::SliceExecuteAdaptor; use crate::optimizer::kernels::ArrayKernelsExt; @@ -23,6 +21,5 @@ pub(crate) fn initialize(session: &VortexSession) { kernels.register_execute_parent_kernel(Filter.id(), Chunked, FilterExecuteAdaptor(Chunked)); kernels.register_execute_parent_kernel(Mask.id(), Chunked, MaskExecuteAdaptor(Chunked)); kernels.register_execute_parent_kernel(Slice.id(), Chunked, SliceExecuteAdaptor(Chunked)); - kernels.register_execute_parent_kernel(Dict.id(), Chunked, TakeExecuteAdaptor(Chunked)); 
kernels.register_execute_parent_kernel(Zip.id(), Chunked, ZipExecuteAdaptor(Chunked)); } diff --git a/vortex-datafusion/src/convert/stats.rs b/vortex-datafusion/src/convert/stats.rs index 33a33a78ccf..ffa62c71e77 100644 --- a/vortex-datafusion/src/convert/stats.rs +++ b/vortex-datafusion/src/convert/stats.rs @@ -2,13 +2,24 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use datafusion_common::ColumnStatistics; +use datafusion_common::ScalarValue; use datafusion_common::stats::Precision; +use vortex::array::aggregate_fn::AggregateFnRef; +use vortex::array::aggregate_fn::AggregateFnVTableExt; +use vortex::array::aggregate_fn::EmptyOptions; +use vortex::array::aggregate_fn::NumericalAggregateOpts; +use vortex::array::aggregate_fn::fns::max::Max; +use vortex::array::aggregate_fn::fns::min::Min; +use vortex::array::aggregate_fn::fns::null_count::NullCount; +use vortex::array::aggregate_fn::fns::sum::Sum; +use vortex::array::aggregate_fn::fns::uncompressed_size_in_bytes::UncompressedSizeInBytes; use vortex::array::stats::StatsSet; use vortex::dtype::DType; use vortex::dtype::Nullability; use vortex::dtype::PType; use vortex::error::VortexExpect; use vortex::error::VortexResult; +use vortex::error::vortex_err; use vortex::expr::stats::Precision as VortexPrecision; use vortex::expr::stats::Stat; use vortex::scalar::Scalar; @@ -16,7 +27,45 @@ use vortex::scalar::Scalar; use crate::PrecisionExt; use crate::convert::TryToDataFusion; +const MIN_INDEX: usize = 0; +const MAX_INDEX: usize = 1; +const SUM_INDEX: usize = 2; +const NULL_COUNT_INDEX: usize = 3; +const BYTE_SIZE_INDEX: usize = 4; + +pub(crate) fn column_statistics_aggregate_fns() -> Vec { + vec![ + Min.bind(NumericalAggregateOpts::default()), + Max.bind(NumericalAggregateOpts::default()), + Sum.bind(NumericalAggregateOpts::default()), + NullCount.bind(EmptyOptions), + UncompressedSizeInBytes.bind(EmptyOptions), + ] +} + +pub(crate) fn aggregate_stats_to_df( + stats: &[VortexPrecision], +) -> 
VortexResult { + if stats.len() != BYTE_SIZE_INDEX + 1 { + return Err(vortex_err!( + "expected {} aggregate statistics, got {}", + BYTE_SIZE_INDEX + 1, + stats.len() + )); + } + + Ok(ColumnStatistics { + null_count: scalar_u64_to_df_usize(&stats[NULL_COUNT_INDEX])?, + min_value: scalar_to_df(&stats[MIN_INDEX])?, + max_value: scalar_to_df(&stats[MAX_INDEX])?, + sum_value: scalar_to_df(&stats[SUM_INDEX])?, + distinct_count: Precision::Absent, + byte_size: scalar_u64_to_df_usize(&stats[BYTE_SIZE_INDEX])?, + }) +} + /// Convert a stats set for an array with the given dtype. +#[allow(dead_code)] pub(crate) fn stats_set_to_df( stats_set: &StatsSet, dtype: &DType, @@ -88,6 +137,29 @@ pub(crate) fn is_constant_to_distinct_count( } } +fn scalar_to_df(stat: &VortexPrecision) -> VortexResult> { + match stat { + VortexPrecision::Exact(scalar) => Ok(Precision::Exact(scalar.try_to_df()?)), + VortexPrecision::Inexact(scalar) => Ok(Precision::Inexact(scalar.try_to_df()?)), + VortexPrecision::Absent => Ok(Precision::Absent), + } +} + +fn scalar_u64_to_df_usize(stat: &VortexPrecision) -> VortexResult> { + match stat { + VortexPrecision::Exact(scalar) => Ok(Precision::Exact(scalar_u64_to_usize(scalar)?)), + VortexPrecision::Inexact(scalar) => Ok(Precision::Inexact(scalar_u64_to_usize(scalar)?)), + VortexPrecision::Absent => Ok(Precision::Absent), + } +} + +fn scalar_u64_to_usize(scalar: &Scalar) -> VortexResult { + let Some(value) = scalar.as_primitive().typed_value::() else { + return Err(vortex_err!("expected u64 statistic scalar, got {}", scalar)); + }; + Ok(usize::try_from(value).unwrap_or(usize::MAX)) +} + #[cfg(test)] mod tests { use vortex::expr::stats::Precision as VortexPrecision; diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index a0d49e6105a..a181a8c64db 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -57,8 +57,10 @@ use vortex::file::EOF_SIZE; use 
vortex::file::MAX_POSTSCRIPT_SIZE; use vortex::file::OpenOptionsSessionExt; use vortex::file::VORTEX_FILE_EXTENSION; +use vortex::file::VortexFile; use vortex::io::object_store::ObjectStoreReadAt; use vortex::io::session::RuntimeSessionExt; +use vortex::layout::scan::v2::scan2_enabled; use vortex::scalar::Scalar; use vortex::scalar::ScalarValue as VortexScalarValue; use vortex::session::VortexSession; @@ -68,6 +70,8 @@ use super::sink::VortexSink; use super::source::VortexSource; use crate::PrecisionExt as _; use crate::convert::TryToDataFusion; +use crate::convert::stats::aggregate_stats_to_df; +use crate::convert::stats::column_statistics_aggregate_fns; use crate::convert::stats::is_constant_to_distinct_count; const DEFAULT_FOOTER_INITIAL_READ_SIZE_BYTES: usize = MAX_POSTSCRIPT_SIZE as usize + EOF_SIZE; @@ -247,16 +251,10 @@ impl VortexFormatFactory { } } - /// Creates a factory with an explicit session and default options. - /// - /// The supplied options become the baseline for every [`VortexFormat`] - /// created by this factory. DataFusion may still override them with - /// table-level options passed into [`FileFormatFactory::create`]. - pub fn new_with_options(session: VortexSession, options: VortexTableOptions) -> Self { - Self { - session, - options: Some(options), - } + /// Overrides the [`VortexSession`] used by formats created from this factory. + pub fn with_session(mut self, session: VortexSession) -> Self { + self.session = session; + self } /// Overrides the default options for this factory. @@ -436,6 +434,49 @@ impl FileFormat for VortexFormat { let file_metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); SpawnedTask::spawn(async move { + if scan2_enabled().map_err(|error| DataFusionError::External(Box::new(error)))? 
{ + let cached_footer = file_metadata_cache + .get(&object.location) + .filter(|entry| entry.is_valid_for(&object)) + .and_then(|entry| { + entry + .file_metadata + .as_any() + .downcast_ref::() + .map(|vortex_metadata| vortex_metadata.footer().clone()) + }); + let footer_cache_hit = cached_footer.is_some(); + + let reader = Arc::new(ObjectStoreReadAt::new_with_allocator( + store, + object.location.clone(), + session.handle(), + session.allocator(), + )); + let mut open_opts = session + .open_options() + .with_initial_read_size(opts.footer_initial_read_size_bytes) + .with_file_size(object.size); + if let Some(footer) = cached_footer { + open_opts = open_opts.with_footer(footer); + } + + let vxf = open_opts.open_read(reader).await.map_err(|e| { + DataFusionError::Execution(format!( + "Failed to open Vortex file {}: {e}", + object.location + )) + })?; + + if !footer_cache_hit { + let file_metadata = Arc::new(CachedVortexMetadata::new(&vxf)); + let entry = CachedFileMetadataEntry::new(object.clone(), file_metadata); + file_metadata_cache.put(&object.location, entry); + } + + return infer_scan_plan_stats(&table_schema, &vxf).await; + } + // Try to get entry metadata first let cached_metadata = file_metadata_cache .get(&object.location) @@ -632,6 +673,49 @@ impl FileFormat for VortexFormat { } } +async fn infer_scan_plan_stats(table_schema: &SchemaRef, vxf: &VortexFile) -> DFResult { + let struct_dtype = vxf + .dtype() + .as_struct_fields_opt() + .vortex_expect("dtype is not a struct"); + let funcs = column_statistics_aggregate_fns(); + let mut column_statistics = vec![ColumnStatistics::default(); table_schema.fields().len()]; + let mut requested_columns = Vec::new(); + let mut requested_exprs = Vec::new(); + + for (idx, field) in table_schema.fields().iter().enumerate() { + if struct_dtype.find(field.name()).is_some() { + requested_columns.push(idx); + requested_exprs.push(vortex::expr::get_item( + field.name().as_str(), + vortex::expr::root(), + )); + } + } + + let 
stats = vxf + .scan_plan_statistics_many(&requested_exprs, &funcs) + .await + .map_err(|e| DataFusionError::Execution(format!("Failed to infer scan2 stats: {e}")))?; + for (column_idx, stats) in requested_columns.into_iter().zip(stats) { + column_statistics[column_idx] = aggregate_stats_to_df(&stats).map_err(|e| { + DataFusionError::Execution(format!("Failed to convert scan2 stats: {e}")) + })?; + } + + let total_byte_size = column_statistics + .iter() + .fold(DFPrecision::Exact(0), |acc, cs| acc.add(&cs.byte_size)); + let num_rows = usize::try_from(vxf.row_count()) + .map_err(|_| DataFusionError::Execution("Row count overflow".to_string()))?; + + Ok(Statistics { + num_rows: DFPrecision::Exact(num_rows), + total_byte_size, + column_statistics, + }) +} + fn scalar_stat_to_df( stat: Stat, value: Precision, diff --git a/vortex-datafusion/src/persistent/metrics.rs b/vortex-datafusion/src/persistent/metrics.rs index e3bb1b18868..e94fc426646 100644 --- a/vortex-datafusion/src/persistent/metrics.rs +++ b/vortex-datafusion/src/persistent/metrics.rs @@ -23,6 +23,7 @@ use vortex::metrics::Metric; use vortex::metrics::MetricValue; use crate::persistent::source::VortexSource; +use crate::v2::VortexDataSource; pub(crate) static PARTITION_LABEL: &str = "partition"; pub(crate) static PATH_LABEL: &str = "file_path"; @@ -91,6 +92,18 @@ impl ExecutionPlanVisitor for VortexMetricsFinder { } } + if let Some(scan) = exec.data_source().downcast_ref::() + && let Some(metrics_registry) = scan.metrics_registry() + { + for metric in metrics_registry + .snapshot() + .iter() + .flat_map(metric_to_datafusion) + { + set.push(Arc::new(metric)); + } + } + self.0.push(set); Ok(false) diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 2923b6c313c..d0dd158fa41 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -42,15 +42,19 @@ use vortex::dtype::FieldMask; use vortex::error::VortexError; 
use vortex::error::VortexExpect; use vortex::file::OpenOptionsSessionExt; +use vortex::file::VortexFile; use vortex::io::InstrumentedReadAt; +use vortex::io::session::RuntimeSessionExt; use vortex::layout::LayoutReader; use vortex::layout::scan::scan_builder::ScanBuilder; use vortex::layout::scan::split_by::SplitBy; use vortex::metrics::Label; use vortex::metrics::MetricsRegistry; +use vortex::scan::ScanRequest; use vortex::session::VortexSession; use vortex_utils::aliases::dash_map::DashMap; use vortex_utils::aliases::dash_map::Entry; +use vortex_utils::parallelism::get_available_parallelism; use crate::VortexAccessPlan; use crate::convert::exprs::ExpressionConvertor; @@ -69,6 +73,7 @@ pub(crate) struct VortexOpener { pub partition: usize, pub session: VortexSession, pub vortex_reader_factory: Arc, + pub scan_v2: bool, /// Optional table schema projection. The indices are w.r.t. the `table_schema`, which is /// all fields in the final scan result not including the partition columns. pub projection: ProjectionExprs, @@ -96,6 +101,8 @@ pub(crate) struct VortexOpener { pub layout_readers: Arc>>, /// Shared full-file natural split ranges keyed by file path. pub natural_split_ranges: Arc]>>>, + /// Shared V2 file handles keyed by file path. 
+ pub vortex_files: Arc>>, /// Whether the query has output ordering specified pub has_output_ordering: bool, @@ -132,11 +139,13 @@ impl FileOpener for VortexOpener { let limit = self.limit; let layout_readers = Arc::clone(&self.layout_readers); let natural_split_ranges = Arc::clone(&self.natural_split_ranges); + let vortex_files = Arc::clone(&self.vortex_files); let has_output_ordering = self.has_output_ordering; let scan_concurrency = self.scan_concurrency; let expr_convertor = Arc::clone(&self.expression_convertor); let projection_pushdown = self.projection_pushdown; + let scan_v2 = self.scan_v2; // Replace column access for partition columns with literals #[expect(clippy::disallowed_types)] @@ -208,10 +217,24 @@ impl FileOpener for VortexOpener { open_opts = open_opts.with_footer(footer); } - let vxf = open_opts - .open_read(reader) - .await - .map_err(|e| exec_datafusion_err!("Failed to open Vortex file {e}"))?; + let vxf = if let Some(hit) = vortex_files.get(&file.object_meta.location) { + Arc::clone(hit.value()) + } else { + let opened = Arc::new( + open_opts + .open_read(reader) + .await + .map_err(|e| exec_datafusion_err!("Failed to open Vortex file {e}"))?, + ); + + match vortex_files.entry(file.object_meta.location.clone()) { + Entry::Occupied(entry) => Arc::clone(entry.get()), + Entry::Vacant(entry) => { + entry.insert(Arc::clone(&opened)); + opened + } + } + }; // On a miss, cache the parsed footer so other partitions and later executions // skip the footer fetch and parse. `infer_schema`/`infer_stats` also populate @@ -302,33 +325,163 @@ impl FileOpener for VortexOpener { .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; let projector = leftover_projection.make_projector(&stream_schema)?; - // We share our layout readers with others partitions in the scan, so we can only need to read each layout in each file once. 
- let layout_reader = match layout_readers.entry(file.object_meta.location.clone()) { - Entry::Occupied(mut occupied_entry) => { - if let Some(reader) = occupied_entry.get().upgrade() { - tracing::trace!("reusing layout reader for {}", occupied_entry.key()); - reader + let filter = filter + .and_then(|f| { + // Verify that all filters we've accepted from DataFusion get pushed down. + // This will only fail if the user has not configured a suitable + // PhysicalExprAdapterFactory on the file source to handle rewriting the + // expression to handle missing/reordered columns in the Vortex file. + let (pushed, unpushed): (Vec, Vec) = + split_conjunction(&f) + .into_iter() + .cloned() + .partition(|expr| { + expr_convertor.can_be_pushed_down(expr, &this_file_schema) + }); + + if !unpushed.is_empty() { + return Some(Err(exec_datafusion_err!( + r#"VortexSource accepted but failed to push {} filters. + This should never happen if you have a properly configured + PhysicalExprAdapterFactory configured on the source. + + Failed filters: + + {unpushed:#?} + "#, + unpushed.len() + ))); + } + + make_vortex_predicate(expr_convertor.as_ref(), &pushed).transpose() + }) + .transpose()?; + + if scan_v2 { + let row_range = if let Some(file_range) = file.range { + let byte_range = Range { + start: u64::try_from(file_range.start).map_err(|_| { + exec_datafusion_err!("Vortex file range start is negative") + })?, + end: u64::try_from(file_range.end).map_err(|_| { + exec_datafusion_err!("Vortex file range end is negative") + })?, + }; + if byte_range.start == 0 && byte_range.end == file.object_meta.size { + None } else { - tracing::trace!("creating layout reader for {}", occupied_entry.key()); - let reader = vxf.layout_reader().map_err(|e| { - DataFusionError::Execution(format!( - "Failed to create layout reader: {e}" - )) - })?; - occupied_entry.insert(Arc::downgrade(&reader)); - reader + // DataFusion partitions a single file by byte ranges. 
V2 may expose only + // coarse top-level split hints, so assigning whole natural splits here can + // collapse many byte ranges into a few row ranges. Slice proportionally by + // row count; the V2 scan plan will still split the resulting row range into + // layout-aware morsels during preparation. + let Some(row_range) = byte_range_to_row_range( + byte_range, + file.object_meta.size, + vxf.row_count(), + ) else { + return Ok(stream::empty().boxed()); + }; + Some(row_range) } - } - Entry::Vacant(vacant_entry) => { - tracing::trace!("creating layout reader for {}", vacant_entry.key()); - let reader = vxf.layout_reader().map_err(|e| { - DataFusionError::Execution(format!("Failed to create layout reader: {e}")) - })?; - vacant_entry.insert(Arc::downgrade(&reader)); + } else { + None + }; - reader + let selection = file + .extensions + .get::() + .and_then(|vortex_plan| vortex_plan.selection().cloned()) + .unwrap_or_default(); + let stream_target_field = + Field::new_struct("", stream_schema.fields().clone(), false); + let file_location = file.object_meta.location.clone(); + let array_stream = vxf + .scan_plan_stream(ScanRequest { + projection: scan_projection, + filter, + row_range, + selection, + ordered: has_output_ordering, + limit, + ..Default::default() + }) + .map_err(|e| { + exec_datafusion_err!("Failed to create Vortex scan2 stream: {e}") + })?; + // The Vortex->Arrow conversion (decode + canonicalize) is CPU-bound, so spawn each + // chunk's conversion onto the runtime's CPU pool and buffer them. This fans the + // decode out within a single partition instead of running serially on the consumer's + // poll thread, which matters for scans with few partitions (e.g. small tables). + // `buffered` preserves order for ordered consumers. 
+ let handle = session.handle(); + let decode_concurrency = 4 * get_available_parallelism().unwrap_or(1); + let converted = array_stream.map(move |chunk| { + let session = session.clone(); + let stream_target_field = stream_target_field.clone(); + handle.spawn_cpu(move || { + let chunk = chunk?; + let mut ctx = session.create_execution_ctx(); + let arrow_session = ctx.session().clone(); + let arrow = arrow_session.arrow().execute_arrow( + chunk, + Some(&stream_target_field), + &mut ctx, + )?; + Ok(RecordBatch::from(arrow.as_struct().clone())) + }) + }); + let stream = if has_output_ordering { + converted.buffered(decode_concurrency).boxed() + } else { + converted.buffer_unordered(decode_concurrency).boxed() } - }; + .map_ok(move |rb| { + // We try and slice the stream into respecting datafusion's configured batch size. + stream::iter( + (0..rb.num_rows().div_ceil(batch_size * 2)) + .flat_map(move |block_idx| { + let offset = block_idx * batch_size * 2; + + // If we have less than two batches worth of rows left, we keep them together as a single batch. 
+ if rb.num_rows() - offset < 2 * batch_size { + let length = rb.num_rows() - offset; + [Some(rb.slice(offset, length)), None].into_iter() + } else { + let first = rb.slice(offset, batch_size); + let second = rb.slice(offset + batch_size, batch_size); + [Some(first), Some(second)].into_iter() + } + }) + .flatten() + .map(Ok), + ) + }) + .map_err(move |e: VortexError| { + DataFusionError::External(Box::new( + e.with_context(format!("Failed to read Vortex file: {file_location}")), + )) + }) + .try_flatten() + .map(move |batch| { + if projector.projection().as_ref().is_empty() { + batch + } else { + batch.and_then(|b| projector.project_batch(&b)) + } + }) + .boxed(); + + return if let Some(file_pruner) = file_pruner { + Ok(PrunableStream::new(file_pruner, stream).boxed()) + } else { + Ok(stream) + }; + } + + // We share our layout readers with others partitions in the scan, so we can only need to read each layout in each file once. + let layout_reader = + layout_reader_for_file(layout_readers.as_ref(), &file.object_meta.location, &vxf)?; let mut scan_builder = ScanBuilder::new(session.clone(), Arc::clone(&layout_reader)); @@ -364,38 +517,6 @@ impl FileOpener for VortexOpener { } } - let filter = filter - .and_then(|f| { - // Verify that all filters we've accepted from DataFusion get pushed down. - // This will only fail if the user has not configured a suitable - // PhysicalExprAdapterFactory on the file source to handle rewriting the - // expression to handle missing/reordered columns in the Vortex file. - let (pushed, unpushed): (Vec, Vec) = - split_conjunction(&f) - .into_iter() - .cloned() - .partition(|expr| { - expr_convertor.can_be_pushed_down(expr, &this_file_schema) - }); - - if !unpushed.is_empty() { - return Some(Err(exec_datafusion_err!( - r#"VortexSource accepted but failed to push {} filters. - This should never happen if you have a properly configured - PhysicalExprAdapterFactory configured on the source. 
- - Failed filters: - - {unpushed:#?} - "#, - unpushed.len() - ))); - } - - make_vortex_predicate(expr_convertor.as_ref(), &pushed).transpose() - }) - .transpose()?; - if let Some(limit) = limit && filter.is_none() { @@ -472,6 +593,36 @@ impl FileOpener for VortexOpener { } } +/// Get or create a shared layout reader for a file. Layout readers are cached (weakly) per path so +/// each file's layout is parsed only once across all partitions of a scan. +fn layout_reader_for_file( + layout_readers: &DashMap>, + path: &Path, + vxf: &VortexFile, +) -> DFResult> { + let create = || { + vxf.layout_reader() + .map_err(|e| DataFusionError::Execution(format!("Failed to create layout reader: {e}"))) + }; + + match layout_readers.entry(path.clone()) { + Entry::Occupied(mut occupied_entry) => { + if let Some(reader) = occupied_entry.get().upgrade() { + Ok(reader) + } else { + let reader = create()?; + occupied_entry.insert(Arc::downgrade(&reader)); + Ok(reader) + } + } + Entry::Vacant(vacant_entry) => { + let reader = create()?; + vacant_entry.insert(Arc::downgrade(&reader)); + Ok(reader) + } + } +} + fn natural_split_ranges_for_file( natural_split_ranges: &DashMap]>>, path: &Path, @@ -506,6 +657,36 @@ fn compute_natural_split_ranges(layout_reader: &dyn LayoutReader) -> DFResult, + total_size: u64, + row_count: u64, +) -> Option> { + if byte_range.start >= byte_range.end || total_size == 0 || row_count == 0 { + return None; + } + + let start_byte = byte_range.start.min(total_size); + let end_byte = byte_range.end.min(total_size); + if start_byte >= end_byte { + return None; + } + + let start = byte_to_row(start_byte, total_size, row_count); + let end = if end_byte == total_size { + row_count + } else { + byte_to_row(end_byte, total_size, row_count) + }; + + (start < end).then_some(start..end) +} + +fn byte_to_row(byte: u64, total_size: u64, row_count: u64) -> u64 { + let row = (u128::from(byte) * u128::from(row_count)) / u128::from(total_size); + 
u64::try_from(row).vortex_expect("byte-to-row projection should fit into u64") +} + /// Translate a DataFusion byte range to the contiguous natural split ranges it owns. /// Most splits are assigned by midpoint, but the leading split stays with the range that owns /// byte 0 so a tiny first byte range still claims the first rows. @@ -612,6 +793,57 @@ mod tests { static SESSION: LazyLock = LazyLock::new(VortexSession::default); + #[rstest] + #[case(0..10, 100, 50, Some(0..5))] + #[case(10..20, 100, 50, Some(5..10))] + #[case(90..100, 100, 50, Some(45..50))] + #[case(100..110, 100, 50, None)] + #[case(0..1, 100, 50, None)] + fn test_byte_range_to_row_range( + #[case] byte_range: Range, + #[case] total_size: u64, + #[case] row_count: u64, + #[case] expected: Option>, + ) { + assert_eq!( + byte_range_to_row_range(byte_range, total_size, row_count), + expected + ); + } + + #[test] + fn test_byte_ranges_cover_rows_exactly_once() { + let total_size = 179_114_706; + let row_count = 6_001_215; + let partitions = 18; + let byte_ranges = (0..partitions) + .map(|idx| { + let start = idx * total_size / partitions; + let end = (idx + 1) * total_size / partitions; + start..end + }) + .collect::>(); + + let row_ranges = byte_ranges + .into_iter() + .filter_map(|byte_range| byte_range_to_row_range(byte_range, total_size, row_count)) + .collect::>(); + + assert_eq!(u64::try_from(row_ranges.len()), Ok(partitions)); + assert_eq!(row_ranges.first().map(|range| range.start), Some(0)); + assert_eq!(row_ranges.last().map(|range| range.end), Some(row_count)); + assert_eq!( + row_ranges + .iter() + .map(|range| range.end - range.start) + .sum::(), + row_count + ); + for (left, right) in row_ranges.iter().tuple_windows() { + assert_eq!(left.end, right.start); + } + } + #[rstest] #[case(0..3, 10, vec![0..2, 2..5, 5..10], Some(0..2))] #[case(3..7, 10, vec![0..2, 2..5, 5..10], Some(2..5))] @@ -695,6 +927,7 @@ mod tests { partition: 1, session: SESSION.clone(), vortex_reader_factory: 
Arc::new(DefaultVortexReaderFactory::new(object_store)), + scan_v2: false, projection: ProjectionExprs::from_indices(&[0], table_schema.file_schema()), filter, file_pruning_predicate: None, @@ -705,6 +938,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -793,6 +1027,29 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_open_scan_v2() -> anyhow::Result<()> { + let object_store = Arc::new(InMemory::new()) as Arc; + let file_path = "scan2/file.vortex"; + let batch = record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap(); + let data_size = + write_arrow_to_vortex(Arc::clone(&object_store), file_path, batch.clone()).await?; + + let table_schema = TableSchema::from_file_schema(batch.schema()); + let mut opener = make_opener(object_store, table_schema, None); + opener.scan_v2 = true; + + let stream = opener + .open(PartitionedFile::new(file_path.to_string(), data_size))? 
+ .await?; + let data = stream.try_collect::>().await?; + let num_rows = data.iter().map(|rb| rb.num_rows()).sum::(); + + assert_eq!(num_rows, 3); + + Ok(()) + } + #[tokio::test] async fn test_open_populates_file_metadata_cache() -> anyhow::Result<()> { let object_store = Arc::new(InMemory::new()) as Arc; @@ -867,6 +1124,7 @@ mod tests { vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(Arc::clone( &object_store, ))), + scan_v2: false, projection: ProjectionExprs::from_indices(&[0], table_schema.file_schema()), filter: Some(filter), file_pruning_predicate: None, @@ -877,6 +1135,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -954,6 +1213,7 @@ mod tests { partition: 1, session: SESSION.clone(), vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(object_store)), + scan_v2: false, projection: ProjectionExprs::from_indices(&[0, 1, 2], &table_schema), filter: None, file_pruning_predicate: None, @@ -964,6 +1224,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -1108,6 +1369,7 @@ mod tests { vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(Arc::clone( &object_store, ))), + scan_v2: false, projection: ProjectionExprs::from_indices( projection.as_ref(), table_schema.file_schema(), @@ -1121,6 +1383,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), 
has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -1171,6 +1434,7 @@ mod tests { partition: 1, session: SESSION.clone(), vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(object_store)), + scan_v2: false, projection, filter: None, file_pruning_predicate: None, @@ -1181,6 +1445,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -1314,6 +1579,64 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_scan_impl_flip_flop_v1_v2() -> anyhow::Result<()> { + let object_store = Arc::new(InMemory::new()) as Arc; + let file_path = "/path/file.vortex"; + + let batch = make_test_batch_with_10_rows(); + let data_size = + write_arrow_to_vortex(Arc::clone(&object_store), file_path, batch.clone()).await?; + + let schema = batch.schema(); + let mut file = PartitionedFile::new(file_path.to_string(), data_size); + file.extensions + .insert( + VortexAccessPlan::default().with_selection(Selection::IncludeByIndex( + Buffer::from_iter(vec![1, 3, 5, 7, 9]), + )), + ); + + let mut opener_v1 = make_test_opener( + Arc::clone(&object_store), + Arc::clone(&schema), + ProjectionExprs::from_indices(&[0, 1], &schema), + ); + opener_v1.scan_v2 = false; + opener_v1.limit = Some(3); + opener_v1.has_output_ordering = true; + + let mut opener_v2 = opener_v1.clone(); + opener_v2.scan_v2 = true; + + let v1 = opener_v1 + .open(file.clone())? + .await? 
+ .try_collect::>() + .await?; + let v2 = opener_v2.open(file)?.await?.try_collect::>().await?; + + let format_opts = FormatOptions::new().with_types_info(true); + let v1_pretty = pretty_format_batches_with_options(&v1, &format_opts)?.to_string(); + let v2_pretty = pretty_format_batches_with_options(&v2, &format_opts)?.to_string(); + + assert_eq!(v1_pretty, v2_pretty); + assert_eq!( + v1_pretty, + r"+-------+------+ +| a | b | +| Int32 | Utf8 | ++-------+------+ +| 1 | r1 | +| 3 | r3 | +| 5 | r5 | ++-------+------+" + .trim() + ); + + Ok(()) + } + #[tokio::test] // Test that when no extensions are provided, all rows are returned (backward compatibility). async fn test_selection_no_extensions() -> anyhow::Result<()> { @@ -1380,6 +1703,7 @@ mod tests { vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(Arc::clone( &object_store, ))), + scan_v2: false, projection, filter: None, file_pruning_predicate: None, @@ -1390,6 +1714,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index 2f4888404e8..74f4d5520b0 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -6,6 +6,7 @@ use std::ops::Range; use std::sync::Arc; use std::sync::Weak; +use datafusion_common::DataFusionError; use datafusion_common::Result as DFResult; use datafusion_common::config::ConfigOptions; use datafusion_datasource::TableSchema; @@ -31,7 +32,9 @@ use object_store::ObjectStore; use object_store::path::Path; use vortex::error::VortexExpect; use vortex::file::VORTEX_FILE_EXTENSION; +use vortex::file::VortexFile; use vortex::layout::LayoutReader; +use 
vortex::layout::scan::v2::scan2_enabled; use vortex::metrics::DefaultMetricsRegistry; use vortex::metrics::MetricsRegistry; use vortex::session::VortexSession; @@ -198,6 +201,8 @@ pub struct VortexSource { layout_readers: Arc>>, /// Shared full-file natural split ranges keyed by path. natural_split_ranges: Arc]>>>, + /// Shared V2 file handles keyed by path. + vortex_files: Arc>>, expression_convertor: Arc, pub(crate) vortex_reader_factory: Option>, pub(crate) ordered: bool, @@ -231,6 +236,7 @@ impl VortexSource { _unused_df_metrics: Default::default(), layout_readers: Arc::new(DashMap::default()), natural_split_ranges: Arc::new(DashMap::default()), + vortex_files: Arc::new(DashMap::default()), expression_convertor: Arc::new(DefaultExpressionConvertor::default()), vortex_reader_factory: None, vx_metrics_registry: Arc::new(DefaultMetricsRegistry::default()), @@ -347,11 +353,14 @@ impl VortexSource { .vortex_reader_factory .clone() .unwrap_or_else(|| Arc::new(DefaultVortexReaderFactory::new(object_store))); + let scan_v2 = + scan2_enabled().map_err(|error| DataFusionError::External(Box::new(error)))?; let opener = VortexOpener { partition, session: self.session.clone(), vortex_reader_factory, + scan_v2, projection: self.projection.clone(), filter: self.vortex_predicate.clone(), file_pruning_predicate: self.full_predicate.clone(), @@ -362,6 +371,7 @@ impl VortexSource { metrics_registry: Arc::clone(&self.vx_metrics_registry), layout_readers: Arc::clone(&self.layout_readers), natural_split_ranges: Arc::clone(&self.natural_split_ranges), + vortex_files: Arc::clone(&self.vortex_files), has_output_ordering: !base_config.output_ordering.is_empty() || self.ordered, expression_convertor: Arc::clone(&self.expression_convertor), file_metadata_cache: self.file_metadata_cache.clone(), diff --git a/vortex-datafusion/src/persistent/tests.rs b/vortex-datafusion/src/persistent/tests.rs index 220a1477b13..34687c5ca15 100644 --- a/vortex-datafusion/src/persistent/tests.rs +++ 
b/vortex-datafusion/src/persistent/tests.rs @@ -26,6 +26,8 @@ use vortex::array::arrays::VarBinArray; use vortex::array::validity::Validity; use vortex::buffer::Buffer; use vortex::buffer::buffer; +use vortex::expr::root; +use vortex::expr::select; use vortex::file::OpenOptionsSessionExt; use vortex::file::WriteOptionsSessionExt; use vortex::io::VortexWrite; @@ -444,8 +446,10 @@ async fn test_repartitioned_scan_matches_non_repartitioned_for_uneven_splits() - .iter() .map(|range| range.end - range.start) .collect::>(); + let scan2_split_ranges = vxf.plan_splits(&select(["value"], root())).await?; assert!(split_ranges.len() > 1); + assert_eq!(scan2_split_ranges, split_ranges); assert!( split_lengths .windows(2) diff --git a/vortex-datafusion/src/v2/source.rs b/vortex-datafusion/src/v2/source.rs index 82fae9bf56d..b7ecd0aafb9 100644 --- a/vortex-datafusion/src/v2/source.rs +++ b/vortex-datafusion/src/v2/source.rs @@ -71,6 +71,7 @@ use std::fmt; use std::fmt::Formatter; use std::sync::Arc; +use arrow_schema::DataType; use arrow_schema::Field; use arrow_schema::Schema; use arrow_schema::SchemaRef; @@ -97,10 +98,13 @@ use datafusion_physical_plan::stream::RecordBatchStreamAdapter; use futures::StreamExt; use futures::TryStreamExt; use futures::future::try_join_all; +use futures::stream; +use futures::stream::BoxStream; +use tokio::sync::OnceCell; +use vortex::array::ArrayRef; use vortex::array::VortexSessionExecute; use vortex::array::arrow::ArrowSessionExt; use vortex::dtype::DType; -use vortex::dtype::FieldPath; use vortex::dtype::Nullability; use vortex::error::VortexResult; use vortex::error::vortex_bail; @@ -112,7 +116,9 @@ use vortex::expr::root; use vortex::expr::stats::Precision; use vortex::expr::transform::replace; use vortex::io::session::RuntimeSessionExt; +use vortex::metrics::MetricsRegistry; use vortex::scan::DataSourceRef; +use vortex::scan::PlannedMorselScanRef; use vortex::scan::ScanRequest; use vortex::session::VortexSession; use 
vortex_utils::parallelism::get_available_parallelism; @@ -121,7 +127,8 @@ use crate::convert::exprs::DefaultExpressionConvertor; use crate::convert::exprs::ExpressionConvertor; use crate::convert::exprs::ProcessedProjection; use crate::convert::exprs::make_vortex_predicate; -use crate::convert::stats::stats_set_to_df; +use crate::convert::stats::aggregate_stats_to_df; +use crate::convert::stats::column_statistics_aggregate_fns; /// Builder for [`VortexDataSource`]. /// @@ -168,6 +175,7 @@ pub struct VortexDataSourceBuilder { arrow_schema: Option, projection: Option>, + metrics_registry: Option>, } impl VortexDataSourceBuilder { @@ -198,6 +206,15 @@ impl VortexDataSourceBuilder { self } + /// Attaches a Vortex metrics registry populated by the underlying data source. + /// + /// The V2 adapter does not open files itself, so callers that want Vortex read metrics must + /// also configure the wrapped source to write to this same registry. + pub fn with_metrics_registry(mut self, metrics_registry: Arc) -> Self { + self.metrics_registry = Some(metrics_registry); + self + } + /// Builds the [`VortexDataSource`]. /// /// The builder eagerly resolves statistics for the initial projection @@ -242,21 +259,21 @@ impl VortexDataSourceBuilder { }; // We now compute initial statistics. - let field_paths: Vec<_> = fields + let statistics_exprs: Vec<_> = fields .names() .iter() .cloned() - .map(FieldPath::from_name) + .map(|name| get_item(name, root())) .collect(); + let statistics_funcs = column_statistics_aggregate_fns(); let statistics = try_join_all( - field_paths + statistics_exprs .iter() - .map(|path| self.data_source.field_statistics(path)), + .map(|expr| self.data_source.statistics(expr, &statistics_funcs)), ) .await? 
.iter() - .zip(fields.fields()) - .map(|(stats, dtype)| stats_set_to_df(stats, &dtype)) + .map(|stats| aggregate_stats_to_df(stats)) .collect::>>()?; Ok(VortexDataSource { @@ -275,6 +292,8 @@ impl VortexDataSourceBuilder { limit: None, ordered: false, num_partitions: get_available_parallelism().unwrap_or(1), + metrics_registry: self.metrics_registry, + morsel_plan: Arc::new(OnceCell::new()), }) } } @@ -287,8 +306,30 @@ impl VortexDataSource { session, arrow_schema: None, projection: None, + metrics_registry: None, } } + + fn scan_partition_count(&self) -> usize { + if self.should_morsel_repartition() { + self.num_partitions.max(1) + } else { + 1 + } + } + + fn should_morsel_repartition(&self) -> bool { + self.data_source.supports_morsel_partitioning() && !self.ordered && self.limit.is_none() + } + + fn reset_morsel_plan(&mut self) { + self.morsel_plan = Arc::new(OnceCell::new()); + } + + /// Returns the metrics registry attached to this source, if one was configured. + pub fn metrics_registry(&self) -> Option<&Arc> { + self.metrics_registry.as_ref() + } } /// DataFusion [`DataSource`] backed by a Vortex [`DataSourceRef`]. @@ -301,9 +342,12 @@ impl VortexDataSource { /// During execution, it builds the final Vortex [`ScanRequest`] from the /// current projection, pushed filters, ordering hints, and row limit. /// -/// This integration intentionally reports a single DataFusion output partition. -/// Vortex then handles split-level concurrency internally by polling multiple -/// split streams concurrently. +/// For unordered scans without a limit, this integration reports DataFusion's +/// requested partition count when the wrapped source supports ScanPlan morsel +/// partitioning. The async morsel plan is still built lazily in [`DataSource::open`], +/// so partitions beyond the discovered morsel count produce empty streams. +/// Ordered and limited scans use one output partition so the source can preserve +/// semantics. 
/// /// Use [`crate::VortexSource`] instead when DataFusion should discover and plan /// `.vortex` files on its own. @@ -352,10 +396,15 @@ pub struct VortexDataSource { ordered: bool, /// The requested partition count from DataFusion, populated by [`DataSource::repartitioned`]. - /// We use this as a hint for how many splits to execute concurrently in `open()`, but we - /// always declare to DataFusion that we only have a single partition so that we can - /// internally manage concurrency and fix the problem of partition skew. + /// When morsel partitioning is enabled, this is also the count we report back to DataFusion. + /// The final lazy plan may discover fewer non-empty partitions. num_partitions: usize, + + /// Optional Vortex metrics registry populated by the wrapped source. + metrics_registry: Option>, + + /// Shared planned scan for DataFusion morsel repartitioning. + morsel_plan: Arc>>, } impl fmt::Debug for VortexDataSource { @@ -369,17 +418,46 @@ impl fmt::Debug for VortexDataSource { } } +async fn scan_to_array_stream( + data_source: DataSourceRef, + scan_request: ScanRequest, + num_partitions: usize, +) -> DFResult>> { + let ordered = scan_request.ordered; + let scan = data_source + .scan(scan_request) + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + // Each split.execute() returns a lazy stream whose early polls do preparation + // work (expression resolution, layout traversal, first I/O spawns). Unordered + // scans can poll multiple split streams concurrently so the next split is + // already warm when the current one finishes; ordered scans must preserve + // partition order. 
+ let scan_streams = scan.partitions().map(|split_result| { + let split = split_result?; + split.execute() + }); + + if ordered { + Ok(scan_streams.try_flatten().boxed()) + } else { + Ok(scan_streams + .try_flatten_unordered(Some(num_partitions * 2)) + .boxed()) + } +} + impl DataSource for VortexDataSource { fn open( &self, partition: usize, _context: Arc, ) -> DFResult { - // VortexScanSource always uses a single partition since Vortex handles parallelism - // and concurrency internally. - if partition != 0 { + let scan_partition_count = self.scan_partition_count(); + if partition >= scan_partition_count { return Err(DataFusionError::Internal(format!( - "VortexScanSource: expected partition 0, got {partition}" + "VortexScanSource: expected partition in 0..{scan_partition_count}, got {partition}" ))); } @@ -401,7 +479,10 @@ impl DataSource for VortexDataSource { false, )); let session = self.session.clone(); - let num_partitions = self.num_partitions; + let num_partitions = self.num_partitions.max(1); + let scan_partition_count = self.scan_partition_count(); + let use_morsel_repartition = self.should_morsel_repartition(); + let morsel_plan = Arc::clone(&self.morsel_plan); // Pre-build the leftover projector (if any) so we can apply it after batch conversion. let leftover_projector = self @@ -410,25 +491,48 @@ impl DataSource for VortexDataSource { .map(|proj| proj.make_projector(&self.projected_schema)) .transpose()?; - // Defer the async DataSource::scan() call to the first poll of the stream. - let stream = futures::stream::once(async move { - let scan = data_source - .scan(scan_request) - .await - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - // Each split.execute() returns a lazy stream whose early polls do preparation - // work (expression resolution, layout traversal, first I/O spawns). We use - // try_flatten_unordered to poll multiple split streams concurrently so that - // the next split is already warm when the current one finishes. 
- let scan_streams = scan.partitions().map(|split_result| { - let split = split_result?; - split.execute() - }); + // Defer the async DataSource work to the first poll of the stream. + let stream = stream::once(async move { + let array_stream: BoxStream<'static, VortexResult> = if use_morsel_repartition + { + let planned = morsel_plan + .get_or_try_init(|| { + let data_source = Arc::clone(&data_source); + let scan_request = scan_request.clone(); + async move { + data_source + .plan_morsel_partitions(scan_request, scan_partition_count) + .await + .map_err(|e| DataFusionError::External(Box::new(e))) + } + }) + .await?; + + if let Some(planned) = planned { + if partition >= planned.partition_count() { + // DataFusion can schedule every partition it asked us to expose. If the + // final lazy plan found fewer morsels, the surplus partitions are empty. + stream::empty().boxed() + } else { + Arc::clone(planned) + .partition(partition) + .map_err(|e| DataFusionError::External(Box::new(e)))? + .execute() + .map_err(|e| DataFusionError::External(Box::new(e)))? + .boxed() + } + } else if partition == 0 { + scan_to_array_stream(Arc::clone(&data_source), scan_request, num_partitions) + .await? + } else { + stream::empty().boxed() + } + } else { + scan_to_array_stream(Arc::clone(&data_source), scan_request, num_partitions).await? + }; let handle = session.handle(); - let stream = scan_streams - .try_flatten_unordered(Some(num_partitions * 2)) + let stream = array_stream .map(move |result| { let session = session.clone(); let target_field = Arc::clone(&projected_target_field); @@ -489,28 +593,36 @@ impl DataSource for VortexDataSource { _repartition_file_min_size: usize, output_ordering: Option, ) -> DFResult>> { - // Vortex handles parallelism internally — always use a single partition. 
let mut this = self.clone(); this.num_partitions = target_partitions; this.ordered |= output_ordering.is_some(); + this.reset_morsel_plan(); Ok(Some(Arc::new(this))) } fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(1) + // Report the engine-requested partition count. We do not pre-open files here just to learn + // the exact morsel count; open() maps any surplus partitions to empty streams. + Partitioning::UnknownPartitioning(self.scan_partition_count()) } fn eq_properties(&self) -> EquivalenceProperties { EquivalenceProperties::new(Arc::clone(&self.leftover_schema)) } - fn partition_statistics(&self, _partition: Option) -> DFResult> { + fn partition_statistics(&self, partition: Option) -> DFResult> { // FIXME(ngates): this should be adjusted based on filters. See DuckDB for heuristics, // and in the future, store the selectivity stats in the session. - let num_rows = estimate_to_df_precision(&self.data_source.row_count()); + let mut num_rows = estimate_to_df_precision(&self.data_source.row_count()); // FIXME(ngates): byte size should be adjusted for the initial projection... - let total_byte_size = estimate_to_df_precision(&self.data_source.byte_size()); + let mut total_byte_size = estimate_to_df_precision(&self.data_source.byte_size()); + + if partition.is_some() { + let partition_count = self.scan_partition_count(); + num_rows = divide_df_precision(num_rows, partition_count); + total_byte_size = divide_df_precision(total_byte_size, partition_count); + } // Column statistics must match the output schema (leftover_schema), which may differ // from the initial schema after try_swapping_with_projection adds computed columns. 
@@ -526,6 +638,7 @@ impl DataSource for VortexDataSource { fn with_fetch(&self, limit: Option) -> Option> { let mut this = self.clone(); this.limit = limit; + this.reset_morsel_plan(); Some(Arc::new(this)) } @@ -558,7 +671,9 @@ impl DataSource for VortexDataSource { // Compose with the initial projection so the scan operates on the original // source columns, not the initial projection's output columns. - let scan_projection = replace(scan_projection, &root(), self.initial_projection.clone()); + let scan_projection = replace(scan_projection, &root(), self.initial_projection.clone()) + .optimize_recursive(self.data_source.dtype()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Compute the scan output schema from the Vortex expression's return dtype. let scan_dtype = scan_projection @@ -586,6 +701,7 @@ impl DataSource for VortexDataSource { this.leftover_schema = Arc::clone(&final_schema); this.leftover_statistics = vec![ColumnStatistics::new_unknown(); final_schema.fields().len()]; + this.reset_morsel_plan(); Ok(Some(Arc::new(this))) } @@ -609,7 +725,8 @@ impl DataSource for VortexDataSource { let pushdown_results: Vec = filters .iter() .map(|expr| { - if convertor.can_be_pushed_down(expr, input_schema) { + let is_boolean = matches!(expr.data_type(input_schema), Ok(DataType::Boolean)); + if is_boolean && convertor.can_be_pushed_down(expr, input_schema) { PushedDown::Yes } else { PushedDown::No @@ -647,6 +764,7 @@ impl DataSource for VortexDataSource { let mut this = self.clone(); this.filter = new_filter; + this.reset_morsel_plan(); Ok( FilterPushdownPropagation::with_parent_pushdown_result(pushdown_results) .with_updated_node(Arc::new(this) as _), @@ -665,3 +783,12 @@ fn estimate_to_df_precision(est: &Precision) -> DFPrecision { Precision::Absent => DFPrecision::Absent, } } + +fn divide_df_precision(est: DFPrecision, divisor: usize) -> DFPrecision { + let divisor = divisor.max(1); + match est { + DFPrecision::Exact(v) => 
DFPrecision::Exact(v.div_ceil(divisor)), + DFPrecision::Inexact(v) => DFPrecision::Inexact(v.div_ceil(divisor)), + DFPrecision::Absent => DFPrecision::Absent, + } +} diff --git a/vortex-datafusion/src/v2/table.rs b/vortex-datafusion/src/v2/table.rs index 15849893121..b46e995afe1 100644 --- a/vortex-datafusion/src/v2/table.rs +++ b/vortex-datafusion/src/v2/table.rs @@ -24,6 +24,7 @@ use datafusion_expr::Expr; use datafusion_expr::TableType; use datafusion_physical_plan::ExecutionPlan; use vortex::expr::stats::Precision as VortexPrecision; +use vortex::metrics::MetricsRegistry; use vortex::scan::DataSourceRef; use vortex::session::VortexSession; @@ -76,6 +77,7 @@ pub struct VortexTable { data_source: DataSourceRef, session: VortexSession, arrow_schema: SchemaRef, + metrics_registry: Option>, } impl fmt::Debug for VortexTable { @@ -100,8 +102,18 @@ impl VortexTable { data_source, session, arrow_schema, + metrics_registry: None, } } + + /// Attaches a Vortex metrics registry populated by the underlying data source. + /// + /// The V2 table does not open files itself, so callers that want Vortex read metrics must also + /// configure the wrapped source to write to this same registry. + pub fn with_metrics_registry(mut self, metrics_registry: Arc) -> Self { + self.metrics_registry = Some(metrics_registry); + self + } } #[async_trait] @@ -122,23 +134,27 @@ impl TableProvider for VortexTable { _limit: Option, ) -> DFResult> { // Construct the physical node representing this table. - let data_source = - VortexDataSource::builder(Arc::clone(&self.data_source), self.session.clone()) - .with_arrow_schema(Arc::clone(&self.arrow_schema)) - // We push down the projection now since it can make building the physical plan a lot - // cheaper, e.g. by only computing stats for the projected columns. - .with_some_projection(projection.cloned()) - // We don't push down filters for two reasons: - // 1. Vortex requires a physical expression, not logical. 
DataFusion will try to push - // the physical filters later. - // 2. There's nothing useful we can do with filters now to reduce the amount of work - // we have to do. - // - // We also don't push down the limit for the same reason, there's nothing useful we - // can do with it. - .build() - .await - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let mut builder = + VortexDataSource::builder(Arc::clone(&self.data_source), self.session.clone()); + if let Some(metrics_registry) = &self.metrics_registry { + builder = builder.with_metrics_registry(Arc::clone(metrics_registry)); + } + let data_source = builder + .with_arrow_schema(Arc::clone(&self.arrow_schema)) + // We push down the projection now since it can make building the physical plan a lot + // cheaper, e.g. by only computing stats for the projected columns. + .with_some_projection(projection.cloned()) + // We don't push down filters for two reasons: + // 1. Vortex requires a physical expression, not logical. DataFusion will try to push + // the physical filters later. + // 2. There's nothing useful we can do with filters now to reduce the amount of work + // we have to do. + // + // We also don't push down the limit for the same reason, there's nothing useful we + // can do with it. 
+ .build() + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; Ok(DataSourceExec::from_data_source(data_source)) } diff --git a/vortex-duckdb/Cargo.toml b/vortex-duckdb/Cargo.toml index b6896910a89..9cfa3f118e8 100644 --- a/vortex-duckdb/Cargo.toml +++ b/vortex-duckdb/Cargo.toml @@ -48,6 +48,7 @@ anyhow = { workspace = true } geo-types = { workspace = true } jiff = { workspace = true } rstest = { workspace = true } +temp-env = { workspace = true } tempfile = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } vortex-runend = { workspace = true } diff --git a/vortex-duckdb/src/column_statistics.rs b/vortex-duckdb/src/column_statistics.rs index ccc71eeade1..c0d880bd1bc 100644 --- a/vortex-duckdb/src/column_statistics.rs +++ b/vortex-duckdb/src/column_statistics.rs @@ -1,6 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use vortex::array::aggregate_fn::AggregateFnRef; +use vortex::array::aggregate_fn::AggregateFnVTableExt; +use vortex::array::aggregate_fn::EmptyOptions; +use vortex::array::aggregate_fn::NumericalAggregateOpts; +use vortex::array::aggregate_fn::fns::max::Max; +use vortex::array::aggregate_fn::fns::min::Min; +use vortex::array::aggregate_fn::fns::null_count::NullCount; +use vortex::array::aggregate_fn::fns::uncompressed_size_in_bytes::UncompressedSizeInBytes; use vortex::array::stats::StatsSet; use vortex::dtype::DType; use vortex::error::VortexExpect as _; @@ -12,6 +20,20 @@ use vortex::scalar::ScalarValue; use crate::convert::ToDuckDBScalar as _; use crate::duckdb::Value; +const MIN_INDEX: usize = 0; +const MAX_INDEX: usize = 1; +const NULL_COUNT_INDEX: usize = 2; +const BYTE_SIZE_INDEX: usize = 3; + +pub fn column_statistics_aggregate_fns() -> Vec { + vec![ + Min.bind(NumericalAggregateOpts::default()), + Max.bind(NumericalAggregateOpts::default()), + NullCount.bind(EmptyOptions), + UncompressedSizeInBytes.bind(EmptyOptions), + ] +} + 
#[derive(Debug, Default)] pub struct ColumnStatistics { pub min: Option, @@ -52,7 +74,7 @@ impl ColumnStatistics { } } -#[derive(Default)] +#[derive(Clone, Default)] pub struct ColumnStatisticsAggregate { pub min: Option, pub max: Option, @@ -93,4 +115,38 @@ impl ColumnStatisticsAggregate { has_null, } } + + pub fn from_aggregate_stats(stats: &[Precision]) -> Self { + let min = exact_scalar_value(stats.get(MIN_INDEX)); + let max = exact_scalar_value(stats.get(MAX_INDEX)); + let max_string_length = stats + .get(BYTE_SIZE_INDEX) + .and_then(exact_scalar_u64) + .map(|value| u32::try_from(value).unwrap_or(u32::MAX)); + let has_null = stats + .get(NULL_COUNT_INDEX) + .and_then(exact_scalar_u64) + .is_none_or(|count| count > 0); + + Self { + min, + max, + max_string_length, + has_null, + } + } +} + +fn exact_scalar_value(stat: Option<&Precision>) -> Option { + match stat { + Some(Precision::Exact(value)) => value.clone().into_value(), + _ => None, + } +} + +fn exact_scalar_u64(stat: &Precision) -> Option { + match stat { + Precision::Exact(value) => value.as_primitive().typed_value::(), + _ => None, + } } diff --git a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs index 6cc28483571..cecd2271c48 100644 --- a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs +++ b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs @@ -237,6 +237,32 @@ fn test_vortex_scan_integers() { assert_eq!(sum, 138); } +#[test] +fn test_vortex_scan_impl_flip_flop_env() { + let file = RUNTIME.block_on(async { + let numbers = buffer![1i32, 42, 100, -5, 0]; + write_single_column_vortex_file("number", numbers).await + }); + let file_path = file.path().to_string_lossy(); + let query = format!("SELECT SUM(number) FROM '{file_path}' WHERE number >= 0 LIMIT 3"); + + let scan = |scan_impl| { + temp_env::with_var("VORTEX_SCAN_IMPL", Some(scan_impl), || { + let conn = database_connection(); + let result = conn.query(&query).unwrap(); + let mut chunk = 
result.into_iter().next().unwrap(); + let len = chunk.len().as_(); + let vec = chunk.get_vector_mut(0); + i64::from_duckdb_value(&mut unsafe { vec.as_slice_mut::(len) }[0]) + }) + }; + + let v1 = scan("v1"); + let v2 = scan("v2"); + assert_eq!(v1, v2); + assert_eq!(v1, 143); +} + #[test] fn test_vortex_scan_integers_in_list() { let file = RUNTIME.block_on(async { diff --git a/vortex-duckdb/src/lib.rs b/vortex-duckdb/src/lib.rs index 55039b88737..cbd014f8b0e 100644 --- a/vortex-duckdb/src/lib.rs +++ b/vortex-duckdb/src/lib.rs @@ -5,6 +5,7 @@ use std::ffi::c_char; use std::ffi::c_void; +use std::sync::Arc; use std::sync::LazyLock; use std::sync::OnceLock; @@ -14,6 +15,9 @@ use vortex::error::VortexResult; use vortex::io::runtime::BlockingRuntime; use vortex::io::runtime::current::CurrentThreadRuntime; use vortex::io::session::RuntimeSessionExt; +use vortex::scan::ScanScheduler; +use vortex::scan::ScanSchedulerConfig; +use vortex::scan::ScanSchedulerSessionExt; use vortex::session::VortexSession; use crate::duckdb::Database; @@ -41,8 +45,12 @@ mod e2e_test; // A global runtime for Vortex operations within DuckDB. 
static RUNTIME: LazyLock = LazyLock::new(CurrentThreadRuntime::new); +static SCAN_SCHEDULER: LazyLock> = + LazyLock::new(|| Arc::new(ScanScheduler::new(ScanSchedulerConfig::duckdb_default()))); static SESSION: LazyLock = LazyLock::new(|| { - let session = VortexSession::default().with_handle(RUNTIME.handle()); + let session = VortexSession::default() + .with_handle(RUNTIME.handle()) + .with_scan_scheduler(Arc::clone(&SCAN_SCHEDULER)); vortex_geo::initialize(&session); session }); diff --git a/vortex-duckdb/src/multi_file.rs b/vortex-duckdb/src/multi_file.rs index bb9e015af5c..ddaf45cd145 100644 --- a/vortex-duckdb/src/multi_file.rs +++ b/vortex-duckdb/src/multi_file.rs @@ -19,6 +19,8 @@ use vortex::io::filesystem::FileSystemRef; use vortex::io::object_store::ObjectStoreFileSystem; use vortex::io::runtime::BlockingRuntime; use vortex::layout::scan::multi::MultiLayoutDataSource; +use vortex::layout::scan::v2::scan2_enabled; +use vortex::scan::DataSourceRef; use vortex_utils::aliases::hash_map::HashMap; use crate::RUNTIME; @@ -26,6 +28,11 @@ use crate::SESSION; use crate::duckdb::BindInputRef; use crate::duckdb::ExtractedValue; +pub struct BoundMultiFileScan { + pub data_source: DataSourceRef, + pub statistics_source: Option>, +} + /// Parse a glob string into a [`Url`]. /// /// Accepts full URLs (e.g. `s3://bucket/prefix/*.vortex`, `file:///data/*.vortex`) as well as @@ -78,7 +85,7 @@ fn resolve_filesystem(base_url: &Url) -> VortexResult { } /// Shared bind logic for both single-glob and multi-glob variants. 
-pub fn bind_multi_file_scan(input: &BindInputRef) -> VortexResult { +pub fn bind_multi_file_scan(input: &BindInputRef) -> VortexResult { let glob_url_parameter = input .get_parameter(0) .ok_or_else(|| vortex_err!("Missing file glob parameter"))?; @@ -131,7 +138,20 @@ pub fn bind_multi_file_scan(input: &BindInputRef) -> VortexResult::clone(&statistics_source); + Ok(BoundMultiFileScan { + data_source, + statistics_source: Some(statistics_source), + }) + } }) } diff --git a/vortex-duckdb/src/projection.rs b/vortex-duckdb/src/projection.rs index 4521115666c..ba7df81af79 100644 --- a/vortex-duckdb/src/projection.rs +++ b/vortex-duckdb/src/projection.rs @@ -50,6 +50,7 @@ pub struct Projection { pub projection: Expression, pub file_index_column_pos: Option, pub file_row_number_column_pos: Option, + pub is_zero_column: bool, } impl Projection { @@ -106,6 +107,10 @@ impl Projection { real_column_count += 1; } + let is_zero_column = real_column_count == 0 + && file_index_column_pos.is_none() + && file_row_number_column_pos.is_none(); + // Duckdb can request less columns than there are in table i.e. [0, 1] with // 5 columns total. 
is_star &= real_column_count == column_fields.len() as u64; @@ -123,6 +128,7 @@ impl Projection { projection, file_index_column_pos, file_row_number_column_pos, + is_zero_column, }; } @@ -185,6 +191,7 @@ impl Projection { projection, file_index_column_pos, file_row_number_column_pos, + is_zero_column, } } } diff --git a/vortex-duckdb/src/table_function.rs b/vortex-duckdb/src/table_function.rs index 5151e47a464..ebffd004678 100644 --- a/vortex-duckdb/src/table_function.rs +++ b/vortex-duckdb/src/table_function.rs @@ -5,27 +5,34 @@ use std::cmp::max; use std::fmt::Formatter; use std::fmt::{self}; use std::sync::Arc; +use std::sync::OnceLock; use std::sync::atomic::AtomicBool; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use custom_labels::CURRENT_LABELSET; use futures::StreamExt; +use futures::stream; use itertools::Itertools; use num_traits::AsPrimitive; use static_assertions::assert_impl_all; use tracing::debug; use vortex::array::ArrayRef; use vortex::array::Canonical; +use vortex::array::IntoArray; use vortex::array::VortexSessionExecute as _; use vortex::array::arrays::ScalarFn; use vortex::array::arrays::Struct; use vortex::array::arrays::StructArray; use vortex::array::arrays::scalar_fn::ScalarFnArrayExt; use vortex::array::optimizer::ArrayOptimizer; +use vortex::array::validity::Validity; +use vortex::dtype::FieldNames; use vortex::error::VortexExpect; use vortex::error::VortexResult; +use vortex::error::vortex_err; use vortex::expr::Expression; +use vortex::expr::col; use vortex::expr::stats::Precision; use vortex::file::v2::FileStatsLayoutReader; use vortex::io::kanal_ext::KanalExt as _; @@ -37,14 +44,16 @@ use vortex::metrics::tracing::get_global_labels; use vortex::scalar_fn::fns::binary::Binary; use vortex::scalar_fn::fns::operators::Operator; use vortex::scalar_fn::fns::pack::Pack; -use vortex::scan::DataSource; +use vortex::scan::DataSourceRef; use vortex::scan::ScanRequest; +use vortex::scan::selection::Selection; use 
vortex_utils::parallelism::get_available_parallelism; use crate::RUNTIME; use crate::SESSION; use crate::column_statistics::ColumnStatistics; use crate::column_statistics::ColumnStatisticsAggregate; +use crate::column_statistics::column_statistics_aggregate_fns; use crate::convert::try_from_bound_expression; use crate::convert::try_from_projection_expression; use crate::duckdb::BindInputRef; @@ -54,6 +63,7 @@ use crate::duckdb::DuckdbStringMapRef; use crate::duckdb::ExpressionRef; use crate::duckdb::TableInitInput; use crate::duckdb::Value; +use crate::duckdb::duckdb_vector_size; use crate::exporter::ArrayExporter; use crate::exporter::ConversionCache; use crate::multi_file::bind_multi_file_scan; @@ -63,9 +73,11 @@ use crate::projection::Projection; use crate::projection::extract_schema_from_dtype; pub struct TableFunctionBind { - data_source: Arc, + data_source: DataSourceRef, + statistics_source: Option>, filter_exprs: Vec, column_fields: Vec, + column_statistics: Arc>>>, // There exists at least one non-optional table filter or at least one // complex filter is pushed down. has_non_optional_filter: AtomicBool, @@ -76,9 +88,11 @@ impl Clone for TableFunctionBind { fn clone(&self) -> Self { Self { data_source: Arc::clone(&self.data_source), + statistics_source: self.statistics_source.clone(), // filter_exprs are consumed once in `init_global`. 
filter_exprs: vec![], column_fields: self.column_fields.clone(), + column_statistics: Arc::clone(&self.column_statistics), has_non_optional_filter: AtomicBool::new( self.has_non_optional_filter.load(Ordering::Relaxed), ), @@ -146,13 +160,19 @@ pub enum Cardinality { pub fn bind(input: &BindInputRef, result: &mut BindResultRef) -> VortexResult { let data_source = bind_multi_file_scan(input)?; - let column_fields = extract_schema_from_dtype(data_source.dtype())?; + let column_fields = extract_schema_from_dtype(data_source.data_source.dtype())?; for fields in &column_fields { result.add_result_column(&fields.name, &fields.logical_type); } Ok(TableFunctionBind { - data_source: Arc::new(data_source), + data_source: data_source.data_source, + statistics_source: data_source.statistics_source, filter_exprs: vec![], + column_statistics: Arc::new( + (0..column_fields.len()) + .map(|_| OnceLock::new()) + .collect::>(), + ), column_fields, has_non_optional_filter: AtomicBool::new(false), }) @@ -169,6 +189,7 @@ pub fn init_global(init_input: &TableInitInput) -> VortexResult VortexResult VortexResult TableFunctionGlobal { + TableFunctionGlobal { + iterator: zero_column_iterator(row_count), + batch_id: AtomicU64::new(0), + bytes_total: Arc::new(AtomicU64::new(row_count)), + bytes_read: AtomicU64::new(0), + file_index_column_pos: None, + file_row_number_column_pos: None, + } +} + +fn zero_column_iterator(row_count: u64) -> DataSourceIterator { + let vector_size = u64::try_from(duckdb_vector_size()) + .unwrap_or(u64::MAX) + .max(1); + RUNTIME.block_on_stream_thread_safe(move |_handle| { + let cache = Arc::new(ConversionCache::default()); + stream::unfold((row_count, cache), move |(remaining, cache)| async move { + if remaining == 0 { + return None; + } + let batch_len = remaining.min(vector_size); + let item = usize::try_from(batch_len) + .map_err(|_| vortex_err!("zero-column batch length exceeds usize")) + .and_then(zero_column_array) + .map(|array| (array, Arc::clone(&cache))); + 
Some((item, (remaining - batch_len, cache))) + }) + }) +} + +fn zero_column_array(len: usize) -> VortexResult { + Ok( + StructArray::try_new(FieldNames::empty(), Vec::new(), len, Validity::NonNullable)? + .into_array(), + ) +} + pub fn init_local(global: &TableFunctionGlobal) -> TableFunctionLocal { unsafe { use custom_labels::sys; @@ -449,24 +520,48 @@ pub fn pushdown_projection_expression( /// Get column-wise statistics. Available only if we're reading a single file. pub fn statistics(bind_data: &TableFunctionBind, column_index: usize) -> Option { - let children = bind_data.data_source.children(); - // Otherwise we'd have to open all files eagerly which is a performance - // regression. Duckdb's Parquet reader only gets metadata for multiple - // files with a UNION BY NAME and we don't support it (yet) - // See duckdb/common/multi_file/multi_file_function.hpp#L691 - if children.len() != 1 { - return None; - } - let MultiLayoutChild::Opened { reader, .. } = &children[0] else { - return None; - }; - let stats_sets = match reader.as_any().downcast_ref::() { - Some(inner) => inner.file_stats().stats_sets(), - None => return None, - }; - let stats_aggregate = ColumnStatisticsAggregate::new(&stats_sets[column_index]); let dtype = bind_data.column_fields[column_index].dtype.clone(); - Some(ColumnStatistics::from(&stats_aggregate, dtype)) + let stats_aggregate = bind_data + .column_statistics + .get(column_index)? + .get_or_init(|| column_statistics_aggregate(bind_data, column_index)) + .as_ref()?; + Some(ColumnStatistics::from(stats_aggregate, dtype)) +} + +fn column_statistics_aggregate( + bind_data: &TableFunctionBind, + column_index: usize, +) -> Option { + if let Some(statistics_source) = bind_data.statistics_source.as_ref() { + let children = statistics_source.children(); + // Otherwise we'd have to open all files eagerly which is a performance + // regression. 
Duckdb's Parquet reader only gets metadata for multiple + // files with a UNION BY NAME and we don't support it (yet) + // See duckdb/common/multi_file/multi_file_function.hpp#L691 + if children.len() != 1 { + return None; + } + let MultiLayoutChild::Opened { reader, .. } = &children[0] else { + return None; + }; + let stats_sets = match reader.as_any().downcast_ref::() { + Some(inner) => inner.file_stats().stats_sets(), + None => return None, + }; + return Some(ColumnStatisticsAggregate::new(&stats_sets[column_index])); + } + + let name = &bind_data.column_fields[column_index].name; + let funcs = column_statistics_aggregate_fns(); + let stats = RUNTIME + .block_on( + bind_data + .data_source + .statistics(&col(name.as_str()), &funcs), + ) + .ok()?; + Some(ColumnStatisticsAggregate::from_aggregate_stats(&stats)) } /// Duckdb requires post-filter cardinality estimates, otherwise join planner diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index c9a71f85c55..e142e0a2edd 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -12,20 +12,28 @@ use std::sync::OnceLock; use itertools::Itertools; use vortex_array::ArrayRef; +use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; use vortex_array::dtype::FieldMask; use vortex_array::expr::Expression; +use vortex_array::expr::stats::Precision; +use vortex_array::scalar::Scalar; +use vortex_array::stream::SendableArrayStream; use vortex_error::VortexResult; use vortex_layout::LayoutReader; use vortex_layout::scan::layout::LayoutReaderDataSource; use vortex_layout::scan::scan_builder::ScanBuilder; use vortex_layout::scan::split_by::SplitBy; +use vortex_layout::segments::SegmentFutureCache; use vortex_layout::segments::SegmentSource; use vortex_scan::DataSourceRef; +use vortex_scan::ScanRequest; +use vortex_scan::plan::ScanPlanRef; use vortex_session::VortexSession; use crate::FileStatistics; use crate::footer::Footer; +use crate::multi::scan_v2; use 
crate::pruning::can_prune_file_stats; use crate::v2::FileStatsLayoutReader; @@ -44,6 +52,10 @@ pub struct VortexFile { session: VortexSession, /// None id LayoutReader caching is turned off layout_reader_cache: Option>>, + /// Shared cache for the v2 physical scan plan root. + scan_plan_root_cache: Arc>, + /// Shared cache for v2 in-flight segment futures across row-range scans of this file. + scan_plan_segment_future_cache: Arc, } fn layout_reader( @@ -79,6 +91,8 @@ impl VortexFile { segment_source, session, layout_reader_cache: None, + scan_plan_root_cache: Arc::new(OnceLock::new()), + scan_plan_segment_future_cache: Arc::new(SegmentFutureCache::new()), } } @@ -92,6 +106,8 @@ impl VortexFile { segment_source: self.segment_source, session: self.session, layout_reader_cache: Some(OnceLock::new()), + scan_plan_root_cache: self.scan_plan_root_cache, + scan_plan_segment_future_cache: self.scan_plan_segment_future_cache, } } @@ -160,6 +176,24 @@ impl VortexFile { } } + pub(crate) fn scan_plan_root(&self) -> VortexResult { + if let Some(root) = self.scan_plan_root_cache.get() { + return Ok(Arc::clone(root)); + } + + let root = scan_v2::build_file_scan_plan_root(self)?; + if self.scan_plan_root_cache.set(Arc::clone(&root)).is_err() + && let Some(root) = self.scan_plan_root_cache.get() + { + return Ok(Arc::clone(root)); + } + Ok(root) + } + + pub(crate) fn scan_plan_segment_future_cache(&self) -> Arc { + Arc::clone(&self.scan_plan_segment_future_cache) + } + /// Create a [`DataSource`](vortex_scan::DataSource) from this file for scanning. /// /// Wraps the file's layout reader with [`FileStatsLayoutReader`] (when file-level @@ -182,6 +216,39 @@ impl VortexFile { )) } + /// Execute a ScanPlan-backed scan for this file. + pub fn scan_plan_stream(&self, request: ScanRequest) -> VortexResult { + scan_v2::scan_plan_file_stream(self.clone(), request) + } + + /// Return ScanPlan-backed aggregate-function statistics for this file. 
+ pub async fn scan_plan_statistics( + &self, + expr: &Expression, + funcs: &[AggregateFnRef], + ) -> VortexResult>> { + scan_v2::scan_plan_file_statistics(self.clone(), expr, funcs).await + } + + /// Return ScanPlan-backed aggregate-function statistics for several expressions in this file. + pub async fn scan_plan_statistics_many( + &self, + exprs: &[Expression], + funcs: &[AggregateFnRef], + ) -> VortexResult>>> { + scan_v2::scan_plan_file_statistics_many(self.clone(), exprs, funcs).await + } + + /// Return ScanPlan natural row split ranges for this file. + pub fn scan_plan_splits(&self) -> VortexResult>> { + scan_v2::scan_plan_file_splits(self) + } + + /// Plan ScanPlan natural row split ranges for a projected scan of this file. + pub async fn plan_splits(&self, projection: &Expression) -> VortexResult>> { + scan_v2::scan_plan_file_plan_splits(self.clone(), projection).await + } + /// Returns `true` if file-level statistics prove the expression cannot /// match any rows in this file. /// diff --git a/vortex-file/src/footer/mod.rs b/vortex-file/src/footer/mod.rs index c5de627482a..8708a496a32 100644 --- a/vortex-file/src/footer/mod.rs +++ b/vortex-file/src/footer/mod.rs @@ -34,6 +34,8 @@ use vortex_flatbuffers::footer as fb; use vortex_layout::LayoutEncodingId; use vortex_layout::LayoutRef; use vortex_layout::layout_from_flatbuffer_with_options; +use vortex_layout::layout_v2; +use vortex_layout::session::LayoutSessionExt; use vortex_session::VortexSession; use vortex_session::registry::ReadContext; @@ -41,6 +43,7 @@ use vortex_session::registry::ReadContext; #[derive(Debug, Clone)] pub struct Footer { root_layout: LayoutRef, + root_layout2: Option, segments: Arc<[SegmentSpec]>, statistics: Option, // The specific arrays used within the file, in the order they were registered. 
@@ -58,6 +61,7 @@ impl Footer { ) -> Self { Self { root_layout, + root_layout2: None, segments, statistics, array_read_ctx, @@ -100,13 +104,21 @@ impl Footer { let array_read_ctx = ReadContext::new(array_ids); let root_layout = layout_from_flatbuffer_with_options( - layout_bytes, + layout_bytes.clone(), &dtype, &layout_read_ctx, &array_read_ctx, session, session.allows_unknown(), )?; + let root_layout2 = layout_v2::layout_from_flatbuffer( + layout_bytes, + &dtype, + &layout_read_ctx, + &array_read_ctx, + session.layouts().v2_registry(), + session, + )?; let segments: Arc<[SegmentSpec]> = fb_footer .segment_specs() @@ -122,6 +134,7 @@ impl Footer { Ok(Self { root_layout, + root_layout2: Some(root_layout2), segments, statistics, array_read_ctx, @@ -134,6 +147,11 @@ impl Footer { &self.root_layout } + /// Returns the root v2 layout of the file, when available. + pub fn layout2(&self) -> Option<&layout_v2::LayoutRef> { + self.root_layout2.as_ref() + } + /// Returns the segment map of the file. pub fn segment_map(&self) -> &Arc<[SegmentSpec]> { &self.segments diff --git a/vortex-file/src/lib.rs b/vortex-file/src/lib.rs index e9c7741af93..c59e2c644e8 100644 --- a/vortex-file/src/lib.rs +++ b/vortex-file/src/lib.rs @@ -104,6 +104,8 @@ pub mod multi; mod open; mod pruning; mod read; +#[cfg(test)] +mod scan_v1_v2_differential; /// Segment sources, caches, and sinks used by file readers and writers. pub mod segments; mod strategy; diff --git a/vortex-file/src/multi/mod.rs b/vortex-file/src/multi/mod.rs index 215331f0540..fbf12960a87 100644 --- a/vortex-file/src/multi/mod.rs +++ b/vortex-file/src/multi/mod.rs @@ -3,6 +3,7 @@ //! Builder for constructing a [`MultiLayoutDataSource`] from multiple Vortex files. 
+pub(crate) mod scan_v2; mod session; use std::sync::Arc; @@ -17,18 +18,26 @@ use tracing::debug; use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_io::InstrumentedReadAt; +use vortex_io::VortexReadAt; use vortex_io::filesystem::FileListing; use vortex_io::filesystem::FileSystemRef; use vortex_layout::LayoutReaderRef; use vortex_layout::scan::multi::LayoutReaderFactory; use vortex_layout::scan::multi::MultiLayoutDataSource; +use vortex_layout::scan::v2::scan2_enabled; +use vortex_metrics::Label; +use vortex_metrics::MetricsRegistry; use vortex_scan::DataSource; +use vortex_scan::DataSourceRef; use vortex_session::VortexSession; use crate::OpenOptionsSessionExt; use crate::VortexFile; use crate::VortexOpenOptions; +const PATH_LABEL: &str = "file_path"; + /// A builder that discovers multiple Vortex files from glob patterns and constructs a /// [`MultiLayoutDataSource`] to scan them as a single data source. /// @@ -65,6 +74,7 @@ pub struct MultiFileDataSource { /// When the filesystem is None, a local filesystem will be created in build(). glob_sources: Vec<(String, Option)>, open_options_fn: Arc VortexOpenOptions + Send + Sync>, + metrics_registry: Option>, } /// In-flight glob resolutions in [`MultiFileDataSource::build`]. Callers like the JNI data @@ -79,6 +89,7 @@ impl MultiFileDataSource { session, glob_sources: Vec::new(), open_options_fn: Arc::new(|opts| opts), + metrics_registry: None, } } @@ -114,6 +125,16 @@ impl MultiFileDataSource { self } + /// Configure a shared metrics registry for all files opened by this data source. + /// + /// This instruments both the underlying [`VortexReadAt`] and the Vortex segment source so + /// callers can inspect read sizes, read durations, segment request coalescing, and segment + /// cache behavior for scans that use this data source. 
+ pub fn with_metrics_registry(mut self, metrics_registry: Arc) -> Self { + self.metrics_registry = Some(metrics_registry); + self + } + /// Build the [`DataSource`]. /// /// Discovers files via glob, opens the first file eagerly to determine the schema, @@ -169,7 +190,14 @@ impl MultiFileDataSource { // Open first file eagerly for dtype. let (first_file_listing, first_fs) = &all_files[0]; let open_fn = self.open_options_fn.as_ref(); - let first_file = open_file(first_fs, first_file_listing, &self.session, open_fn).await?; + let first_file = open_file( + first_fs, + first_file_listing, + &self.session, + self.metrics_registry.as_ref(), + open_fn, + ) + .await?; let first_reader = first_file.layout_reader()?; let byte_sizes: Vec> = all_files.iter().map(|(file, _)| file.size).collect(); @@ -182,6 +210,7 @@ impl MultiFileDataSource { file: file.clone(), session: self.session.clone(), open_options_fn: Arc::clone(&self.open_options_fn), + metrics_registry: self.metrics_registry.clone(), }) as Arc }) .collect(); @@ -197,6 +226,18 @@ impl MultiFileDataSource { Ok(inner) } + + /// Build the [`DataSource`] selected by `VORTEX_SCAN_IMPL`. + /// + /// The default is the ScanPlan-backed V2 scan. Setting + /// `VORTEX_SCAN_IMPL=v1` (or `legacy`) falls back to the existing LayoutReader-backed scan. + pub async fn build_data_source(self) -> VortexResult { + if scan2_enabled()? { + Ok(Arc::new(scan_v2::build_scan_plan_data_source(self).await?)) + } else { + Ok(Arc::new(self.build().await?)) + } + } } /// Creates a local filesystem backed by `object_store::local::LocalFileSystem`. 
@@ -226,6 +267,7 @@ async fn open_file( fs: &FileSystemRef, file: &FileListing, session: &VortexSession, + metrics_registry: Option<&Arc>, open_options_fn: &(dyn Fn(VortexOpenOptions) -> VortexOpenOptions + Send + Sync), ) -> VortexResult { tracing::trace!(path = %file.path, "opening vortex file"); @@ -234,6 +276,16 @@ async fn open_file( // The URI includes the full path (with any filesystem prefix), making it unique // even when different PrefixFileSystem instances strip paths to the same relative name. let source = fs.open_read(&file.path).await?; + let labels = vec![Label::new(PATH_LABEL, file.path.clone())]; + let source = if let Some(metrics_registry) = metrics_registry { + Arc::new(InstrumentedReadAt::new_with_labels( + source, + metrics_registry.as_ref(), + labels.clone(), + )) as Arc + } else { + source + }; let cache_key = source .uri() .map(|u| u.to_string()) @@ -243,6 +295,11 @@ async fn open_file( // so we scope the cache lookup in a block. let options = { let mut options = open_options_fn(session.open_options()); + if let Some(metrics_registry) = metrics_registry { + options = options + .with_metrics_registry(Arc::clone(metrics_registry)) + .with_labels(labels); + } if let Some(size) = file.size { options = options.with_file_size(size); } @@ -267,6 +324,7 @@ struct VortexFileReaderFactory { file: FileListing, session: VortexSession, open_options_fn: Arc VortexOpenOptions + Send + Sync>, + metrics_registry: Option>, } #[async_trait] @@ -276,6 +334,7 @@ impl LayoutReaderFactory for VortexFileReaderFactory { &self.fs, &self.file, &self.session, + self.metrics_registry.as_ref(), self.open_options_fn.as_ref(), ) .await?; diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs new file mode 100644 index 00000000000..0ce65cb212b --- /dev/null +++ b/vortex-file/src/multi/scan_v2.rs @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
File adapters for ScanPlan-backed scans. + +use std::fmt; +use std::ops::Range; +use std::sync::Arc; + +use async_trait::async_trait; +use futures::TryStreamExt; +use futures::future::BoxFuture; +use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldName; +use vortex_array::dtype::FieldPath; +use vortex_array::dtype::StructFields; +use vortex_array::expr::Expression; +use vortex_array::expr::stats::Precision; +use vortex_array::expr::stats::Stat; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::get_item::GetItem; +use vortex_array::scalar_fn::fns::root::Root; +use vortex_array::stats::StatsSet; +use vortex_array::stream::SendableArrayStream; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_io::filesystem::FileListing; +use vortex_io::filesystem::FileSystemRef; +use vortex_layout::layout_v2::LayoutScanPlanCtx; +use vortex_layout::scan::v2::with_row_idx; +use vortex_metrics::MetricsRegistry; +use vortex_scan::ScanRequest as DataSourceScanRequest; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedAggregateRef; +use vortex_scan::plan::PreparedEvidenceRef; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStats; +use vortex_scan::plan::PreparedStatsRef; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanDataSource; +use vortex_scan::plan::ScanPlanFactory; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::downcast_state; +use vortex_scan::plan::request::ScanRequest; +use vortex_scan::plan::scan_plan_projected_splits; +use vortex_scan::plan::scan_plan_split_ranges; +use vortex_scan::plan::scan_plan_statistics; +use vortex_scan::plan::scan_plan_statistics_many; +use 
vortex_scan::plan::scan_plan_stream; +use vortex_session::VortexSession; + +use super::MultiFileDataSource; +use super::create_local_filesystem; +use super::open_file; +use crate::FileStatistics; +use crate::VortexFile; +use crate::VortexOpenOptions; + +struct FileStatsScanPlan { + data: ScanPlanRef, + stats: Arc, + fields: StructFields, + row_count: u64, +} + +struct FileStatsExprScanPlan { + data: ScanPlanRef, + stats: Arc, + field_idx: usize, + field_dtype: DType, + row_count: u64, +} + +struct FilePreparedStats { + stats: StatsSet, + field_dtype: DType, + row_count: u64, + funcs: Vec, +} + +impl FileStatsScanPlan { + fn try_new( + data: ScanPlanRef, + stats: Arc, + dtype: &DType, + row_count: u64, + ) -> Option { + let fields = dtype.as_struct_fields_opt()?.clone(); + Some(Self { + data, + stats, + fields, + row_count, + }) + } + + fn pushed_field(&self, expr: &Expression) -> Option<(usize, FieldName, DType)> { + let name = root_field(expr)?; + let field_idx = self.fields.find(name)?; + let field_dtype = self.fields.field_by_index(field_idx)?; + Some((field_idx, name.clone(), field_dtype)) + } +} + +impl ScanPlan for FileStatsScanPlan { + fn dtype(&self) -> &DType { + self.data.dtype() + } + + fn row_count(&self) -> u64 { + self.row_count + } + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + cx.init_plan(&self.data) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + cx: &mut PushCtx, + ) -> VortexResult> { + let Some(data) = Arc::clone(&self.data).try_push_expr(expr, cx)? 
else { + return Ok(None); + }; + let Some((field_idx, _name, field_dtype)) = self.pushed_field(expr) else { + return Ok(Some(data)); + }; + Ok(Some(Arc::new(FileStatsExprScanPlan { + data, + stats: Arc::clone(&self.stats), + field_idx, + field_dtype, + row_count: self.row_count, + }))) + } + + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + Arc::clone(&self.data).prepare_read(cx) + } + + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + Arc::clone(&self.data).prepare_evidence(cx) + } + + fn prepare_field_stats( + self: Arc, + field_path: &FieldPath, + funcs: &[AggregateFnRef], + cx: &mut PrepareCtx, + ) -> VortexResult> { + if field_path.parts().len() != 1 { + return Arc::clone(&self.data).prepare_field_stats(field_path, funcs, cx); + } + let Some(name) = field_path.parts()[0].as_name() else { + return Arc::clone(&self.data).prepare_field_stats(field_path, funcs, cx); + }; + let Some(field_idx) = self.fields.find(name) else { + return Ok(None); + }; + let Some(field_dtype) = self.fields.field_by_index(field_idx) else { + return Ok(None); + }; + let stats = self.stats.stats_sets()[field_idx].clone(); + Ok(Some(Arc::new(FilePreparedStats { + stats, + field_dtype, + row_count: self.row_count, + funcs: funcs.to_vec(), + }))) + } + + fn prepare_aggregate_partial( + self: Arc, + funcs: &[AggregateFnRef], + cx: &mut PrepareCtx, + ) -> VortexResult> { + Arc::clone(&self.data).prepare_aggregate_partial(funcs, cx) + } + + fn split_hints(&self) -> Option<&[u64]> { + self.data.split_hints() + } + + fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { + let state = downcast_state::(state)?; + self.data.release(frontier, state.as_ref()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "file_stats:")?; + self.data.fmt_chain(f) + } +} + +impl ScanPlan for FileStatsExprScanPlan { + fn dtype(&self) -> &DType { + &self.field_dtype + } + + fn row_count(&self) -> u64 { + 
self.row_count + } + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + cx.init_plan(&self.data) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + cx: &mut PushCtx, + ) -> VortexResult> { + Arc::clone(&self.data).try_push_expr(expr, cx) + } + + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + Arc::clone(&self.data).prepare_read(cx) + } + + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + Arc::clone(&self.data).prepare_evidence(cx) + } + + fn prepare_aggregate_partial( + self: Arc, + funcs: &[AggregateFnRef], + cx: &mut PrepareCtx, + ) -> VortexResult> { + Arc::clone(&self.data).prepare_aggregate_partial(funcs, cx) + } + + fn prepare_field_stats( + self: Arc, + field_path: &FieldPath, + funcs: &[AggregateFnRef], + cx: &mut PrepareCtx, + ) -> VortexResult> { + if !field_path.is_root() { + return Arc::clone(&self.data).prepare_field_stats(field_path, funcs, cx); + } + let stats = self.stats.stats_sets()[self.field_idx].clone(); + Ok(Some(Arc::new(FilePreparedStats { + stats, + field_dtype: self.field_dtype.clone(), + row_count: self.row_count, + funcs: funcs.to_vec(), + }))) + } + + fn split_hints(&self) -> Option<&[u64]> { + self.data.split_hints() + } + + fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { + let state = downcast_state::(state)?; + self.data.release(frontier, state.as_ref()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "file_stats_expr:")?; + self.data.fmt_chain(f) + } +} + +impl PreparedStats for FilePreparedStats { + fn init_state(&self, _ctx: &VortexSession) -> VortexResult { + Ok(Arc::new(())) + } + + fn stats<'a>( + &'a self, + range: Range, + _io: &'a ReadContext, + _state: &'a ScanState, + ) -> BoxFuture<'a, VortexResult>>> { + Box::pin(async move { + if range != (0..self.row_count) { + return Ok(absent_statistics(&self.funcs)); + } + self.funcs + .iter() + .map(|func| self.stat_for_func(func)) + 
.collect() + }) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "file_stats") + } +} + +impl FilePreparedStats { + fn stat_for_func(&self, func: &AggregateFnRef) -> VortexResult> { + let Some(stat) = Stat::from_aggregate_fn(func) else { + return Ok(Precision::Absent); + }; + let Some(dtype) = func.return_dtype(&self.field_dtype) else { + return Ok(Precision::Absent); + }; + self.stats + .get(stat) + .map(|value| Scalar::try_new(dtype, Some(value))) + .transpose() + } +} + +fn root_field(expr: &Expression) -> Option<&FieldName> { + let name = expr.as_opt::()?; + expr.child(0).is::().then_some(name) +} + +fn absent_statistics(funcs: &[AggregateFnRef]) -> Vec> { + funcs.iter().map(|_| Precision::Absent).collect() +} + +/// Build a scan2 [`DataSource`](vortex_scan::DataSource) from a multi-file builder. +pub(crate) async fn build_scan_plan_data_source( + builder: MultiFileDataSource, +) -> VortexResult { + if builder.glob_sources.is_empty() { + vortex_bail!("MultiFileDataSource requires at least one glob pattern"); + } + + let local_fs: Option = builder + .glob_sources + .iter() + .any(|(_, fs)| fs.is_none()) + .then(|| create_local_filesystem(&builder.session)) + .transpose()?; + + let mut all_files: Vec<(FileListing, FileSystemRef)> = Vec::new(); + for (glob, maybe_fs) in &builder.glob_sources { + let fs = maybe_fs + .as_ref() + .or(local_fs.as_ref()) + .map(Arc::clone) + .unwrap_or_else(|| unreachable!("local_fs is set when any glob lacks a filesystem")); + let files: Vec = fs.glob(glob)?.try_collect().await?; + for file in files { + all_files.push((file, Arc::clone(&fs))); + } + } + + if all_files.is_empty() { + let globs: Vec<_> = builder + .glob_sources + .iter() + .map(|(glob, _)| glob.as_str()) + .collect(); + vortex_bail!("No files matched the glob pattern(s): {:?}", globs); + } + + let (first_file_listing, first_fs) = &all_files[0]; + let first_file = open_file( + first_fs, + first_file_listing, + &builder.session, + 
builder.metrics_registry.as_ref(), + builder.open_options_fn.as_ref(), + ) + .await?; + let first_root = first_file.scan_plan_root()?; + + let factories: Vec> = all_files[1..] + .iter() + .map(|(file, fs)| { + Arc::new(ScanPlanFileFactory { + fs: Arc::clone(fs), + file: file.clone(), + session: builder.session.clone(), + open_options_fn: Arc::clone(&builder.open_options_fn), + metrics_registry: builder.metrics_registry.clone(), + }) as Arc + }) + .collect(); + + Ok(ScanPlanDataSource::new_with_first( + first_root, + factories, + &builder.session, + )) +} + +struct ScanPlanFileFactory { + fs: FileSystemRef, + file: FileListing, + session: VortexSession, + open_options_fn: Arc VortexOpenOptions + Send + Sync>, + metrics_registry: Option>, +} + +#[async_trait] +impl ScanPlanFactory for ScanPlanFileFactory { + async fn open(&self) -> VortexResult> { + let file = open_file( + &self.fs, + &self.file, + &self.session, + self.metrics_registry.as_ref(), + self.open_options_fn.as_ref(), + ) + .await?; + Ok(Some(file.scan_plan_root()?)) + } +} + +pub(crate) fn scan_plan_file_stream( + file: VortexFile, + request: DataSourceScanRequest, +) -> VortexResult { + let root = file.scan_plan_root()?; + scan_plan_stream(root, file.session().clone(), request) +} + +pub(crate) async fn scan_plan_file_statistics( + file: VortexFile, + expr: &Expression, + funcs: &[AggregateFnRef], +) -> VortexResult>> { + let root = file.scan_plan_root()?; + scan_plan_statistics(root, file.session().clone(), expr, funcs).await +} + +pub(crate) async fn scan_plan_file_statistics_many( + file: VortexFile, + exprs: &[Expression], + funcs: &[AggregateFnRef], +) -> VortexResult>>> { + let root = file.scan_plan_root()?; + scan_plan_statistics_many(root, file.session().clone(), exprs, funcs).await +} + +pub(crate) fn scan_plan_file_splits(file: &VortexFile) -> VortexResult>> { + let root = file.scan_plan_root()?; + Ok(scan_plan_split_ranges(&root)) +} + +pub(crate) async fn scan_plan_file_plan_splits( + file: 
VortexFile, + projection: &Expression, +) -> VortexResult>> { + let root = file.scan_plan_root()?; + scan_plan_projected_splits(root, file.session().clone(), projection).await +} + +pub(crate) fn build_file_scan_plan_root(file: &VortexFile) -> VortexResult { + let mut plan_request = ScanRequest::empty(); + let layout = file + .footer() + .layout2() + .ok_or_else(|| vortex_err!("scan2 requires a v2 footer layout"))?; + let ctx = LayoutScanPlanCtx::new( + file.session().clone(), + file.segment_source(), + file.scan_plan_segment_future_cache(), + ); + let root = layout.new_scan_plan(&mut plan_request, &ctx)?; + let root = with_row_idx(root, 0); + Ok(match file.footer().statistics().cloned() { + Some(stats) => FileStatsScanPlan::try_new( + Arc::clone(&root), + Arc::new(stats), + file.dtype(), + file.row_count(), + ) + .map(|node| Arc::new(node) as ScanPlanRef) + .unwrap_or(root), + None => root, + }) +} diff --git a/vortex-file/src/scan_v1_v2_differential.rs b/vortex-file/src/scan_v1_v2_differential.rs new file mode 100644 index 00000000000..27b3b9c61c2 --- /dev/null +++ b/vortex-file/src/scan_v1_v2_differential.rs @@ -0,0 +1,637 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Differential tests that scan the same [`ScanRequest`] through both the V1 +//! (LayoutReader-based) and V2 (ScanPlan-based) scan paths and assert the +//! outputs are identical. +//! +//! V1 is driven through [`VortexFile::scan`] + +//! [`ScanBuilder::into_array_stream`]; V2 is driven directly through +//! [`VortexFile::scan_plan_stream`]. Neither side flips the process-global +//! `VORTEX_SCAN_IMPL` env var, so the two implementations run side by side in +//! the same test process. + +// Nested struct fixtures use short field names (a, b, c, s) that mirror the v1 +// regression tests; single-char names are clearest here. 
+#![allow(clippy::many_single_char_names)] + +use std::collections::BTreeMap; +use std::sync::Arc; +use std::sync::LazyLock; + +use async_trait::async_trait; +use futures::StreamExt; +use futures::TryStreamExt; +use futures::stream; +use futures::stream::BoxStream; +use rstest::rstest; +use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ChunkedArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::assert_arrays_eq; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::expr::Expression; +use vortex_array::expr::get_item; +use vortex_array::expr::gt; +use vortex_array::expr::lit; +use vortex_array::expr::merge; +use vortex_array::expr::pack; +use vortex_array::expr::root; +use vortex_array::expr::select; +use vortex_array::stats::PRUNING_STATS; +use vortex_array::stream::ArrayStreamAdapter; +use vortex_array::stream::ArrayStreamExt; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; +use vortex_buffer::ByteBufferMut; +use vortex_buffer::buffer; +use vortex_error::VortexResult; +use vortex_io::VortexReadAt; +use vortex_io::filesystem::FileListing; +use vortex_io::filesystem::FileSystem; +use vortex_io::filesystem::FileSystemRef; +use vortex_layout::layouts::row_idx::row_idx; +use vortex_scan::DataSourceRef; +use vortex_scan::ScanRequest; +use vortex_scan::selection::Selection; +use vortex_session::VortexSession; + +use crate::OpenOptionsSessionExt; +use crate::VortexFile; +use crate::WriteOptionsSessionExt; +use crate::multi::MultiFileDataSource; +use crate::multi::scan_v2::build_scan_plan_data_source; + +static SESSION: LazyLock = LazyLock::new(crate::tests::new_test_session); + +/// Write `array` to 
an in-memory Vortex file, optionally with file statistics +/// (which exercises the V2 `FileStatsScanPlan` path and V1 `FileStatsLayoutReader`). +async fn write_file(array: ArrayRef, with_stats: bool) -> VortexResult { + let mut buf = ByteBufferMut::empty(); + if with_stats { + let mut writer = SESSION + .write_options() + .with_file_statistics(PRUNING_STATS.to_vec()) + .writer(&mut buf, array.dtype().clone()); + writer.push(array).await?; + writer.finish().await?; + } else { + SESSION + .write_options() + .write(&mut buf, array.to_array_stream()) + .await?; + } + SESSION.open_options().open_buffer(buf.freeze()) +} + +/// Scan `file` through the V1 LayoutReader path. +async fn scan_v1(file: &VortexFile, request: &ScanRequest) -> VortexResult { + let mut builder = file + .scan()? + .with_projection(request.projection.clone()) + .with_selection(request.selection.clone()) + .with_some_limit(request.limit) + .with_ordered(request.ordered); + if let Some(filter) = &request.filter { + builder = builder.with_filter(filter.clone()); + } + if let Some(row_range) = request.row_range.clone() { + builder = builder.with_row_range(row_range); + } + builder.into_array_stream()?.read_all().await +} + +/// Scan `file` through the V2 ScanPlan path. +async fn scan_v2(file: &VortexFile, request: &ScanRequest) -> VortexResult { + file.scan_plan_stream(request.clone())?.read_all().await +} + +/// Scan the same request through both paths and assert the outputs are equal. +async fn assert_v1_eq_v2(file: &VortexFile, request: ScanRequest) -> VortexResult<()> { + let v1 = scan_v1(file, &request).await?; + let v2 = scan_v2(file, &request).await?; + assert_eq!( + v1.dtype(), + v2.dtype(), + "V1/V2 dtype mismatch for projection {} filter {:?}", + request.projection, + request.filter + ); + let mut ctx = SESSION.create_execution_ctx(); + assert_arrays_eq!(v1, v2, &mut ctx); + Ok(()) +} + +/// Build an ordered V2 scan request from a projection and optional filter. 
+fn request(projection: Expression, filter: Option) -> ScanRequest { + ScanRequest { + projection, + filter, + ordered: true, + ..Default::default() + } +} + +async fn write_part(array: ArrayRef) -> VortexResult { + let mut buf = ByteBufferMut::empty(); + SESSION + .write_options() + .write(&mut buf, array.to_array_stream()) + .await?; + Ok(buf.freeze()) +} + +#[derive(Debug)] +struct MemoryFileSystem { + files: BTreeMap, +} + +#[async_trait] +impl FileSystem for MemoryFileSystem { + fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { + let listings = self + .files + .iter() + .filter_map(move |(path, bytes)| { + path.starts_with(prefix).then_some(Ok(FileListing { + path: path.clone(), + size: Some(bytes.len() as u64), + })) + }) + .collect::>(); + stream::iter(listings).boxed() + } + + async fn head(&self, path: &str) -> VortexResult> { + Ok(self.files.get(path).map(|bytes| FileListing { + path: path.to_string(), + size: Some(bytes.len() as u64), + })) + } + + async fn open_read(&self, path: &str) -> VortexResult> { + self.files + .get(path) + .cloned() + .map(|bytes| Arc::new(bytes) as Arc) + .ok_or_else(|| vortex_error::vortex_err!("missing test file {path}")) + } + + async fn delete(&self, _path: &str) -> VortexResult<()> { + Ok(()) + } +} + +async fn scan_data_source(source: DataSourceRef, request: ScanRequest) -> VortexResult { + let scan = source.scan(request).await?; + let dtype = scan.dtype().clone(); + let stream = scan + .partitions() + .then(|partition| async move { partition?.execute() }) + .try_flatten() + .boxed(); + ArrayStreamAdapter::new(dtype, stream).read_all().await +} + +fn sorted_i32_values(array: ArrayRef) -> VortexResult> { + let mut ctx = SESSION.create_execution_ctx(); + let primitive = array.execute::(&mut ctx)?; + let mut values = primitive + .with_iterator(|iter| iter.map(|value| value.copied()).collect::>>()) + .ok_or_else(|| { + vortex_error::vortex_err!("unordered differential values must be non-null") + })?; + 
values.sort_unstable(); + Ok(values) +} + +// ---- Fixtures ---- + +/// Flat primitive column, both nullable and non-nullable variants. +fn flat_primitive(nullable: bool) -> ArrayRef { + let numbers = if nullable { + PrimitiveArray::from_option_iter([Some(1i32), None, Some(3), Some(4), None, Some(6)]) + .into_array() + } else { + buffer![1i32, 2, 3, 4, 5, 6].into_array() + }; + StructArray::from_fields(&[("numbers", numbers)]) + .unwrap() + .into_array() +} + +/// A chunked primitive column. +fn chunked() -> ArrayRef { + let numbers = ChunkedArray::from_iter([ + buffer![1i32, 2, 3, 4].into_array(), + buffer![5i32, 6, 7, 8].into_array(), + buffer![9i32, 10, 11, 12].into_array(), + ]) + .into_array(); + StructArray::from_fields(&[("numbers", numbers)]) + .unwrap() + .into_array() +} + +/// A low-cardinality string column that the writer dictionary-encodes. +fn dict_encoded() -> ArrayRef { + let n = 4096usize; + let values: Vec<&str> = (0..n).map(|i| ["alpha", "beta", "gamma"][i % 3]).collect(); + let strings = VarBinViewArray::from_iter_str(values).into_array(); + StructArray::from_fields(&[("letters", strings)]) + .unwrap() + .into_array() +} + +/// A wide-range numeric column over many rows so the writer emits zone stats. +fn zoned() -> ArrayRef { + let n = 100_000i32; + let numbers = PrimitiveArray::from_iter(0..n).into_array(); + StructArray::from_fields(&[("numbers", numbers)]) + .unwrap() + .into_array() +} + +/// A `keep` flag column plus a `name` string column, for multi-conjunct filter tests: +/// `id != 0` is a cheap, selective predicate; `name LIKE '%match%'` is the expensive +/// residual that should run filter-first once `id` has narrowed the demanded rows. 
+fn id_and_name(keep: &[u32], names: &[&str]) -> ArrayRef { + StructArray::from_fields(&[ + ( + "id", + PrimitiveArray::from_iter(keep.iter().copied()).into_array(), + ), + ( + "name", + VarBinViewArray::from_iter_str(names.iter().copied()).into_array(), + ), + ]) + .unwrap() + .into_array() +} + +/// 16 names where most rows contain the `match` needle (decoys), so a residual `LIKE` +/// that ignored the cheaper predicate would diverge from V1. +const MULTI_CONJUNCT_NAMES: [&str; 16] = [ + "row0_match", + "row1_match", + "no_hit_here", + "row3_match", + "row4_match", + "row5_match", + "row6_match", + "row7_match", + "row8_match", + "has_match_inside", + "row10_match", + "row11_match", + "row12_match", + "row13_match", + "row14_match", + "row15_match", +]; + +fn multi_conjunct_filter() -> Expression { + vortex_array::expr::and( + vortex_array::expr::not_eq(get_item("id", root()), lit(0u32)), + vortex_array::expr::like(get_item("name", root()), lit("%match%")), + ) +} + +/// Outer struct is non-nullable (so the file writes), but it contains a nullable +/// nested struct `a` with a non-nullable field `b.c`. Projecting `a.b.c` (or +/// selecting `c` out of `a.b`) must preserve the nulls of the nullable `a.b` +/// struct. 
+/// +/// | a.b | +/// |-------------------| +/// | `{ "c": 4 }` | +/// | `NULL` | +/// | `{ "c": 6 }` | +/// | `NULL` | +/// | `{ "c": 10 }` | +fn nested_nullable_struct() -> ArrayRef { + let c = buffer![4i32, 5, 6, 8, 10].into_array(); + let b = StructArray::try_from_iter_with_validity([("c", c)], Validity::NonNullable) + .unwrap() + .into_array(); + let a = StructArray::try_from_iter_with_validity( + [("b", b)], + Validity::Array(BoolArray::from_iter([true, false, true, false, true]).into_array()), + ) + .unwrap() + .into_array(); + StructArray::try_from_iter_with_validity([("a", a)], Validity::NonNullable) + .unwrap() + .into_array() +} + +// ---- Differential cases ---- + +#[rstest] +#[case::flat_primitive_nonnull(flat_primitive(false))] +#[case::flat_primitive_nullable(flat_primitive(true))] +#[case::chunked(chunked())] +#[case::dict_encoded(dict_encoded())] +#[tokio::test] +async fn differential_full_scan(#[case] array: ArrayRef) -> VortexResult<()> { + let file = write_file(array, false).await?; + assert_v1_eq_v2(&file, request(root(), None)).await +} + +#[rstest] +#[case::flat_primitive_nonnull(flat_primitive(false))] +#[case::flat_primitive_nullable(flat_primitive(true))] +#[case::chunked(chunked())] +#[tokio::test] +async fn differential_project_numbers(#[case] array: ArrayRef) -> VortexResult<()> { + let file = write_file(array, false).await?; + assert_v1_eq_v2(&file, request(select(["numbers"], root()), None)).await +} + +#[rstest] +#[case::flat_primitive_nonnull(flat_primitive(false))] +#[case::flat_primitive_nullable(flat_primitive(true))] +#[case::chunked(chunked())] +#[tokio::test] +async fn differential_filter_numbers(#[case] array: ArrayRef) -> VortexResult<()> { + let file = write_file(array, false).await?; + let filter = gt(get_item("numbers", root()), lit(3i32)); + assert_v1_eq_v2(&file, request(root(), Some(filter))).await +} + +#[tokio::test] +async fn differential_row_range() -> VortexResult<()> { + let file = write_file(chunked(), 
false).await?; + let scan_request = ScanRequest { + row_range: Some(2..8), + ..request(root(), None) + }; + assert_v1_eq_v2(&file, scan_request).await +} + +#[tokio::test] +async fn differential_include_selection() -> VortexResult<()> { + let file = write_file(chunked(), false).await?; + let scan_request = ScanRequest { + selection: Selection::IncludeByIndex(Buffer::from_iter([0, 2, 5, 9])), + ..request(root(), None) + }; + assert_v1_eq_v2(&file, scan_request).await +} + +#[tokio::test] +async fn differential_exclude_selection() -> VortexResult<()> { + let file = write_file(chunked(), false).await?; + let scan_request = ScanRequest { + selection: Selection::ExcludeByIndex(Buffer::from_iter([1, 4, 7])), + ..request(root(), None) + }; + assert_v1_eq_v2(&file, scan_request).await +} + +#[tokio::test] +async fn differential_limit() -> VortexResult<()> { + let file = write_file(chunked(), false).await?; + let scan_request = ScanRequest { + limit: Some(5), + ..request(root(), None) + }; + assert_v1_eq_v2(&file, scan_request).await +} + +#[tokio::test] +async fn differential_unordered_multi_file_partition_selection() -> VortexResult<()> { + let request = ScanRequest { + projection: get_item("numbers", root()), + row_range: Some(1..4), + selection: Selection::ExcludeByIndex(Buffer::from_iter([2])), + partition_selection: Selection::IncludeByIndex(Buffer::from_iter([0, 2])), + ordered: false, + ..Default::default() + }; + + let parts = [ + ("part-0.vortex", buffer![0i32, 1, 2, 3, 4].into_array()), + ("part-1.vortex", buffer![10i32, 11, 12, 13, 14].into_array()), + ("part-2.vortex", buffer![20i32, 21, 22, 23, 24].into_array()), + ]; + let files = BTreeMap::from_iter( + futures::future::try_join_all(parts.into_iter().map(|(path, numbers)| async move { + let array = StructArray::from_fields(&[("numbers", numbers)])?.into_array(); + Ok::<_, vortex_error::VortexError>((path.to_string(), write_part(array).await?)) + })) + .await?, + ); + let fs: FileSystemRef = 
Arc::new(MemoryFileSystem { files }); + + let v1_source: DataSourceRef = Arc::new( + MultiFileDataSource::new(SESSION.clone()) + .with_glob("part-*.vortex", Some(Arc::clone(&fs))) + .build() + .await?, + ); + let v1 = scan_data_source(v1_source, request.clone()).await?; + + let v2_source: DataSourceRef = Arc::new( + build_scan_plan_data_source( + MultiFileDataSource::new(SESSION.clone()).with_glob("part-*.vortex", Some(fs)), + ) + .await?, + ); + let v2 = scan_data_source(v2_source, request).await?; + + assert_eq!(sorted_i32_values(v1)?, vec![1, 3, 21, 23]); + assert_eq!(sorted_i32_values(v2)?, vec![1, 3, 21, 23]); + Ok(()) +} + +#[tokio::test] +async fn differential_dict_filter() -> VortexResult<()> { + let file = write_file(dict_encoded(), false).await?; + let filter = vortex_array::expr::eq(get_item("letters", root()), lit("beta")); + assert_v1_eq_v2(&file, request(root(), Some(filter))).await +} + +#[tokio::test] +async fn differential_zoned_full() -> VortexResult<()> { + let file = write_file(zoned(), true).await?; + assert_v1_eq_v2(&file, request(root(), None)).await +} + +#[tokio::test] +async fn differential_zoned_filter() -> VortexResult<()> { + let file = write_file(zoned(), true).await?; + // Filter that zone stats can partially prune. + let filter = gt(get_item("numbers", root()), lit(99_990i32)); + assert_v1_eq_v2(&file, request(root(), Some(filter))).await +} + +/// Low-density multi-conjunct filter: `id != 0` keeps 2/16 rows (density 0.125 < 0.2), +/// so the expensive `name LIKE '%match%'` runs filter-first over only the demanded rows +/// and its compacted verdict is scattered back. Asserted against the V1 reference, which +/// catches any off-by-rank error in the scatter-back. 
+#[tokio::test] +async fn differential_multi_conjunct_filter_first() -> VortexResult<()> { + let mut keep = [0u32; 16]; + keep[2] = 1; + keep[9] = 1; + let file = write_file(id_and_name(&keep, &MULTI_CONJUNCT_NAMES), false).await?; + assert_v1_eq_v2(&file, request(root(), Some(multi_conjunct_filter()))).await +} + +/// High-density multi-conjunct filter: `id != 0` keeps 14/16 rows (density 0.875 > 0.2), +/// so the residual takes the dense path. Must still match V1. +#[tokio::test] +async fn differential_multi_conjunct_dense() -> VortexResult<()> { + let mut keep = [1u32; 16]; + keep[2] = 0; + keep[9] = 0; + let file = write_file(id_and_name(&keep, &MULTI_CONJUNCT_NAMES), false).await?; + assert_v1_eq_v2(&file, request(root(), Some(multi_conjunct_filter()))).await +} + +#[tokio::test] +async fn differential_single_field_merge_select_projection() -> VortexResult<()> { + let file = write_file(flat_primitive(false), true).await?; + let projection = merge([ + pack([("file_row_number", row_idx())], Nullability::NonNullable), + select(["numbers"], root()), + ]); + assert_v1_eq_v2(&file, request(projection, None)).await +} + +/// Reproduces the struct-null bug: projecting a single deep field out of a +/// nullable nested struct must apply the parent struct's validity. The V2 +/// single-field fast path previously bypassed `self.validity`. +#[tokio::test] +async fn differential_nested_nullable_struct_get_item() -> VortexResult<()> { + let file = write_file(nested_nullable_struct(), false).await?; + // SELECT a.b.c (single deep field access) + let projection = get_item("c", get_item("b", get_item("a", root()))); + assert_v1_eq_v2(&file, request(projection, None)).await +} + +/// Same bug via `select(["c"], a.b)`: selecting a single field out of the +/// nullable nested struct must preserve the struct's nulls. 
+#[tokio::test] +async fn differential_nested_nullable_struct_select() -> VortexResult<()> { + let file = write_file(nested_nullable_struct(), false).await?; + let projection = select(["c"], get_item("b", get_item("a", root()))); + assert_v1_eq_v2(&file, request(projection, None)).await +} + +/// Projecting the nullable nested struct `a.b` itself (a struct value) must also +/// preserve its nulls. +#[tokio::test] +async fn differential_nested_nullable_struct_project_struct() -> VortexResult<()> { + let file = write_file(nested_nullable_struct(), false).await?; + let projection = get_item("b", get_item("a", root())); + assert_v1_eq_v2(&file, request(projection, None)).await +} + +// ---- Ported V1 regression tests (struct nulls), exercised through V2 ---- + +/// Port of `vortex-layout` `test_struct_layout_nulls`: a nullable struct, when a +/// single field is projected, must mask the field with the parent struct's +/// validity. Reachable on the V2 file path through a non-nullable outer struct +/// wrapping a nullable inner struct. +#[tokio::test] +async fn v2_struct_layout_nulls() -> VortexResult<()> { + // inner struct `a` is nullable with fields a, b, c; row 0 is null. + let inner = StructArray::try_from_iter_with_validity( + [ + ("a", buffer![7i32, 2, 3].into_array()), + ("b", buffer![4i32, 5, 6].into_array()), + ("c", buffer![4i32, 5, 6].into_array()), + ], + Validity::Array(BoolArray::from_iter([false, true, true]).into_array()), + )? + .into_array(); + let outer = StructArray::try_from_iter_with_validity([("s", inner)], Validity::NonNullable)? + .into_array(); + + let file = write_file(outer, false).await?; + + // SELECT s.a -> the result must be masked with s's validity (row 0 null). 
+ let projection = get_item("a", get_item("s", root())); + let v2 = scan_v2(&file, &request(projection, None)).await?; + + assert_eq!( + v2.dtype(), + &DType::Primitive(PType::I32, Nullability::Nullable) + ); + + let expected = PrimitiveArray::from_option_iter([None, Some(2i32), Some(3)]).into_array(); + let mut ctx = SESSION.create_execution_ctx(); + assert_arrays_eq!(v2, expected, &mut ctx); + Ok(()) +} + +/// Port of `vortex-layout` `test_struct_layout_nested`: projecting `c` out of a +/// nullable nested struct `s.a.b` must preserve the nested struct's nulls. +#[tokio::test] +async fn v2_struct_layout_nested() -> VortexResult<()> { + // s.a.b is nullable (true, false, true); s.a.b.c is non-nullable. + let c = buffer![4i32, 5, 6].into_array(); + let b = + StructArray::try_from_iter_with_validity([("c", c)], Validity::NonNullable)?.into_array(); + let a = StructArray::try_from_iter_with_validity( + [("b", b)], + Validity::Array(BoolArray::from_iter([true, false, true]).into_array()), + )? + .into_array(); + let s = + StructArray::try_from_iter_with_validity([("a", a)], Validity::NonNullable)?.into_array(); + let outer = + StructArray::try_from_iter_with_validity([("s", s)], Validity::NonNullable)?.into_array(); + + let file = write_file(outer, false).await?; + + // SELECT c from s.a.b + let projection = select(["c"], get_item("b", get_item("a", get_item("s", root())))); + let v2 = scan_v2(&file, &request(projection, None)).await?; + + // Result is a nullable struct (because s.a.b is nullable) with a + // non-nullable field "c". + assert_eq!( + v2.dtype(), + &DType::Struct( + vortex_array::dtype::StructFields::from_iter([( + "c", + DType::Primitive(PType::I32, Nullability::NonNullable) + )]), + Nullability::Nullable, + ) + ); + + // Cross-check against V1 producing the same masked output. 
+ let v1 = scan_v1( + &file, + &request( + select(["c"], get_item("b", get_item("a", get_item("s", root())))), + None, + ), + ) + .await?; + let mut ctx = SESSION.create_execution_ctx(); + assert_arrays_eq!(v1, v2, &mut ctx); + + // Build the expected struct directly: rows 0 and 2 valid, row 1 null. + let expected = StructArray::try_from_iter_with_validity( + [("c", buffer![4i32, 5, 6].into_array())], + Validity::Array(BoolArray::from_iter([true, false, true]).into_array()), + )? + .into_array(); + assert_arrays_eq!(v2, expected, &mut ctx); + Ok(()) +} diff --git a/vortex-file/src/segments/cache.rs b/vortex-file/src/segments/cache.rs index 31301028afc..2add501d67d 100644 --- a/vortex-file/src/segments/cache.rs +++ b/vortex-file/src/segments/cache.rs @@ -8,6 +8,7 @@ use parking_lot::RwLock; use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use vortex_layout::segments::SegmentCache; +use vortex_layout::segments::SegmentCacheKey; use vortex_layout::segments::SegmentId; use vortex_utils::aliases::hash_map::HashMap; @@ -21,14 +22,14 @@ pub struct InitialReadSegmentCache { #[async_trait] impl SegmentCache for InitialReadSegmentCache { - async fn get(&self, id: SegmentId) -> VortexResult> { - if let Some(buffer) = self.initial.read().get(&id) { + async fn get(&self, key: SegmentCacheKey) -> VortexResult> { + if let Some(buffer) = self.initial.read().get(&key.segment_id) { return Ok(Some(buffer.clone())); } - self.fallback.get(id).await + self.fallback.get(key).await } - async fn put(&self, id: SegmentId, buffer: ByteBuffer) -> VortexResult<()> { - self.fallback.put(id, buffer).await + async fn put(&self, key: SegmentCacheKey, buffer: ByteBuffer) -> VortexResult<()> { + self.fallback.put(key, buffer).await } } diff --git a/vortex-file/src/segments/source.rs b/vortex-file/src/segments/source.rs index e7c9dc3b222..c048a5cb9b3 100644 --- a/vortex-file/src/segments/source.rs +++ b/vortex-file/src/segments/source.rs @@ -12,6 +12,7 @@ use futures::FutureExt; use 
futures::StreamExt; use futures::channel::mpsc; use futures::future; +use tracing::Instrument; use vortex_array::buffer::BufferHandle; use vortex_buffer::Alignment; use vortex_buffer::ByteBuffer; @@ -23,6 +24,7 @@ use vortex_io::VortexReadAt; use vortex_io::runtime::Handle; use vortex_layout::segments::SegmentFuture; use vortex_layout::segments::SegmentId; +use vortex_layout::segments::SegmentInfo; use vortex_layout::segments::SegmentSource; use vortex_metrics::Counter; use vortex_metrics::Histogram; @@ -121,8 +123,15 @@ impl FileSegmentSource { .map(move |req| { let reader = reader.clone(); async move { + let offset = req.offset(); + let len = req.len(); let result = reader - .read_at(req.offset(), req.len(), req.alignment()) + .read_at(offset, len, req.alignment()) + .instrument(tracing::trace_span!( + "vortex_segment_read", + offset, + len, + )) .await; let result = result.and_then(|buffer| { if req.len() != buffer.len() { @@ -155,6 +164,13 @@ impl FileSegmentSource { } impl SegmentSource for FileSegmentSource { + fn segment_info(&self, id: SegmentId) -> VortexResult { + self.segments + .get(*id as usize) + .map(|spec| SegmentInfo::new(u64::from(spec.length))) + .ok_or_else(|| vortex_err!("Missing segment: {}", id)) + } + fn request(&self, id: SegmentId) -> SegmentFuture { // We eagerly register the read request here assuming the behaviour of [`FileSegmentSource`], where // coalescing becomes effective prior to the future being polled. 
@@ -296,6 +312,13 @@ impl BufferSegmentSource { } impl SegmentSource for BufferSegmentSource { + fn segment_info(&self, id: SegmentId) -> VortexResult { + self.segments + .get(*id as usize) + .map(|spec| SegmentInfo::new(u64::from(spec.length))) + .ok_or_else(|| vortex_err!("Missing segment: {}", id)) + } + fn request(&self, id: SegmentId) -> SegmentFuture { let spec = match self.segments.get(*id as usize) { Some(spec) => spec, diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 32ec3c75f8d..3bdb5a864b0 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -48,6 +48,7 @@ use vortex_array::expr::lt_eq; use vortex_array::expr::or; use vortex_array::expr::root; use vortex_array::expr::select; +use vortex_array::expr::stats::Precision; use vortex_array::extension::datetime::TimeUnit; use vortex_array::extension::datetime::Timestamp; use vortex_array::extension::datetime::TimestampOptions; @@ -60,6 +61,7 @@ use vortex_array::stream::ArrayStreamAdapter; use vortex_array::stream::ArrayStreamExt; use vortex_array::validity::Validity; use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; use vortex_buffer::buffer; use vortex_error::VortexResult; @@ -71,6 +73,7 @@ use vortex_layout::layouts::zoned::Zoned; use vortex_layout::scan::scan_builder::ScanBuilder; use vortex_layout::scan::split_by::SplitBy; use vortex_layout::session::LayoutSession; +use vortex_scan::ScanSchedulerSession; use vortex_session::VortexSession; use crate::OpenOptionsSessionExt; @@ -79,15 +82,224 @@ use crate::VERSION; use crate::VortexFile; use crate::WriteOptionsSessionExt; use crate::footer::SegmentSpec; -static SESSION: LazyLock = LazyLock::new(|| { +use crate::multi::MultiFileDataSource; +use crate::multi::MultiFileSession; + +static SESSION: LazyLock = LazyLock::new(new_test_session); + +pub(crate) fn new_test_session() -> VortexSession { let session = array_session() .with::() - .with::(); + .with::() + .with::() + 
.with::(); crate::register_default_encodings(&session); session -}); +} + +fn exact_u32_stat(stat: &Precision) -> Option { + stat.as_ref() + .as_exact()? + .as_primitive() + .typed_value::() +} + +fn exact_u64_stat(stat: &Precision) -> Option { + stat.as_ref() + .as_exact()? + .as_primitive() + .typed_value::() +} + +#[test] +fn multi_file_scan_plan_data_source_filters_and_projects() -> VortexResult<()> { + use vortex_io::runtime::BlockingRuntime; + use vortex_io::runtime::single::SingleThreadRuntime; + use vortex_io::session::RuntimeSessionExt; + + let runtime = SingleThreadRuntime::default(); + let session = new_test_session().with_handle(runtime.handle()); + + runtime.block_on(async { + use async_trait::async_trait; + use futures::stream; + use futures::stream::BoxStream; + use vortex_array::aggregate_fn::AggregateFnVTableExt; + use vortex_array::aggregate_fn::EmptyOptions; + use vortex_array::aggregate_fn::NumericalAggregateOpts; + use vortex_array::aggregate_fn::fns::max::Max; + use vortex_array::aggregate_fn::fns::min::Min; + use vortex_array::aggregate_fn::fns::null_count::NullCount; + use vortex_io::VortexReadAt; + use vortex_io::filesystem::FileListing; + use vortex_io::filesystem::FileSystem; + use vortex_io::filesystem::FileSystemRef; + + #[derive(Debug)] + struct MemoryFileSystem { + files: std::collections::BTreeMap, + } + + #[async_trait] + impl FileSystem for MemoryFileSystem { + fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { + let listings = self + .files + .iter() + .filter_map(move |(path, bytes)| { + path.starts_with(prefix).then_some(Ok(FileListing { + path: path.clone(), + size: Some(bytes.len() as u64), + })) + }) + .collect::>(); + stream::iter(listings).boxed() + } + + async fn head(&self, path: &str) -> VortexResult> { + Ok(self.files.get(path).map(|bytes| FileListing { + path: path.to_string(), + size: Some(bytes.len() as u64), + })) + } + + async fn open_read(&self, path: &str) -> VortexResult> { + self.files + .get(path) + 
.cloned() + .map(|bytes| Arc::new(bytes) as Arc) + .ok_or_else(|| vortex_error::vortex_err!("missing test file {path}")) + } + + async fn delete(&self, _path: &str) -> VortexResult<()> { + Ok(()) + } + } + + async fn write_part(session: &VortexSession, values: ArrayRef) -> VortexResult { + let mut buf = ByteBufferMut::empty(); + session + .write_options() + .write(&mut buf, values.to_array_stream()) + .await?; + Ok(buf.freeze()) + } + + async fn write_part_with_stats( + session: &VortexSession, + values: ArrayRef, + ) -> VortexResult { + let mut buf = ByteBufferMut::empty(); + let mut writer = session + .write_options() + .with_file_statistics(PRUNING_STATS.to_vec()) + .writer(&mut buf, values.dtype().clone()); + writer.push(values).await?; + writer.finish().await?; + Ok(buf.freeze()) + } + + let single = StructArray::from_fields(&[("numbers", buffer![10u32, 20, 30].into_array())])? + .into_array(); + let single_fs: FileSystemRef = Arc::new(MemoryFileSystem { + files: std::collections::BTreeMap::from_iter([( + "single.vortex".to_string(), + write_part_with_stats(&session, single).await?, + )]), + }); + let single_source = MultiFileDataSource::new(session.clone()) + .with_glob("single.vortex", Some(single_fs)) + .build_data_source() + .await?; + let stats = single_source + .statistics( + &col("numbers"), + &[ + Min.bind(NumericalAggregateOpts::default()), + Max.bind(NumericalAggregateOpts::default()), + NullCount.bind(EmptyOptions), + ], + ) + .await?; + assert_eq!(exact_u32_stat(&stats[0]), Some(10)); + assert_eq!(exact_u32_stat(&stats[1]), Some(30)); + assert_eq!(exact_u64_stat(&stats[2]), Some(0)); + + let first = StructArray::from_fields(&[("numbers", buffer![1u32, 2, 3].into_array())])? + .into_array(); + let second = StructArray::from_fields(&[("numbers", buffer![4u32, 5, 6].into_array())])? 
+ .into_array(); + + let fs: FileSystemRef = Arc::new(MemoryFileSystem { + files: std::collections::BTreeMap::from_iter([ + ( + "part-0.vortex".to_string(), + write_part(&session, first).await?, + ), + ( + "part-1.vortex".to_string(), + write_part(&session, second).await?, + ), + ]), + }); + + let data_source = MultiFileDataSource::new(session.clone()) + .with_glob("part-*.vortex", Some(fs)) + .build_data_source() + .await?; + let scan = data_source + .scan(vortex_scan::ScanRequest { + projection: col("numbers"), + filter: Some(gt(col("numbers"), lit(2u32))), + ordered: true, + ..Default::default() + }) + .await?; + + let dtype = scan.dtype().clone(); + let stream = scan + .partitions() + .then(|partition| async move { partition?.execute() }) + .try_flatten() + .boxed(); + let actual = ArrayStreamAdapter::new(dtype, stream).read_all().await?; + + let mut ctx = session.create_execution_ctx(); + assert_arrays_eq!(actual, buffer![3u32, 4, 5, 6].into_array(), &mut ctx); + + let planned = data_source + .plan_morsel_partitions( + vortex_scan::ScanRequest { + projection: col("numbers"), + filter: Some(gt(col("numbers"), lit(2u32))), + ..Default::default() + }, + 128, + ) + .await? 
+ .ok_or_else(|| { + vortex_error::vortex_err!("scan plan data source must plan morsel partitions") + })?; + + assert_eq!(planned.partition_count(), 2); + + let dtype = planned.dtype().clone(); + let stream = stream::iter(0..planned.partition_count()) + .then(|partition| { + let planned = Arc::clone(&planned); + async move { planned.partition(partition)?.execute() } + }) + .try_flatten() + .boxed(); + let actual = ArrayStreamAdapter::new(dtype, stream).read_all().await?; + + let mut ctx = session.create_execution_ctx(); + assert_arrays_eq!(actual, buffer![3u32, 4, 5, 6].into_array(), &mut ctx); + Ok(()) + }) +} #[tokio::test] async fn test_eof_values() { diff --git a/vortex-layout/src/layout_v2.rs b/vortex-layout/src/layout_v2.rs new file mode 100644 index 00000000000..994767cca7e --- /dev/null +++ b/vortex-layout/src/layout_v2.rs @@ -0,0 +1,796 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::any::Any; +use std::env; +use std::fmt::Debug; +use std::fmt::Formatter; +use std::ops::Deref; +use std::sync::Arc; +use std::sync::LazyLock; + +use flatbuffers::Follow; +use flatbuffers::VerifierOptions; +use flatbuffers::root_with_opts; +use once_cell::sync::OnceCell; +use vortex_array::dtype::DType; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_flatbuffers::FlatBuffer; +use vortex_flatbuffers::layout; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; +use vortex_session::VortexSession; +use vortex_session::registry::ReadContext; +use vortex_session::registry::Registry; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::segments::SegmentFutureCache; +use crate::segments::SegmentId; +use crate::segments::SegmentSource; + +/// A reference-counted, type-erased v2 layout. +#[derive(Clone)] +pub struct LayoutRef(Arc); + +/// Reference-counted v2 layout-vtable plugin. 
+pub type LayoutVTableRef = Arc; + +/// Registry mapping layout IDs to v2 layout-vtable plugins. +pub type LayoutVTableRegistry = Registry; + +static LAYOUT_VERIFIER: LazyLock = LazyLock::new(|| VerifierOptions { + max_tables: env::var("VORTEX_MAX_LAYOUT_TABLES") + .ok() + .and_then(|lmt| lmt.parse::().ok()) + .unwrap_or(1000000), + max_depth: env::var("VORTEX_MAX_LAYOUT_DEPTH") + .ok() + .and_then(|lmt| lmt.parse::().ok()) + .unwrap_or(64), + max_apparent_size: 1 << 31, + ignore_missing_null_terminator: false, +}); + +/// Layout-specific behavior for the v2 layout model. +/// +/// Common layout fields live in [`Layout`] and are handled by the erased adapter. The vtable only +/// supplies layout-specific data interpretation, child typing, and runtime scan expansion. +pub trait VTable: 'static + Clone + Send + Sync + Debug { + /// Layout-specific data. Common fields such as dtype, row count, children, and segments are + /// stored by the adapter. + type LayoutData: 'static + Send + Sync + Clone + Debug; + + /// Returns the ID of this layout encoding. + fn id(&self) -> LayoutId; + + /// Deserialize layout-specific data from serialized metadata. + /// + /// Common fields are provided in `args`, but remain owned by [`LayoutParts`]. Implementations + /// should only return layout-specific data. + fn deserialize(&self, _args: &LayoutDeserializeArgs<'_>) -> VortexResult { + vortex_bail!( + "layout v2 deserialization is not implemented for {}", + self.id() + ) + } + + /// Returns the expected dtype of child `idx`. + fn child_dtype(layout: Layout, idx: usize) -> VortexResult; + + /// Returns the relationship between child `idx` and its parent. + fn child_type(layout: Layout, idx: usize) -> VortexResult; + + /// Expand this layout into a physical scan plan. + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, + ) -> VortexResult; +} + +/// Context captured while expanding a serialized layout into a physical scan plan. 
+/// +/// Layouts are serialization metadata; concrete scan plans are bound to the segment source +/// they will read from when the layout is expanded. +#[derive(Clone)] +pub struct LayoutScanPlanCtx { + session: VortexSession, + segment_source: Arc, + segment_future_cache: Arc, +} + +impl LayoutScanPlanCtx { + /// Create a layout scan-plan expansion context. + pub fn new( + session: VortexSession, + segment_source: Arc, + segment_future_cache: Arc, + ) -> Self { + Self { + session, + segment_source, + segment_future_cache, + } + } + + /// Return the session used while constructing scan plans. + pub fn session(&self) -> &VortexSession { + &self.session + } + + /// Return the segment source concrete scan plans should capture. + pub fn segment_source(&self) -> &Arc { + &self.segment_source + } + + /// Return the file-level cache used for scheduled segment futures. + pub fn segment_future_cache(&self) -> &Arc { + &self.segment_future_cache + } +} + +/// Object-safe plugin for deserializing v2 layouts by ID. +pub trait LayoutVTablePlugin: 'static + Send + Sync { + /// Returns the ID of this layout encoding. + fn id(&self) -> LayoutId; + + /// Deserialize a type-erased v2 layout. + fn deserialize(&self, args: LayoutDeserializeArgs<'_>) -> VortexResult; +} + +impl Debug for dyn LayoutVTablePlugin { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("LayoutVTablePlugin") + .field(&self.id()) + .finish() + } +} + +impl LayoutVTablePlugin for V { + fn id(&self) -> LayoutId { + VTable::id(self) + } + + fn deserialize(&self, args: LayoutDeserializeArgs<'_>) -> VortexResult { + Ok(LayoutParts::deserialize(self.clone(), args)?.into_layout()) + } +} + +/// Common serialized layout fields made available while deserializing v2 layout data. +pub struct LayoutDeserializeArgs<'a> { + /// The logical dtype of this layout. + pub dtype: &'a DType, + /// The row count of this layout. + pub row_count: u64, + /// The layout-specific metadata payload. 
+ pub metadata: &'a [u8], + /// Segment IDs referenced directly by this layout. + pub segment_ids: Vec, + /// Lazy child access for this layout. + pub children: Arc, + /// Array read context captured from the file footer. + pub array_ctx: &'a ReadContext, + /// Session used to deserialize session-registered layout metadata. + pub session: &'a VortexSession, +} + +/// Pieces used to construct a v2 layout. +pub struct LayoutParts { + vtable: V, + dtype: DType, + row_count: u64, + segment_ids: Vec, + children: Arc, + data: V::LayoutData, +} + +impl LayoutParts { + /// Create layout parts from common fields and vtable-specific data. + pub fn new( + vtable: V, + dtype: DType, + row_count: u64, + segment_ids: Vec, + children: Arc, + data: V::LayoutData, + ) -> Self { + Self { + vtable, + dtype, + row_count, + segment_ids, + children, + data, + } + } + + /// Deserialize layout-specific data and hoist common fields into layout parts. + pub fn deserialize(vtable: V, args: LayoutDeserializeArgs<'_>) -> VortexResult { + let data = vtable.deserialize(&args)?; + Ok(Self { + vtable, + dtype: args.dtype.clone(), + row_count: args.row_count, + segment_ids: args.segment_ids, + children: args.children, + data, + }) + } + + /// Convert these parts into a typed layout. + pub fn into_typed(self) -> Layout { + Layout::from_parts(self) + } + + /// Erase these parts into a layout reference. + pub fn into_layout(self) -> LayoutRef { + self.into_typed().into_layout() + } +} + +/// A typed v2 layout handle. +pub struct Layout { + inner: Arc>, +} + +struct LayoutInner { + vtable: V, + dtype: DType, + row_count: u64, + segment_ids: Vec, + children: Arc, + data: V::LayoutData, +} + +impl Layout { + /// Create a typed layout from explicit construction parts. 
+ pub fn from_parts(parts: LayoutParts) -> Self { + Self { + inner: Arc::new(LayoutInner { + vtable: parts.vtable, + dtype: parts.dtype, + row_count: parts.row_count, + segment_ids: parts.segment_ids, + children: parts.children, + data: parts.data, + }), + } + } + + /// Returns this layout's vtable. + pub fn vtable(&self) -> &V { + &self.inner.vtable + } + + /// Returns the layout-specific data. + pub fn data(&self) -> &V::LayoutData { + &self.inner.data + } + + /// Returns this layout's dtype. + pub fn dtype(&self) -> &DType { + &self.inner.dtype + } + + /// Returns this layout's row count. + pub fn row_count(&self) -> u64 { + self.inner.row_count + } + + /// Returns this layout's segment IDs. + pub fn segment_ids(&self) -> &[SegmentId] { + &self.inner.segment_ids + } + + /// Returns this layout's children adapter. + pub fn children(&self) -> &Arc { + &self.inner.children + } + + /// Returns the number of children. + pub fn nchildren(&self) -> usize { + self.inner.children.nchildren() + } + + /// Returns child `idx`, materializing it lazily. + pub fn child(&self, idx: usize) -> VortexResult { + let dtype = V::child_dtype(self.clone(), idx)?; + self.inner.children.child(idx, &dtype) + } + + /// Returns the row count of child `idx`. + pub fn child_row_count(&self, idx: usize) -> VortexResult { + self.inner.children.child_row_count(idx) + } + + /// Returns the relationship between child `idx` and this layout. + pub fn child_type(&self, idx: usize) -> VortexResult { + V::child_type(self.clone(), idx) + } + + /// Erase this typed layout into a layout reference. + pub fn to_layout(&self) -> LayoutRef { + self.clone().into_layout() + } + + /// Erase this typed layout into a layout reference. 
+ pub fn into_layout(self) -> LayoutRef { + LayoutRef(Arc::new(self)) + } +} + +impl Clone for Layout { + fn clone(&self) -> Self { + Self { + inner: Arc::clone(&self.inner), + } + } +} + +impl Debug for Layout { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Layout") + .field("encoding_id", &self.inner.vtable.id()) + .field("dtype", &self.inner.dtype) + .field("row_count", &self.inner.row_count) + .field("segment_ids", &self.inner.segment_ids) + .field("data", &self.inner.data) + .finish() + } +} + +impl Deref for Layout { + type Target = V::LayoutData; + + fn deref(&self) -> &Self::Target { + self.data() + } +} + +trait DynLayout: 'static + Send + Sync + Debug { + fn as_any(&self) -> &dyn Any; + + fn dyn_encoding_id(&self) -> LayoutId; + + fn dyn_dtype(&self) -> &DType; + + fn dyn_row_count(&self) -> u64; + + fn dyn_segment_ids(&self) -> &[SegmentId]; + + fn dyn_nchildren(&self) -> usize; + + fn dyn_child(&self, idx: usize) -> VortexResult; + + fn dyn_child_row_count(&self, idx: usize) -> VortexResult; + + fn dyn_child_type(&self, idx: usize) -> VortexResult; + + fn dyn_new_scan_plan( + &self, + req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, + ) -> VortexResult; +} + +impl LayoutRef { + /// Downcast this layout to a typed v2 layout handle. + pub fn as_opt(&self) -> Option> { + self.0.as_any().downcast_ref::>().cloned() + } + + /// Returns a cloned layout reference. + pub fn to_layout(&self) -> LayoutRef { + self.clone() + } + + /// Returns the layout encoding ID. + pub fn encoding_id(&self) -> LayoutId { + self.0.dyn_encoding_id() + } + + /// Returns this layout's dtype. + pub fn dtype(&self) -> &DType { + self.0.dyn_dtype() + } + + /// Returns this layout's row count. + pub fn row_count(&self) -> u64 { + self.0.dyn_row_count() + } + + /// Returns this layout's segment IDs. + pub fn segment_ids(&self) -> &[SegmentId] { + self.0.dyn_segment_ids() + } + + /// Returns the number of children. 
+ pub fn nchildren(&self) -> usize { + self.0.dyn_nchildren() + } + + /// Returns child `idx`, materializing it lazily. + pub fn child(&self, idx: usize) -> VortexResult { + self.0.dyn_child(idx) + } + + /// Returns the row count of child `idx`. + pub fn child_row_count(&self, idx: usize) -> VortexResult { + self.0.dyn_child_row_count(idx) + } + + /// Returns the relationship between child `idx` and this layout. + pub fn child_type(&self, idx: usize) -> VortexResult { + self.0.dyn_child_type(idx) + } + + /// Expand this layout into a physical scan plan. + pub fn new_scan_plan( + &self, + req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, + ) -> VortexResult { + self.0.dyn_new_scan_plan(req, ctx) + } + + /// Returns an iterator over child row offsets. + pub fn child_row_offsets(&self) -> impl Iterator>> + '_ { + (0..self.nchildren()).map(|idx| Ok(self.child_type(idx)?.row_offset())) + } +} + +impl Debug for LayoutRef { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Debug::fmt(&self.0, f) + } +} + +impl DynLayout for Layout { + fn as_any(&self) -> &dyn Any { + self + } + + fn dyn_encoding_id(&self) -> LayoutId { + self.vtable().id() + } + + fn dyn_dtype(&self) -> &DType { + &self.inner.dtype + } + + fn dyn_row_count(&self) -> u64 { + self.inner.row_count + } + + fn dyn_segment_ids(&self) -> &[SegmentId] { + &self.inner.segment_ids + } + + fn dyn_nchildren(&self) -> usize { + self.inner.children.nchildren() + } + + fn dyn_child(&self, idx: usize) -> VortexResult { + Layout::child(self, idx) + } + + fn dyn_child_row_count(&self, idx: usize) -> VortexResult { + self.inner.children.child_row_count(idx) + } + + fn dyn_child_type(&self, idx: usize) -> VortexResult { + V::child_type(self.clone(), idx) + } + + fn dyn_new_scan_plan( + &self, + req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, + ) -> VortexResult { + V::new_scan_plan(self.clone(), req, ctx) + } +} + +/// Lazily provides v2 layout children. 
+pub trait LayoutChildren: 'static + Send + Sync { + /// Returns child `idx`, validating its dtype. + fn child(&self, idx: usize, dtype: &DType) -> VortexResult; + + /// Returns child `idx`'s row count. + fn child_row_count(&self, idx: usize) -> VortexResult; + + /// Returns the number of children. + fn nchildren(&self) -> usize; +} + +impl Debug for dyn LayoutChildren { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LayoutChildren") + .field("nchildren", &self.nchildren()) + .finish() + } +} + +#[derive(Clone)] +struct ViewedLayoutChildren { + flatbuffer: FlatBuffer, + flatbuffer_loc: usize, + array_ctx: ReadContext, + layout_ctx: ReadContext, + layouts: LayoutVTableRegistry, + session: VortexSession, + cache: Arc<[OnceCell]>, +} + +impl ViewedLayoutChildren { + unsafe fn new_unchecked( + flatbuffer: FlatBuffer, + flatbuffer_loc: usize, + array_ctx: ReadContext, + layout_ctx: ReadContext, + layouts: LayoutVTableRegistry, + session: VortexSession, + ) -> Self { + // SAFETY: guaranteed by caller. + let nchildren = unsafe { layout::Layout::follow(flatbuffer.as_ref(), flatbuffer_loc) } + .children() + .unwrap_or_default() + .len(); + let cache = vec![OnceCell::new(); nchildren].into_boxed_slice().into(); + Self { + flatbuffer, + flatbuffer_loc, + array_ctx, + layout_ctx, + layouts, + session, + cache, + } + } + + fn flatbuffer(&self) -> layout::Layout<'_> { + // SAFETY: flatbuffer_loc is produced from a verified flatbuffer table. 
+ unsafe { layout::Layout::follow(self.flatbuffer.as_ref(), self.flatbuffer_loc) } + } +} + +impl LayoutChildren for ViewedLayoutChildren { + fn child(&self, idx: usize, dtype: &DType) -> VortexResult { + if idx >= self.cache.len() { + vortex_bail!("Child index out of bounds: {idx} of {}", self.cache.len()); + } + let child = self.cache[idx].get_or_try_init(|| { + let fb_child = self.flatbuffer().children().unwrap_or_default().get(idx); + // SAFETY: same verified flatbuffer; fb_child._tab.loc() is a valid table location. + let children = unsafe { + ViewedLayoutChildren::new_unchecked( + self.flatbuffer.clone(), + fb_child._tab.loc(), + self.array_ctx.clone(), + self.layout_ctx.clone(), + self.layouts.clone(), + self.session.clone(), + ) + }; + layout_from_fb_layout( + fb_child, + dtype, + self.layout_ctx.clone(), + self.array_ctx.clone(), + self.layouts.clone(), + &self.session, + Arc::new(children), + ) + })?; + Ok(child.clone()) + } + + fn child_row_count(&self, idx: usize) -> VortexResult { + if idx >= self.cache.len() { + vortex_bail!("Child index out of bounds: {idx} of {}", self.cache.len()); + } + Ok(self + .flatbuffer() + .children() + .unwrap_or_default() + .get(idx) + .row_count()) + } + + fn nchildren(&self) -> usize { + self.cache.len() + } +} + +/// Parse a v2 [`LayoutRef`] from a layout flatbuffer. +pub fn layout_from_flatbuffer( + flatbuffer: FlatBuffer, + dtype: &DType, + layout_ctx: &ReadContext, + array_ctx: &ReadContext, + layouts: &LayoutVTableRegistry, + session: &VortexSession, +) -> VortexResult { + let fb_layout = root_with_opts::(&LAYOUT_VERIFIER, &flatbuffer)?; + // SAFETY: the flatbuffer was verified by root_with_opts. 
+ let children = unsafe { + ViewedLayoutChildren::new_unchecked( + flatbuffer.clone(), + fb_layout._tab.loc(), + array_ctx.clone(), + layout_ctx.clone(), + layouts.clone(), + session.clone(), + ) + }; + layout_from_fb_layout( + fb_layout, + dtype, + layout_ctx.clone(), + array_ctx.clone(), + layouts.clone(), + session, + Arc::new(children), + ) +} + +fn layout_from_fb_layout( + fb_layout: layout::Layout<'_>, + dtype: &DType, + layout_ctx: ReadContext, + array_ctx: ReadContext, + layouts: LayoutVTableRegistry, + session: &VortexSession, + children: Arc, +) -> VortexResult { + let encoding_id = layout_ctx + .resolve(fb_layout.encoding()) + .ok_or_else(|| vortex_err!("Invalid layout encoding ID: {}", fb_layout.encoding()))?; + let vtable = layouts + .find(&encoding_id) + .ok_or_else(|| vortex_err!("Invalid v2 layout encoding ID: {encoding_id}"))?; + vtable.deserialize(LayoutDeserializeArgs { + dtype, + row_count: fb_layout.row_count(), + metadata: fb_layout + .metadata() + .map(|m| m.bytes()) + .unwrap_or_else(|| &[]), + segment_ids: fb_layout + .segments() + .unwrap_or_default() + .iter() + .map(SegmentId::from) + .collect(), + children, + array_ctx: &array_ctx, + session, + }) +} + +pub(crate) fn metadata_bool_field( + metadata: &[u8], + field_number: u64, +) -> VortexResult> { + Ok(metadata_varint_field(metadata, field_number)?.map(|value| value != 0)) +} + +pub(crate) fn metadata_varint_field( + metadata: &[u8], + field_number: u64, +) -> VortexResult> { + let mut offset = 0; + while offset < metadata.len() { + let key = read_varint(metadata, &mut offset)?; + let field = key >> 3; + let wire_type = key & 0x07; + if field == field_number { + if wire_type != 0 { + vortex_bail!("metadata field {field_number} is not a varint"); + } + return Ok(Some(read_varint(metadata, &mut offset)?)); + } + skip_proto_field(metadata, &mut offset, wire_type)?; + } + Ok(None) +} + +pub(crate) fn metadata_bytes_field( + metadata: &[u8], + field_number: u64, +) -> VortexResult>> { + let 
mut offset = 0; + while offset < metadata.len() { + let key = read_varint(metadata, &mut offset)?; + let field = key >> 3; + let wire_type = key & 0x07; + if field == field_number { + if wire_type != 2 { + vortex_bail!("metadata field {field_number} is not length-delimited"); + } + let len = usize::try_from(read_varint(metadata, &mut offset)?)?; + let end = offset + .checked_add(len) + .ok_or_else(|| vortex_err!("metadata field length overflows buffer offset"))?; + if end > metadata.len() { + vortex_bail!("metadata field extends past end of buffer"); + } + return Ok(Some(metadata[offset..end].to_vec())); + } + skip_proto_field(metadata, &mut offset, wire_type)?; + } + Ok(None) +} + +fn skip_proto_field(metadata: &[u8], offset: &mut usize, wire_type: u64) -> VortexResult<()> { + match wire_type { + 0 => { + read_varint(metadata, offset)?; + } + 1 => { + *offset = offset + .checked_add(8) + .ok_or_else(|| vortex_err!("metadata field offset overflow"))?; + } + 2 => { + let len = usize::try_from(read_varint(metadata, offset)?)?; + *offset = offset + .checked_add(len) + .ok_or_else(|| vortex_err!("metadata field offset overflow"))?; + } + 5 => { + *offset = offset + .checked_add(4) + .ok_or_else(|| vortex_err!("metadata field offset overflow"))?; + } + _ => vortex_bail!("unsupported protobuf wire type {wire_type}"), + } + if *offset > metadata.len() { + vortex_bail!("metadata field extends past end of buffer"); + } + Ok(()) +} + +fn read_varint(metadata: &[u8], offset: &mut usize) -> VortexResult { + let mut value = 0u64; + for shift in (0..64).step_by(7) { + let byte = *metadata + .get(*offset) + .ok_or_else(|| vortex_err!("truncated protobuf varint"))?; + *offset += 1; + value |= u64::from(byte & 0x7f) << shift; + if byte & 0x80 == 0 { + return Ok(value); + } + } + vortex_bail!("protobuf varint exceeds 64 bits") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn metadata_bytes_field_rejects_length_overflow() { + let mut metadata = vec![0x0a]; + 
metadata.extend_from_slice(&u64::MAX.to_le_bytes()); + // Replace the fixed-width bytes with a protobuf varint for u64::MAX. + metadata.truncate(1); + metadata.extend([0xff; 9]); + metadata.push(0x01); + + assert!(metadata_bytes_field(&metadata, 1).is_err()); + } + + #[test] + fn skip_proto_field_rejects_length_overflow() { + let mut metadata = vec![0x12]; + metadata.extend([0xff; 9]); + metadata.push(0x01); + + assert!(metadata_varint_field(&metadata, 1).is_err()); + } +} diff --git a/vortex-layout/src/layouts/zoned/mod.rs b/vortex-layout/src/layouts/zoned/mod.rs index 3d82fa27b34..f5d8c8afbbe 100644 --- a/vortex-layout/src/layouts/zoned/mod.rs +++ b/vortex-layout/src/layouts/zoned/mod.rs @@ -25,6 +25,9 @@ pub(crate) use builder::aggregate_partials; use prost::Message; pub use schema::MAX_IS_TRUNCATED; pub use schema::MIN_IS_TRUNCATED; +pub(crate) use schema::aggregate_fns_from_specs; +pub(crate) use schema::aggregate_stats_table_dtype; +pub(crate) use schema::legacy_stats_table_dtype; use vortex_array::DeserializeMetadata; use vortex_array::SerializeMetadata; use vortex_array::aggregate_fn::AggregateFnRef; @@ -52,10 +55,7 @@ use crate::children::LayoutChildren; use crate::children::OwnedLayoutChildren; use crate::layouts::zoned::reader::ZonedReader; use crate::layouts::zoned::schema::AggregateSpecProto; -use crate::layouts::zoned::schema::aggregate_fns_from_specs; use crate::layouts::zoned::schema::aggregate_specs_from_fns; -use crate::layouts::zoned::schema::aggregate_stats_table_dtype; -use crate::layouts::zoned::schema::legacy_stats_table_dtype; use crate::segments::SegmentId; use crate::segments::SegmentSource; use crate::vtable; @@ -393,14 +393,14 @@ impl ZonedLayout { /// aggregate functions stored in the auxiliary stats-table child. 
#[derive(Debug, PartialEq, Eq, Clone)] pub struct ZonedMetadata { - pub(super) zone_len: u32, - pub(super) aggregate_specs: Arc<[AggregateSpecProto]>, + pub(crate) zone_len: u32, + pub(crate) aggregate_specs: Arc<[AggregateSpecProto]>, } /// Serialized metadata for legacy `vortex.stats` layouts. #[derive(Debug, PartialEq, Eq, Clone)] pub struct LegacyStatsMetadata { - pub(super) zone_len: u32, + pub(crate) zone_len: u32, pub(crate) zone_map_schema: ZoneMapSchema, } diff --git a/vortex-layout/src/layouts/zoned/zone_map.rs b/vortex-layout/src/layouts/zoned/zone_map.rs index e5f2af494de..96782157ec5 100644 --- a/vortex-layout/src/layouts/zoned/zone_map.rs +++ b/vortex-layout/src/layouts/zoned/zone_map.rs @@ -87,7 +87,7 @@ impl ZoneMap { Ok(unsafe { Self::new_unchecked(column_dtype, array, aggregate_fns, zone_len, row_count) }) } - pub(super) unsafe fn new_unchecked( + pub(crate) unsafe fn new_unchecked( column_dtype: DType, array: StructArray, aggregate_fns: Arc<[AggregateFnRef]>, diff --git a/vortex-layout/src/layouts_v2/chunked.rs b/vortex-layout/src/layouts_v2/chunked.rs new file mode 100644 index 00000000000..e4af2652364 --- /dev/null +++ b/vortex-layout/src/layouts_v2/chunked.rs @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::DeserializeMetadata; +use vortex_array::EmptyMetadata; +use vortex_array::dtype::DType; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; +use crate::layout_v2::VTable; +use crate::scan::v2::layouts::chunked as scan_chunked; + +/// V2 chunked layout vtable. 
+#[derive(Clone, Debug)] +pub struct Chunked; + +/// V2 chunked layout data. +#[derive(Clone, Debug)] +pub struct ChunkedData { + pub(crate) chunk_offsets: Vec, +} + +impl ChunkedData { + /// Returns the cumulative chunk offsets. + pub fn chunk_offsets(&self) -> &[u64] { + &self.chunk_offsets + } +} + +impl VTable for Chunked { + type LayoutData = ChunkedData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.chunked") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + EmptyMetadata::deserialize(args.metadata)?; + let mut chunk_offsets: Vec = Vec::with_capacity(args.children.nchildren() + 1); + chunk_offsets.push(0); + for idx in 0..args.children.nchildren() { + let next = chunk_offsets[idx] + .checked_add(args.children.child_row_count(idx)?) + .ok_or_else(|| vortex_err!("Chunked child row counts overflow"))?; + chunk_offsets.push(next); + } + vortex_ensure!( + chunk_offsets.last().copied() == Some(args.row_count), + "Chunked child row counts do not add up to parent row count" + ); + Ok(ChunkedData { chunk_offsets }) + } + + fn child_dtype(layout: Layout, _idx: usize) -> VortexResult { + Ok(layout.dtype().clone()) + } + + fn child_type(layout: Layout, idx: usize) -> VortexResult { + if idx >= layout.nchildren() { + vortex_bail!("Chunked child index out of bounds: {idx}"); + } + let offset = *layout + .data() + .chunk_offsets + .get(idx) + .ok_or_else(|| vortex_err!("Chunked child index out of bounds: {idx}"))?; + Ok(LayoutChildType::Chunk((idx, offset))) + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, + ) -> VortexResult { + scan_chunked::new_scan_plan(layout, req, ctx) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; + use vortex_error::VortexResult; + use vortex_error::vortex_bail; + use vortex_error::vortex_err; + use vortex_session::VortexSession; + use 
vortex_session::registry::ReadContext; + + use super::*; + use crate::layout_v2::LayoutChildren; + use crate::layout_v2::LayoutParts; + use crate::layout_v2::LayoutRef; + + #[derive(Debug)] + struct TestChildren { + row_counts: Vec, + } + + impl LayoutChildren for TestChildren { + fn child(&self, idx: usize, _dtype: &DType) -> VortexResult { + vortex_bail!("test child {idx} is not materialized") + } + + fn child_row_count(&self, idx: usize) -> VortexResult { + self.row_counts + .get(idx) + .copied() + .ok_or_else(|| vortex_err!("test child index out of bounds: {idx}")) + } + + fn nchildren(&self) -> usize { + self.row_counts.len() + } + } + + fn primitive_dtype() -> DType { + DType::Primitive(PType::I32, Nullability::NonNullable) + } + + fn read_context() -> ReadContext { + ReadContext::new([]) + } + + #[test] + fn chunked_deserialize_rejects_row_count_overflow() { + let dtype = primitive_dtype(); + let read_context = read_context(); + let session = VortexSession::empty(); + let args = LayoutDeserializeArgs { + dtype: &dtype, + row_count: 0, + metadata: &[], + segment_ids: Vec::new(), + children: Arc::new(TestChildren { + row_counts: vec![u64::MAX, 1], + }), + array_ctx: &read_context, + session: &session, + }; + + assert!(VTable::deserialize(&Chunked, &args).is_err()); + } + + #[test] + fn chunked_child_type_rejects_terminal_offset_index() { + let dtype = primitive_dtype(); + let layout = LayoutParts::new( + Chunked, + dtype, + 1, + Vec::new(), + Arc::new(TestChildren { + row_counts: vec![1], + }), + ChunkedData { + chunk_offsets: vec![0, 1], + }, + ) + .into_typed(); + + assert!(layout.child_type(1).is_err()); + } +} diff --git a/vortex-layout/src/layouts_v2/dict.rs b/vortex-layout/src/layouts_v2/dict.rs new file mode 100644 index 00000000000..a3cb2234a07 --- /dev/null +++ b/vortex-layout/src/layouts_v2/dict.rs @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::dtype::DType; 
+use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; +use crate::layout_v2::VTable; +use crate::layout_v2::metadata_bool_field; +use crate::layout_v2::metadata_varint_field; +use crate::scan::v2::layouts::dict as scan_dict; + +/// V2 dictionary layout vtable. +#[derive(Clone, Debug)] +pub struct Dict; + +/// V2 dictionary layout data. +#[derive(Clone, Debug)] +pub struct DictData { + pub(crate) codes_dtype: DType, + pub(crate) all_values_referenced: bool, +} + +impl DictData { + /// Returns whether all dictionary values are definitely referenced. + pub fn has_all_values_referenced(&self) -> bool { + self.all_values_referenced + } +} + +impl VTable for Dict { + type LayoutData = DictData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.dict") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + let codes_ptype = metadata_varint_field(args.metadata, 1)? + .ok_or_else(|| vortex_err!("Dict metadata missing codes ptype"))?; + let codes_ptype = PType::try_from(i32::try_from(codes_ptype)?)?; + let codes_nullable = metadata_bool_field(args.metadata, 2)? 
+ .map(Nullability::from) + .unwrap_or_else(|| args.dtype.nullability()); + Ok(DictData { + codes_dtype: DType::Primitive(codes_ptype, codes_nullable), + all_values_referenced: metadata_bool_field(args.metadata, 3)?.unwrap_or(false), + }) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(layout.dtype().clone()), + 1 => Ok(layout.data().codes_dtype.clone()), + _ => vortex_bail!("Dict child index out of bounds: {idx}"), + } + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(LayoutChildType::Auxiliary("values".into())), + 1 => Ok(LayoutChildType::Transparent("codes".into())), + _ => vortex_bail!("Dict child index out of bounds: {idx}"), + } + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, + ) -> VortexResult { + scan_dict::new_scan_plan(layout, req, ctx) + } +} diff --git a/vortex-layout/src/layouts_v2/flat.rs b/vortex-layout/src/layouts_v2/flat.rs new file mode 100644 index 00000000000..12f02850f2c --- /dev/null +++ b/vortex-layout/src/layouts_v2/flat.rs @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::dtype::DType; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; +use vortex_session::registry::ReadContext; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; +use crate::layout_v2::VTable; +use crate::layout_v2::metadata_bytes_field; +use crate::scan::v2::layouts::flat as scan_flat; +use crate::segments::SegmentId; + +/// V2 flat layout vtable. +#[derive(Clone, Debug)] +pub struct Flat; + +/// V2 flat layout data. 
+#[derive(Clone, Debug)] +pub struct FlatData { + pub(crate) segment_id: SegmentId, + pub(crate) array_ctx: ReadContext, + pub(crate) array_tree: Option, +} + +impl FlatData { + /// Returns the serialized array segment ID. + pub fn segment_id(&self) -> SegmentId { + self.segment_id + } + + /// Returns the array read context. + pub fn array_ctx(&self) -> &ReadContext { + &self.array_ctx + } + + /// Returns the optional inline array encoding tree. + pub fn array_tree(&self) -> Option<&ByteBuffer> { + self.array_tree.as_ref() + } +} + +impl VTable for Flat { + type LayoutData = FlatData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.flat") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + vortex_ensure!( + args.segment_ids.len() == 1, + "Flat layout must have exactly one segment ID" + ); + Ok(FlatData { + segment_id: args.segment_ids[0], + array_ctx: args.array_ctx.clone(), + array_tree: metadata_bytes_field(args.metadata, 1)?.map(ByteBuffer::from), + }) + } + + fn child_dtype(_layout: Layout, idx: usize) -> VortexResult { + vortex_bail!("Flat layout has no child {idx}") + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + vortex_bail!("Flat layout has no child {idx}") + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, + ) -> VortexResult { + scan_flat::new_scan_plan(layout, req, ctx) + } +} diff --git a/vortex-layout/src/layouts_v2/mod.rs b/vortex-layout/src/layouts_v2/mod.rs new file mode 100644 index 00000000000..22d2fd76fdb --- /dev/null +++ b/vortex-layout/src/layouts_v2/mod.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Built-in v2 layout vtables. 
+ +pub mod chunked; +pub mod dict; +pub mod flat; +pub mod struct_; +pub mod zoned; diff --git a/vortex-layout/src/layouts_v2/struct_.rs b/vortex-layout/src/layouts_v2/struct_.rs new file mode 100644 index 00000000000..077d4c330b5 --- /dev/null +++ b/vortex-layout/src/layouts_v2/struct_.rs @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::DeserializeMetadata; +use vortex_array::EmptyMetadata; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; +use crate::layout_v2::VTable; +use crate::scan::v2::layouts::struct_ as scan_struct; + +/// V2 struct layout vtable. +#[derive(Clone, Debug)] +pub struct Struct; + +impl VTable for Struct { + type LayoutData = (); + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.struct") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + EmptyMetadata::deserialize(args.metadata)?; + Ok(()) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + let schema_index = if layout.dtype().is_nullable() { + idx.saturating_sub(1) + } else { + idx + }; + if idx == 0 && layout.dtype().is_nullable() { + Ok(DType::Bool(Nullability::NonNullable)) + } else { + layout + .dtype() + .as_struct_fields_opt() + .and_then(|fields| fields.field_by_index(schema_index)) + .ok_or_else(|| vortex_err!("Missing struct field {schema_index}")) + } + } + + fn child_type(layout: Layout, idx: usize) -> VortexResult { + let schema_index = if layout.dtype().is_nullable() { + idx.saturating_sub(1) + } else { + idx + }; + if idx == 0 && layout.dtype().is_nullable() { + 
Ok(LayoutChildType::Auxiliary("validity".into())) + } else { + let name = layout + .dtype() + .as_struct_fields_opt() + .and_then(|fields| fields.field_name(schema_index)) + .ok_or_else(|| vortex_err!("Missing struct field {schema_index}"))?; + Ok(LayoutChildType::Field(name.clone())) + } + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, + ) -> VortexResult { + scan_struct::new_scan_plan(layout, req, ctx) + } +} diff --git a/vortex-layout/src/layouts_v2/zoned.rs b/vortex-layout/src/layouts_v2/zoned.rs new file mode 100644 index 00000000000..596b488404b --- /dev/null +++ b/vortex-layout/src/layouts_v2/zoned.rs @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use vortex_array::DeserializeMetadata; +use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::dtype::DType; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; +use crate::layout_v2::VTable; +use crate::layouts::zoned::LegacyStatsMetadata; +use crate::layouts::zoned::ZoneMapSchema; +use crate::layouts::zoned::ZonedMetadata; +use crate::layouts::zoned::aggregate_fns_from_specs; +use crate::layouts::zoned::aggregate_stats_table_dtype; +use crate::layouts::zoned::legacy_stats_table_dtype; +use crate::scan::v2::layouts::zoned as scan_zoned; + +/// V2 zoned layout vtable. +#[derive(Clone, Debug)] +pub struct Zoned; + +/// V2 legacy stats layout vtable. +#[derive(Clone, Debug)] +pub struct LegacyStats; + +/// V2 zoned layout data. 
+#[derive(Clone, Debug)]
+pub struct ZonedData {
+    /// Configured zone length, taken from the deserialized layout metadata.
+    pub(crate) zone_len: usize,
+    /// Schema of the zone table: either legacy stats or aggregate functions.
+    pub(crate) zone_map_schema: ZoneMapSchema,
+    /// Aggregate functions stored in the zone table.
+    pub(crate) aggregate_fns: Arc<[AggregateFnRef]>,
+}
+
+impl ZonedData {
+    /// Returns the configured zone length.
+    pub fn zone_len(&self) -> usize {
+        self.zone_len
+    }
+
+    /// Returns the aggregate functions stored in the zone table.
+    pub fn aggregate_fns(&self) -> &Arc<[AggregateFnRef]> {
+        &self.aggregate_fns
+    }
+
+    /// Returns the zone-map schema used by the zone table.
+    pub(crate) fn zone_map_schema(&self) -> &ZoneMapSchema {
+        &self.zone_map_schema
+    }
+
+    /// Computes the dtype of the zone-table child for a data column of `dtype`.
+    ///
+    /// Dispatches on the schema variant: legacy stats go through
+    /// `legacy_stats_table_dtype`, aggregate functions through
+    /// `aggregate_stats_table_dtype`.
+    fn stats_table_dtype(&self, dtype: &DType) -> DType {
+        match &self.zone_map_schema {
+            ZoneMapSchema::LegacyStats(stats) => legacy_stats_table_dtype(dtype, stats),
+            ZoneMapSchema::AggregateFns(aggregate_fns) => {
+                aggregate_stats_table_dtype(dtype, aggregate_fns)
+            }
+        }
+    }
+}
+
+impl VTable for Zoned {
+    type LayoutData = ZonedData;
+
+    /// Layout id of the zoned layout: `vortex.zoned`.
+    fn id(&self) -> LayoutId {
+        LayoutId::new("vortex.zoned")
+    }
+
+    /// Decodes `ZonedMetadata` from `args.metadata` and resolves its aggregate
+    /// specs against `args.session`.
+    ///
+    /// Fails if the metadata cannot be deserialized or the aggregate specs
+    /// cannot be resolved.
+    fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult {
+        let metadata = ZonedMetadata::deserialize(args.metadata)?;
+        let aggregate_fns = aggregate_fns_from_specs(&metadata.aggregate_specs, args.session)?;
+        Ok(ZonedData {
+            zone_len: metadata.zone_len as usize,
+            // The schema and the `aggregate_fns` field share one `Arc`.
+            zone_map_schema: ZoneMapSchema::AggregateFns(Arc::clone(&aggregate_fns)),
+            aggregate_fns,
+        })
+    }
+
+    /// Dtype of child `idx`: child 0 has the layout's own dtype, child 1 has
+    /// the zone-table dtype. Any other index is an error.
+    fn child_dtype(layout: Layout, idx: usize) -> VortexResult {
+        match idx {
+            0 => Ok(layout.dtype().clone()),
+            1 => Ok(layout.data().stats_table_dtype(layout.dtype())),
+            _ => vortex_bail!("Zoned child index out of bounds: {idx}"),
+        }
+    }
+
+    /// Role of child `idx`: child 0 is the transparent `data` child, child 1
+    /// is the auxiliary `zones` child. Any other index is an error.
+    fn child_type(_layout: Layout, idx: usize) -> VortexResult {
+        match idx {
+            0 => Ok(LayoutChildType::Transparent("data".into())),
+            1 => Ok(LayoutChildType::Auxiliary("zones".into())),
+            _ => vortex_bail!("Zoned child index out of bounds: {idx}"),
+        }
+    }
+
+    /// Delegates scan planning to `scan_zoned::new_scan_plan`.
+    fn new_scan_plan(
+        layout: Layout,
+        req: &mut ScanRequest,
+        ctx: &LayoutScanPlanCtx,
+    ) -> VortexResult {
+        scan_zoned::new_scan_plan(layout, req, ctx)
+    }
+}
+
+impl VTable for LegacyStats {
+    type LayoutData = ZonedData;
+
+    /// Layout id of the legacy stats layout: `vortex.stats`.
+    fn id(&self) -> LayoutId {
+        LayoutId::new("vortex.stats")
+    }
+
+    /// Decodes `LegacyStatsMetadata` from `args.metadata` and derives the
+    /// aggregate functions from its zone-map schema.
+    ///
+    /// Fails if the metadata cannot be deserialized.
+    fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult {
+        let metadata = LegacyStatsMetadata::deserialize(args.metadata)?;
+        let aggregate_fns = match &metadata.zone_map_schema {
+            // Only stats for which `aggregate_fn()` yields a function are
+            // kept; the rest are dropped by `filter_map`.
+            ZoneMapSchema::LegacyStats(stats) => stats
+                .iter()
+                .filter_map(|stat| stat.aggregate_fn())
+                .collect::>()
+                .into(),
+            ZoneMapSchema::AggregateFns(aggregate_fns) => Arc::clone(aggregate_fns),
+        };
+        Ok(ZonedData {
+            zone_len: metadata.zone_len as usize,
+            zone_map_schema: metadata.zone_map_schema,
+            aggregate_fns,
+        })
+    }
+
+    /// Dtype of child `idx`: child 0 has the layout's own dtype, child 1 has
+    /// the zone-table dtype. Any other index is an error.
+    fn child_dtype(layout: Layout, idx: usize) -> VortexResult {
+        match idx {
+            0 => Ok(layout.dtype().clone()),
+            1 => Ok(layout.data().stats_table_dtype(layout.dtype())),
+            _ => vortex_bail!("Legacy stats child index out of bounds: {idx}"),
+        }
+    }
+
+    /// Role of child `idx`: child 0 is the transparent `data` child, child 1
+    /// is the auxiliary `zones` child. Any other index is an error.
+    fn child_type(_layout: Layout, idx: usize) -> VortexResult {
+        match idx {
+            0 => Ok(LayoutChildType::Transparent("data".into())),
+            1 => Ok(LayoutChildType::Auxiliary("zones".into())),
+            _ => vortex_bail!("Legacy stats child index out of bounds: {idx}"),
+        }
+    }
+
+    /// Delegates scan planning to `scan_zoned::new_scan_plan`, the same
+    /// planner the `Zoned` vtable uses.
+    fn new_scan_plan(
+        layout: Layout,
+        req: &mut ScanRequest,
+        ctx: &LayoutScanPlanCtx,
+    ) -> VortexResult {
+        scan_zoned::new_scan_plan(layout, req, ctx)
+    }
+}
diff --git a/vortex-layout/src/lib.rs b/vortex-layout/src/lib.rs
index 6612f5e0350..e5e9eaae8b2 100644
--- a/vortex-layout/src/lib.rs
+++ b/vortex-layout/src/lib.rs
@@ -14,7 +14,10 @@
 //! Scanning is built with [`scan::scan_builder::ScanBuilder`]. It accepts a projection expression,
 //! optional filter, optional row range, [`Selection`](vortex_scan::selection::Selection), split
 //! strategy, and task concurrency settings, then produces array streams or iterators.
+ +pub mod layout_v2; pub mod layouts; +pub mod layouts_v2; pub use children::*; pub use encoding::*; diff --git a/vortex-layout/src/scan/mod.rs b/vortex-layout/src/scan/mod.rs index 98fd1918a42..ab003641eb5 100644 --- a/vortex-layout/src/scan/mod.rs +++ b/vortex-layout/src/scan/mod.rs @@ -12,6 +12,7 @@ mod splits; mod tasks; #[cfg(test)] mod test; +pub mod v2; /// A heuristic for an ideal split size. /// diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs new file mode 100644 index 00000000000..125947de9ef --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -0,0 +1,1143 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scan2 vtable support for chunked layouts. +//! +//! Chunks stay *lazy*: children are resolved from the footer and expanded +//! through their own layout scan vtables per request, never pre-planned. Chunked is +//! therefore a lazy pushdown boundary: pushed expressions are recorded +//! once, then replayed into each concrete child only when a read, +//! evidence request, or aggregate touches that chunk. This lets +//! child-local layouts such as zoned, dictionary, or index wrappers keep +//! their scan behavior without expanding every chunk up front. +//! +//! The selected read path is where chunking pays off (plan 017 SP5): a +//! chunk whose selection slice is empty is skipped outright — its node is +//! never expanded, its state never created, its segments never fetched. 
+ +use std::fmt; +use std::ops::Range; +use std::sync::Arc; +#[cfg(debug_assertions)] +use std::sync::atomic::AtomicU64; +#[cfg(debug_assertions)] +use std::sync::atomic::Ordering; + +use futures::future::BoxFuture; +use parking_lot::Mutex; +use rustc_hash::FxHashMap; +use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::arrays::ChunkedArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::dtype::DType; +use vortex_array::expr::Expression; +use vortex_array::expr::is_root; +use vortex_array::expr::root; +use vortex_array::scalar::Scalar; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_scan::plan::AggregateAnswer; +use vortex_scan::plan::DeferredReadTask; +use vortex_scan::plan::EvidenceStep; +use vortex_scan::plan::EvidenceTask; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedAggregate; +use vortex_scan::plan::PreparedAggregateRef; +use vortex_scan::plan::PreparedEvidence; +use vortex_scan::plan::PreparedEvidenceRef; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateCacheRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; +use vortex_scan::plan::ReadStep; +use vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::default_try_push_expr; +use vortex_scan::plan::downcast_state; +use vortex_scan::plan::evidence::EvidenceFragment; +use vortex_scan::plan::request::EvidenceMode; +use vortex_scan::plan::request::EvidenceRequest; +use vortex_scan::plan::request::OwnedEvidenceRequest; +use vortex_scan::plan::request::ScanRequest; 
+use vortex_scan::read::ReadResults;
+use vortex_scan::read::ScanIoPhase;
+use vortex_session::VortexSession;
+
+use crate::layout_v2::Layout;
+use crate::layout_v2::LayoutRef;
+use crate::layout_v2::LayoutScanPlanCtx;
+use crate::layouts_v2::chunked::Chunked;
+
+/// Builds the root scan plan for a chunked layout.
+///
+/// The plan copies the chunk offsets out of the layout data, clones the
+/// planning context, and starts with an empty child-plan cache. The request
+/// (`_req`) is not consulted here.
+pub(crate) fn new_scan_plan(
+    layout: Layout,
+    _req: &mut ScanRequest,
+    ctx: &LayoutScanPlanCtx,
+) -> VortexResult {
+    Ok(Arc::new(ChunkedScanPlan {
+        layout: layout.to_layout(),
+        offsets: layout.data().chunk_offsets().to_vec(),
+        ctx: ctx.clone(),
+        children: Mutex::new(FxHashMap::default()),
+    }))
+}
+
+/// Reads a chunked layout: cumulative chunk offsets
+/// (`offsets.len() == chunks + 1`), with chunk children expanded lazily
+/// through their own layout vtables.
+pub struct ChunkedScanPlan {
+    /// The chunked layout whose children are expanded on demand.
+    layout: LayoutRef,
+    /// Cumulative chunk offsets; chunk `i` covers rows
+    /// `offsets[i]..offsets[i + 1]`.
+    offsets: Vec,
+    /// Planning context handed to each child when its scan plan is created.
+    ctx: LayoutScanPlanCtx,
+    /// Lazily expanded chunk nodes, shared across queries.
+    children: Mutex>,
+}
+
+/// Per-query states of the lazily expanded chunk nodes. Chunk states
+/// behind the scan's morsel frontier are dropped by
+/// [`ScanPlan::release`], so a long scan retains the working set, not
+/// every chunk it touched.
+#[derive(Default)]
+pub struct ChunkedScanState {
+    /// Prepared reads of the chunks touched so far, keyed by chunk index.
+    reads: Mutex>,
+    /// Per-chunk prepared-state caches, keyed by chunk index, so every
+    /// prepare call for the same chunk reuses one cache.
+    child_state_caches: Mutex>,
+    /// Every chunk whose state was ever created (never cleared by
+    /// release), for read-avoidance tests.
+    #[cfg(any(test, debug_assertions))]
+    created: Mutex>,
+    /// Highest released frontier, for the debug no-read-behind check.
+    #[cfg(debug_assertions)]
+    released: AtomicU64,
+}
+
+/// A pushed expression over a chunked layout.
+///
+/// Chunk children remain lazy: this node records the expression once and
+/// replays expression pushdown into each concrete child only when a read,
+/// evidence request, or aggregate touches that chunk.
+pub struct ChunkedExprScanPlan {
+    /// The underlying chunked plan that supplies offsets and child plans.
+    chunked: Arc,
+    /// Expression pushed into each child the first time that chunk is used.
+    expr: Expression,
+    /// Dtype this node reports from `ScanPlan::dtype`.
+    dtype: DType,
+    /// Pushed child plans keyed by chunk index, filled on first use.
+    children: Mutex>,
+}
+
+/// Per-query states of lazily pushed chunk children.
+pub struct ChunkedExprScanState { + chunked: Arc, + reads: Mutex>, + #[cfg(debug_assertions)] + released: AtomicU64, +} + +struct ChunkedPreparedEvidence { + node: Arc, + state: Arc, + session: VortexSession, +} + +struct ChunkedEvidenceTask { + evidence: Arc, + req: OwnedEvidenceRequest, + phase: ScanIoPhase, +} + +enum ChunkedAggregateNode { + Root(Arc), + Expr(Arc), +} + +struct ChunkedPreparedAggregate { + node: ChunkedAggregateNode, + chunked_state: Arc, + dtype: DType, + funcs: Vec, +} + +struct ChunkedPreparedRead { + node: Arc, + state: Arc, +} + +struct ChunkedExprPreparedRead { + node: Arc, + state: Arc, +} + +enum ChunkedReadPart { + Ready(ArrayRef), + Pending { + expected_len: usize, + task: Box, + }, +} + +struct ChunkedReadTask { + dtype: DType, + parts: Vec, +} + +impl ReadTask for ChunkedReadTask { + fn into_step(self: Box) -> VortexResult { + let Self { dtype, parts } = *self; + let mut step_parts = Vec::with_capacity(parts.len()); + let mut continuations = Vec::new(); + let mut required_reads = Vec::new(); + let mut prefetch_reads = Vec::new(); + for part in parts { + match part { + ChunkedReadPart::Ready(array) => step_parts.push(ChunkedReadPart::Ready(array)), + ChunkedReadPart::Pending { expected_len, task } => { + let step = task.into_step()?; + required_reads.extend(step.required_reads); + prefetch_reads.extend(step.prefetch_reads); + continuations.push((step_parts.len(), expected_len, step.continuation)); + step_parts.push(ChunkedReadPart::Pending { + expected_len, + task: Box::new(DeferredReadTask), + }); + } + } + } + Ok(ReadStep::new( + required_reads, + prefetch_reads, + move |io, local, results| { + let mut parts = step_parts; + let mut pending = false; + for (idx, expected_len, continuation) in continuations { + match continuation.run(io, local, results.clone())? 
{ + ReadTaskOutput::Ready(chunk) => { + if chunk.len() != expected_len { + vortex_bail!( + "scoped chunk read returned length {}, expected {}", + chunk.len(), + expected_len + ); + } + parts[idx] = ChunkedReadPart::Ready(chunk); + } + ReadTaskOutput::Continue(task) => { + parts[idx] = ChunkedReadPart::Pending { expected_len, task }; + pending = true; + } + } + } + if pending { + return Ok(ReadTaskOutput::Continue(Box::new(ChunkedReadTask { + dtype, + parts, + }))); + } + let mut arrays = parts + .into_iter() + .map(|part| match part { + ChunkedReadPart::Ready(array) => Ok(array), + ChunkedReadPart::Pending { .. } => { + vortex_bail!("chunked read part remained pending after step completion") + } + }) + .collect::>>()?; + let array = match arrays.len() { + 0 => vortex_bail!("chunked scoped read produced no parts"), + 1 => arrays.swap_remove(0), + _ => ChunkedArray::try_new(arrays, dtype)?.into_array(), + }; + Ok(ReadTaskOutput::Ready(array)) + }, + )) + } +} + +struct ChunkedEvidenceState { + chunked: Arc, + children: Mutex>>, + recheck_children: Mutex>>, +} + +#[derive(Default)] +struct ChunkedAggregateState { + children: Mutex>>, +} + +impl ChunkedScanState { + fn child_prepare_ctx(&self, idx: usize, session: &VortexSession) -> PrepareCtx { + if let Some(hit) = self.child_state_caches.lock().get(&idx) { + return PrepareCtx::with_state_cache(session.clone(), Arc::clone(hit)); + } + let cache = Default::default(); + let mut caches = self.child_state_caches.lock(); + let cache = Arc::clone(caches.entry(idx).or_insert(cache)); + PrepareCtx::with_state_cache(session.clone(), cache) + } + + /// The number of chunk states currently retained. + #[allow(dead_code)] + #[cfg(any(test, debug_assertions))] + pub fn retained_children(&self) -> usize { + self.reads.lock().len() + } + + /// Whether chunk `idx` was ever read this query (release does not + /// clear this). 
+ #[allow(dead_code)] + #[cfg(any(test, debug_assertions))] + pub fn touched(&self, idx: usize) -> bool { + self.created.lock().contains(&idx) + } +} + +impl ChunkedEvidenceState { + fn new(chunked: Arc) -> Self { + Self { + chunked, + children: Mutex::new(FxHashMap::default()), + recheck_children: Mutex::new(FxHashMap::default()), + } + } +} + +impl ChunkedScanPlan { + fn scan_state(&self, cx: &mut PrepareCtx) -> VortexResult> { + let key = + PreparedStateKey::new::(self as *const Self as *const () as usize); + cx.shared_state(key, || Ok(ChunkedScanState::default())) + } + + /// The scan plan for chunk `idx`, expanding it on first use. Lazy + /// expansion is independent of pushed predicate expressions. + fn child(&self, idx: usize) -> VortexResult { + if let Some(hit) = self.children.lock().get(&idx) { + return Ok(Arc::clone(hit)); + } + let mut req = ScanRequest::empty(); + let plan = self.layout.child(idx)?.new_scan_plan(&mut req, &self.ctx)?; + self.children.lock().insert(idx, Arc::clone(&plan)); + Ok(plan) + } + + /// The planned value read for chunk `idx`, creating it on first use. + fn child_read( + &self, + idx: usize, + state: &ChunkedScanState, + session: &VortexSession, + ) -> VortexResult { + if let Some(hit) = state.reads.lock().get(&idx) { + return Ok(Arc::clone(hit)); + } + let node = self.child(idx)?; + let mut cx = state.child_prepare_ctx(idx, session); + let read = node + .prepare_read(&mut cx)? 
+ .ok_or_else(|| vortex_err!("chunked child {idx} did not produce a prepared read"))?; + let mut reads = state.reads.lock(); + #[cfg(any(test, debug_assertions))] + state.created.lock().insert(idx); + Ok(Arc::clone(reads.entry(idx).or_insert(read))) + } + + fn first_chunk(&self, start: u64) -> usize { + self.offsets + .partition_point(|&offset| offset <= start) + .saturating_sub(1) + } +} + +impl ChunkedExprScanPlan { + fn new(chunked: Arc, expr: Expression, dtype: DType) -> Self { + Self { + chunked, + expr, + dtype, + children: Mutex::new(FxHashMap::default()), + } + } + + fn child(&self, idx: usize, session: &VortexSession) -> VortexResult { + if let Some(hit) = self.children.lock().get(&idx) { + return Ok(Arc::clone(hit)); + } + let child = self.chunked.child(idx)?; + let mut cx = PushCtx::new(session.clone()); + let pushed = child.try_push_expr(&self.expr, &mut cx)?.ok_or_else(|| { + vortex_err!( + "chunked child {idx} could not push expression {}", + self.expr + ) + })?; + let mut children = self.children.lock(); + Ok(Arc::clone(children.entry(idx).or_insert(pushed))) + } + + /// The planned value read for pushed chunk child `idx`. 
+ fn child_read( + &self, + idx: usize, + state: &ChunkedExprScanState, + session: &VortexSession, + ) -> VortexResult { + if let Some(hit) = state.reads.lock().get(&idx) { + return Ok(Arc::clone(hit)); + } + let node = self.child(idx, session)?; + let mut cx = state.chunked.child_prepare_ctx(idx, session); + let read = node.prepare_read(&mut cx)?.ok_or_else(|| { + vortex_err!("chunked expression child {idx} did not produce a prepared read") + })?; + let mut reads = state.reads.lock(); + Ok(Arc::clone(reads.entry(idx).or_insert(read))) + } +} + +impl ChunkedAggregateNode { + fn offsets(&self) -> &[u64] { + match self { + Self::Root(node) => &node.offsets, + Self::Expr(node) => &node.chunked.offsets, + } + } + + fn first_chunk(&self, start: u64) -> usize { + match self { + Self::Root(node) => node.first_chunk(start), + Self::Expr(node) => node.chunked.first_chunk(start), + } + } + + fn child(&self, idx: usize, io: &ReadContext) -> VortexResult { + match self { + Self::Root(node) => node.child(idx), + Self::Expr(node) => node.child(idx, io.session()), + } + } +} + +impl ChunkedPreparedAggregate { + fn child_plan( + &self, + idx: usize, + state: &ChunkedAggregateState, + io: &ReadContext, + ) -> VortexResult> { + if let Some(hit) = state.children.lock().get(&idx) { + return Ok(hit.clone()); + } + let child = self.node.child(idx, io)?; + let mut plan_ctx = self.chunked_state.child_prepare_ctx(idx, io.session()); + let planned = match child.prepare_aggregate_partial(&self.funcs, &mut plan_ctx)? 
{ + Some(plan) => { + let plan_state = plan.init_state(io.session())?; + Some((plan, plan_state)) + } + None => None, + }; + let mut children = state.children.lock(); + Ok(children.entry(idx).or_insert(planned).clone()) + } +} + +impl PreparedAggregate for ChunkedPreparedAggregate { + fn init_state(&self, _ctx: &VortexSession) -> VortexResult { + Ok(Arc::new(ChunkedAggregateState::default())) + } + + fn aggregate_partial<'a>( + &'a self, + range: Range, + io: &'a ReadContext, + state: &'a ScanState, + ) -> BoxFuture<'a, VortexResult>>> { + Box::pin(async move { + let state = downcast_state::(state)?; + if range.start >= range.end { + return Ok(None); + } + let mut accumulators = self + .funcs + .iter() + .map(|func| { + func.state_dtype(&self.dtype) + .map(|_| func.accumulator(&self.dtype)) + .transpose() + }) + .collect::>>()?; + let mut contributed = vec![false; self.funcs.len()]; + let mut covered = vec![false; self.funcs.len()]; + let mut residuals: Vec>> = vec![Vec::new(); self.funcs.len()]; + let push_residual = + |residual: &mut Vec>, span: Range| match residual.last_mut() { + Some(last) if last.end == span.start => last.end = span.end, + _ => residual.push(span), + }; + + let offsets = self.node.offsets(); + let mut idx = self.node.first_chunk(range.start); + while idx + 1 < offsets.len() && offsets[idx] < range.end { + let chunk_start = offsets[idx]; + let chunk_end = offsets[idx + 1]; + let local = range.start.saturating_sub(chunk_start) + ..(range.end.min(chunk_end) - chunk_start); + let answers = match self.child_plan(idx, state, io)? { + Some((plan, plan_state)) => { + plan.aggregate_partial(local.clone(), io, plan_state.as_ref()) + .await? 
+ } + None => None, + }; + match answers { + Some(answers) => { + for (func_idx, answer) in answers.into_iter().enumerate() { + let has_partial = answer.partial.is_some(); + let mut residual_rows = 0; + for span in answer.residual { + residual_rows += span.end - span.start; + push_residual( + &mut residuals[func_idx], + chunk_start + span.start..chunk_start + span.end, + ); + } + if let Some(partial) = answer.partial { + let Some(Some(acc)) = accumulators.get_mut(func_idx) else { + vortex_bail!("chunk answered an unsupported aggregate"); + }; + acc.combine_partials(partial)?; + contributed[func_idx] = true; + } + covered[func_idx] |= + has_partial || residual_rows < local.end - local.start; + } + } + None => { + for residual in residuals.iter_mut() { + push_residual( + residual, + chunk_start + local.start..chunk_start + local.end, + ); + } + } + } + idx += 1; + } + if !covered.iter().any(|&covered| covered) { + return Ok(None); + } + let mut answers = Vec::with_capacity(self.funcs.len()); + for ((accumulator, contributed), residual) in + accumulators.iter_mut().zip(contributed).zip(residuals) + { + let partial = match accumulator { + Some(acc) if contributed => Some(acc.flush()?), + _ => None, + }; + answers.push(AggregateAnswer { partial, residual }); + } + Ok(Some(answers)) + }) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked") + } +} + +impl ScanPlan for ChunkedScanPlan { + fn dtype(&self) -> &DType { + self.layout.dtype() + } + + fn row_count(&self) -> u64 { + self.layout.row_count() + } + + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(Arc::new(ChunkedScanState::default())) + } + + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let state = self.scan_state(cx)?; + Ok(Some(Arc::new(ChunkedPreparedRead { node: self, state }))) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + _cx: &mut PushCtx, + ) -> VortexResult> { + if is_root(expr) { + return 
Ok(Some(self)); + } + let dtype = expr.return_dtype(self.layout.dtype())?; + Ok(Some(Arc::new(ChunkedExprScanPlan::new( + self, + expr.clone(), + dtype, + )))) + } + + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + let node = Arc::new(ChunkedExprScanPlan::new( + Arc::clone(&self), + root(), + self.layout.dtype().clone(), + )); + let chunked_state = self.scan_state(cx)?; + let key = + PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let state = cx.shared_state(key, || Ok(ChunkedEvidenceState::new(chunked_state)))?; + Ok(vec![Arc::new(ChunkedPreparedEvidence { + node, + state, + session: cx.session().clone(), + })]) + } + + fn prepare_aggregate_partial( + self: Arc, + funcs: &[AggregateFnRef], + cx: &mut PrepareCtx, + ) -> VortexResult> { + if funcs.is_empty() { + return Ok(None); + } + let chunked_state = self.scan_state(cx)?; + Ok(Some(Arc::new(ChunkedPreparedAggregate { + node: ChunkedAggregateNode::Root(Arc::clone(&self)), + chunked_state, + dtype: self.layout.dtype().clone(), + funcs: funcs.to_vec(), + }))) + } + + fn split_hints(&self) -> Option<&[u64]> { + Some(&self.offsets) + } + + /// Drop chunk states wholly behind the frontier and recurse into the + /// boundary chunk so nested layouts release their own state. The + /// expanded chunk *nodes* stay: they are shared across queries and + /// hold no data. 
+ fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { + let state = downcast_state::(state)?; + state + .reads + .lock() + .retain(|&idx, _| self.offsets[idx + 1] > frontier); + state + .child_state_caches + .lock() + .retain(|&idx, _| self.offsets[idx + 1] > frontier); + let idx = self.first_chunk(frontier); + if idx + 1 < self.offsets.len() && self.offsets[idx] < frontier { + let child = state.reads.lock().get(&idx).cloned(); + if let Some(child) = child { + child.release(frontier - self.offsets[idx])?; + } + } + #[cfg(debug_assertions)] + state.released.fetch_max(frontier, Ordering::Relaxed); + Ok(()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked({})", self.offsets.len().saturating_sub(1)) + } +} + +impl PreparedRead for ChunkedPreparedRead { + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + phase: ScanIoPhase, + ) -> VortexResult> { + if range.start >= range.end { + vortex_bail!("empty chunked scoped read range"); + } + #[cfg(debug_assertions)] + { + let released = self.state.released.load(Ordering::Relaxed); + debug_assert!( + range.start >= released, + "chunked read {range:?} below the released frontier {released}" + ); + } + let range_len = usize::try_from(range.end - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let row_scope = rows.as_scope(); + if row_scope.selection.len() != range_len { + vortex_bail!( + "selection length {} does not match range length {range_len}", + row_scope.selection.len() + ); + } + if row_scope.demand.len() != range_len { + vortex_bail!( + "demand length {} does not match range length {range_len}", + row_scope.demand.len() + ); + } + if row_scope.selection.all_false() { + return Ok(Box::new(ChunkedReadTask { + dtype: self.node.layout.dtype().clone(), + parts: vec![ChunkedReadPart::Ready( + ConstantArray::new(Scalar::default_value(self.node.layout.dtype()), 0) + .into_array(), + )], + })); + } + + let dtype = 
self.node.layout.dtype().clone(); + let dense_scope = row_scope.selection.all_true() && row_scope.demand.all_true(); + let selected_scope = !dense_scope && row_scope.demands_all_selected(); + let mut parts = Vec::new(); + let mut idx = self.node.first_chunk(range.start); + while idx + 1 < self.node.offsets.len() && self.node.offsets[idx] < range.end { + let chunk_start = self.node.offsets[idx]; + let chunk_end = self.node.offsets[idx + 1]; + let local = + range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); + let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let chunk_selection = row_scope.selection.slice(sel_start..sel_end); + idx += 1; + if chunk_selection.all_false() { + continue; + } + let chunk_demand = row_scope.demand.slice(sel_start..sel_end); + if chunk_demand.all_false() { + parts.push(ChunkedReadPart::Ready( + ConstantArray::new(Scalar::default_value(&dtype), chunk_selection.true_count()) + .into_array(), + )); + continue; + } + let chunk_idx = idx - 1; + let read = self + .node + .child_read(chunk_idx, &self.state, self.node.ctx.session())?; + let chunk_rows = if dense_scope || selected_scope { + OwnedRowScope::selected(chunk_selection.clone()) + } else { + OwnedRowScope::try_new(chunk_selection.clone(), chunk_demand)? 
+ }; + let expected_len = chunk_selection.true_count(); + parts.push(ChunkedReadPart::Pending { + expected_len, + task: Arc::clone(&read).create_task(local, chunk_rows, phase)?, + }); + } + match parts.len() { + 0 => vortex_bail!("chunked scoped read range {range:?} out of bounds"), + _ => Ok(Box::new(ChunkedReadTask { dtype, parts })), + } + } + + fn release(&self, frontier: u64) -> VortexResult<()> { + self.node.release(frontier, &self.state) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +impl ScanPlan for ChunkedExprScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.chunked.layout.row_count() + } + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + let _ = cx; + Ok(Arc::new(ChunkedExprScanState { + chunked: Arc::new(ChunkedScanState::default()), + reads: Mutex::new(FxHashMap::default()), + #[cfg(debug_assertions)] + released: AtomicU64::new(0), + })) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + _cx: &mut PushCtx, + ) -> VortexResult> { + default_try_push_expr(self, expr) + } + + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let key = + PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let chunked = self.chunked.scan_state(cx)?; + let state = cx.shared_state(key, || { + Ok(ChunkedExprScanState { + chunked, + reads: Mutex::new(FxHashMap::default()), + #[cfg(debug_assertions)] + released: AtomicU64::new(0), + }) + })?; + Ok(Some(Arc::new(ChunkedExprPreparedRead { + node: self, + state, + }))) + } + + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + let key = + PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let chunked = self.chunked.scan_state(cx)?; + let state = cx.shared_state(key, || Ok(ChunkedEvidenceState::new(chunked)))?; + Ok(vec![Arc::new(ChunkedPreparedEvidence { + node: self, + state, + session: cx.session().clone(), + 
})]) + } + + fn prepare_aggregate_partial( + self: Arc, + funcs: &[AggregateFnRef], + cx: &mut PrepareCtx, + ) -> VortexResult> { + if funcs.is_empty() { + return Ok(None); + } + let chunked_state = self.chunked.scan_state(cx)?; + Ok(Some(Arc::new(ChunkedPreparedAggregate { + node: ChunkedAggregateNode::Expr(Arc::clone(&self)), + chunked_state, + dtype: self.dtype.clone(), + funcs: funcs.to_vec(), + }))) + } + + fn split_hints(&self) -> Option<&[u64]> { + Some(&self.chunked.offsets) + } + + fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { + let state = downcast_state::(state)?; + state + .reads + .lock() + .retain(|&idx, _| self.chunked.offsets[idx + 1] > frontier); + let idx = self.chunked.first_chunk(frontier); + if idx + 1 < self.chunked.offsets.len() && self.chunked.offsets[idx] < frontier { + let child = state.reads.lock().get(&idx).cloned(); + if let Some(child) = child { + child.release(frontier - self.chunked.offsets[idx])?; + } + } + #[cfg(debug_assertions)] + state.released.fetch_max(frontier, Ordering::Relaxed); + Ok(()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked_expr({})", self.expr) + } +} + +impl PreparedRead for ChunkedExprPreparedRead { + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + phase: ScanIoPhase, + ) -> VortexResult> { + if range.start >= range.end { + vortex_bail!("empty chunked scoped read range"); + } + #[cfg(debug_assertions)] + { + let released = self.state.released.load(Ordering::Relaxed); + debug_assert!( + range.start >= released, + "chunked expression read {range:?} below the released frontier {released}" + ); + } + let range_len = usize::try_from(range.end - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let row_scope = rows.as_scope(); + if row_scope.selection.len() != range_len { + vortex_bail!( + "selection length {} does not match range length {range_len}", + row_scope.selection.len() + ); + } + if 
row_scope.demand.len() != range_len { + vortex_bail!( + "demand length {} does not match range length {range_len}", + row_scope.demand.len() + ); + } + if row_scope.selection.all_false() { + return Ok(Box::new(ChunkedReadTask { + dtype: self.node.dtype.clone(), + parts: vec![ChunkedReadPart::Ready( + ConstantArray::new(Scalar::default_value(&self.node.dtype), 0).into_array(), + )], + })); + } + + let dense_scope = row_scope.selection.all_true() && row_scope.demand.all_true(); + let selected_scope = !dense_scope && row_scope.demands_all_selected(); + let mut parts = Vec::new(); + let mut idx = self.node.chunked.first_chunk(range.start); + while idx + 1 < self.node.chunked.offsets.len() + && self.node.chunked.offsets[idx] < range.end + { + let chunk_start = self.node.chunked.offsets[idx]; + let chunk_end = self.node.chunked.offsets[idx + 1]; + let local = + range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); + let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let chunk_selection = row_scope.selection.slice(sel_start..sel_end); + idx += 1; + if chunk_selection.all_false() { + continue; + } + let chunk_demand = row_scope.demand.slice(sel_start..sel_end); + if chunk_demand.all_false() { + parts.push(ChunkedReadPart::Ready( + ConstantArray::new( + Scalar::default_value(&self.node.dtype), + chunk_selection.true_count(), + ) + .into_array(), + )); + continue; + } + let chunk_idx = idx - 1; + let read = + self.node + .child_read(chunk_idx, &self.state, self.node.chunked.ctx.session())?; + let chunk_rows = if dense_scope || selected_scope { + OwnedRowScope::selected(chunk_selection.clone()) + } else { + OwnedRowScope::try_new(chunk_selection.clone(), chunk_demand)? 
+ }; + let expected_len = chunk_selection.true_count(); + parts.push(ChunkedReadPart::Pending { + expected_len, + task: Arc::clone(&read).create_task(local, chunk_rows, phase)?, + }); + } + match parts.len() { + 0 => vortex_bail!("chunked scoped read range {range:?} out of bounds"), + _ => Ok(Box::new(ChunkedReadTask { + dtype: self.node.dtype.clone(), + parts, + })), + } + } + + fn release(&self, frontier: u64) -> VortexResult<()> { + self.node.release(frontier, &self.state) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +impl PreparedEvidence for ChunkedPreparedEvidence { + fn evidence<'a>( + &'a self, + req: &'a EvidenceRequest<'a>, + io: &'a ReadContext, + results: ReadResults, + ) -> VortexResult> { + if req.range.start >= req.range.end { + return Ok(Vec::new()); + } + let mut fragments = Vec::new(); + let mut idx = self.node.chunked.first_chunk(req.range.start); + while idx + 1 < self.node.chunked.offsets.len() + && self.node.chunked.offsets[idx] < req.range.end + { + let chunk_start = self.node.chunked.offsets[idx]; + let chunk_end = self.node.chunked.offsets[idx + 1]; + let local = req.range.start.saturating_sub(chunk_start) + ..(req.range.end.min(chunk_end) - chunk_start); + let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; + let child_plans = if let Some(hit) = self.state.children.lock().get(&idx) { + hit.clone() + } else if recheck { + if let Some(hit) = self.state.recheck_children.lock().get(&idx) { + hit.clone() + } else { + let node = self.node.child(idx, io.session())?; + let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, io.session()); + let plans = node.prepare_evidence(&mut plan_ctx)?; + let planned = plans + .into_iter() + .filter(|plan| plan.recheck_before_projection()) + .collect::>(); + let mut children = self.state.recheck_children.lock(); + children.entry(idx).or_insert(planned).clone() + } + } else { + let node = self.node.child(idx, io.session())?; + 
let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, io.session()); + let planned = node.prepare_evidence(&mut plan_ctx)?; + let mut children = self.state.children.lock(); + children.entry(idx).or_insert(planned).clone() + }; + if !child_plans.is_empty() { + let child_req = EvidenceRequest { + id: req.id, + version: req.version, + predicate: req.predicate, + range: local, + mode: req.mode, + }; + for plan in child_plans { + if recheck && !plan.recheck_before_projection() { + continue; + } + for fragment in plan.evidence(&child_req, io, results.clone())? { + fragments.push(translate_fragment(fragment, chunk_start)); + } + } + } + idx += 1; + } + Ok(fragments) + } + + fn recheck_before_projection(&self) -> bool { + true + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked") + } + + fn create_task( + self: Arc, + req: OwnedEvidenceRequest, + phase: ScanIoPhase, + ) -> VortexResult> { + Ok(Box::new(ChunkedEvidenceTask { + evidence: self, + req, + phase, + })) + } +} + +impl EvidenceTask for ChunkedEvidenceTask { + fn into_step(self: Box) -> VortexResult { + let Self { + evidence, + req, + phase, + } = *self; + if req.range.start >= req.range.end { + return Ok(EvidenceStep::new( + Vec::new(), + Vec::new(), + move |io, results| evidence.evidence(&req.as_request(), io, results), + )); + } + + let mut required_reads = Vec::new(); + let mut prefetch_reads = Vec::new(); + let mut idx = evidence.node.chunked.first_chunk(req.range.start); + while idx + 1 < evidence.node.chunked.offsets.len() + && evidence.node.chunked.offsets[idx] < req.range.end + { + let chunk_start = evidence.node.chunked.offsets[idx]; + let chunk_end = evidence.node.chunked.offsets[idx + 1]; + let local = req.range.start.saturating_sub(chunk_start) + ..(req.range.end.min(chunk_end) - chunk_start); + let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; + let child_plans = if let Some(hit) = evidence.state.children.lock().get(&idx) { + 
hit.clone() + } else if recheck { + if let Some(hit) = evidence.state.recheck_children.lock().get(&idx) { + hit.clone() + } else { + let node = evidence.node.child(idx, &evidence.session)?; + let mut plan_ctx = evidence + .state + .chunked + .child_prepare_ctx(idx, &evidence.session); + let plans = node.prepare_evidence(&mut plan_ctx)?; + let planned = plans + .into_iter() + .filter(|plan| plan.recheck_before_projection()) + .collect::>(); + let mut children = evidence.state.recheck_children.lock(); + children.entry(idx).or_insert(planned).clone() + } + } else { + let node = evidence.node.child(idx, &evidence.session)?; + let mut plan_ctx = evidence + .state + .chunked + .child_prepare_ctx(idx, &evidence.session); + let planned = node.prepare_evidence(&mut plan_ctx)?; + let mut children = evidence.state.children.lock(); + children.entry(idx).or_insert(planned).clone() + }; + if !child_plans.is_empty() { + let child_req = OwnedEvidenceRequest { + id: req.id, + version: req.version, + predicate: req.predicate.clone(), + range: local, + mode: req.mode, + }; + for plan in child_plans { + if recheck && !plan.recheck_before_projection() { + continue; + } + let step = Arc::clone(&plan) + .create_task(child_req.clone(), phase)? 
+ .into_step()?; + required_reads.extend(step.required_reads); + prefetch_reads.extend(step.prefetch_reads); + } + } + idx += 1; + } + Ok(EvidenceStep::new( + required_reads, + prefetch_reads, + move |io, results| evidence.evidence(&req.as_request(), io, results), + )) + } +} + +fn translate_fragment(mut fragment: EvidenceFragment, offset: u64) -> EvidenceFragment { + fragment.rows = fragment.rows.start + offset..fragment.rows.end + offset; + fragment +} diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs new file mode 100644 index 00000000000..e8273e1b226 --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -0,0 +1,1168 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scan2 vtable support for dictionary layouts. +//! +//! Value reads use the dictionary value domain: values read once per query and +//! cached, codes read per range (selection-aware), the pair rebuilt as a lazy +//! `DictArray`. Pushed dictionary expressions also try to evaluate the +//! expression over the dictionary values once per query, then reuse the +//! resulting value-domain array with per-range codes. +//! +//! Dictionary predicate evidence is intentionally absent for now. Without +//! zone maps or indexes, reading dictionary values speculatively can cost +//! more than it proves; exact row-domain predicate work owns the codes read. 
+ +use std::fmt; +use std::ops::Range; +use std::sync::Arc; + +use parking_lot::Mutex; +use rustc_hash::FxHashMap; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::NativePType; +use vortex_array::expr::Expression; +use vortex_array::expr::is_root; +use vortex_array::match_each_integer_ptype; +use vortex_array::optimizer::ArrayOptimizer; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_mask::AllOr; +use vortex_mask::Mask; +use vortex_scan::plan::DeferredReadTask; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadStep; +use vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::RowScope; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::default_try_push_expr; +use vortex_scan::plan::request::ScanRequest; +use vortex_scan::read::ScanIoPhase; + +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutScanPlanCtx; +use crate::layouts_v2::dict::Dict; + +const DENSE_REMAP_MAX_VALUES: usize = 1 << 20; +const DENSE_REMAP_VALUES_PER_CODE: usize = 4; +const UNREFERENCED_VALUE: usize = usize::MAX; + +pub(crate) fn new_scan_plan( + layout: Layout, + _req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, +) -> VortexResult { + let values = layout.child(0)?; + let codes = layout.child(1)?; + Ok(Arc::new(DictScanPlan { + values_len: 
values.row_count(), + dtype: layout.dtype().clone(), + row_count: layout.row_count(), + // Values and codes live in other row domains. + values: values.new_scan_plan(&mut ScanRequest::empty(), ctx)?, + codes: codes.new_scan_plan(&mut ScanRequest::empty(), ctx)?, + })) +} + +/// Reads a dict layout: shared values (another row domain, read once per +/// query) plus a codes chain in this node's row domain. +pub struct DictScanPlan { + values: ScanPlanRef, + values_len: u64, + codes: ScanPlanRef, + dtype: DType, + row_count: u64, +} + +/// Per-query dictionary caches for value-domain expression results. +#[derive(Clone)] +pub struct DictScanState { + shared: DictSharedState, +} + +#[derive(Clone)] +struct DictSharedState { + value_exprs: Arc>>>, +} + +impl DictScanState { + fn new() -> Self { + Self { + shared: DictSharedState::default(), + } + } +} + +impl Default for DictSharedState { + fn default() -> Self { + Self { + value_exprs: Arc::new(Mutex::new(FxHashMap::default())), + } + } +} + +/// A pushed scalar expression over a dictionary value. +struct DictExprScanPlan { + dict: Arc, + expr: Expression, + dtype: DType, +} + +struct DictPreparedRead { + node: Arc, + values_read: PreparedReadRef, + codes_read: PreparedReadRef, +} + +struct DictExprPreparedRead { + node: Arc, + state: Arc, + values_read: PreparedReadRef, + codes_read: PreparedReadRef, +} + +fn value_expr_is_expensive(expr: &Expression) -> bool { + // TODO: Move this cost classification onto ScalarFnVTable instead of matching function IDs + // here. 
+ matches!( + expr.id().as_str(), + "vortex.like" + | "vortex.list.contains" + | "vortex.dynamic" + | "vortex.variant_get" + | "vortex.parquet.variant" + ) || expr.children().iter().any(value_expr_is_expensive) +} + +fn sparse_dict_candidate(values_len: u64, rows: RowScope<'_>) -> bool { + rows.demands_all_selected() + && !rows.selection.all_true() + && rows.selection.density() < 0.5 + && matches!( + usize::try_from(values_len), + Ok(values_len) if values_len > rows.demand.true_count() + ) +} + +fn sparse_value_expr_candidate(expr: &Expression, values_len: u64, rows: RowScope<'_>) -> bool { + sparse_dict_candidate(values_len, rows) && value_expr_is_expensive(expr) +} + +fn value_expr_candidate(expr: &Expression, values_len: u64, rows: RowScope<'_>) -> bool { + if sparse_value_expr_candidate(expr, values_len, rows) { + return false; + } + if !value_expr_is_expensive(expr) { + return true; + } + + let Ok(values_len) = usize::try_from(values_len) else { + return false; + }; + let demand = rows.demand.true_count(); + // Dense scans will usually touch every morsel in this dictionary. Since value-domain + // expressions are cached per DictScanState, allow a small amount of look-ahead instead of + // repeatedly evaluating expensive predicates over decoded row values. + values_len <= demand + || (rows.selection.all_true() + && rows.demand.all_true() + && values_len <= demand.saturating_mul(4)) +} + +impl DictScanPlan { + fn build_dict(&self, codes: ArrayRef, values: ArrayRef) -> VortexResult { + // SAFETY: the codes and values children come from a validated dictionary layout. 
+ Ok(unsafe { DictArray::new_unchecked(codes, values) }.into_array()) + } +} + +impl ScanPlan for DictScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.row_count + } + + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(Arc::new(DictScanState::new())) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + _cx: &mut PushCtx, + ) -> VortexResult> { + if is_root(expr) { + Ok(Some(self)) + } else { + let dtype = expr.return_dtype(&self.dtype)?; + Ok(Some(Arc::new(DictExprScanPlan { + dict: self, + expr: expr.clone(), + dtype, + }))) + } + } + + fn split_hints(&self) -> Option<&[u64]> { + self.codes.split_hints() + } + + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let values_read = Arc::clone(&self.values) + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("dictionary values did not produce a prepared read"))?; + let codes_read = Arc::clone(&self.codes) + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("dictionary codes did not produce a prepared read"))?; + Ok(Some(Arc::new(DictPreparedRead { + node: self, + values_read, + codes_read, + }))) + } + + /// Codes live in this node's row domain and release with it. 
+ fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { + let _ = (frontier, state); + Ok(()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "dict(")?; + self.codes.fmt_chain(f)?; + write!(f, ")") + } +} + +impl ScanPlan for DictExprScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.dict.row_count + } + + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(Arc::new(DictScanState::new())) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + _cx: &mut PushCtx, + ) -> VortexResult> { + default_try_push_expr(self, expr) + } + + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let key = + PreparedStateKey::new::(Arc::as_ptr(&self.dict) as *const () as usize); + let state = cx.shared_state(key, || Ok(DictScanState::new()))?; + let values_read = Arc::clone(&self.dict.values) + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("dictionary values did not produce a prepared read"))?; + let codes_read = Arc::clone(&self.dict.codes) + .prepare_read(cx)? 
+ .ok_or_else(|| vortex_err!("dictionary codes did not produce a prepared read"))?; + Ok(Some(Arc::new(DictExprPreparedRead { + node: self, + state, + values_read, + codes_read, + }))) + } + + fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { + self.dict.release(frontier, state) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "dict_expr({})", self.expr) + } +} + +enum DictReadState { + Start, + SparseValues { + compact_codes: ArrayRef, + values: Option>, + }, + FullValues { + codes: ArrayRef, + values: Option>, + }, +} + +struct DictReadTask { + read: Arc, + codes: Box, + phase: ScanIoPhase, + state: DictReadState, +} + +impl ReadTask for DictReadTask { + fn into_step(self: Box) -> VortexResult { + let task = *self; + match task.state { + DictReadState::Start => { + let DictReadTask { + read, + codes, + phase, + state: _, + } = task; + let codes_step = codes.into_step()?; + let values_prefetch_step = + DictReadTask::create_full_values_task_for(&read, phase)?.into_step()?; + let mut prefetch_reads = codes_step.prefetch_reads; + prefetch_reads.extend(values_prefetch_step.required_reads); + prefetch_reads.extend(values_prefetch_step.prefetch_reads); + Ok(ReadStep::new( + codes_step.required_reads, + prefetch_reads, + move |io, local, results| match codes_step + .continuation + .run(io, local, results)? 
+ { + ReadTaskOutput::Continue(codes) => { + Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { + read, + codes, + phase, + state: DictReadState::Start, + }))) + } + ReadTaskOutput::Ready(codes) => { + let mut task = DictReadTask { + read, + codes: Box::new(DeferredReadTask), + phase, + state: DictReadState::Start, + }; + let rows = OwnedRowScope::selected(Mask::new_true(codes.len())); + if sparse_dict_candidate(task.read.node.values_len, rows.as_scope()) { + let values_len = usize::try_from(task.read.node.values_len) + .map_err(|_| { + vortex_err!("dictionary values length exceeds usize") + })?; + if let Some((compact_codes, value_selection)) = + compact_codes_and_value_selection( + codes.clone(), + values_len, + local, + )? + { + let values = task + .create_values_task(RowScope::selected(&value_selection))?; + task.state = DictReadState::SparseValues { + compact_codes, + values: Some(values), + }; + return Ok(ReadTaskOutput::Continue(Box::new(task))); + } + } + let values = task.create_full_values_task()?; + task.state = DictReadState::FullValues { + codes, + values: Some(values), + }; + Ok(ReadTaskOutput::Continue(Box::new(task))) + } + }, + )) + } + DictReadState::SparseValues { + compact_codes, + mut values, + } => { + let values_task = values.take().ok_or_else(|| { + vortex_err!("dictionary sparse values task was not initialized") + })?; + let values_step = values_task.into_step()?; + let read = task.read; + let phase = task.phase; + Ok(ReadStep::new( + values_step.required_reads, + values_step.prefetch_reads, + move |io, local, results| match values_step + .continuation + .run(io, local, results)? 
+ { + ReadTaskOutput::Continue(values) => { + Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { + read, + codes: Box::new(DeferredReadTask), + phase, + state: DictReadState::SparseValues { + compact_codes, + values: Some(values), + }, + }))) + } + ReadTaskOutput::Ready(values) => Ok(ReadTaskOutput::Ready( + read.node.build_dict(compact_codes, values)?.optimize()?, + )), + }, + )) + } + DictReadState::FullValues { codes, mut values } => { + let values_task = values.take().ok_or_else(|| { + vortex_err!("dictionary full values task was not initialized") + })?; + let values_step = values_task.into_step()?; + let read = task.read; + let phase = task.phase; + Ok(ReadStep::new( + values_step.required_reads, + values_step.prefetch_reads, + move |io, local, results| match values_step + .continuation + .run(io, local, results)? + { + ReadTaskOutput::Continue(values) => { + Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { + read, + codes: Box::new(DeferredReadTask), + phase, + state: DictReadState::FullValues { + codes, + values: Some(values), + }, + }))) + } + ReadTaskOutput::Ready(values) => Ok(ReadTaskOutput::Ready( + read.node.build_dict(codes, values)?.optimize()?, + )), + }, + )) + } + } + } +} +impl DictReadTask { + fn create_values_task(&mut self, rows: RowScope<'_>) -> VortexResult> { + Self::create_values_task_for(&self.read, self.phase, rows) + } + + fn create_values_task_for( + read: &Arc, + phase: ScanIoPhase, + rows: RowScope<'_>, + ) -> VortexResult> { + let range = 0..read.node.values_len; + let owned_rows = OwnedRowScope::try_new(rows.selection.clone(), rows.demand.clone())?; + Arc::clone(&read.values_read).create_task(range, owned_rows, phase) + } + + fn create_full_values_task(&mut self) -> VortexResult> { + Self::create_full_values_task_for(&self.read, self.phase) + } + + fn create_full_values_task_for( + read: &Arc, + phase: ScanIoPhase, + ) -> VortexResult> { + let values_selection = Mask::new_true( + usize::try_from(read.node.values_len) + 
.map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, + ); + Self::create_values_task_for(read, phase, RowScope::selected(&values_selection)) + } +} + +impl PreparedRead for DictPreparedRead { + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + phase: ScanIoPhase, + ) -> VortexResult> { + Ok(Box::new(DictReadTask { + codes: Arc::clone(&self.codes_read).create_task(range, rows, phase)?, + read: self, + phase, + state: DictReadState::Start, + })) + } + + fn release(&self, frontier: u64) -> VortexResult<()> { + self.codes_read.release(frontier) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +fn compact_codes_and_value_selection( + codes: ArrayRef, + values_len: usize, + local: &mut ExecutionCtx, +) -> VortexResult> { + let codes = codes.execute::(local)?; + let validity = codes.validity()?; + let valid = validity.execute_mask(codes.len(), local)?; + if valid.all_false() { + return Ok(None); + } + + match_each_integer_ptype!(codes.ptype(), |Code| { + compact_codes_and_value_selection_typed::( + codes.as_slice::(), + validity, + &valid, + values_len, + ) + }) +} + +fn compact_codes_and_value_selection_typed( + codes: &[Code], + validity: Validity, + valid: &Mask, + values_len: usize, +) -> VortexResult> +where + Code: NativePType + TryFrom, + usize: TryFrom, +{ + if use_dense_value_rank_map(codes.len(), valid.true_count(), values_len) { + return compact_codes_and_value_selection_dense(codes, validity, valid, values_len); + } + + let referenced = referenced_values(codes, valid, values_len)?; + if referenced.is_empty() || referenced.len() == values_len { + return Ok(None); + } + + let compact = remap_codes(codes, valid, values_len, &referenced)?; + let value_selection = Mask::from_indices(values_len, referenced); + let compact_codes = PrimitiveArray::new(compact.freeze(), validity).into_array(); + Ok(Some((compact_codes, value_selection))) +} + +fn 
use_dense_value_rank_map(codes_len: usize, valid_count: usize, values_len: usize) -> bool { + values_len <= DENSE_REMAP_MAX_VALUES + && values_len <= valid_count.saturating_mul(DENSE_REMAP_VALUES_PER_CODE) + && values_len <= codes_len.saturating_mul(DENSE_REMAP_VALUES_PER_CODE) +} + +fn compact_codes_and_value_selection_dense( + codes: &[Code], + validity: Validity, + valid: &Mask, + values_len: usize, +) -> VortexResult> +where + Code: NativePType + TryFrom, + usize: TryFrom, +{ + let mut rank_by_value = vec![UNREFERENCED_VALUE; values_len]; + mark_referenced_values(codes, valid, values_len, &mut rank_by_value)?; + + let mut referenced = Vec::with_capacity(valid.true_count().min(values_len)); + let mut rank = 0; + for (value_idx, value_rank) in rank_by_value.iter_mut().enumerate() { + if *value_rank != UNREFERENCED_VALUE { + *value_rank = rank; + referenced.push(value_idx); + rank += 1; + } + } + + if referenced.is_empty() || referenced.len() == values_len { + return Ok(None); + } + + let compact = remap_codes_dense(codes, valid, values_len, &rank_by_value)?; + let value_selection = Mask::from_indices(values_len, referenced); + let compact_codes = PrimitiveArray::new(compact.freeze(), validity).into_array(); + Ok(Some((compact_codes, value_selection))) +} + +fn mark_referenced_values( + codes: &[Code], + valid: &Mask, + values_len: usize, + rank_by_value: &mut [usize], +) -> VortexResult<()> +where + Code: Copy + fmt::Display, + usize: TryFrom, +{ + match valid.bit_buffer() { + AllOr::All => { + for &code in codes { + let idx = checked_code_index(code, values_len)?; + rank_by_value[idx] = 0; + } + } + AllOr::None => {} + AllOr::Some(mask) => { + for idx in mask.set_indices() { + let value_idx = checked_code_index(codes[idx], values_len)?; + rank_by_value[value_idx] = 0; + } + } + } + Ok(()) +} + +fn referenced_values( + codes: &[Code], + valid: &Mask, + values_len: usize, +) -> VortexResult> +where + Code: Copy + fmt::Display, + usize: TryFrom, +{ + let mut 
referenced = Vec::with_capacity(valid.true_count().min(values_len)); + match valid.bit_buffer() { + AllOr::All => { + for &code in codes { + referenced.push(checked_code_index(code, values_len)?); + } + } + AllOr::None => {} + AllOr::Some(mask) => { + for idx in mask.set_indices() { + referenced.push(checked_code_index(codes[idx], values_len)?); + } + } + } + referenced.sort_unstable(); + referenced.dedup(); + Ok(referenced) +} + +fn remap_codes( + codes: &[Code], + valid: &Mask, + values_len: usize, + referenced: &[usize], +) -> VortexResult> +where + Code: Copy + Default + fmt::Display + TryFrom, + usize: TryFrom, +{ + let mut compact = BufferMut::::with_capacity(codes.len()); + match valid.bit_buffer() { + AllOr::All => { + for &code in codes { + compact.push(compact_code(code, values_len, referenced)?); + } + } + AllOr::None => compact.extend(std::iter::repeat_n(Code::default(), codes.len())), + AllOr::Some(mask) => { + let mut valid_indices = mask.set_indices(); + let mut next_valid = valid_indices.next(); + for (idx, &code) in codes.iter().enumerate() { + if next_valid == Some(idx) { + compact.push(compact_code(code, values_len, referenced)?); + next_valid = valid_indices.next(); + } else { + compact.push(Code::default()); + } + } + } + } + Ok(compact) +} + +fn remap_codes_dense( + codes: &[Code], + valid: &Mask, + values_len: usize, + rank_by_value: &[usize], +) -> VortexResult> +where + Code: Copy + Default + fmt::Display + TryFrom, + usize: TryFrom, +{ + let mut compact = BufferMut::::with_capacity(codes.len()); + match valid.bit_buffer() { + AllOr::All => { + for &code in codes { + compact.push(compact_code_dense(code, values_len, rank_by_value)?); + } + } + AllOr::None => compact.extend(std::iter::repeat_n(Code::default(), codes.len())), + AllOr::Some(mask) => { + let mut valid_indices = mask.set_indices(); + let mut next_valid = valid_indices.next(); + for (idx, &code) in codes.iter().enumerate() { + if next_valid == Some(idx) { + 
compact.push(compact_code_dense(code, values_len, rank_by_value)?); + next_valid = valid_indices.next(); + } else { + compact.push(Code::default()); + } + } + } + } + Ok(compact) +} + +fn checked_code_index(code: Code, values_len: usize) -> VortexResult +where + Code: Copy + fmt::Display, + usize: TryFrom, +{ + let idx = usize::try_from(code) + .map_err(|_| vortex_err!("invalid negative dictionary code {code}"))?; + if idx >= values_len { + vortex_bail!( + "dictionary code {idx} out of bounds for values length {}", + values_len + ); + } + Ok(idx) +} + +fn compact_code_dense( + code: Code, + values_len: usize, + rank_by_value: &[usize], +) -> VortexResult +where + Code: Copy + fmt::Display + TryFrom, + usize: TryFrom, +{ + let idx = checked_code_index(code, values_len)?; + let rank = rank_by_value[idx]; + if rank == UNREFERENCED_VALUE { + vortex_bail!("dictionary code {idx} missing from sparse referenced value map"); + } + Code::try_from(rank).map_err(|_| { + vortex_err!( + "sparse dictionary code rank {rank} cannot be represented by original code type" + ) + }) +} + +fn compact_code(code: Code, values_len: usize, referenced: &[usize]) -> VortexResult +where + Code: Copy + fmt::Display + TryFrom, + usize: TryFrom, +{ + let idx = checked_code_index(code, values_len)?; + let rank = referenced.binary_search(&idx).map_err(|_| { + vortex_err!("dictionary code {idx} missing from sparse referenced value set") + })?; + Code::try_from(rank).map_err(|_| { + vortex_err!( + "sparse dictionary code rank {rank} cannot be represented by original code type" + ) + }) +} + +enum DictExprReadState { + Start, + Values { + codes: ArrayRef, + values: Option>, + mode: DictExprValueMode, + }, +} + +enum DictExprValueMode { + Full { try_value_expr: bool }, + Sparse { compact_codes: ArrayRef }, +} + +struct DictExprReadTask { + read: Arc, + codes: Box, + phase: ScanIoPhase, + state: DictExprReadState, +} + +impl ReadTask for DictExprReadTask { + fn into_step(self: Box) -> VortexResult { + 
let task = *self; + match task.state { + DictExprReadState::Start => { + let DictExprReadTask { + read, + codes, + phase, + state: _, + } = task; + let codes_step = codes.into_step()?; + let values_prefetch_step = + DictExprReadTask::create_full_values_task_for(&read, phase)?.into_step()?; + let mut prefetch_reads = codes_step.prefetch_reads; + prefetch_reads.extend(values_prefetch_step.required_reads); + prefetch_reads.extend(values_prefetch_step.prefetch_reads); + Ok(ReadStep::new( + codes_step.required_reads, + prefetch_reads, + move |io, local, results| match codes_step + .continuation + .run(io, local, results)? + { + ReadTaskOutput::Continue(codes) => { + Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { + read, + codes, + phase, + state: DictExprReadState::Start, + }))) + } + ReadTaskOutput::Ready(codes) => { + let mut task = DictExprReadTask { + read, + codes: Box::new(DeferredReadTask), + phase, + state: DictExprReadState::Start, + }; + let selection = Mask::new_true(codes.len()); + let rows = RowScope::selected(&selection); + let sparse_candidate = sparse_value_expr_candidate( + &task.read.node.expr, + task.read.node.dict.values_len, + rows, + ); + let value_candidate = value_expr_candidate( + &task.read.node.expr, + task.read.node.dict.values_len, + rows, + ); + let all_valid = !codes.dtype().is_nullable() + || codes + .validity() + .and_then(|validity| validity.execute_mask(codes.len(), local))? 
+ .all_true(); + let mut try_value_expr = value_candidate && all_valid; + if try_value_expr { + let cached = task + .read + .state + .shared + .value_exprs + .lock() + .get(&task.read.node.expr) + .cloned(); + match cached { + Some(Some(value_expr)) => { + return Ok(ReadTaskOutput::Ready( + task.read.node.dict.build_dict(codes, value_expr)?, + )); + } + Some(None) => try_value_expr = false, + None => {} + } + } + if try_value_expr { + let values = task.create_full_values_task()?; + task.state = DictExprReadState::Values { + codes, + values: Some(values), + mode: DictExprValueMode::Full { + try_value_expr: true, + }, + }; + return Ok(ReadTaskOutput::Continue(Box::new(task))); + } + if sparse_candidate { + let values_len = usize::try_from(task.read.node.dict.values_len) + .map_err(|_| { + vortex_err!("dictionary values length exceeds usize") + })?; + if let Some((compact_codes, value_selection)) = + compact_codes_and_value_selection( + codes.clone(), + values_len, + local, + )? + { + let values = task + .create_values_task(RowScope::selected(&value_selection))?; + task.state = DictExprReadState::Values { + codes, + values: Some(values), + mode: DictExprValueMode::Sparse { compact_codes }, + }; + return Ok(ReadTaskOutput::Continue(Box::new(task))); + } + } + let values = task.create_full_values_task()?; + task.state = DictExprReadState::Values { + codes, + values: Some(values), + mode: DictExprValueMode::Full { + try_value_expr: false, + }, + }; + Ok(ReadTaskOutput::Continue(Box::new(task))) + } + }, + )) + } + DictExprReadState::Values { + codes, + mut values, + mode, + } => { + let values_task = values.take().ok_or_else(|| { + vortex_err!("dictionary expression values task was not initialized") + })?; + let values_step = values_task.into_step()?; + let read = task.read; + let phase = task.phase; + Ok(ReadStep::new( + values_step.required_reads, + values_step.prefetch_reads, + move |io, local, results| match values_step + .continuation + .run(io, local, results)? 
+ { + ReadTaskOutput::Continue(values) => { + Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { + read, + codes: Box::new(DeferredReadTask), + phase, + state: DictExprReadState::Values { + codes, + values: Some(values), + mode, + }, + }))) + } + ReadTaskOutput::Ready(values_array) => { + finish_dict_expr_values(read, phase, codes, mode, values_array, local) + } + }, + )) + } + } + } +} + +fn finish_dict_expr_values( + read: Arc, + phase: ScanIoPhase, + codes: ArrayRef, + mode: DictExprValueMode, + values_array: ArrayRef, + local: &mut ExecutionCtx, +) -> VortexResult { + match mode { + DictExprValueMode::Full { try_value_expr } => { + if try_value_expr { + let value_expr = { + let mut value_exprs = read.state.shared.value_exprs.lock(); + if let Some(cached) = value_exprs.get(&read.node.expr).cloned() { + cached + } else { + let computed = values_array.clone().apply(&read.node.expr).and_then( + |array| match array.clone().execute::(local) { + Ok(mask) => { + let DType::Bool(nullability) = array.dtype() else { + return array.execute::(local); + }; + Ok(BoolArray::new( + mask.to_bit_buffer(), + Validity::from(nullability), + ) + .into_array()) + } + Err(_) => array.execute::(local), + }, + ); + let value_expr = match computed { + Ok(array) => Some(array), + Err(error) => { + tracing::debug!( + predicate = %read.node.expr, + %error, + "dict value-domain expression read unavailable" + ); + None + } + }; + value_exprs.insert(read.node.expr.clone(), value_expr.clone()); + value_expr + } + }; + if let Some(value_expr) = value_expr { + return Ok(ReadTaskOutput::Ready( + read.node.dict.build_dict(codes, value_expr)?, + )); + } + } + let input = read.node.dict.build_dict(codes, values_array)?.optimize()?; + Ok(ReadTaskOutput::Ready( + input.apply(&read.node.expr)?.execute::(local)?, + )) + } + DictExprValueMode::Sparse { compact_codes } => { + let input = read + .node + .dict + .build_dict(compact_codes, values_array)? 
+ .optimize()?; + let computed = input + .apply(&read.node.expr) + .and_then(|array| array.execute::(local)); + match computed { + Ok(array) => Ok(ReadTaskOutput::Ready(array)), + Err(error) => { + tracing::debug!( + predicate = %read.node.expr, + %error, + "sparse dict expression read unavailable" + ); + let full_values = DictExprReadTask::create_full_values_task_for(&read, phase)?; + Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { + read, + codes: Box::new(DeferredReadTask), + phase, + state: DictExprReadState::Values { + codes, + values: Some(full_values), + mode: DictExprValueMode::Full { + try_value_expr: false, + }, + }, + }))) + } + } + } + } +} + +impl DictExprReadTask { + fn create_values_task(&mut self, rows: RowScope<'_>) -> VortexResult> { + Self::create_values_task_for(&self.read, self.phase, rows) + } + + fn create_values_task_for( + read: &Arc, + phase: ScanIoPhase, + rows: RowScope<'_>, + ) -> VortexResult> { + let range = 0..read.node.dict.values_len; + let owned_rows = OwnedRowScope::try_new(rows.selection.clone(), rows.demand.clone())?; + Arc::clone(&read.values_read).create_task(range, owned_rows, phase) + } + + fn create_full_values_task(&mut self) -> VortexResult> { + Self::create_full_values_task_for(&self.read, self.phase) + } + + fn create_full_values_task_for( + read: &Arc, + phase: ScanIoPhase, + ) -> VortexResult> { + let values_selection = Mask::new_true( + usize::try_from(read.node.dict.values_len) + .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, + ); + Self::create_values_task_for(read, phase, RowScope::selected(&values_selection)) + } +} + +impl PreparedRead for DictExprPreparedRead { + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + phase: ScanIoPhase, + ) -> VortexResult> { + Ok(Box::new(DictExprReadTask { + codes: Arc::clone(&self.codes_read).create_task(range, rows, phase)?, + read: self, + phase, + state: DictExprReadState::Start, + })) + } + + fn release(&self, frontier: u64) 
-> VortexResult<()> { + self.codes_read.release(frontier) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +#[cfg(test)] +mod tests { + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + use vortex_error::VortexResult; + use vortex_mask::Mask; + + use super::compact_codes_and_value_selection_typed; + + #[test] + fn dense_compaction_preserves_sparse_value_order_and_validity() -> VortexResult<()> { + let validity = Validity::from_iter([true, false, true, true, true, true]); + let valid = validity.execute_mask(6, &mut LEGACY_SESSION.create_execution_ctx())?; + let (compact_codes, value_selection) = compact_codes_and_value_selection_typed::( + &[7, 9, 3, 7, 1, 3], + validity, + &valid, + 8, + )? + .expect("sparse dict compaction should be available"); + + assert_eq!(value_selection, Mask::from_indices(8, [1, 3, 7])); + let compact_codes = + compact_codes.execute::(&mut LEGACY_SESSION.create_execution_ctx())?; + assert_eq!(compact_codes.as_slice::(), &[2, 0, 1, 2, 0, 1]); + assert_eq!( + compact_codes + .validity()? + .execute_mask(6, &mut LEGACY_SESSION.create_execution_ctx())?, + Mask::from_indices(6, [0, 2, 3, 4, 5]) + ); + + Ok(()) + } + + #[test] + fn dense_compaction_returns_none_when_all_values_referenced() -> VortexResult<()> { + let validity = Validity::NonNullable; + let valid = validity.execute_mask(4, &mut LEGACY_SESSION.create_execution_ctx())?; + assert!( + compact_codes_and_value_selection_typed::( + buffer![2u8, 0, 1, 3].as_slice(), + validity, + &valid, + 4, + )? 
+ .is_none() + ); + + Ok(()) + } +} diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs new file mode 100644 index 00000000000..c888f7f02dc --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scan2 vtable support for flat layouts: one segment, parsed lazily, decoded on +//! demand. +//! +//! A flat leaf exposes no evidence producers — it has no statistics or +//! index — and keeps the default selection path: its segment decodes whole, so a +//! selected read is the dense parse followed by a lazy filter, which +//! vortex pushes through the encodings. + +use std::fmt; +use std::ops::Range; +use std::sync::Arc; + +use parking_lot::Mutex; +use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::arrays::SliceArray; +use vortex_array::dtype::DType; +use vortex_array::expr::Expression; +use vortex_array::optimizer::ArrayOptimizer; +use vortex_array::serde::SerializedArray; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadStep; +use vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::default_try_push_expr; +use vortex_scan::plan::downcast_state; +use vortex_scan::plan::request::ScanRequest; +use vortex_scan::read::ReadRequestKey; +use vortex_scan::read::ReadResults; +use vortex_scan::read::ScanIoPhase; +use vortex_scan::read::ScanRead; +use 
vortex_session::VortexSession; + +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutRef; +use crate::layout_v2::LayoutScanPlanCtx; +use crate::layouts_v2::flat::Flat; +use crate::segments::SegmentFutureCache; +use crate::segments::SegmentRequest; +use crate::segments::SegmentRequestKey; +use crate::segments::SegmentSource; + +pub(crate) fn new_scan_plan( + layout: Layout, + _req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, +) -> VortexResult { + Ok(Arc::new(FlatScanPlan { + layout: layout.to_layout(), + session: ctx.session().clone(), + segment_source: Arc::clone(ctx.segment_source()), + segment_future_cache: Arc::clone(ctx.segment_future_cache()), + })) +} + +/// Reads a flat layout: fetches its segment once per query, parses it +/// into a (lazy) array, and slices per request. +pub struct FlatScanPlan { + layout: LayoutRef, + session: VortexSession, + segment_source: Arc, + segment_future_cache: Arc, +} + +/// Per-query cache of the parsed (still lazy) array. +#[derive(Default)] +pub struct FlatScanState { + array: Mutex>, +} + +struct FlatPreparedRead { + node: Arc, + state: Arc, +} + +struct FlatReadTask { + read: Arc, + range: Range, + rows: OwnedRowScope, + phase: ScanIoPhase, +} + +impl FlatScanPlan { + fn array(&self, results: &ReadResults, state: &FlatScanState) -> VortexResult { + if let Some(hit) = state.array.lock().clone() { + return Ok(hit); + } + + let mut guard = state.array.lock(); + if let Some(hit) = guard.clone() { + return Ok(hit); + } + + let array = decode_flat(&self.layout, results, &self.session)?; + *guard = Some(array.clone()); + Ok(array) + } +} + +impl ScanPlan for FlatScanPlan { + fn dtype(&self) -> &DType { + self.layout.dtype() + } + + fn row_count(&self) -> u64 { + self.layout.row_count() + } + + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(Arc::new(FlatScanState::default())) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + _cx: &mut PushCtx, + ) -> VortexResult> { + 
default_try_push_expr(self, expr) + } + + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let flat = self.layout.as_opt::().ok_or_else(|| { + vortex_err!("expected flat layout, got {}", self.layout.encoding_id()) + })?; + let key = PreparedStateKey::new::(*flat.data().segment_id() as usize); + let state = cx.shared_state(key, || Ok(FlatScanState::default()))?; + Ok(Some(Arc::new(FlatPreparedRead { node: self, state }))) + } + + /// A flat leaf releases only once *wholly* behind the frontier: a + /// partially-covered flat is the working set, and dropping it would + /// thrash the segment fetch. + fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { + let state = downcast_state::(state)?; + if frontier >= self.layout.row_count() { + *state.array.lock() = None; + } + Ok(()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "flat") + } +} + +impl PreparedRead for FlatPreparedRead { + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + phase: ScanIoPhase, + ) -> VortexResult> { + Ok(Box::new(FlatReadTask { + read: self, + range, + rows, + phase, + })) + } + + fn release(&self, frontier: u64) -> VortexResult<()> { + self.node.release(frontier, &self.state) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +impl FlatPreparedRead { + fn segment_read(&self, phase: ScanIoPhase) -> VortexResult { + let Some(flat) = self.node.layout.as_opt::() else { + vortex_bail!( + "expected flat layout, got {}", + self.node.layout.encoding_id() + ); + }; + self.node + .segment_future_cache + .register( + self.node.segment_source.as_ref(), + vec![SegmentRequest::new( + flat.data().segment_id(), + self.node + .segment_source + .segment_info(flat.data().segment_id())?, + phase, + )], + ) + .into_iter() + .next() + .ok_or_else(|| vortex_err!("flat segment read registration returned no reads")) + } +} + +impl ReadTask for FlatReadTask { + fn 
into_step(self: Box) -> VortexResult { + let Self { + read, + range, + rows, + phase, + } = *self; + let segment_read = read.segment_read(phase)?; + Ok(ReadStep::new( + vec![segment_read], + Vec::new(), + move |_, _, results| { + let array = read.node.array(&results, &read.state)?; + let rows = rows.as_scope(); + let dense = slice_to_range(array, &range)?; + if rows.selection.len() != dense.len() { + vortex_bail!( + "selection length {} does not match read range length {}", + rows.selection.len(), + dense.len() + ); + } + if rows.demand.len() != dense.len() { + vortex_bail!( + "demand length {} does not match read range length {}", + rows.demand.len(), + dense.len() + ); + } + if rows.selection.all_true() { + return Ok(ReadTaskOutput::Ready(dense)); + } + Ok(ReadTaskOutput::Ready(dense.filter(rows.selection.clone())?)) + }, + )) + } +} + +pub(crate) fn decode_flat( + layout: &LayoutRef, + results: &ReadResults, + session: &VortexSession, +) -> VortexResult { + let Some(flat) = layout.as_opt::() else { + vortex_bail!("expected flat layout, got {}", layout.encoding_id()); + }; + let row_count = usize::try_from(layout.row_count()) + .map_err(|_| vortex_err!("layout row count exceeds usize"))?; + let key = ReadRequestKey::from(SegmentRequestKey::new(flat.data().segment_id())); + let segment = results.get(key)?; + let parts = if let Some(tree) = flat.data().array_tree() { + SerializedArray::from_flatbuffer_and_segment(tree.clone(), segment)? + } else { + SerializedArray::try_from(segment)? + }; + parts.decode(layout.dtype(), row_count, flat.data().array_ctx(), session) +} + +pub(crate) fn slice_to_range(array: ArrayRef, range: &Range) -> VortexResult { + let start = usize::try_from(range.start).map_err(|_| vortex_err!("row range exceeds usize"))?; + let end = usize::try_from(range.end).map_err(|_| vortex_err!("row range exceeds usize"))?; + if start == 0 && end == array.len() { + return Ok(array); + } + SliceArray::try_new(array, start..end)? 
+ .into_array() + .optimize() +} diff --git a/vortex-layout/src/scan/v2/layouts/mod.rs b/vortex-layout/src/scan/v2/layouts/mod.rs new file mode 100644 index 00000000000..83023252437 --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/mod.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Built-in scan2 layout-vtable implementations. + +pub mod chunked; +pub mod dict; +pub mod flat; +pub mod struct_; +pub mod zoned; diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs new file mode 100644 index 00000000000..01516ecfa23 --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scan2 vtable support for struct layouts: plans field access expressions. +//! +//! A struct node treats field access as scalar expression pushdown: +//! `get_item(field, root())` pushes to the field child, and `select(...)` +//! becomes a virtual struct node assembled from pushed child nodes. 
+ +use std::fmt; +use std::sync::Arc; + +use parking_lot::Mutex; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldName; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::StructFields; +use vortex_array::expr::Expression; +use vortex_array::expr::get_item; +use vortex_array::expr::is_root; +use vortex_array::expr::root; +use vortex_array::expr::transform::replace; +use vortex_array::scalar_fn::fns::get_item::GetItem; +use vortex_array::scalar_fn::fns::pack::Pack; +use vortex_array::scalar_fn::fns::root::Root; +use vortex_array::scalar_fn::fns::select::Select; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_scan::plan::ApplyScanPlan; +use vortex_scan::plan::MaskScanPlan; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::StructValueScanPlan; +use vortex_scan::plan::literal_scan_plan; +use vortex_scan::plan::request::ScanRequest; + +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutRef; +use crate::layout_v2::LayoutScanPlanCtx; +use crate::layouts_v2::struct_::Struct; +use crate::scan::v2::referenced_fields; +use crate::scan::v2::struct_fields; + +pub(crate) fn new_scan_plan( + layout: Layout, + _req: &mut ScanRequest, + ctx: &LayoutScanPlanCtx, +) -> VortexResult { + let validity = layout + .dtype() + .is_nullable() + .then(|| { + layout + .child(0)? 
+ .new_scan_plan(&mut ScanRequest::empty(), ctx) + }) + .transpose()?; + let fields = struct_fields(layout.dtype())?; + let children = Mutex::new(vec![None; fields.nfields()]); + let field_child_offset = usize::from(layout.dtype().is_nullable()); + Ok(Arc::new(StructScanPlan { + layout: layout.to_layout(), + ctx: ctx.clone(), + fields, + children, + field_child_offset, + validity, + })) +} + +/// Plans struct field expressions through child scan plans. +pub struct StructScanPlan { + layout: LayoutRef, + ctx: LayoutScanPlanCtx, + fields: StructFields, + children: Mutex>>, + field_child_offset: usize, + validity: Option, +} + +impl ScanPlan for StructScanPlan { + fn dtype(&self) -> &DType { + self.layout.dtype() + } + + fn row_count(&self) -> u64 { + self.layout.row_count() + } + + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(Arc::new(())) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + cx: &mut PushCtx, + ) -> VortexResult> { + let scope = &self.fields; + if let Some(literal) = literal_scan_plan(expr, self.layout.row_count()) { + return Ok(Some(literal)); + } + if is_root(expr) { + return self.push_struct(scope.names().clone(), cx).map(Some); + } + if let Some(name) = root_field(expr) { + let child = self.child_field(name)?; + return Ok(self.apply_validity(child.try_push_expr(&root(), cx)?)); + } + if let Some(selection) = expr.as_opt::