From 532b1126633d55a0e38726be6bd430c8cb8951ac Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 17 Jun 2026 14:38:31 -0400 Subject: [PATCH 01/48] Integrate ScanNode V2 scan path Signed-off-by: Nicholas Gates --- Cargo.lock | 3 + benchmarks/datafusion-bench/src/lib.rs | 56 +- benchmarks/datafusion-bench/src/main.rs | 106 +- docs/developer-guide/index.md | 3 +- .../internals/scan-scheduler.md | 467 ++++++ vortex-datafusion/src/convert/stats.rs | 71 + vortex-datafusion/src/persistent/format.rs | 90 + vortex-datafusion/src/persistent/metrics.rs | 13 + vortex-datafusion/src/persistent/opener.rs | 228 ++- vortex-datafusion/src/persistent/source.rs | 5 + vortex-datafusion/src/v2/source.rs | 235 ++- vortex-datafusion/src/v2/table.rs | 76 +- vortex-duckdb/src/column_statistics.rs | 57 + vortex-duckdb/src/lib.rs | 10 +- vortex-duckdb/src/multi_file.rs | 24 +- vortex-duckdb/src/table_function.rs | 60 +- vortex-ffi/src/scan.rs | 1 + vortex-file/Cargo.toml | 1 + vortex-file/src/file.rs | 34 + vortex-file/src/multi/mod.rs | 61 +- vortex-file/src/multi/scan_v2.rs | 1458 +++++++++++++++++ vortex-file/src/tests.rs | 215 ++- vortex-jni/src/scan.rs | 1 + vortex-layout/src/scan/mod.rs | 1 + vortex-layout/src/scan/v2/evidence.rs | 523 ++++++ vortex-layout/src/scan/v2/layouts/chunked.rs | 988 +++++++++++ vortex-layout/src/scan/v2/layouts/dict.rs | 498 ++++++ vortex-layout/src/scan/v2/layouts/flat.rs | 195 +++ vortex-layout/src/scan/v2/layouts/mod.rs | 10 + vortex-layout/src/scan/v2/layouts/struct_.rs | 169 ++ vortex-layout/src/scan/v2/layouts/zoned.rs | 987 +++++++++++ vortex-layout/src/scan/v2/mod.rs | 98 ++ vortex-layout/src/scan/v2/node.rs | 1202 ++++++++++++++ vortex-layout/src/scan/v2/request.rs | 61 + vortex-layout/src/scan/v2/session.rs | 104 ++ vortex-scan/Cargo.toml | 2 + vortex-scan/src/lib.rs | 105 ++ vortex-scan/src/scheduler.rs | 375 +++++ 38 files changed, 8381 insertions(+), 212 deletions(-) create mode 100644 docs/developer-guide/internals/scan-scheduler.md create 
mode 100644 vortex-file/src/multi/scan_v2.rs create mode 100644 vortex-layout/src/scan/v2/evidence.rs create mode 100644 vortex-layout/src/scan/v2/layouts/chunked.rs create mode 100644 vortex-layout/src/scan/v2/layouts/dict.rs create mode 100644 vortex-layout/src/scan/v2/layouts/flat.rs create mode 100644 vortex-layout/src/scan/v2/layouts/mod.rs create mode 100644 vortex-layout/src/scan/v2/layouts/struct_.rs create mode 100644 vortex-layout/src/scan/v2/layouts/zoned.rs create mode 100644 vortex-layout/src/scan/v2/mod.rs create mode 100644 vortex-layout/src/scan/v2/node.rs create mode 100644 vortex-layout/src/scan/v2/request.rs create mode 100644 vortex-layout/src/scan/v2/session.rs create mode 100644 vortex-scan/src/scheduler.rs diff --git a/Cargo.lock b/Cargo.lock index 00f5345aa76..2c075a40a85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9775,6 +9775,7 @@ dependencies = [ "oneshot", "parking_lot", "pin-project-lite", + "temp-env", "tokio", "tracing", "vortex-alp", @@ -10160,6 +10161,7 @@ dependencies = [ name = "vortex-scan" version = "0.1.0" dependencies = [ + "async-lock", "async-trait", "futures", "roaring", @@ -10169,6 +10171,7 @@ dependencies = [ "vortex-error", "vortex-mask", "vortex-session", + "vortex-utils", ] [[package]] diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index c8353eb1f85..fb1a69dc44c 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -24,6 +24,10 @@ use object_store::aws::AmazonS3Builder; use object_store::gcp::GoogleCloudStorageBuilder; use object_store::local::LocalFileSystem; use url::Url; +use vortex::scan::ScanScheduler; +use vortex::scan::ScanSchedulerConfig; +use vortex::scan::ScanSchedulerSessionExt; +use vortex::session::VortexSession; use vortex_bench::Format; use vortex_bench::SESSION; use vortex_datafusion::VortexFormat; @@ -45,10 +49,13 @@ pub fn get_session_context() -> SessionContext { .build_arc() .expect("could not build runtime 
environment"); - let factory = VortexFormatFactory::new().with_options(VortexTableOptions { - projection_pushdown: true, - ..Default::default() - }); + let factory = VortexFormatFactory::new_with_options( + vortex_session_from_env().expect("invalid VORTEX_SCAN_SCHEDULER"), + VortexTableOptions { + projection_pushdown: true, + ..Default::default() + }, + ); let mut session_state_builder = SessionStateBuilder::new() .with_config(SessionConfig::from_env().expect("shouldn't fail")) @@ -109,16 +116,45 @@ pub fn make_object_store( } } -pub fn format_to_df_format(format: Format) -> Arc { +pub fn format_to_df_format(format: Format) -> anyhow::Result> { match format { - Format::Csv => Arc::new(CsvFormat::default()) as _, - Format::Arrow => Arc::new(ArrowFormat), - Format::Parquet => Arc::new(ParquetFormat::new()), + Format::Csv => Ok(Arc::new(CsvFormat::default()) as _), + Format::Arrow => Ok(Arc::new(ArrowFormat)), + Format::Parquet => Ok(Arc::new(ParquetFormat::new())), Format::OnDiskVortex | Format::VortexCompact => { - Arc::new(VortexFormat::new(SESSION.clone())) + Ok(Arc::new(VortexFormat::new(vortex_session_from_env()?))) } Format::OnDiskDuckDB | Format::Lance => { - unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`") + anyhow::bail!("Format {format} cannot be turned into a DataFusion `FileFormat`") } } } + +fn vortex_session_from_env() -> anyhow::Result { + let session = SESSION.clone(); + let Ok(mode) = std::env::var("VORTEX_SCAN_SCHEDULER") else { + return Ok(session); + }; + let config = scan_scheduler_config_from_env()?; + Ok(match mode.as_str() { + "unbounded" => session.with_unbounded_scan_scheduler(), + "shared" | "global" => session.with_scan_scheduler(Arc::new(ScanScheduler::new(config))), + "per-query" | "per-scan" => session.with_new_scan_scheduler_per_scan(config), + other => anyhow::bail!( + "Invalid VORTEX_SCAN_SCHEDULER={other}; expected unbounded, shared, or per-query" + ), + }) +} + +fn scan_scheduler_config_from_env() 
-> anyhow::Result { + Ok(std::env::var("VORTEX_SCAN_MAX_MORSEL_SLOTS") + .ok() + .map(|value| { + value + .parse::() + .map(ScanSchedulerConfig::morsel_slots) + .map_err(|e| anyhow::anyhow!("invalid scan scheduler slot count {value}: {e}")) + }) + .transpose()? + .unwrap_or_else(ScanSchedulerConfig::default_morsel_slots)) +} diff --git a/benchmarks/datafusion-bench/src/main.rs b/benchmarks/datafusion-bench/src/main.rs index b8f9ac42df6..4e8e98ef3a7 100644 --- a/benchmarks/datafusion-bench/src/main.rs +++ b/benchmarks/datafusion-bench/src/main.rs @@ -27,8 +27,6 @@ use datafusion_physical_plan::collect; use futures::StreamExt; use parking_lot::Mutex; use tokio::fs::File; -use vortex::io::filesystem::FileSystemRef; -use vortex::scan::DataSourceRef; use vortex_bench::Benchmark; use vortex_bench::BenchmarkArg; use vortex_bench::CompactionStrategy; @@ -36,7 +34,6 @@ use vortex_bench::Engine; use vortex_bench::Format; use vortex_bench::Opt; use vortex_bench::Opts; -use vortex_bench::SESSION; use vortex_bench::conversions::convert_parquet_directory_to_vortex; use vortex_bench::create_benchmark; use vortex_bench::create_output_writer; @@ -190,7 +187,7 @@ async fn main() -> anyhow::Result<()> { async move { let session = datafusion_bench::get_session_context(); datafusion_bench::make_object_store(&session, benchmark.data_url())?; - register_benchmark_tables(&session, benchmark, format).await?; + register_benchmark_tables(&session, benchmark, format, show_metrics).await?; Ok((session, format)) } }, @@ -246,99 +243,42 @@ async fn main() -> anyhow::Result<()> { Ok(()) } -fn use_scan_api() -> bool { - std::env::var("VORTEX_USE_SCAN_API").is_ok_and(|v| v == "1") -} - async fn register_benchmark_tables( session: &SessionContext, benchmark: &B, format: Format, + _show_metrics: bool, ) -> anyhow::Result<()> { - match format { - Format::Arrow => register_arrow_tables(session, benchmark).await, - _ if use_scan_api() && matches!(format, Format::OnDiskVortex | Format::VortexCompact) => 
{ - register_v2_tables(session, benchmark, format).await - } - _ => { - let benchmark_base = benchmark.data_url().join(&format!("{}/", format.name()))?; - let file_format = format_to_df_format(format); - - for table in benchmark.table_specs().iter() { - let pattern = benchmark.pattern(table.name, format); - let table_url = ListingTableUrl::try_new(benchmark_base.clone(), pattern)?; - - let listing_options = ListingOptions::new(Arc::clone(&file_format)) - .with_session_config_options(session.state().config()); - let mut config = - ListingTableConfig::new(table_url).with_listing_options(listing_options); - - config = match table.schema.as_ref() { - Some(schema) => config.with_schema(Arc::new(schema.clone())), - None => config.infer_schema(&session.state()).await?, - }; - - let listing_table = Arc::new( - ListingTable::try_new(config)?.with_cache( - session - .runtime_env() - .cache_manager - .get_file_statistic_cache(), - ), - ); - - session.register_table(table.name, listing_table)?; - } - - Ok(()) - } + if matches!(format, Format::Arrow) { + return register_arrow_tables(session, benchmark).await; } -} - -/// Register tables using the V2 `VortexTable` + `MultiFileDataSource` path. 
-async fn register_v2_tables( - session: &SessionContext, - benchmark: &B, - format: Format, -) -> anyhow::Result<()> { - use vortex::file::multi::MultiFileDataSource; - use vortex::io::object_store::ObjectStoreFileSystem; - use vortex::io::session::RuntimeSessionExt; - use vortex::scan::DataSource as _; - use vortex_datafusion::v2::VortexTable; let benchmark_base = benchmark.data_url().join(&format!("{}/", format.name()))?; + let file_format = format_to_df_format(format)?; for table in benchmark.table_specs().iter() { let pattern = benchmark.pattern(table.name, format); - let table_url = ListingTableUrl::try_new(benchmark_base.clone(), pattern.clone())?; - let store = session - .state() - .runtime_env() - .object_store(table_url.object_store())?; - - let fs: FileSystemRef = Arc::new(ObjectStoreFileSystem::new( - Arc::clone(&store), - SESSION.handle(), - )); - let base_prefix = benchmark_base.path().trim_start_matches('/').to_string(); - let fs = fs.with_prefix(base_prefix); - - let glob_pattern = match &pattern { - Some(p) => p.as_str().to_string(), - None => format!("*.{}", format.ext()), - }; + let table_url = ListingTableUrl::try_new(benchmark_base.clone(), pattern)?; + + let listing_options = ListingOptions::new(Arc::clone(&file_format)) + .with_session_config_options(session.state().config()); + let mut config = ListingTableConfig::new(table_url).with_listing_options(listing_options); - let multi_ds = MultiFileDataSource::new(SESSION.clone()) - .with_glob(glob_pattern, Some(fs)) - .build() - .await?; + config = match table.schema.as_ref() { + Some(schema) => config.with_schema(Arc::new(schema.clone())), + None => config.infer_schema(&session.state()).await?, + }; - let arrow_schema = Arc::new(multi_ds.dtype().to_arrow_schema()?); - let data_source: DataSourceRef = Arc::new(multi_ds); + let listing_table = Arc::new( + ListingTable::try_new(config)?.with_cache( + session + .runtime_env() + .cache_manager + .get_file_statistic_cache(), + ), + ); - let 
table_provider = Arc::new(VortexTable::new(data_source, SESSION.clone(), arrow_schema)); - session.register_table(table.name, table_provider)?; + session.register_table(table.name, listing_table)?; } Ok(()) diff --git a/docs/developer-guide/index.md b/docs/developer-guide/index.md index 37ad276044d..c1fedbad179 100644 --- a/docs/developer-guide/index.md +++ b/docs/developer-guide/index.md @@ -22,6 +22,7 @@ caption: Internals internals/architecture internals/session internals/async-runtime +internals/scan-scheduler internals/vtables internals/execution internals/io @@ -38,4 +39,4 @@ caption: Integrations integrations/datafusion integrations/duckdb integrations/spark -``` \ No newline at end of file +``` diff --git a/docs/developer-guide/internals/scan-scheduler.md b/docs/developer-guide/internals/scan-scheduler.md new file mode 100644 index 00000000000..d81aad83d1a --- /dev/null +++ b/docs/developer-guide/internals/scan-scheduler.md @@ -0,0 +1,467 @@ +# Scan Scheduler + +:::{note} +This is an implementation design for the ScanNode-backed scan path. It describes the scheduler +shape the V2 scan should grow into, not the current behavior of the released scan API. +::: + +The ScanNode scan path needs a resource scheduler that can coordinate work across files, partitions, +and concurrent scans. The scheduler should be explicit and embeddable: a host engine can share one +scheduler across many scans to enforce global limits, or create a fresh scheduler for each query to +isolate resource usage. + +The design uses one shared `ScanScheduler` object for resource arbitration and one per-scan runtime +for query semantics. + +The existing `DataSource` / `ScanRequest` / `DataSourceScan` API remains the public query-engine +boundary for this phase. The scheduler and morsel runtime sit behind that boundary, so the first +implementation can improve V2 execution without introducing a second scan API that mostly duplicates +the current one. 
+ +## Goals + +- Bound scan resource usage across concurrent scans. +- Allow DataFusion users to choose a shared scheduler, a new scheduler per query, or an unbounded + mode. +- Give DuckDB a simple global scheduler owned by the extension session. +- Keep ScanNode planning and morsel ordering local to each scan. +- Make cancellation and permit release reliable when a stream is dropped early. +- Keep scheduler APIs independent of layout internals so other `DataSource` implementations can use + the same resource controls. + +## Non-goals + +- Do not make a process-global singleton the only way to schedule scans. +- Do not put query semantics, filter ordering, evidence planning, or output ordering into the global + scheduler. +- Do not replace the `DataSource` scan API in the first scheduler implementation. If the public API + changes later, it should be because the V2 runtime needs capabilities that cannot be added + compatibly to `ScanRequest` or `DataSourceScan`. +- Do not require every scan integration to expose the same configuration surface immediately. +- Do not solve cluster-wide distributed admission control. The scheduler is process-local. + +## Core Model + +There are three layers: + +1. `ScanScheduler` + Arbitrates global resources such as I/O bytes, decoded bytes, request concurrency, decode task + concurrency, and per-scan fairness. + +2. `ScanTicket` + Represents one logical scan registered with a scheduler. It carries scan identity, cancellation, + priority, metrics, and per-scan limits. + +3. Per-scan `MorselScanRuntime` + Owns the ScanNode graph, evidence/read/aggregate plans, morsel queue, row ordering, limit + handling, dynamic filters, and the choice of which work is useful next. + +`DataSource::scan` constructs this per-scan runtime internally and returns the existing +`DataSourceScan` wrapper. Query engines should not need to know whether a data source is implemented +by the legacy `LayoutReader` path or the V2 ScanNode runtime. 
+ +The scheduler decides whether work may run. The per-scan runtime decides what work should run. + +```text +DataFusion / DuckDB + | + v +DataSource::scan(request) + | + v +resolve scheduler provider + | + v +ScanScheduler::register_scan(meta) -> ScanTicket + | + v +MorselScanRuntime + | + +-- plan next useful morsel + +-- acquire scheduler permits + +-- run evidence / read / decode / aggregate work + +-- release permits on completion or drop +``` + +## Scheduler Ownership + +`ScanScheduler` is an ordinary shared object: + +```rust +pub struct ScanScheduler { + config: ScanSchedulerConfig, + state: ScanSchedulerState, +} +``` + +The object is normally used behind `Arc`. + +```rust +let scheduler = Arc::new(ScanScheduler::new(config)); +``` + +Scheduler ownership is selected by a provider: + +```rust +pub enum ScanSchedulerProvider { + /// Use one scheduler for every scan that shares this provider. + Shared(Arc<ScanScheduler>), + + /// Construct a new scheduler whenever a logical scan starts. + PerScan(ScanSchedulerConfig), + + /// No resource limits. Useful as the compatibility default and for tests. + Unbounded, +} +``` + +The provider is resolved when a logical scan starts, not when a table or data source is registered. +This matters for DataFusion, where a table can be registered once and executed many times. + +```rust +impl ScanSchedulerProvider { + pub fn scheduler_for_scan(&self, meta: &ScanMeta) -> Arc<ScanScheduler>; +} +``` + +## Session Integration + +The scheduler provider should be stored on `VortexSession`, following the same pattern as +`RuntimeSession`. 
+ +```rust +pub struct ScanSchedulerSession { + provider: Arc<ScanSchedulerProvider>, +} + +pub trait ScanSchedulerSessionExt: SessionExt { + fn scan_scheduler_provider(&self) -> Arc<ScanSchedulerProvider>; + + fn with_scan_scheduler(self, scheduler: Arc<ScanScheduler>) -> Self; + + fn with_new_scan_scheduler_per_scan(self, config: ScanSchedulerConfig) -> Self; + + fn with_unbounded_scan_scheduler(self) -> Self; +} +``` + +The default should be `Unbounded` initially, so enabling the V2 scan does not silently introduce new +resource limits. Integrations can opt into bounded scheduling explicitly. + +The scheduler types should live in `vortex-scan`, not `vortex-layout`, because the resource policy +belongs to the scan API layer and should be reusable by non-layout sources. ScanNode-specific code in +`vortex-layout` can consume tickets and permits through the public scan scheduler API without making +the scheduler understand layout-specific node types. + +## DataFusion Integration + +DataFusion should expose scheduler control in the table/source builders. + +```rust +impl VortexDataSourceBuilder { + pub fn with_scan_scheduler(mut self, scheduler: Arc<ScanScheduler>) -> Self; + + pub fn with_scan_scheduler_provider( + mut self, + provider: Arc<ScanSchedulerProvider>, + ) -> Self; + + pub fn with_new_scan_scheduler_per_query( + mut self, + config: ScanSchedulerConfig, + ) -> Self; +} +``` + +The same options should be available on `VortexTable` and `VortexFormatFactory` so users who +register tables through DataFusion's listing format path can still control scheduling. + +For the current V2 DataFusion path, `DataSource::open` creates a single Vortex scan for partition +zero. A per-query scheduler can therefore be resolved immediately before calling +`DataSourceRef::scan`. If DataFusion later produces multiple Vortex scan nodes for one query and +those scans should share a per-query scheduler, the integration should propagate a scheduler through +DataFusion's `TaskContext` or another query-scoped extension and use that as the provider result. 
+ +Recommended DataFusion modes: + +```rust +// One scheduler across an application, tenant, or SessionContext. +let scheduler = Arc::new(ScanScheduler::new(config)); +let source = VortexDataSource::builder(data_source, session) + .with_scan_scheduler(scheduler) + .build() + .await?; + +// A fresh scheduler each time this table is scanned. +let source = VortexDataSource::builder(data_source, session) + .with_new_scan_scheduler_per_query(config) + .build() + .await?; +``` + +Benchmark environment variables can map onto these APIs, but they should not be the primary control +surface: + +```text +VORTEX_SCAN_IMPL=v2 +VORTEX_SCAN_SCHEDULER=unbounded|shared|per-query +VORTEX_SCAN_MAX_MORSEL_SLOTS=... +``` + +## DuckDB Integration + +DuckDB can use one scheduler in the extension's global session. + +```rust +static SCAN_SCHEDULER: LazyLock> = + LazyLock::new(|| Arc::new(ScanScheduler::new(ScanSchedulerConfig::duckdb_default()))); + +static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::default() + .with_handle(RUNTIME.handle()) + .with_scan_scheduler(Arc::clone(&SCAN_SCHEDULER)); + vortex_geo::initialize(&session); + session +}); +``` + +This matches DuckDB's current extension shape: a global runtime and global Vortex session. It still +keeps the scheduler explicit and testable. + +## Work Requests and Permits + +Scan work should acquire scheduler permits before consuming bounded resources. + +The first implementation should not require every `ReadPlan`, `EvidencePlan`, or `AggregatePlan` +to expose pending I/O, decoded-size estimates, or cost statistics. Those estimates are useful, but +they are also hard to get right and would make the initial ScanNode API more rigid. The V2 runtime +already knows the coarse unit of scheduling: the morsel. The MVP scheduler should admit morsels and +let each admitted morsel run its evidence/read/aggregate pipeline internally. 
+ +The MVP `WorkRequest` should be coarse: + +```rust +pub struct WorkRequest { + pub class: ScanWorkClass, + pub slots: u32, +} + +pub enum ScanWorkClass { + FileOpen, + Morsel, + OutputConversion, +} + +impl ScanScheduler { + pub fn register_scan(&self, meta: ScanMeta) -> ScanTicket; + + pub async fn acquire( + &self, + ticket: &ScanTicket, + request: WorkRequest, + ) -> VortexResult; +} +``` + +Richer byte/task fields can be added once the runtime has instrumentation showing which resource +limits matter in practice: + +```rust +pub struct PlanCostHint { + pub estimated_io_bytes: Option, + pub estimated_decoded_bytes: Option, + pub estimated_cpu_units: Option, +} +``` + +If those hints are added, they should remain advisory. A plan that does not provide hints should +still be schedulable with default morsel accounting. + +`WorkPermit` is RAII. Dropping it releases every reserved resource. This is required for early +limit termination, query cancellation, stream drop, and panic-safe cleanup. + +```rust +pub struct WorkPermit { + scheduler: Arc, + reservation: ReservationId, +} + +impl Drop for WorkPermit { + fn drop(&mut self) { + self.scheduler.release(self.reservation); + } +} +``` + +Once byte accounting exists, large work should be allowed to resize reservations after the actual +memory footprint is known: + +```rust +impl WorkPermit { + pub async fn grow_decoded_bytes(&mut self, bytes: u64) -> VortexResult<()>; + + pub fn shrink_decoded_bytes(&mut self, bytes: u64); +} +``` + +This will let the scan reserve from estimates first, then correct accounting after decoding. + +## Resources to Control + +The first implementation should control: + +- Maximum morsels in flight per scan. +- Maximum morsels in flight across a shared scheduler. + +This intentionally approximates the current scan behavior: unordered scans can run several morsels +concurrently, while ordered scans and scans with a pushed-down limit should run with a narrower +window. 
The default should mirror the existing `ScanBuilder` concurrency factor: + +```text +unordered/no-limit: max_morsels_in_flight = 4 * available_parallelism +ordered or limit: max_morsels_in_flight = 1 +``` + +The shared scheduler can apply the same window globally, per scan, or both. For example, a +DataFusion user can choose one shared scheduler with `4 * available_parallelism` total morsel slots +to cap the whole process, or create a new scheduler per query to preserve the old per-query +behavior. + +Later implementations can add: + +- I/O bytes in flight. +- Decoded/intermediate bytes in flight. +- Number of outstanding I/O operations. +- Number of decode/CPU tasks spawned by the scan path. +- Scheduler-aware segment cache admission. +- Per-scan weights and priorities. +- Storage-class-specific concurrency, such as separate local disk and object store limits. +- Output batch memory handoff, where permits live until the query engine consumes the batch. + +Output memory is the hardest resource to account for because ownership leaves the scan runtime. +The first byte-accounting implementation should bound intermediate scan memory and treat output +batch accounting as a follow-up. + +## Fairness + +A shared scheduler must avoid letting one large scan submit enough work to starve smaller scans. +The initial policy should combine global limits with per-scan windows: + +- Each `ScanTicket` has a maximum number of in-flight morsels. +- Global slot semaphores bound aggregate morsel concurrency. +- Work is admitted only when both the per-scan and global limits allow it. + +This is simpler than a centralized global work queue and avoids making the scheduler responsible for +query semantics. Weighted fair scheduling can be added later if the per-scan windows are not enough. 
+ +## Morsel Runtime + +The V2 scan should move toward an explicit per-scan runtime: + +```rust +pub struct MorselScanRuntime { + scheduler: Arc<ScanScheduler>, + ticket: ScanTicket, + plan: ScanRuntimePlan, + state: ScanRuntimeState, +} +``` + +`ScanRuntimePlan` is internal to the V2 implementation. It contains the files, expanded ScanNode +trees, pushed expressions, evidence plans, read plans, aggregate plans, and reusable per-file state. +It is not a replacement public scan API. + +Execution loop: + +```text +while output is still required: + claim next morsel + acquire per-morsel scheduler permits + read evidence needed for pruning or satisfaction + update row selection + read residual filter columns if needed + evaluate residual filter + read projected values or update aggregate state + emit batch or aggregate partial + release permits +``` + +The scheduler should not know that the work is "zoned evidence" or "dict read plan". It should only +see resource classes, slot counts, and cancellation state in the MVP. Later versions can add byte +estimates, CPU estimates, and priorities as advisory hints. + +## Cancellation + +`ScanTicket` owns a cancellation token. + +```rust +impl ScanTicket { + pub fn cancel(&self); + + pub fn is_cancelled(&self) -> bool; +} +``` + +Cancellation should happen when: + +- The engine drops the stream. +- A limit has been satisfied. +- A scheduler admission wait is cancelled. +- The host engine explicitly cancels the query. + +Queued work must observe the ticket before starting. Running work must release permits on drop. + +## Metrics + +The scheduler should expose per-scheduler and per-scan metrics: + +- Permit wait time by resource. +- Morsels admitted, completed, cancelled, and skipped. +- Per-scan queue/admission delay. + +Later byte/task accounting should add bytes reserved, peak bytes reserved, I/O operations admitted, +and decode tasks admitted. + +DataFusion should attach these to the existing scan metrics path where possible. 
DuckDB can expose +them through tracing or debug logs first. + +## Implementation Plan + +1. Add scheduler API to `vortex-scan`. + Include `ScanScheduler`, `ScanSchedulerConfig`, `ScanSchedulerProvider`, `ScanSchedulerSession`, + `ScanTicket`, `WorkRequest`, and `WorkPermit`. + +2. Wire the V2 scan to register one ticket per `DataSource::scan` call. + Store the ticket and scheduler in the V2 `DataSourceScan` so all partitions from the same scan + share one resource view. + +3. Add permits around V2 morsel execution. + Start with one scheduler slot per in-flight morsel. Do not require `ReadPlan`, `EvidencePlan`, + or `AggregatePlan` to expose cost estimates in the MVP. Keep byte accounting and output batch + memory accounting out of the MVP. + +4. Add DataFusion builder controls. + Support shared scheduler and per-query scheduler modes on `VortexDataSource`, `VortexTable`, and + `VortexFormatFactory`. + +5. Add DuckDB global scheduler. + Store a shared scheduler in the extension's global `VortexSession`. + +6. Add benchmark env vars. + Use them to compare unbounded, shared, and per-query scheduler modes under TPC-H and ClickBench. + +7. Add fairness and cancellation tests. + Tests should cover permit release on stream drop, per-scan windows, shared limits across two + scans, and per-query isolation. + +## Open Questions + +- Should the default scheduler remain unbounded permanently, or should V2 eventually use bounded + defaults? +- How should DataFusion propagate one per-query scheduler across several Vortex scan nodes in the + same physical plan? +- Should scheduler config be part of the public stable scan API or remain integration-specific until + the V2 scan is more mature? +- How should output batch memory be accounted once ownership moves into DataFusion or DuckDB? +- Should segment cache memory share the scheduler's decoded/intermediate budget, or have a separate + cache budget coordinated by the same scheduler? 
diff --git a/vortex-datafusion/src/convert/stats.rs b/vortex-datafusion/src/convert/stats.rs index 33a33a78ccf..a37592e4c7e 100644 --- a/vortex-datafusion/src/convert/stats.rs +++ b/vortex-datafusion/src/convert/stats.rs @@ -2,13 +2,23 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use datafusion_common::ColumnStatistics; +use datafusion_common::ScalarValue; use datafusion_common::stats::Precision; +use vortex::array::aggregate_fn::AggregateFnRef; +use vortex::array::aggregate_fn::AggregateFnVTableExt; +use vortex::array::aggregate_fn::EmptyOptions; +use vortex::array::aggregate_fn::fns::max::Max; +use vortex::array::aggregate_fn::fns::min::Min; +use vortex::array::aggregate_fn::fns::null_count::NullCount; +use vortex::array::aggregate_fn::fns::sum::Sum; +use vortex::array::aggregate_fn::fns::uncompressed_size_in_bytes::UncompressedSizeInBytes; use vortex::array::stats::StatsSet; use vortex::dtype::DType; use vortex::dtype::Nullability; use vortex::dtype::PType; use vortex::error::VortexExpect; use vortex::error::VortexResult; +use vortex::error::vortex_err; use vortex::expr::stats::Precision as VortexPrecision; use vortex::expr::stats::Stat; use vortex::scalar::Scalar; @@ -16,7 +26,45 @@ use vortex::scalar::Scalar; use crate::PrecisionExt; use crate::convert::TryToDataFusion; +const MIN_INDEX: usize = 0; +const MAX_INDEX: usize = 1; +const SUM_INDEX: usize = 2; +const NULL_COUNT_INDEX: usize = 3; +const BYTE_SIZE_INDEX: usize = 4; + +pub(crate) fn column_statistics_aggregate_fns() -> Vec { + vec![ + Min.bind(EmptyOptions), + Max.bind(EmptyOptions), + Sum.bind(EmptyOptions), + NullCount.bind(EmptyOptions), + UncompressedSizeInBytes.bind(EmptyOptions), + ] +} + +pub(crate) fn aggregate_stats_to_df( + stats: &[VortexPrecision], +) -> VortexResult { + if stats.len() != BYTE_SIZE_INDEX + 1 { + return Err(vortex_err!( + "expected {} aggregate statistics, got {}", + BYTE_SIZE_INDEX + 1, + stats.len() + )); + } + + Ok(ColumnStatistics { + null_count: 
scalar_u64_to_df_usize(&stats[NULL_COUNT_INDEX])?, + min_value: scalar_to_df(&stats[MIN_INDEX])?, + max_value: scalar_to_df(&stats[MAX_INDEX])?, + sum_value: scalar_to_df(&stats[SUM_INDEX])?, + distinct_count: Precision::Absent, + byte_size: scalar_u64_to_df_usize(&stats[BYTE_SIZE_INDEX])?, + }) +} + /// Convert a stats set for an array with the given dtype. +#[allow(dead_code)] pub(crate) fn stats_set_to_df( stats_set: &StatsSet, dtype: &DType, @@ -88,6 +136,29 @@ pub(crate) fn is_constant_to_distinct_count( } } +fn scalar_to_df(stat: &VortexPrecision) -> VortexResult> { + match stat { + VortexPrecision::Exact(scalar) => Ok(Precision::Exact(scalar.try_to_df()?)), + VortexPrecision::Inexact(scalar) => Ok(Precision::Inexact(scalar.try_to_df()?)), + VortexPrecision::Absent => Ok(Precision::Absent), + } +} + +fn scalar_u64_to_df_usize(stat: &VortexPrecision) -> VortexResult> { + match stat { + VortexPrecision::Exact(scalar) => Ok(Precision::Exact(scalar_u64_to_usize(scalar)?)), + VortexPrecision::Inexact(scalar) => Ok(Precision::Inexact(scalar_u64_to_usize(scalar)?)), + VortexPrecision::Absent => Ok(Precision::Absent), + } +} + +fn scalar_u64_to_usize(scalar: &Scalar) -> VortexResult { + let Some(value) = scalar.as_primitive().typed_value::() else { + return Err(vortex_err!("expected u64 statistic scalar, got {}", scalar)); + }; + Ok(usize::try_from(value).unwrap_or(usize::MAX)) +} + #[cfg(test)] mod tests { use vortex::expr::stats::Precision as VortexPrecision; diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index 0381e550893..ec79cad35cd 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -57,8 +57,10 @@ use vortex::file::EOF_SIZE; use vortex::file::MAX_POSTSCRIPT_SIZE; use vortex::file::OpenOptionsSessionExt; use vortex::file::VORTEX_FILE_EXTENSION; +use vortex::file::VortexFile; use vortex::io::object_store::ObjectStoreReadAt; use 
vortex::io::session::RuntimeSessionExt; +use vortex::layout::scan::v2::scan2_enabled; use vortex::scalar::Scalar; use vortex::scalar::ScalarValue as VortexScalarValue; use vortex::session::VortexSession; @@ -68,6 +70,8 @@ use super::sink::VortexSink; use super::source::VortexSource; use crate::PrecisionExt as _; use crate::convert::TryToDataFusion; +use crate::convert::stats::aggregate_stats_to_df; +use crate::convert::stats::column_statistics_aggregate_fns; use crate::convert::stats::is_constant_to_distinct_count; const DEFAULT_FOOTER_INITIAL_READ_SIZE_BYTES: usize = MAX_POSTSCRIPT_SIZE as usize + EOF_SIZE; @@ -427,6 +431,49 @@ impl FileFormat for VortexFormat { let file_metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); SpawnedTask::spawn(async move { + if scan2_enabled().map_err(|error| DataFusionError::External(Box::new(error)))? { + let cached_footer = file_metadata_cache + .get(&object.location) + .filter(|entry| entry.is_valid_for(&object)) + .and_then(|entry| { + entry + .file_metadata + .as_any() + .downcast_ref::() + .map(|vortex_metadata| vortex_metadata.footer().clone()) + }); + let footer_cache_hit = cached_footer.is_some(); + + let reader = Arc::new(ObjectStoreReadAt::new_with_allocator( + store, + object.location.clone(), + session.handle(), + session.allocator(), + )); + let mut open_opts = session + .open_options() + .with_initial_read_size(opts.footer_initial_read_size_bytes) + .with_file_size(object.size); + if let Some(footer) = cached_footer { + open_opts = open_opts.with_footer(footer); + } + + let vxf = open_opts.open_read(reader).await.map_err(|e| { + DataFusionError::Execution(format!( + "Failed to open Vortex file {}: {e}", + object.location + )) + })?; + + if !footer_cache_hit { + let file_metadata = Arc::new(CachedVortexMetadata::new(&vxf)); + let entry = CachedFileMetadataEntry::new(object.clone(), file_metadata); + file_metadata_cache.put(&object.location, entry); + } + + return 
infer_scan_node_stats(&table_schema, &vxf).await; + } + // Try to get entry metadata first let cached_metadata = file_metadata_cache .get(&object.location) @@ -628,6 +675,49 @@ impl FileFormat for VortexFormat { } } +async fn infer_scan_node_stats(table_schema: &SchemaRef, vxf: &VortexFile) -> DFResult { + let struct_dtype = vxf + .dtype() + .as_struct_fields_opt() + .vortex_expect("dtype is not a struct"); + let funcs = column_statistics_aggregate_fns(); + let mut column_statistics = vec![ColumnStatistics::default(); table_schema.fields().len()]; + let mut requested_columns = Vec::new(); + let mut requested_exprs = Vec::new(); + + for (idx, field) in table_schema.fields().iter().enumerate() { + if struct_dtype.find(field.name()).is_some() { + requested_columns.push(idx); + requested_exprs.push(vortex::expr::get_item( + field.name().as_str(), + vortex::expr::root(), + )); + } + } + + let stats = vxf + .scan_node_statistics_many(&requested_exprs, &funcs) + .await + .map_err(|e| DataFusionError::Execution(format!("Failed to infer scan2 stats: {e}")))?; + for (column_idx, stats) in requested_columns.into_iter().zip(stats) { + column_statistics[column_idx] = aggregate_stats_to_df(&stats).map_err(|e| { + DataFusionError::Execution(format!("Failed to convert scan2 stats: {e}")) + })?; + } + + let total_byte_size = column_statistics + .iter() + .fold(DFPrecision::Exact(0), |acc, cs| acc.add(&cs.byte_size)); + let num_rows = usize::try_from(vxf.row_count()) + .map_err(|_| DataFusionError::Execution("Row count overflow".to_string()))?; + + Ok(Statistics { + num_rows: DFPrecision::Exact(num_rows), + total_byte_size, + column_statistics, + }) +} + fn scalar_stat_to_df( stat: Stat, value: Precision, diff --git a/vortex-datafusion/src/persistent/metrics.rs b/vortex-datafusion/src/persistent/metrics.rs index e3bb1b18868..e94fc426646 100644 --- a/vortex-datafusion/src/persistent/metrics.rs +++ b/vortex-datafusion/src/persistent/metrics.rs @@ -23,6 +23,7 @@ use 
vortex::metrics::Metric; use vortex::metrics::MetricValue; use crate::persistent::source::VortexSource; +use crate::v2::VortexDataSource; pub(crate) static PARTITION_LABEL: &str = "partition"; pub(crate) static PATH_LABEL: &str = "file_path"; @@ -91,6 +92,18 @@ impl ExecutionPlanVisitor for VortexMetricsFinder { } } + if let Some(scan) = exec.data_source().downcast_ref::() + && let Some(metrics_registry) = scan.metrics_registry() + { + for metric in metrics_registry + .snapshot() + .iter() + .flat_map(metric_to_datafusion) + { + set.push(Arc::new(metric)); + } + } + self.0.push(set); Ok(false) diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index d50b003f1dc..9e7c49940cc 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -42,12 +42,14 @@ use vortex::dtype::FieldMask; use vortex::error::VortexError; use vortex::error::VortexExpect; use vortex::file::OpenOptionsSessionExt; +use vortex::file::VortexFile; use vortex::io::InstrumentedReadAt; use vortex::layout::LayoutReader; use vortex::layout::scan::scan_builder::ScanBuilder; use vortex::layout::scan::split_by::SplitBy; use vortex::metrics::Label; use vortex::metrics::MetricsRegistry; +use vortex::scan::ScanRequest; use vortex::session::VortexSession; use vortex_utils::aliases::dash_map::DashMap; use vortex_utils::aliases::dash_map::Entry; @@ -69,6 +71,7 @@ pub(crate) struct VortexOpener { pub partition: usize, pub session: VortexSession, pub vortex_reader_factory: Arc, + pub scan_v2: bool, /// Optional table schema projection. The indices are w.r.t. the `table_schema`, which is /// all fields in the final scan result not including the partition columns. 
pub projection: ProjectionExprs, @@ -137,6 +140,7 @@ impl FileOpener for VortexOpener { let expr_convertor = Arc::clone(&self.expression_convertor); let projection_pushdown = self.projection_pushdown; + let scan_v2 = self.scan_v2; // Replace column access for partition columns with literals #[expect(clippy::disallowed_types)] @@ -302,6 +306,146 @@ impl FileOpener for VortexOpener { .try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; let projector = leftover_projection.make_projector(&stream_schema)?; + let filter = filter + .and_then(|f| { + // Verify that all filters we've accepted from DataFusion get pushed down. + // This will only fail if the user has not configured a suitable + // PhysicalExprAdapterFactory on the file source to handle rewriting the + // expression to handle missing/reordered columns in the Vortex file. + let (pushed, unpushed): (Vec, Vec) = + split_conjunction(&f) + .into_iter() + .cloned() + .partition(|expr| { + expr_convertor.can_be_pushed_down(expr, &this_file_schema) + }); + + if !unpushed.is_empty() { + return Some(Err(exec_datafusion_err!( + r#"VortexSource accepted but failed to push {} filters. + This should never happen if you have a properly configured + PhysicalExprAdapterFactory configured on the source. 
+ + Failed filters: + + {unpushed:#?} + "#, + unpushed.len() + ))); + } + + make_vortex_predicate(expr_convertor.as_ref(), &pushed).transpose() + }) + .transpose()?; + + if scan_v2 { + let row_range = if let Some(file_range) = file.range { + let byte_range = Range { + start: u64::try_from(file_range.start).map_err(|_| { + exec_datafusion_err!("Vortex file range start is negative") + })?, + end: u64::try_from(file_range.end).map_err(|_| { + exec_datafusion_err!("Vortex file range end is negative") + })?, + }; + if byte_range.start == 0 && byte_range.end == file.object_meta.size { + None + } else { + let natural_split_ranges = scan_node_natural_split_ranges_for_file( + natural_split_ranges.as_ref(), + &file.object_meta.location, + &vxf, + )?; + + let Some(row_range) = split_aligned_row_range( + byte_range, + file.object_meta.size, + natural_split_ranges.as_ref(), + ) else { + return Ok(stream::empty().boxed()); + }; + Some(row_range) + } + } else { + None + }; + + let selection = file + .extensions + .get::() + .and_then(|vortex_plan| vortex_plan.selection().cloned()) + .unwrap_or_default(); + let stream_target_field = + Field::new_struct("", stream_schema.fields().clone(), false); + let file_location = file.object_meta.location.clone(); + let array_stream = vxf + .scan_node_stream(ScanRequest { + projection: scan_projection, + filter, + row_range, + selection, + ordered: has_output_ordering, + limit, + ..Default::default() + }) + .map_err(|e| { + exec_datafusion_err!("Failed to create Vortex scan2 stream: {e}") + })?; + let stream = array_stream + .map(move |chunk| { + let chunk = chunk?; + let mut ctx = session.create_execution_ctx(); + let arrow_session = ctx.session().clone(); + let arrow = arrow_session.arrow().execute_arrow( + chunk, + Some(&stream_target_field), + &mut ctx, + )?; + Ok(RecordBatch::from(arrow.as_struct().clone())) + }) + .map_ok(move |rb| { + // We try and slice the stream into respecting datafusion's configured batch size. 
+ stream::iter( + (0..rb.num_rows().div_ceil(batch_size * 2)) + .flat_map(move |block_idx| { + let offset = block_idx * batch_size * 2; + + // If we have less than two batches worth of rows left, we keep them together as a single batch. + if rb.num_rows() - offset < 2 * batch_size { + let length = rb.num_rows() - offset; + [Some(rb.slice(offset, length)), None].into_iter() + } else { + let first = rb.slice(offset, batch_size); + let second = rb.slice(offset + batch_size, batch_size); + [Some(first), Some(second)].into_iter() + } + }) + .flatten() + .map(Ok), + ) + }) + .map_err(move |e: VortexError| { + DataFusionError::External(Box::new( + e.with_context(format!("Failed to read Vortex file: {file_location}")), + )) + }) + .try_flatten() + .map(move |batch| { + if projector.projection().as_ref().is_empty() { + batch + } else { + batch.and_then(|b| projector.project_batch(&b)) + } + }) + .boxed(); + + return if let Some(file_pruner) = file_pruner { + Ok(PrunableStream::new(file_pruner, stream).boxed()) + } else { + Ok(stream) + }; + } + // We share our layout readers with others partitions in the scan, so we can only need to read each layout in each file once. let layout_reader = match layout_readers.entry(file.object_meta.location.clone()) { Entry::Occupied(mut occupied_entry) => { @@ -364,38 +508,6 @@ impl FileOpener for VortexOpener { } } - let filter = filter - .and_then(|f| { - // Verify that all filters we've accepted from DataFusion get pushed down. - // This will only fail if the user has not configured a suitable - // PhysicalExprAdapterFactory on the file source to handle rewriting the - // expression to handle missing/reordered columns in the Vortex file. 
- let (pushed, unpushed): (Vec, Vec) = - split_conjunction(&f) - .into_iter() - .cloned() - .partition(|expr| { - expr_convertor.can_be_pushed_down(expr, &this_file_schema) - }); - - if !unpushed.is_empty() { - return Some(Err(exec_datafusion_err!( - r#"VortexSource accepted but failed to push {} filters. - This should never happen if you have a properly configured - PhysicalExprAdapterFactory configured on the source. - - Failed filters: - - {unpushed:#?} - "#, - unpushed.len() - ))); - } - - make_vortex_predicate(expr_convertor.as_ref(), &pushed).transpose() - }) - .transpose()?; - if let Some(limit) = limit && filter.is_none() { @@ -492,6 +604,29 @@ fn natural_split_ranges_for_file( } } +fn scan_node_natural_split_ranges_for_file( + natural_split_ranges: &DashMap]>>, + path: &Path, + file: &VortexFile, +) -> DFResult]>> { + if let Some(split_ranges) = natural_split_ranges.get(path) { + return Ok(Arc::clone(split_ranges.value())); + } + + let split_ranges = file + .scan_node_splits() + .map(Arc::from) + .map_err(|e| exec_datafusion_err!("Failed to compute Vortex scan2 natural splits: {e}"))?; + + match natural_split_ranges.entry(path.clone()) { + Entry::Occupied(entry) => Ok(Arc::clone(entry.get())), + Entry::Vacant(entry) => { + entry.insert(Arc::clone(&split_ranges)); + Ok(split_ranges) + } + } +} + fn compute_natural_split_ranges(layout_reader: &dyn LayoutReader) -> DFResult]>> { let row_count = layout_reader.row_count(); let row_range = 0..row_count; @@ -695,6 +830,7 @@ mod tests { partition: 1, session: SESSION.clone(), vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(object_store)), + scan_v2: false, projection: ProjectionExprs::from_indices(&[0], table_schema.file_schema()), filter, file_pruning_predicate: None, @@ -793,6 +929,29 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_open_scan_v2() -> anyhow::Result<()> { + let object_store = Arc::new(InMemory::new()) as Arc; + let file_path = "scan2/file.vortex"; + let batch = 
record_batch!(("a", Int32, vec![Some(1), Some(2), Some(3)])).unwrap(); + let data_size = + write_arrow_to_vortex(Arc::clone(&object_store), file_path, batch.clone()).await?; + + let table_schema = TableSchema::from_file_schema(batch.schema()); + let mut opener = make_opener(object_store, table_schema, None); + opener.scan_v2 = true; + + let stream = opener + .open(PartitionedFile::new(file_path.to_string(), data_size))? + .await?; + let data = stream.try_collect::>().await?; + let num_rows = data.iter().map(|rb| rb.num_rows()).sum::(); + + assert_eq!(num_rows, 3); + + Ok(()) + } + #[tokio::test] async fn test_open_populates_file_metadata_cache() -> anyhow::Result<()> { let object_store = Arc::new(InMemory::new()) as Arc; @@ -867,6 +1026,7 @@ mod tests { vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(Arc::clone( &object_store, ))), + scan_v2: false, projection: ProjectionExprs::from_indices(&[0], table_schema.file_schema()), filter: Some(filter), file_pruning_predicate: None, @@ -954,6 +1114,7 @@ mod tests { partition: 1, session: SESSION.clone(), vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(object_store)), + scan_v2: false, projection: ProjectionExprs::from_indices(&[0, 1, 2], &table_schema), filter: None, file_pruning_predicate: None, @@ -1108,6 +1269,7 @@ mod tests { vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(Arc::clone( &object_store, ))), + scan_v2: false, projection: ProjectionExprs::from_indices( projection.as_ref(), table_schema.file_schema(), @@ -1171,6 +1333,7 @@ mod tests { partition: 1, session: SESSION.clone(), vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(object_store)), + scan_v2: false, projection, filter: None, file_pruning_predicate: None, @@ -1380,6 +1543,7 @@ mod tests { vortex_reader_factory: Arc::new(DefaultVortexReaderFactory::new(Arc::clone( &object_store, ))), + scan_v2: false, projection, filter: None, file_pruning_predicate: None, diff --git 
a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index 086c75adda7..c460d081742 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -6,6 +6,7 @@ use std::ops::Range; use std::sync::Arc; use std::sync::Weak; +use datafusion_common::DataFusionError; use datafusion_common::Result as DFResult; use datafusion_common::config::ConfigOptions; use datafusion_datasource::TableSchema; @@ -32,6 +33,7 @@ use object_store::path::Path; use vortex::error::VortexExpect; use vortex::file::VORTEX_FILE_EXTENSION; use vortex::layout::LayoutReader; +use vortex::layout::scan::v2::scan2_enabled; use vortex::metrics::DefaultMetricsRegistry; use vortex::metrics::MetricsRegistry; use vortex::session::VortexSession; @@ -325,11 +327,14 @@ impl VortexSource { .vortex_reader_factory .clone() .unwrap_or_else(|| Arc::new(DefaultVortexReaderFactory::new(object_store))); + let scan_v2 = + scan2_enabled().map_err(|error| DataFusionError::External(Box::new(error)))?; let opener = VortexOpener { partition, session: self.session.clone(), vortex_reader_factory, + scan_v2, projection: self.projection.clone(), filter: self.vortex_predicate.clone(), file_pruning_predicate: self.full_predicate.clone(), diff --git a/vortex-datafusion/src/v2/source.rs b/vortex-datafusion/src/v2/source.rs index 82fae9bf56d..3b190e6ff84 100644 --- a/vortex-datafusion/src/v2/source.rs +++ b/vortex-datafusion/src/v2/source.rs @@ -71,6 +71,7 @@ use std::fmt; use std::fmt::Formatter; use std::sync::Arc; +use arrow_schema::DataType; use arrow_schema::Field; use arrow_schema::Schema; use arrow_schema::SchemaRef; @@ -97,10 +98,13 @@ use datafusion_physical_plan::stream::RecordBatchStreamAdapter; use futures::StreamExt; use futures::TryStreamExt; use futures::future::try_join_all; +use futures::stream; +use futures::stream::BoxStream; +use tokio::sync::OnceCell; +use vortex::array::ArrayRef; use vortex::array::VortexSessionExecute; use 
vortex::array::arrow::ArrowSessionExt; use vortex::dtype::DType; -use vortex::dtype::FieldPath; use vortex::dtype::Nullability; use vortex::error::VortexResult; use vortex::error::vortex_bail; @@ -112,8 +116,13 @@ use vortex::expr::root; use vortex::expr::stats::Precision; use vortex::expr::transform::replace; use vortex::io::session::RuntimeSessionExt; +use vortex::metrics::MetricsRegistry; use vortex::scan::DataSourceRef; +use vortex::scan::PlannedMorselScanRef; use vortex::scan::ScanRequest; +use vortex::scan::ScanScheduler; +use vortex::scan::ScanSchedulerConfig; +use vortex::scan::ScanSchedulerProvider; use vortex::session::VortexSession; use vortex_utils::parallelism::get_available_parallelism; @@ -121,7 +130,8 @@ use crate::convert::exprs::DefaultExpressionConvertor; use crate::convert::exprs::ExpressionConvertor; use crate::convert::exprs::ProcessedProjection; use crate::convert::exprs::make_vortex_predicate; -use crate::convert::stats::stats_set_to_df; +use crate::convert::stats::aggregate_stats_to_df; +use crate::convert::stats::column_statistics_aggregate_fns; /// Builder for [`VortexDataSource`]. /// @@ -168,6 +178,8 @@ pub struct VortexDataSourceBuilder { arrow_schema: Option, projection: Option>, + metrics_registry: Option>, + scheduler_provider: Option>, } impl VortexDataSourceBuilder { @@ -198,6 +210,33 @@ impl VortexDataSourceBuilder { self } + /// Attaches a Vortex metrics registry populated by the underlying data source. + /// + /// The V2 adapter does not open files itself, so callers that want Vortex read metrics must + /// also configure the wrapped source to write to this same registry. + pub fn with_metrics_registry(mut self, metrics_registry: Arc) -> Self { + self.metrics_registry = Some(metrics_registry); + self + } + + /// Configures a shared scan scheduler for scans from this DataFusion source. 
+ pub fn with_scan_scheduler(mut self, scheduler: Arc) -> Self { + self.scheduler_provider = Some(Arc::new(ScanSchedulerProvider::Shared(scheduler))); + self + } + + /// Configures the scheduler ownership strategy for scans from this DataFusion source. + pub fn with_scan_scheduler_provider(mut self, provider: Arc) -> Self { + self.scheduler_provider = Some(provider); + self + } + + /// Configures this source to create a new scan scheduler for each Vortex scan. + pub fn with_new_scan_scheduler_per_query(mut self, config: ScanSchedulerConfig) -> Self { + self.scheduler_provider = Some(Arc::new(ScanSchedulerProvider::PerScan(config))); + self + } + /// Builds the [`VortexDataSource`]. /// /// The builder eagerly resolves statistics for the initial projection @@ -242,21 +281,21 @@ impl VortexDataSourceBuilder { }; // We now compute initial statistics. - let field_paths: Vec<_> = fields + let statistics_exprs: Vec<_> = fields .names() .iter() .cloned() - .map(FieldPath::from_name) + .map(|name| get_item(name, root())) .collect(); + let statistics_funcs = column_statistics_aggregate_fns(); let statistics = try_join_all( - field_paths + statistics_exprs .iter() - .map(|path| self.data_source.field_statistics(path)), + .map(|expr| self.data_source.statistics(expr, &statistics_funcs)), ) .await? 
.iter() - .zip(fields.fields()) - .map(|(stats, dtype)| stats_set_to_df(stats, &dtype)) + .map(|stats| aggregate_stats_to_df(stats)) .collect::>>()?; Ok(VortexDataSource { @@ -275,6 +314,9 @@ impl VortexDataSourceBuilder { limit: None, ordered: false, num_partitions: get_available_parallelism().unwrap_or(1), + metrics_registry: self.metrics_registry, + scheduler_provider: self.scheduler_provider, + morsel_plan: Arc::new(OnceCell::new()), }) } } @@ -287,8 +329,31 @@ impl VortexDataSource { session, arrow_schema: None, projection: None, + metrics_registry: None, + scheduler_provider: None, + } + } + + fn scan_partition_count(&self) -> usize { + if self.should_morsel_repartition() { + self.num_partitions.max(1) + } else { + 1 } } + + fn should_morsel_repartition(&self) -> bool { + self.data_source.supports_morsel_partitioning() && !self.ordered && self.limit.is_none() + } + + fn reset_morsel_plan(&mut self) { + self.morsel_plan = Arc::new(OnceCell::new()); + } + + /// Returns the metrics registry attached to this source, if one was configured. + pub fn metrics_registry(&self) -> Option<&Arc> { + self.metrics_registry.as_ref() + } } /// DataFusion [`DataSource`] backed by a Vortex [`DataSourceRef`]. @@ -301,9 +366,12 @@ impl VortexDataSource { /// During execution, it builds the final Vortex [`ScanRequest`] from the /// current projection, pushed filters, ordering hints, and row limit. /// -/// This integration intentionally reports a single DataFusion output partition. -/// Vortex then handles split-level concurrency internally by polling multiple -/// split streams concurrently. +/// For unordered scans without a limit, this integration reports DataFusion's +/// requested partition count when the wrapped source supports ScanNode morsel +/// partitioning. The async morsel plan is still built lazily in [`DataSource::open`], +/// so partitions beyond the discovered morsel count produce empty streams. 
+/// Ordered and limited scans use one output partition so the source can preserve +/// semantics. /// /// Use [`crate::VortexSource`] instead when DataFusion should discover and plan /// `.vortex` files on its own. @@ -352,10 +420,18 @@ pub struct VortexDataSource { ordered: bool, /// The requested partition count from DataFusion, populated by [`DataSource::repartitioned`]. - /// We use this as a hint for how many splits to execute concurrently in `open()`, but we - /// always declare to DataFusion that we only have a single partition so that we can - /// internally manage concurrency and fix the problem of partition skew. + /// When morsel partitioning is enabled, this is also the count we report back to DataFusion. + /// The final lazy plan may discover fewer non-empty partitions. num_partitions: usize, + + /// Optional Vortex metrics registry populated by the wrapped source. + metrics_registry: Option>, + + /// Optional scheduler provider passed through the Vortex [`ScanRequest`]. + scheduler_provider: Option>, + + /// Shared planned scan for DataFusion morsel repartitioning. + morsel_plan: Arc>>, } impl fmt::Debug for VortexDataSource { @@ -369,17 +445,40 @@ impl fmt::Debug for VortexDataSource { } } +async fn scan_to_array_stream( + data_source: DataSourceRef, + scan_request: ScanRequest, + num_partitions: usize, +) -> DFResult>> { + let scan = data_source + .scan(scan_request) + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + // Each split.execute() returns a lazy stream whose early polls do preparation + // work (expression resolution, layout traversal, first I/O spawns). We use + // try_flatten_unordered to poll multiple split streams concurrently so that + // the next split is already warm when the current one finishes. 
+ let scan_streams = scan.partitions().map(|split_result| { + let split = split_result?; + split.execute() + }); + + Ok(scan_streams + .try_flatten_unordered(Some(num_partitions * 2)) + .boxed()) +} + impl DataSource for VortexDataSource { fn open( &self, partition: usize, _context: Arc, ) -> DFResult { - // VortexScanSource always uses a single partition since Vortex handles parallelism - // and concurrency internally. - if partition != 0 { + let scan_partition_count = self.scan_partition_count(); + if partition >= scan_partition_count { return Err(DataFusionError::Internal(format!( - "VortexScanSource: expected partition 0, got {partition}" + "VortexScanSource: expected partition in 0..{scan_partition_count}, got {partition}" ))); } @@ -390,6 +489,7 @@ impl DataSource for VortexDataSource { filter: self.filter.clone(), limit: self.limit.map(|l| u64::try_from(l).unwrap_or(u64::MAX)), ordered: self.ordered, + scheduler_provider: self.scheduler_provider.clone(), ..Default::default() }; @@ -401,7 +501,10 @@ impl DataSource for VortexDataSource { false, )); let session = self.session.clone(); - let num_partitions = self.num_partitions; + let num_partitions = self.num_partitions.max(1); + let scan_partition_count = self.scan_partition_count(); + let use_morsel_repartition = self.should_morsel_repartition(); + let morsel_plan = Arc::clone(&self.morsel_plan); // Pre-build the leftover projector (if any) so we can apply it after batch conversion. let leftover_projector = self @@ -410,25 +513,48 @@ impl DataSource for VortexDataSource { .map(|proj| proj.make_projector(&self.projected_schema)) .transpose()?; - // Defer the async DataSource::scan() call to the first poll of the stream. 
- let stream = futures::stream::once(async move { - let scan = data_source - .scan(scan_request) - .await - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - // Each split.execute() returns a lazy stream whose early polls do preparation - // work (expression resolution, layout traversal, first I/O spawns). We use - // try_flatten_unordered to poll multiple split streams concurrently so that - // the next split is already warm when the current one finishes. - let scan_streams = scan.partitions().map(|split_result| { - let split = split_result?; - split.execute() - }); + // Defer the async DataSource work to the first poll of the stream. + let stream = stream::once(async move { + let array_stream: BoxStream<'static, VortexResult> = if use_morsel_repartition + { + let planned = morsel_plan + .get_or_try_init(|| { + let data_source = Arc::clone(&data_source); + let scan_request = scan_request.clone(); + async move { + data_source + .plan_morsel_partitions(scan_request, scan_partition_count) + .await + .map_err(|e| DataFusionError::External(Box::new(e))) + } + }) + .await?; + + if let Some(planned) = planned { + if partition >= planned.partition_count() { + // DataFusion can schedule every partition it asked us to expose. If the + // final lazy plan found fewer morsels, the surplus partitions are empty. + stream::empty().boxed() + } else { + Arc::clone(planned) + .partition(partition) + .map_err(|e| DataFusionError::External(Box::new(e)))? + .execute() + .map_err(|e| DataFusionError::External(Box::new(e)))? + .boxed() + } + } else if partition == 0 { + scan_to_array_stream(Arc::clone(&data_source), scan_request, num_partitions) + .await? + } else { + stream::empty().boxed() + } + } else { + scan_to_array_stream(Arc::clone(&data_source), scan_request, num_partitions).await? 
+ }; let handle = session.handle(); - let stream = scan_streams - .try_flatten_unordered(Some(num_partitions * 2)) + let stream = array_stream .map(move |result| { let session = session.clone(); let target_field = Arc::clone(&projected_target_field); @@ -489,28 +615,36 @@ impl DataSource for VortexDataSource { _repartition_file_min_size: usize, output_ordering: Option, ) -> DFResult>> { - // Vortex handles parallelism internally — always use a single partition. let mut this = self.clone(); this.num_partitions = target_partitions; this.ordered |= output_ordering.is_some(); + this.reset_morsel_plan(); Ok(Some(Arc::new(this))) } fn output_partitioning(&self) -> Partitioning { - Partitioning::UnknownPartitioning(1) + // Report the engine-requested partition count. We do not pre-open files here just to learn + // the exact morsel count; open() maps any surplus partitions to empty streams. + Partitioning::UnknownPartitioning(self.scan_partition_count()) } fn eq_properties(&self) -> EquivalenceProperties { EquivalenceProperties::new(Arc::clone(&self.leftover_schema)) } - fn partition_statistics(&self, _partition: Option) -> DFResult> { + fn partition_statistics(&self, partition: Option) -> DFResult> { // FIXME(ngates): this should be adjusted based on filters. See DuckDB for heuristics, // and in the future, store the selectivity stats in the session. - let num_rows = estimate_to_df_precision(&self.data_source.row_count()); + let mut num_rows = estimate_to_df_precision(&self.data_source.row_count()); // FIXME(ngates): byte size should be adjusted for the initial projection... 
- let total_byte_size = estimate_to_df_precision(&self.data_source.byte_size()); + let mut total_byte_size = estimate_to_df_precision(&self.data_source.byte_size()); + + if partition.is_some() { + let partition_count = self.scan_partition_count(); + num_rows = divide_df_precision(num_rows, partition_count); + total_byte_size = divide_df_precision(total_byte_size, partition_count); + } // Column statistics must match the output schema (leftover_schema), which may differ // from the initial schema after try_swapping_with_projection adds computed columns. @@ -526,6 +660,7 @@ impl DataSource for VortexDataSource { fn with_fetch(&self, limit: Option) -> Option> { let mut this = self.clone(); this.limit = limit; + this.reset_morsel_plan(); Some(Arc::new(this)) } @@ -558,7 +693,9 @@ impl DataSource for VortexDataSource { // Compose with the initial projection so the scan operates on the original // source columns, not the initial projection's output columns. - let scan_projection = replace(scan_projection, &root(), self.initial_projection.clone()); + let scan_projection = replace(scan_projection, &root(), self.initial_projection.clone()) + .optimize_recursive(self.data_source.dtype()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // Compute the scan output schema from the Vortex expression's return dtype. 
let scan_dtype = scan_projection @@ -586,6 +723,7 @@ impl DataSource for VortexDataSource { this.leftover_schema = Arc::clone(&final_schema); this.leftover_statistics = vec![ColumnStatistics::new_unknown(); final_schema.fields().len()]; + this.reset_morsel_plan(); Ok(Some(Arc::new(this))) } @@ -609,7 +747,8 @@ impl DataSource for VortexDataSource { let pushdown_results: Vec = filters .iter() .map(|expr| { - if convertor.can_be_pushed_down(expr, input_schema) { + let is_boolean = matches!(expr.data_type(input_schema), Ok(DataType::Boolean)); + if is_boolean && convertor.can_be_pushed_down(expr, input_schema) { PushedDown::Yes } else { PushedDown::No @@ -647,6 +786,7 @@ impl DataSource for VortexDataSource { let mut this = self.clone(); this.filter = new_filter; + this.reset_morsel_plan(); Ok( FilterPushdownPropagation::with_parent_pushdown_result(pushdown_results) .with_updated_node(Arc::new(this) as _), @@ -665,3 +805,12 @@ fn estimate_to_df_precision(est: &Precision) -> DFPrecision { Precision::Absent => DFPrecision::Absent, } } + +fn divide_df_precision(est: DFPrecision, divisor: usize) -> DFPrecision { + let divisor = divisor.max(1); + match est { + DFPrecision::Exact(v) => DFPrecision::Exact(v.div_ceil(divisor)), + DFPrecision::Inexact(v) => DFPrecision::Inexact(v.div_ceil(divisor)), + DFPrecision::Absent => DFPrecision::Absent, + } +} diff --git a/vortex-datafusion/src/v2/table.rs b/vortex-datafusion/src/v2/table.rs index b881eb025ab..54642827781 100644 --- a/vortex-datafusion/src/v2/table.rs +++ b/vortex-datafusion/src/v2/table.rs @@ -24,7 +24,11 @@ use datafusion_expr::Expr; use datafusion_expr::TableType; use datafusion_physical_plan::ExecutionPlan; use vortex::expr::stats::Precision as VortexPrecision; +use vortex::metrics::MetricsRegistry; use vortex::scan::DataSourceRef; +use vortex::scan::ScanScheduler; +use vortex::scan::ScanSchedulerConfig; +use vortex::scan::ScanSchedulerProvider; use vortex::session::VortexSession; use 
crate::v2::source::VortexDataSource; @@ -76,6 +80,8 @@ pub struct VortexTable { data_source: DataSourceRef, session: VortexSession, arrow_schema: SchemaRef, + metrics_registry: Option>, + scheduler_provider: Option>, } impl fmt::Debug for VortexTable { @@ -100,8 +106,37 @@ impl VortexTable { data_source, session, arrow_schema, + metrics_registry: None, + scheduler_provider: None, } } + + /// Attaches a Vortex metrics registry populated by the underlying data source. + /// + /// The V2 table does not open files itself, so callers that want Vortex read metrics must also + /// configure the wrapped source to write to this same registry. + pub fn with_metrics_registry(mut self, metrics_registry: Arc) -> Self { + self.metrics_registry = Some(metrics_registry); + self + } + + /// Configures a shared scan scheduler for scans from this table. + pub fn with_scan_scheduler(mut self, scheduler: Arc) -> Self { + self.scheduler_provider = Some(Arc::new(ScanSchedulerProvider::Shared(scheduler))); + self + } + + /// Configures the scheduler ownership strategy for scans from this table. + pub fn with_scan_scheduler_provider(mut self, provider: Arc) -> Self { + self.scheduler_provider = Some(provider); + self + } + + /// Configures this table to create a new scan scheduler for each Vortex scan. + pub fn with_new_scan_scheduler_per_query(mut self, config: ScanSchedulerConfig) -> Self { + self.scheduler_provider = Some(Arc::new(ScanSchedulerProvider::PerScan(config))); + self + } } #[async_trait] @@ -122,23 +157,30 @@ impl TableProvider for VortexTable { _limit: Option, ) -> DFResult> { // Construct the physical node representing this table. - let data_source = - VortexDataSource::builder(Arc::clone(&self.data_source), self.session.clone()) - .with_arrow_schema(Arc::clone(&self.arrow_schema)) - // We push down the projection now since it can make building the physical plan a lot - // cheaper, e.g. by only computing stats for the projected columns. 
- .with_some_projection(projection.cloned()) - // We don't push down filters for two reasons: - // 1. Vortex requires a physical expression, not logical. DataFusion will try to push - // the physical filters later. - // 2. There's nothing useful we can do with filters now to reduce the amount of work - // we have to do. - // - // We also don't push down the limit for the same reason, there's nothing useful we - // can do with it. - .build() - .await - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let mut builder = + VortexDataSource::builder(Arc::clone(&self.data_source), self.session.clone()); + if let Some(metrics_registry) = &self.metrics_registry { + builder = builder.with_metrics_registry(Arc::clone(metrics_registry)); + } + if let Some(provider) = &self.scheduler_provider { + builder = builder.with_scan_scheduler_provider(Arc::clone(provider)); + } + let data_source = builder + .with_arrow_schema(Arc::clone(&self.arrow_schema)) + // We push down the projection now since it can make building the physical plan a lot + // cheaper, e.g. by only computing stats for the projected columns. + .with_some_projection(projection.cloned()) + // We don't push down filters for two reasons: + // 1. Vortex requires a physical expression, not logical. DataFusion will try to push + // the physical filters later. + // 2. There's nothing useful we can do with filters now to reduce the amount of work + // we have to do. + // + // We also don't push down the limit for the same reason, there's nothing useful we + // can do with it. 
+ .build() + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; Ok(DataSourceExec::from_data_source(data_source)) } diff --git a/vortex-duckdb/src/column_statistics.rs b/vortex-duckdb/src/column_statistics.rs index ccc71eeade1..0ef6717b638 100644 --- a/vortex-duckdb/src/column_statistics.rs +++ b/vortex-duckdb/src/column_statistics.rs @@ -1,6 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use vortex::array::aggregate_fn::AggregateFnRef; +use vortex::array::aggregate_fn::AggregateFnVTableExt; +use vortex::array::aggregate_fn::EmptyOptions; +use vortex::array::aggregate_fn::fns::max::Max; +use vortex::array::aggregate_fn::fns::min::Min; +use vortex::array::aggregate_fn::fns::null_count::NullCount; +use vortex::array::aggregate_fn::fns::sum::Sum; +use vortex::array::aggregate_fn::fns::uncompressed_size_in_bytes::UncompressedSizeInBytes; use vortex::array::stats::StatsSet; use vortex::dtype::DType; use vortex::error::VortexExpect as _; @@ -12,6 +20,21 @@ use vortex::scalar::ScalarValue; use crate::convert::ToDuckDBScalar as _; use crate::duckdb::Value; +const MIN_INDEX: usize = 0; +const MAX_INDEX: usize = 1; +const NULL_COUNT_INDEX: usize = 3; +const BYTE_SIZE_INDEX: usize = 4; + +pub fn column_statistics_aggregate_fns() -> Vec { + vec![ + Min.bind(EmptyOptions), + Max.bind(EmptyOptions), + Sum.bind(EmptyOptions), + NullCount.bind(EmptyOptions), + UncompressedSizeInBytes.bind(EmptyOptions), + ] +} + #[derive(Debug, Default)] pub struct ColumnStatistics { pub min: Option, @@ -93,4 +116,38 @@ impl ColumnStatisticsAggregate { has_null, } } + + pub fn from_aggregate_stats(stats: &[Precision]) -> Self { + let min = exact_scalar_value(stats.get(MIN_INDEX)); + let max = exact_scalar_value(stats.get(MAX_INDEX)); + let max_string_length = stats + .get(BYTE_SIZE_INDEX) + .and_then(exact_scalar_u64) + .map(|value| u32::try_from(value).unwrap_or(u32::MAX)); + let has_null = stats + .get(NULL_COUNT_INDEX) 
+ .and_then(exact_scalar_u64) + .is_none_or(|count| count > 0); + + Self { + min, + max, + max_string_length, + has_null, + } + } +} + +fn exact_scalar_value(stat: Option<&Precision>) -> Option { + match stat { + Some(Precision::Exact(value)) => value.clone().into_value(), + _ => None, + } +} + +fn exact_scalar_u64(stat: &Precision) -> Option { + match stat { + Precision::Exact(value) => value.as_primitive().typed_value::(), + _ => None, + } } diff --git a/vortex-duckdb/src/lib.rs b/vortex-duckdb/src/lib.rs index 8a1b6884401..035457c341d 100644 --- a/vortex-duckdb/src/lib.rs +++ b/vortex-duckdb/src/lib.rs @@ -5,6 +5,7 @@ use std::ffi::CStr; use std::ffi::c_char; +use std::sync::Arc; use std::sync::LazyLock; use std::sync::OnceLock; @@ -14,6 +15,9 @@ use vortex::error::VortexResult; use vortex::io::runtime::BlockingRuntime; use vortex::io::runtime::current::CurrentThreadRuntime; use vortex::io::session::RuntimeSessionExt; +use vortex::scan::ScanScheduler; +use vortex::scan::ScanSchedulerConfig; +use vortex::scan::ScanSchedulerSessionExt; use vortex::session::VortexSession; use crate::duckdb::Database; @@ -44,8 +48,12 @@ mod e2e_test; // A global runtime for Vortex operations within DuckDB. 
static RUNTIME: LazyLock = LazyLock::new(CurrentThreadRuntime::new); +static SCAN_SCHEDULER: LazyLock> = + LazyLock::new(|| Arc::new(ScanScheduler::new(ScanSchedulerConfig::duckdb_default()))); static SESSION: LazyLock = LazyLock::new(|| { - let session = VortexSession::default().with_handle(RUNTIME.handle()); + let session = VortexSession::default() + .with_handle(RUNTIME.handle()) + .with_scan_scheduler(Arc::clone(&SCAN_SCHEDULER)); vortex_geo::initialize(&session); session }); diff --git a/vortex-duckdb/src/multi_file.rs b/vortex-duckdb/src/multi_file.rs index 165bcad6677..a925e8163aa 100644 --- a/vortex-duckdb/src/multi_file.rs +++ b/vortex-duckdb/src/multi_file.rs @@ -14,6 +14,8 @@ use vortex::file::multi::MultiFileDataSource; use vortex::io::filesystem::FileSystemRef; use vortex::io::runtime::BlockingRuntime; use vortex::layout::scan::multi::MultiLayoutDataSource; +use vortex::layout::scan::v2::scan2_enabled; +use vortex::scan::DataSourceRef; use vortex_utils::aliases::hash_map::HashMap; use crate::RUNTIME; @@ -23,6 +25,11 @@ use crate::duckdb::ClientContextRef; use crate::duckdb::ExtractedValue; use crate::filesystem::resolve_filesystem; +pub struct BoundMultiFileScan { + pub data_source: DataSourceRef, + pub statistics_source: Option>, +} + /// Parse a glob string into a [`Url`]. /// /// Accepts full URLs (e.g. `s3://bucket/prefix/*.vortex`, `file:///data/*.vortex`) as well as @@ -59,7 +66,7 @@ fn normalize_path(path: std::path::PathBuf) -> std::path::PathBuf { pub fn bind_multi_file_scan( ctx: &ClientContextRef, input: &BindInputRef, -) -> VortexResult { +) -> VortexResult { let glob_url_parameter = input .get_parameter(0) .ok_or_else(|| vortex_err!("Missing file glob parameter"))?; @@ -112,7 +119,20 @@ pub fn bind_multi_file_scan( builder = builder.with_glob(glob_url.path(), Some(fs)); } - builder.build().await + if scan2_enabled()? 
{ + Ok(BoundMultiFileScan { + data_source: builder.build_data_source().await?, + statistics_source: None, + }) + } else { + let statistics_source = Arc::new(builder.build().await?); + let data_source: DataSourceRef = + Arc::::clone(&statistics_source); + Ok(BoundMultiFileScan { + data_source, + statistics_source: Some(statistics_source), + }) + } }) } diff --git a/vortex-duckdb/src/table_function.rs b/vortex-duckdb/src/table_function.rs index 11c5851af27..3a6e901c96f 100644 --- a/vortex-duckdb/src/table_function.rs +++ b/vortex-duckdb/src/table_function.rs @@ -26,6 +26,7 @@ use vortex::array::optimizer::ArrayOptimizer; use vortex::error::VortexExpect; use vortex::error::VortexResult; use vortex::expr::Expression; +use vortex::expr::col; use vortex::expr::stats::Precision; use vortex::file::v2::FileStatsLayoutReader; use vortex::io::kanal_ext::KanalExt as _; @@ -37,7 +38,7 @@ use vortex::metrics::tracing::get_global_labels; use vortex::scalar_fn::fns::binary::Binary; use vortex::scalar_fn::fns::operators::Operator; use vortex::scalar_fn::fns::pack::Pack; -use vortex::scan::DataSource; +use vortex::scan::DataSourceRef; use vortex::scan::ScanRequest; use vortex_utils::parallelism::get_available_parallelism; @@ -45,6 +46,7 @@ use crate::RUNTIME; use crate::SESSION; use crate::column_statistics::ColumnStatistics; use crate::column_statistics::ColumnStatisticsAggregate; +use crate::column_statistics::column_statistics_aggregate_fns; use crate::convert::try_from_bound_expression; use crate::duckdb::BindInputRef; use crate::duckdb::BindResultRef; @@ -63,7 +65,8 @@ use crate::projection::Projection; use crate::projection::extract_schema_from_dtype; pub struct TableFunctionBind { - data_source: Arc, + data_source: DataSourceRef, + statistics_source: Option>, filter_exprs: Vec, column_fields: Vec, // There exists at least one non-optional table filter or at least one @@ -76,6 +79,7 @@ impl Clone for TableFunctionBind { fn clone(&self) -> Self { Self { data_source: 
Arc::clone(&self.data_source), + statistics_source: self.statistics_source.clone(), // filter_exprs are consumed once in `init_global`. filter_exprs: vec![], column_fields: self.column_fields.clone(), @@ -148,12 +152,13 @@ pub fn bind( result: &mut BindResultRef, ) -> VortexResult { let data_source = bind_multi_file_scan(ctx, input)?; - let column_fields = extract_schema_from_dtype(data_source.dtype())?; + let column_fields = extract_schema_from_dtype(data_source.data_source.dtype())?; for fields in &column_fields { result.add_result_column(&fields.name, &fields.logical_type); } Ok(TableFunctionBind { - data_source: Arc::new(data_source), + data_source: data_source.data_source, + statistics_source: data_source.statistics_source, filter_exprs: vec![], column_fields, has_non_optional_filter: AtomicBool::new(false), @@ -216,6 +221,7 @@ pub fn init_global(init_input: &TableInitInput) -> VortexResult Option { - let children = bind_data.data_source.children(); - // Otherwise we'd have to open all files eagerly which is a performance - // regression. Duckdb's Parquet reader only gets metadata for multiple - // files with a UNION BY NAME and we don't support it (yet) - // See duckdb/common/multi_file/multi_file_function.hpp#L691 - if children.len() != 1 { - return None; - } - let MultiLayoutChild::Opened(reader) = &children[0] else { - return None; - }; - let stats_sets = match reader.as_any().downcast_ref::() { - Some(inner) => inner.file_stats().stats_sets(), - None => return None, - }; - let stats_aggregate = ColumnStatisticsAggregate::new(&stats_sets[column_index]); let dtype = bind_data.column_fields[column_index].dtype.clone(); + if let Some(statistics_source) = bind_data.statistics_source.as_ref() { + let children = statistics_source.children(); + // Otherwise we'd have to open all files eagerly which is a performance + // regression. 
Duckdb's Parquet reader only gets metadata for multiple + // files with a UNION BY NAME and we don't support it (yet) + // See duckdb/common/multi_file/multi_file_function.hpp#L691 + if children.len() != 1 { + return None; + } + let MultiLayoutChild::Opened(reader) = &children[0] else { + return None; + }; + let stats_sets = match reader.as_any().downcast_ref::() { + Some(inner) => inner.file_stats().stats_sets(), + None => return None, + }; + let stats_aggregate = ColumnStatisticsAggregate::new(&stats_sets[column_index]); + return Some(ColumnStatistics::from(&stats_aggregate, dtype)); + } + + let name = &bind_data.column_fields[column_index].name; + let funcs = column_statistics_aggregate_fns(); + let stats = RUNTIME + .block_on( + bind_data + .data_source + .statistics(&col(name.as_str()), &funcs), + ) + .ok()?; + let stats_aggregate = ColumnStatisticsAggregate::from_aggregate_stats(&stats); Some(ColumnStatistics::from(&stats_aggregate, dtype)) } diff --git a/vortex-ffi/src/scan.rs b/vortex-ffi/src/scan.rs index 921c4f50621..012c4a2e99c 100644 --- a/vortex-ffi/src/scan.rs +++ b/vortex-ffi/src/scan.rs @@ -188,6 +188,7 @@ fn scan_request(opts: *const vx_scan_options) -> VortexResult { limit, partition_selection: Selection::All, partition_range: None, + scheduler_provider: None, }) } diff --git a/vortex-file/Cargo.toml b/vortex-file/Cargo.toml index c4bf980d683..dac12f0e9a2 100644 --- a/vortex-file/Cargo.toml +++ b/vortex-file/Cargo.toml @@ -59,6 +59,7 @@ vortex-zigzag = { workspace = true } vortex-zstd = { workspace = true, optional = true } [dev-dependencies] +temp-env = { workspace = true } tokio = { workspace = true, features = ["full"] } vortex-array = { workspace = true, features = ["_test-harness"] } vortex-io = { workspace = true, features = ["tokio"] } diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index 23e2114b1c3..9267f49f8d9 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -15,6 +15,7 @@ use vortex_array::ArrayRef; 
use vortex_array::Columnar; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; +use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::arrays::ConstantArray; use vortex_array::dtype::DType; use vortex_array::dtype::Field; @@ -23,7 +24,10 @@ use vortex_array::dtype::FieldPath; use vortex_array::dtype::FieldPathSet; use vortex_array::expr::Expression; use vortex_array::expr::pruning::checked_pruning_expr; +use vortex_array::expr::stats::Precision; +use vortex_array::scalar::Scalar; use vortex_array::scalar_fn::internal::row_count::substitute_row_count; +use vortex_array::stream::SendableArrayStream; use vortex_error::VortexResult; use vortex_layout::LayoutReader; use vortex_layout::scan::layout::LayoutReaderDataSource; @@ -31,11 +35,13 @@ use vortex_layout::scan::scan_builder::ScanBuilder; use vortex_layout::scan::split_by::SplitBy; use vortex_layout::segments::SegmentSource; use vortex_scan::DataSourceRef; +use vortex_scan::ScanRequest; use vortex_session::VortexSession; use vortex_utils::aliases::hash_map::HashMap; use crate::FileStatistics; use crate::footer::Footer; +use crate::multi::scan_v2; use crate::pruning::extract_relevant_file_stats_as_struct_row; use crate::v2::FileStatsLayoutReader; @@ -188,6 +194,34 @@ impl VortexFile { )) } + /// Execute a ScanNode-backed V2 scan for this file. + pub fn scan_node_stream(&self, request: ScanRequest) -> VortexResult { + scan_v2::scan_node_file_stream(self.clone(), request) + } + + /// Return ScanNode-backed aggregate-function statistics for this file. + pub async fn scan_node_statistics( + &self, + expr: &Expression, + funcs: &[AggregateFnRef], + ) -> VortexResult>> { + scan_v2::scan_node_file_statistics(self.clone(), expr, funcs).await + } + + /// Return ScanNode-backed aggregate-function statistics for several expressions in this file. 
+ pub async fn scan_node_statistics_many( + &self, + exprs: &[Expression], + funcs: &[AggregateFnRef], + ) -> VortexResult>>> { + scan_v2::scan_node_file_statistics_many(self.clone(), exprs, funcs).await + } + + /// Return ScanNode natural row split ranges for this file. + pub fn scan_node_splits(&self) -> VortexResult>> { + scan_v2::scan_node_file_splits(self) + } + /// Returns `true` if file-level statistics prove the expression cannot /// match any rows in this file. /// diff --git a/vortex-file/src/multi/mod.rs b/vortex-file/src/multi/mod.rs index 3abb4ebea2a..b451baac9ed 100644 --- a/vortex-file/src/multi/mod.rs +++ b/vortex-file/src/multi/mod.rs @@ -3,6 +3,7 @@ //! Builder for constructing a [`MultiLayoutDataSource`] from multiple Vortex files. +pub(crate) mod scan_v2; mod session; use std::sync::Arc; @@ -13,18 +14,26 @@ use session::MultiFileSessionExt; use tracing::debug; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_io::InstrumentedReadAt; +use vortex_io::VortexReadAt; use vortex_io::filesystem::FileListing; use vortex_io::filesystem::FileSystemRef; use vortex_layout::LayoutReaderRef; use vortex_layout::scan::multi::LayoutReaderFactory; use vortex_layout::scan::multi::MultiLayoutDataSource; +use vortex_layout::scan::v2::scan2_enabled; +use vortex_metrics::Label; +use vortex_metrics::MetricsRegistry; use vortex_scan::DataSource; +use vortex_scan::DataSourceRef; use vortex_session::VortexSession; use crate::OpenOptionsSessionExt; use crate::VortexFile; use crate::VortexOpenOptions; +const PATH_LABEL: &str = "file_path"; + /// A builder that discovers multiple Vortex files from glob patterns and constructs a /// [`MultiLayoutDataSource`] to scan them as a single data source. /// @@ -61,6 +70,7 @@ pub struct MultiFileDataSource { /// When the filesystem is None, a local filesystem will be created in build(). 
glob_sources: Vec<(String, Option)>, open_options_fn: Arc VortexOpenOptions + Send + Sync>, + metrics_registry: Option>, } impl MultiFileDataSource { @@ -70,6 +80,7 @@ impl MultiFileDataSource { session, glob_sources: Vec::new(), open_options_fn: Arc::new(|opts| opts), + metrics_registry: None, } } @@ -105,6 +116,16 @@ impl MultiFileDataSource { self } + /// Configure a shared metrics registry for all files opened by this data source. + /// + /// This instruments both the underlying [`VortexReadAt`] and the Vortex segment source so + /// callers can inspect read sizes, read durations, segment request coalescing, and segment + /// cache behavior for scans that use this data source. + pub fn with_metrics_registry(mut self, metrics_registry: Arc) -> Self { + self.metrics_registry = Some(metrics_registry); + self + } + /// Build the [`DataSource`]. /// /// Discovers files via glob, opens the first file eagerly to determine the schema, @@ -152,7 +173,14 @@ impl MultiFileDataSource { // Open first file eagerly for dtype. let (first_file_listing, first_fs) = &all_files[0]; let open_fn = self.open_options_fn.as_ref(); - let first_file = open_file(first_fs, first_file_listing, &self.session, open_fn).await?; + let first_file = open_file( + first_fs, + first_file_listing, + &self.session, + self.metrics_registry.as_ref(), + open_fn, + ) + .await?; let first_reader = first_file.layout_reader()?; let factories: Vec> = all_files[1..] @@ -163,6 +191,7 @@ impl MultiFileDataSource { file: file.clone(), session: self.session.clone(), open_options_fn: Arc::clone(&self.open_options_fn), + metrics_registry: self.metrics_registry.clone(), }) as Arc }) .collect(); @@ -173,6 +202,18 @@ impl MultiFileDataSource { Ok(inner) } + + /// Build the [`DataSource`] selected by `VORTEX_SCAN_IMPL`. + /// + /// The default is the existing LayoutReader-backed scan. Setting + /// `VORTEX_SCAN_IMPL=v2` (or `scan2`/`scan3`/`native`) builds the ScanNode-backed V2 scan. 
+ pub async fn build_data_source(self) -> VortexResult { + if scan2_enabled()? { + Ok(Arc::new(scan_v2::build_scan_node_data_source(self).await?)) + } else { + Ok(Arc::new(self.build().await?)) + } + } } /// Creates a local filesystem backed by `object_store::local::LocalFileSystem`. @@ -202,6 +243,7 @@ async fn open_file( fs: &FileSystemRef, file: &FileListing, session: &VortexSession, + metrics_registry: Option<&Arc>, open_options_fn: &(dyn Fn(VortexOpenOptions) -> VortexOpenOptions + Send + Sync), ) -> VortexResult { tracing::trace!(path = %file.path, "opening vortex file"); @@ -210,6 +252,16 @@ async fn open_file( // The URI includes the full path (with any filesystem prefix), making it unique // even when different PrefixFileSystem instances strip paths to the same relative name. let source = fs.open_read(&file.path).await?; + let labels = vec![Label::new(PATH_LABEL, file.path.clone())]; + let source = if let Some(metrics_registry) = metrics_registry { + Arc::new(InstrumentedReadAt::new_with_labels( + source, + metrics_registry.as_ref(), + labels.clone(), + )) as Arc + } else { + source + }; let cache_key = source .uri() .map(|u| u.to_string()) @@ -219,6 +271,11 @@ async fn open_file( // so we scope the cache lookup in a block. 
let options = { let mut options = open_options_fn(session.open_options()); + if let Some(metrics_registry) = metrics_registry { + options = options + .with_metrics_registry(Arc::clone(metrics_registry)) + .with_labels(labels); + } if let Some(size) = file.size { options = options.with_file_size(size); } @@ -243,6 +300,7 @@ struct VortexFileReaderFactory { file: FileListing, session: VortexSession, open_options_fn: Arc VortexOpenOptions + Send + Sync>, + metrics_registry: Option>, } #[async_trait] @@ -252,6 +310,7 @@ impl LayoutReaderFactory for VortexFileReaderFactory { &self.fs, &self.file, &self.session, + self.metrics_registry.as_ref(), self.open_options_fn.as_ref(), ) .await?; diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs new file mode 100644 index 00000000000..c2bbd83e8c3 --- /dev/null +++ b/vortex-file/src/multi/scan_v2.rs @@ -0,0 +1,1458 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! ScanNode-backed multi-file data source. 
+ +use std::any::Any; +use std::collections::VecDeque; +use std::fmt; +use std::ops::Range; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; + +use async_trait::async_trait; +use futures::FutureExt; +use futures::StreamExt; +use futures::TryStreamExt; +use futures::future::BoxFuture; +use futures::stream; +use tracing::Instrument; +use vortex_array::ArrayRef; +use vortex_array::VortexSessionExecute; +use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldName; +use vortex_array::dtype::FieldPath; +use vortex_array::dtype::StructFields; +use vortex_array::expr::Expression; +use vortex_array::expr::forms::conjuncts; +use vortex_array::expr::stats::Precision; +use vortex_array::expr::stats::Stat; +use vortex_array::scalar::Scalar; +use vortex_array::scalar::ScalarValue; +use vortex_array::scalar_fn::fns::get_item::GetItem; +use vortex_array::scalar_fn::fns::root::Root; +use vortex_array::stats::StatsSet; +use vortex_array::stream::ArrayStreamAdapter; +use vortex_array::stream::ArrayStreamExt; +use vortex_array::stream::SendableArrayStream; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_io::filesystem::FileListing; +use vortex_io::filesystem::FileSystemRef; +use vortex_io::runtime::Handle; +use vortex_io::session::RuntimeSessionExt; +use vortex_layout::scan::v2::evidence::PredicateEvidence; +use vortex_layout::scan::v2::evidence::PredicateId; +use vortex_layout::scan::v2::evidence::PredicateVersion; +use vortex_layout::scan::v2::node::AggregatePlanRef; +use vortex_layout::scan::v2::node::EvidencePlanRef; +use vortex_layout::scan::v2::node::EvidenceStateKey; +use vortex_layout::scan::v2::node::ExpandCtx; +use vortex_layout::scan::v2::node::FileReader; +use vortex_layout::scan::v2::node::PlanCtx; +use vortex_layout::scan::v2::node::PushCtx; +use vortex_layout::scan::v2::node::ReadPlanRef; +use 
vortex_layout::scan::v2::node::RowScope; +use vortex_layout::scan::v2::node::ScanNode; +use vortex_layout::scan::v2::node::ScanNodeRef; +use vortex_layout::scan::v2::node::ScanStateCache; +use vortex_layout::scan::v2::node::ScanStateRef; +use vortex_layout::scan::v2::node::StateCtx; +use vortex_layout::scan::v2::node::StatsPlan; +use vortex_layout::scan::v2::node::StatsPlanRef; +use vortex_layout::scan::v2::request::EvidenceMode; +use vortex_layout::scan::v2::request::EvidenceRequest; +use vortex_layout::scan::v2::request::NodeRequest; +use vortex_layout::scan::v2::validate_temporal_comparisons; +use vortex_mask::Mask; +use vortex_metrics::MetricsRegistry; +use vortex_scan::DataSource; +use vortex_scan::DataSourceScan; +use vortex_scan::DataSourceScanRef; +use vortex_scan::Partition; +use vortex_scan::PartitionRef; +use vortex_scan::PartitionStream; +use vortex_scan::PlannedMorselScan; +use vortex_scan::PlannedMorselScanRef; +use vortex_scan::ScanMeta; +use vortex_scan::ScanRequest; +use vortex_scan::ScanScheduler; +use vortex_scan::ScanSchedulerSessionExt; +use vortex_scan::ScanTicket; +use vortex_scan::WorkRequest; +use vortex_scan::selection::Selection; +use vortex_session::VortexSession; +use vortex_utils::aliases::hash_map::HashMap; +use vortex_utils::parallelism::get_available_parallelism; + +use super::MultiFileDataSource; +use super::create_local_filesystem; +use super::open_file; +use crate::FileStatistics; +use crate::VortexFile; +use crate::VortexOpenOptions; + +const DEFAULT_CONCURRENCY: usize = 8; +const FALLBACK_SPLIT_SIZE: u64 = 100_000; + +struct FileStatsScanNode { + data: ScanNodeRef, + stats: Arc, + fields: StructFields, + row_count: u64, +} + +struct FileStatsExprScanNode { + data: ScanNodeRef, + stats: Arc, + field_idx: usize, + field_dtype: DType, + row_count: u64, +} + +struct FileStatsPlan { + stats: StatsSet, + field_dtype: DType, + row_count: u64, + funcs: Vec, +} + +impl FileStatsScanNode { + fn try_new( + data: ScanNodeRef, + stats: Arc, 
+ dtype: &DType, + row_count: u64, + ) -> Option { + let fields = dtype.as_struct_fields_opt()?.clone(); + Some(Self { + data, + stats, + fields, + row_count, + }) + } + + fn pushed_field(&self, expr: &Expression) -> Option<(usize, FieldName, DType)> { + let name = root_field(expr)?; + let field_idx = self.fields.find(name)?; + let field_dtype = self.fields.field_by_index(field_idx)?; + Some((field_idx, name.clone(), field_dtype)) + } +} + +impl ScanNode for FileStatsScanNode { + type State = ScanStateRef; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + cx.init_node(&self.data) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + cx: &mut PushCtx, + ) -> VortexResult> { + let Some(data) = Arc::clone(&self.data).try_push_expr(expr, cx)? else { + return Ok(None); + }; + let Some((field_idx, _name, field_dtype)) = self.pushed_field(expr) else { + return Ok(Some(data)); + }; + Ok(Some(Arc::new(FileStatsExprScanNode { + data, + stats: Arc::clone(&self.stats), + field_idx, + field_dtype, + row_count: self.row_count, + }))) + } + + fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + Arc::clone(&self.data).plan_read(cx) + } + + fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + Arc::clone(&self.data).plan_evidence(cx) + } + + fn plan_aggregate_partial( + self: Arc, + funcs: &[AggregateFnRef], + cx: &mut PlanCtx, + ) -> VortexResult> { + Arc::clone(&self.data).plan_aggregate_partial(funcs, cx) + } + + fn split_hints(&self) -> Option<&[u64]> { + self.data.split_hints() + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.data.release(frontier, state.as_ref()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "file_stats:")?; + self.data.fmt_chain(f) + } +} + +impl ScanNode for FileStatsExprScanNode { + type State = ScanStateRef; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + cx.init_node(&self.data) + } + + fn try_push_expr( + 
self: Arc, + expr: &Expression, + cx: &mut PushCtx, + ) -> VortexResult> { + Arc::clone(&self.data).try_push_expr(expr, cx) + } + + fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + Arc::clone(&self.data).plan_read(cx) + } + + fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + Arc::clone(&self.data).plan_evidence(cx) + } + + fn plan_aggregate_partial( + self: Arc, + funcs: &[AggregateFnRef], + cx: &mut PlanCtx, + ) -> VortexResult> { + Arc::clone(&self.data).plan_aggregate_partial(funcs, cx) + } + + fn plan_stats( + self: Arc, + funcs: &[AggregateFnRef], + _cx: &mut PlanCtx, + ) -> VortexResult> { + let stats = self.stats.stats_sets()[self.field_idx].clone(); + Ok(Some(Arc::new(FileStatsPlan { + stats, + field_dtype: self.field_dtype.clone(), + row_count: self.row_count, + funcs: funcs.to_vec(), + }))) + } + + fn split_hints(&self) -> Option<&[u64]> { + self.data.split_hints() + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.data.release(frontier, state.as_ref()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "file_stats_expr:")?; + self.data.fmt_chain(f) + } +} + +impl StatsPlan for FileStatsPlan { + type State = (); + + fn init_state(&self, _ctx: &VortexSession) -> VortexResult { + Ok(()) + } + + fn stats<'a>( + &'a self, + range: Range, + _io: &'a FileReader, + _state: &'a Self::State, + ) -> BoxFuture<'a, VortexResult>>> { + Box::pin(async move { + if range != (0..self.row_count) { + return Ok(absent_statistics(&self.funcs)); + } + self.funcs + .iter() + .map(|func| self.stat_for_func(func)) + .collect() + }) + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "file_stats") + } +} + +impl FileStatsPlan { + fn stat_for_func(&self, func: &AggregateFnRef) -> VortexResult> { + let Some(stat) = Stat::from_aggregate_fn(func) else { + return Ok(Precision::Absent); + }; + let Some(dtype) = func.return_dtype(&self.field_dtype) else 
{ + return Ok(Precision::Absent); + }; + self.stats + .get(stat) + .map(|value| Scalar::try_new(dtype, Some(value))) + .transpose() + } +} + +fn root_field(expr: &Expression) -> Option<&FieldName> { + let name = expr.as_opt::()?; + expr.child(0).is::().then_some(name) +} + +fn absent_statistics(funcs: &[AggregateFnRef]) -> Vec> { + funcs.iter().map(|_| Precision::Absent).collect() +} + +fn scalar_precision_to_value(precision: Precision) -> Precision { + match precision { + Precision::Exact(scalar) => scalar + .into_value() + .map(Precision::Exact) + .unwrap_or(Precision::Absent), + Precision::Inexact(scalar) => scalar + .into_value() + .map(Precision::Inexact) + .unwrap_or(Precision::Absent), + Precision::Absent => Precision::Absent, + } +} + +/// Build a scan2 [`DataSource`] from a multi-file builder. +pub(super) async fn build_scan_node_data_source( + builder: MultiFileDataSource, +) -> VortexResult { + if builder.glob_sources.is_empty() { + vortex_bail!("MultiFileDataSource requires at least one glob pattern"); + } + + let local_fs: Option = builder + .glob_sources + .iter() + .any(|(_, fs)| fs.is_none()) + .then(|| create_local_filesystem(&builder.session)) + .transpose()?; + + let mut all_files: Vec<(FileListing, FileSystemRef)> = Vec::new(); + for (glob, maybe_fs) in &builder.glob_sources { + let fs = maybe_fs + .as_ref() + .or(local_fs.as_ref()) + .map(Arc::clone) + .unwrap_or_else(|| unreachable!("local_fs is set when any glob lacks a filesystem")); + let files: Vec = fs.glob(glob)?.try_collect().await?; + for file in files { + all_files.push((file, Arc::clone(&fs))); + } + } + + if all_files.is_empty() { + let globs: Vec<_> = builder + .glob_sources + .iter() + .map(|(glob, _)| glob.as_str()) + .collect(); + vortex_bail!("No files matched the glob pattern(s): {:?}", globs); + } + + let (first_file_listing, first_fs) = &all_files[0]; + let first_file = open_file( + first_fs, + first_file_listing, + &builder.session, + builder.metrics_registry.as_ref(), + 
builder.open_options_fn.as_ref(), + ) + .await?; + + let factories: Vec> = all_files[1..] + .iter() + .map(|(file, fs)| { + Arc::new(ScanNodeFileFactory { + fs: Arc::clone(fs), + file: file.clone(), + session: builder.session.clone(), + open_options_fn: Arc::clone(&builder.open_options_fn), + metrics_registry: builder.metrics_registry.clone(), + }) as Arc + }) + .collect(); + + Ok(ScanNodeDataSource::new_with_first( + first_file, + factories, + &builder.session, + )) +} + +#[async_trait] +trait VortexFileFactory: 'static + Send + Sync { + async fn open(&self) -> VortexResult>; +} + +struct ScanNodeFileFactory { + fs: FileSystemRef, + file: FileListing, + session: VortexSession, + open_options_fn: Arc VortexOpenOptions + Send + Sync>, + metrics_registry: Option>, +} + +#[async_trait] +impl VortexFileFactory for ScanNodeFileFactory { + async fn open(&self) -> VortexResult> { + let file = open_file( + &self.fs, + &self.file, + &self.session, + self.metrics_registry.as_ref(), + self.open_options_fn.as_ref(), + ) + .await?; + Ok(Some(file)) + } +} + +enum ScanNodeChild { + Opened(VortexFile), + Deferred(Arc), +} + +/// Multi-file data source backed by scan2 ScanNode plans. 
+pub struct ScanNodeDataSource { + dtype: DType, + session: VortexSession, + children: Vec, + concurrency: usize, +} + +impl ScanNodeDataSource { + fn new_with_first( + first: VortexFile, + remaining: Vec>, + session: &VortexSession, + ) -> Self { + let dtype = first.dtype().clone(); + let concurrency = get_available_parallelism().unwrap_or(DEFAULT_CONCURRENCY); + + let mut children = Vec::with_capacity(1 + remaining.len()); + children.push(ScanNodeChild::Opened(first)); + children.extend(remaining.into_iter().map(ScanNodeChild::Deferred)); + + Self { + dtype, + session: session.clone(), + children, + concurrency, + } + } + + async fn open_files(&self, ordered: bool) -> VortexResult> { + let jobs = self + .children + .iter() + .enumerate() + .map(|(idx, child)| match child { + ScanNodeChild::Opened(file) => { + let file = file.clone(); + async move { Ok(Some((idx, file))) }.boxed() + } + ScanNodeChild::Deferred(factory) => { + let factory = Arc::clone(factory); + async move { + factory + .open() + .instrument(tracing::info_span!("VortexFileFactory::open")) + .await + .map(|file| file.map(|file| (idx, file))) + } + .boxed() + } + }) + .collect::>>>>(); + + let files = if ordered { + stream::iter(jobs) + .buffered(self.concurrency) + .try_filter_map(|file| async move { Ok(file) }) + .try_collect::>() + .await? + } else { + stream::iter(jobs) + .buffer_unordered(self.concurrency) + .try_filter_map(|file| async move { Ok(file) }) + .try_collect::>() + .await? 
+ }; + + let mut files = files; + files.sort_unstable_by_key(|(idx, _)| *idx); + Ok(files) + } +} + +#[async_trait] +impl DataSource for ScanNodeDataSource { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> Precision { + let mut sum: u64 = 0; + let mut opened_count: u64 = 0; + let mut deferred_count: u64 = 0; + + for child in &self.children { + match child { + ScanNodeChild::Opened(file) => { + opened_count += 1; + sum = sum.saturating_add(file.row_count()); + } + ScanNodeChild::Deferred(_) => { + deferred_count += 1; + } + } + } + + let total_count = opened_count + deferred_count; + if total_count == 0 { + return Precision::exact(0u64); + } + + if deferred_count == 0 { + Precision::exact(sum) + } else if opened_count > 0 { + let avg = sum / opened_count; + Precision::inexact(avg.saturating_mul(total_count)) + } else { + Precision::Absent + } + } + + fn deserialize_partition( + &self, + _data: &[u8], + _session: &VortexSession, + ) -> VortexResult { + vortex_bail!("ScanNodeDataSource partitions are not yet serializable") + } + + async fn plan_morsel_partitions( + &self, + scan_request: ScanRequest, + target_partitions: usize, + ) -> VortexResult> { + if scan_request.ordered || scan_request.limit.is_some() { + return Ok(None); + } + + let target_partitions = target_partitions.max(1); + let dtype = scan_request.projection.return_dtype(&self.dtype)?; + + let meta = ScanMeta { + label: Some("scan2".to_string()), + }; + let provider = scan_request + .scheduler_provider + .clone() + .unwrap_or_else(|| self.session.scan_scheduler_provider()); + let scheduler = provider.scheduler_for_scan(&meta); + let ticket = scheduler.register_scan(meta); + + let mut planned_files = Vec::new(); + let mut total_morsels = 0usize; + for (partition_idx, file) in self.open_files(false).await? { + let Some(request) = file_scan_request(partition_idx, &file, scan_request.clone())? 
+ else { + continue; + }; + let prepared = Arc::new(PreparedScanNodeFile::try_new(file, request)?); + let ranges = prepared.splits()?; + if ranges.is_empty() { + continue; + } + total_morsels = total_morsels.saturating_add(ranges.len()); + planned_files.push((prepared, ranges)); + } + + // The physical plan may expose more engine partitions than we can fill with morsels. + // Keep only non-empty planned partitions; engine adapters can return empty streams for + // any surplus advertised partitions. + let partition_count = total_morsels.min(target_partitions); + let mut partitions = vec![Vec::new(); partition_count]; + let mut morsel_idx = 0usize; + for (prepared, ranges) in planned_files { + for range in ranges { + let partition = morsel_idx % partition_count; + partitions[partition].push(PlannedScanNodeMorsel { + prepared: Arc::clone(&prepared), + range, + }); + morsel_idx = morsel_idx.saturating_add(1); + } + } + + let morsel_concurrency = get_available_parallelism().unwrap_or(1).saturating_mul(4); + + Ok(Some(Arc::new(PlannedScanNodeScan { + dtype, + partitions, + scheduler, + ticket, + morsel_concurrency, + }))) + } + + async fn scan(&self, scan_request: ScanRequest) -> VortexResult { + let meta = ScanMeta { + label: Some("scan2".to_string()), + }; + let provider = scan_request + .scheduler_provider + .clone() + .unwrap_or_else(|| self.session.scan_scheduler_provider()); + let scheduler = provider.scheduler_for_scan(&meta); + let ticket = scheduler.register_scan(meta); + + let mut ready = VecDeque::new(); + let mut deferred = VecDeque::new(); + + for child in &self.children { + match child { + ScanNodeChild::Opened(file) => ready.push_back(file.clone()), + ScanNodeChild::Deferred(factory) => deferred.push_back(Arc::clone(factory)), + } + } + + let dtype = scan_request.projection.return_dtype(&self.dtype)?; + + Ok(Box::new(ScanNodeDataSourceScan { + dtype, + request: scan_request, + ready, + deferred, + handle: self.session.handle(), + concurrency: 
self.concurrency, + scheduler, + ticket, + })) + } + + async fn statistics( + &self, + expr: &Expression, + funcs: &[AggregateFnRef], + ) -> VortexResult>> { + if self.children.len() != 1 { + return Ok(absent_statistics(funcs)); + } + let ScanNodeChild::Opened(file) = &self.children[0] else { + return Ok(absent_statistics(funcs)); + }; + scan_node_file_statistics(file.clone(), expr, funcs).await + } + + async fn field_statistics(&self, field_path: &FieldPath) -> VortexResult { + if field_path.parts().len() != 1 { + return Ok(StatsSet::default()); + } + let Some(field_name) = field_path.parts()[0].as_name() else { + return Ok(StatsSet::default()); + }; + let funcs = Stat::all() + .filter_map(|stat| stat.aggregate_fn().map(|func| (stat, func))) + .collect::>(); + let aggregate_funcs = funcs + .iter() + .map(|(_, func)| func.clone()) + .collect::>(); + let stats = self + .statistics( + &vortex_array::expr::get_item(field_name, vortex_array::expr::root()), + &aggregate_funcs, + ) + .await?; + let mut stats_set = StatsSet::default(); + for ((stat, _), value) in funcs.into_iter().zip(stats) { + stats_set.set(stat, scalar_precision_to_value(value)); + } + Ok(stats_set) + } + + fn supports_morsel_partitioning(&self) -> bool { + true + } +} + +struct ScanNodeDataSourceScan { + dtype: DType, + request: ScanRequest, + ready: VecDeque, + deferred: VecDeque>, + handle: Handle, + concurrency: usize, + scheduler: Arc, + ticket: ScanTicket, +} + +impl DataSourceScan for ScanNodeDataSourceScan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn partition_count(&self) -> Precision { + let count = self.ready.len() + self.deferred.len(); + if self.deferred.is_empty() { + Precision::exact(count) + } else { + Precision::inexact(count) + } + } + + fn partitions(self: Box) -> PartitionStream { + let Self { + dtype: _, + request, + ready, + deferred, + handle, + concurrency, + scheduler, + ticket, + } = *self; + + let ordered = request.ordered; + let ready_stream = 
stream::iter(ready).map(Ok); + let spawned = stream::iter(deferred).map(move |factory| { + handle.spawn(async move { + factory + .open() + .instrument(tracing::info_span!("VortexFileFactory::open")) + .await + }) + }); + + let deferred_stream = if ordered { + spawned + .buffered(concurrency) + .filter_map(|result| async move { + match result { + Ok(Some(file)) => Some(Ok(file)), + Ok(None) => None, + Err(error) => Some(Err(error)), + } + }) + .boxed() + } else { + spawned + .buffer_unordered(concurrency) + .filter_map(|result| async move { + match result { + Ok(Some(file)) => Some(Ok(file)), + Ok(None) => None, + Err(error) => Some(Err(error)), + } + }) + .boxed() + }; + + ready_stream + .chain(deferred_stream) + .enumerate() + .filter_map(move |(index, file_result)| { + let request = request.clone(); + let scheduler = Arc::clone(&scheduler); + let ticket = ticket.clone(); + async move { + match file_result { + Ok(file) => { + file_partition(index, file, request, scheduler, ticket).transpose() + } + Err(error) => Some(Err(error)), + } + } + }) + .boxed() + } +} + +fn file_partition( + partition_idx: usize, + file: VortexFile, + request: ScanRequest, + scheduler: Arc, + ticket: ScanTicket, +) -> VortexResult> { + let Some(request) = file_scan_request(partition_idx, &file, request)? else { + return Ok(None); + }; + + Ok(Some(Box::new(ScanNodePartition { + file, + request, + index: partition_idx, + scheduler, + ticket, + }))) +} + +pub(crate) fn scan_node_file_stream( + file: VortexFile, + request: ScanRequest, +) -> VortexResult { + let dtype = request.projection.return_dtype(file.dtype())?; + let meta = ScanMeta { + label: Some("scan2".to_string()), + }; + let provider = request + .scheduler_provider + .clone() + .unwrap_or_else(|| file.session().scan_scheduler_provider()); + let scheduler = provider.scheduler_for_scan(&meta); + let ticket = scheduler.register_scan(meta); + + let Some(partition) = file_partition(0, file, request, scheduler, ticket)? 
else { + return Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( + dtype, + stream::empty(), + ))); + }; + partition.execute() +} + +pub(crate) async fn scan_node_file_statistics( + file: VortexFile, + expr: &Expression, + funcs: &[AggregateFnRef], +) -> VortexResult>> { + let mut stats = scan_node_file_statistics_many(file, std::slice::from_ref(expr), funcs).await?; + Ok(stats.pop().unwrap_or_else(|| absent_statistics(funcs))) +} + +pub(crate) async fn scan_node_file_statistics_many( + file: VortexFile, + exprs: &[Expression], + funcs: &[AggregateFnRef], +) -> VortexResult>>> { + let session = file.session().clone(); + let root = expand_file_root(&file, &session)?; + let reader = FileReader::new(file.segment_source(), session); + let mut result = Vec::with_capacity(exprs.len()); + for expr in exprs { + let pushed = push_expr(&root, expr, file.dtype(), reader.session())?; + let Some(plan) = pushed.plan_stats(funcs, &mut PlanCtx::new(reader.session().clone()))? + else { + result.push(absent_statistics(funcs)); + continue; + }; + let state = plan.init_state(reader.session())?; + result.push( + plan.stats(0..file.row_count(), &reader, state.as_ref()) + .await?, + ); + } + Ok(result) +} + +pub(crate) fn scan_node_file_splits(file: &VortexFile) -> VortexResult>> { + let session = file.session().clone(); + let root = expand_file_root(file, &session)?; + let row_count = file.row_count(); + let mut points = vec![0, row_count]; + if let Some(hints) = root.split_hints() { + points.extend( + hints + .iter() + .copied() + .filter(|&hint| 0 < hint && hint < row_count), + ); + } + points.sort_unstable(); + points.dedup(); + Ok(points + .windows(2) + .filter_map(|window| { + let range = window[0]..window[1]; + (range.start < range.end).then_some(range) + }) + .collect()) +} + +fn expand_file_root(file: &VortexFile, session: &VortexSession) -> VortexResult { + let mut node_request = NodeRequest::empty(); + let root = ExpandCtx::new(session.clone()).expand(file.footer().layout(), 
&mut node_request)?; + Ok(match file.footer().statistics().cloned() { + Some(stats) => FileStatsScanNode::try_new( + Arc::clone(&root), + Arc::new(stats), + file.dtype(), + file.row_count(), + ) + .map(|node| Arc::new(node) as ScanNodeRef) + .unwrap_or(root), + None => root, + }) +} + +fn file_scan_request( + partition_idx: usize, + file: &VortexFile, + request: ScanRequest, +) -> VortexResult> { + let partition_idx_u64 = partition_idx as u64; + if let Some(range) = &request.partition_range + && !range.contains(&partition_idx_u64) + { + return Ok(None); + } + match &request.partition_selection { + Selection::IncludeByIndex(buffer) => { + if buffer.as_slice().binary_search(&partition_idx_u64).is_err() { + return Ok(None); + } + } + Selection::ExcludeByIndex(buffer) => { + if buffer.as_slice().binary_search(&partition_idx_u64).is_ok() { + return Ok(None); + } + } + _ => {} + }; + + let row_count = file.row_count(); + let row_range = request.row_range.clone().unwrap_or(0..row_count); + check_range(&row_range, row_count)?; + + if let Some(filter) = &request.filter + && file.can_prune(filter)? 
+ { + return Ok(None); + } + + Ok(Some(ScanRequest { + row_range: Some(row_range), + ..request + })) +} + +struct ScanNodePartition { + file: VortexFile, + request: ScanRequest, + index: usize, + scheduler: Arc, + ticket: ScanTicket, +} + +impl Partition for ScanNodePartition { + fn as_any(&self) -> &dyn Any { + self + } + + fn index(&self) -> usize { + self.index + } + + fn row_count(&self) -> Precision { + let Some(row_range) = self.request.row_range.as_ref() else { + return Precision::Absent; + }; + let row_count = row_range.end - row_range.start; + let row_count = self.request.selection.row_count(row_count); + let row_count = self + .request + .limit + .map_or(row_count, |limit| row_count.min(limit)); + + if self.request.filter.is_some() { + Precision::inexact(row_count) + } else { + Precision::exact(row_count) + } + } + + fn byte_size(&self) -> Precision { + Precision::Absent + } + + fn execute(self: Box) -> VortexResult { + let ScanNodePartition { + file, + request, + index: _, + scheduler, + ticket, + } = *self; + + let prepared = Arc::new(PreparedScanNodeFile::try_new(file, request)?); + let dtype = prepared.dtype.clone(); + let ranges = prepared.splits()?; + let session = prepared.session.clone(); + let ordered = prepared.ordered; + let concurrency = if ordered || prepared.limit_remaining.is_some() { + 1 + } else { + get_available_parallelism().unwrap_or(1) * 4 + }; + + let tasks = ranges + .into_iter() + .map(|range| { + let prepared = Arc::clone(&prepared); + let scheduler = Arc::clone(&scheduler); + let ticket = ticket.clone(); + async move { + let _permit = scheduler.acquire(&ticket, WorkRequest::morsel()).await?; + prepared.read_range(range).await + } + .boxed() + }) + .collect::>>>>(); + + let handle = session.handle(); + let stream = stream::iter(tasks).map(move |task| handle.spawn(task)); + let stream = if ordered { + stream.buffered(concurrency).boxed() + } else { + stream.buffer_unordered(concurrency).boxed() + }; + let stream = 
stream.filter_map(|result| async move { result.transpose() }); + + Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( + dtype, stream, + ))) + } +} + +struct PlannedScanNodeScan { + dtype: DType, + partitions: Vec>, + scheduler: Arc, + ticket: ScanTicket, + morsel_concurrency: usize, +} + +#[derive(Clone)] +struct PlannedScanNodeMorsel { + prepared: Arc, + range: Range, +} + +impl PlannedMorselScan for PlannedScanNodeScan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn partition_count(&self) -> usize { + self.partitions.len() + } + + fn partition(self: Arc, partition: usize) -> VortexResult { + if partition >= self.partitions.len() { + vortex_bail!( + "planned scan partition {partition} is outside 0..{}", + self.partitions.len() + ); + } + + Ok(Box::new(PlannedScanNodePartition { + planned: self, + index: partition, + })) + } +} + +struct PlannedScanNodePartition { + planned: Arc, + index: usize, +} + +impl Partition for PlannedScanNodePartition { + fn as_any(&self) -> &dyn Any { + self + } + + fn index(&self) -> usize { + self.index + } + + fn row_count(&self) -> Precision { + let mut row_count = 0u64; + let mut has_filter = false; + + for morsel in &self.planned.partitions[self.index] { + let range_len = morsel.range.end - morsel.range.start; + row_count = row_count.saturating_add(morsel.prepared.selection.row_count(range_len)); + has_filter |= !morsel.prepared.predicates.is_empty(); + } + + if has_filter { + Precision::inexact(row_count) + } else { + Precision::exact(row_count) + } + } + + fn byte_size(&self) -> Precision { + Precision::Absent + } + + fn execute(self: Box) -> VortexResult { + let PlannedScanNodePartition { planned, index } = *self; + let morsels = planned.partitions[index].clone(); + let dtype = planned.dtype.clone(); + let scheduler = Arc::clone(&planned.scheduler); + let ticket = planned.ticket.clone(); + let concurrency = planned.morsel_concurrency; + + let stream = stream::iter(morsels).map(move |morsel| { + let handle = 
morsel.prepared.session.handle(); + let scheduler = Arc::clone(&scheduler); + let ticket = ticket.clone(); + handle.spawn( + async move { + let _permit = scheduler.acquire(&ticket, WorkRequest::morsel()).await?; + morsel.prepared.read_range(morsel.range).await + } + .instrument(tracing::trace_span!("scan2_morsel")), + ) + }); + + let stream = stream + .buffer_unordered(concurrency) + .filter_map(|result| async move { result.transpose() }); + + Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( + dtype, stream, + ))) + } +} + +struct PreparedScanNodeFile { + session: VortexSession, + reader: FileReader, + dtype: DType, + row_range: Range, + selection: Selection, + ordered: bool, + limit_remaining: Option, + root: ScanNodeRef, + projection: ReadPlanRef, + projection_state: ScanStateRef, + predicates: Vec, +} + +struct PredicatePlan { + id: PredicateId, + expr: Expression, + read: ReadPlanRef, + read_state: ScanStateRef, + evidence: Vec<(EvidencePlanRef, ScanStateRef)>, +} + +impl PreparedScanNodeFile { + fn try_new(file: VortexFile, request: ScanRequest) -> VortexResult { + let session = file.session().clone(); + let dtype = request.projection.return_dtype(file.dtype())?; + let projection = request.projection.optimize_recursive(file.dtype())?; + let filter = request + .filter + .map(|filter| filter.optimize_recursive(file.dtype())) + .transpose()?; + + let root = expand_file_root(&file, &session)?; + let reader = FileReader::new(file.segment_source(), session.clone()); + + let mut node_cache = ScanStateCache::default(); + let mut state_ctx = StateCtx::new(&session, &mut node_cache); + + let projection_plan = plan_read(&root, &projection, file.dtype(), &session)?; + let projection_state = projection_plan.init_state(&mut state_ctx)?; + + let mut evidence_state_cache: HashMap = HashMap::default(); + let predicates = filter + .as_ref() + .map(conjuncts) + .unwrap_or_default() + .into_iter() + .enumerate() + .map(|(idx, expr)| { + let id = PredicateId::new( + 
u32::try_from(idx).map_err(|_| vortex_err!("too many predicates"))?, + ); + let pushed = push_expr(&root, &expr, file.dtype(), &session)?; + let read = Arc::clone(&pushed) + .plan_read(&mut PlanCtx::new(session.clone()))? + .ok_or_else(|| vortex_err!("scan2 could not plan predicate read {expr}"))?; + let read_state = read.init_state(&mut state_ctx)?; + let evidence = pushed + .plan_evidence(&mut PlanCtx::new(session.clone()))? + .into_iter() + .map(|plan| { + let state = if let Some(key) = plan.state_cache_key() { + if let Some(state) = evidence_state_cache.get(&key) { + Arc::clone(state) + } else { + let state = plan.init_state(&session)?; + evidence_state_cache.insert(key, Arc::clone(&state)); + state + } + } else { + plan.init_state(&session)? + }; + Ok((plan, state)) + }) + .collect::>>()?; + Ok(PredicatePlan { + id, + expr, + read, + read_state, + evidence, + }) + }) + .collect::>>()?; + + Ok(Self { + session, + reader, + dtype, + row_range: request + .row_range + .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?, + selection: request.selection, + ordered: request.ordered, + limit_remaining: request.limit.map(AtomicU64::new), + root, + projection: projection_plan, + projection_state, + predicates, + }) + } + + fn splits(&self) -> VortexResult>> { + let mut points = vec![self.row_range.start]; + if let Some(hints) = self.root.split_hints() { + points.extend( + hints + .iter() + .copied() + .filter(|&hint| self.row_range.start < hint && hint < self.row_range.end), + ); + } + if points.len() == 1 { + let mut next = self + .row_range + .start + .saturating_add(FALLBACK_SPLIT_SIZE) + .min(self.row_range.end); + while next < self.row_range.end { + points.push(next); + next = next + .saturating_add(FALLBACK_SPLIT_SIZE) + .min(self.row_range.end); + } + } + points.push(self.row_range.end); + points.sort_unstable(); + points.dedup(); + Ok(points + .windows(2) + .filter_map(|window| { + let range = window[0]..window[1]; + (range.start < 
range.end).then_some(range) + }) + .collect()) + } + + async fn read_range(&self, range: Range) -> VortexResult> { + let len = range_len(&range)?; + let selected = self.selection.row_mask(&range).mask().clone(); + if selected.all_false() { + return Ok(None); + } + + let mut ctx = self.session.create_execution_ctx(); + let Some(selected) = self + .morsel_selection(range.clone(), selected, &mut ctx) + .await? + else { + return Ok(None); + }; + + if selected.all_false() { + return Ok(None); + } + + let selected = if let Some(limit_remaining) = &self.limit_remaining { + limit_mask(selected, limit_remaining)? + } else { + selected + }; + if selected.all_false() { + return Ok(None); + } + if selected.len() != len { + vortex_bail!( + "scan2 projection selection length {} does not match range length {len}", + selected.len() + ); + } + + let array = self + .projection + .read_scoped( + range, + RowScope::selected(&selected), + &self.reader, + self.projection_state.as_ref(), + &mut ctx, + ) + .await?; + Ok(Some(array)) + } + + async fn morsel_selection( + &self, + range: Range, + mut selected: Mask, + ctx: &mut vortex_array::ExecutionCtx, + ) -> VortexResult> { + let len = range_len(&range)?; + let full_domain = Mask::new_true(len); + let mut evidence = Vec::with_capacity(self.predicates.len()); + + for predicate in &self.predicates { + let acc = self.gather_evidence(predicate, &range).await?; + selected = &selected & acc.maybe(); + if selected.all_false() { + return Ok(None); + } + evidence.push((predicate, acc)); + } + + for (predicate, acc) in evidence { + let need = &selected & &acc.unproven(); + if need.all_false() { + continue; + } + let rows = RowScope::try_new(&full_domain, &need)?; + let result = predicate + .read + .read_scoped( + range.clone(), + rows, + &self.reader, + predicate.read_state.as_ref(), + ctx, + ) + .await? 
+ .execute::(ctx)?; + if result.len() != len { + vortex_bail!( + "residual result length {} does not match morsel length {len}", + result.len() + ); + } + let pass = &result & &need; + selected = &selected.bitand_not(&need) | &pass; + if selected.all_false() { + return Ok(None); + } + } + Ok(Some(selected)) + } + + async fn gather_evidence( + &self, + predicate: &PredicatePlan, + range: &Range, + ) -> VortexResult { + let mut acc = + PredicateEvidence::new(predicate.id, PredicateVersion::STATIC, range.clone())?; + for (plan, state) in &predicate.evidence { + let req = EvidenceRequest { + id: predicate.id, + version: PredicateVersion::STATIC, + predicate: &predicate.expr, + range: range.clone(), + mode: EvidenceMode::Normal, + }; + for fragment in plan.evidence(&req, &self.reader, state.as_ref()).await? { + acc.absorb(fragment)?; + } + if acc.all_false() { + break; + } + } + Ok(acc) + } +} + +fn push_expr( + root: &ScanNodeRef, + expr: &Expression, + dtype: &DType, + session: &VortexSession, +) -> VortexResult { + validate_temporal_comparisons(expr, dtype)?; + Arc::clone(root) + .try_push_expr(expr, &mut PushCtx::new(session.clone()))? + .ok_or_else(|| vortex_err!("scan2 could not push expression {expr}")) +} + +fn plan_read( + root: &ScanNodeRef, + expr: &Expression, + dtype: &DType, + session: &VortexSession, +) -> VortexResult { + push_expr(root, expr, dtype, session)? + .plan_read(&mut PlanCtx::new(session.clone()))? 
+ .ok_or_else(|| vortex_err!("scan2 could not plan read for expression {expr}")) +} + +fn check_range(range: &Range, row_count: u64) -> VortexResult<()> { + if range.start > range.end || range.end > row_count { + vortex_bail!( + "scan2 row range {:?} is out of bounds for row count {}", + range, + row_count + ); + } + range_len(range).map(|_| ()) +} + +fn range_len(range: &Range) -> VortexResult { + let len = range + .end + .checked_sub(range.start) + .ok_or_else(|| vortex_err!("scan2 row range end is before start: {range:?}"))?; + usize::try_from(len).map_err(|_| vortex_err!("scan2 row range exceeds usize")) +} + +fn limit_mask(mask: Mask, remaining: &AtomicU64) -> VortexResult { + let available = remaining.load(Ordering::Relaxed); + if available == 0 { + return Ok(Mask::new_false(mask.len())); + } + let true_count = mask.true_count(); + if true_count as u64 <= available { + remaining.fetch_sub(true_count as u64, Ordering::Relaxed); + return Ok(mask); + } + let take = usize::try_from(available).unwrap_or(usize::MAX); + remaining.store(0, Ordering::Relaxed); + Ok(Mask::from_indices( + mask.len(), + (0..mask.len()).filter(|idx| mask.value(*idx)).take(take), + )) +} diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index e320cf2e9d9..e66503a91ab 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -46,6 +46,7 @@ use vortex_array::expr::lt_eq; use vortex_array::expr::or; use vortex_array::expr::root; use vortex_array::expr::select; +use vortex_array::expr::stats::Precision; use vortex_array::extension::datetime::TimeUnit; use vortex_array::extension::datetime::Timestamp; use vortex_array::extension::datetime::TimestampOptions; @@ -60,6 +61,7 @@ use vortex_array::stream::ArrayStreamAdapter; use vortex_array::stream::ArrayStreamExt; use vortex_array::validity::Validity; use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; use vortex_buffer::buffer; use vortex_error::VortexResult; @@ -75,8 +77,11 @@ 
use crate::VERSION;
 use crate::VortexFile;
 use crate::WriteOptionsSessionExt;
 use crate::footer::SegmentSpec;
+use crate::multi::MultiFileDataSource;
 
-static SESSION: LazyLock = LazyLock::new(|| {
+static SESSION: LazyLock = LazyLock::new(new_test_session);
+
+/// Builds a fresh test session; the shared `SESSION` static is initialised
+/// from this same function.
+fn new_test_session() -> VortexSession {
     let session = VortexSession::empty()
         .with::()
         .with::()
@@ -86,7 +91,213 @@ static SESSION: LazyLock = LazyLock::new(|| {
     crate::register_default_encodings(&session);
 
     session
-});
+}
+
+/// Reads a statistic as a typed primitive value; yields `None` when the
+/// statistic is not exact.
+fn exact_u32_stat(stat: &Precision) -> Option {
+    stat.as_ref()
+        .as_exact()?
+        .as_primitive()
+        .typed_value::()
+}
+
+/// Same as `exact_u32_stat`, used below for the null-count statistic.
+fn exact_u64_stat(stat: &Precision) -> Option {
+    stat.as_ref()
+        .as_exact()?
+        .as_primitive()
+        .typed_value::()
+}
+
+/// End-to-end check of `MultiFileDataSource` over an in-memory file system:
+/// file statistics, a filtered and projected ordered scan, and the same
+/// request planned as morsel partitions.
+#[test]
+fn multi_file_scan_node_data_source_filters_and_projects() -> VortexResult<()> {
+    use vortex_io::runtime::BlockingRuntime;
+    use vortex_io::runtime::single::SingleThreadRuntime;
+    use vortex_io::session::RuntimeSessionExt;
+
+    let runtime = SingleThreadRuntime::default();
+    let session = new_test_session().with_handle(runtime.handle());
+
+    // The whole test body runs with VORTEX_SCAN_IMPL=v2 set in the environment.
+    temp_env::with_var("VORTEX_SCAN_IMPL", Some("v2"), || {
+        runtime.block_on(async {
+            use async_trait::async_trait;
+            use futures::stream;
+            use futures::stream::BoxStream;
+            use vortex_array::aggregate_fn::AggregateFnVTableExt;
+            use vortex_array::aggregate_fn::EmptyOptions;
+            use vortex_array::aggregate_fn::fns::max::Max;
+            use vortex_array::aggregate_fn::fns::min::Min;
+            use vortex_array::aggregate_fn::fns::null_count::NullCount;
+            use vortex_io::VortexReadAt;
+            use vortex_io::filesystem::FileListing;
+            use vortex_io::filesystem::FileSystem;
+            use vortex_io::filesystem::FileSystemRef;
+
+            /// A `FileSystem` backed by an in-memory map from path to file bytes.
+            #[derive(Debug)]
+            struct MemoryFileSystem {
+                files: std::collections::BTreeMap,
+            }
+
+            #[async_trait]
+            impl FileSystem for MemoryFileSystem {
+                // Lists every stored file whose path starts with `prefix`.
+                fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> {
+                    let listings = self
+                        .files
+                        .iter()
+                        .filter_map(move |(path, bytes)| {
+                            path.starts_with(prefix).then_some(Ok(FileListing {
+                                path: path.clone(),
+                                size: Some(bytes.len() as u64),
+                            }))
+                        })
+                        .collect::>();
+                    stream::iter(listings).boxed()
+                }
+
+                async fn head(&self, path: &str) -> VortexResult> {
+                    Ok(self.files.get(path).map(|bytes| FileListing {
+                        path: path.to_string(),
+                        size: Some(bytes.len() as u64),
+                    }))
+                }
+
+                async fn open_read(&self, path: &str) -> VortexResult> {
+                    self.files
+                        .get(path)
+                        .cloned()
+                        .map(|bytes| Arc::new(bytes) as Arc)
+                        .ok_or_else(|| vortex_error::vortex_err!("missing test file {path}"))
+                }
+
+                // Deleting is a no-op for this fixture.
+                async fn delete(&self, _path: &str) -> VortexResult<()> {
+                    Ok(())
+                }
+            }
+
+            // Serialises `values` into an in-memory file with default write options.
+            async fn write_part(
+                session: &VortexSession,
+                values: ArrayRef,
+            ) -> VortexResult {
+                let mut buf = ByteBufferMut::empty();
+                session
+                    .write_options()
+                    .write(&mut buf, values.to_array_stream())
+                    .await?;
+                Ok(buf.freeze())
+            }
+
+            // Like `write_part`, but requests `PRUNING_STATS` as file statistics.
+            async fn write_part_with_stats(
+                session: &VortexSession,
+                values: ArrayRef,
+            ) -> VortexResult {
+                let mut buf = ByteBufferMut::empty();
+                let mut writer = session
+                    .write_options()
+                    .with_file_statistics(PRUNING_STATS.to_vec())
+                    .writer(&mut buf, values.dtype().clone());
+                writer.push(values).await?;
+                writer.finish().await?;
+                Ok(buf.freeze())
+            }
+
+            // Part 1: a single file written with statistics answers min/max/null-count.
+            let single =
+                StructArray::from_fields(&[("numbers", buffer![10u32, 20, 30].into_array())])?
+                    .into_array();
+            let single_fs: FileSystemRef = Arc::new(MemoryFileSystem {
+                files: std::collections::BTreeMap::from_iter([(
+                    "single.vortex".to_string(),
+                    write_part_with_stats(&session, single).await?,
+                )]),
+            });
+            let single_source = MultiFileDataSource::new(session.clone())
+                .with_glob("single.vortex", Some(single_fs))
+                .build_data_source()
+                .await?;
+            let stats = single_source
+                .statistics(
+                    &col("numbers"),
+                    &[
+                        Min.bind(EmptyOptions),
+                        Max.bind(EmptyOptions),
+                        NullCount.bind(EmptyOptions),
+                    ],
+                )
+                .await?;
+            // Results come back in the order the aggregates were requested.
+            assert_eq!(exact_u32_stat(&stats[0]), Some(10));
+            assert_eq!(exact_u32_stat(&stats[1]), Some(30));
+            assert_eq!(exact_u64_stat(&stats[2]), Some(0));
+
+            // Part 2: two files matched by a glob, scanned with `numbers > 2`.
+            let first = StructArray::from_fields(&[("numbers", buffer![1u32, 2, 3].into_array())])?
+                .into_array();
+            let second =
+                StructArray::from_fields(&[("numbers", buffer![4u32, 5, 6].into_array())])?
+                    .into_array();
+
+            let fs: FileSystemRef = Arc::new(MemoryFileSystem {
+                files: std::collections::BTreeMap::from_iter([
+                    (
+                        "part-0.vortex".to_string(),
+                        write_part(&session, first).await?,
+                    ),
+                    (
+                        "part-1.vortex".to_string(),
+                        write_part(&session, second).await?,
+                    ),
+                ]),
+            });
+
+            let data_source = MultiFileDataSource::new(session.clone())
+                .with_glob("part-*.vortex", Some(fs))
+                .build_data_source()
+                .await?;
+            let scan = data_source
+                .scan(vortex_scan::ScanRequest {
+                    projection: col("numbers"),
+                    filter: Some(gt(col("numbers"), lit(2u32))),
+                    ordered: true,
+                    ..Default::default()
+                })
+                .await?;
+
+            let dtype = scan.dtype().clone();
+            let stream = scan
+                .partitions()
+                .then(|partition| async move { partition?.execute() })
+                .try_flatten()
+                .boxed();
+            let actual = ArrayStreamAdapter::new(dtype, stream).read_all().await?;
+
+            // Rows 1 and 2 fail the filter; the rest arrive in file order.
+            assert_arrays_eq!(actual, buffer![3u32, 4, 5, 6].into_array());
+
+            // Part 3: the same request planned as morsel partitions must give
+            // the same rows.
+            let planned = data_source
+                .plan_morsel_partitions(
+                    vortex_scan::ScanRequest {
+                        projection: col("numbers"),
+                        filter: Some(gt(col("numbers"), lit(2u32))),
+                        ..Default::default()
+                    },
+                    128,
+                )
+                .await?
+                .ok_or_else(|| {
+                    vortex_error::vortex_err!("scan node data source must plan morsel partitions")
+                })?;
+
+            assert_eq!(planned.partition_count(), 2);
+
+            let dtype = planned.dtype().clone();
+            let stream = stream::iter(0..planned.partition_count())
+                .then(|partition| {
+                    let planned = Arc::clone(&planned);
+                    async move { planned.partition(partition)?.execute() }
+                })
+                .try_flatten()
+                .boxed();
+            let actual = ArrayStreamAdapter::new(dtype, stream).read_all().await?;
+
+            assert_arrays_eq!(actual, buffer![3u32, 4, 5, 6].into_array());
+            Ok(())
+        })
+    })
+}
+
 #[tokio::test]
 async fn test_eof_values() {
diff --git a/vortex-jni/src/scan.rs b/vortex-jni/src/scan.rs
index 606ec8cd040..360d92aa989 100644
--- a/vortex-jni/src/scan.rs
+++ b/vortex-jni/src/scan.rs
@@ -119,6 +119,7 @@ fn build_scan_request(
         limit,
         partition_selection: Selection::All,
         partition_range: None,
+        scheduler_provider: None,
     })
 }
 
diff --git a/vortex-layout/src/scan/mod.rs b/vortex-layout/src/scan/mod.rs
index 98fd1918a42..ab003641eb5 100644
--- a/vortex-layout/src/scan/mod.rs
+++ b/vortex-layout/src/scan/mod.rs
@@ -12,6 +12,7 @@ mod splits;
 mod tasks;
 #[cfg(test)]
 mod test;
+pub mod v2;
 
 /// A heuristic for an ideal split size.
 ///
diff --git a/vortex-layout/src/scan/v2/evidence.rs b/vortex-layout/src/scan/v2/evidence.rs
new file mode 100644
index 00000000000..89a9a477f3f
--- /dev/null
+++ b/vortex-layout/src/scan/v2/evidence.rs
@@ -0,0 +1,523 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Predicate evidence: coverage-bearing answers for planned predicates.
+//!
+//! A scan2 predicate is answered at runtime by *evidence fragments*:
+//! row ranges paired with what a producer proves about the
+//! predicate over them (plan 017 SP1). The whole-morsel verdicts of the
+//! v1 scan (`RangeClassification`) become the degenerate case of one
+//!
fragment covering the morsel; finer coverage is first-class, so a zone
+//! map can prove interior zones while leaving edge rows unknown, and an
+//! index can return sparse row masks without forcing the whole morsel
+//! down the same path.
+//!
+//! Exactness is explicit in the returned evidence kind.
+//! [`PredicateEvidenceKind::ExactMask`] proves both selected and rejected
+//! rows — the source may suppress residual evaluation for the covered
+//! range. [`PredicateEvidenceKind::CandidateMask`] proves only that
+//! masked-out rows are rejected; masked-in rows must still run the
+//! residual predicate. Approximate producers must return candidate
+//! evidence directly.
+
+use std::ops::Range;
+
+use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
+use vortex_error::vortex_err;
+use vortex_mask::Mask;
+
+/// Identifies one predicate of a scan. Stable for the lifetime
+/// of the expanded scan: producers and the source combine evidence by
+/// predicate id (never by expression text), so rewritten or derived
+/// predicate forms stay tied to their original predicate.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct PredicateId(u32);
+
+impl PredicateId {
+    /// The id of the `idx`-th predicate.
+    pub fn new(idx: u32) -> Self {
+        Self(idx)
+    }
+
+    /// This id as an index into the scan's predicate list.
+    pub fn as_usize(self) -> usize {
+        self.0 as usize
+    }
+}
+
+/// Formats the id as `p` followed by its index, e.g. `p3`.
+impl std::fmt::Display for PredicateId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "p{}", self.0)
+    }
+}
+
+/// Distinguishes successive values of a dynamic predicate (a runtime
+/// boundary that tightens between morsels). Static predicates stay at
+/// version zero. Evidence only combines within one (id, version) pair.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
+pub struct PredicateVersion(u64);
+
+impl PredicateVersion {
+    /// The version for static predicates.
+    pub const STATIC: Self = Self(0);
+
+    /// A dynamic predicate's version, from its boundary slot.
+    pub fn new(version: u64) -> Self {
+        Self(version)
+    }
+}
+
+/// Exactness metadata for producers that need to degrade an evidence kind
+/// before returning it.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Exactness {
+    /// Evidence may be exact: `AllTrue`, `AllFalse`, and `ExactMask` are
+    /// accepted as returned.
+    Exact,
+    /// Evidence is at most a candidate: masked-out rows are rejected,
+    /// masked-in rows must still run the residual predicate. `ExactMask`
+    /// degrades to `CandidateMask` and `AllTrue` to `Unknown`; `AllFalse`
+    /// stays, since rejecting rows is within a candidate's authority.
+    Candidate,
+}
+
+impl Exactness {
+    /// The stronger of two exactness values: `Candidate` only when both
+    /// inputs are `Candidate`, `Exact` otherwise.
+    #[allow(dead_code)]
+    pub(crate) fn max(self, other: Self) -> Self {
+        match (self, other) {
+            (Self::Candidate, Self::Candidate) => Self::Candidate,
+            _ => Self::Exact,
+        }
+    }
+}
+
+/// What a fragment proves about a predicate over its row range.
+#[derive(Clone, Debug)]
+pub enum PredicateEvidenceKind {
+    /// The predicate is false for every row in the range.
+    AllFalse,
+    /// The predicate is true for every row in the range.
+    AllTrue,
+    /// Exact per-row verdicts: set rows are true, unset rows are false.
+    /// Residual evaluation is unnecessary for the covered range.
+    ExactMask(Mask),
+    /// Unset rows are proven false; set rows are only candidates and must
+    /// still run the residual predicate. Approximate indexes must use
+    /// this kind.
+    CandidateMask(Mask),
+    /// The producer proves nothing about the range.
+    Unknown,
+}
+
+impl PredicateEvidenceKind {
+    /// This kind degraded to a producer's advertised exactness.
+    #[allow(dead_code)]
+    pub(crate) fn cap(self, ceiling: Exactness) -> Self {
+        match ceiling {
+            Exactness::Exact => self,
+            // A candidate ceiling may still reject rows, so only the kinds
+            // that assert rows are *true* are weakened.
+            Exactness::Candidate => match self {
+                Self::AllTrue => Self::Unknown,
+                Self::ExactMask(mask) => Self::CandidateMask(mask),
+                other => other,
+            },
+        }
+    }
+}
+
+/// One producer's answer for one predicate over one row range, in the
+/// producer's row coordinates (for column producers these coincide with
+/// file row coordinates).
+#[derive(Clone, Debug)]
+pub struct EvidenceFragment {
+    /// The rows this fragment covers.
+    pub rows: Range<u64>,
+    /// What is proven over them.
+    pub kind: PredicateEvidenceKind,
+}
+
+impl EvidenceFragment {
+    /// A fragment proving `kind` over `rows`.
+    pub fn new(rows: Range<u64>, kind: PredicateEvidenceKind) -> Self {
+        Self { rows, kind }
+    }
+}
+
+/// Accumulated evidence for one predicate over one morsel: fragments fold
+/// into two morsel-local masks.
+///
+/// - `maybe`: rows that may still satisfy the predicate. Starts all-true;
+///   every proof of falseness clears bits.
+/// - `proven`: rows whose verdict is exactly known (true *or* false), so
+///   residual evaluation cannot change it. Starts all-false.
+///
+/// The invariant `!maybe ⊆ proven` holds throughout: a row is only
+/// removed from `maybe` by evidence that proves it false.
+pub struct PredicateEvidence {
+    id: PredicateId,
+    version: PredicateVersion,
+    /// The morsel's row range in file coordinates.
+    range: Range<u64>,
+    maybe: Mask,
+    proven: Mask,
+}
+
+impl PredicateEvidence {
+    /// Fresh evidence (nothing proven) for one predicate over the morsel
+    /// `range`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the length of `range` does not fit in `usize`.
+    pub fn new(
+        id: PredicateId,
+        version: PredicateVersion,
+        range: Range<u64>,
+    ) -> VortexResult<Self> {
+        let len = range_len(&range)?;
+        Ok(Self {
+            id,
+            version,
+            range,
+            maybe: Mask::new_true(len),
+            proven: Mask::new_false(len),
+        })
+    }
+
+    /// The predicate this evidence answers.
+    pub fn id(&self) -> PredicateId {
+        self.id
+    }
+
+    /// Rows that may still satisfy the predicate (morsel-local).
+    pub fn maybe(&self) -> &Mask {
+        &self.maybe
+    }
+
+    /// Rows whose verdict residual evaluation may not change
+    /// (morsel-local).
+    pub fn proven(&self) -> &Mask {
+        &self.proven
+    }
+
+    /// Rows that may satisfy the predicate but are not exactly proven:
+    /// the rows the residual predicate must evaluate.
+    pub fn unproven(&self) -> Mask {
+        // The common whole-morsel verdicts skip the bit traversal: with
+        // nothing proven everything in `maybe` is residual, and with
+        // everything proven nothing is.
+        if self.proven.all_false() {
+            return self.maybe.clone();
+        }
+        if self.proven.all_true() {
+            return Mask::new_false(self.maybe.len());
+        }
+        self.maybe.clone().bitand_not(&self.proven)
+    }
+
+    /// Whether no row of the morsel can satisfy the predicate.
+    pub fn all_false(&self) -> bool {
+        self.maybe.all_false()
+    }
+
+    /// Fold one fragment in. Fragments outside the morsel range are
+    /// clipped (wholly disjoint fragments are ignored); fragment masks
+    /// must match their declared row range.
+    pub fn absorb(&mut self, fragment: EvidenceFragment) -> VortexResult<()> {
+        // Intersection of the fragment's rows with the morsel, in file coordinates.
+        let span = fragment.rows.start.max(self.range.start)..fragment.rows.end.min(self.range.end);
+        if span.start >= span.end {
+            // Empty intersection: the fragment says nothing about this morsel.
+            return Ok(());
+        }
+        // The covered span in morsel-local coordinates.
+        let local = usize::try_from(span.start - self.range.start)
+            .map_err(|_| vortex_err!("morsel exceeds usize"))?
+            ..usize::try_from(span.end - self.range.start)
+                .map_err(|_| vortex_err!("morsel exceeds usize"))?;
+        let len = self.maybe.len();
+        // Fragments covering the whole morsel — the dominant case when a
+        // zone run spans it — combine without building placement masks.
+        let whole = local.start == 0 && local.end == len;
+        match fragment.kind {
+            PredicateEvidenceKind::Unknown => {}
+            PredicateEvidenceKind::AllFalse if whole => {
+                self.maybe = Mask::new_false(len);
+                self.proven = Mask::new_true(len);
+            }
+            PredicateEvidenceKind::AllFalse => {
+                self.maybe = &self.maybe & &constrain(len, &local, None)?;
+                self.proven = &self.proven | &prove(len, &local, None)?;
+            }
+            // AllTrue only marks rows as proven; it never adds rows back to `maybe`.
+            PredicateEvidenceKind::AllTrue if whole => {
+                self.proven = Mask::new_true(len);
+            }
+            PredicateEvidenceKind::AllTrue => {
+                self.proven = &self.proven | &prove(len, &local, None)?;
+            }
+            PredicateEvidenceKind::ExactMask(mask) if whole => {
+                // `clip_mask` also validates the mask length against the declared rows.
+                let mask = clip_mask(mask, &fragment.rows, &span)?;
+                self.maybe = &self.maybe & &mask;
+                self.proven = Mask::new_true(len);
+            }
+            PredicateEvidenceKind::ExactMask(mask) => {
+                let mask = clip_mask(mask, &fragment.rows, &span)?;
+                self.maybe = &self.maybe & &constrain(len, &local, Some(&mask))?;
+                self.proven = &self.proven | &prove(len, &local, None)?;
+            }
+            PredicateEvidenceKind::CandidateMask(mask) if whole => {
+                let mask = clip_mask(mask, &fragment.rows, &span)?;
+                self.maybe = &self.maybe & &mask;
+                self.proven = &self.proven | &!mask;
+            }
+            PredicateEvidenceKind::CandidateMask(mask) => {
+                let mask = clip_mask(mask, &fragment.rows, &span)?;
+                self.maybe = &self.maybe & &constrain(len, &local, Some(&mask))?;
+                // Only the masked-out rows are proven (false); masked-in
+                // rows remain candidates.
+                let rejected = !mask;
+                self.proven = &self.proven | &prove(len, &local, Some(&rejected))?;
+            }
+        }
+        Ok(())
+    }
+
+    /// The version this evidence was requested at.
+    pub fn version(&self) -> PredicateVersion {
+        self.version
+    }
+}
+
+/// A morsel-length AND-constraint: `mask` (or all-false, proving the span
+/// rejected) inside `span`, `true` — the AND identity — outside it.
+fn constrain(len: usize, span: &Range<usize>, mask: Option<&Mask>) -> VortexResult<Mask> {
+    placed(len, span, mask, false, true)
+}
+
+/// A morsel-length proof: `mask` (or all-true, proving the whole span)
+/// inside `span`, `false` — the OR identity — outside it.
+fn prove(len: usize, span: &Range<usize>, mask: Option<&Mask>) -> VortexResult<Mask> {
+    placed(len, span, mask, true, false)
+}
+
+/// `mask` (or `fill`) placed at `span`, `outside` everywhere else.
+///
+/// The result has length `len`. Errors when `span` is reversed or reaches
+/// past `len`, or when `mask` is given and its length differs from the
+/// span's length.
+fn placed(
+    len: usize,
+    span: &Range<usize>,
+    mask: Option<&Mask>,
+    fill: bool,
+    outside: bool,
+) -> VortexResult<Mask> {
+    // Reject malformed spans up front: the subtractions below would
+    // otherwise underflow instead of reporting an error.
+    if span.start > span.end || span.end > len {
+        vortex_bail!("evidence span {span:?} is out of bounds for morsel length {len}");
+    }
+    let span_len = span.end - span.start;
+    let inner = match mask {
+        Some(mask) => {
+            if mask.len() != span_len {
+                vortex_bail!(
+                    "evidence mask length {} does not match its row range length {span_len}",
+                    mask.len()
+                );
+            }
+            mask.clone()
+        }
+        None => Mask::new(span_len, fill),
+    };
+    let lead = Mask::new(span.start, outside);
+    let tail = Mask::new(len - span.end, outside);
+    Mask::concat([&lead, &inner, &tail].into_iter())
+}
+
+/// Slice a fragment's mask down to the morsel-clipped part of its range.
+///
+/// `rows` is the fragment's declared range and `span` the part of it that
+/// lies inside the morsel. Errors when `mask` does not have exactly one
+/// entry per row of `rows`.
+fn clip_mask(mask: Mask, rows: &Range<u64>, span: &Range<u64>) -> VortexResult<Mask> {
+    let full = range_len(rows)?;
+    if mask.len() != full {
+        vortex_bail!(
+            "evidence mask length {} does not match its row range {rows:?}",
+            mask.len()
+        );
+    }
+    // Nothing was clipped: hand the mask back without slicing.
+    if span == rows {
+        return Ok(mask);
+    }
+    let start = usize::try_from(span.start - rows.start)
+        .map_err(|_| vortex_err!("evidence fragment exceeds usize"))?;
+    let end = usize::try_from(span.end - rows.start)
+        .map_err(|_| vortex_err!("evidence fragment exceeds usize"))?;
+    Ok(mask.slice(start..end))
+}
+
+/// The number of rows in `range` as a `usize`. A reversed range counts as
+/// empty (the subtraction saturates at zero); a length that does not fit
+/// in `usize` is an error.
+fn range_len(range: &Range<u64>) -> VortexResult<usize> {
+    usize::try_from(range.end.saturating_sub(range.start))
+        .map_err(|_| vortex_err!("row range {range:?} exceeds usize"))
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_error::VortexExpect;
+
+    use super::*;
+
+    /// Fresh evidence for predicate 0 at the static version over `range`.
+    fn evidence(range: Range<u64>) -> PredicateEvidence {
+        PredicateEvidence::new(PredicateId::new(0), PredicateVersion::STATIC, range)
+            .vortex_expect("fresh evidence")
+    }
+
+    /// Nothing absorbed: everything is a candidate, nothing is proven.
+    #[test]
+    fn fresh_evidence_proves_nothing() {
+        let acc = evidence(100..200);
+        assert!(acc.maybe().all_true());
+        assert!(acc.proven().all_false());
+        assert!(acc.unproven().all_true());
+        assert!(!acc.all_false());
+    }
+
+    /// A whole-range AllFalse fragment kills the morsel.
+    #[test]
+    fn all_false_whole_range() -> VortexResult<()> {
+        let mut acc = evidence(100..200);
+        acc.absorb(EvidenceFragment::new(
+            100..200,
+            PredicateEvidenceKind::AllFalse,
+        ))?;
+        assert!(acc.all_false());
+        assert!(acc.proven().all_true());
+        assert!(acc.unproven().all_false());
+        Ok(())
+    }
+
+    /// Partial coverage: an interior AllFalse zone clears only its span
+    /// and leaves the edges unproven.
+    #[test]
+    fn partial_all_false_leaves_edges_unproven() -> VortexResult<()> {
+        let mut acc = evidence(100..200);
+        acc.absorb(EvidenceFragment::new(
+            120..150,
+            PredicateEvidenceKind::AllFalse,
+        ))?;
+        assert!(!acc.all_false());
+        // 100 morsel rows minus the 30 rejected ones.
+        assert_eq!(acc.maybe().true_count(), 70);
+        assert_eq!(acc.proven().true_count(), 30);
+        assert!(!acc.maybe().value(20)); // row 120
+        assert!(acc.maybe().value(50)); // row 150
+        assert_eq!(acc.unproven().true_count(), 70);
+        Ok(())
+    }
+
+    /// AllTrue proves rows without shrinking the candidate set.
+    #[test]
+    fn all_true_proves_without_filtering() -> VortexResult<()> {
+        let mut acc = evidence(0..100);
+        acc.absorb(EvidenceFragment::new(0..40, PredicateEvidenceKind::AllTrue))?;
+        assert!(acc.maybe().all_true());
+        assert_eq!(acc.proven().true_count(), 40);
+        assert_eq!(acc.unproven().true_count(), 60);
+        Ok(())
+    }
+
+    /// An exact mask proves its whole span: selected rows survive,
+    /// rejected rows leave, and no residual evaluation remains there.
+    #[test]
+    fn exact_mask_proves_whole_span() -> VortexResult<()> {
+        let mut acc = evidence(0..100);
+        acc.absorb(EvidenceFragment::new(
+            10..20,
+            PredicateEvidenceKind::ExactMask(Mask::from_indices(10, [2, 5])),
+        ))?;
+        // 8 of the 10 covered rows are rejected: 100 - 8 = 92.
+        assert_eq!(acc.maybe().true_count(), 92);
+        assert!(acc.maybe().value(12));
+        assert!(acc.maybe().value(15));
+        assert!(!acc.maybe().value(13));
+        assert_eq!(acc.proven().true_count(), 10);
+        // The two surviving rows are proven, not residual candidates.
+        assert!(!acc.unproven().value(12));
+        Ok(())
+    }
+
+    /// A candidate mask rejects masked-out rows but keeps masked-in rows
+    /// residual.
+    #[test]
+    fn candidate_mask_keeps_residual() -> VortexResult<()> {
+        let mut acc = evidence(0..100);
+        acc.absorb(EvidenceFragment::new(
+            10..20,
+            PredicateEvidenceKind::CandidateMask(Mask::from_indices(10, [2, 5])),
+        ))?;
+        assert_eq!(acc.maybe().true_count(), 92);
+        // Rejected rows are proven false; candidates are not proven.
+        assert_eq!(acc.proven().true_count(), 8);
+        assert!(acc.unproven().value(12));
+        assert!(acc.unproven().value(15));
+        assert!(!acc.unproven().value(13));
+        Ok(())
+    }
+
+    /// Fragments combine: evidence from several producers intersects
+    /// candidates and unions proofs.
+    #[test]
+    fn fragments_combine_across_producers() -> VortexResult<()> {
+        let mut acc = evidence(0..100);
+        // A zone map proves rows 0..50 all-false.
+        acc.absorb(EvidenceFragment::new(
+            0..50,
+            PredicateEvidenceKind::AllFalse,
+        ))?;
+        // An index proves rows 40..100 exactly: only rows 60 and 70 match.
+        acc.absorb(EvidenceFragment::new(
+            40..100,
+            PredicateEvidenceKind::ExactMask(Mask::from_indices(60, [20, 30])),
+        ))?;
+        assert_eq!(acc.maybe().true_count(), 2);
+        assert!(acc.maybe().value(60));
+        assert!(acc.maybe().value(70));
+        assert!(acc.proven().all_true());
+        assert!(acc.unproven().all_false());
+        Ok(())
+    }
+
+    /// Fragments are clipped to the morsel range, masks included.
+    #[test]
+    fn fragments_clip_to_range() -> VortexResult<()> {
+        let mut acc = evidence(100..200);
+        // Covers 50..150 in file coordinates; only 100..150 is in range.
+        acc.absorb(EvidenceFragment::new(
+            50..150,
+            PredicateEvidenceKind::ExactMask(Mask::from_indices(100, [40, 60])),
+        ))?;
+        // Index 40 falls before the morsel; index 60 is row 110.
+        assert_eq!(acc.maybe().true_count(), 51);
+        assert!(acc.maybe().value(10));
+        assert_eq!(acc.proven().true_count(), 50);
+        // Wholly disjoint fragments are ignored.
+        acc.absorb(EvidenceFragment::new(
+            0..100,
+            PredicateEvidenceKind::AllFalse,
+        ))?;
+        assert_eq!(acc.maybe().true_count(), 51);
+        Ok(())
+    }
+
+    /// A mask whose length does not match its declared range is an error.
+    #[test]
+    fn mismatched_mask_length_is_an_error() {
+        let mut acc = evidence(0..100);
+        // The fragment declares 50 rows but carries a 10-entry mask.
+        let result = acc.absorb(EvidenceFragment::new(
+            0..50,
+            PredicateEvidenceKind::ExactMask(Mask::new_true(10)),
+        ));
+        assert!(result.is_err());
+    }
+
+    /// Candidate ceilings degrade exact evidence kinds.
+    #[test]
+    fn exactness_ceiling_caps_kinds() {
+        let exact = PredicateEvidenceKind::ExactMask(Mask::new_true(4));
+        assert!(matches!(
+            exact.cap(Exactness::Candidate),
+            PredicateEvidenceKind::CandidateMask(_)
+        ));
+        assert!(matches!(
+            PredicateEvidenceKind::AllTrue.cap(Exactness::Candidate),
+            PredicateEvidenceKind::Unknown
+        ));
+        // Rejection stays within a candidate's authority.
+        assert!(matches!(
+            PredicateEvidenceKind::AllFalse.cap(Exactness::Candidate),
+            PredicateEvidenceKind::AllFalse
+        ));
+    }
+}
diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs
new file mode 100644
index 00000000000..42a71a4c11b
--- /dev/null
+++ b/vortex-layout/src/scan/v2/layouts/chunked.rs
@@ -0,0 +1,988 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Scan2 rule for chunked layouts.
+//!
+//! Chunks stay *lazy*: children are resolved from the footer and expanded
+//! through their own rules per request, never pre-planned. Chunked is
+//! therefore a lazy pushdown boundary: pushed expressions are recorded
+//! once, then replayed into each concrete child only when a read,
+//! evidence request, or aggregate touches that chunk. This lets
+//! child-local layouts such as zoned, dictionary, or index wrappers keep
+//! their scan behavior without expanding every chunk up front.
+//!
+//! The selected read path is where chunking pays off (plan 017 SP5): a
+//! chunk whose selection slice is empty is skipped outright — its node is
+//! never expanded, its state never created, its segments never fetched.
+ +use std::fmt; +use std::ops::Range; +use std::sync::Arc; +#[cfg(debug_assertions)] +use std::sync::atomic::AtomicU64; +#[cfg(debug_assertions)] +use std::sync::atomic::Ordering; + +use futures::future::BoxFuture; +use parking_lot::Mutex; +use rustc_hash::FxHashMap; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::arrays::ChunkedArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::dtype::DType; +use vortex_array::expr::Expression; +use vortex_array::expr::is_root; +use vortex_array::expr::root; +use vortex_array::scalar::Scalar; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_session::VortexSession; + +use crate::LayoutEncodingId; +use crate::LayoutRef; +use crate::layouts::chunked::ChunkedLayoutEncoding; +use crate::scan::v2::evidence::EvidenceFragment; +use crate::scan::v2::node::AggregateAnswer; +use crate::scan::v2::node::AggregatePlan; +use crate::scan::v2::node::AggregatePlanRef; +use crate::scan::v2::node::EvidencePlan; +use crate::scan::v2::node::EvidencePlanRef; +use crate::scan::v2::node::ExpandCtx; +use crate::scan::v2::node::FileReader; +use crate::scan::v2::node::LayoutScanRule; +use crate::scan::v2::node::PlanCtx; +use crate::scan::v2::node::PushCtx; +use crate::scan::v2::node::ReadPlan; +use crate::scan::v2::node::ReadPlanRef; +use crate::scan::v2::node::RowScope; +use crate::scan::v2::node::ScanNode; +use crate::scan::v2::node::ScanNodeRef; +use crate::scan::v2::node::ScanStateCache; +use crate::scan::v2::node::ScanStateRef; +use crate::scan::v2::node::StateCtx; +use crate::scan::v2::node::downcast_state; +use crate::scan::v2::request::EvidenceMode; +use crate::scan::v2::request::EvidenceRequest; +use crate::scan::v2::request::NodeRequest; + +/// Scan2 rule for `vortex.chunked`. 
+#[derive(Debug)]
+pub struct ChunkedScanRule;
+
+impl LayoutScanRule for ChunkedScanRule {
+    type Node = ChunkedScanNode;
+
+    fn id(&self) -> LayoutEncodingId {
+        ChunkedLayoutEncoding.id()
+    }
+
+    /// Builds the chunked node from the layout's child row offsets. No
+    /// chunk is expanded here: both per-chunk caches start empty.
+    fn expand(
+        &self,
+        layout: &LayoutRef,
+        _req: &mut NodeRequest,
+        cx: &ExpandCtx,
+    ) -> VortexResult {
+        // A child without a row offset is rejected as an auxiliary child.
+        let mut offsets = layout
+            .child_row_offsets()
+            .map(|offset| offset.ok_or_else(|| vortex_err!("chunked layout with auxiliary child")))
+            .collect::>>()?;
+        // Close the last chunk so that `offsets.len() == chunks + 1`.
+        offsets.push(layout.row_count());
+        Ok(ChunkedScanNode {
+            layout: Arc::clone(layout),
+            offsets,
+            cx: cx.clone(),
+            children: Mutex::new(FxHashMap::default()),
+            reads: Mutex::new(FxHashMap::default()),
+        })
+    }
+}
+
+/// Reads a chunked layout: cumulative chunk offsets
+/// (`offsets.len() == chunks + 1`), with chunk children expanded lazily
+/// through their own rules.
+pub struct ChunkedScanNode {
+    layout: LayoutRef,
+    offsets: Vec,
+    cx: ExpandCtx,
+    /// Lazily expanded chunk nodes, shared across queries.
+    children: Mutex>,
+    /// Lazily planned chunk reads, shared across queries.
+    reads: Mutex>,
+}
+
+/// Per-query states of the lazily expanded chunk nodes. Chunk states
+/// behind the scan's morsel frontier are dropped by
+/// [`ScanNode::release`], so a long scan retains the working set, not
+/// every chunk it touched.
+#[derive(Default)]
+pub struct ChunkedScanState {
+    children: Mutex>,
+    node_states: Mutex>,
+    /// Every chunk whose state was ever created (never cleared by
+    /// release), for read-avoidance tests.
+    #[cfg(any(test, debug_assertions))]
+    created: Mutex>,
+    /// Highest released frontier, for the debug no-read-behind check.
+    #[cfg(debug_assertions)]
+    released: AtomicU64,
+}
+
+/// A pushed expression over a chunked layout.
+///
+/// Chunk children remain lazy: this node records the expression once and
+/// replays expression pushdown into each concrete child only when a read,
+/// evidence request, or aggregate touches that chunk.
+pub struct ChunkedExprScanNode {
+    chunked: Arc,
+    expr: Expression,
+    dtype: DType,
+    children: Mutex>,
+    reads: Mutex>,
+}
+
+/// Per-query states of lazily pushed chunk children.
+pub struct ChunkedExprScanState {
+    chunked: ScanStateRef,
+    children: Mutex>,
+    #[cfg(debug_assertions)]
+    released: AtomicU64,
+}
+
+struct ChunkedEvidencePlan {
+    node: Arc,
+}
+
+enum ChunkedAggregateNode {
+    Root(Arc),
+    Expr(Arc),
+}
+
+struct ChunkedAggregatePlan {
+    node: ChunkedAggregateNode,
+    dtype: DType,
+    funcs: Vec,
+}
+
+struct ChunkedReadPlan {
+    node: Arc,
+}
+
+struct ChunkedExprReadPlan {
+    node: Arc,
+}
+
+#[derive(Default)]
+struct ChunkedEvidenceState {
+    children: Mutex>>,
+    recheck_children: Mutex>>,
+}
+
+#[derive(Default)]
+struct ChunkedAggregateState {
+    children: Mutex>>,
+}
+
+impl ChunkedScanState {
+    /// The number of chunk states currently retained.
+    #[allow(dead_code)]
+    #[cfg(any(test, debug_assertions))]
+    pub fn retained_children(&self) -> usize {
+        self.children.lock().len()
+    }
+
+    /// Whether chunk `idx` was ever read this query (release does not
+    /// clear this).
+    #[allow(dead_code)]
+    #[cfg(any(test, debug_assertions))]
+    pub fn touched(&self, idx: usize) -> bool {
+        self.created.lock().contains(&idx)
+    }
+}
+
+impl ChunkedScanNode {
+    /// The scan node for chunk `idx`, expanding it on first use. Lazy
+    /// expansion is independent of pushed predicate expressions.
+    ///
+    /// Expansion runs without holding the cache lock, so two callers can
+    /// race to expand the same chunk. The first node inserted wins and
+    /// every caller is handed that node, matching `child_read`.
+    fn child(&self, idx: usize) -> VortexResult<ScanNodeRef> {
+        if let Some(hit) = self.children.lock().get(&idx) {
+            return Ok(Arc::clone(hit));
+        }
+        let node = self.cx.expand_free(&self.layout.child(idx)?)?;
+        // `entry().or_insert` keeps an already-cached node instead of
+        // overwriting it with this caller's duplicate.
+        let mut children = self.children.lock();
+        Ok(Arc::clone(children.entry(idx).or_insert(node)))
+    }
+
+    /// The planned value read for chunk `idx`, creating it on first use.
+    fn child_read(&self, idx: usize, session: &VortexSession) -> VortexResult<ReadPlanRef> {
+        if let Some(hit) = self.reads.lock().get(&idx) {
+            return Ok(Arc::clone(hit));
+        }
+        let node = self.child(idx)?;
+        let mut cx = PlanCtx::new(session.clone());
+        let read = node
+            .plan_read(&mut cx)?
+            .ok_or_else(|| vortex_err!("chunked child {idx} did not produce a read plan"))?;
+        // Planning ran unlocked; keep whichever plan was cached first.
+        let mut reads = self.reads.lock();
+        Ok(Arc::clone(reads.entry(idx).or_insert(read)))
+    }
+
+    /// Chunk `idx`'s per-query state, creating it on first use.
+    ///
+    /// The lookup and the creation are not one critical section, so two
+    /// callers can both miss and both build a state. The first state
+    /// cached wins and every caller is handed that state, as in
+    /// `ChunkedExprScanNode::child_read_state`.
+    fn child_read_state(
+        &self,
+        idx: usize,
+        read: &ReadPlanRef,
+        state: &ChunkedScanState,
+        io: &FileReader,
+    ) -> VortexResult<ScanStateRef> {
+        if let Some(hit) = state.children.lock().get(&idx) {
+            return Ok(Arc::clone(hit));
+        }
+        let mut caches = state.node_states.lock();
+        let cache = caches.entry(idx).or_default();
+        let mut cx = StateCtx::new(io.session(), cache);
+        let child_state = read.init_state(&mut cx)?;
+        // Keep an already-cached state rather than overwriting it.
+        let child_state = {
+            let mut children = state.children.lock();
+            Arc::clone(children.entry(idx).or_insert(child_state))
+        };
+        #[cfg(any(test, debug_assertions))]
+        state.created.lock().insert(idx);
+        Ok(child_state)
+    }
+
+    /// Index of the chunk containing row `start`: the last chunk whose
+    /// starting offset is `<= start`, or 0 when no offset qualifies.
+    fn first_chunk(&self, start: u64) -> usize {
+        self.offsets
+            .partition_point(|&offset| offset <= start)
+            .saturating_sub(1)
+    }
+}
+
+impl ChunkedExprScanNode {
+    /// Records `expr` over `chunked`; no chunk child is pushed yet.
+    fn new(chunked: Arc<ChunkedScanNode>, expr: Expression, dtype: DType) -> Self {
+        Self {
+            chunked,
+            expr,
+            dtype,
+            children: Mutex::new(FxHashMap::default()),
+            reads: Mutex::new(FxHashMap::default()),
+        }
+    }
+
+    /// Chunk `idx` with the recorded expression pushed into it, pushing
+    /// on first use. Errors when the child cannot accept the expression.
+    fn child(&self, idx: usize, session: &VortexSession) -> VortexResult<ScanNodeRef> {
+        if let Some(hit) = self.children.lock().get(&idx) {
+            return Ok(Arc::clone(hit));
+        }
+        let child = self.chunked.child(idx)?;
+        let mut cx = PushCtx::new(session.clone());
+        let pushed = child.try_push_expr(&self.expr, &mut cx)?.ok_or_else(|| {
+            vortex_err!(
+                "chunked child {idx} could not push expression {}",
+                self.expr
+            )
+        })?;
+        let mut children = self.children.lock();
+        Ok(Arc::clone(children.entry(idx).or_insert(pushed)))
+    }
+
+    /// The planned
value read for pushed chunk child `idx`. + fn child_read(&self, idx: usize, session: &VortexSession) -> VortexResult { + if let Some(hit) = self.reads.lock().get(&idx) { + return Ok(Arc::clone(hit)); + } + let node = self.child(idx, session)?; + let mut cx = PlanCtx::new(session.clone()); + let read = node.plan_read(&mut cx)?.ok_or_else(|| { + vortex_err!("chunked expression child {idx} did not produce a read plan") + })?; + let mut reads = self.reads.lock(); + Ok(Arc::clone(reads.entry(idx).or_insert(read))) + } + + fn child_read_state( + &self, + idx: usize, + read: &ReadPlanRef, + state: &ChunkedExprScanState, + io: &FileReader, + ) -> VortexResult { + if let Some(hit) = state.children.lock().get(&idx) { + return Ok(Arc::clone(hit)); + } + let chunked_state = state + .chunked + .downcast_ref::() + .ok_or_else(|| vortex_err!("chunked expression state type mismatch"))?; + let mut caches = chunked_state.node_states.lock(); + let cache = caches.entry(idx).or_default(); + let mut cx = StateCtx::new(io.session(), cache); + let child_state = read.init_state(&mut cx)?; + let mut children = state.children.lock(); + Ok(Arc::clone(children.entry(idx).or_insert(child_state))) + } +} + +impl ChunkedAggregateNode { + fn offsets(&self) -> &[u64] { + match self { + Self::Root(node) => &node.offsets, + Self::Expr(node) => &node.chunked.offsets, + } + } + + fn first_chunk(&self, start: u64) -> usize { + match self { + Self::Root(node) => node.first_chunk(start), + Self::Expr(node) => node.chunked.first_chunk(start), + } + } + + fn child(&self, idx: usize, io: &FileReader) -> VortexResult { + match self { + Self::Root(node) => node.child(idx), + Self::Expr(node) => node.child(idx, io.session()), + } + } +} + +impl ChunkedAggregatePlan { + fn child_plan( + &self, + idx: usize, + state: &ChunkedAggregateState, + io: &FileReader, + ) -> VortexResult> { + if let Some(hit) = state.children.lock().get(&idx) { + return Ok(hit.clone()); + } + let child = self.node.child(idx, io)?; + let 
mut plan_ctx = PlanCtx::new(io.session().clone()); + let planned = match child.plan_aggregate_partial(&self.funcs, &mut plan_ctx)? { + Some(plan) => { + let plan_state = plan.init_state(io.session())?; + Some((plan, plan_state)) + } + None => None, + }; + let mut children = state.children.lock(); + Ok(children.entry(idx).or_insert(planned).clone()) + } +} + +impl AggregatePlan for ChunkedAggregatePlan { + type State = ChunkedAggregateState; + + fn init_state(&self, _ctx: &VortexSession) -> VortexResult { + Ok(ChunkedAggregateState::default()) + } + + fn aggregate_partial<'a>( + &'a self, + range: Range, + io: &'a FileReader, + state: &'a ChunkedAggregateState, + ) -> BoxFuture<'a, VortexResult>>> { + Box::pin(async move { + if range.start >= range.end { + return Ok(None); + } + let mut accumulators = self + .funcs + .iter() + .map(|func| { + func.state_dtype(&self.dtype) + .map(|_| func.accumulator(&self.dtype)) + .transpose() + }) + .collect::>>()?; + let mut contributed = vec![false; self.funcs.len()]; + let mut covered = vec![false; self.funcs.len()]; + let mut residuals: Vec>> = vec![Vec::new(); self.funcs.len()]; + let push_residual = + |residual: &mut Vec>, span: Range| match residual.last_mut() { + Some(last) if last.end == span.start => last.end = span.end, + _ => residual.push(span), + }; + + let offsets = self.node.offsets(); + let mut idx = self.node.first_chunk(range.start); + while idx + 1 < offsets.len() && offsets[idx] < range.end { + let chunk_start = offsets[idx]; + let chunk_end = offsets[idx + 1]; + let local = range.start.saturating_sub(chunk_start) + ..(range.end.min(chunk_end) - chunk_start); + let answers = match self.child_plan(idx, state, io)? { + Some((plan, plan_state)) => { + plan.aggregate_partial(local.clone(), io, plan_state.as_ref()) + .await? 
+ } + None => None, + }; + match answers { + Some(answers) => { + for (func_idx, answer) in answers.into_iter().enumerate() { + let has_partial = answer.partial.is_some(); + let mut residual_rows = 0; + for span in answer.residual { + residual_rows += span.end - span.start; + push_residual( + &mut residuals[func_idx], + chunk_start + span.start..chunk_start + span.end, + ); + } + if let Some(partial) = answer.partial { + let Some(Some(acc)) = accumulators.get_mut(func_idx) else { + vortex_bail!("chunk answered an unsupported aggregate"); + }; + acc.combine_partials(partial)?; + contributed[func_idx] = true; + } + covered[func_idx] |= + has_partial || residual_rows < local.end - local.start; + } + } + None => { + for residual in residuals.iter_mut() { + push_residual( + residual, + chunk_start + local.start..chunk_start + local.end, + ); + } + } + } + idx += 1; + } + if !covered.iter().any(|&covered| covered) { + return Ok(None); + } + let mut answers = Vec::with_capacity(self.funcs.len()); + for ((accumulator, contributed), residual) in + accumulators.iter_mut().zip(contributed).zip(residuals) + { + let partial = match accumulator { + Some(acc) if contributed => Some(acc.flush()?), + _ => None, + }; + answers.push(AggregateAnswer { partial, residual }); + } + Ok(Some(answers)) + }) + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked") + } +} + +impl ScanNode for ChunkedScanNode { + type State = ChunkedScanState; + + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(ChunkedScanState::default()) + } + + fn plan_read(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { + Ok(Some(Arc::new(ChunkedReadPlan { node: self }))) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + _cx: &mut PushCtx, + ) -> VortexResult> { + if is_root(expr) { + return Ok(Some(self)); + } + let dtype = expr.return_dtype(self.layout.dtype())?; + Ok(Some(Arc::new(ChunkedExprScanNode::new( + self, + expr.clone(), + dtype, + )))) + } 
+ + fn plan_evidence(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { + Ok(vec![Arc::new(ChunkedEvidencePlan { + node: Arc::new(ChunkedExprScanNode::new( + Arc::clone(&self), + root(), + self.layout.dtype().clone(), + )), + })]) + } + + fn plan_aggregate_partial( + self: Arc, + funcs: &[AggregateFnRef], + _cx: &mut PlanCtx, + ) -> VortexResult> { + if funcs.is_empty() { + return Ok(None); + } + Ok(Some(Arc::new(ChunkedAggregatePlan { + node: ChunkedAggregateNode::Root(Arc::clone(&self)), + dtype: self.layout.dtype().clone(), + funcs: funcs.to_vec(), + }))) + } + + fn split_hints(&self) -> Option<&[u64]> { + Some(&self.offsets) + } + + /// Drop chunk states wholly behind the frontier and recurse into the + /// boundary chunk so nested layouts release their own state. The + /// expanded chunk *nodes* stay: they are shared across queries and + /// hold no data. + fn release(&self, frontier: u64, state: &ChunkedScanState) -> VortexResult<()> { + state + .children + .lock() + .retain(|&idx, _| self.offsets[idx + 1] > frontier); + state + .node_states + .lock() + .retain(|&idx, _| self.offsets[idx + 1] > frontier); + let idx = self.first_chunk(frontier); + if idx + 1 < self.offsets.len() && self.offsets[idx] < frontier { + let child_state = state.children.lock().get(&idx).cloned(); + let child = self.reads.lock().get(&idx).cloned(); + if let (Some(child), Some(child_state)) = (child, child_state) { + child.release(frontier - self.offsets[idx], child_state.as_ref())?; + } + } + #[cfg(debug_assertions)] + state.released.fetch_max(frontier, Ordering::Relaxed); + Ok(()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked({})", self.offsets.len().saturating_sub(1)) + } +} + +impl ReadPlan for ChunkedReadPlan { + type State = ScanStateRef; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + let node: ScanNodeRef = Arc::::clone(&self.node); + cx.init_node(&node) + } + + /// The chunked scoped read: slice the selection 
and demand per + /// overlapping chunk, skip chunks whose selection is all-false, and + /// represent selected-but-undemanded chunks with dtype-default filler + /// without expanding or reading the child. + fn read_scoped<'a>( + &'a self, + range: Range, + rows: RowScope<'a>, + io: &'a FileReader, + state: &'a Self::State, + local_ctx: &'a mut ExecutionCtx, + ) -> BoxFuture<'a, VortexResult> { + let state = match downcast_state::(state.as_ref()) { + Ok(state) => state, + Err(e) => return Box::pin(async move { Err(e) }), + }; + Box::pin(async move { + if range.start >= range.end { + vortex_bail!("empty chunked scoped read range"); + } + #[cfg(debug_assertions)] + { + let released = state.released.load(Ordering::Relaxed); + debug_assert!( + range.start >= released, + "chunked read {range:?} below the released frontier {released}" + ); + } + let range_len = usize::try_from(range.end - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + if rows.selection.len() != range_len { + vortex_bail!( + "selection length {} does not match range length {range_len}", + rows.selection.len() + ); + } + if rows.demand.len() != range_len { + vortex_bail!( + "demand length {} does not match range length {range_len}", + rows.demand.len() + ); + } + if rows.selection.all_false() { + return Ok( + ConstantArray::new(Scalar::default_value(self.node.layout.dtype()), 0) + .into_array(), + ); + } + + let dtype = self.node.layout.dtype().clone(); + let dense_scope = rows.selection.all_true() && rows.demand.all_true(); + let selected_scope = !dense_scope && rows.demands_all_selected(); + let mut parts = Vec::new(); + let mut idx = self.node.first_chunk(range.start); + while idx + 1 < self.node.offsets.len() && self.node.offsets[idx] < range.end { + let chunk_start = self.node.offsets[idx]; + let chunk_end = self.node.offsets[idx + 1]; + let local = range.start.saturating_sub(chunk_start) + ..(range.end.min(chunk_end) - chunk_start); + let sel_start = 
usize::try_from(chunk_start.max(range.start) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let chunk_selection = rows.selection.slice(sel_start..sel_end); + idx += 1; + if chunk_selection.all_false() { + continue; + } + let chunk_demand = rows.demand.slice(sel_start..sel_end); + if chunk_demand.all_false() { + parts.push( + ConstantArray::new( + Scalar::default_value(&dtype), + chunk_selection.true_count(), + ) + .into_array(), + ); + continue; + } + let chunk_idx = idx - 1; + let read = self.node.child_read(chunk_idx, io.session())?; + let child_state = self.node.child_read_state(chunk_idx, &read, state, io)?; + let chunk = if dense_scope || selected_scope { + read.read_scoped( + local, + RowScope::selected(&chunk_selection), + io, + child_state.as_ref(), + local_ctx, + ) + .await? + } else { + let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; + read.read_scoped(local, chunk_rows, io, child_state.as_ref(), local_ctx) + .await? + }; + if chunk.len() != chunk_selection.true_count() { + vortex_bail!( + "scoped chunk read returned length {}, expected {}", + chunk.len(), + chunk_selection.true_count() + ); + } + parts.push(chunk); + } + match parts.len() { + 0 => vortex_bail!("chunked scoped read range {range:?} out of bounds"), + 1 => Ok(parts.swap_remove(0)), + _ => Ok(ChunkedArray::try_new(parts, dtype)?.into_array()), + } + }) + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.node + .release(frontier, downcast_state::(state.as_ref())?) 
+ } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +impl ScanNode for ChunkedExprScanNode { + type State = ChunkedExprScanState; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + let chunked: ScanNodeRef = Arc::::clone(&self.chunked); + Ok(ChunkedExprScanState { + chunked: cx.init_node(&chunked)?, + children: Mutex::new(FxHashMap::default()), + #[cfg(debug_assertions)] + released: AtomicU64::new(0), + }) + } + + fn plan_read(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { + Ok(Some(Arc::new(ChunkedExprReadPlan { node: self }))) + } + + fn plan_evidence(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { + Ok(vec![Arc::new(ChunkedEvidencePlan { node: self })]) + } + + fn plan_aggregate_partial( + self: Arc, + funcs: &[AggregateFnRef], + _cx: &mut PlanCtx, + ) -> VortexResult> { + if funcs.is_empty() { + return Ok(None); + } + Ok(Some(Arc::new(ChunkedAggregatePlan { + node: ChunkedAggregateNode::Expr(Arc::clone(&self)), + dtype: self.dtype.clone(), + funcs: funcs.to_vec(), + }))) + } + + fn split_hints(&self) -> Option<&[u64]> { + Some(&self.chunked.offsets) + } + + fn release(&self, frontier: u64, state: &ChunkedExprScanState) -> VortexResult<()> { + state + .children + .lock() + .retain(|&idx, _| self.chunked.offsets[idx + 1] > frontier); + if let Some(chunked_state) = state.chunked.downcast_ref::() { + chunked_state + .node_states + .lock() + .retain(|&idx, _| self.chunked.offsets[idx + 1] > frontier); + } + let idx = self.chunked.first_chunk(frontier); + if idx + 1 < self.chunked.offsets.len() && self.chunked.offsets[idx] < frontier { + let child_state = state.children.lock().get(&idx).cloned(); + let child = self.reads.lock().get(&idx).cloned(); + if let (Some(child), Some(child_state)) = (child, child_state) { + child.release(frontier - self.chunked.offsets[idx], child_state.as_ref())?; + } + } + #[cfg(debug_assertions)] + state.released.fetch_max(frontier, Ordering::Relaxed); + Ok(()) + } + 
+ fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked_expr({})", self.expr) + } +} + +impl ReadPlan for ChunkedExprReadPlan { + type State = ScanStateRef; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + let node: ScanNodeRef = Arc::::clone(&self.node); + cx.init_node(&node) + } + + fn read_scoped<'a>( + &'a self, + range: Range, + rows: RowScope<'a>, + io: &'a FileReader, + state: &'a Self::State, + local_ctx: &'a mut ExecutionCtx, + ) -> BoxFuture<'a, VortexResult> { + let state = match downcast_state::(state.as_ref()) { + Ok(state) => state, + Err(e) => return Box::pin(async move { Err(e) }), + }; + Box::pin(async move { + if range.start >= range.end { + vortex_bail!("empty chunked scoped read range"); + } + #[cfg(debug_assertions)] + { + let released = state.released.load(Ordering::Relaxed); + debug_assert!( + range.start >= released, + "chunked expression read {range:?} below the released frontier {released}" + ); + } + let range_len = usize::try_from(range.end - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + if rows.selection.len() != range_len { + vortex_bail!( + "selection length {} does not match range length {range_len}", + rows.selection.len() + ); + } + if rows.demand.len() != range_len { + vortex_bail!( + "demand length {} does not match range length {range_len}", + rows.demand.len() + ); + } + if rows.selection.all_false() { + return Ok( + ConstantArray::new(Scalar::default_value(&self.node.dtype), 0).into_array(), + ); + } + + let dense_scope = rows.selection.all_true() && rows.demand.all_true(); + let selected_scope = !dense_scope && rows.demands_all_selected(); + let mut parts = Vec::new(); + let mut idx = self.node.chunked.first_chunk(range.start); + while idx + 1 < self.node.chunked.offsets.len() + && self.node.chunked.offsets[idx] < range.end + { + let chunk_start = self.node.chunked.offsets[idx]; + let chunk_end = self.node.chunked.offsets[idx + 1]; + let local = 
range.start.saturating_sub(chunk_start) + ..(range.end.min(chunk_end) - chunk_start); + let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let chunk_selection = rows.selection.slice(sel_start..sel_end); + idx += 1; + if chunk_selection.all_false() { + continue; + } + let chunk_demand = rows.demand.slice(sel_start..sel_end); + if chunk_demand.all_false() { + parts.push( + ConstantArray::new( + Scalar::default_value(&self.node.dtype), + chunk_selection.true_count(), + ) + .into_array(), + ); + continue; + } + let chunk_idx = idx - 1; + let read = self.node.child_read(chunk_idx, io.session())?; + let child_state = self.node.child_read_state(chunk_idx, &read, state, io)?; + let chunk = if dense_scope || selected_scope { + read.read_scoped( + local, + RowScope::selected(&chunk_selection), + io, + child_state.as_ref(), + local_ctx, + ) + .await? + } else { + let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; + read.read_scoped(local, chunk_rows, io, child_state.as_ref(), local_ctx) + .await? 
+ }; + if chunk.len() != chunk_selection.true_count() { + vortex_bail!( + "scoped chunk read returned length {}, expected {}", + chunk.len(), + chunk_selection.true_count() + ); + } + parts.push(chunk); + } + match parts.len() { + 0 => vortex_bail!("chunked scoped read range {range:?} out of bounds"), + 1 => Ok(parts.swap_remove(0)), + _ => Ok(ChunkedArray::try_new(parts, self.node.dtype.clone())?.into_array()), + } + }) + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.node.release( + frontier, + downcast_state::(state.as_ref())?, + ) + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +impl EvidencePlan for ChunkedEvidencePlan { + type State = ChunkedEvidenceState; + + fn init_state(&self, _ctx: &VortexSession) -> VortexResult { + Ok(ChunkedEvidenceState::default()) + } + + fn evidence<'a>( + &'a self, + req: &'a EvidenceRequest<'a>, + io: &'a FileReader, + state: &'a ChunkedEvidenceState, + ) -> BoxFuture<'a, VortexResult>> { + Box::pin(async move { + if req.range.start >= req.range.end { + return Ok(Vec::new()); + } + let mut fragments = Vec::new(); + let mut idx = self.node.chunked.first_chunk(req.range.start); + while idx + 1 < self.node.chunked.offsets.len() + && self.node.chunked.offsets[idx] < req.range.end + { + let chunk_start = self.node.chunked.offsets[idx]; + let chunk_end = self.node.chunked.offsets[idx + 1]; + let local = req.range.start.saturating_sub(chunk_start) + ..(req.range.end.min(chunk_end) - chunk_start); + let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; + let child_plans = if let Some(hit) = state.children.lock().get(&idx) { + hit.clone() + } else if recheck { + if let Some(hit) = state.recheck_children.lock().get(&idx) { + hit.clone() + } else { + let node = self.node.child(idx, io.session())?; + let mut plan_ctx = PlanCtx::new(io.session().clone()); + let plans = node.plan_evidence(&mut plan_ctx)?; + let planned = plans + 
.into_iter() + .filter(|plan| plan.recheck_before_projection()) + .map(|plan| { + let plan_state = plan.init_state(io.session())?; + Ok((plan, plan_state)) + }) + .collect::>>()?; + let mut children = state.recheck_children.lock(); + children.entry(idx).or_insert(planned).clone() + } + } else { + let node = self.node.child(idx, io.session())?; + let mut plan_ctx = PlanCtx::new(io.session().clone()); + let plans = node.plan_evidence(&mut plan_ctx)?; + let planned = plans + .into_iter() + .map(|plan| { + let plan_state = plan.init_state(io.session())?; + Ok((plan, plan_state)) + }) + .collect::>>()?; + let mut children = state.children.lock(); + children.entry(idx).or_insert(planned).clone() + }; + if !child_plans.is_empty() { + let child_req = EvidenceRequest { + id: req.id, + version: req.version, + predicate: req.predicate, + range: local, + mode: req.mode, + }; + for (plan, plan_state) in child_plans { + if recheck && !plan.recheck_before_projection() { + continue; + } + for fragment in plan.evidence(&child_req, io, plan_state.as_ref()).await? { + fragments.push(translate_fragment(fragment, chunk_start)); + } + } + } + idx += 1; + } + Ok(fragments) + }) + } + + fn recheck_before_projection(&self) -> bool { + true + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked") + } +} + +fn translate_fragment(mut fragment: EvidenceFragment, offset: u64) -> EvidenceFragment { + fragment.rows = fragment.rows.start + offset..fragment.rows.end + offset; + fragment +} diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs new file mode 100644 index 00000000000..5e3021c7196 --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scan2 rule for dictionary layouts. +//! +//! Value reads keep the v1 shape — values read once per query and +//! 
cached, codes read per range (selection-aware), the pair rebuilt as a +//! lazy `DictArray`. What is new is the runtime value-domain rewrite (plan 017 +//! SP7): pushed dictionary predicate nodes answer by evaluating the +//! predicate over the *dictionary values* once per query, then mapping +//! the per-value verdicts through the codes: +//! +//! - no value satisfies the predicate (and null does not either): the +//! whole column is proven all-false without reading a single code; +//! - every value satisfies it (and, for a nullable column, null does +//! too): the whole column is proven all-true the same way; +//! - otherwise the per-value mask maps through the range's codes into an +//! exact per-row mask, costing a code read but never a value decode at +//! data scale. +//! +//! The rewrite is exact: evaluating the predicate over the values array +//! and indexing the result by code is the same value-domain evaluation +//! vortex's expression machinery performs over a `DictArray`, including +//! null routing (a null row takes the predicate's verdict on null). A +//! predicate whose evaluation over the values fails is recorded as +//! unanswerable and falls through to residual evaluation rather than +//! failing the scan.
+ +use std::fmt; +use std::ops::Range; +use std::sync::Arc; + +use futures::future::BoxFuture; +use parking_lot::Mutex; +use rustc_hash::FxHashMap; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::DictArray; +use vortex_array::dtype::DType; +use vortex_array::expr::Expression; +use vortex_array::expr::is_root; +use vortex_array::optimizer::ArrayOptimizer; +use vortex_array::scalar::Scalar; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_mask::Mask; +use vortex_session::VortexSession; + +use crate::LayoutEncodingId; +use crate::LayoutRef; +use crate::layouts::dict::Dict; +use crate::layouts::dict::DictLayoutEncoding; +use crate::scan::v2::evidence::EvidenceFragment; +use crate::scan::v2::evidence::PredicateEvidenceKind; +use crate::scan::v2::node::DynReadPlan; +use crate::scan::v2::node::EvidencePlan; +use crate::scan::v2::node::EvidencePlanRef; +use crate::scan::v2::node::ExpandCtx; +use crate::scan::v2::node::FileReader; +use crate::scan::v2::node::LayoutScanRule; +use crate::scan::v2::node::PlanCtx; +use crate::scan::v2::node::PushCtx; +use crate::scan::v2::node::ReadPlan; +use crate::scan::v2::node::ReadPlanRef; +use crate::scan::v2::node::RowScope; +use crate::scan::v2::node::ScanNode; +use crate::scan::v2::node::ScanNodeRef; +use crate::scan::v2::node::ScanStateCache; +use crate::scan::v2::node::ScanStateRef; +use crate::scan::v2::node::StateCtx; +use crate::scan::v2::node::read_dense; +use crate::scan::v2::request::EvidenceRequest; +use crate::scan::v2::request::NodeRequest; + +/// Scan2 rule for `vortex.dict`. 
+#[derive(Debug)]
+pub struct DictScanRule;
+
+impl LayoutScanRule for DictScanRule {
+    type Node = DictScanNode;
+
+    /// Returns the id of `DictLayoutEncoding`.
+    ///
+    /// NOTE(review): presumably this is the key the scan-rule registry uses
+    /// to match layouts to this rule — confirm against `LayoutScanRule`.
+    fn id(&self) -> LayoutEncodingId {
+        DictLayoutEncoding.id()
+    }
+
+    /// Expands a dict layout into a `DictScanNode`.
+    ///
+    /// Child 0 of `layout` is taken as the dictionary values and child 1 as
+    /// the codes; both are expanded through `cx.expand_free`. The node also
+    /// records the layout's dtype and the values child's row count. `_req`
+    /// is not used.
+    ///
+    /// # Errors
+    ///
+    /// Bails with "dict scan2 rule applied to …" when `layout` fails the
+    /// `is` type check, and propagates any error from `layout.child` or
+    /// `cx.expand_free`.
+    fn expand(
+        &self,
+        layout: &LayoutRef,
+        _req: &mut NodeRequest,
+        cx: &ExpandCtx,
+    ) -> VortexResult {
+        // NOTE(review): the type argument of `is::()` (and of the
+        // `VortexResult` return type above) is missing in this text; it
+        // looks stripped during extraction — restore from upstream.
+        if !layout.is::() {
+            vortex_bail!("dict scan2 rule applied to {}", layout.encoding_id());
+        }
+        let values = layout.child(0)?;
+        let codes = layout.child(1)?;
+        Ok(DictScanNode {
+            dtype: layout.dtype().clone(),
+            values_len: values.row_count(),
+            // Values and codes live in other row domains.
+            values: cx.expand_free(&values)?,
+            codes: cx.expand_free(&codes)?,
+        })
+    }
+}
+
+/// Reads a dict layout: shared values (another row domain, read once per
+/// query) plus a codes chain in this node's row domain.
+pub struct DictScanNode {
+    /// Dtype of the dict layout itself (`layout.dtype()` at expansion).
+    dtype: DType,
+    /// Expanded scan node for the values child (layout child 0).
+    values: ScanNodeRef,
+    /// Row count of the values child; the values read covers
+    /// `0..values_len`.
+    values_len: u64,
+    /// Expanded scan node for the codes child (layout child 1).
+    codes: ScanNodeRef,
+}
+
+/// One predicate's value-domain rewrite, computed once per query.
+enum ValueVerdicts {
+    /// The predicate could not be evaluated over the values; produce no
+    /// evidence and let residual evaluation handle it.
+    Unanswerable,
+    /// Per-value verdicts plus the verdict for null rows.
+    Verdicts {
+        /// `true` at value `v`: rows coded `v` satisfy the predicate.
+        mask: Mask,
+        /// Whether a null row satisfies the predicate.
+        null_verdict: bool,
+    },
+}
+
+/// Per-query state: the cached values relation, the child states, and
+/// the per-predicate value-domain verdicts.
+pub struct DictScanState {
+    /// Cached values array: starts as `None` and is set to `Some(..)` by the
+    /// first values read, after which later reads return the cached clone.
+    ///
+    /// NOTE(review): the type parameter of `Mutex` is missing in this text;
+    /// usage suggests an optional array reference — confirm upstream.
+    values: Mutex>,
+    /// State passed to the values read.
+    values_state: ScanStateRef,
+    /// State passed to the codes read; the only part of this state that is
+    /// forwarded on `release`.
+    codes_state: ScanStateRef,
+    /// Computed verdicts, looked up and inserted by predicate expression.
+    ///
+    /// NOTE(review): the map type is missing in this text; it is initialised
+    /// from `FxHashMap::default()` and holds `Arc`-shared `ValueVerdicts`.
+    verdicts: Mutex>>,
+}
+
+/// Planned dictionary value-domain evidence for one predicate.
+struct DictEvidencePlan {
+    /// Dtype of the dict column; when nullable, a separate verdict for null
+    /// rows is computed, otherwise that verdict is `false`.
+    dtype: DType,
+    /// Read plan for the dictionary values.
+    values_read: ReadPlanRef,
+    /// Number of value rows; the values read covers `0..values_len`.
+    values_len: u64,
+    /// Read plan for the codes, read over the evidence request's range.
+    codes_read: ReadPlanRef,
+    /// The pushed expression, evaluated over the values rather than the rows.
+    predicate: Expression,
+}
+
+/// A pushed scalar expression over a dictionary value.
+struct DictExprScanNode { + dict: Arc, + expr: Expression, +} + +struct DictReadPlan { + node: Arc, + values_read: ReadPlanRef, + codes_read: ReadPlanRef, +} + +struct DictExprReadPlan { + node: Arc, + input: ReadPlanRef, +} + +impl DictScanNode { + /// The values relation, read once per query. + async fn values( + &self, + values_read: &dyn DynReadPlan, + io: &FileReader, + state: &DictScanState, + local: &mut ExecutionCtx, + ) -> VortexResult { + if let Some(hit) = state.values.lock().clone() { + return Ok(hit); + } + let selection = Mask::new_true( + usize::try_from(self.values_len) + .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, + ); + let values = values_read + .read_scoped( + 0..self.values_len, + RowScope::selected(&selection), + io, + state.values_state.as_ref(), + local, + ) + .await?; + *state.values.lock() = Some(values.clone()); + Ok(values) + } +} + +impl DictEvidencePlan { + async fn values(&self, io: &FileReader, state: &DictScanState) -> VortexResult { + if let Some(hit) = state.values.lock().clone() { + return Ok(hit); + } + let values = read_dense( + self.values_read.as_ref(), + 0..self.values_len, + io, + state.values_state.as_ref(), + ) + .await?; + *state.values.lock() = Some(values.clone()); + Ok(values) + } + + async fn verdicts( + &self, + io: &FileReader, + state: &DictScanState, + ) -> VortexResult> { + if let Some(hit) = state.verdicts.lock().get(&self.predicate) { + return Ok(Arc::clone(hit)); + } + let values = self.values(io, state).await?; + let mut ctx = io.session().create_execution_ctx(); + let computed = (|| -> VortexResult { + let mask = values + .clone() + .apply(&self.predicate)? + .execute::(&mut ctx)?; + let null_verdict = if self.dtype.is_nullable() { + let null = ConstantArray::new(Scalar::null(self.dtype.clone()), 1).into_array(); + null.apply(&self.predicate)? + .execute::(&mut ctx)? 
+ .value(0) + } else { + false + }; + Ok(ValueVerdicts::Verdicts { mask, null_verdict }) + })(); + let verdicts = Arc::new(match computed { + Ok(verdicts) => verdicts, + Err(error) => { + tracing::debug!( + predicate = %self.predicate, + %error, + "dict value-domain rewrite unanswerable" + ); + ValueVerdicts::Unanswerable + } + }); + state + .verdicts + .lock() + .insert(self.predicate.clone(), Arc::clone(&verdicts)); + Ok(verdicts) + } +} + +impl EvidencePlan for DictEvidencePlan { + type State = DictScanState; + + fn init_state(&self, ctx: &VortexSession) -> VortexResult { + let mut cache = ScanStateCache::default(); + let mut cx = StateCtx::new(ctx, &mut cache); + Ok(DictScanState { + values: Mutex::new(None), + values_state: self.values_read.init_state(&mut cx)?, + codes_state: self.codes_read.init_state(&mut cx)?, + verdicts: Mutex::new(FxHashMap::default()), + }) + } + + fn evidence<'a>( + &'a self, + req: &'a EvidenceRequest<'a>, + io: &'a FileReader, + state: &'a DictScanState, + ) -> BoxFuture<'a, VortexResult>> { + Box::pin(async move { + let verdicts = self.verdicts(io, state).await?; + let ValueVerdicts::Verdicts { mask, null_verdict } = verdicts.as_ref() else { + return Ok(Vec::new()); + }; + let nullable = self.dtype.is_nullable(); + if mask.all_false() && !*null_verdict { + return Ok(vec![EvidenceFragment::new( + req.range.clone(), + PredicateEvidenceKind::AllFalse, + )]); + } + if mask.all_true() && (!nullable || *null_verdict) { + return Ok(vec![EvidenceFragment::new( + req.range.clone(), + PredicateEvidenceKind::AllTrue, + )]); + } + let codes = read_dense( + self.codes_read.as_ref(), + req.range.clone(), + io, + state.codes_state.as_ref(), + ) + .await?; + let mut ctx = io.session().create_execution_ctx(); + let verdict_values = BoolArray::from(mask.to_bit_buffer()).into_array(); + let mut rows = DictArray::try_new(codes.clone(), verdict_values)? 
+ .into_array() + .execute::(&mut ctx)?; + if *null_verdict { + let valid = codes.validity()?.execute_mask(codes.len(), &mut ctx)?; + rows = &rows | &!valid; + } + Ok(vec![EvidenceFragment::new( + req.range.clone(), + PredicateEvidenceKind::ExactMask(rows), + )]) + }) + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "dict") + } +} + +impl ScanNode for DictScanNode { + type State = DictScanState; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + Ok(DictScanState { + values: Mutex::new(None), + values_state: cx.init_node(&self.values)?, + codes_state: cx.init_node(&self.codes)?, + verdicts: Mutex::new(FxHashMap::default()), + }) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + _cx: &mut PushCtx, + ) -> VortexResult> { + if is_root(expr) { + Ok(Some(self)) + } else { + Ok(Some(Arc::new(DictExprScanNode { + dict: self, + expr: expr.clone(), + }))) + } + } + + fn split_hints(&self) -> Option<&[u64]> { + self.codes.split_hints() + } + + fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + let values_read = Arc::clone(&self.values) + .plan_read(cx)? + .ok_or_else(|| vortex_err!("dictionary values did not produce a read plan"))?; + let codes_read = Arc::clone(&self.codes) + .plan_read(cx)? + .ok_or_else(|| vortex_err!("dictionary codes did not produce a read plan"))?; + Ok(Some(Arc::new(DictReadPlan { + node: self, + values_read, + codes_read, + }))) + } + + /// Codes live in this node's row domain and release with it. The + /// cached values relation and per-predicate verdicts stay — they are + /// read once per query by design and consulted by every remaining + /// morsel. 
+ fn release(&self, frontier: u64, state: &DictScanState) -> VortexResult<()> { + self.codes.release(frontier, state.codes_state.as_ref()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "dict(")?; + self.codes.fmt_chain(f)?; + write!(f, ")") + } +} + +impl ScanNode for DictExprScanNode { + type State = DictScanState; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + self.dict.init_state(cx) + } + + fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + let input = Arc::clone(&self.dict).plan_read(cx)?.ok_or_else(|| { + vortex_err!("dictionary expression input did not produce a read plan") + })?; + Ok(Some(Arc::new(DictExprReadPlan { node: self, input }))) + } + + fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + let values_read = Arc::clone(&self.dict.values) + .plan_read(cx)? + .ok_or_else(|| vortex_err!("dictionary values did not produce a read plan"))?; + let codes_read = Arc::clone(&self.dict.codes) + .plan_read(cx)? 
+ .ok_or_else(|| vortex_err!("dictionary codes did not produce a read plan"))?; + Ok(vec![Arc::new(DictEvidencePlan { + dtype: self.dict.dtype.clone(), + values_read, + values_len: self.dict.values_len, + codes_read, + predicate: self.expr.clone(), + })]) + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.dict.release(frontier, state) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "dict_expr({})", self.expr) + } +} + +impl ReadPlan for DictReadPlan { + type State = DictScanState; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + Ok(DictScanState { + values: Mutex::new(None), + values_state: self.values_read.init_state(cx)?, + codes_state: self.codes_read.init_state(cx)?, + verdicts: Mutex::new(FxHashMap::default()), + }) + } + + fn read_scoped<'a>( + &'a self, + range: Range, + rows: RowScope<'a>, + io: &'a FileReader, + state: &'a Self::State, + local: &'a mut ExecutionCtx, + ) -> BoxFuture<'a, VortexResult> { + Box::pin(async move { + let values = self + .node + .values(self.values_read.as_ref(), io, state, local) + .await?; + let codes = self + .codes_read + .read_scoped(range, rows, io, state.codes_state.as_ref(), local) + .await?; + DictArray::try_new(codes, values)?.into_array().optimize() + }) + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.codes_read + .release(frontier, state.codes_state.as_ref()) + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +impl ReadPlan for DictExprReadPlan { + type State = ScanStateRef; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + self.input.init_state(cx) + } + + fn read_scoped<'a>( + &'a self, + range: Range, + rows: RowScope<'a>, + io: &'a FileReader, + state: &'a Self::State, + local: &'a mut ExecutionCtx, + ) -> BoxFuture<'a, VortexResult> { + Box::pin(async move { + let input = self + .input + 
.read_scoped(range, rows, io, state.as_ref(), local) + .await?; + input.apply(&self.node.expr)?.execute::(local) + }) + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.input.release(frontier, state.as_ref()) + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs new file mode 100644 index 00000000000..d88c4ad77d0 --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scan2 rule for flat layouts: one segment, parsed lazily, decoded on +//! demand. +//! +//! A flat leaf exposes no evidence producers — it has no statistics or +//! index — and keeps the default selection path: its segment decodes whole, so a +//! selected read is the dense parse followed by a lazy filter, which +//! vortex pushes through the encodings. 
+ +use std::fmt; +use std::ops::Range; +use std::sync::Arc; + +use futures::future::BoxFuture; +use parking_lot::Mutex; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::serde::SerializedArray; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; + +use crate::LayoutEncodingId; +use crate::LayoutRef; +use crate::layouts::flat::Flat; +use crate::layouts::flat::FlatLayoutEncoding; +use crate::scan::v2::node::ExpandCtx; +use crate::scan::v2::node::FileReader; +use crate::scan::v2::node::LayoutScanRule; +use crate::scan::v2::node::PlanCtx; +use crate::scan::v2::node::ReadPlan; +use crate::scan::v2::node::ReadPlanRef; +use crate::scan::v2::node::RowScope; +use crate::scan::v2::node::ScanNode; +use crate::scan::v2::node::ScanNodeRef; +use crate::scan::v2::node::ScanStateRef; +use crate::scan::v2::node::StateCtx; +use crate::scan::v2::node::downcast_state; +use crate::scan::v2::request::NodeRequest; + +/// Scan2 rule for `vortex.flat`. +#[derive(Debug)] +pub struct FlatScanRule; + +impl LayoutScanRule for FlatScanRule { + type Node = FlatScanNode; + + fn id(&self) -> LayoutEncodingId { + FlatLayoutEncoding.id() + } + + fn expand( + &self, + layout: &LayoutRef, + _req: &mut NodeRequest, + _cx: &ExpandCtx, + ) -> VortexResult { + if !layout.is::() { + vortex_bail!("flat scan2 rule applied to {}", layout.encoding_id()); + } + Ok(FlatScanNode { + layout: Arc::clone(layout), + }) + } +} + +/// Reads a flat layout: fetches its segment once per query, parses it +/// into a (lazy) array, and slices per request. +pub struct FlatScanNode { + layout: LayoutRef, +} + +/// Per-query cache of the parsed (still lazy) array. Concurrent decodes +/// are benign: the segment fetch is deduplicated by the shared segment +/// source, and last-write-wins on the parsed array. 
+#[derive(Default)] +pub struct FlatScanState { + array: Mutex>, +} + +struct FlatReadPlan { + node: Arc, +} + +impl ScanNode for FlatScanNode { + type State = FlatScanState; + + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(FlatScanState::default()) + } + + fn plan_read(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { + Ok(Some(Arc::new(FlatReadPlan { node: self }))) + } + + /// A flat leaf releases only once *wholly* behind the frontier: a + /// partially-covered flat is the working set, and dropping it would + /// thrash the segment fetch. + fn release(&self, frontier: u64, state: &FlatScanState) -> VortexResult<()> { + if frontier >= self.layout.row_count() { + *state.array.lock() = None; + } + Ok(()) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "flat") + } +} + +impl ReadPlan for FlatReadPlan { + type State = ScanStateRef; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + let node: ScanNodeRef = Arc::::clone(&self.node); + cx.init_node(&node) + } + + fn read_scoped<'a>( + &'a self, + range: Range, + rows: RowScope<'a>, + io: &'a FileReader, + state: &'a Self::State, + _local: &'a mut ExecutionCtx, + ) -> BoxFuture<'a, VortexResult> { + let state = match downcast_state::(state.as_ref()) { + Ok(state) => state, + Err(e) => return Box::pin(async move { Err(e) }), + }; + Box::pin(async move { + let array = if let Some(hit) = state.array.lock().clone() { + hit + } else { + let decoded = decode_flat(&self.node.layout, io).await?; + *state.array.lock() = Some(decoded.clone()); + decoded + }; + let dense = slice_to_range(array, &range)?; + if rows.selection.len() != dense.len() { + vortex_bail!( + "selection length {} does not match read range length {}", + rows.selection.len(), + dense.len() + ); + } + if rows.demand.len() != dense.len() { + vortex_bail!( + "demand length {} does not match read range length {}", + rows.demand.len(), + dense.len() + ); + } + if rows.selection.all_true() { 
+ return Ok(dense); + } + dense.filter(rows.selection.clone()) + }) + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.node + .release(frontier, downcast_state::(state.as_ref())?) + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.node.fmt_chain(f) + } +} + +pub(crate) async fn decode_flat(layout: &LayoutRef, io: &FileReader) -> VortexResult { + let Some(flat) = layout.as_opt::() else { + vortex_bail!("expected flat layout, got {}", layout.encoding_id()); + }; + let row_count = usize::try_from(layout.row_count()) + .map_err(|_| vortex_err!("layout row count exceeds usize"))?; + let segment = io.segments().request(flat.segment_id()).await?; + let parts = if let Some(tree) = flat.array_tree() { + SerializedArray::from_flatbuffer_and_segment(tree.clone(), segment)? + } else { + SerializedArray::try_from(segment)? + }; + parts.decode(layout.dtype(), row_count, flat.array_ctx(), io.session()) +} + +pub(crate) fn slice_to_range(array: ArrayRef, range: &Range) -> VortexResult { + let start = usize::try_from(range.start).map_err(|_| vortex_err!("row range exceeds usize"))?; + let end = usize::try_from(range.end).map_err(|_| vortex_err!("row range exceeds usize"))?; + if start == 0 && end == array.len() { + return Ok(array); + } + array.slice(start..end) +} diff --git a/vortex-layout/src/scan/v2/layouts/mod.rs b/vortex-layout/src/scan/v2/layouts/mod.rs new file mode 100644 index 00000000000..8875858389b --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/mod.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Built-in scan2 layout rules. 
+ +pub mod chunked; +pub mod dict; +pub mod flat; +pub mod struct_; +pub mod zoned; diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs new file mode 100644 index 00000000000..cbc62f29e58 --- /dev/null +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scan2 rule for struct layouts: plans field access expressions. +//! +//! A struct node treats field access as scalar expression pushdown: +//! `get_item(field, root())` pushes to the field child, and `select(...)` +//! becomes a virtual struct node assembled from pushed child nodes. + +use std::fmt; +use std::sync::Arc; + +use parking_lot::Mutex; +use rustc_hash::FxHashMap; +use vortex_array::dtype::FieldName; +use vortex_array::dtype::FieldNames; +use vortex_array::expr::Expression; +use vortex_array::expr::get_item; +use vortex_array::expr::is_root; +use vortex_array::expr::root; +use vortex_array::expr::transform::replace; +use vortex_array::scalar_fn::fns::get_item::GetItem; +use vortex_array::scalar_fn::fns::root::Root; +use vortex_array::scalar_fn::fns::select::Select; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use crate::LayoutChildType; +use crate::LayoutEncodingId; +use crate::LayoutRef; +use crate::layouts::struct_::StructLayoutEncoding; +use crate::scan::v2::node::ApplyScanNode; +use crate::scan::v2::node::ExpandCtx; +use crate::scan::v2::node::LayoutScanRule; +use crate::scan::v2::node::PlanCtx; +use crate::scan::v2::node::PushCtx; +use crate::scan::v2::node::ReadPlanRef; +use crate::scan::v2::node::ScanNode; +use crate::scan::v2::node::ScanNodeRef; +use crate::scan::v2::node::StateCtx; +use crate::scan::v2::node::StructValueScanNode; +use crate::scan::v2::referenced_fields; +use crate::scan::v2::request::NodeRequest; +use crate::scan::v2::struct_fields; + +/// Scan2 rule for `vortex.struct`. 
+#[derive(Debug)] +pub struct StructScanRule; + +impl LayoutScanRule for StructScanRule { + type Node = StructScanNode; + + fn id(&self) -> LayoutEncodingId { + StructLayoutEncoding.id() + } + + fn expand( + &self, + layout: &LayoutRef, + _req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult { + let validity = layout + .dtype() + .is_nullable() + .then(|| cx.expand(&layout.child(0)?, &mut NodeRequest::empty())) + .transpose()?; + Ok(StructScanNode { + layout: Arc::clone(layout), + cx: cx.clone(), + children: Mutex::new(FxHashMap::default()), + validity, + }) + } +} + +/// Plans struct field expressions through child scan nodes. +pub struct StructScanNode { + layout: LayoutRef, + cx: ExpandCtx, + children: Mutex>, + validity: Option, +} + +impl ScanNode for StructScanNode { + type State = (); + + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult<()> { + Ok(()) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + cx: &mut PushCtx, + ) -> VortexResult> { + let scope = struct_fields(self.layout.dtype())?; + if is_root(expr) { + return self.push_struct(scope.names().clone(), cx).map(Some); + } + if let Some(name) = root_field(expr) { + let child = self.child_field(name)?; + return child.try_push_expr(&root(), cx); + } + if let Some(selection) = expr.as_opt::() && expr.child(0).is::() @@ -112,7 +113,7 @@ impl ScanNode for StructScanNode { if let [name] = fields.as_slice() { let scoped = replace(expr.clone(), &get_item(name.clone(), root()), root()); let child = self.child_field(name)?; - return child.try_push_expr(&scoped, cx); + return Ok(self.apply_validity(child.try_push_expr(&scoped, cx)?)); } let input = self.push_struct(fields.clone().into(), cx)?; Ok(Some(Arc::new(ApplyScanNode::new(input, expr.clone())))) @@ -128,6 +129,21 @@ impl ScanNode for StructScanNode { } impl StructScanNode { + /// Apply this struct's validity to a pushed single-field node. 
+ /// + /// The single-field fast paths route straight to a child node, bypassing + /// the parent struct's validity. When the struct is nullable we wrap the + /// child in a [`MaskScanNode`] so the parent's null mask is applied to the + /// child result, mirroring the v1 struct reader's `array.mask(validity)`. + fn apply_validity(&self, pushed: Option) -> Option { + match (pushed, &self.validity) { + (Some(node), Some(validity)) => { + Some(Arc::new(MaskScanNode::new(node, Arc::clone(validity)))) + } + (pushed, _) => pushed, + } + } + fn child_field(&self, name: &FieldName) -> VortexResult { if let Some(hit) = self.children.lock().get(name) { return Ok(Arc::clone(hit)); diff --git a/vortex-layout/src/scan/v2/node.rs b/vortex-layout/src/scan/v2/node.rs index 9a111513247..ee3f24c9ba3 100644 --- a/vortex-layout/src/scan/v2/node.rs +++ b/vortex-layout/src/scan/v2/node.rs @@ -33,6 +33,7 @@ use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::arrays::StructArray; +use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::FieldNames; use vortex_array::expr::Expression; use vortex_array::expr::is_root; @@ -1169,6 +1170,120 @@ impl ReadPlan for ApplyReadPlan { } } +/// Virtual node that applies a parent struct's validity to another node's root +/// value. +/// +/// Reads the `input` value and a non-nullable boolean `validity` array in the +/// same row domain and produces `mask(input, validity)`: rows where validity is +/// false become null. This mirrors the v1 struct reader's `array.mask(validity)` +/// behaviour when a single field is projected out of a nullable struct. +pub struct MaskScanNode { + input: ScanNodeRef, + validity: ScanNodeRef, +} + +impl MaskScanNode { + /// Create a node that masks `input` with a parent struct's `validity`. + /// + /// `validity` must read a non-nullable boolean array in the same row domain + /// as `input` (the struct layout's validity child). 
+ pub fn new(input: ScanNodeRef, validity: ScanNodeRef) -> Self { + Self { input, validity } + } +} + +/// Per-query state for a [`MaskScanNode`]. +pub struct MaskState { + input: ScanStateRef, + validity: ScanStateRef, +} + +struct MaskReadPlan { + node: Arc, + input: ReadPlanRef, + validity: ReadPlanRef, +} + +impl ScanNode for MaskScanNode { + type State = MaskState; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + Ok(MaskState { + input: cx.init_node(&self.input)?, + validity: cx.init_node(&self.validity)?, + }) + } + + fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + let input = Arc::clone(&self.input) + .plan_read(cx)? + .ok_or_else(|| vortex_err!("mask input did not produce a read plan"))?; + let validity = Arc::clone(&self.validity) + .plan_read(cx)? + .ok_or_else(|| vortex_err!("mask validity did not produce a read plan"))?; + Ok(Some(Arc::new(MaskReadPlan { + node: self, + input, + validity, + }))) + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.input.release(frontier, state.input.as_ref())?; + self.validity.release(frontier, state.validity.as_ref()) + } + + fn split_hints(&self) -> Option<&[u64]> { + self.input.split_hints() + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "mask:")?; + self.input.fmt_chain(f) + } +} + +impl ReadPlan for MaskReadPlan { + type State = MaskState; + + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + Ok(MaskState { + input: self.input.init_state(cx)?, + validity: self.validity.init_state(cx)?, + }) + } + + fn read_scoped<'a>( + &'a self, + range: Range, + rows: RowScope<'a>, + io: &'a FileReader, + state: &'a Self::State, + local: &'a mut ExecutionCtx, + ) -> BoxFuture<'a, VortexResult> { + Box::pin(async move { + let input = self + .input + .read_scoped(range.clone(), rows, io, state.input.as_ref(), local) + .await?; + let validity = self + .validity + .read_scoped(range, rows, io, 
state.validity.as_ref(), local) + .await?; + input.mask(validity)?.execute::(local) + }) + } + + fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { + self.input.release(frontier, state.input.as_ref())?; + self.validity.release(frontier, state.validity.as_ref()) + } + + fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ScanNode::fmt_chain(self.node.as_ref(), f) + } +} + /// Executable predicate evidence for one planned predicate expression. pub trait EvidencePlan: 'static + Send + Sync { /// The per-query state this evidence plan executes against. From 12fb901cddf11acc0933a8ea17f1d066f9f43db5 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 17 Jun 2026 22:00:30 -0400 Subject: [PATCH 05/48] perf(scan-v2): order conjuncts by cost and evaluate residual filters filter-first Port of the V1 multi-conjunct filter behavior to the V2 PartitionWorkScheduler driver: (1) sort filter conjuncts cheapest-first in PreparedScanNodeFile::try_new so expensive residuals (e.g. FSST LIKE) run after cheap selective ones; (2) when the demanded-row density falls below EXPR_EVAL_THRESHOLD (0.2), read the residual predicate with selection=need so the leaf returns the compacted array and the expression evaluates over only the demanded rows, scattering the verdict back via Mask::intersect_by_rank. Adds V1-vs-V2 differential cases (low- and high-density multi-conjunct) and a predicate_cost unit test. Improves ClickBench multi-conjunct filters (q22 701->547ms, q23 now < V1). A separate single-LIKE FSST amplification (q21) remains and is tracked separately. 
Signed-off-by: Nicholas Gates Co-Authored-By: Claude Opus 4.8 --- vortex-file/src/multi/scan_v2.rs | 121 ++++++++++++++++++--- vortex-file/src/scan_v1_v2_differential.rs | 70 ++++++++++++ 2 files changed, 174 insertions(+), 17 deletions(-) diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index c01d34a24f4..4ee19c6d16b 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -110,6 +110,10 @@ use crate::VortexOpenOptions; const DEFAULT_CONCURRENCY: usize = 8; const FALLBACK_SPLIT_SIZE: u64 = 100_000; +/// Below this demanded-row density, evaluate a residual predicate over only the demanded rows +/// (filter-first) rather than the whole morsel. Mirrors the V1 flat-reader threshold. +const EXPR_EVAL_THRESHOLD: f64 = 0.2; + struct FileStatsScanNode { data: ScanNodeRef, stats: Arc, @@ -321,6 +325,36 @@ fn root_field(expr: &Expression) -> Option<&FieldName> { expr.child(0).is::().then_some(name) } +/// Static cost estimate for a filter conjunct, used to order predicate evaluation cheapest-first. +/// +/// We sum a per-node cost over the whole expression tree. Primitive comparisons, null checks and +/// data access (`vortex.binary`, `vortex.between`, `vortex.is_null`, `vortex.get_item`, ...) are +/// cheap; per-row string/byte work (`vortex.like`, `vortex.byte_length`, `vortex.list.contains`) +/// and opaque/dynamic functions are expensive. Unrecognized functions get a moderate cost so they +/// sort after primitives but ahead of known-expensive matchers. +fn predicate_cost(expr: &Expression) -> u64 { + fn node_cost(expr: &Expression) -> u64 { + match expr.id().as_str() { + // Free or near-free structural / access nodes. + "vortex.root" | "vortex.literal" | "vortex.get_item" => 0, + // Cheap primitive predicates. 
+ "vortex.binary" | "vortex.between" | "vortex.is_null" | "vortex.is_not_null" + | "vortex.not" | "vortex.fill_null" | "vortex.cast" => 1, + // Expensive per-row string / byte / matching work, and fallible UDFs. + "vortex.like" | "vortex.byte_length" | "vortex.list.contains" => 100, + "vortex.dynamic" | "vortex.variant_get" | "vortex.parquet.variant" => 100, + // Unknown functions: more expensive than primitives, cheaper than known matchers. + _ => 10, + } + } + + let mut cost = node_cost(expr); + for child in expr.children().iter() { + cost = cost.saturating_add(predicate_cost(child)); + } + cost +} + fn absent_statistics(funcs: &[AggregateFnRef]) -> Vec> { funcs.iter().map(|_| Precision::Absent).collect() } @@ -1707,10 +1741,13 @@ impl PreparedScanNodeFile { let projection_state = projection_plan.init_state(&mut state_ctx)?; let mut evidence_state_cache: HashMap = HashMap::default(); - let predicates = filter - .as_ref() - .map(conjuncts) - .unwrap_or_default() + // Run cheap, likely-selective conjuncts first so an expensive residual (e.g. an FSST `LIKE`) + // only evaluates over the rows that survive the cheaper predicates. AND is commutative, so + // reordering is semantically safe; `PredicateId`s are assigned by final slot below (after the + // sort) so each predicate's evidence/read stay self-consistent with its id. 
+ let mut ordered_conjuncts = filter.as_ref().map(conjuncts).unwrap_or_default(); + ordered_conjuncts.sort_by_cached_key(predicate_cost); + let predicates = ordered_conjuncts .into_iter() .enumerate() .map(|(idx, expr)| { @@ -1911,21 +1948,48 @@ impl PreparedScanNodeFile { self.session.handle(), registered, async move { - let full_domain = Mask::new_true(len); - let rows = RowScope::try_new(&full_domain, &need)?; let predicate = &prepared.predicates[predicate_idx]; let mut ctx = prepared.session.create_execution_ctx(); - let result = predicate - .read - .read_scoped( - range, - rows, - &prepared.reader, - predicate.read_state.as_ref(), - &mut ctx, - ) - .await? - .execute::(&mut ctx)?; + // Filter-first: when few rows are demanded, read with selection = `need` so the leaf + // returns the compacted (filtered) array and an expensive residual (e.g. an FSST + // `LIKE`) evaluates over only `need.true_count()` rows. The compacted verdict is + // scattered back into the morsel domain via `intersect_by_rank`, giving a full-length + // mask identical to the dense path's `result & need`. Mirrors V1's flat-reader gate. + let result = if need.density() < EXPR_EVAL_THRESHOLD { + let compact = predicate + .read + .read_scoped( + range, + RowScope::selected(&need), + &prepared.reader, + predicate.read_state.as_ref(), + &mut ctx, + ) + .await? + .execute::(&mut ctx)?; + if compact.len() != need.true_count() { + vortex_bail!( + "compacted residual result length {} does not match demanded row count {}", + compact.len(), + need.true_count() + ); + } + need.intersect_by_rank(&compact) + } else { + let full_domain = Mask::new_true(len); + let rows = RowScope::try_new(&full_domain, &need)?; + predicate + .read + .read_scoped( + range, + rows, + &prepared.reader, + predicate.read_state.as_ref(), + &mut ctx, + ) + .await? + .execute::(&mut ctx)? 
+ }; Ok(PredicateWorkOutput { morsel_id, predicate_idx, @@ -2090,3 +2154,26 @@ fn limit_mask(mask: Mask, remaining: &AtomicU64) -> VortexResult { (0..mask.len()).filter(|idx| mask.value(*idx)).take(take), )) } + +#[cfg(test)] +mod tests { + use vortex_array::expr::get_item; + use vortex_array::expr::like; + use vortex_array::expr::lit; + use vortex_array::expr::not_eq; + use vortex_array::expr::root; + + use super::predicate_cost; + + #[test] + fn predicate_cost_orders_cheap_before_expensive() { + let cheap = not_eq(get_item("search", root()), lit("")); + let expensive = like(get_item("url", root()), lit("%google%")); + assert!( + predicate_cost(&cheap) < predicate_cost(&expensive), + "primitive comparison must be cheaper than LIKE: cheap={}, expensive={}", + predicate_cost(&cheap), + predicate_cost(&expensive), + ); + } +} diff --git a/vortex-file/src/scan_v1_v2_differential.rs b/vortex-file/src/scan_v1_v2_differential.rs index 2dba42348d2..a21fef9a18d 100644 --- a/vortex-file/src/scan_v1_v2_differential.rs +++ b/vortex-file/src/scan_v1_v2_differential.rs @@ -159,6 +159,52 @@ fn zoned() -> ArrayRef { .into_array() } +/// A `keep` flag column plus a `name` string column, for multi-conjunct filter tests: +/// `id != 0` is a cheap, selective predicate; `name LIKE '%match%'` is the expensive +/// residual that should run filter-first once `id` has narrowed the demanded rows. +fn id_and_name(keep: &[u32], names: &[&str]) -> ArrayRef { + StructArray::from_fields(&[ + ( + "id", + PrimitiveArray::from_iter(keep.iter().copied()).into_array(), + ), + ( + "name", + VarBinViewArray::from_iter_str(names.iter().copied()).into_array(), + ), + ]) + .unwrap() + .into_array() +} + +/// 16 names where most rows contain the `match` needle (decoys), so a residual `LIKE` +/// that ignored the cheaper predicate would diverge from V1. 
+const MULTI_CONJUNCT_NAMES: [&str; 16] = [ + "row0_match", + "row1_match", + "no_hit_here", + "row3_match", + "row4_match", + "row5_match", + "row6_match", + "row7_match", + "row8_match", + "has_match_inside", + "row10_match", + "row11_match", + "row12_match", + "row13_match", + "row14_match", + "row15_match", +]; + +fn multi_conjunct_filter() -> Expression { + vortex_array::expr::and( + vortex_array::expr::not_eq(get_item("id", root()), lit(0u32)), + vortex_array::expr::like(get_item("name", root()), lit("%match%")), + ) +} + /// Outer struct is non-nullable (so the file writes), but it contains a nullable /// nested struct `a` with a non-nullable field `b.c`. Projecting `a.b.c` (or /// selecting `c` out of `a.b`) must preserve the nulls of the nullable `a.b` @@ -242,6 +288,30 @@ async fn differential_zoned_filter() -> VortexResult<()> { assert_v1_eq_v2(&file, request(root(), Some(filter))).await } +/// Low-density multi-conjunct filter: `id != 0` keeps 2/16 rows (density 0.125 < 0.2), +/// so the expensive `name LIKE '%match%'` runs filter-first over only the demanded rows +/// and its compacted verdict is scattered back. Asserted against the V1 reference, which +/// catches any off-by-rank error in the scatter-back. +#[tokio::test] +async fn differential_multi_conjunct_filter_first() -> VortexResult<()> { + let mut keep = [0u32; 16]; + keep[2] = 1; + keep[9] = 1; + let file = write_file(id_and_name(&keep, &MULTI_CONJUNCT_NAMES), false).await?; + assert_v1_eq_v2(&file, request(root(), Some(multi_conjunct_filter()))).await +} + +/// High-density multi-conjunct filter: `id != 0` keeps 14/16 rows (density 0.875 > 0.2), +/// so the residual takes the dense path. Must still match V1. 
+#[tokio::test] +async fn differential_multi_conjunct_dense() -> VortexResult<()> { + let mut keep = [1u32; 16]; + keep[2] = 0; + keep[9] = 0; + let file = write_file(id_and_name(&keep, &MULTI_CONJUNCT_NAMES), false).await?; + assert_v1_eq_v2(&file, request(root(), Some(multi_conjunct_filter()))).await +} + /// Reproduces the struct-null bug: projecting a single deep field out of a /// nullable nested struct must apply the parent struct's validity. The V2 /// single-field fast path previously bypassed `self.validity`. From abfe671b062802327460195598eca76f140a6bb5 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 18 Jun 2026 14:27:24 -0400 Subject: [PATCH 06/48] perf(scan-v2): distribute scan morsels across DataFusion partitions V2 parallelizes the join probe, aggregate, and Arrow decode ACROSS DataFusion partitions (V1 instead fans one partition into many split tasks). When a query projected a heavily-encoded column (e.g. a single RunEnd chunk for lineitem.l_orderkey), the opener fed split_aligned_row_range coarse chunk boundaries, which collapsed every byte-range file_group onto one partition and serialized the probe ~2-wide (TPC-H q4 ran 2.6x slower than V1). Feed split_aligned_row_range the scan's own morsel ranges instead: the read-column chunk hints, or the 100k-row fallback when a read column is a single chunk (mirroring PreparedScanNodeFile::splits). Each morsel lands wholly in one partition, so the scan spreads across all of DataFusion's byte-range file_groups with no collapse and no chunk straddling a partition boundary. The assignment is contiguous per partition, so it is correct even when the scan output must preserve order. Also run the Vortex->Arrow conversion on the runtime CPU pool (handle.spawn_cpu + buffered/buffer_unordered) so decode fans out within a partition rather than running serially on the consumer poll thread. TPC-H SF1 (datafusion-bench, VORTEX_SCAN_IMPL=v2): q4 goes from 2.6x slower than V1 to faster than V1; overall ~parity. 
Signed-off-by: Nicholas Gates Co-Authored-By: Claude Opus 4.8 --- vortex-datafusion/src/persistent/opener.rs | 248 +++++++++++++-------- 1 file changed, 159 insertions(+), 89 deletions(-) diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 92325c782b9..9b8b418942b 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -39,11 +39,14 @@ use tracing::Instrument; use vortex::array::VortexSessionExecute; use vortex::array::arrow::ArrowSessionExt; use vortex::dtype::FieldMask; +use vortex::dtype::Nullability; use vortex::error::VortexError; use vortex::error::VortexExpect; +use vortex::expr::pack; use vortex::file::OpenOptionsSessionExt; use vortex::file::VortexFile; use vortex::io::InstrumentedReadAt; +use vortex::io::session::RuntimeSessionExt; use vortex::layout::LayoutReader; use vortex::layout::scan::scan_builder::ScanBuilder; use vortex::layout::scan::split_by::SplitBy; @@ -53,6 +56,7 @@ use vortex::scan::ScanRequest; use vortex::session::VortexSession; use vortex_utils::aliases::dash_map::DashMap; use vortex_utils::aliases::dash_map::Entry; +use vortex_utils::parallelism::get_available_parallelism; use crate::VortexAccessPlan; use crate::convert::exprs::ExpressionConvertor; @@ -351,18 +355,36 @@ impl FileOpener for VortexOpener { if byte_range.start == 0 && byte_range.end == file.object_meta.size { None } else { - let natural_split_ranges = scan_node_natural_split_ranges_for_file( + // Distribute the scan's own morsels across DataFusion's byte-range + // file_groups. The morsels are the units the scan actually reads (read-column + // chunk hints, or the 100k-row fallback for single-chunk columns), so each + // morsel lands wholly in one partition: no collapse onto a single partition + // (which serialized the probe), and no chunk straddling a partition boundary + // (which would re-decode it). 
V2 needs this because it parallelizes the + // scan/probe ACROSS DataFusion partitions, unlike V1 which fans out within + // one partition. + let read_expr = match &filter { + Some(filter) => pack( + [ + ("projection", scan_projection.clone()), + ("filter", filter.clone()), + ], + Nullability::NonNullable, + ), + None => scan_projection.clone(), + }; + let morsels = scan_node_morsel_ranges_for_file( natural_split_ranges.as_ref(), &file.object_meta.location, &vxf, - &scan_projection, + &read_expr, ) .await?; let Some(row_range) = split_aligned_row_range( byte_range, file.object_meta.size, - natural_split_ranges.as_ref(), + morsels.as_ref(), ) else { return Ok(stream::empty().boxed()); }; @@ -393,8 +415,17 @@ impl FileOpener for VortexOpener { .map_err(|e| { exec_datafusion_err!("Failed to create Vortex scan2 stream: {e}") })?; - let stream = array_stream - .map(move |chunk| { + // The Vortex->Arrow conversion (decode + canonicalize) is CPU-bound, so spawn each + // chunk's conversion onto the runtime's CPU pool and buffer them. This fans the + // decode out within a single partition instead of running serially on the consumer's + // poll thread, which matters for scans with few partitions (e.g. small tables). + // `buffered` preserves order for ordered consumers. + let handle = session.handle(); + let decode_concurrency = 4 * get_available_parallelism().unwrap_or(1); + let converted = array_stream.map(move |chunk| { + let session = session.clone(); + let stream_target_field = stream_target_field.clone(); + handle.spawn_cpu(move || { let chunk = chunk?; let mut ctx = session.create_execution_ctx(); let arrow_session = ctx.session().clone(); @@ -405,41 +436,47 @@ impl FileOpener for VortexOpener { )?; Ok(RecordBatch::from(arrow.as_struct().clone())) }) - .map_ok(move |rb| { - // We try and slice the stream into respecting datafusion's configured batch size. 
- stream::iter( - (0..rb.num_rows().div_ceil(batch_size * 2)) - .flat_map(move |block_idx| { - let offset = block_idx * batch_size * 2; - - // If we have less than two batches worth of rows left, we keep them together as a single batch. - if rb.num_rows() - offset < 2 * batch_size { - let length = rb.num_rows() - offset; - [Some(rb.slice(offset, length)), None].into_iter() - } else { - let first = rb.slice(offset, batch_size); - let second = rb.slice(offset + batch_size, batch_size); - [Some(first), Some(second)].into_iter() - } - }) - .flatten() - .map(Ok), - ) - }) - .map_err(move |e: VortexError| { - DataFusionError::External(Box::new( - e.with_context(format!("Failed to read Vortex file: {file_location}")), - )) - }) - .try_flatten() - .map(move |batch| { - if projector.projection().as_ref().is_empty() { - batch - } else { - batch.and_then(|b| projector.project_batch(&b)) - } - }) - .boxed(); + }); + let stream = if has_output_ordering { + converted.buffered(decode_concurrency).boxed() + } else { + converted.buffer_unordered(decode_concurrency).boxed() + } + .map_ok(move |rb| { + // We try and slice the stream into respecting datafusion's configured batch size. + stream::iter( + (0..rb.num_rows().div_ceil(batch_size * 2)) + .flat_map(move |block_idx| { + let offset = block_idx * batch_size * 2; + + // If we have less than two batches worth of rows left, we keep them together as a single batch. 
+ if rb.num_rows() - offset < 2 * batch_size { + let length = rb.num_rows() - offset; + [Some(rb.slice(offset, length)), None].into_iter() + } else { + let first = rb.slice(offset, batch_size); + let second = rb.slice(offset + batch_size, batch_size); + [Some(first), Some(second)].into_iter() + } + }) + .flatten() + .map(Ok), + ) + }) + .map_err(move |e: VortexError| { + DataFusionError::External(Box::new( + e.with_context(format!("Failed to read Vortex file: {file_location}")), + )) + }) + .try_flatten() + .map(move |batch| { + if projector.projection().as_ref().is_empty() { + batch + } else { + batch.and_then(|b| projector.project_batch(&b)) + } + }) + .boxed(); return if let Some(file_pruner) = file_pruner { Ok(PrunableStream::new(file_pruner, stream).boxed()) @@ -449,32 +486,8 @@ impl FileOpener for VortexOpener { } // We share our layout readers with others partitions in the scan, so we can only need to read each layout in each file once. - let layout_reader = match layout_readers.entry(file.object_meta.location.clone()) { - Entry::Occupied(mut occupied_entry) => { - if let Some(reader) = occupied_entry.get().upgrade() { - tracing::trace!("reusing layout reader for {}", occupied_entry.key()); - reader - } else { - tracing::trace!("creating layout reader for {}", occupied_entry.key()); - let reader = vxf.layout_reader().map_err(|e| { - DataFusionError::Execution(format!( - "Failed to create layout reader: {e}" - )) - })?; - occupied_entry.insert(Arc::downgrade(&reader)); - reader - } - } - Entry::Vacant(vacant_entry) => { - tracing::trace!("creating layout reader for {}", vacant_entry.key()); - let reader = vxf.layout_reader().map_err(|e| { - DataFusionError::Execution(format!("Failed to create layout reader: {e}")) - })?; - vacant_entry.insert(Arc::downgrade(&reader)); - - reader - } - }; + let layout_reader = + layout_reader_for_file(layout_readers.as_ref(), &file.object_meta.location, &vxf)?; let mut scan_builder = ScanBuilder::new(session.clone(), 
Arc::clone(&layout_reader)); @@ -586,41 +599,46 @@ impl FileOpener for VortexOpener { } } -fn natural_split_ranges_for_file( - natural_split_ranges: &DashMap]>>, +/// Get or create a shared layout reader for a file. Layout readers are cached (weakly) per path so +/// each file's layout is parsed only once across all partitions of a scan. +fn layout_reader_for_file( + layout_readers: &DashMap>, path: &Path, - layout_reader: &Arc, -) -> DFResult]>> { - if let Some(split_ranges) = natural_split_ranges.get(path) { - return Ok(Arc::clone(split_ranges.value())); - } - - let split_ranges = compute_natural_split_ranges(layout_reader.as_ref())?; - - match natural_split_ranges.entry(path.clone()) { - Entry::Occupied(entry) => Ok(Arc::clone(entry.get())), - Entry::Vacant(entry) => { - entry.insert(Arc::clone(&split_ranges)); - Ok(split_ranges) + vxf: &VortexFile, +) -> DFResult> { + let create = || { + vxf.layout_reader() + .map_err(|e| DataFusionError::Execution(format!("Failed to create layout reader: {e}"))) + }; + + match layout_readers.entry(path.clone()) { + Entry::Occupied(mut occupied_entry) => { + if let Some(reader) = occupied_entry.get().upgrade() { + Ok(reader) + } else { + let reader = create()?; + occupied_entry.insert(Arc::downgrade(&reader)); + Ok(reader) + } + } + Entry::Vacant(vacant_entry) => { + let reader = create()?; + vacant_entry.insert(Arc::downgrade(&reader)); + Ok(reader) } } } -async fn scan_node_natural_split_ranges_for_file( +fn natural_split_ranges_for_file( natural_split_ranges: &DashMap]>>, path: &Path, - file: &VortexFile, - projection: &vortex::expr::Expression, + layout_reader: &Arc, ) -> DFResult]>> { if let Some(split_ranges) = natural_split_ranges.get(path) { return Ok(Arc::clone(split_ranges.value())); } - let split_ranges = file - .plan_splits(projection) - .await - .map(Arc::from) - .map_err(|e| exec_datafusion_err!("Failed to compute Vortex scan2 natural splits: {e}"))?; + let split_ranges = 
compute_natural_split_ranges(layout_reader.as_ref())?; match natural_split_ranges.entry(path.clone()) { Entry::Occupied(entry) => Ok(Arc::clone(entry.get())), @@ -645,6 +663,58 @@ fn compute_natural_split_ranges(layout_reader: &dyn LayoutReader) -> DFResult]>>, + path: &Path, + file: &VortexFile, + read_expr: &vortex::expr::Expression, +) -> DFResult]>> { + if let Some(ranges) = morsel_ranges.get(path) { + return Ok(Arc::clone(ranges.value())); + } + + let chunks = file + .plan_splits(read_expr) + .await + .map_err(|e| exec_datafusion_err!("Failed to compute Vortex scan2 splits: {e}"))?; + + let ranges: Arc<[Range]> = if chunks.len() > 1 { + chunks.into() + } else { + // Single chunk (or none): mirror the scan's fallback of FALLBACK_SPLIT_SIZE-row morsels so + // a single large chunk still spreads across partitions. + let row_count = file.row_count(); + let mut ranges = Vec::new(); + let mut start = 0u64; + while start < row_count { + let end = start + .saturating_add(SCAN_FALLBACK_SPLIT_SIZE) + .min(row_count); + ranges.push(start..end); + start = end; + } + ranges.into() + }; + + match morsel_ranges.entry(path.clone()) { + Entry::Occupied(entry) => Ok(Arc::clone(entry.get())), + Entry::Vacant(entry) => { + entry.insert(Arc::clone(&ranges)); + Ok(ranges) + } + } +} + /// Translate a DataFusion byte range to the contiguous natural split ranges it owns. /// Most splits are assigned by midpoint, but the leading split stays with the range that owns /// byte 0 so a tiny first byte range still claims the first rows. 
From 426fd3d0564e04460403e285503f299f23fba552 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 18 Jun 2026 14:27:32 -0400 Subject: [PATCH 07/48] bench(datafusion): dump per-operator annotated plan under VORTEX_BENCH_FULL_PLAN With --show-metrics and VORTEX_BENCH_FULL_PLAN=1, print the DataFusion EXPLAIN ANALYZE-style annotated plan (elapsed_compute / output_rows per operator) to stderr, to localize where wall time goes across scan, HashJoin build/probe, and aggregate. Signed-off-by: Nicholas Gates Co-Authored-By: Claude Opus 4.8 --- benchmarks/datafusion-bench/src/main.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/benchmarks/datafusion-bench/src/main.rs b/benchmarks/datafusion-bench/src/main.rs index 4e8e98ef3a7..043fe26e494 100644 --- a/benchmarks/datafusion-bench/src/main.rs +++ b/benchmarks/datafusion-bench/src/main.rs @@ -379,7 +379,21 @@ pub async fn execute_query( /// Print Vortex metrics from execution plans. fn print_metrics(plans: &[(usize, Format, Arc)]) { + // VORTEX_BENCH_FULL_PLAN=1 dumps the full per-operator annotated plan (DataFusion + // EXPLAIN ANALYZE-style: elapsed_compute / output_rows per operator), to localize where + // wall time goes (scan vs HashJoin build/probe vs aggregate). 
+ let full_plan = std::env::var_os("VORTEX_BENCH_FULL_PLAN").is_some(); for (query_idx, format, plan) in plans { + if full_plan { + eprintln!("=== annotated plan query={query_idx}, {format} ==="); + eprintln!( + "{}", + datafusion_physical_plan::display::DisplayableExecutionPlan::with_metrics( + plan.as_ref() + ) + .indent(true) + ); + } let metric_sets = VortexMetricsFinder::find_all(plan.as_ref()); if metric_sets.is_empty() { continue; From 463e6b6f0f13d11f76b4ec86c96e0d5f9c37f6b5 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Thu, 18 Jun 2026 15:54:58 -0400 Subject: [PATCH 08/48] Add v2 layout vtable scan path Signed-off-by: Nicholas Gates --- vortex-file/src/footer/mod.rs | 18 +- vortex-file/src/multi/scan_v2.rs | 6 +- vortex-layout/src/layout_v2.rs | 1034 ++++++++++++++++++ vortex-layout/src/lib.rs | 1 + vortex-layout/src/scan/v2/layouts/chunked.rs | 55 +- vortex-layout/src/scan/v2/layouts/dict.rs | 54 +- vortex-layout/src/scan/v2/layouts/flat.rs | 55 +- vortex-layout/src/scan/v2/layouts/mod.rs | 2 +- vortex-layout/src/scan/v2/layouts/struct_.rs | 56 +- vortex-layout/src/scan/v2/layouts/zoned.rs | 62 +- vortex-layout/src/scan/v2/mod.rs | 3 +- vortex-layout/src/scan/v2/node.rs | 84 +- vortex-layout/src/scan/v2/request.rs | 2 +- vortex-layout/src/scan/v2/session.rs | 59 +- vortex-layout/src/session.rs | 43 +- 15 files changed, 1217 insertions(+), 317 deletions(-) create mode 100644 vortex-layout/src/layout_v2.rs diff --git a/vortex-file/src/footer/mod.rs b/vortex-file/src/footer/mod.rs index ac2153f7cd1..6fdf9597c6a 100644 --- a/vortex-file/src/footer/mod.rs +++ b/vortex-file/src/footer/mod.rs @@ -35,6 +35,7 @@ use vortex_flatbuffers::footer as fb; use vortex_layout::LayoutEncodingId; use vortex_layout::LayoutRef; use vortex_layout::layout_from_flatbuffer_with_options; +use vortex_layout::layout_v2; use vortex_layout::session::LayoutSessionExt; use vortex_session::VortexSession; use vortex_session::registry::ReadContext; @@ -43,6 +44,7 @@ use 
vortex_session::registry::ReadContext; #[derive(Debug, Clone)] pub struct Footer { root_layout: LayoutRef, + root_layout2: Option, segments: Arc<[SegmentSpec]>, statistics: Option, // The specific arrays used within the file, in the order they were registered. @@ -60,6 +62,7 @@ impl Footer { ) -> Self { Self { root_layout, + root_layout2: None, segments, statistics, array_read_ctx, @@ -102,13 +105,20 @@ impl Footer { let array_read_ctx = ReadContext::new(array_ids); let root_layout = layout_from_flatbuffer_with_options( - layout_bytes, + layout_bytes.clone(), &dtype, &layout_read_ctx, &array_read_ctx, session.layouts().registry(), session.allows_unknown(), )?; + let root_layout2 = layout_v2::layout_from_flatbuffer( + layout_bytes, + &dtype, + &layout_read_ctx, + &array_read_ctx, + session.layouts().v2_registry(), + )?; let segments: Arc<[SegmentSpec]> = fb_footer .segment_specs() @@ -124,6 +134,7 @@ impl Footer { Ok(Self { root_layout, + root_layout2: Some(root_layout2), segments, statistics, array_read_ctx, @@ -136,6 +147,11 @@ impl Footer { &self.root_layout } + /// Returns the root v2 layout of the file, when available. + pub fn layout2(&self) -> Option<&layout_v2::LayoutRef> { + self.root_layout2.as_ref() + } + /// Returns the segment map of the file. 
pub fn segment_map(&self) -> &Arc<[SegmentSpec]> { &self.segments diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 4ee19c6d16b..221e06e1263 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -957,7 +957,11 @@ fn split_ranges_from_node(node: &ScanNodeRef, row_count: u64) -> VortexResult VortexResult { let mut node_request = NodeRequest::empty(); - let root = ExpandCtx::new(session.clone()).expand(file.footer().layout(), &mut node_request)?; + let layout = file + .footer() + .layout2() + .ok_or_else(|| vortex_err!("scan2 requires a v2 footer layout"))?; + let root = ExpandCtx::new(session.clone()).expand(layout, &mut node_request)?; Ok(match file.footer().statistics().cloned() { Some(stats) => FileStatsScanNode::try_new( Arc::clone(&root), diff --git a/vortex-layout/src/layout_v2.rs b/vortex-layout/src/layout_v2.rs new file mode 100644 index 00000000000..2e828b0cff9 --- /dev/null +++ b/vortex-layout/src/layout_v2.rs @@ -0,0 +1,1034 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::any::Any; +use std::env; +use std::fmt::Debug; +use std::fmt::Formatter; +use std::ops::Deref; +use std::sync::Arc; +use std::sync::LazyLock; + +use flatbuffers::Follow; +use flatbuffers::VerifierOptions; +use flatbuffers::root_with_opts; +use once_cell::sync::OnceCell; +use vortex_array::DeserializeMetadata; +use vortex_array::EmptyMetadata; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::expr::stats::Stat; +use vortex_array::stats::stats_from_bitset_bytes; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_flatbuffers::FlatBuffer; +use vortex_flatbuffers::layout; +use vortex_session::registry::ReadContext; +use vortex_session::registry::Registry; + +use 
crate::LayoutChildType; +use crate::LayoutId; +use crate::layouts::zoned::zone_map::ZoneMap; +use crate::scan::v2::layouts::chunked as scan_chunked; +use crate::scan::v2::layouts::dict as scan_dict; +use crate::scan::v2::layouts::flat as scan_flat; +use crate::scan::v2::layouts::struct_ as scan_struct; +use crate::scan::v2::layouts::zoned as scan_zoned; +use crate::scan::v2::node::ExpandCtx; +use crate::scan::v2::node::ScanNodeRef; +use crate::scan::v2::request::NodeRequest; +use crate::segments::SegmentId; + +/// A reference-counted, type-erased v2 layout. +#[derive(Clone)] +pub struct LayoutRef(Arc); + +/// Reference-counted v2 layout-vtable plugin. +pub type LayoutVTableRef = Arc; + +/// Registry mapping layout IDs to v2 layout-vtable plugins. +pub type LayoutVTableRegistry = Registry; + +static LAYOUT_VERIFIER: LazyLock = LazyLock::new(|| VerifierOptions { + max_tables: env::var("VORTEX_MAX_LAYOUT_TABLES") + .ok() + .and_then(|lmt| lmt.parse::().ok()) + .unwrap_or(1000000), + max_depth: env::var("VORTEX_MAX_LAYOUT_DEPTH") + .ok() + .and_then(|lmt| lmt.parse::().ok()) + .unwrap_or(64), + max_apparent_size: 1 << 31, + ignore_missing_null_terminator: false, +}); + +/// Layout-specific behavior for the v2 layout model. +/// +/// Common layout fields live in [`Layout`] and are handled by the erased adapter. The vtable only +/// supplies layout-specific data interpretation, child typing, and runtime scan expansion. +pub trait VTable: 'static + Clone + Send + Sync + Debug { + /// Layout-specific data. Common fields such as dtype, row count, children, and segments are + /// stored by the adapter. + type LayoutData: 'static + Send + Sync + Clone + Debug; + + /// Returns the ID of this layout encoding. + fn id(&self) -> LayoutId; + + /// Deserialize layout-specific data from serialized metadata. + /// + /// Common fields are provided in `args`, but remain owned by [`LayoutParts`]. Implementations + /// should only return layout-specific data. 
+ fn deserialize(&self, _args: &LayoutDeserializeArgs<'_>) -> VortexResult { + vortex_bail!( + "layout v2 deserialization is not implemented for {}", + self.id() + ) + } + + /// Returns the expected dtype of child `idx`. + fn child_dtype(layout: Layout, idx: usize) -> VortexResult; + + /// Returns the relationship between child `idx` and its parent. + fn child_type(layout: Layout, idx: usize) -> VortexResult; + + /// Expand this layout into a scan2 node. + fn new_scan_node( + layout: Layout, + req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult; +} + +/// Object-safe plugin for deserializing v2 layouts by ID. +pub trait LayoutVTablePlugin: 'static + Send + Sync { + /// Returns the ID of this layout encoding. + fn id(&self) -> LayoutId; + + /// Deserialize a type-erased v2 layout. + fn deserialize(&self, args: LayoutDeserializeArgs<'_>) -> VortexResult; +} + +impl Debug for dyn LayoutVTablePlugin { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("LayoutVTablePlugin") + .field(&self.id()) + .finish() + } +} + +impl LayoutVTablePlugin for V { + fn id(&self) -> LayoutId { + VTable::id(self) + } + + fn deserialize(&self, args: LayoutDeserializeArgs<'_>) -> VortexResult { + Ok(LayoutParts::deserialize(self.clone(), args)?.into_layout()) + } +} + +/// Common serialized layout fields made available while deserializing v2 layout data. +pub struct LayoutDeserializeArgs<'a> { + /// The logical dtype of this layout. + pub dtype: &'a DType, + /// The row count of this layout. + pub row_count: u64, + /// The layout-specific metadata payload. + pub metadata: &'a [u8], + /// Segment IDs referenced directly by this layout. + pub segment_ids: Vec, + /// Lazy child access for this layout. + pub children: Arc, + /// Array read context captured from the file footer. + pub array_ctx: &'a ReadContext, +} + +/// Pieces used to construct a v2 layout. 
+pub struct LayoutParts { + vtable: V, + dtype: DType, + row_count: u64, + segment_ids: Vec, + children: Arc, + data: V::LayoutData, +} + +impl LayoutParts { + /// Create layout parts from common fields and vtable-specific data. + pub fn new( + vtable: V, + dtype: DType, + row_count: u64, + segment_ids: Vec, + children: Arc, + data: V::LayoutData, + ) -> Self { + Self { + vtable, + dtype, + row_count, + segment_ids, + children, + data, + } + } + + /// Deserialize layout-specific data and hoist common fields into layout parts. + pub fn deserialize(vtable: V, args: LayoutDeserializeArgs<'_>) -> VortexResult { + let data = vtable.deserialize(&args)?; + Ok(Self { + vtable, + dtype: args.dtype.clone(), + row_count: args.row_count, + segment_ids: args.segment_ids, + children: args.children, + data, + }) + } + + /// Convert these parts into a typed layout. + pub fn into_typed(self) -> Layout { + Layout::from_parts(self) + } + + /// Erase these parts into a layout reference. + pub fn into_layout(self) -> LayoutRef { + self.into_typed().into_layout() + } +} + +/// A typed v2 layout handle. +pub struct Layout { + inner: Arc>, +} + +struct LayoutInner { + vtable: V, + dtype: DType, + row_count: u64, + segment_ids: Vec, + children: Arc, + data: V::LayoutData, +} + +impl Layout { + /// Create a typed layout from explicit construction parts. + pub fn from_parts(parts: LayoutParts) -> Self { + Self { + inner: Arc::new(LayoutInner { + vtable: parts.vtable, + dtype: parts.dtype, + row_count: parts.row_count, + segment_ids: parts.segment_ids, + children: parts.children, + data: parts.data, + }), + } + } + + /// Returns this layout's vtable. + pub fn vtable(&self) -> &V { + &self.inner.vtable + } + + /// Returns the layout-specific data. + pub fn data(&self) -> &V::LayoutData { + &self.inner.data + } + + /// Returns this layout's dtype. + pub fn dtype(&self) -> &DType { + &self.inner.dtype + } + + /// Returns this layout's row count. 
+ pub fn row_count(&self) -> u64 { + self.inner.row_count + } + + /// Returns this layout's segment IDs. + pub fn segment_ids(&self) -> &[SegmentId] { + &self.inner.segment_ids + } + + /// Returns this layout's children adapter. + pub fn children(&self) -> &Arc { + &self.inner.children + } + + /// Returns the number of children. + pub fn nchildren(&self) -> usize { + self.inner.children.nchildren() + } + + /// Returns child `idx`, materializing it lazily. + pub fn child(&self, idx: usize) -> VortexResult { + let dtype = V::child_dtype(self.clone(), idx)?; + self.inner.children.child(idx, &dtype) + } + + /// Returns the row count of child `idx`. + pub fn child_row_count(&self, idx: usize) -> VortexResult { + self.inner.children.child_row_count(idx) + } + + /// Returns the relationship between child `idx` and this layout. + pub fn child_type(&self, idx: usize) -> VortexResult { + V::child_type(self.clone(), idx) + } + + /// Erase this typed layout into a layout reference. + pub fn to_layout(&self) -> LayoutRef { + self.clone().into_layout() + } + + /// Erase this typed layout into a layout reference. 
+ pub fn into_layout(self) -> LayoutRef { + LayoutRef(Arc::new(self)) + } +} + +impl Clone for Layout { + fn clone(&self) -> Self { + Self { + inner: Arc::clone(&self.inner), + } + } +} + +impl Debug for Layout { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Layout") + .field("encoding_id", &self.inner.vtable.id()) + .field("dtype", &self.inner.dtype) + .field("row_count", &self.inner.row_count) + .field("segment_ids", &self.inner.segment_ids) + .field("data", &self.inner.data) + .finish() + } +} + +impl Deref for Layout { + type Target = V::LayoutData; + + fn deref(&self) -> &Self::Target { + self.data() + } +} + +trait DynLayout: 'static + Send + Sync + Debug { + fn as_any(&self) -> &dyn Any; + + fn dyn_encoding_id(&self) -> LayoutId; + + fn dyn_dtype(&self) -> &DType; + + fn dyn_row_count(&self) -> u64; + + fn dyn_segment_ids(&self) -> &[SegmentId]; + + fn dyn_nchildren(&self) -> usize; + + fn dyn_child(&self, idx: usize) -> VortexResult; + + fn dyn_child_row_count(&self, idx: usize) -> VortexResult; + + fn dyn_child_type(&self, idx: usize) -> VortexResult; + + fn dyn_new_scan_node(&self, req: &mut NodeRequest, cx: &ExpandCtx) + -> VortexResult; +} + +impl LayoutRef { + /// Downcast this layout to a typed v2 layout handle. + pub fn as_opt(&self) -> Option> { + self.0.as_any().downcast_ref::>().cloned() + } + + /// Returns a cloned layout reference. + pub fn to_layout(&self) -> LayoutRef { + self.clone() + } + + /// Returns the layout encoding ID. + pub fn encoding_id(&self) -> LayoutId { + self.0.dyn_encoding_id() + } + + /// Returns this layout's dtype. + pub fn dtype(&self) -> &DType { + self.0.dyn_dtype() + } + + /// Returns this layout's row count. + pub fn row_count(&self) -> u64 { + self.0.dyn_row_count() + } + + /// Returns this layout's segment IDs. + pub fn segment_ids(&self) -> &[SegmentId] { + self.0.dyn_segment_ids() + } + + /// Returns the number of children. 
+ pub fn nchildren(&self) -> usize { + self.0.dyn_nchildren() + } + + /// Returns child `idx`, materializing it lazily. + pub fn child(&self, idx: usize) -> VortexResult { + self.0.dyn_child(idx) + } + + /// Returns the row count of child `idx`. + pub fn child_row_count(&self, idx: usize) -> VortexResult { + self.0.dyn_child_row_count(idx) + } + + /// Returns the relationship between child `idx` and this layout. + pub fn child_type(&self, idx: usize) -> VortexResult { + self.0.dyn_child_type(idx) + } + + /// Expand this layout into a scan2 node. + pub fn new_scan_node( + &self, + req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult { + self.0.dyn_new_scan_node(req, cx) + } + + /// Returns an iterator over child row offsets. + pub fn child_row_offsets(&self) -> impl Iterator>> + '_ { + (0..self.nchildren()).map(|idx| Ok(self.child_type(idx)?.row_offset())) + } +} + +impl Debug for LayoutRef { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Debug::fmt(&self.0, f) + } +} + +impl DynLayout for Layout { + fn as_any(&self) -> &dyn Any { + self + } + + fn dyn_encoding_id(&self) -> LayoutId { + self.vtable().id() + } + + fn dyn_dtype(&self) -> &DType { + &self.inner.dtype + } + + fn dyn_row_count(&self) -> u64 { + self.inner.row_count + } + + fn dyn_segment_ids(&self) -> &[SegmentId] { + &self.inner.segment_ids + } + + fn dyn_nchildren(&self) -> usize { + self.inner.children.nchildren() + } + + fn dyn_child(&self, idx: usize) -> VortexResult { + Layout::child(self, idx) + } + + fn dyn_child_row_count(&self, idx: usize) -> VortexResult { + self.inner.children.child_row_count(idx) + } + + fn dyn_child_type(&self, idx: usize) -> VortexResult { + V::child_type(self.clone(), idx) + } + + fn dyn_new_scan_node( + &self, + req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult { + V::new_scan_node(self.clone(), req, cx) + } +} + +/// Lazily provides v2 layout children. 
+pub trait LayoutChildren: 'static + Send + Sync { + /// Returns child `idx`, validating its dtype. + fn child(&self, idx: usize, dtype: &DType) -> VortexResult; + + /// Returns child `idx`'s row count. + fn child_row_count(&self, idx: usize) -> VortexResult; + + /// Returns the number of children. + fn nchildren(&self) -> usize; +} + +impl Debug for dyn LayoutChildren { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LayoutChildren") + .field("nchildren", &self.nchildren()) + .finish() + } +} + +#[derive(Clone)] +struct ViewedLayoutChildren { + flatbuffer: FlatBuffer, + flatbuffer_loc: usize, + array_ctx: ReadContext, + layout_ctx: ReadContext, + layouts: LayoutVTableRegistry, + cache: Arc<[OnceCell]>, +} + +impl ViewedLayoutChildren { + unsafe fn new_unchecked( + flatbuffer: FlatBuffer, + flatbuffer_loc: usize, + array_ctx: ReadContext, + layout_ctx: ReadContext, + layouts: LayoutVTableRegistry, + ) -> Self { + // SAFETY: guaranteed by caller. + let nchildren = unsafe { layout::Layout::follow(flatbuffer.as_ref(), flatbuffer_loc) } + .children() + .unwrap_or_default() + .len(); + let cache = vec![OnceCell::new(); nchildren].into_boxed_slice().into(); + Self { + flatbuffer, + flatbuffer_loc, + array_ctx, + layout_ctx, + layouts, + cache, + } + } + + fn flatbuffer(&self) -> layout::Layout<'_> { + // SAFETY: flatbuffer_loc is produced from a verified flatbuffer table. + unsafe { layout::Layout::follow(self.flatbuffer.as_ref(), self.flatbuffer_loc) } + } +} + +impl LayoutChildren for ViewedLayoutChildren { + fn child(&self, idx: usize, dtype: &DType) -> VortexResult { + if idx >= self.cache.len() { + vortex_bail!("Child index out of bounds: {idx} of {}", self.cache.len()); + } + let child = self.cache[idx].get_or_try_init(|| { + let fb_child = self.flatbuffer().children().unwrap_or_default().get(idx); + // SAFETY: same verified flatbuffer; fb_child._tab.loc() is a valid table location. 
+ let children = unsafe { + ViewedLayoutChildren::new_unchecked( + self.flatbuffer.clone(), + fb_child._tab.loc(), + self.array_ctx.clone(), + self.layout_ctx.clone(), + self.layouts.clone(), + ) + }; + layout_from_fb_layout( + fb_child, + dtype, + self.layout_ctx.clone(), + self.array_ctx.clone(), + self.layouts.clone(), + Arc::new(children), + ) + })?; + Ok(child.clone()) + } + + fn child_row_count(&self, idx: usize) -> VortexResult { + if idx >= self.cache.len() { + vortex_bail!("Child index out of bounds: {idx} of {}", self.cache.len()); + } + Ok(self + .flatbuffer() + .children() + .unwrap_or_default() + .get(idx) + .row_count()) + } + + fn nchildren(&self) -> usize { + self.cache.len() + } +} + +/// Parse a v2 [`LayoutRef`] from a layout flatbuffer. +pub fn layout_from_flatbuffer( + flatbuffer: FlatBuffer, + dtype: &DType, + layout_ctx: &ReadContext, + array_ctx: &ReadContext, + layouts: &LayoutVTableRegistry, +) -> VortexResult { + let fb_layout = root_with_opts::(&LAYOUT_VERIFIER, &flatbuffer)?; + // SAFETY: the flatbuffer was verified by root_with_opts. 
+ let children = unsafe { + ViewedLayoutChildren::new_unchecked( + flatbuffer.clone(), + fb_layout._tab.loc(), + array_ctx.clone(), + layout_ctx.clone(), + layouts.clone(), + ) + }; + layout_from_fb_layout( + fb_layout, + dtype, + layout_ctx.clone(), + array_ctx.clone(), + layouts.clone(), + Arc::new(children), + ) +} + +fn layout_from_fb_layout( + fb_layout: layout::Layout<'_>, + dtype: &DType, + layout_ctx: ReadContext, + array_ctx: ReadContext, + layouts: LayoutVTableRegistry, + children: Arc, +) -> VortexResult { + let encoding_id = layout_ctx + .resolve(fb_layout.encoding()) + .ok_or_else(|| vortex_err!("Invalid layout encoding ID: {}", fb_layout.encoding()))?; + let vtable = layouts + .find(&encoding_id) + .ok_or_else(|| vortex_err!("Invalid v2 layout encoding ID: {encoding_id}"))?; + vtable.deserialize(LayoutDeserializeArgs { + dtype, + row_count: fb_layout.row_count(), + metadata: fb_layout + .metadata() + .map(|m| m.bytes()) + .unwrap_or_else(|| &[]), + segment_ids: fb_layout + .segments() + .unwrap_or_default() + .iter() + .map(SegmentId::from) + .collect(), + children, + array_ctx: &array_ctx, + }) +} + +fn metadata_bool_field(metadata: &[u8], field_number: u64) -> VortexResult> { + Ok(metadata_varint_field(metadata, field_number)?.map(|value| value != 0)) +} + +fn metadata_varint_field(metadata: &[u8], field_number: u64) -> VortexResult> { + let mut offset = 0; + while offset < metadata.len() { + let key = read_varint(metadata, &mut offset)?; + let field = key >> 3; + let wire_type = key & 0x07; + if field == field_number { + if wire_type != 0 { + vortex_bail!("metadata field {field_number} is not a varint"); + } + return Ok(Some(read_varint(metadata, &mut offset)?)); + } + skip_proto_field(metadata, &mut offset, wire_type)?; + } + Ok(None) +} + +fn metadata_bytes_field(metadata: &[u8], field_number: u64) -> VortexResult>> { + let mut offset = 0; + while offset < metadata.len() { + let key = read_varint(metadata, &mut offset)?; + let field = key >> 3; + 
let wire_type = key & 0x07; + if field == field_number { + if wire_type != 2 { + vortex_bail!("metadata field {field_number} is not length-delimited"); + } + let len = usize::try_from(read_varint(metadata, &mut offset)?)?; + let end = offset + len; + if end > metadata.len() { + vortex_bail!("metadata field extends past end of buffer"); + } + return Ok(Some(metadata[offset..end].to_vec())); + } + skip_proto_field(metadata, &mut offset, wire_type)?; + } + Ok(None) +} + +fn skip_proto_field(metadata: &[u8], offset: &mut usize, wire_type: u64) -> VortexResult<()> { + match wire_type { + 0 => { + read_varint(metadata, offset)?; + } + 1 => { + *offset += 8; + } + 2 => { + let len = usize::try_from(read_varint(metadata, offset)?)?; + *offset += len; + } + 5 => { + *offset += 4; + } + _ => vortex_bail!("unsupported protobuf wire type {wire_type}"), + } + if *offset > metadata.len() { + vortex_bail!("metadata field extends past end of buffer"); + } + Ok(()) +} + +fn read_varint(metadata: &[u8], offset: &mut usize) -> VortexResult { + let mut value = 0u64; + for shift in (0..64).step_by(7) { + let byte = *metadata + .get(*offset) + .ok_or_else(|| vortex_err!("truncated protobuf varint"))?; + *offset += 1; + value |= u64::from(byte & 0x7f) << shift; + if byte & 0x80 == 0 { + return Ok(value); + } + } + vortex_bail!("protobuf varint exceeds 64 bits") +} + +/// V2 flat layout vtable. +#[derive(Clone, Debug)] +pub struct Flat; + +/// V2 flat layout data. +#[derive(Clone, Debug)] +pub struct FlatData { + pub(crate) segment_id: SegmentId, + pub(crate) array_ctx: ReadContext, + pub(crate) array_tree: Option, +} + +impl FlatData { + /// Returns the serialized array segment ID. + pub fn segment_id(&self) -> SegmentId { + self.segment_id + } + + /// Returns the array read context. + pub fn array_ctx(&self) -> &ReadContext { + &self.array_ctx + } + + /// Returns the optional inline array encoding tree. 
+ pub fn array_tree(&self) -> Option<&ByteBuffer> { + self.array_tree.as_ref() + } +} + +impl VTable for Flat { + type LayoutData = FlatData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.flat") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + vortex_ensure!( + args.segment_ids.len() == 1, + "Flat layout must have exactly one segment ID" + ); + Ok(FlatData { + segment_id: args.segment_ids[0], + array_ctx: args.array_ctx.clone(), + array_tree: metadata_bytes_field(args.metadata, 1)?.map(ByteBuffer::from), + }) + } + + fn child_dtype(_layout: Layout, idx: usize) -> VortexResult { + vortex_bail!("Flat layout has no child {idx}") + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + vortex_bail!("Flat layout has no child {idx}") + } + + fn new_scan_node( + layout: Layout, + req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult { + scan_flat::new_scan_node(layout, req, cx) + } +} + +/// V2 chunked layout vtable. +#[derive(Clone, Debug)] +pub struct Chunked; + +/// V2 chunked layout data. +#[derive(Clone, Debug)] +pub struct ChunkedData { + pub(crate) chunk_offsets: Vec, +} + +impl ChunkedData { + /// Returns the cumulative chunk offsets. 
+ pub fn chunk_offsets(&self) -> &[u64] { + &self.chunk_offsets + } +} + +impl VTable for Chunked { + type LayoutData = ChunkedData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.chunked") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + EmptyMetadata::deserialize(args.metadata)?; + let mut chunk_offsets = Vec::with_capacity(args.children.nchildren() + 1); + chunk_offsets.push(0); + for idx in 0..args.children.nchildren() { + let next = chunk_offsets[idx] + args.children.child_row_count(idx)?; + chunk_offsets.push(next); + } + vortex_ensure!( + chunk_offsets.last().copied() == Some(args.row_count), + "Chunked child row counts do not add up to parent row count" + ); + Ok(ChunkedData { chunk_offsets }) + } + + fn child_dtype(layout: Layout, _idx: usize) -> VortexResult { + Ok(layout.dtype().clone()) + } + + fn child_type(layout: Layout, idx: usize) -> VortexResult { + let offset = *layout + .data() + .chunk_offsets + .get(idx) + .ok_or_else(|| vortex_err!("Chunked child index out of bounds: {idx}"))?; + Ok(LayoutChildType::Chunk((idx, offset))) + } + + fn new_scan_node( + layout: Layout, + req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult { + scan_chunked::new_scan_node(layout, req, cx) + } +} + +/// V2 struct layout vtable. 
+#[derive(Clone, Debug)] +pub struct Struct; + +impl VTable for Struct { + type LayoutData = (); + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.struct") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + EmptyMetadata::deserialize(args.metadata)?; + Ok(()) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + let schema_index = if layout.dtype().is_nullable() { + idx.saturating_sub(1) + } else { + idx + }; + if idx == 0 && layout.dtype().is_nullable() { + Ok(DType::Bool(Nullability::NonNullable)) + } else { + layout + .dtype() + .as_struct_fields_opt() + .and_then(|fields| fields.field_by_index(schema_index)) + .ok_or_else(|| vortex_err!("Missing struct field {schema_index}")) + } + } + + fn child_type(layout: Layout, idx: usize) -> VortexResult { + let schema_index = if layout.dtype().is_nullable() { + idx.saturating_sub(1) + } else { + idx + }; + if idx == 0 && layout.dtype().is_nullable() { + Ok(LayoutChildType::Auxiliary("validity".into())) + } else { + let name = layout + .dtype() + .as_struct_fields_opt() + .and_then(|fields| fields.field_name(schema_index)) + .ok_or_else(|| vortex_err!("Missing struct field {schema_index}"))?; + Ok(LayoutChildType::Field(name.clone())) + } + } + + fn new_scan_node( + layout: Layout, + req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult { + scan_struct::new_scan_node(layout, req, cx) + } +} + +/// V2 dictionary layout vtable. +#[derive(Clone, Debug)] +pub struct Dict; + +/// V2 dictionary layout data. +#[derive(Clone, Debug)] +pub struct DictData { + pub(crate) codes_dtype: DType, + pub(crate) all_values_referenced: bool, +} + +impl DictData { + /// Returns whether all dictionary values are definitely referenced. 
+ pub fn has_all_values_referenced(&self) -> bool { + self.all_values_referenced + } +} + +impl VTable for Dict { + type LayoutData = DictData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.dict") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + let codes_ptype = metadata_varint_field(args.metadata, 1)? + .ok_or_else(|| vortex_err!("Dict metadata missing codes ptype"))?; + let codes_ptype = PType::try_from(i32::try_from(codes_ptype)?)?; + let codes_nullable = metadata_bool_field(args.metadata, 2)? + .map(Nullability::from) + .unwrap_or_else(|| args.dtype.nullability()); + Ok(DictData { + codes_dtype: DType::Primitive(codes_ptype, codes_nullable), + all_values_referenced: metadata_bool_field(args.metadata, 3)?.unwrap_or(false), + }) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(layout.dtype().clone()), + 1 => Ok(layout.data().codes_dtype.clone()), + _ => vortex_bail!("Dict child index out of bounds: {idx}"), + } + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(LayoutChildType::Auxiliary("values".into())), + 1 => Ok(LayoutChildType::Transparent("codes".into())), + _ => vortex_bail!("Dict child index out of bounds: {idx}"), + } + } + + fn new_scan_node( + layout: Layout, + req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult { + scan_dict::new_scan_node(layout, req, cx) + } +} + +/// V2 zoned layout vtable. +#[derive(Clone, Debug)] +pub struct Zoned; + +/// V2 zoned layout data. +#[derive(Clone, Debug)] +pub struct ZonedData { + pub(crate) zone_len: usize, + pub(crate) present_stats: Arc<[Stat]>, +} + +impl ZonedData { + /// Returns the configured zone length. + pub fn zone_len(&self) -> usize { + self.zone_len + } + + /// Returns the stats present in the zone table. 
+ pub fn present_stats(&self) -> &Arc<[Stat]> { + &self.present_stats + } +} + +impl VTable for Zoned { + type LayoutData = ZonedData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.stats") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + vortex_ensure!( + args.metadata.len() >= 4, + "Zoned metadata must contain at least 4 bytes for zone length, got {}", + args.metadata.len() + ); + let mut zone_len = [0; 4]; + zone_len.copy_from_slice(&args.metadata[0..4]); + Ok(ZonedData { + zone_len: u32::from_le_bytes(zone_len) as usize, + present_stats: stats_from_bitset_bytes(&args.metadata[4..]).into(), + }) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(layout.dtype().clone()), + 1 => { + #[expect(deprecated)] + let dtype = ZoneMap::dtype_for_stats_table( + layout.dtype(), + layout.data().present_stats.as_ref(), + ); + Ok(dtype) + } + _ => vortex_bail!("Zoned child index out of bounds: {idx}"), + } + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(LayoutChildType::Transparent("data".into())), + 1 => Ok(LayoutChildType::Auxiliary("zones".into())), + _ => vortex_bail!("Zoned child index out of bounds: {idx}"), + } + } + + fn new_scan_node( + layout: Layout, + req: &mut NodeRequest, + cx: &ExpandCtx, + ) -> VortexResult { + scan_zoned::new_scan_node(layout, req, cx) + } +} diff --git a/vortex-layout/src/lib.rs b/vortex-layout/src/lib.rs index aca3d36c04a..aca978552d0 100644 --- a/vortex-layout/src/lib.rs +++ b/vortex-layout/src/lib.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +pub mod layout_v2; pub mod layouts; pub use children::*; diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index a9863a592f6..88af1db5998 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs 
@@ -1,10 +1,10 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Scan2 rule for chunked layouts. +//! Scan2 vtable support for chunked layouts. //! //! Chunks stay *lazy*: children are resolved from the footer and expanded -//! through their own rules per request, never pre-planned. Chunked is +//! through their own layout scan vtables per request, never pre-planned. Chunked is //! therefore a lazy pushdown boundary: pushed expressions are recorded //! once, then replayed into each concrete child only when a read, //! evidence request, or aggregate touches that chunk. This lets @@ -42,9 +42,9 @@ use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_session::VortexSession; -use crate::LayoutEncodingId; -use crate::LayoutRef; -use crate::layouts::chunked::ChunkedLayoutEncoding; +use crate::layout_v2::Chunked; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutRef; use crate::scan::v2::evidence::EvidenceFragment; use crate::scan::v2::node::AggregateAnswer; use crate::scan::v2::node::AggregatePlan; @@ -53,7 +53,6 @@ use crate::scan::v2::node::EvidencePlan; use crate::scan::v2::node::EvidencePlanRef; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::LayoutScanRule; use crate::scan::v2::node::PlanCtx; use crate::scan::v2::node::PushCtx; use crate::scan::v2::node::ReadPlan; @@ -71,41 +70,23 @@ use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; -/// Scan2 rule for `vortex.chunked`. 
-#[derive(Debug)]
-pub struct ChunkedScanRule;
-
-impl LayoutScanRule for ChunkedScanRule {
-    type Node = ChunkedScanNode;
-
-    fn id(&self) -> LayoutEncodingId {
-        ChunkedLayoutEncoding.id()
-    }
-
-    fn expand(
-        &self,
-        layout: &LayoutRef,
-        _req: &mut NodeRequest,
-        cx: &ExpandCtx,
-    ) -> VortexResult {
-        let mut offsets = layout
-            .child_row_offsets()
-            .map(|offset| offset.ok_or_else(|| vortex_err!("chunked layout with auxiliary child")))
-            .collect::>>()?;
-        offsets.push(layout.row_count());
-        Ok(ChunkedScanNode {
-            layout: Arc::clone(layout),
-            offsets,
-            cx: cx.clone(),
-            children: Mutex::new(FxHashMap::default()),
-            reads: Mutex::new(FxHashMap::default()),
-        })
-    }
+/// Builds the scan2 node for a chunked layout.
+///
+/// Stores the type-erased layout handle returned by `to_layout()`, a copy of the chunk
+/// offsets held in the layout data, a clone of the expansion context, and empty `children`
+/// and `reads` maps. No child layout is expanded here and `_req` is not consulted.
+///
+/// NOTE(review): the `expand` this replaces appended `layout.row_count()` so that
+/// `offsets.len() == chunks + 1`; this version assumes `chunk_offsets()` already ends with
+/// the total row count -- confirm against the `layout_v2::Chunked` layout data.
+pub(crate) fn new_scan_node(
+    layout: Layout,
+    _req: &mut NodeRequest,
+    cx: &ExpandCtx,
+) -> VortexResult {
+    Ok(Arc::new(ChunkedScanNode {
+        layout: layout.to_layout(),
+        offsets: layout.data().chunk_offsets().to_vec(),
+        cx: cx.clone(),
+        children: Mutex::new(FxHashMap::default()),
+        reads: Mutex::new(FxHashMap::default()),
+    }))
 }
 
 /// Reads a chunked layout: cumulative chunk offsets
 /// (`offsets.len() == chunks + 1`), with chunk children expanded lazily
-/// through their own rules.
+/// through their own layout vtables.
 pub struct ChunkedScanNode {
     layout: LayoutRef,
     offsets: Vec,
diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs
index 84ded84b037..8a031e6277a 100644
--- a/vortex-layout/src/scan/v2/layouts/dict.rs
+++ b/vortex-layout/src/scan/v2/layouts/dict.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Scan2 rule for dictionary layouts.
+//! Scan2 vtable support for dictionary layouts.
 //!
 //! Value reads keep the v1 shape — values read once per query and
 //!
cached, codes read per range (selection-aware), the pair rebuilt as a @@ -45,15 +45,12 @@ use vortex_array::expr::is_root; use vortex_array::optimizer::ArrayOptimizer; use vortex_array::scalar::Scalar; use vortex_error::VortexResult; -use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::Mask; use vortex_session::VortexSession; -use crate::LayoutEncodingId; -use crate::LayoutRef; -use crate::layouts::dict::Dict; -use crate::layouts::dict::DictLayoutEncoding; +use crate::layout_v2::Dict; +use crate::layout_v2::Layout; use crate::scan::v2::evidence::EvidenceFragment; use crate::scan::v2::evidence::PredicateEvidenceKind; use crate::scan::v2::node::DynReadPlan; @@ -61,7 +58,6 @@ use crate::scan::v2::node::EvidencePlan; use crate::scan::v2::node::EvidencePlanRef; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::LayoutScanRule; use crate::scan::v2::node::PlanCtx; use crate::scan::v2::node::PushCtx; use crate::scan::v2::node::ReadPlan; @@ -78,36 +74,20 @@ use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; -/// Scan2 rule for `vortex.dict`. -#[derive(Debug)] -pub struct DictScanRule; - -impl LayoutScanRule for DictScanRule { - type Node = DictScanNode; - - fn id(&self) -> LayoutEncodingId { - DictLayoutEncoding.id() - } - - fn expand( - &self, - layout: &LayoutRef, - _req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - if !layout.is::() { - vortex_bail!("dict scan2 rule applied to {}", layout.encoding_id()); - } - let values = layout.child(0)?; - let codes = layout.child(1)?; - Ok(DictScanNode { - dtype: layout.dtype().clone(), - values_len: values.row_count(), - // Values and codes live in other row domains. 
-            values: cx.expand_free(&values)?,
-            codes: cx.expand_free(&codes)?,
-        })
-    }
+/// Builds the scan2 node for a dict layout.
+///
+/// Child `0` is bound to `values` and child `1` to `codes`. Both are expanded with
+/// `ExpandCtx::expand_free`, so the incoming `_req` is not forwarded to either child.
+/// The node also records the layout dtype and the row count of the values child.
+///
+/// # Errors
+///
+/// Propagates failures from resolving either child or from expanding it.
+pub(crate) fn new_scan_node(
+    layout: Layout,
+    _req: &mut NodeRequest,
+    cx: &ExpandCtx,
+) -> VortexResult {
+    let values = layout.child(0)?;
+    let codes = layout.child(1)?;
+    Ok(Arc::new(DictScanNode {
+        dtype: layout.dtype().clone(),
+        values_len: values.row_count(),
+        // Values and codes live in other row domains.
+        values: cx.expand_free(&values)?,
+        codes: cx.expand_free(&codes)?,
+    }))
 }
 
 /// Reads a dict layout: shared values (another row domain, read once per
diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs
index 6ce3850c787..8932faec41d 100644
--- a/vortex-layout/src/scan/v2/layouts/flat.rs
+++ b/vortex-layout/src/scan/v2/layouts/flat.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Scan2 rule for flat layouts: one segment, parsed lazily, decoded on
+//! Scan2 vtable support for flat layouts: one segment, parsed lazily, decoded on
 //! demand.
 //!
 //! A flat leaf exposes no evidence producers — it has no statistics or
@@ -22,13 +22,11 @@ use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_err;
 
-use crate::LayoutEncodingId;
-use crate::LayoutRef;
-use crate::layouts::flat::Flat;
-use crate::layouts::flat::FlatLayoutEncoding;
+use crate::layout_v2::Flat;
+use crate::layout_v2::Layout;
+use crate::layout_v2::LayoutRef;
 use crate::scan::v2::node::ExpandCtx;
 use crate::scan::v2::node::FileReader;
-use crate::scan::v2::node::LayoutScanRule;
 use crate::scan::v2::node::PlanCtx;
 use crate::scan::v2::node::ReadPlan;
 use crate::scan::v2::node::ReadPlanRef;
@@ -42,30 +40,14 @@ use crate::scan::v2::request::NodeRequest;
 use crate::segments::SegmentPlanCtx;
 use crate::segments::SegmentRequests;
 
-/// Scan2 rule for `vortex.flat`.
-#[derive(Debug)]
-pub struct FlatScanRule;
-
-impl LayoutScanRule for FlatScanRule {
-    type Node = FlatScanNode;
-
-    fn id(&self) -> LayoutEncodingId {
-        FlatLayoutEncoding.id()
-    }
-
-    fn expand(
-        &self,
-        layout: &LayoutRef,
-        _req: &mut NodeRequest,
-        _cx: &ExpandCtx,
-    ) -> VortexResult {
-        if !layout.is::() {
-            vortex_bail!("flat scan2 rule applied to {}", layout.encoding_id());
-        }
-        Ok(FlatScanNode {
-            layout: Arc::clone(layout),
-        })
-    }
+/// Builds the scan2 node for a flat layout.
+///
+/// The node holds only the type-erased layout handle returned by `to_layout()`. Neither the
+/// request nor the expansion context is used, and this function issues no segment request.
+pub(crate) fn new_scan_node(
+    layout: Layout,
+    _req: &mut NodeRequest,
+    _cx: &ExpandCtx,
+) -> VortexResult {
+    Ok(Arc::new(FlatScanNode {
+        layout: layout.to_layout(),
+    }))
 }
 
 /// Reads a flat layout: fetches its segment once per query, parses it
@@ -176,7 +158,7 @@ impl ReadPlan for FlatReadPlan {
             );
         };
         Ok(SegmentRequests::exact(vec![
-            cx.request_for_segment(flat.segment_id())?,
+            cx.request_for_segment(flat.data().segment_id())?,
         ]))
     }
 
@@ -196,13 +178,18 @@ pub(crate) async fn decode_flat(layout: &LayoutRef, io: &FileReader) -> VortexRe
     };
     let row_count = usize::try_from(layout.row_count())
         .map_err(|_| vortex_err!("layout row count exceeds usize"))?;
-    let segment = io.segments().request(flat.segment_id()).await?;
-    let parts = if let Some(tree) = flat.array_tree() {
+    let segment = io.segments().request(flat.data().segment_id()).await?;
+    let parts = if let Some(tree) = flat.data().array_tree() {
         SerializedArray::from_flatbuffer_and_segment(tree.clone(), segment)?
     } else {
         SerializedArray::try_from(segment)?
}; - parts.decode(layout.dtype(), row_count, flat.array_ctx(), io.session()) + parts.decode( + layout.dtype(), + row_count, + flat.data().array_ctx(), + io.session(), + ) } pub(crate) fn slice_to_range(array: ArrayRef, range: &Range) -> VortexResult { diff --git a/vortex-layout/src/scan/v2/layouts/mod.rs b/vortex-layout/src/scan/v2/layouts/mod.rs index 8875858389b..83023252437 100644 --- a/vortex-layout/src/scan/v2/layouts/mod.rs +++ b/vortex-layout/src/scan/v2/layouts/mod.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Built-in scan2 layout rules. +//! Built-in scan2 layout-vtable implementations. pub mod chunked; pub mod dict; diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index 5284910421b..029cd59ae31 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Scan2 rule for struct layouts: plans field access expressions. +//! Scan2 vtable support for struct layouts: plans field access expressions. //! //! A struct node treats field access as scalar expression pushdown: //! 
`get_item(field, root())` pushes to the field child, and `select(...)`
@@ -26,12 +26,11 @@ use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 
 use crate::LayoutChildType;
-use crate::LayoutEncodingId;
-use crate::LayoutRef;
-use crate::layouts::struct_::StructLayoutEncoding;
+use crate::layout_v2::Layout;
+use crate::layout_v2::LayoutRef;
+use crate::layout_v2::Struct;
 use crate::scan::v2::node::ApplyScanNode;
 use crate::scan::v2::node::ExpandCtx;
-use crate::scan::v2::node::LayoutScanRule;
 use crate::scan::v2::node::MaskScanNode;
 use crate::scan::v2::node::PlanCtx;
 use crate::scan::v2::node::PushCtx;
@@ -44,35 +43,22 @@ use crate::scan::v2::referenced_fields;
 use crate::scan::v2::request::NodeRequest;
 use crate::scan::v2::struct_fields;
 
-/// Scan2 rule for `vortex.struct`.
-#[derive(Debug)]
-pub struct StructScanRule;
-
-impl LayoutScanRule for StructScanRule {
-    type Node = StructScanNode;
-
-    fn id(&self) -> LayoutEncodingId {
-        StructLayoutEncoding.id()
-    }
-
-    fn expand(
-        &self,
-        layout: &LayoutRef,
-        _req: &mut NodeRequest,
-        cx: &ExpandCtx,
-    ) -> VortexResult {
-        let validity = layout
-            .dtype()
-            .is_nullable()
-            .then(|| cx.expand(&layout.child(0)?, &mut NodeRequest::empty()))
-            .transpose()?;
-        Ok(StructScanNode {
-            layout: Arc::clone(layout),
-            cx: cx.clone(),
-            children: Mutex::new(FxHashMap::default()),
-            validity,
-        })
-    }
+/// Builds the scan2 node for a struct layout.
+///
+/// When the layout dtype is nullable, child `0` is expanded with an empty `NodeRequest` and
+/// kept as the node's `validity`; otherwise `validity` is `None`. The `children` map starts
+/// empty -- no field child is expanded here -- and `_req` is not consulted.
+///
+/// # Errors
+///
+/// Propagates a failure to resolve or expand child `0` when the dtype is nullable.
+pub(crate) fn new_scan_node(
+    layout: Layout,
+    _req: &mut NodeRequest,
+    cx: &ExpandCtx,
+) -> VortexResult {
+    let validity = layout
+        .dtype()
+        .is_nullable()
+        .then(|| cx.expand(&layout.child(0)?, &mut NodeRequest::empty()))
+        .transpose()?;
+    Ok(Arc::new(StructScanNode {
+        layout: layout.to_layout(),
+        cx: cx.clone(),
+        children: Mutex::new(FxHashMap::default()),
+        validity,
+    }))
 }
 
 /// Plans struct field expressions through child scan nodes.
@@ -149,7 +135,7 @@ impl StructScanNode { return Ok(Arc::clone(hit)); } for idx in 0..self.layout.nchildren() { - if let LayoutChildType::Field(field) = self.layout.child_type(idx) + if let Ok(LayoutChildType::Field(field)) = self.layout.child_type(idx) && field == *name { let mut req = NodeRequest::empty(); diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 09b1625ce25..9a4b728b78e 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Scan2 rule for zoned (zone-map) layouts: the canonical proof producer. +//! Scan2 vtable support for zoned (zone-map) layouts: the canonical proof producer. //! //! Reading delegates straight to the data child. Pushed predicate nodes //! expose zone-map evidence plans: per predicate, the falsification and @@ -40,17 +40,14 @@ use vortex_array::expr::root; use vortex_array::expr::stats::Stat; use vortex_array::scalar::Scalar; use vortex_error::VortexResult; -use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::Mask; use vortex_session::VortexSession; -use crate::LayoutEncodingId; -use crate::LayoutRef; +use crate::layout_v2::Layout; +use crate::layout_v2::Zoned; use crate::layouts::zoned::MAX_IS_TRUNCATED; use crate::layouts::zoned::MIN_IS_TRUNCATED; -use crate::layouts::zoned::Zoned; -use crate::layouts::zoned::ZonedLayoutEncoding; use crate::layouts::zoned::zone_map::ZoneMap; use crate::scan::v2::evidence::EvidenceFragment; use crate::scan::v2::evidence::PredicateEvidenceKind; @@ -63,7 +60,6 @@ use crate::scan::v2::node::EvidencePlanRef; use crate::scan::v2::node::EvidenceStateKey; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::LayoutScanRule; use crate::scan::v2::node::PlanCtx; use crate::scan::v2::node::PushCtx; use 
crate::scan::v2::node::ReadPlan;
@@ -80,46 +76,30 @@ use crate::scan::v2::request::NodeRequest;
 use crate::segments::SegmentPlanCtx;
 use crate::segments::SegmentRequests;
 
-/// Scan2 rule for `vortex.zoned`.
-#[derive(Debug)]
-pub struct ZonedScanRule;
-
-impl LayoutScanRule for ZonedScanRule {
-    type Node = ZonedScanNode;
-
-    fn id(&self) -> LayoutEncodingId {
-        ZonedLayoutEncoding.id()
-    }
-
-    fn expand(
-        &self,
-        layout: &LayoutRef,
-        req: &mut NodeRequest,
-        cx: &ExpandCtx,
-    ) -> VortexResult {
-        let Some(zoned) = layout.as_opt::() else {
-            vortex_bail!("zoned scan2 rule applied to {}", layout.encoding_id());
-        };
-        let zones = layout.child(1)?;
-        Ok(ZonedScanNode {
-            // The data child preserves this node's rows: pass the
-            // expansion request through.
-            data: cx.expand(&layout.child(0)?, req)?,
-            nzones: zones.row_count(),
-            zones: cx.expand_free(&zones)?,
-            column_dtype: layout.dtype().clone(),
-            zone_len: zoned.zone_len() as u64,
-            row_count: layout.row_count(),
-            present_stats: Arc::clone(zoned.present_stats()),
-        })
-    }
+/// Builds the scan2 node for a zoned layout.
+///
+/// Child `0` (bound to `data`) is expanded with the caller's `req`. Child `1` (bound to
+/// `zones`) is expanded with `expand_free`, and its row count becomes `nzones`. The zone
+/// length and the present stats are taken from the layout data.
+///
+/// # Errors
+///
+/// Propagates failures from resolving or expanding either child.
+pub(crate) fn new_scan_node(
+    layout: Layout,
+    req: &mut NodeRequest,
+    cx: &ExpandCtx,
+) -> VortexResult {
+    let zones = layout.child(1)?;
+    Ok(Arc::new(ZonedScanNode {
+        // The data child preserves this node's rows: pass the
+        // expansion request through.
+        data: cx.expand(&layout.child(0)?, req)?,
+        nzones: zones.row_count(),
+        zones: cx.expand_free(&zones)?,
+        column_dtype: layout.dtype().clone(),
+        zone_len: layout.data().zone_len() as u64,
+        row_count: layout.row_count(),
+        present_stats: Arc::clone(layout.data().present_stats()),
+    }))
 }
 
 /// Reads a zoned layout by delegating to its data child; produces
 /// per-zone predicate evidence from the stats table.
 pub struct ZonedScanNode {
     data: ScanNodeRef,
-    /// The zones child (per-zone stats table), read through its own rule.
+    /// The zones child (per-zone stats table), read through its own layout vtable.
zones: ScanNodeRef, nzones: u64, column_dtype: DType, diff --git a/vortex-layout/src/scan/v2/mod.rs b/vortex-layout/src/scan/v2/mod.rs index 3aa0110f0d0..1d6f9b78c35 100644 --- a/vortex-layout/src/scan/v2/mod.rs +++ b/vortex-layout/src/scan/v2/mod.rs @@ -3,7 +3,7 @@ //! Scan2 layout-node machinery. //! -//! This module contains the layout-tree expansion rules and executable +//! This module contains the layout-tree expansion vtables and executable //! [`ScanNode`](node::ScanNode) plans used by the alternate scan implementation. pub mod evidence; @@ -12,7 +12,6 @@ pub mod session; pub(crate) mod layouts; pub mod node; - use vortex_array::dtype::DType; use vortex_array::dtype::FieldName; use vortex_array::dtype::StructFields; diff --git a/vortex-layout/src/scan/v2/node.rs b/vortex-layout/src/scan/v2/node.rs index ee3f24c9ba3..e77fb6149ac 100644 --- a/vortex-layout/src/scan/v2/node.rs +++ b/vortex-layout/src/scan/v2/node.rs @@ -5,12 +5,11 @@ //! capabilities (plan 017). //! //! Like the v1 scan, a file's layout tree expands into one node per -//! layout through session-registered rules, and the typed traits here are +//! layout through v2 layout-vtable scan expansion, and the typed traits here are //! author-facing: the engine works through the blanket-implemented -//! [`DynScanNode`] / [`DynLayoutScanRule`] adapters. Three things are -//! new: +//! [`DynScanNode`] adapter. Three things are new: //! -//! - expansion is *negotiation*: rules see the scoped scan request before +//! - expansion is *negotiation*: layout scan vtables see the scoped scan request before //! expression pushdown plans reads and evidence (see [`super::request`]); //! - expression pushdown returns another scan node whose root value is //! 
the pushed expression, so reads and evidence are planned from @@ -46,12 +45,10 @@ use vortex_error::vortex_err; use vortex_mask::Mask; use vortex_session::VortexSession; -use crate::LayoutEncodingId; -use crate::LayoutRef; +use crate::layout_v2::LayoutRef; use crate::scan::v2::evidence::EvidenceFragment; use crate::scan::v2::request::EvidenceRequest; use crate::scan::v2::request::NodeRequest; -use crate::scan::v2::session::ScanV2SessionExt; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; use crate::segments::SegmentSource; @@ -88,9 +85,6 @@ pub type ScanStateRef = Arc; /// A reference-counted, type-erased scan2 node. pub type ScanNodeRef = Arc; -/// A reference-counted, type-erased scan2 rule. -pub type ScanRuleRef = Arc; - /// A reference-counted, type-erased evidence plan. pub type EvidencePlanRef = Arc; @@ -249,27 +243,6 @@ pub struct AggregateAnswer { pub residual: Vec>, } -/// One layout encoding's scan2 behaviour, registered per -/// [`LayoutEncodingId`]. Not object safe; the engine resolves rules as -/// [`ScanRuleRef`]s through the blanket [`DynLayoutScanRule`] adapter. -pub trait LayoutScanRule: 'static + fmt::Debug + Send + Sync { - /// The scan-tree node this rule expands to. - type Node: ScanNode; - - /// The layout encoding this rule reads. - fn id(&self) -> LayoutEncodingId; - - /// Expand one layout node into a scan node. Row-preserving children - /// receive the same request object; children in another row domain - /// receive [`NodeRequest::empty`]. - fn expand( - &self, - layout: &LayoutRef, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult; -} - /// A node in the expanded scan2 tree. Nodes are shared across queries; /// all per-file/query caching lives in the node's `State`. pub trait ScanNode: 'static + Send + Sync { @@ -1410,36 +1383,6 @@ fn downcast_erased_state(state: &ScanState) -> VortexR }) } -/// Object-safe view of a [`LayoutScanRule`]. Blanket-implemented; never -/// by hand. 
-pub trait DynLayoutScanRule: fmt::Debug + Send + Sync { - /// The layout encoding this rule reads. - fn id(&self) -> LayoutEncodingId; - - /// Expand one layout node into a type-erased scan2 node. - fn expand( - &self, - layout: &LayoutRef, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult; -} - -impl DynLayoutScanRule for R { - fn id(&self) -> LayoutEncodingId { - LayoutScanRule::id(self) - } - - fn expand( - &self, - layout: &LayoutRef, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - Ok(Arc::new(LayoutScanRule::expand(self, layout, req, cx)?)) - } -} - /// Recover a node's concrete file/query global state from its erased form. pub(crate) fn downcast_state(state: &ScanState) -> VortexResult<&T::State> { state.downcast_ref::().ok_or_else(|| { @@ -1450,8 +1393,8 @@ pub(crate) fn downcast_state(state: &ScanState) -> VortexResult<&T: }) } -/// Resolves layout encodings to their registered scan2 rules during -/// expansion. Rules recurse into child layouts through +/// Expands layout encodings through their vtable-provided scan2 nodes. +/// Scan vtables recurse into child layouts through /// [`ExpandCtx::expand`] (passing the scoped request through /// row-preserving children) or [`ExpandCtx::expand_free`] (for children /// in another row domain, and for lazy runtime expansion). @@ -1461,27 +1404,20 @@ pub struct ExpandCtx { } impl ExpandCtx { - /// An expansion context resolving rules from `session`. + /// An expansion context carrying the session used by scan nodes. pub fn new(session: VortexSession) -> Self { Self { session } } - /// The session rules are resolved from. + /// The session scan nodes are expanded with. pub fn session(&self) -> &VortexSession { &self.session } - /// Expand `layout` through its encoding's registered scan2 rule, + /// Expand `layout` through its encoding's scan2 vtable, /// negotiating `req` on the way down. 
pub fn expand(&self, layout: &LayoutRef, req: &mut NodeRequest) -> VortexResult { - let id = layout.encoding_id(); - let rule = self.session.scan_v2_rules().find(&id).ok_or_else(|| { - vortex_err!( - "no scan2 rule registered for layout encoding {id}; register one with \ - ScanV2Session::register" - ) - })?; - rule.expand(layout, req, self) + layout.new_scan_node(req, self) } /// Expand `layout` with an empty request: for children in another row diff --git a/vortex-layout/src/scan/v2/request.rs b/vortex-layout/src/scan/v2/request.rs index f11a42e366d..1b1dd047d3e 100644 --- a/vortex-layout/src/scan/v2/request.rs +++ b/vortex-layout/src/scan/v2/request.rs @@ -28,7 +28,7 @@ pub enum EvidenceMode { /// Expansion-time context reserved for layout-local scan setup. /// -/// Scan2 no longer carries predicates through expansion. Layout rules +/// Scan2 no longer carries predicates through expansion. Layout scan vtables /// must expose expression behavior through /// [`ScanNode::try_push_expr`](super::node::ScanNode::try_push_expr), /// [`ScanNode::plan_read`](super::node::ScanNode::plan_read), and diff --git a/vortex-layout/src/scan/v2/session.rs b/vortex-layout/src/scan/v2/session.rs index 7d53068ac95..fd716ec542f 100644 --- a/vortex-layout/src/scan/v2/session.rs +++ b/vortex-layout/src/scan/v2/session.rs @@ -1,8 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Session registry for layout scan2 rules, plus the session-level -//! default for which scan implementation `VortexScanExec` expands to. +//! Session-level default for which scan implementation `VortexScanExec` expands to. 
use std::any::Any; use std::sync::atomic::AtomicBool; @@ -10,52 +9,15 @@ use std::sync::atomic::Ordering; use vortex_session::SessionExt; use vortex_session::SessionVar; -use vortex_session::registry::Registry; - -use crate::LayoutEncodingId; -use crate::scan::v2::layouts::chunked::ChunkedScanRule; -use crate::scan::v2::layouts::dict::DictScanRule; -use crate::scan::v2::layouts::flat::FlatScanRule; -use crate::scan::v2::layouts::struct_::StructScanRule; -use crate::scan::v2::layouts::zoned::ZonedScanRule; -use crate::scan::v2::node::LayoutScanRule; -use crate::scan::v2::node::ScanRuleRef; - -/// The registry mapping layout encodings to scan2 rules. -pub type ScanRuleRegistry = Registry; - -/// Session variable holding the engine's layout scan2 rules, keyed by -/// [`LayoutEncodingId`], and the session default for the scan -/// implementation swap. The default registers the built-in rules (flat, -/// chunked, struct, dict, zoned); third-party layout crates register -/// their own the same way. +/// Session variable holding the scan implementation default. #[derive(Debug)] pub struct ScanV2Session { - registry: ScanRuleRegistry, /// Whether `VortexScanExec` expands through scan2 when the node does /// not choose explicitly (see `VortexScanExec::with_scan2`). default_enabled: AtomicBool, } impl ScanV2Session { - /// Register a scan2 rule for the layout encoding it names. - pub fn register(&self, rule: R) { - self.registry.register( - LayoutScanRule::id(&rule), - std::sync::Arc::new(rule) as ScanRuleRef, - ); - } - - /// Find the rule registered for a layout encoding. - pub fn find(&self, id: &LayoutEncodingId) -> Option { - self.registry.find(id) - } - - /// The underlying registry. - pub fn registry(&self) -> &ScanRuleRegistry { - &self.registry - } - /// Whether scans expand through scan2 by default in this session. 
pub fn default_enabled(&self) -> bool { self.default_enabled.load(Ordering::Relaxed) @@ -70,16 +32,9 @@ impl ScanV2Session { impl Default for ScanV2Session { fn default() -> Self { - let session = Self { - registry: ScanRuleRegistry::default(), + Self { default_enabled: AtomicBool::new(false), - }; - session.register(FlatScanRule); - session.register(ChunkedScanRule); - session.register(StructScanRule); - session.register(DictScanRule); - session.register(ZonedScanRule); - session + } } } @@ -93,10 +48,10 @@ impl SessionVar for ScanV2Session { } } -/// Session accessor for the engine's scan2 rules. +/// Session accessor for the scan2 implementation switch. pub trait ScanV2SessionExt: SessionExt { - /// The layout scan2 rules registered with this session. - fn scan_v2_rules(&self) -> vortex_session::Ref<'_, ScanV2Session> { + /// The scan2 session variable. + fn scan_v2(&self) -> vortex_session::Ref<'_, ScanV2Session> { self.get::() } } diff --git a/vortex-layout/src/session.rs b/vortex-layout/src/session.rs index 370fe391ca0..b1e5bf7cb5e 100644 --- a/vortex-layout/src/session.rs +++ b/vortex-layout/src/session.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::any::Any; +use std::sync::Arc; use vortex_session::Ref; use vortex_session::SessionExt; @@ -9,6 +10,8 @@ use vortex_session::SessionVar; use vortex_session::registry::Registry; use crate::LayoutEncodingRef; +use crate::layout_v2; +use crate::layout_v2::VTable as _; use crate::layouts::chunked::ChunkedLayoutEncoding; use crate::layouts::dict::DictLayoutEncoding; use crate::layouts::flat::FlatLayoutEncoding; @@ -21,6 +24,7 @@ pub type LayoutRegistry = Registry; #[derive(Debug)] pub struct LayoutSession { registry: LayoutRegistry, + v2_registry: layout_v2::LayoutVTableRegistry, } impl LayoutSession { @@ -40,11 +44,23 @@ impl LayoutSession { pub fn registry(&self) -> &LayoutRegistry { &self.registry } + + /// Register a v2 layout vtable in the session, replacing any existing 
vtable with the same ID. + pub fn register_v2(&self, vtable: V) { + self.v2_registry + .register(vtable.id(), Arc::new(vtable) as layout_v2::LayoutVTableRef); + } + + /// Returns the v2 layout vtable registry. + pub fn v2_registry(&self) -> &layout_v2::LayoutVTableRegistry { + &self.v2_registry + } } impl Default for LayoutSession { fn default() -> Self { let layouts = LayoutRegistry::default(); + let v2_layouts = layout_v2::LayoutVTableRegistry::default(); // Register the built-in layout encodings. layouts.register(ChunkedLayoutEncoding.id(), ChunkedLayoutEncoding.as_ref()); @@ -53,7 +69,32 @@ impl Default for LayoutSession { layouts.register(ZonedLayoutEncoding.id(), ZonedLayoutEncoding.as_ref()); layouts.register(DictLayoutEncoding.id(), DictLayoutEncoding.as_ref()); - Self { registry: layouts } + // Register the built-in v2 layout vtables. + v2_layouts.register( + layout_v2::Chunked.id(), + Arc::new(layout_v2::Chunked) as layout_v2::LayoutVTableRef, + ); + v2_layouts.register( + layout_v2::Flat.id(), + Arc::new(layout_v2::Flat) as layout_v2::LayoutVTableRef, + ); + v2_layouts.register( + layout_v2::Struct.id(), + Arc::new(layout_v2::Struct) as layout_v2::LayoutVTableRef, + ); + v2_layouts.register( + layout_v2::Zoned.id(), + Arc::new(layout_v2::Zoned) as layout_v2::LayoutVTableRef, + ); + v2_layouts.register( + layout_v2::Dict.id(), + Arc::new(layout_v2::Dict) as layout_v2::LayoutVTableRef, + ); + + Self { + registry: layouts, + v2_registry: v2_layouts, + } } } From a000b1da99e98dfef7a423855e283c167789a9da Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 09:42:40 -0400 Subject: [PATCH 09/48] Optimize scan2 predicate evidence scheduling Signed-off-by: Nicholas Gates --- vortex-file/src/multi/scan_v2.rs | 159 +++++---- vortex-layout/src/scan/v2/layouts/dict.rs | 378 ++++++++-------------- 2 files changed, 227 insertions(+), 310 deletions(-) diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 
221e06e1263..5c17cc029d1 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -46,7 +46,9 @@ use vortex_io::filesystem::FileListing; use vortex_io::filesystem::FileSystemRef; use vortex_io::runtime::Handle; use vortex_io::session::RuntimeSessionExt; +use vortex_layout::scan::v2::evidence::EvidenceFragment; use vortex_layout::scan::v2::evidence::PredicateEvidence; +use vortex_layout::scan::v2::evidence::PredicateEvidenceKind; use vortex_layout::scan::v2::evidence::PredicateId; use vortex_layout::scan::v2::evidence::PredicateVersion; use vortex_layout::scan::v2::node::AggregatePlanRef; @@ -109,6 +111,7 @@ use crate::VortexOpenOptions; const DEFAULT_CONCURRENCY: usize = 8; const FALLBACK_SPLIT_SIZE: u64 = 100_000; +const DEFAULT_EVIDENCE_MORSEL_WINDOW: usize = 8; /// Below this demanded-row density, evaluate a residual predicate over only the demanded rows /// (filter-first) rather than the whole morsel. Mirrors the V1 flat-reader threshold. @@ -616,6 +619,7 @@ impl DataSource for ScanNodeDataSource { let mut planned_files = Vec::new(); let mut total_morsels = 0usize; + let mut has_runtime_evidence = false; for (partition_idx, file) in self.open_files(false).await? { let Some(request) = file_scan_request(partition_idx, &file, scan_request.clone())? 
else { @@ -626,6 +630,7 @@ impl DataSource for ScanNodeDataSource { if ranges.is_empty() { continue; } + has_runtime_evidence |= prepared.has_runtime_evidence(); total_morsels = total_morsels.saturating_add(ranges.len()); planned_files.push((prepared, ranges)); } @@ -649,7 +654,7 @@ impl DataSource for ScanNodeDataSource { let default_window = get_available_parallelism().unwrap_or(1).saturating_mul(4); let (morsel_plan_window, morsel_launch_window) = - morsel_windows(&scheduler, false, default_window); + morsel_windows(&scheduler, false, has_runtime_evidence, default_window); Ok(Some(Arc::new(PlannedScanNodeScan { dtype, @@ -1073,13 +1078,6 @@ struct EvidenceWorkOutput { evidence: PredicateEvidence, } -struct PredicateWorkOutput { - morsel_id: usize, - predicate_idx: usize, - need: Mask, - result: Mask, -} - struct ProjectionWorkOutput { morsel_id: usize, array: ArrayRef, @@ -1087,7 +1085,6 @@ struct ProjectionWorkOutput { enum WorkOutput { Evidence(EvidenceWorkOutput), - Predicate(PredicateWorkOutput), Projection(ProjectionWorkOutput), } @@ -1104,7 +1101,6 @@ struct PlannedMorselWork { struct MorselState { prepared: Arc, range: Range, - len: usize, selected: Mask, evidence: Vec>, pending_evidence: usize, @@ -1140,6 +1136,7 @@ const WEIGHTED_PHASES: &[ScanIoPhase] = &[ fn morsel_windows( scheduler: &ScanScheduler, limited: bool, + has_runtime_evidence: bool, default_window: usize, ) -> (usize, usize) { if limited { @@ -1148,13 +1145,25 @@ fn morsel_windows( let launch_window = scheduler .config() .morsel_launch_window() - .unwrap_or(default_window) + .unwrap_or_else(|| { + if has_runtime_evidence { + default_window.min(DEFAULT_EVIDENCE_MORSEL_WINDOW) + } else { + default_window + } + }) .max(1); let plan_window = scheduler .config() .morsel_plan_window() .map(|window| window.max(launch_window).max(1)) - .unwrap_or(usize::MAX); + .unwrap_or_else(|| { + if has_runtime_evidence { + launch_window + } else { + usize::MAX + } + }); (plan_window, launch_window) } @@ 
-1341,7 +1350,6 @@ impl PartitionWorkSchedulerState { fn complete_work(&mut self, output: WorkOutput) -> VortexResult> { match output { WorkOutput::Evidence(output) => self.complete_evidence(output), - WorkOutput::Predicate(output) => self.complete_predicate(output), WorkOutput::Projection(output) => { Ok(self.finish_output_morsel(output.morsel_id, output.array)) } @@ -1370,36 +1378,6 @@ impl PartitionWorkSchedulerState { Ok(None) } - fn complete_predicate( - &mut self, - output: PredicateWorkOutput, - ) -> VortexResult> { - let Some(morsel) = self - .morsels - .get_mut(output.morsel_id) - .and_then(Option::as_mut) - else { - return Ok(None); - }; - if output.result.len() != morsel.len { - vortex_bail!( - "residual result length {} does not match morsel length {}", - output.result.len(), - morsel.len - ); - } - let pass = &output.result & &output.need; - let selected = std::mem::take(&mut morsel.selected); - morsel.selected = &selected.bitand_not(&output.need) | &pass; - if morsel.selected.all_false() { - return Ok(self.finish_empty_morsel(output.morsel_id)); - } - let next = output.predicate_idx.saturating_add(1); - morsel.next_predicate = morsel.next_predicate.max(next); - self.enqueue_next_predicate_or_projection(output.morsel_id)?; - Ok(None) - } - fn enqueue_next_predicate_or_projection(&mut self, morsel_id: usize) -> VortexResult<()> { loop { let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { @@ -1424,6 +1402,38 @@ impl PartitionWorkSchedulerState { } let predicate_idx = morsel.next_predicate; + if morsel.evidence[predicate_idx].is_none() { + let should_probe = { + let predicate = &morsel.prepared.predicates[predicate_idx]; + !predicate.evidence.is_empty() + && morsel.selected.density() >= EXPR_EVAL_THRESHOLD + }; + if should_probe { + let work = morsel.prepared.plan_evidence_work( + morsel_id, + predicate_idx, + morsel.range.clone(), + )?; + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) + else { + 
return Ok(()); + }; + morsel.pending_evidence = morsel.pending_evidence.saturating_add(1); + self.evidence_queue.push_back(work); + return Ok(()); + } + + let evidence = PredicateEvidence::new( + morsel.prepared.predicates[predicate_idx].id, + PredicateVersion::STATIC, + morsel.range.clone(), + )?; + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(()); + }; + morsel.evidence[predicate_idx] = Some(evidence); + continue; + } let evidence = morsel.evidence[predicate_idx].as_ref().ok_or_else(|| { vortex_err!("missing evidence for predicate {predicate_idx} before residual read") })?; @@ -1558,6 +1568,7 @@ impl Partition for ScanNodePartition { let (plan_window, launch_window) = morsel_windows( &scheduler, prepared.limit_remaining.is_some(), + prepared.has_runtime_evidence(), default_window, ); let morsels = ranges @@ -1828,43 +1839,35 @@ impl PreparedScanNodeFile { ) } + fn has_runtime_evidence(&self) -> bool { + self.predicates + .iter() + .any(|predicate| !predicate.evidence.is_empty()) + } + fn plan_morsel( self: &Arc, - morsel_id: usize, + _morsel_id: usize, range: Range, ) -> VortexResult> { - let len = range_len(&range)?; let selected = self.selection.row_mask(&range).mask().clone(); if selected.all_false() { return Ok(None); } - let mut state = MorselState { + let state = MorselState { prepared: Arc::clone(self), - range: range.clone(), - len, + range, selected, evidence: (0..self.predicates.len()).map(|_| None).collect(), pending_evidence: 0, next_predicate: 0, }; - let mut evidence = Vec::with_capacity(self.predicates.len()); - for predicate_idx in 0..self.predicates.len() { - let predicate = &self.predicates[predicate_idx]; - if predicate.evidence.is_empty() { - state.evidence[predicate_idx] = Some(PredicateEvidence::new( - predicate.id, - PredicateVersion::STATIC, - range.clone(), - )?); - continue; - } - state.pending_evidence = state.pending_evidence.saturating_add(1); - 
evidence.push(self.plan_evidence_work(morsel_id, predicate_idx, range.clone())?); - } - - Ok(Some(PlannedMorselWork { state, evidence })) + Ok(Some(PlannedMorselWork { + state, + evidence: Vec::new(), + })) } fn plan_evidence_work( @@ -1963,7 +1966,7 @@ impl PreparedScanNodeFile { let compact = predicate .read .read_scoped( - range, + range.clone(), RowScope::selected(&need), &prepared.reader, predicate.read_state.as_ref(), @@ -1985,7 +1988,7 @@ impl PreparedScanNodeFile { predicate .read .read_scoped( - range, + range.clone(), rows, &prepared.reader, predicate.read_state.as_ref(), @@ -1994,16 +1997,29 @@ impl PreparedScanNodeFile { .await? .execute::(&mut ctx)? }; - Ok(PredicateWorkOutput { + if result.len() != len { + vortex_bail!( + "residual result length {} does not match morsel length {len}", + result.len() + ); + } + let pass = &result & &need; + let exact = !&need | &pass; + let mut evidence = + PredicateEvidence::new(predicate.id, PredicateVersion::STATIC, range.clone())?; + evidence.absorb(EvidenceFragment::new( + range, + PredicateEvidenceKind::ExactMask(exact), + ))?; + Ok(EvidenceWorkOutput { morsel_id, predicate_idx, - need, - result, + evidence, }) } .boxed(), ) - .into_queued(morsel_id, WorkOutput::Predicate)) + .into_queued(morsel_id, WorkOutput::Evidence)) } fn plan_projection_work( @@ -2012,6 +2028,9 @@ impl PreparedScanNodeFile { range: Range, selected: Mask, ) -> VortexResult> { + // Projection consumes the final selected rows after every predicate plan has contributed + // metadata evidence and, if needed, exact residual evidence. There is no separate + // predicate-demand mask at this point. let len = range_len(&range)?; let selected = if let Some(limit_remaining) = &self.limit_remaining { limit_mask(selected, limit_remaining)? 
diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 8a031e6277a..e3c0a564bdb 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -3,27 +3,15 @@ //! Scan2 vtable support for dictionary layouts. //! -//! Value reads keep the v1 shape — values read once per query and -//! cached, codes read per range (selection-aware), the pair rebuilt as a -//! lazy `DictArray`. New is the runtime value-domain rewrite (plan 017 -//! SP7): pushed dictionary predicate nodes answer by evaluating the -//! predicate over the *dictionary values* once per query, then mapping -//! the per-value verdicts through the codes: +//! Value reads keep the v1 shape: values read once per query and cached, +//! codes read per range (selection-aware), the pair rebuilt as a lazy +//! `DictArray`. Pushed dictionary expressions also try to evaluate the +//! expression over the dictionary values once per query, then reuse the +//! resulting value-domain array with per-range codes. //! -//! - no value satisfies the predicate (and null does not either): the -//! whole column is proven all-false without reading a single code; -//! - every value satisfies it: all-true the same way; -//! - otherwise the per-value mask maps through the range's codes into an -//! exact per-row mask, costing a code read but never a value decode at -//! data scale. -//! -//! The rewrite is exact: evaluating the predicate over the values array -//! and indexing the result by code is the same value-domain evaluation -//! vortex's expression machinery performs over a `DictArray`, including -//! null routing (a null row takes the predicate's verdict on null). A -//! predicate whose evaluation over the values fails is recorded as -//! unanswerable and falls through to residual evaluation rather than -//! failing the scan. +//! Dictionary predicate evidence is intentionally absent for now. Without +//! 
zone maps or indexes, reading dictionary values speculatively can cost +//! more than it proves; exact row-domain predicate work owns the codes read. use std::fmt; use std::ops::Range; @@ -35,27 +23,20 @@ use rustc_hash::FxHashMap; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; -use vortex_array::arrays::ConstantArray; use vortex_array::arrays::DictArray; use vortex_array::dtype::DType; use vortex_array::expr::Expression; use vortex_array::expr::is_root; use vortex_array::optimizer::ArrayOptimizer; -use vortex_array::scalar::Scalar; +use vortex_array::validity::Validity; use vortex_error::VortexResult; use vortex_error::vortex_err; use vortex_mask::Mask; -use vortex_session::VortexSession; use crate::layout_v2::Dict; use crate::layout_v2::Layout; -use crate::scan::v2::evidence::EvidenceFragment; -use crate::scan::v2::evidence::PredicateEvidenceKind; use crate::scan::v2::node::DynReadPlan; -use crate::scan::v2::node::EvidencePlan; -use crate::scan::v2::node::EvidencePlanRef; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; use crate::scan::v2::node::PlanCtx; @@ -65,11 +46,8 @@ use crate::scan::v2::node::ReadPlanRef; use crate::scan::v2::node::RowScope; use crate::scan::v2::node::ScanNode; use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::ScanStateCache; use crate::scan::v2::node::ScanStateRef; use crate::scan::v2::node::StateCtx; -use crate::scan::v2::node::read_dense; -use crate::scan::v2::request::EvidenceRequest; use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; @@ -82,7 +60,6 @@ pub(crate) fn new_scan_node( let values = layout.child(0)?; let codes = layout.child(1)?; Ok(Arc::new(DictScanNode { - dtype: layout.dtype().clone(), values_len: values.row_count(), // Values and codes live in other row domains. 
values: cx.expand_free(&values)?, @@ -93,42 +70,18 @@ pub(crate) fn new_scan_node( /// Reads a dict layout: shared values (another row domain, read once per /// query) plus a codes chain in this node's row domain. pub struct DictScanNode { - dtype: DType, values: ScanNodeRef, values_len: u64, codes: ScanNodeRef, } -/// One predicate's value-domain rewrite, computed once per query. -enum ValueVerdicts { - /// The predicate could not be evaluated over the values; produce no - /// evidence and let residual evaluation handle it. - Unanswerable, - /// Per-value verdicts plus the verdict for null rows. - Verdicts { - /// `true` at value `v`: rows coded `v` satisfy the predicate. - mask: Mask, - /// Whether a null row satisfies the predicate. - null_verdict: bool, - }, -} - /// Per-query state: the cached values relation, the child states, and -/// the per-predicate value-domain verdicts. +/// cached value-domain expression results. pub struct DictScanState { values: Mutex>, values_state: ScanStateRef, codes_state: ScanStateRef, - verdicts: Mutex>>, -} - -/// Planned dictionary value-domain evidence for one predicate. -struct DictEvidencePlan { - dtype: DType, - values_read: ReadPlanRef, - values_len: u64, - codes_read: ReadPlanRef, - predicate: Expression, + value_exprs: Mutex>>, } /// A pushed scalar expression over a dictionary value. 
@@ -145,7 +98,20 @@ struct DictReadPlan { struct DictExprReadPlan { node: Arc, - input: ReadPlanRef, + values_read: ReadPlanRef, + codes_read: ReadPlanRef, +} + +fn value_expr_is_expensive(expr: &Expression) -> bool { + matches!( + expr.id().as_str(), + "vortex.like" + | "vortex.byte_length" + | "vortex.list.contains" + | "vortex.dynamic" + | "vortex.variant_get" + | "vortex.parquet.variant" + ) || expr.children().iter().any(value_expr_is_expensive) } impl DictScanNode { @@ -178,163 +144,6 @@ impl DictScanNode { } } -impl DictEvidencePlan { - async fn values(&self, io: &FileReader, state: &DictScanState) -> VortexResult { - if let Some(hit) = state.values.lock().clone() { - return Ok(hit); - } - let values = read_dense( - self.values_read.as_ref(), - 0..self.values_len, - io, - state.values_state.as_ref(), - ) - .await?; - *state.values.lock() = Some(values.clone()); - Ok(values) - } - - async fn verdicts( - &self, - io: &FileReader, - state: &DictScanState, - ) -> VortexResult> { - if let Some(hit) = state.verdicts.lock().get(&self.predicate) { - return Ok(Arc::clone(hit)); - } - let values = self.values(io, state).await?; - let mut ctx = io.session().create_execution_ctx(); - let computed = (|| -> VortexResult { - let mask = values - .clone() - .apply(&self.predicate)? - .execute::(&mut ctx)?; - let null_verdict = if self.dtype.is_nullable() { - let null = ConstantArray::new(Scalar::null(self.dtype.clone()), 1).into_array(); - null.apply(&self.predicate)? - .execute::(&mut ctx)? 
- .value(0) - } else { - false - }; - Ok(ValueVerdicts::Verdicts { mask, null_verdict }) - })(); - let verdicts = Arc::new(match computed { - Ok(verdicts) => verdicts, - Err(error) => { - tracing::debug!( - predicate = %self.predicate, - %error, - "dict value-domain rewrite unanswerable" - ); - ValueVerdicts::Unanswerable - } - }); - state - .verdicts - .lock() - .insert(self.predicate.clone(), Arc::clone(&verdicts)); - Ok(verdicts) - } -} - -impl EvidencePlan for DictEvidencePlan { - type State = DictScanState; - - fn init_state(&self, ctx: &VortexSession) -> VortexResult { - let mut cache = ScanStateCache::default(); - let mut cx = StateCtx::new(ctx, &mut cache); - Ok(DictScanState { - values: Mutex::new(None), - values_state: self.values_read.init_state(&mut cx)?, - codes_state: self.codes_read.init_state(&mut cx)?, - verdicts: Mutex::new(FxHashMap::default()), - }) - } - - fn evidence<'a>( - &'a self, - req: &'a EvidenceRequest<'a>, - io: &'a FileReader, - state: &'a DictScanState, - ) -> BoxFuture<'a, VortexResult>> { - Box::pin(async move { - let verdicts = self.verdicts(io, state).await?; - let ValueVerdicts::Verdicts { mask, null_verdict } = verdicts.as_ref() else { - return Ok(Vec::new()); - }; - let nullable = self.dtype.is_nullable(); - if mask.all_false() && !*null_verdict { - return Ok(vec![EvidenceFragment::new( - req.range.clone(), - PredicateEvidenceKind::AllFalse, - )]); - } - if mask.all_true() && (!nullable || *null_verdict) { - return Ok(vec![EvidenceFragment::new( - req.range.clone(), - PredicateEvidenceKind::AllTrue, - )]); - } - let codes = read_dense( - self.codes_read.as_ref(), - req.range.clone(), - io, - state.codes_state.as_ref(), - ) - .await?; - let mut ctx = io.session().create_execution_ctx(); - let verdict_values = BoolArray::from(mask.to_bit_buffer()).into_array(); - let mut rows = DictArray::try_new(codes.clone(), verdict_values)? 
- .into_array() - .execute::(&mut ctx)?; - if *null_verdict { - let valid = codes.validity()?.execute_mask(codes.len(), &mut ctx)?; - rows = &rows | &!valid; - } - Ok(vec![EvidenceFragment::new( - req.range.clone(), - PredicateEvidenceKind::ExactMask(rows), - )]) - }) - } - - fn segment_requests( - &self, - req: &EvidenceRequest<'_>, - state: &Self::State, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let Some(verdicts) = state.verdicts.lock().get(&self.predicate).cloned() else { - return Ok(SegmentRequests::unknown()); - }; - let ValueVerdicts::Verdicts { mask, null_verdict } = verdicts.as_ref() else { - return Ok(SegmentRequests::none()); - }; - let nullable = self.dtype.is_nullable(); - if mask.all_false() && !*null_verdict { - return Ok(SegmentRequests::none()); - } - if mask.all_true() && (!nullable || *null_verdict) { - return Ok(SegmentRequests::none()); - } - let selection = Mask::new_true( - usize::try_from(req.range.end - req.range.start) - .map_err(|_| vortex_err!("dictionary evidence range exceeds usize"))?, - ); - self.codes_read.segment_requests( - req.range.clone(), - RowScope::selected(&selection), - state.codes_state.as_ref(), - cx, - ) - } - - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "dict") - } -} - impl ScanNode for DictScanNode { type State = DictScanState; @@ -343,7 +152,7 @@ impl ScanNode for DictScanNode { values: Mutex::new(None), values_state: cx.init_node(&self.values)?, codes_state: cx.init_node(&self.codes)?, - verdicts: Mutex::new(FxHashMap::default()), + value_exprs: Mutex::new(FxHashMap::default()), }) } @@ -381,9 +190,9 @@ impl ScanNode for DictScanNode { } /// Codes live in this node's row domain and release with it. The - /// cached values relation and per-predicate verdicts stay — they are - /// read once per query by design and consulted by every remaining - /// morsel. 
+ /// cached values relation and value-domain expression results stay: + /// they are read once per query by design and consulted by every + /// remaining morsel. fn release(&self, frontier: u64, state: &DictScanState) -> VortexResult<()> { self.codes.release(frontier, state.codes_state.as_ref()) } @@ -403,26 +212,17 @@ impl ScanNode for DictExprScanNode { } fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - let input = Arc::clone(&self.dict).plan_read(cx)?.ok_or_else(|| { - vortex_err!("dictionary expression input did not produce a read plan") - })?; - Ok(Some(Arc::new(DictExprReadPlan { node: self, input }))) - } - - fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { let values_read = Arc::clone(&self.dict.values) .plan_read(cx)? .ok_or_else(|| vortex_err!("dictionary values did not produce a read plan"))?; let codes_read = Arc::clone(&self.dict.codes) .plan_read(cx)? .ok_or_else(|| vortex_err!("dictionary codes did not produce a read plan"))?; - Ok(vec![Arc::new(DictEvidencePlan { - dtype: self.dict.dtype.clone(), + Ok(Some(Arc::new(DictExprReadPlan { + node: self, values_read, - values_len: self.dict.values_len, codes_read, - predicate: self.expr.clone(), - })]) + }))) } fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { @@ -442,7 +242,7 @@ impl ReadPlan for DictReadPlan { values: Mutex::new(None), values_state: self.values_read.init_state(cx)?, codes_state: self.codes_read.init_state(cx)?, - verdicts: Mutex::new(FxHashMap::default()), + value_exprs: Mutex::new(FxHashMap::default()), }) } @@ -506,11 +306,64 @@ impl ReadPlan for DictReadPlan { } } +impl DictExprReadPlan { + async fn value_expr( + &self, + io: &FileReader, + state: &DictScanState, + local: &mut ExecutionCtx, + ) -> VortexResult> { + if let Some(hit) = state.value_exprs.lock().get(&self.node.expr).cloned() { + return Ok(hit); + } + let values = self + .node + .dict + .values(self.values_read.as_ref(), io, state, local) + .await?; + let computed = 
values.apply(&self.node.expr).and_then(|array| { + match array.clone().execute::(local) { + Ok(mask) => { + let DType::Bool(nullability) = array.dtype() else { + return array.execute::(local); + }; + Ok( + BoolArray::new(mask.to_bit_buffer(), Validity::from(nullability)) + .into_array(), + ) + } + Err(_) => array.execute::(local), + } + }); + let value_expr = match computed { + Ok(array) => Some(array), + Err(error) => { + tracing::debug!( + predicate = %self.node.expr, + %error, + "dict value-domain expression read unavailable" + ); + None + } + }; + state + .value_exprs + .lock() + .insert(self.node.expr.clone(), value_expr.clone()); + Ok(value_expr) + } +} + impl ReadPlan for DictExprReadPlan { - type State = ScanStateRef; + type State = DictScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - self.input.init_state(cx) + Ok(DictScanState { + values: Mutex::new(None), + values_state: self.values_read.init_state(cx)?, + codes_state: self.codes_read.init_state(cx)?, + value_exprs: Mutex::new(FxHashMap::default()), + }) } fn read_scoped<'a>( @@ -522,10 +375,35 @@ impl ReadPlan for DictExprReadPlan { local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { - let input = self - .input - .read_scoped(range, rows, io, state.as_ref(), local) + let value_expr = if !value_expr_is_expensive(&self.node.expr) + || matches!( + usize::try_from(self.node.dict.values_len), + Ok(values_len) if values_len <= rows.demand.true_count() + ) { + self.value_expr(io, state, local).await? + } else { + None + }; + let codes = self + .codes_read + .read_scoped(range.clone(), rows, io, state.codes_state.as_ref(), local) .await?; + if let Some(value_expr) = value_expr { + let all_valid = !codes.dtype().is_nullable() + || codes + .validity()? + .execute_mask(codes.len(), local)? 
+ .all_true(); + if all_valid { + return Ok(DictArray::try_new(codes, value_expr)?.into_array()); + } + } + let values = self + .node + .dict + .values(self.values_read.as_ref(), io, state, local) + .await?; + let input = DictArray::try_new(codes, values)?.into_array().optimize()?; input.apply(&self.node.expr)?.execute::(local) }) } @@ -537,11 +415,31 @@ impl ReadPlan for DictExprReadPlan { state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { - self.input.segment_requests(range, rows, state.as_ref(), cx) + let values_selection = Mask::new_true( + usize::try_from(self.node.dict.values_len) + .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, + ); + let mut requests = self.values_read.segment_requests( + 0..self.node.dict.values_len, + RowScope::selected(&values_selection), + state.values_state.as_ref(), + cx, + )?; + if requests.is_unknown() { + return Ok(requests); + } + requests.extend(self.codes_read.segment_requests( + range, + rows, + state.codes_state.as_ref(), + cx, + )?); + Ok(requests) } fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.input.release(frontier, state.as_ref()) + self.codes_read + .release(frontier, state.codes_state.as_ref()) } fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { From 45edf172bc4b2b08607fa7249e9302dda79f8a1a Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 14:14:10 -0400 Subject: [PATCH 10/48] Optimize scan2 sparse dictionary reads Signed-off-by: Nicholas Gates --- vortex-layout/src/scan/v2/layouts/dict.rs | 373 ++++++++++++++++--- vortex-layout/src/scan/v2/layouts/struct_.rs | 7 + vortex-layout/src/scan/v2/node.rs | 1 + 3 files changed, 338 insertions(+), 43 deletions(-) diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index e3c0a564bdb..3303ba04f35 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -17,26 +17,37 @@ use 
std::fmt; use std::ops::Range; use std::sync::Arc; +use futures::FutureExt; use futures::future::BoxFuture; +use futures::try_join; use parking_lot::Mutex; use rustc_hash::FxHashMap; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::SharedArray; use vortex_array::dtype::DType; +use vortex_array::dtype::NativePType; use vortex_array::expr::Expression; use vortex_array::expr::is_root; +use vortex_array::match_each_integer_ptype; use vortex_array::optimizer::ArrayOptimizer; use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_error::VortexError; use vortex_error::VortexResult; +use vortex_error::vortex_bail; use vortex_error::vortex_err; +use vortex_mask::AllOr; use vortex_mask::Mask; use crate::layout_v2::Dict; use crate::layout_v2::Layout; -use crate::scan::v2::node::DynReadPlan; +use crate::layouts::SharedArrayFuture; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; use crate::scan::v2::node::PlanCtx; @@ -78,7 +89,7 @@ pub struct DictScanNode { /// Per-query state: the cached values relation, the child states, and /// cached value-domain expression results. 
pub struct DictScanState { - values: Mutex>, + values: Mutex>, values_state: ScanStateRef, codes_state: ScanStateRef, value_exprs: Mutex>>, @@ -102,6 +113,8 @@ struct DictExprReadPlan { codes_read: ReadPlanRef, } +const SPARSE_DICT_VALUES_DENSITY_THRESHOLD: f64 = 0.2; + fn value_expr_is_expensive(expr: &Expression) -> bool { matches!( expr.id().as_str(), @@ -114,33 +127,69 @@ fn value_expr_is_expensive(expr: &Expression) -> bool { ) || expr.children().iter().any(value_expr_is_expensive) } +fn sparse_dict_candidate(values_len: u64, rows: RowScope<'_>) -> bool { + rows.demands_all_selected() + && rows.selection.density() < SPARSE_DICT_VALUES_DENSITY_THRESHOLD + && matches!( + usize::try_from(values_len), + Ok(values_len) if values_len > rows.demand.true_count() + ) +} + +fn sparse_value_expr_candidate(expr: &Expression, values_len: u64, rows: RowScope<'_>) -> bool { + sparse_dict_candidate(values_len, rows) && value_expr_is_expensive(expr) +} + impl DictScanNode { - /// The values relation, read once per query. - async fn values( + /// The values relation wrapped in a `SharedArray`, read once per query. 
+ fn values( &self, - values_read: &dyn DynReadPlan, + values_read: ReadPlanRef, io: &FileReader, state: &DictScanState, - local: &mut ExecutionCtx, - ) -> VortexResult { + ) -> SharedArrayFuture { if let Some(hit) = state.values.lock().clone() { - return Ok(hit); + return hit; } - let selection = Mask::new_true( - usize::try_from(self.values_len) - .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, - ); - let values = values_read - .read_scoped( - 0..self.values_len, - RowScope::selected(&selection), - io, - state.values_state.as_ref(), - local, - ) - .await?; - *state.values.lock() = Some(values.clone()); - Ok(values) + + let mut guard = state.values.lock(); + if let Some(hit) = guard.clone() { + return hit; + } + + let values_len = self.values_len; + let io = io.clone(); + let values_state = Arc::clone(&state.values_state); + let future = async move { + let selection = + Mask::new_true(usize::try_from(values_len).map_err(|_| { + Arc::new(vortex_err!("dictionary values length exceeds usize")) + })?); + let mut local = io.session().create_execution_ctx(); + let values = values_read + .read_scoped( + 0..values_len, + RowScope::selected(&selection), + &io, + values_state.as_ref(), + &mut local, + ) + .await + .map_err(Arc::new)?; + // The shared future single-flights IO. `SharedArray` separately memoizes execution of + // the full dictionary values across batches; sparse selected reads bypass this path. + Ok(SharedArray::new(values).into_array()) + } + .boxed() + .shared(); + + *guard = Some(future.clone()); + future + } + + fn build_dict(&self, codes: ArrayRef, values: ArrayRef) -> VortexResult { + // SAFETY: the codes and values children come from a validated dictionary layout. 
+ Ok(unsafe { DictArray::new_unchecked(codes, values) }.into_array()) } } @@ -255,15 +304,48 @@ impl ReadPlan for DictReadPlan { local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { - let values = self - .node - .values(self.values_read.as_ref(), io, state, local) - .await?; - let codes = self - .codes_read - .read_scoped(range, rows, io, state.codes_state.as_ref(), local) - .await?; - DictArray::try_new(codes, values)?.into_array().optimize() + if sparse_dict_candidate(self.node.values_len, rows) { + let codes = self + .codes_read + .read_scoped(range.clone(), rows, io, state.codes_state.as_ref(), local) + .await?; + let values_len = usize::try_from(self.node.values_len) + .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?; + if let Some((compact_codes, value_selection)) = + compact_codes_and_value_selection(codes.clone(), values_len, local)? + { + let values = self + .values_read + .read_scoped( + 0..self.node.values_len, + RowScope::selected(&value_selection), + io, + state.values_state.as_ref(), + local, + ) + .await?; + return self.node.build_dict(compact_codes, values)?.optimize(); + } + + let values = self + .node + .values(Arc::clone(&self.values_read), io, state) + .await + .map_err(VortexError::from)?; + return self.node.build_dict(codes, values)?.optimize(); + } + + let values = async { + self.node + .values(Arc::clone(&self.values_read), io, state) + .await + .map_err(VortexError::from) + }; + let codes = + self.codes_read + .read_scoped(range, rows, io, state.codes_state.as_ref(), local); + let (values, codes) = try_join!(values, codes)?; + self.node.build_dict(codes, values)?.optimize() }) } @@ -274,6 +356,12 @@ impl ReadPlan for DictReadPlan { state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { + if sparse_dict_candidate(self.node.values_len, rows) { + return self + .codes_read + .segment_requests(range, rows, state.codes_state.as_ref(), cx); + } + let values_selection = Mask::new_true( 
usize::try_from(self.node.values_len) .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, @@ -319,8 +407,9 @@ impl DictExprReadPlan { let values = self .node .dict - .values(self.values_read.as_ref(), io, state, local) - .await?; + .values(Arc::clone(&self.values_read), io, state) + .await + .map_err(VortexError::from)?; let computed = values.apply(&self.node.expr).and_then(|array| { match array.clone().execute::(local) { Ok(mask) => { @@ -352,6 +441,189 @@ impl DictExprReadPlan { .insert(self.node.expr.clone(), value_expr.clone()); Ok(value_expr) } + + async fn sparse_expr( + &self, + codes: ArrayRef, + io: &FileReader, + state: &DictScanState, + local: &mut ExecutionCtx, + ) -> VortexResult> { + let values_len = usize::try_from(self.node.dict.values_len) + .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?; + let Some((compact_codes, value_selection)) = + compact_codes_and_value_selection(codes, values_len, local)? + else { + return Ok(None); + }; + + let values = self + .values_read + .read_scoped( + 0..self.node.dict.values_len, + RowScope::selected(&value_selection), + io, + state.values_state.as_ref(), + local, + ) + .await?; + let input = self + .node + .dict + .build_dict(compact_codes, values)? 
+ .optimize()?; + let computed = input + .apply(&self.node.expr) + .and_then(|array| array.execute::(local)); + match computed { + Ok(array) => Ok(Some(array)), + Err(error) => { + tracing::debug!( + predicate = %self.node.expr, + %error, + "sparse dict expression read unavailable" + ); + Ok(None) + } + } + } +} + +fn compact_codes_and_value_selection( + codes: ArrayRef, + values_len: usize, + local: &mut ExecutionCtx, +) -> VortexResult> { + let codes = codes.execute::(local)?; + let validity = codes.validity()?; + let valid = validity.execute_mask(codes.len(), local)?; + if valid.all_false() { + return Ok(None); + } + + match_each_integer_ptype!(codes.ptype(), |Code| { + compact_codes_and_value_selection_typed::( + codes.as_slice::(), + validity, + &valid, + values_len, + ) + }) +} + +fn compact_codes_and_value_selection_typed( + codes: &[Code], + validity: Validity, + valid: &Mask, + values_len: usize, +) -> VortexResult> +where + Code: NativePType + TryFrom, + usize: TryFrom, +{ + let referenced = referenced_values(codes, valid, values_len)?; + if referenced.is_empty() || referenced.len() == values_len { + return Ok(None); + } + + let compact = remap_codes(codes, valid, values_len, &referenced)?; + let value_selection = Mask::from_indices(values_len, referenced); + let compact_codes = PrimitiveArray::new(compact.freeze(), validity).into_array(); + Ok(Some((compact_codes, value_selection))) +} + +fn referenced_values( + codes: &[Code], + valid: &Mask, + values_len: usize, +) -> VortexResult> +where + Code: Copy + fmt::Display, + usize: TryFrom, +{ + let mut referenced = Vec::with_capacity(valid.true_count().min(values_len)); + match valid.bit_buffer() { + AllOr::All => { + for &code in codes { + referenced.push(checked_code_index(code, values_len)?); + } + } + AllOr::None => {} + AllOr::Some(mask) => { + for idx in mask.set_indices() { + referenced.push(checked_code_index(codes[idx], values_len)?); + } + } + } + referenced.sort_unstable(); + referenced.dedup(); 
+ Ok(referenced) +} + +fn remap_codes( + codes: &[Code], + valid: &Mask, + values_len: usize, + referenced: &[usize], +) -> VortexResult> +where + Code: Copy + Default + fmt::Display + TryFrom, + usize: TryFrom, +{ + let mut compact = BufferMut::::with_capacity(codes.len()); + match valid.bit_buffer() { + AllOr::All => { + for &code in codes { + compact.push(compact_code(code, values_len, referenced)?); + } + } + AllOr::None => compact.extend(std::iter::repeat_n(Code::default(), codes.len())), + AllOr::Some(mask) => { + let mut valid_indices = mask.set_indices(); + let mut next_valid = valid_indices.next(); + for (idx, &code) in codes.iter().enumerate() { + if next_valid == Some(idx) { + compact.push(compact_code(code, values_len, referenced)?); + next_valid = valid_indices.next(); + } else { + compact.push(Code::default()); + } + } + } + } + Ok(compact) +} + +fn checked_code_index(code: Code, values_len: usize) -> VortexResult +where + Code: Copy + fmt::Display, + usize: TryFrom, +{ + let idx = usize::try_from(code) + .map_err(|_| vortex_err!("invalid negative dictionary code {code}"))?; + if idx >= values_len { + vortex_bail!( + "dictionary code {idx} out of bounds for values length {}", + values_len + ); + } + Ok(idx) +} + +fn compact_code(code: Code, values_len: usize, referenced: &[usize]) -> VortexResult +where + Code: Copy + fmt::Display + TryFrom, + usize: TryFrom, +{ + let idx = checked_code_index(code, values_len)?; + let rank = referenced.binary_search(&idx).map_err(|_| { + vortex_err!("dictionary code {idx} missing from sparse referenced value set") + })?; + Code::try_from(rank).map_err(|_| { + vortex_err!( + "sparse dictionary code rank {rank} cannot be represented by original code type" + ) + }) } impl ReadPlan for DictExprReadPlan { @@ -375,11 +647,14 @@ impl ReadPlan for DictExprReadPlan { local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { - let value_expr = if !value_expr_is_expensive(&self.node.expr) - || 
matches!( - usize::try_from(self.node.dict.values_len), - Ok(values_len) if values_len <= rows.demand.true_count() - ) { + let sparse_candidate = + sparse_value_expr_candidate(&self.node.expr, self.node.dict.values_len, rows); + let value_expr = if !sparse_candidate + && (!value_expr_is_expensive(&self.node.expr) + || matches!( + usize::try_from(self.node.dict.values_len), + Ok(values_len) if values_len <= rows.demand.true_count() + )) { self.value_expr(io, state, local).await? } else { None @@ -395,15 +670,21 @@ impl ReadPlan for DictExprReadPlan { .execute_mask(codes.len(), local)? .all_true(); if all_valid { - return Ok(DictArray::try_new(codes, value_expr)?.into_array()); + return self.node.dict.build_dict(codes, value_expr); } } + if sparse_candidate + && let Some(result) = self.sparse_expr(codes.clone(), io, state, local).await? + { + return Ok(result); + } let values = self .node .dict - .values(self.values_read.as_ref(), io, state, local) - .await?; - let input = DictArray::try_new(codes, values)?.into_array().optimize()?; + .values(Arc::clone(&self.values_read), io, state) + .await + .map_err(VortexError::from)?; + let input = self.node.dict.build_dict(codes, values)?.optimize()?; input.apply(&self.node.expr)?.execute::(local) }) } @@ -415,6 +696,12 @@ impl ReadPlan for DictExprReadPlan { state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { + if sparse_value_expr_candidate(&self.node.expr, self.node.dict.values_len, rows) { + return self + .codes_read + .segment_requests(range, rows, state.codes_state.as_ref(), cx); + } + let values_selection = Mask::new_true( usize::try_from(self.node.dict.values_len) .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index 029cd59ae31..927f958f368 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -20,6 +20,7 @@ use 
vortex_array::expr::is_root; use vortex_array::expr::root; use vortex_array::expr::transform::replace; use vortex_array::scalar_fn::fns::get_item::GetItem; +use vortex_array::scalar_fn::fns::pack::Pack; use vortex_array::scalar_fn::fns::root::Root; use vortex_array::scalar_fn::fns::select::Select; use vortex_error::VortexResult; @@ -95,6 +96,12 @@ impl ScanNode for StructScanNode { let names = selection.normalize_to_included_fields(scope.names())?; return self.push_struct(names, cx).map(Some); } + if let Some(pack) = expr.as_opt::() + && pack.names.len() == 1 + && expr.child(0).is::() + { + return self.push_struct(pack.names.clone(), cx).map(Some); + } let fields = referenced_fields(expr, &scope); if let [name] = fields.as_slice() { let scoped = replace(expr.clone(), &get_item(name.clone(), root()), root()); diff --git a/vortex-layout/src/scan/v2/node.rs b/vortex-layout/src/scan/v2/node.rs index e77fb6149ac..f0564ef6a5b 100644 --- a/vortex-layout/src/scan/v2/node.rs +++ b/vortex-layout/src/scan/v2/node.rs @@ -54,6 +54,7 @@ use crate::segments::SegmentRequests; use crate::segments::SegmentSource; /// Per-file/query IO context for scan2 reads. 
+#[derive(Clone)] pub struct FileReader { segments: Arc, session: VortexSession, From 99dd46a738f1d85893683a93d6924a7245297466 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 15:07:23 -0400 Subject: [PATCH 11/48] Widen sparse dict reads for filtered projections Signed-off-by: Nicholas Gates --- vortex-layout/src/scan/v2/layouts/dict.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 3303ba04f35..896a654e3fe 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -113,8 +113,6 @@ struct DictExprReadPlan { codes_read: ReadPlanRef, } -const SPARSE_DICT_VALUES_DENSITY_THRESHOLD: f64 = 0.2; - fn value_expr_is_expensive(expr: &Expression) -> bool { matches!( expr.id().as_str(), @@ -129,7 +127,7 @@ fn value_expr_is_expensive(expr: &Expression) -> bool { fn sparse_dict_candidate(values_len: u64, rows: RowScope<'_>) -> bool { rows.demands_all_selected() - && rows.selection.density() < SPARSE_DICT_VALUES_DENSITY_THRESHOLD + && !rows.selection.all_true() && matches!( usize::try_from(values_len), Ok(values_len) if values_len > rows.demand.true_count() From 84a4d7ed9e2bba46d1cd5e239d2913212f4be16c Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 15:39:56 -0400 Subject: [PATCH 12/48] Avoid dense sparse dict compaction Signed-off-by: Nicholas Gates --- vortex-layout/src/scan/v2/layouts/dict.rs | 1 + vortex-layout/src/scan/v2/layouts/flat.rs | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 896a654e3fe..9bca52e594d 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -128,6 +128,7 @@ fn value_expr_is_expensive(expr: &Expression) -> bool { fn sparse_dict_candidate(values_len: u64, rows: RowScope<'_>) -> bool { 
rows.demands_all_selected() && !rows.selection.all_true() + && rows.selection.density() < 0.5 && matches!( usize::try_from(values_len), Ok(values_len) if values_len > rows.demand.true_count() diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 8932faec41d..3bcc3be627a 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -148,9 +148,17 @@ impl ReadPlan for FlatReadPlan { &self, _range: Range, _rows: RowScope<'_>, - _state: &Self::State, + state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { + if downcast_state::(state.as_ref())? + .array + .lock() + .is_some() + { + return Ok(SegmentRequests::none()); + } + let Some(flat) = self.node.layout.as_opt::() else { vortex_bail!( "expected flat layout, got {}", From 28bc2b4ef4e78a6659044f9ea8d148f8fd233586 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 16:25:05 -0400 Subject: [PATCH 13/48] Share dict scan value caches Signed-off-by: Nicholas Gates --- vortex-layout/src/scan/v2/layouts/dict.rs | 92 +++++++++++++++++------ 1 file changed, 68 insertions(+), 24 deletions(-) diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 9bca52e594d..146cdefebc3 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -59,6 +59,7 @@ use crate::scan::v2::node::ScanNode; use crate::scan::v2::node::ScanNodeRef; use crate::scan::v2::node::ScanStateRef; use crate::scan::v2::node::StateCtx; +use crate::scan::v2::node::downcast_state; use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; @@ -88,11 +89,44 @@ pub struct DictScanNode { /// Per-query state: the cached values relation, the child states, and /// cached value-domain expression results. 
+#[derive(Clone)] pub struct DictScanState { - values: Mutex>, + shared: DictSharedState, values_state: ScanStateRef, codes_state: ScanStateRef, - value_exprs: Mutex>>, +} + +#[derive(Clone)] +struct DictSharedState { + values: Arc>>, + value_exprs: Arc>>>, +} + +impl DictScanState { + fn new(values_state: ScanStateRef, codes_state: ScanStateRef) -> Self { + Self { + shared: DictSharedState::default(), + values_state, + codes_state, + } + } + + fn with_child_states(&self, values_state: ScanStateRef, codes_state: ScanStateRef) -> Self { + Self { + shared: self.shared.clone(), + values_state, + codes_state, + } + } +} + +impl Default for DictSharedState { + fn default() -> Self { + Self { + values: Arc::new(Mutex::new(None)), + value_exprs: Arc::new(Mutex::new(FxHashMap::default())), + } + } } /// A pushed scalar expression over a dictionary value. @@ -147,11 +181,11 @@ impl DictScanNode { io: &FileReader, state: &DictScanState, ) -> SharedArrayFuture { - if let Some(hit) = state.values.lock().clone() { + if let Some(hit) = state.shared.values.lock().clone() { return hit; } - let mut guard = state.values.lock(); + let mut guard = state.shared.values.lock(); if let Some(hit) = guard.clone() { return hit; } @@ -196,12 +230,10 @@ impl ScanNode for DictScanNode { type State = DictScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(DictScanState { - values: Mutex::new(None), - values_state: cx.init_node(&self.values)?, - codes_state: cx.init_node(&self.codes)?, - value_exprs: Mutex::new(FxHashMap::default()), - }) + Ok(DictScanState::new( + cx.init_node(&self.values)?, + cx.init_node(&self.codes)?, + )) } fn try_push_expr( @@ -256,7 +288,8 @@ impl ScanNode for DictExprScanNode { type State = DictScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - self.dict.init_state(cx) + let node: ScanNodeRef = Arc::::clone(&self.dict); + Ok(downcast_state::(cx.init_node(&node)?.as_ref())?.clone()) } fn plan_read(self: Arc, cx: &mut 
PlanCtx) -> VortexResult> { @@ -286,12 +319,14 @@ impl ReadPlan for DictReadPlan { type State = DictScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(DictScanState { - values: Mutex::new(None), - values_state: self.values_read.init_state(cx)?, - codes_state: self.codes_read.init_state(cx)?, - value_exprs: Mutex::new(FxHashMap::default()), - }) + let node: ScanNodeRef = Arc::::clone(&self.node); + let base = cx.init_node(&node)?; + Ok( + downcast_state::(base.as_ref())?.with_child_states( + self.values_read.init_state(cx)?, + self.codes_read.init_state(cx)?, + ), + ) } fn read_scoped<'a>( @@ -400,7 +435,13 @@ impl DictExprReadPlan { state: &DictScanState, local: &mut ExecutionCtx, ) -> VortexResult> { - if let Some(hit) = state.value_exprs.lock().get(&self.node.expr).cloned() { + if let Some(hit) = state + .shared + .value_exprs + .lock() + .get(&self.node.expr) + .cloned() + { return Ok(hit); } let values = self @@ -435,6 +476,7 @@ impl DictExprReadPlan { } }; state + .shared .value_exprs .lock() .insert(self.node.expr.clone(), value_expr.clone()); @@ -629,12 +671,14 @@ impl ReadPlan for DictExprReadPlan { type State = DictScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(DictScanState { - values: Mutex::new(None), - values_state: self.values_read.init_state(cx)?, - codes_state: self.codes_read.init_state(cx)?, - value_exprs: Mutex::new(FxHashMap::default()), - }) + let node: ScanNodeRef = Arc::::clone(&self.node.dict); + let base = cx.init_node(&node)?; + Ok( + downcast_state::(base.as_ref())?.with_child_states( + self.values_read.init_state(cx)?, + self.codes_read.init_state(cx)?, + ), + ) } fn read_scoped<'a>( From 1ee55db72a453f7e3f79eba250162b6d192ee4ef Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 20:40:24 -0400 Subject: [PATCH 14/48] Share prepared scan state Signed-off-by: Nicholas Gates --- vortex-file/src/multi/scan_v2.rs | 219 ++--- vortex-layout/src/scan/v2/evidence.rs | 2 +- 
vortex-layout/src/scan/v2/layouts/chunked.rs | 471 +++++------ vortex-layout/src/scan/v2/layouts/dict.rs | 186 ++--- vortex-layout/src/scan/v2/layouts/flat.rs | 50 +- vortex-layout/src/scan/v2/layouts/struct_.rs | 9 +- vortex-layout/src/scan/v2/layouts/zoned.rs | 299 +++---- vortex-layout/src/scan/v2/node.rs | 812 +++++++++---------- vortex-layout/src/scan/v2/request.rs | 37 +- 9 files changed, 899 insertions(+), 1186 deletions(-) diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 5c17cc029d1..f005aa0c7e4 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -51,25 +51,23 @@ use vortex_layout::scan::v2::evidence::PredicateEvidence; use vortex_layout::scan::v2::evidence::PredicateEvidenceKind; use vortex_layout::scan::v2::evidence::PredicateId; use vortex_layout::scan::v2::evidence::PredicateVersion; -use vortex_layout::scan::v2::node::AggregatePlanRef; -use vortex_layout::scan::v2::node::EvidencePlanRef; -use vortex_layout::scan::v2::node::EvidenceStateKey; use vortex_layout::scan::v2::node::ExpandCtx; use vortex_layout::scan::v2::node::FileReader; -use vortex_layout::scan::v2::node::PlanCtx; +use vortex_layout::scan::v2::node::OwnedRowScope; +use vortex_layout::scan::v2::node::PrepareCtx; +use vortex_layout::scan::v2::node::PreparedAggregateRef; +use vortex_layout::scan::v2::node::PreparedEvidenceRef; +use vortex_layout::scan::v2::node::PreparedReadRef; +use vortex_layout::scan::v2::node::PreparedStats; +use vortex_layout::scan::v2::node::PreparedStatsRef; use vortex_layout::scan::v2::node::PushCtx; -use vortex_layout::scan::v2::node::ReadPlanRef; -use vortex_layout::scan::v2::node::RowScope; use vortex_layout::scan::v2::node::ScanNode; use vortex_layout::scan::v2::node::ScanNodeRef; -use vortex_layout::scan::v2::node::ScanStateCache; use vortex_layout::scan::v2::node::ScanStateRef; use vortex_layout::scan::v2::node::StateCtx; -use vortex_layout::scan::v2::node::StatsPlan; -use 
vortex_layout::scan::v2::node::StatsPlanRef; use vortex_layout::scan::v2::request::EvidenceMode; -use vortex_layout::scan::v2::request::EvidenceRequest; use vortex_layout::scan::v2::request::NodeRequest; +use vortex_layout::scan::v2::request::OwnedEvidenceRequest; use vortex_layout::scan::v2::validate_temporal_comparisons; use vortex_layout::segments::ScanIoPhase; use vortex_layout::segments::ScheduledSegmentSource; @@ -99,7 +97,6 @@ use vortex_scan::SegmentSourceMeta; use vortex_scan::WorkRequest; use vortex_scan::selection::Selection; use vortex_session::VortexSession; -use vortex_utils::aliases::hash_map::HashMap; use vortex_utils::parallelism::get_available_parallelism; use super::MultiFileDataSource; @@ -132,7 +129,7 @@ struct FileStatsExprScanNode { row_count: u64, } -struct FileStatsPlan { +struct FilePreparedStats { stats: StatsSet, field_dtype: DType, row_count: u64, @@ -190,20 +187,23 @@ impl ScanNode for FileStatsScanNode { }))) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - Arc::clone(&self.data).plan_read(cx) + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + Arc::clone(&self.data).prepare_read(cx) } - fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - Arc::clone(&self.data).plan_evidence(cx) + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + Arc::clone(&self.data).prepare_evidence(cx) } - fn plan_aggregate_partial( + fn prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], - cx: &mut PlanCtx, - ) -> VortexResult> { - Arc::clone(&self.data).plan_aggregate_partial(funcs, cx) + cx: &mut PrepareCtx, + ) -> VortexResult> { + Arc::clone(&self.data).prepare_aggregate_partial(funcs, cx) } fn split_hints(&self) -> Option<&[u64]> { @@ -235,29 +235,32 @@ impl ScanNode for FileStatsExprScanNode { Arc::clone(&self.data).try_push_expr(expr, cx) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - Arc::clone(&self.data).plan_read(cx) + fn 
prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + Arc::clone(&self.data).prepare_read(cx) } - fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - Arc::clone(&self.data).plan_evidence(cx) + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + Arc::clone(&self.data).prepare_evidence(cx) } - fn plan_aggregate_partial( + fn prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], - cx: &mut PlanCtx, - ) -> VortexResult> { - Arc::clone(&self.data).plan_aggregate_partial(funcs, cx) + cx: &mut PrepareCtx, + ) -> VortexResult> { + Arc::clone(&self.data).prepare_aggregate_partial(funcs, cx) } - fn plan_stats( + fn prepare_stats( self: Arc, funcs: &[AggregateFnRef], - _cx: &mut PlanCtx, - ) -> VortexResult> { + _cx: &mut PrepareCtx, + ) -> VortexResult> { let stats = self.stats.stats_sets()[self.field_idx].clone(); - Ok(Some(Arc::new(FileStatsPlan { + Ok(Some(Arc::new(FilePreparedStats { stats, field_dtype: self.field_dtype.clone(), row_count: self.row_count, @@ -279,7 +282,7 @@ impl ScanNode for FileStatsExprScanNode { } } -impl StatsPlan for FileStatsPlan { +impl PreparedStats for FilePreparedStats { type State = (); fn init_state(&self, _ctx: &VortexSession) -> VortexResult { @@ -303,12 +306,12 @@ impl StatsPlan for FileStatsPlan { }) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "file_stats") } } -impl FileStatsPlan { +impl FilePreparedStats { fn stat_for_func(&self, func: &AggregateFnRef) -> VortexResult> { let Some(stat) = Stat::from_aggregate_fn(func) else { return Ok(Precision::Absent); @@ -903,7 +906,8 @@ pub(crate) async fn scan_node_file_statistics_many( let mut result = Vec::with_capacity(exprs.len()); for expr in exprs { let pushed = push_expr(&root, expr, file.dtype(), reader.session())?; - let Some(plan) = pushed.plan_stats(funcs, &mut PlanCtx::new(reader.session().clone()))? 
+ let Some(plan) = + pushed.prepare_stats(funcs, &mut PrepareCtx::new(reader.session().clone()))? else { result.push(absent_statistics(funcs)); continue; @@ -930,7 +934,7 @@ pub(crate) async fn scan_node_file_plan_splits( let session = file.session().clone(); let root = expand_file_root(&file, &session)?; let pushed = push_expr(&root, projection, file.dtype(), &session)?; - let Some(plan) = pushed.plan_splits(&mut PlanCtx::new(session.clone()))? else { + let Some(plan) = pushed.prepare_splits(&mut PrepareCtx::new(session.clone()))? else { return Ok(std::iter::once(0..file.row_count()).collect()); }; let reader = FileReader::new(file.segment_source(), session.clone()); @@ -1701,17 +1705,15 @@ struct PreparedScanNodeFile { scheduled_segment_source: Arc, segment_future_cache: Arc, root: ScanNodeRef, - projection: ReadPlanRef, - projection_state: ScanStateRef, - predicates: Vec, + projection: PreparedReadRef, + predicates: Vec, } -struct PredicatePlan { +struct PreparedPredicate { id: PredicateId, expr: Expression, - read: ReadPlanRef, - read_state: ScanStateRef, - evidence: Vec<(EvidencePlanRef, ScanStateRef)>, + read: PreparedReadRef, + evidence: Vec, } struct RegisteredScheduledSegmentSource { @@ -1749,13 +1751,10 @@ impl PreparedScanNodeFile { session.clone(), ); - let mut node_cache = ScanStateCache::default(); - let mut state_ctx = StateCtx::new(&session, &mut node_cache); + let mut prepare_ctx = PrepareCtx::new(session.clone()); + let projection_plan = + prepare_read(&root, &projection, file.dtype(), &session, &mut prepare_ctx)?; - let projection_plan = plan_read(&root, &projection, file.dtype(), &session)?; - let projection_state = projection_plan.init_state(&mut state_ctx)?; - - let mut evidence_state_cache: HashMap = HashMap::default(); // Run cheap, likely-selective conjuncts first so an expensive residual (e.g. an FSST `LIKE`) // only evaluates over the rows that survive the cheaper predicates. 
AND is commutative, so // reordering is semantically safe; `PredicateId`s are assigned by final slot below (after the @@ -1771,32 +1770,13 @@ impl PreparedScanNodeFile { ); let pushed = push_expr(&root, &expr, file.dtype(), &session)?; let read = Arc::clone(&pushed) - .plan_read(&mut PlanCtx::new(session.clone()))? + .prepare_read(&mut prepare_ctx)? .ok_or_else(|| vortex_err!("scan2 could not plan predicate read {expr}"))?; - let read_state = read.init_state(&mut state_ctx)?; - let evidence = pushed - .plan_evidence(&mut PlanCtx::new(session.clone()))? - .into_iter() - .map(|plan| { - let state = if let Some(key) = plan.state_cache_key() { - if let Some(state) = evidence_state_cache.get(&key) { - Arc::clone(state) - } else { - let state = plan.init_state(&session)?; - evidence_state_cache.insert(key, Arc::clone(&state)); - state - } - } else { - plan.init_state(&session)? - }; - Ok((plan, state)) - }) - .collect::>>()?; - Ok(PredicatePlan { + let evidence = pushed.prepare_evidence(&mut prepare_ctx)?; + Ok(PreparedPredicate { id, expr, read, - read_state, evidence, }) }) @@ -1817,7 +1797,6 @@ impl PreparedScanNodeFile { segment_future_cache, root, projection: projection_plan, - projection_state, predicates, }) } @@ -1878,17 +1857,20 @@ impl PreparedScanNodeFile { ) -> VortexResult { let predicate = &self.predicates[predicate_idx]; let mut registered = SubmittedSegmentRequests::default(); - let req = EvidenceRequest { + let req = OwnedEvidenceRequest { id: predicate.id, version: PredicateVersion::STATIC, - predicate: &predicate.expr, + predicate: predicate.expr.clone(), range: range.clone(), mode: EvidenceMode::Normal, }; - for (plan, state) in &predicate.evidence { + let mut tasks = Vec::with_capacity(predicate.evidence.len()); + for plan in &predicate.evidence { + let task = Arc::clone(plan).begin_evidence(req.clone())?; let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); - let requests = plan.segment_requests(&req, state.as_ref(), &mut 
segment_ctx)?; + let requests = task.segment_requests(&mut segment_ctx)?; registered.extend(self.submit_segment_requests(requests)); + tasks.push(task); } let prepared = Arc::clone(self); @@ -1900,18 +1882,8 @@ impl PreparedScanNodeFile { let predicate = &prepared.predicates[predicate_idx]; let mut acc = PredicateEvidence::new(predicate.id, PredicateVersion::STATIC, range.clone())?; - let req = EvidenceRequest { - id: predicate.id, - version: PredicateVersion::STATIC, - predicate: &predicate.expr, - range: range.clone(), - mode: EvidenceMode::Normal, - }; - for (plan, state) in &predicate.evidence { - for fragment in plan - .evidence(&req, &prepared.reader, state.as_ref()) - .await? - { + for task in tasks { + for fragment in task.evidence(&prepared.reader).await? { acc.absorb(fragment)?; } if acc.all_false() { @@ -1937,16 +1909,16 @@ impl PreparedScanNodeFile { need: Mask, ) -> VortexResult { let len = range_len(&range)?; - let full_domain = Mask::new_true(len); - let rows = RowScope::try_new(&full_domain, &need)?; let predicate = &self.predicates[predicate_idx]; + let compact = need.density() < EXPR_EVAL_THRESHOLD; + let rows = if compact { + OwnedRowScope::selected(need.clone()) + } else { + OwnedRowScope::try_new(Mask::new_true(len), need.clone())? + }; + let task = Arc::clone(&predicate.read).begin_read(range.clone(), rows)?; let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::PredicateRead); - let requests = predicate.read.segment_requests( - range.clone(), - rows, - predicate.read_state.as_ref(), - &mut segment_ctx, - )?; + let requests = task.segment_requests(&mut segment_ctx)?; let registered = self.submit_segment_requests(requests); let prepared = Arc::clone(self); @@ -1962,16 +1934,9 @@ impl PreparedScanNodeFile { // `LIKE`) evaluates over only `need.true_count()` rows. The compacted verdict is // scattered back into the morsel domain via `intersect_by_rank`, giving a full-length // mask identical to the dense path's `result & need`. 
Mirrors V1's flat-reader gate. - let result = if need.density() < EXPR_EVAL_THRESHOLD { - let compact = predicate - .read - .read_scoped( - range.clone(), - RowScope::selected(&need), - &prepared.reader, - predicate.read_state.as_ref(), - &mut ctx, - ) + let result = if compact { + let compact = task + .read(&prepared.reader, &mut ctx) .await? .execute::(&mut ctx)?; if compact.len() != need.true_count() { @@ -1983,17 +1948,8 @@ impl PreparedScanNodeFile { } need.intersect_by_rank(&compact) } else { - let full_domain = Mask::new_true(len); - let rows = RowScope::try_new(&full_domain, &need)?; - predicate - .read - .read_scoped( - range.clone(), - rows, - &prepared.reader, - predicate.read_state.as_ref(), - &mut ctx, - ) + task + .read(&prepared.reader, &mut ctx) .await? .execute::(&mut ctx)? }; @@ -2047,13 +2003,10 @@ impl PreparedScanNodeFile { ); } + let task = + Arc::clone(&self.projection).begin_read(range, OwnedRowScope::selected(selected))?; let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::ProjectionRead); - let requests = self.projection.segment_requests( - range.clone(), - RowScope::selected(&selected), - self.projection_state.as_ref(), - &mut segment_ctx, - )?; + let requests = task.segment_requests(&mut segment_ctx)?; let registered = self.submit_segment_requests(requests); let prepared = Arc::clone(self); @@ -2064,16 +2017,7 @@ impl PreparedScanNodeFile { registered, async move { let mut ctx = prepared.session.create_execution_ctx(); - let array = prepared - .projection - .read_scoped( - range, - RowScope::selected(&selected), - &prepared.reader, - prepared.projection_state.as_ref(), - &mut ctx, - ) - .await?; + let array = task.read(&prepared.reader, &mut ctx).await?; Ok(ProjectionWorkOutput { morsel_id, array }) } .boxed(), @@ -2130,14 +2074,15 @@ fn push_expr( .ok_or_else(|| vortex_err!("scan2 could not push expression {expr}")) } -fn plan_read( +fn prepare_read( root: &ScanNodeRef, expr: &Expression, dtype: &DType, session: &VortexSession, -) 
-> VortexResult { + cx: &mut PrepareCtx, +) -> VortexResult { push_expr(root, expr, dtype, session)? - .plan_read(&mut PlanCtx::new(session.clone()))? + .prepare_read(cx)? .ok_or_else(|| vortex_err!("scan2 could not plan read for expression {expr}")) } diff --git a/vortex-layout/src/scan/v2/evidence.rs b/vortex-layout/src/scan/v2/evidence.rs index 89a9a477f3f..2440708cd1f 100644 --- a/vortex-layout/src/scan/v2/evidence.rs +++ b/vortex-layout/src/scan/v2/evidence.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Predicate evidence: coverage-bearing answers for planned predicates. +//! Predicate evidence: coverage-bearing answers for prepared predicates. //! //! A scan2 predicate is answered at runtime by *evidence fragments*: //! row ranges paired with what a producer proves about the diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index 88af1db5998..547b2166628 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -47,23 +47,23 @@ use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::scan::v2::evidence::EvidenceFragment; use crate::scan::v2::node::AggregateAnswer; -use crate::scan::v2::node::AggregatePlan; -use crate::scan::v2::node::AggregatePlanRef; -use crate::scan::v2::node::EvidencePlan; -use crate::scan::v2::node::EvidencePlanRef; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::PlanCtx; +use crate::scan::v2::node::PrepareCtx; +use crate::scan::v2::node::PreparedAggregate; +use crate::scan::v2::node::PreparedAggregateRef; +use crate::scan::v2::node::PreparedEvidence; +use crate::scan::v2::node::PreparedEvidenceRef; +use crate::scan::v2::node::PreparedRead; +use crate::scan::v2::node::PreparedReadRef; +use crate::scan::v2::node::PreparedStateCacheRef; +use crate::scan::v2::node::PreparedStateKey; use 
crate::scan::v2::node::PushCtx; -use crate::scan::v2::node::ReadPlan; -use crate::scan::v2::node::ReadPlanRef; use crate::scan::v2::node::RowScope; use crate::scan::v2::node::ScanNode; use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::ScanStateCache; use crate::scan::v2::node::ScanStateRef; use crate::scan::v2::node::StateCtx; -use crate::scan::v2::node::downcast_state; use crate::scan::v2::request::EvidenceMode; use crate::scan::v2::request::EvidenceRequest; use crate::scan::v2::request::NodeRequest; @@ -80,7 +80,6 @@ pub(crate) fn new_scan_node( offsets: layout.data().chunk_offsets().to_vec(), cx: cx.clone(), children: Mutex::new(FxHashMap::default()), - reads: Mutex::new(FxHashMap::default()), })) } @@ -93,8 +92,6 @@ pub struct ChunkedScanNode { cx: ExpandCtx, /// Lazily expanded chunk nodes, shared across queries. children: Mutex>, - /// Lazily planned chunk reads, shared across queries. - reads: Mutex>, } /// Per-query states of the lazily expanded chunk nodes. Chunk states @@ -103,8 +100,8 @@ pub struct ChunkedScanNode { /// every chunk it touched. #[derive(Default)] pub struct ChunkedScanState { - children: Mutex>, - node_states: Mutex>, + reads: Mutex>, + child_state_caches: Mutex>, /// Every chunk whose state was ever created (never cleared by /// release), for read-avoidance tests. #[cfg(any(test, debug_assertions))] @@ -124,19 +121,19 @@ pub struct ChunkedExprScanNode { expr: Expression, dtype: DType, children: Mutex>, - reads: Mutex>, } /// Per-query states of lazily pushed chunk children. 
pub struct ChunkedExprScanState { - chunked: ScanStateRef, - children: Mutex>, + chunked: Arc, + reads: Mutex>, #[cfg(debug_assertions)] released: AtomicU64, } -struct ChunkedEvidencePlan { +struct ChunkedPreparedEvidence { node: Arc, + state: Arc, } enum ChunkedAggregateNode { @@ -144,37 +141,50 @@ enum ChunkedAggregateNode { Expr(Arc), } -struct ChunkedAggregatePlan { +struct ChunkedPreparedAggregate { node: ChunkedAggregateNode, + chunked_state: Arc, dtype: DType, funcs: Vec, } -struct ChunkedReadPlan { +struct ChunkedPreparedRead { node: Arc, + state: Arc, } -struct ChunkedExprReadPlan { +struct ChunkedExprPreparedRead { node: Arc, + state: Arc, } -#[derive(Default)] struct ChunkedEvidenceState { - children: Mutex>>, - recheck_children: Mutex>>, + chunked: Arc, + children: Mutex>>, + recheck_children: Mutex>>, } #[derive(Default)] struct ChunkedAggregateState { - children: Mutex>>, + children: Mutex>>, } impl ChunkedScanState { + fn child_prepare_ctx(&self, idx: usize, session: &VortexSession) -> PrepareCtx { + if let Some(hit) = self.child_state_caches.lock().get(&idx) { + return PrepareCtx::with_state_cache(session.clone(), Arc::clone(hit)); + } + let cache = Default::default(); + let mut caches = self.child_state_caches.lock(); + let cache = Arc::clone(caches.entry(idx).or_insert(cache)); + PrepareCtx::with_state_cache(session.clone(), cache) + } + /// The number of chunk states currently retained. 
#[allow(dead_code)] #[cfg(any(test, debug_assertions))] pub fn retained_children(&self) -> usize { - self.children.lock().len() + self.reads.lock().len() } /// Whether chunk `idx` was ever read this query (release does not @@ -186,7 +196,23 @@ impl ChunkedScanState { } } +impl ChunkedEvidenceState { + fn new(chunked: Arc) -> Self { + Self { + chunked, + children: Mutex::new(FxHashMap::default()), + recheck_children: Mutex::new(FxHashMap::default()), + } + } +} + impl ChunkedScanNode { + fn scan_state(&self, cx: &mut PrepareCtx) -> VortexResult> { + let key = + PreparedStateKey::new::(self as *const Self as *const () as usize); + cx.shared_state(key, || Ok(ChunkedScanState::default())) + } + /// The scan node for chunk `idx`, expanding it on first use. Lazy /// expansion is independent of pushed predicate expressions. fn child(&self, idx: usize) -> VortexResult { @@ -199,38 +225,24 @@ impl ChunkedScanNode { } /// The planned value read for chunk `idx`, creating it on first use. - fn child_read(&self, idx: usize, session: &VortexSession) -> VortexResult { - if let Some(hit) = self.reads.lock().get(&idx) { - return Ok(Arc::clone(hit)); - } - let node = self.child(idx)?; - let mut cx = PlanCtx::new(session.clone()); - let read = node - .plan_read(&mut cx)? - .ok_or_else(|| vortex_err!("chunked child {idx} did not produce a read plan"))?; - let mut reads = self.reads.lock(); - Ok(Arc::clone(reads.entry(idx).or_insert(read))) - } - - /// Chunk `idx`'s per-query state, creating it on first use. 
- fn child_read_state( + fn child_read( &self, idx: usize, - read: &ReadPlanRef, state: &ChunkedScanState, session: &VortexSession, - ) -> VortexResult { - if let Some(hit) = state.children.lock().get(&idx) { + ) -> VortexResult { + if let Some(hit) = state.reads.lock().get(&idx) { return Ok(Arc::clone(hit)); } - let mut caches = state.node_states.lock(); - let cache = caches.entry(idx).or_default(); - let mut cx = StateCtx::new(session, cache); - let child_state = read.init_state(&mut cx)?; - state.children.lock().insert(idx, Arc::clone(&child_state)); + let node = self.child(idx)?; + let mut cx = state.child_prepare_ctx(idx, session); + let read = node + .prepare_read(&mut cx)? + .ok_or_else(|| vortex_err!("chunked child {idx} did not produce a prepared read"))?; + let mut reads = state.reads.lock(); #[cfg(any(test, debug_assertions))] state.created.lock().insert(idx); - Ok(child_state) + Ok(Arc::clone(reads.entry(idx).or_insert(read))) } fn first_chunk(&self, start: u64) -> usize { @@ -247,7 +259,6 @@ impl ChunkedExprScanNode { expr, dtype, children: Mutex::new(FxHashMap::default()), - reads: Mutex::new(FxHashMap::default()), } } @@ -268,39 +279,22 @@ impl ChunkedExprScanNode { } /// The planned value read for pushed chunk child `idx`. 
- fn child_read(&self, idx: usize, session: &VortexSession) -> VortexResult { - if let Some(hit) = self.reads.lock().get(&idx) { - return Ok(Arc::clone(hit)); - } - let node = self.child(idx, session)?; - let mut cx = PlanCtx::new(session.clone()); - let read = node.plan_read(&mut cx)?.ok_or_else(|| { - vortex_err!("chunked expression child {idx} did not produce a read plan") - })?; - let mut reads = self.reads.lock(); - Ok(Arc::clone(reads.entry(idx).or_insert(read))) - } - - fn child_read_state( + fn child_read( &self, idx: usize, - read: &ReadPlanRef, state: &ChunkedExprScanState, session: &VortexSession, - ) -> VortexResult { - if let Some(hit) = state.children.lock().get(&idx) { + ) -> VortexResult { + if let Some(hit) = state.reads.lock().get(&idx) { return Ok(Arc::clone(hit)); } - let chunked_state = state - .chunked - .downcast_ref::() - .ok_or_else(|| vortex_err!("chunked expression state type mismatch"))?; - let mut caches = chunked_state.node_states.lock(); - let cache = caches.entry(idx).or_default(); - let mut cx = StateCtx::new(session, cache); - let child_state = read.init_state(&mut cx)?; - let mut children = state.children.lock(); - Ok(Arc::clone(children.entry(idx).or_insert(child_state))) + let node = self.child(idx, session)?; + let mut cx = state.chunked.child_prepare_ctx(idx, session); + let read = node.prepare_read(&mut cx)?.ok_or_else(|| { + vortex_err!("chunked expression child {idx} did not produce a prepared read") + })?; + let mut reads = state.reads.lock(); + Ok(Arc::clone(reads.entry(idx).or_insert(read))) } } @@ -327,19 +321,19 @@ impl ChunkedAggregateNode { } } -impl ChunkedAggregatePlan { +impl ChunkedPreparedAggregate { fn child_plan( &self, idx: usize, state: &ChunkedAggregateState, io: &FileReader, - ) -> VortexResult> { + ) -> VortexResult> { if let Some(hit) = state.children.lock().get(&idx) { return Ok(hit.clone()); } let child = self.node.child(idx, io)?; - let mut plan_ctx = PlanCtx::new(io.session().clone()); - let planned 
= match child.plan_aggregate_partial(&self.funcs, &mut plan_ctx)? { + let mut plan_ctx = self.chunked_state.child_prepare_ctx(idx, io.session()); + let planned = match child.prepare_aggregate_partial(&self.funcs, &mut plan_ctx)? { Some(plan) => { let plan_state = plan.init_state(io.session())?; Some((plan, plan_state)) @@ -351,7 +345,7 @@ impl ChunkedAggregatePlan { } } -impl AggregatePlan for ChunkedAggregatePlan { +impl PreparedAggregate for ChunkedPreparedAggregate { type State = ChunkedAggregateState; fn init_state(&self, _ctx: &VortexSession) -> VortexResult { @@ -451,7 +445,7 @@ impl AggregatePlan for ChunkedAggregatePlan { }) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "chunked") } } @@ -463,8 +457,9 @@ impl ScanNode for ChunkedScanNode { Ok(ChunkedScanState::default()) } - fn plan_read(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { - Ok(Some(Arc::new(ChunkedReadPlan { node: self }))) + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let state = self.scan_state(cx)?; + Ok(Some(Arc::new(ChunkedPreparedRead { node: self, state }))) } fn try_push_expr( @@ -483,26 +478,34 @@ impl ScanNode for ChunkedScanNode { )))) } - fn plan_evidence(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { - Ok(vec![Arc::new(ChunkedEvidencePlan { - node: Arc::new(ChunkedExprScanNode::new( - Arc::clone(&self), - root(), - self.layout.dtype().clone(), - )), - })]) + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + let node = Arc::new(ChunkedExprScanNode::new( + Arc::clone(&self), + root(), + self.layout.dtype().clone(), + )); + let chunked_state = self.scan_state(cx)?; + let key = + PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let state = cx.shared_state(key, || Ok(ChunkedEvidenceState::new(chunked_state)))?; + Ok(vec![Arc::new(ChunkedPreparedEvidence { node, state })]) } - fn plan_aggregate_partial( + fn 
prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], - _cx: &mut PlanCtx, - ) -> VortexResult> { + cx: &mut PrepareCtx, + ) -> VortexResult> { if funcs.is_empty() { return Ok(None); } - Ok(Some(Arc::new(ChunkedAggregatePlan { + let chunked_state = self.scan_state(cx)?; + Ok(Some(Arc::new(ChunkedPreparedAggregate { node: ChunkedAggregateNode::Root(Arc::clone(&self)), + chunked_state, dtype: self.layout.dtype().clone(), funcs: funcs.to_vec(), }))) @@ -518,19 +521,18 @@ impl ScanNode for ChunkedScanNode { /// hold no data. fn release(&self, frontier: u64, state: &ChunkedScanState) -> VortexResult<()> { state - .children + .reads .lock() .retain(|&idx, _| self.offsets[idx + 1] > frontier); state - .node_states + .child_state_caches .lock() .retain(|&idx, _| self.offsets[idx + 1] > frontier); let idx = self.first_chunk(frontier); if idx + 1 < self.offsets.len() && self.offsets[idx] < frontier { - let child_state = state.children.lock().get(&idx).cloned(); - let child = self.reads.lock().get(&idx).cloned(); - if let (Some(child), Some(child_state)) = (child, child_state) { - child.release(frontier - self.offsets[idx], child_state.as_ref())?; + let child = state.reads.lock().get(&idx).cloned(); + if let Some(child) = child { + child.release(frontier - self.offsets[idx])?; } } #[cfg(debug_assertions)] @@ -543,14 +545,7 @@ impl ScanNode for ChunkedScanNode { } } -impl ReadPlan for ChunkedReadPlan { - type State = ScanStateRef; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - let node: ScanNodeRef = Arc::::clone(&self.node); - cx.init_node(&node) - } - +impl PreparedRead for ChunkedPreparedRead { /// The chunked scoped read: slice the selection and demand per /// overlapping chunk, skip chunks whose selection is all-false, and /// represent selected-but-undemanded chunks with dtype-default filler @@ -560,20 +555,15 @@ impl ReadPlan for ChunkedReadPlan { range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local_ctx: 
&'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { - let state = match downcast_state::(state.as_ref()) { - Ok(state) => state, - Err(e) => return Box::pin(async move { Err(e) }), - }; Box::pin(async move { if range.start >= range.end { vortex_bail!("empty chunked scoped read range"); } #[cfg(debug_assertions)] { - let released = state.released.load(Ordering::Relaxed); + let released = self.state.released.load(Ordering::Relaxed); debug_assert!( range.start >= released, "chunked read {range:?} below the released frontier {released}" @@ -631,23 +621,13 @@ impl ReadPlan for ChunkedReadPlan { continue; } let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, io.session())?; - let child_state = - self.node - .child_read_state(chunk_idx, &read, state, io.session())?; + let read = self.node.child_read(chunk_idx, &self.state, io.session())?; let chunk = if dense_scope || selected_scope { - read.read_scoped( - local, - RowScope::selected(&chunk_selection), - io, - child_state.as_ref(), - local_ctx, - ) - .await? + read.read_scoped(local, RowScope::selected(&chunk_selection), io, local_ctx) + .await? } else { let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.read_scoped(local, chunk_rows, io, child_state.as_ref(), local_ctx) - .await? + read.read_scoped(local, chunk_rows, io, local_ctx).await? 
}; if chunk.len() != chunk_selection.true_count() { vortex_bail!( @@ -670,16 +650,14 @@ impl ReadPlan for ChunkedReadPlan { &self, range: Range, rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { - let state = downcast_state::(state.as_ref())?; if range.start >= range.end { vortex_bail!("empty chunked scoped read range"); } #[cfg(debug_assertions)] { - let released = state.released.load(Ordering::Relaxed); + let released = self.state.released.load(Ordering::Relaxed); debug_assert!( range.start >= released, "chunked request planning {range:?} below the released frontier {released}" @@ -728,20 +706,12 @@ impl ReadPlan for ChunkedReadPlan { continue; } let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, cx.session())?; - let child_state = self - .node - .child_read_state(chunk_idx, &read, state, cx.session())?; + let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; let chunk_requests = if dense_scope || selected_scope { - read.segment_requests( - local, - RowScope::selected(&chunk_selection), - child_state.as_ref(), - cx, - )? + read.segment_requests(local, RowScope::selected(&chunk_selection), cx)? } else { let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.segment_requests(local, chunk_rows, child_state.as_ref(), cx)? + read.segment_requests(local, chunk_rows, cx)? }; requests.extend(chunk_requests); if requests.is_unknown() { @@ -754,12 +724,11 @@ impl ReadPlan for ChunkedReadPlan { Ok(requests) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.node - .release(frontier, downcast_state::(state.as_ref())?) 
+ fn release(&self, frontier: u64) -> VortexResult<()> { + self.node.release(frontier, &self.state) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.node.fmt_chain(f) } } @@ -768,33 +737,59 @@ impl ScanNode for ChunkedExprScanNode { type State = ChunkedExprScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - let chunked: ScanNodeRef = Arc::::clone(&self.chunked); + let _ = cx; Ok(ChunkedExprScanState { - chunked: cx.init_node(&chunked)?, - children: Mutex::new(FxHashMap::default()), + chunked: Arc::new(ChunkedScanState::default()), + reads: Mutex::new(FxHashMap::default()), #[cfg(debug_assertions)] released: AtomicU64::new(0), }) } - fn plan_read(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { - Ok(Some(Arc::new(ChunkedExprReadPlan { node: self }))) + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let key = + PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let chunked = self.chunked.scan_state(cx)?; + let state = cx.shared_state(key, || { + Ok(ChunkedExprScanState { + chunked, + reads: Mutex::new(FxHashMap::default()), + #[cfg(debug_assertions)] + released: AtomicU64::new(0), + }) + })?; + Ok(Some(Arc::new(ChunkedExprPreparedRead { + node: self, + state, + }))) } - fn plan_evidence(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { - Ok(vec![Arc::new(ChunkedEvidencePlan { node: self })]) + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + let key = + PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let chunked = self.chunked.scan_state(cx)?; + let state = cx.shared_state(key, || Ok(ChunkedEvidenceState::new(chunked)))?; + Ok(vec![Arc::new(ChunkedPreparedEvidence { + node: self, + state, + })]) } - fn plan_aggregate_partial( + fn prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], - _cx: &mut PlanCtx, - ) -> VortexResult> { + cx: &mut PrepareCtx, + 
) -> VortexResult> { if funcs.is_empty() { return Ok(None); } - Ok(Some(Arc::new(ChunkedAggregatePlan { + let chunked_state = self.chunked.scan_state(cx)?; + Ok(Some(Arc::new(ChunkedPreparedAggregate { node: ChunkedAggregateNode::Expr(Arc::clone(&self)), + chunked_state, dtype: self.dtype.clone(), funcs: funcs.to_vec(), }))) @@ -806,21 +801,14 @@ impl ScanNode for ChunkedExprScanNode { fn release(&self, frontier: u64, state: &ChunkedExprScanState) -> VortexResult<()> { state - .children + .reads .lock() .retain(|&idx, _| self.chunked.offsets[idx + 1] > frontier); - if let Some(chunked_state) = state.chunked.downcast_ref::() { - chunked_state - .node_states - .lock() - .retain(|&idx, _| self.chunked.offsets[idx + 1] > frontier); - } let idx = self.chunked.first_chunk(frontier); if idx + 1 < self.chunked.offsets.len() && self.chunked.offsets[idx] < frontier { - let child_state = state.children.lock().get(&idx).cloned(); - let child = self.reads.lock().get(&idx).cloned(); - if let (Some(child), Some(child_state)) = (child, child_state) { - child.release(frontier - self.chunked.offsets[idx], child_state.as_ref())?; + let child = state.reads.lock().get(&idx).cloned(); + if let Some(child) = child { + child.release(frontier - self.chunked.offsets[idx])?; } } #[cfg(debug_assertions)] @@ -833,33 +821,21 @@ impl ScanNode for ChunkedExprScanNode { } } -impl ReadPlan for ChunkedExprReadPlan { - type State = ScanStateRef; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - let node: ScanNodeRef = Arc::::clone(&self.node); - cx.init_node(&node) - } - +impl PreparedRead for ChunkedExprPreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local_ctx: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { - let state = match downcast_state::(state.as_ref()) { - Ok(state) => state, - Err(e) => return Box::pin(async move { Err(e) }), - }; Box::pin(async move { if range.start >= range.end { 
vortex_bail!("empty chunked scoped read range"); } #[cfg(debug_assertions)] { - let released = state.released.load(Ordering::Relaxed); + let released = self.state.released.load(Ordering::Relaxed); debug_assert!( range.start >= released, "chunked expression read {range:?} below the released frontier {released}" @@ -917,23 +893,13 @@ impl ReadPlan for ChunkedExprReadPlan { continue; } let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, io.session())?; - let child_state = - self.node - .child_read_state(chunk_idx, &read, state, io.session())?; + let read = self.node.child_read(chunk_idx, &self.state, io.session())?; let chunk = if dense_scope || selected_scope { - read.read_scoped( - local, - RowScope::selected(&chunk_selection), - io, - child_state.as_ref(), - local_ctx, - ) - .await? + read.read_scoped(local, RowScope::selected(&chunk_selection), io, local_ctx) + .await? } else { let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.read_scoped(local, chunk_rows, io, child_state.as_ref(), local_ctx) - .await? + read.read_scoped(local, chunk_rows, io, local_ctx).await? 
}; if chunk.len() != chunk_selection.true_count() { vortex_bail!( @@ -956,16 +922,14 @@ impl ReadPlan for ChunkedExprReadPlan { &self, range: Range, rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { - let state = downcast_state::(state.as_ref())?; if range.start >= range.end { vortex_bail!("empty chunked scoped read range"); } #[cfg(debug_assertions)] { - let released = state.released.load(Ordering::Relaxed); + let released = self.state.released.load(Ordering::Relaxed); debug_assert!( range.start >= released, "chunked expression request planning {range:?} below the released frontier {released}" @@ -1016,20 +980,12 @@ impl ReadPlan for ChunkedExprReadPlan { continue; } let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, cx.session())?; - let child_state = self - .node - .child_read_state(chunk_idx, &read, state, cx.session())?; + let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; let chunk_requests = if dense_scope || selected_scope { - read.segment_requests( - local, - RowScope::selected(&chunk_selection), - child_state.as_ref(), - cx, - )? + read.segment_requests(local, RowScope::selected(&chunk_selection), cx)? } else { let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.segment_requests(local, chunk_rows, child_state.as_ref(), cx)? + read.segment_requests(local, chunk_rows, cx)? 
}; requests.extend(chunk_requests); if requests.is_unknown() { @@ -1042,30 +998,20 @@ impl ReadPlan for ChunkedExprReadPlan { Ok(requests) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.node.release( - frontier, - downcast_state::(state.as_ref())?, - ) + fn release(&self, frontier: u64) -> VortexResult<()> { + self.node.release(frontier, &self.state) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.node.fmt_chain(f) } } -impl EvidencePlan for ChunkedEvidencePlan { - type State = ChunkedEvidenceState; - - fn init_state(&self, _ctx: &VortexSession) -> VortexResult { - Ok(ChunkedEvidenceState::default()) - } - +impl PreparedEvidence for ChunkedPreparedEvidence { fn evidence<'a>( &'a self, req: &'a EvidenceRequest<'a>, io: &'a FileReader, - state: &'a ChunkedEvidenceState, ) -> BoxFuture<'a, VortexResult>> { Box::pin(async move { if req.range.start >= req.range.end { @@ -1081,38 +1027,27 @@ impl EvidencePlan for ChunkedEvidencePlan { let local = req.range.start.saturating_sub(chunk_start) ..(req.range.end.min(chunk_end) - chunk_start); let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; - let child_plans = if let Some(hit) = state.children.lock().get(&idx) { + let child_plans = if let Some(hit) = self.state.children.lock().get(&idx) { hit.clone() } else if recheck { - if let Some(hit) = state.recheck_children.lock().get(&idx) { + if let Some(hit) = self.state.recheck_children.lock().get(&idx) { hit.clone() } else { let node = self.node.child(idx, io.session())?; - let mut plan_ctx = PlanCtx::new(io.session().clone()); - let plans = node.plan_evidence(&mut plan_ctx)?; + let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, io.session()); + let plans = node.prepare_evidence(&mut plan_ctx)?; let planned = plans .into_iter() .filter(|plan| plan.recheck_before_projection()) - .map(|plan| { - let plan_state = 
plan.init_state(io.session())?; - Ok((plan, plan_state)) - }) - .collect::>>()?; - let mut children = state.recheck_children.lock(); + .collect::>(); + let mut children = self.state.recheck_children.lock(); children.entry(idx).or_insert(planned).clone() } } else { let node = self.node.child(idx, io.session())?; - let mut plan_ctx = PlanCtx::new(io.session().clone()); - let plans = node.plan_evidence(&mut plan_ctx)?; - let planned = plans - .into_iter() - .map(|plan| { - let plan_state = plan.init_state(io.session())?; - Ok((plan, plan_state)) - }) - .collect::>>()?; - let mut children = state.children.lock(); + let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, io.session()); + let planned = node.prepare_evidence(&mut plan_ctx)?; + let mut children = self.state.children.lock(); children.entry(idx).or_insert(planned).clone() }; if !child_plans.is_empty() { @@ -1123,11 +1058,11 @@ impl EvidencePlan for ChunkedEvidencePlan { range: local, mode: req.mode, }; - for (plan, plan_state) in child_plans { + for plan in child_plans { if recheck && !plan.recheck_before_projection() { continue; } - for fragment in plan.evidence(&child_req, io, plan_state.as_ref()).await? { + for fragment in plan.evidence(&child_req, io).await? 
{ fragments.push(translate_fragment(fragment, chunk_start)); } } @@ -1141,7 +1076,6 @@ impl EvidencePlan for ChunkedEvidencePlan { fn segment_requests( &self, req: &EvidenceRequest<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { if req.range.start >= req.range.end { @@ -1158,38 +1092,27 @@ impl EvidencePlan for ChunkedEvidencePlan { let local = req.range.start.saturating_sub(chunk_start) ..(req.range.end.min(chunk_end) - chunk_start); let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; - let child_plans = if let Some(hit) = state.children.lock().get(&idx) { + let child_plans = if let Some(hit) = self.state.children.lock().get(&idx) { hit.clone() } else if recheck { - if let Some(hit) = state.recheck_children.lock().get(&idx) { + if let Some(hit) = self.state.recheck_children.lock().get(&idx) { hit.clone() } else { let node = self.node.child(idx, cx.session())?; - let mut plan_ctx = PlanCtx::new(cx.session().clone()); - let plans = node.plan_evidence(&mut plan_ctx)?; + let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, cx.session()); + let plans = node.prepare_evidence(&mut plan_ctx)?; let planned = plans .into_iter() .filter(|plan| plan.recheck_before_projection()) - .map(|plan| { - let plan_state = plan.init_state(cx.session())?; - Ok((plan, plan_state)) - }) - .collect::>>()?; - let mut children = state.recheck_children.lock(); + .collect::>(); + let mut children = self.state.recheck_children.lock(); children.entry(idx).or_insert(planned).clone() } } else { let node = self.node.child(idx, cx.session())?; - let mut plan_ctx = PlanCtx::new(cx.session().clone()); - let plans = node.plan_evidence(&mut plan_ctx)?; - let planned = plans - .into_iter() - .map(|plan| { - let plan_state = plan.init_state(cx.session())?; - Ok((plan, plan_state)) - }) - .collect::>>()?; - let mut children = state.children.lock(); + let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, cx.session()); + let planned = 
node.prepare_evidence(&mut plan_ctx)?; + let mut children = self.state.children.lock(); children.entry(idx).or_insert(planned).clone() }; if !child_plans.is_empty() { @@ -1200,11 +1123,11 @@ impl EvidencePlan for ChunkedEvidencePlan { range: local, mode: req.mode, }; - for (plan, plan_state) in child_plans { + for plan in child_plans { if recheck && !plan.recheck_before_projection() { continue; } - requests.extend(plan.segment_requests(&child_req, plan_state.as_ref(), cx)?); + requests.extend(plan.segment_requests(&child_req, cx)?); if requests.is_unknown() { return Ok(requests); } @@ -1219,7 +1142,7 @@ impl EvidencePlan for ChunkedEvidencePlan { true } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "chunked") } } diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 146cdefebc3..a136a7cd19f 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -50,16 +50,15 @@ use crate::layout_v2::Layout; use crate::layouts::SharedArrayFuture; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::PlanCtx; +use crate::scan::v2::node::PrepareCtx; +use crate::scan::v2::node::PreparedRead; +use crate::scan::v2::node::PreparedReadRef; +use crate::scan::v2::node::PreparedStateKey; use crate::scan::v2::node::PushCtx; -use crate::scan::v2::node::ReadPlan; -use crate::scan::v2::node::ReadPlanRef; use crate::scan::v2::node::RowScope; use crate::scan::v2::node::ScanNode; use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::ScanStateRef; use crate::scan::v2::node::StateCtx; -use crate::scan::v2::node::downcast_state; use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; @@ -87,13 +86,11 @@ pub struct DictScanNode { codes: ScanNodeRef, } -/// Per-query state: the 
cached values relation, the child states, and -/// cached value-domain expression results. +/// Per-query dictionary caches: the shared values relation and cached +/// value-domain expression results. #[derive(Clone)] pub struct DictScanState { shared: DictSharedState, - values_state: ScanStateRef, - codes_state: ScanStateRef, } #[derive(Clone)] @@ -103,19 +100,9 @@ struct DictSharedState { } impl DictScanState { - fn new(values_state: ScanStateRef, codes_state: ScanStateRef) -> Self { + fn new() -> Self { Self { shared: DictSharedState::default(), - values_state, - codes_state, - } - } - - fn with_child_states(&self, values_state: ScanStateRef, codes_state: ScanStateRef) -> Self { - Self { - shared: self.shared.clone(), - values_state, - codes_state, } } } @@ -135,16 +122,18 @@ struct DictExprScanNode { expr: Expression, } -struct DictReadPlan { +struct DictPreparedRead { node: Arc, - values_read: ReadPlanRef, - codes_read: ReadPlanRef, + state: Arc, + values_read: PreparedReadRef, + codes_read: PreparedReadRef, } -struct DictExprReadPlan { +struct DictExprPreparedRead { node: Arc, - values_read: ReadPlanRef, - codes_read: ReadPlanRef, + state: Arc, + values_read: PreparedReadRef, + codes_read: PreparedReadRef, } fn value_expr_is_expensive(expr: &Expression) -> bool { @@ -177,7 +166,7 @@ impl DictScanNode { /// The values relation wrapped in a `SharedArray`, read once per query. 
fn values( &self, - values_read: ReadPlanRef, + values_read: PreparedReadRef, io: &FileReader, state: &DictScanState, ) -> SharedArrayFuture { @@ -192,7 +181,6 @@ impl DictScanNode { let values_len = self.values_len; let io = io.clone(); - let values_state = Arc::clone(&state.values_state); let future = async move { let selection = Mask::new_true(usize::try_from(values_len).map_err(|_| { @@ -204,7 +192,6 @@ impl DictScanNode { 0..values_len, RowScope::selected(&selection), &io, - values_state.as_ref(), &mut local, ) .await @@ -229,11 +216,8 @@ impl DictScanNode { impl ScanNode for DictScanNode { type State = DictScanState; - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(DictScanState::new( - cx.init_node(&self.values)?, - cx.init_node(&self.codes)?, - )) + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(DictScanState::new()) } fn try_push_expr( @@ -255,15 +239,18 @@ impl ScanNode for DictScanNode { self.codes.split_hints() } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let key = PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let state = cx.shared_state(key, || Ok(DictScanState::new()))?; let values_read = Arc::clone(&self.values) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("dictionary values did not produce a read plan"))?; + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("dictionary values did not produce a prepared read"))?; let codes_read = Arc::clone(&self.codes) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("dictionary codes did not produce a read plan"))?; - Ok(Some(Arc::new(DictReadPlan { + .prepare_read(cx)? 
+ .ok_or_else(|| vortex_err!("dictionary codes did not produce a prepared read"))?; + Ok(Some(Arc::new(DictPreparedRead { node: self, + state, values_read, codes_read, }))) @@ -274,7 +261,8 @@ impl ScanNode for DictScanNode { /// they are read once per query by design and consulted by every /// remaining morsel. fn release(&self, frontier: u64, state: &DictScanState) -> VortexResult<()> { - self.codes.release(frontier, state.codes_state.as_ref()) + let _ = (frontier, state); + Ok(()) } fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -287,20 +275,23 @@ impl ScanNode for DictScanNode { impl ScanNode for DictExprScanNode { type State = DictScanState; - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - let node: ScanNodeRef = Arc::::clone(&self.dict); - Ok(downcast_state::(cx.init_node(&node)?.as_ref())?.clone()) + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(DictScanState::new()) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let key = + PreparedStateKey::new::(Arc::as_ptr(&self.dict) as *const () as usize); + let state = cx.shared_state(key, || Ok(DictScanState::new()))?; let values_read = Arc::clone(&self.dict.values) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("dictionary values did not produce a read plan"))?; + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("dictionary values did not produce a prepared read"))?; let codes_read = Arc::clone(&self.dict.codes) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("dictionary codes did not produce a read plan"))?; - Ok(Some(Arc::new(DictExprReadPlan { + .prepare_read(cx)? 
+ .ok_or_else(|| vortex_err!("dictionary codes did not produce a prepared read"))?; + Ok(Some(Arc::new(DictExprPreparedRead { node: self, + state, values_read, codes_read, }))) @@ -315,33 +306,19 @@ impl ScanNode for DictExprScanNode { } } -impl ReadPlan for DictReadPlan { - type State = DictScanState; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - let node: ScanNodeRef = Arc::::clone(&self.node); - let base = cx.init_node(&node)?; - Ok( - downcast_state::(base.as_ref())?.with_child_states( - self.values_read.init_state(cx)?, - self.codes_read.init_state(cx)?, - ), - ) - } - +impl PreparedRead for DictPreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { if sparse_dict_candidate(self.node.values_len, rows) { let codes = self .codes_read - .read_scoped(range.clone(), rows, io, state.codes_state.as_ref(), local) + .read_scoped(range.clone(), rows, io, local) .await?; let values_len = usize::try_from(self.node.values_len) .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?; @@ -354,7 +331,6 @@ impl ReadPlan for DictReadPlan { 0..self.node.values_len, RowScope::selected(&value_selection), io, - state.values_state.as_ref(), local, ) .await?; @@ -363,7 +339,7 @@ impl ReadPlan for DictReadPlan { let values = self .node - .values(Arc::clone(&self.values_read), io, state) + .values(Arc::clone(&self.values_read), io, &self.state) .await .map_err(VortexError::from)?; return self.node.build_dict(codes, values)?.optimize(); @@ -371,13 +347,11 @@ impl ReadPlan for DictReadPlan { let values = async { self.node - .values(Arc::clone(&self.values_read), io, state) + .values(Arc::clone(&self.values_read), io, &self.state) .await .map_err(VortexError::from) }; - let codes = - self.codes_read - .read_scoped(range, rows, io, state.codes_state.as_ref(), local); + let codes = 
self.codes_read.read_scoped(range, rows, io, local); let (values, codes) = try_join!(values, codes)?; self.node.build_dict(codes, values)?.optimize() }) @@ -387,13 +361,10 @@ impl ReadPlan for DictReadPlan { &self, range: Range, rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { if sparse_dict_candidate(self.node.values_len, rows) { - return self - .codes_read - .segment_requests(range, rows, state.codes_state.as_ref(), cx); + return self.codes_read.segment_requests(range, rows, cx); } let values_selection = Mask::new_true( @@ -403,32 +374,25 @@ impl ReadPlan for DictReadPlan { let mut requests = self.values_read.segment_requests( 0..self.node.values_len, RowScope::selected(&values_selection), - state.values_state.as_ref(), cx, )?; if requests.is_unknown() { return Ok(requests); } - requests.extend(self.codes_read.segment_requests( - range, - rows, - state.codes_state.as_ref(), - cx, - )?); + requests.extend(self.codes_read.segment_requests(range, rows, cx)?); Ok(requests) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.codes_read - .release(frontier, state.codes_state.as_ref()) + fn release(&self, frontier: u64) -> VortexResult<()> { + self.codes_read.release(frontier) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.node.fmt_chain(f) } } -impl DictExprReadPlan { +impl DictExprPreparedRead { async fn value_expr( &self, io: &FileReader, @@ -487,7 +451,6 @@ impl DictExprReadPlan { &self, codes: ArrayRef, io: &FileReader, - state: &DictScanState, local: &mut ExecutionCtx, ) -> VortexResult> { let values_len = usize::try_from(self.node.dict.values_len) @@ -504,7 +467,6 @@ impl DictExprReadPlan { 0..self.node.dict.values_len, RowScope::selected(&value_selection), io, - state.values_state.as_ref(), local, ) .await?; @@ -667,26 +629,12 @@ where }) } -impl ReadPlan for DictExprReadPlan { - type State = 
DictScanState; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - let node: ScanNodeRef = Arc::::clone(&self.node.dict); - let base = cx.init_node(&node)?; - Ok( - downcast_state::(base.as_ref())?.with_child_states( - self.values_read.init_state(cx)?, - self.codes_read.init_state(cx)?, - ), - ) - } - +impl PreparedRead for DictExprPreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { @@ -698,13 +646,13 @@ impl ReadPlan for DictExprReadPlan { usize::try_from(self.node.dict.values_len), Ok(values_len) if values_len <= rows.demand.true_count() )) { - self.value_expr(io, state, local).await? + self.value_expr(io, &self.state, local).await? } else { None }; let codes = self .codes_read - .read_scoped(range.clone(), rows, io, state.codes_state.as_ref(), local) + .read_scoped(range.clone(), rows, io, local) .await?; if let Some(value_expr) = value_expr { let all_valid = !codes.dtype().is_nullable() @@ -717,14 +665,14 @@ impl ReadPlan for DictExprReadPlan { } } if sparse_candidate - && let Some(result) = self.sparse_expr(codes.clone(), io, state, local).await? + && let Some(result) = self.sparse_expr(codes.clone(), io, local).await? 
{ return Ok(result); } let values = self .node .dict - .values(Arc::clone(&self.values_read), io, state) + .values(Arc::clone(&self.values_read), io, &self.state) .await .map_err(VortexError::from)?; let input = self.node.dict.build_dict(codes, values)?.optimize()?; @@ -736,13 +684,10 @@ impl ReadPlan for DictExprReadPlan { &self, range: Range, rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { if sparse_value_expr_candidate(&self.node.expr, self.node.dict.values_len, rows) { - return self - .codes_read - .segment_requests(range, rows, state.codes_state.as_ref(), cx); + return self.codes_read.segment_requests(range, rows, cx); } let values_selection = Mask::new_true( @@ -752,27 +697,20 @@ impl ReadPlan for DictExprReadPlan { let mut requests = self.values_read.segment_requests( 0..self.node.dict.values_len, RowScope::selected(&values_selection), - state.values_state.as_ref(), cx, )?; if requests.is_unknown() { return Ok(requests); } - requests.extend(self.codes_read.segment_requests( - range, - rows, - state.codes_state.as_ref(), - cx, - )?); + requests.extend(self.codes_read.segment_requests(range, rows, cx)?); Ok(requests) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.codes_read - .release(frontier, state.codes_state.as_ref()) + fn release(&self, frontier: u64) -> VortexResult<()> { + self.codes_read.release(frontier) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.node.fmt_chain(f) } } diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 3bcc3be627a..9c3d6732d3a 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -27,15 +27,14 @@ use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; -use 
crate::scan::v2::node::PlanCtx; -use crate::scan::v2::node::ReadPlan; -use crate::scan::v2::node::ReadPlanRef; +use crate::scan::v2::node::PrepareCtx; +use crate::scan::v2::node::PreparedRead; +use crate::scan::v2::node::PreparedReadRef; +use crate::scan::v2::node::PreparedStateKey; use crate::scan::v2::node::RowScope; use crate::scan::v2::node::ScanNode; use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::ScanStateRef; use crate::scan::v2::node::StateCtx; -use crate::scan::v2::node::downcast_state; use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; @@ -64,8 +63,9 @@ pub struct FlatScanState { array: Mutex>, } -struct FlatReadPlan { +struct FlatPreparedRead { node: Arc, + state: Arc, } impl ScanNode for FlatScanNode { @@ -75,8 +75,10 @@ impl ScanNode for FlatScanNode { Ok(FlatScanState::default()) } - fn plan_read(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { - Ok(Some(Arc::new(FlatReadPlan { node: self }))) + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let key = PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let state = cx.shared_state(key, || Ok(FlatScanState::default()))?; + Ok(Some(Arc::new(FlatPreparedRead { node: self, state }))) } /// A flat leaf releases only once *wholly* behind the frontier: a @@ -94,32 +96,20 @@ impl ScanNode for FlatScanNode { } } -impl ReadPlan for FlatReadPlan { - type State = ScanStateRef; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - let node: ScanNodeRef = Arc::::clone(&self.node); - cx.init_node(&node) - } - +impl PreparedRead for FlatPreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, _local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { - let state = match downcast_state::(state.as_ref()) { - Ok(state) => state, - Err(e) => return Box::pin(async move { Err(e) }), - }; Box::pin(async move { - 
let array = if let Some(hit) = state.array.lock().clone() { + let array = if let Some(hit) = self.state.array.lock().clone() { hit } else { let decoded = decode_flat(&self.node.layout, io).await?; - *state.array.lock() = Some(decoded.clone()); + *self.state.array.lock() = Some(decoded.clone()); decoded }; let dense = slice_to_range(array, &range)?; @@ -148,14 +138,9 @@ impl ReadPlan for FlatReadPlan { &self, _range: Range, _rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { - if downcast_state::(state.as_ref())? - .array - .lock() - .is_some() - { + if self.state.array.lock().is_some() { return Ok(SegmentRequests::none()); } @@ -170,12 +155,11 @@ impl ReadPlan for FlatReadPlan { ])) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.node - .release(frontier, downcast_state::(state.as_ref())?) + fn release(&self, frontier: u64) -> VortexResult<()> { + self.node.release(frontier, &self.state) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.node.fmt_chain(f) } } diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index 927f958f368..8e261b167d7 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -33,9 +33,9 @@ use crate::layout_v2::Struct; use crate::scan::v2::node::ApplyScanNode; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::MaskScanNode; -use crate::scan::v2::node::PlanCtx; +use crate::scan::v2::node::PrepareCtx; +use crate::scan::v2::node::PreparedReadRef; use crate::scan::v2::node::PushCtx; -use crate::scan::v2::node::ReadPlanRef; use crate::scan::v2::node::ScanNode; use crate::scan::v2::node::ScanNodeRef; use crate::scan::v2::node::StateCtx; @@ -112,7 +112,10 @@ impl ScanNode for StructScanNode { Ok(Some(Arc::new(ApplyScanNode::new(input, expr.clone())))) } - fn 
plan_read(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { + fn prepare_read( + self: Arc, + _cx: &mut PrepareCtx, + ) -> VortexResult> { Ok(None) } diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 9a4b728b78e..0a145e5c273 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -4,7 +4,7 @@ //! Scan2 vtable support for zoned (zone-map) layouts: the canonical proof producer. //! //! Reading delegates straight to the data child. Pushed predicate nodes -//! expose zone-map evidence plans: per predicate, the falsification and +//! expose zone-map prepared evidence: per predicate, the falsification and //! satisfaction rewrites ([`Expression::falsify`] / //! [`Expression::satisfy`]) are evaluated over the zone map once per //! query, and evidence walks the per-zone masks. @@ -52,22 +52,20 @@ use crate::layouts::zoned::zone_map::ZoneMap; use crate::scan::v2::evidence::EvidenceFragment; use crate::scan::v2::evidence::PredicateEvidenceKind; use crate::scan::v2::node::AggregateAnswer; -use crate::scan::v2::node::AggregatePlan; -use crate::scan::v2::node::AggregatePlanRef; -use crate::scan::v2::node::DynReadPlan; -use crate::scan::v2::node::EvidencePlan; -use crate::scan::v2::node::EvidencePlanRef; -use crate::scan::v2::node::EvidenceStateKey; use crate::scan::v2::node::ExpandCtx; use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::PlanCtx; +use crate::scan::v2::node::PrepareCtx; +use crate::scan::v2::node::PreparedAggregate; +use crate::scan::v2::node::PreparedAggregateRef; +use crate::scan::v2::node::PreparedEvidence; +use crate::scan::v2::node::PreparedEvidenceRef; +use crate::scan::v2::node::PreparedRead; +use crate::scan::v2::node::PreparedReadRef; +use crate::scan::v2::node::PreparedStateKey; use crate::scan::v2::node::PushCtx; -use crate::scan::v2::node::ReadPlan; -use crate::scan::v2::node::ReadPlanRef; use crate::scan::v2::node::RowScope; use 
crate::scan::v2::node::ScanNode; use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::ScanStateCache; use crate::scan::v2::node::ScanStateRef; use crate::scan::v2::node::StateCtx; use crate::scan::v2::node::read_dense; @@ -121,7 +119,6 @@ struct PredicateMasks { /// columns prepared for range aggregation. pub struct ZonedScanState { data: ScanStateRef, - zones: ScanStateRef, /// The decoded per-zone stats table. table: Mutex>>, zone_map: Mutex>>, @@ -130,9 +127,9 @@ pub struct ZonedScanState { } /// Planned evidence for one predicate over a zoned node. -struct ZonedEvidencePlan { - zones_read: ReadPlanRef, - zones_key: usize, +struct ZonedPreparedEvidence { + state: Arc, + zones_read: PreparedReadRef, nzones: u64, column_dtype: DType, zone_len: u64, @@ -144,16 +141,16 @@ struct ZonedEvidencePlan { } /// Planned ungrouped aggregate over a zoned node's root value. -struct ZonedAggregatePlan { +struct ZonedPreparedAggregate { node: Arc, - zones_read: ReadPlanRef, + state: Arc, + zones_read: PreparedReadRef, funcs: Vec, } -struct ZonedReadPlan { +struct ZonedPreparedRead { node: Arc, - data: ReadPlanRef, - zones: ReadPlanRef, + data: PreparedReadRef, } /// A pushed scalar expression through a zoned wrapper. 
Reads delegate to @@ -172,10 +169,9 @@ struct ZonedExprScanNode { satisfier: Option, } -struct ZonedExprReadPlan { +struct ZonedExprPreparedRead { node: Arc, - data: ReadPlanRef, - zones: ReadPlanRef, + data: PreparedReadRef, } /// The zone coverage of one aggregate request: the requested rows, the @@ -227,18 +223,38 @@ impl ZonedScanState { } impl ZonedScanNode { + fn shared_zone_state(&self, cx: &mut PrepareCtx) -> VortexResult> { + let key = + PreparedStateKey::new::(Arc::as_ptr(&self.zones) as *const () as usize); + cx.shared_state(key, || Ok(Self::empty_state())) + } + + fn empty_state_with_data(data: ScanStateRef) -> ZonedScanState { + ZonedScanState { + data, + table: Mutex::new(None), + zone_map: Mutex::new(None), + masks: Mutex::new(FxHashMap::default()), + stat_columns: Mutex::new(FxHashMap::default()), + } + } + + fn empty_state() -> ZonedScanState { + Self::empty_state_with_data(Arc::new(())) + } + /// The decoded per-zone stats table, read once per query. Concurrent /// decodes are benign (the segment fetch is shared; last-write-wins). 
async fn table( &self, - zones_read: &dyn DynReadPlan, + zones_read: &PreparedReadRef, io: &FileReader, state: &ZonedScanState, ) -> VortexResult> { if let Some(hit) = state.table.lock().clone() { return Ok(hit); } - let zones = read_dense(zones_read, 0..self.nzones, io, state.zones.as_ref()).await?; + let zones = read_dense(zones_read, 0..self.nzones, io).await?; let mut ctx = io.session().create_execution_ctx(); let table = Arc::new(zones.execute::(&mut ctx)?); *state.table.lock() = Some(Arc::clone(&table)); @@ -251,7 +267,7 @@ impl ZonedScanNode { async fn stat_column( &self, stat: Stat, - zones_read: &dyn DynReadPlan, + zones_read: &PreparedReadRef, io: &FileReader, state: &ZonedScanState, ) -> VortexResult>> { @@ -295,7 +311,7 @@ impl ZonedScanNode { &self, span: &ZoneSpan, func: &AggregateFnRef, - zones_read: &dyn DynReadPlan, + zones_read: &PreparedReadRef, io: &FileReader, state: &ZonedScanState, ctx: &mut ExecutionCtx, @@ -453,7 +469,7 @@ impl ZonedScanNode { &'a self, range: Range, funcs: &'a [AggregateFnRef], - zones_read: &'a dyn DynReadPlan, + zones_read: &'a PreparedReadRef, io: &'a FileReader, state: &'a ZonedScanState, ) -> BoxFuture<'a, VortexResult>>> { @@ -499,7 +515,7 @@ impl ZonedScanNode { } } -impl ZonedEvidencePlan { +impl ZonedPreparedEvidence { async fn table( &self, io: &FileReader, @@ -508,13 +524,7 @@ impl ZonedEvidencePlan { if let Some(hit) = state.table.lock().clone() { return Ok(hit); } - let zones = read_dense( - self.zones_read.as_ref(), - 0..self.nzones, - io, - state.zones.as_ref(), - ) - .await?; + let zones = read_dense(&self.zones_read, 0..self.nzones, io).await?; let mut ctx = io.session().create_execution_ctx(); let table = Arc::new(zones.execute::(&mut ctx)?); *state.table.lock() = Some(Arc::clone(&table)); @@ -588,32 +598,16 @@ impl ZonedEvidencePlan { } } -impl EvidencePlan for ZonedEvidencePlan { - type State = ZonedScanState; - - fn init_state(&self, ctx: &VortexSession) -> VortexResult { - let mut cache = 
ScanStateCache::default(); - let mut cx = StateCtx::new(ctx, &mut cache); - Ok(ZonedScanState { - data: Arc::new(()), - zones: self.zones_read.init_state(&mut cx)?, - table: Mutex::new(None), - zone_map: Mutex::new(None), - masks: Mutex::new(FxHashMap::default()), - stat_columns: Mutex::new(FxHashMap::default()), - }) - } - +impl PreparedEvidence for ZonedPreparedEvidence { fn evidence<'a>( &'a self, req: &'a EvidenceRequest<'a>, io: &'a FileReader, - state: &'a ZonedScanState, ) -> BoxFuture<'a, VortexResult>> { Box::pin(async move { let mut fragments = Vec::new(); if self.zone_len > 0 && (self.falsifier.is_some() || self.satisfier.is_some()) { - let masks = self.predicate_masks(io, state).await?; + let masks = self.predicate_masks(io, &self.state).await?; let zones = self.zone_range(&req.range); let mut run: Option<(Range, bool)> = None; for zone in zones { @@ -649,7 +643,6 @@ impl EvidencePlan for ZonedEvidencePlan { fn segment_requests( &self, _req: &EvidenceRequest<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { if self.zone_len == 0 || (self.falsifier.is_none() && self.satisfier.is_none()) { @@ -659,23 +652,15 @@ impl EvidencePlan for ZonedEvidencePlan { usize::try_from(self.nzones) .map_err(|_| vortex_err!("zoned stats length exceeds usize"))?, ); - self.zones_read.segment_requests( - 0..self.nzones, - RowScope::selected(&selection), - state.zones.as_ref(), - cx, - ) - } - - fn state_cache_key(&self) -> Option { - Some(EvidenceStateKey::new::(self.zones_key)) + self.zones_read + .segment_requests(0..self.nzones, RowScope::selected(&selection), cx) } fn recheck_before_projection(&self) -> bool { true } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "zoned") } } @@ -684,28 +669,14 @@ impl ScanNode for ZonedScanNode { type State = ZonedScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(ZonedScanState { - data: 
cx.init_node(&self.data)?, - zones: cx.init_node(&self.zones)?, - table: Mutex::new(None), - zone_map: Mutex::new(None), - masks: Mutex::new(FxHashMap::default()), - stat_columns: Mutex::new(FxHashMap::default()), - }) + Ok(Self::empty_state_with_data(cx.init_node(&self.data)?)) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { let data = Arc::clone(&self.data) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("zoned data child did not produce a read plan"))?; - let zones = Arc::clone(&self.zones) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("zoned stats child did not produce a read plan"))?; - Ok(Some(Arc::new(ZonedReadPlan { - node: self, - data, - zones, - }))) + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("zoned data child did not produce a prepared read"))?; + Ok(Some(Arc::new(ZonedPreparedRead { node: self, data }))) } fn try_push_expr( @@ -742,8 +713,11 @@ impl ScanNode for ZonedScanNode { }))) } - fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - let mut plans = Arc::clone(&self.data).plan_evidence(cx)?; + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + let mut plans = Arc::clone(&self.data).prepare_evidence(cx)?; let predicate = root(); let is_predicate = matches!(predicate.return_dtype(&self.column_dtype)?, DType::Bool(_)); let (falsifier, satisfier) = if self.zone_len > 0 && is_predicate { @@ -755,15 +729,15 @@ impl ScanNode for ZonedScanNode { (None, None) }; if falsifier.is_some() || satisfier.is_some() { - let zones_key = Arc::as_ptr(&self.zones) as *const () as usize; + let state = self.shared_zone_state(cx)?; let zones_read = Arc::clone(&self.zones) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("zoned stats child did not produce a read plan"))?; + .prepare_read(cx)? 
+ .ok_or_else(|| vortex_err!("zoned stats child did not produce a prepared read"))?; plans.insert( 0, - Arc::new(ZonedEvidencePlan { + Arc::new(ZonedPreparedEvidence { + state, zones_read, - zones_key, nzones: self.nzones, column_dtype: self.column_dtype.clone(), zone_len: self.zone_len, @@ -778,19 +752,21 @@ impl ScanNode for ZonedScanNode { Ok(plans) } - fn plan_aggregate_partial( + fn prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], - cx: &mut PlanCtx, - ) -> VortexResult> { + cx: &mut PrepareCtx, + ) -> VortexResult> { if funcs.is_empty() { return Ok(None); } let zones_read = Arc::clone(&self.zones) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("zoned stats child did not produce a read plan"))?; - Ok(Some(Arc::new(ZonedAggregatePlan { + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("zoned stats child did not produce a prepared read"))?; + let state = self.shared_zone_state(cx)?; + Ok(Some(Arc::new(ZonedPreparedAggregate { node: self, + state, zones_read, funcs: funcs.to_vec(), }))) @@ -814,79 +790,53 @@ impl ScanNode for ZonedScanNode { } } -impl ReadPlan for ZonedReadPlan { - type State = ZonedScanState; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(ZonedScanState { - data: self.data.init_state(cx)?, - zones: self.zones.init_state(cx)?, - table: Mutex::new(None), - zone_map: Mutex::new(None), - masks: Mutex::new(FxHashMap::default()), - stat_columns: Mutex::new(FxHashMap::default()), - }) - } - +impl PreparedRead for ZonedPreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { - self.data - .read_scoped(range, rows, io, state.data.as_ref(), local) + self.data.read_scoped(range, rows, io, local) } fn segment_requests( &self, range: Range, rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { - self.data - .segment_requests(range, rows, 
state.data.as_ref(), cx) + self.data.segment_requests(range, rows, cx) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.data.release(frontier, state.data.as_ref()) + fn release(&self, frontier: u64) -> VortexResult<()> { + self.data.release(frontier) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.node.fmt_chain(f) } } -impl AggregatePlan for ZonedAggregatePlan { - type State = ZonedScanState; +impl PreparedAggregate for ZonedPreparedAggregate { + type State = (); - fn init_state(&self, ctx: &VortexSession) -> VortexResult { - let mut cache = ScanStateCache::default(); - let mut cx = StateCtx::new(ctx, &mut cache); - Ok(ZonedScanState { - data: Arc::new(()), - zones: self.zones_read.init_state(&mut cx)?, - table: Mutex::new(None), - zone_map: Mutex::new(None), - masks: Mutex::new(FxHashMap::default()), - stat_columns: Mutex::new(FxHashMap::default()), - }) + fn init_state(&self, _ctx: &VortexSession) -> VortexResult { + Ok(()) } fn aggregate_partial<'a>( &'a self, range: Range, io: &'a FileReader, - state: &'a ZonedScanState, + _state: &'a Self::State, ) -> BoxFuture<'a, VortexResult>>> { self.node - .aggregate_partial(range, &self.funcs, self.zones_read.as_ref(), io, state) + .aggregate_partial(range, &self.funcs, &self.zones_read, io, &self.state) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "zoned") } } @@ -895,42 +845,36 @@ impl ScanNode for ZonedExprScanNode { type State = ZonedScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(ZonedScanState { - data: cx.init_node(&self.data)?, - zones: cx.init_node(&self.zones)?, - table: Mutex::new(None), - zone_map: Mutex::new(None), - masks: Mutex::new(FxHashMap::default()), - stat_columns: Mutex::new(FxHashMap::default()), - }) + 
Ok(ZonedScanNode::empty_state_with_data( + cx.init_node(&self.data)?, + )) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - let data = Arc::clone(&self.data).plan_read(cx)?.ok_or_else(|| { - vortex_err!("zoned expression data child did not produce a read plan") + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + let data = Arc::clone(&self.data).prepare_read(cx)?.ok_or_else(|| { + vortex_err!("zoned expression data child did not produce a prepared read") })?; - let zones = Arc::clone(&self.zones) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("zoned stats child did not produce a read plan"))?; - Ok(Some(Arc::new(ZonedExprReadPlan { - node: self, - data, - zones, - }))) + Ok(Some(Arc::new(ZonedExprPreparedRead { node: self, data }))) } - fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - let mut plans = Arc::clone(&self.data).plan_evidence(cx)?; + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + let mut plans = Arc::clone(&self.data).prepare_evidence(cx)?; if self.falsifier.is_some() || self.satisfier.is_some() { - let zones_key = Arc::as_ptr(&self.zones) as *const () as usize; + let key = PreparedStateKey::new::( + Arc::as_ptr(&self.zones) as *const () as usize, + ); + let state = cx.shared_state(key, || Ok(ZonedScanNode::empty_state()))?; let zones_read = Arc::clone(&self.zones) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("zoned stats child did not produce a read plan"))?; + .prepare_read(cx)? 
+ .ok_or_else(|| vortex_err!("zoned stats child did not produce a prepared read"))?; plans.insert( 0, - Arc::new(ZonedEvidencePlan { + Arc::new(ZonedPreparedEvidence { + state, zones_read, - zones_key, nzones: self.nzones, column_dtype: self.column_dtype.clone(), zone_len: self.zone_len, @@ -954,48 +898,31 @@ impl ScanNode for ZonedExprScanNode { } } -impl ReadPlan for ZonedExprReadPlan { - type State = ZonedScanState; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(ZonedScanState { - data: self.data.init_state(cx)?, - zones: self.zones.init_state(cx)?, - table: Mutex::new(None), - zone_map: Mutex::new(None), - masks: Mutex::new(FxHashMap::default()), - stat_columns: Mutex::new(FxHashMap::default()), - }) - } - +impl PreparedRead for ZonedExprPreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { - self.data - .read_scoped(range, rows, io, state.data.as_ref(), local) + self.data.read_scoped(range, rows, io, local) } fn segment_requests( &self, range: Range, rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { - self.data - .segment_requests(range, rows, state.data.as_ref(), cx) + self.data.segment_requests(range, rows, cx) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.data.release(frontier, state.data.as_ref()) + fn release(&self, frontier: u64) -> VortexResult<()> { + self.data.release(frontier) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.node.fmt_chain(f) } } diff --git a/vortex-layout/src/scan/v2/node.rs b/vortex-layout/src/scan/v2/node.rs index f0564ef6a5b..38905ada810 100644 --- a/vortex-layout/src/scan/v2/node.rs +++ b/vortex-layout/src/scan/v2/node.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: 
Copyright the Vortex contributors -//! The scan2 tree: per-layout nodes with value, proof, and mask +//! The scan2 tree: immutable per-layout nodes with value, proof, and mask //! capabilities (plan 017). //! //! Like the v1 scan, a file's layout tree expands into one node per @@ -10,11 +10,11 @@ //! [`DynScanNode`] adapter. Three things are new: //! //! - expansion is *negotiation*: layout scan vtables see the scoped scan request before -//! expression pushdown plans reads and evidence (see [`super::request`]); +//! expression pushdown prepares reads and evidence (see [`super::request`]); //! - expression pushdown returns another scan node whose root value is -//! the pushed expression, so reads and evidence are planned from +//! the pushed expression, so reads and evidence are prepared from //! `root()` of that node instead of reparsing expressions; and -//! - executable value read plans use one scoped primitive: selection +//! - executable prepared reads use one scoped primitive: selection //! controls output cardinality, and demand controls which selected rows //! must contain meaningful values. @@ -25,6 +25,7 @@ use std::sync::Arc; use std::sync::OnceLock; use futures::future::BoxFuture; +use parking_lot::Mutex; use rustc_hash::FxHashMap; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; @@ -49,6 +50,7 @@ use crate::layout_v2::LayoutRef; use crate::scan::v2::evidence::EvidenceFragment; use crate::scan::v2::request::EvidenceRequest; use crate::scan::v2::request::NodeRequest; +use crate::scan::v2::request::OwnedEvidenceRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; use crate::segments::SegmentSource; @@ -86,42 +88,25 @@ pub type ScanStateRef = Arc; /// A reference-counted, type-erased scan2 node. pub type ScanNodeRef = Arc; -/// A reference-counted, type-erased evidence plan. -pub type EvidencePlanRef = Arc; +/// A reference-counted, type-erased prepared evidence handle. 
+pub type PreparedEvidenceRef = Arc; -/// A reference-counted, type-erased read plan. -pub type ReadPlanRef = Arc; +/// A reference-counted, type-erased prepared read handle. +pub type PreparedReadRef = Arc; -/// A reference-counted, type-erased split plan. -pub type SplitPlanRef = Arc; +/// A reference-counted, type-erased prepared split handle. +pub type PreparedSplitRef = Arc; -/// A reference-counted, type-erased ungrouped aggregate plan. -pub type AggregatePlanRef = Arc; +/// A reference-counted, type-erased prepared ungrouped aggregate handle. +pub type PreparedAggregateRef = Arc; -/// A reference-counted, type-erased metadata statistics plan. -pub type StatsPlanRef = Arc; +/// A reference-counted, type-erased prepared metadata statistics handle. +pub type PreparedStatsRef = Arc; /// Per-file/query cache of scan-node global state while a file's planned /// reads are initialized. pub type ScanStateCache = FxHashMap; -/// Key for evidence plans whose per-query state can be shared by several -/// planned predicates in the same file. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub struct EvidenceStateKey { - type_id: TypeId, - key: usize, -} - -impl EvidenceStateKey { - pub fn new(key: usize) -> Self { - Self { - type_id: TypeId::of::(), - key, - } - } -} - /// Context for expression pushdown. pub struct PushCtx { session: VortexSession, @@ -137,24 +122,86 @@ impl PushCtx { } } -/// Context for turning pushed expressions into executable read/evidence plans. -pub struct PlanCtx { +/// Context for turning pushed expressions into prepared read/evidence handles. +pub struct PrepareCtx { session: VortexSession, + state_cache: PreparedStateCacheRef, } -impl PlanCtx { +impl PrepareCtx { + /// Create a preparation context with an empty prepared-state cache. 
pub fn new(session: VortexSession) -> Self { - Self { session } + Self::with_state_cache(session, Arc::new(PreparedStateCache::default())) + } + + /// Create a preparation context backed by an existing prepared-state cache. + pub fn with_state_cache(session: VortexSession, state_cache: PreparedStateCacheRef) -> Self { + Self { + session, + state_cache, + } } pub fn session(&self) -> &VortexSession { &self.session } + + /// The prepared-state cache backing this context. + pub fn state_cache(&self) -> PreparedStateCacheRef { + Arc::clone(&self.state_cache) + } + + pub fn shared_state( + &mut self, + key: PreparedStateKey, + init: impl FnOnce() -> VortexResult, + ) -> VortexResult> + where + T: Send + Sync + 'static, + { + if let Some(hit) = self.state_cache.shared_states.lock().get(&key) { + return Arc::downcast::(Arc::clone(hit)) + .map_err(|_| vortex_err!("prepared shared state type mismatch")); + } + + let state = Arc::new(init()?); + let mut shared_states = self.state_cache.shared_states.lock(); + if let Some(hit) = shared_states.get(&key) { + return Arc::downcast::(Arc::clone(hit)) + .map_err(|_| vortex_err!("prepared shared state type mismatch")); + } + shared_states.insert(key, Arc::::clone(&state)); + Ok(state) + } +} + +/// Shared cache for scan/file-level prepared state. +#[derive(Default)] +pub struct PreparedStateCache { + shared_states: Mutex>, +} + +/// Reference-counted prepared-state cache. +pub type PreparedStateCacheRef = Arc; + +/// A typed key for prepared-file shared state. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct PreparedStateKey { + type_id: TypeId, + key: usize, +} + +impl PreparedStateKey { + pub fn new(key: usize) -> Self { + Self { + type_id: TypeId::of::(), + key, + } + } } -/// Context for initializing node global state. All read plans for one file -/// share this context, so the same node instance gets one state object even -/// when several pushed expressions reference it. 
+/// Context for initializing type-erased scan-node state used by the remaining +/// node-level release and non-read prepared paths. pub struct StateCtx<'a> { session: &'a VortexSession, node_cache: &'a mut ScanStateCache, @@ -224,7 +271,35 @@ impl<'a> RowScope<'a> { } } -/// One aggregate plan's mixed-coverage answer. +/// Owned row scope for a morsel-level read task. +#[derive(Clone, Debug)] +pub struct OwnedRowScope { + selection: Mask, + demand: Mask, +} + +impl OwnedRowScope { + pub fn selected(selection: Mask) -> Self { + Self { + demand: selection.clone(), + selection, + } + } + + pub fn try_new(selection: Mask, demand: Mask) -> VortexResult { + RowScope::try_new(&selection, &demand)?; + Ok(Self { selection, demand }) + } + + pub fn as_scope(&self) -> RowScope<'_> { + RowScope { + selection: &self.selection, + demand: &self.demand, + } + } +} + +/// One prepared aggregate's mixed-coverage answer. /// /// The covered rows are the requested range minus `residual`; `partial` /// accounts for exactly those rows, each once. An all-null span counts @@ -244,11 +319,15 @@ pub struct AggregateAnswer { pub residual: Vec>, } -/// A node in the expanded scan2 tree. Nodes are shared across queries; -/// all per-file/query caching lives in the node's `State`. +/// A node in the expanded scan2 tree. +/// +/// A `ScanNode` is immutable physical scan structure: layout metadata, child node +/// references, pushdown behavior, and split hints. Runtime caches live in state +/// objects created while preparing reads, evidence, statistics, and aggregates for +/// a file scan. pub trait ScanNode: 'static + Send + Sync { - /// Per-file/query global state: decoded arrays, decoded index state, - /// child node states, and other frontier-released caches. + /// Per-file/query node state: decoded arrays, decoded index state, child node states, and + /// other frontier-released caches shared by prepared handles for this node. 
type State: Send + Sync + 'static; /// Create this node's per-file/query state. @@ -276,61 +355,67 @@ pub trait ScanNode: 'static + Send + Sync { } } - /// Plan value reads for this node's root value. - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult>; + /// Prepare value reads for this node's root value. + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult>; - /// Plan natural row splits for this node's root value. + /// Prepare natural row splits for this node's root value. /// - /// The default converts this node's cheap split hints into an executable plan. Nodes can + /// The default converts this node's cheap split hints into an executable handle. Nodes can /// override this when split discovery needs request-specific state, I/O, or cost estimates. - fn plan_splits(self: Arc, _cx: &mut PlanCtx) -> VortexResult> + fn prepare_splits( + self: Arc, + _cx: &mut PrepareCtx, + ) -> VortexResult> where Self: Sized, { Ok(self .split_hints() - .map(|hints| Arc::new(HintSplitPlan::new(hints.to_vec())) as SplitPlanRef)) + .map(|hints| Arc::new(HintPreparedSplit::new(hints.to_vec())) as PreparedSplitRef)) } - /// Plan predicate evidence for this node's root boolean value. + /// Prepare predicate evidence for this node's root boolean value. /// - /// Planning performs no IO and returns a direct executable handle. The + /// Preparation performs no IO and returns a direct executable handle. The /// handle may precompute expression rewrites or accepted predicate /// fragments, but runtime state remains in [`Self::State`]. - fn plan_evidence(self: Arc, _cx: &mut PlanCtx) -> VortexResult> + fn prepare_evidence( + self: Arc, + _cx: &mut PrepareCtx, + ) -> VortexResult> where Self: Sized, { Ok(Vec::new()) } - /// Plan ungrouped aggregates over this node's root value. + /// Prepare ungrouped aggregates over this node's root value. 
/// - /// The returned plan answers all `funcs` together over a runtime row + /// The returned handle answers all `funcs` together over a runtime row /// range, producing one [`AggregateAnswer`] per function. `None` means /// this node cannot answer these aggregates from layout metadata and /// the caller should read rows normally. - fn plan_aggregate_partial( + fn prepare_aggregate_partial( self: Arc, _funcs: &[AggregateFnRef], - _cx: &mut PlanCtx, - ) -> VortexResult> + _cx: &mut PrepareCtx, + ) -> VortexResult> where Self: Sized, { Ok(None) } - /// Plan metadata statistics for this node's root value. + /// Prepare metadata statistics for this node's root value. /// - /// The returned plan answers the requested aggregate functions positionally over runtime row + /// The returned handle answers the requested aggregate functions positionally over runtime row /// ranges using metadata only. `None` means this node cannot answer these functions from /// metadata. - fn plan_stats( + fn prepare_stats( self: Arc, _funcs: &[AggregateFnRef], - _cx: &mut PlanCtx, - ) -> VortexResult> + _cx: &mut PrepareCtx, + ) -> VortexResult> where Self: Sized, { @@ -354,19 +439,18 @@ pub trait ScanNode: 'static + Send + Sync { fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; } -/// Read every row in `range` through a read plan. +/// Read every row in `range` through a prepared read. 
pub(crate) fn read_dense<'a>( - read: &'a dyn DynReadPlan, + read: &'a PreparedReadRef, range: Range, io: &'a FileReader, - state: &'a ScanState, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { let len = range_len(&range)?; - let selection = Mask::new_true(len); + let rows = OwnedRowScope::selected(Mask::new_true(len)); let mut local = io.session().create_execution_ctx(); - read.read_scoped(range, RowScope::selected(&selection), io, state, &mut local) - .await + let task = Arc::clone(read).begin_read(range, rows)?; + task.read(io, &mut local).await }) } @@ -391,28 +475,34 @@ pub trait DynScanNode: Send + Sync { cx: &mut PushCtx, ) -> VortexResult>; - /// Plan value reads for this node's root value. - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult>; + /// Prepare value reads for this node's root value. + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult>; - /// Plan natural row splits for this node's root value. - fn plan_splits(self: Arc, cx: &mut PlanCtx) -> VortexResult>; + /// Prepare natural row splits for this node's root value. + fn prepare_splits( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult>; - /// Plan predicate evidence for this node's root boolean value. - fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult>; + /// Prepare predicate evidence for this node's root boolean value. + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult>; - /// Plan ungrouped aggregates for this node's root value. - fn plan_aggregate_partial( + /// Prepare ungrouped aggregates for this node's root value. + fn prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], - cx: &mut PlanCtx, - ) -> VortexResult>; + cx: &mut PrepareCtx, + ) -> VortexResult>; - /// Plan metadata statistics for this node's root value. - fn plan_stats( + /// Prepare metadata statistics for this node's root value. 
+ fn prepare_stats( self: Arc, funcs: &[AggregateFnRef], - cx: &mut PlanCtx, - ) -> VortexResult>; + cx: &mut PrepareCtx, + ) -> VortexResult>; /// Preferred morsel boundaries (see [`ScanNode::split_hints`]). fn split_hints(&self) -> Option<&[u64]>; @@ -437,32 +527,38 @@ impl DynScanNode for T { ScanNode::try_push_expr(self, expr, cx) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - ScanNode::plan_read(self, cx) + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + ScanNode::prepare_read(self, cx) } - fn plan_splits(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - ScanNode::plan_splits(self, cx) + fn prepare_splits( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + ScanNode::prepare_splits(self, cx) } - fn plan_evidence(self: Arc, cx: &mut PlanCtx) -> VortexResult> { - ScanNode::plan_evidence(self, cx) + fn prepare_evidence( + self: Arc, + cx: &mut PrepareCtx, + ) -> VortexResult> { + ScanNode::prepare_evidence(self, cx) } - fn plan_aggregate_partial( + fn prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], - cx: &mut PlanCtx, - ) -> VortexResult> { - ScanNode::plan_aggregate_partial(self, funcs, cx) + cx: &mut PrepareCtx, + ) -> VortexResult> { + ScanNode::prepare_aggregate_partial(self, funcs, cx) } - fn plan_stats( + fn prepare_stats( self: Arc, funcs: &[AggregateFnRef], - cx: &mut PlanCtx, - ) -> VortexResult> { - ScanNode::plan_stats(self, funcs, cx) + cx: &mut PrepareCtx, + ) -> VortexResult> { + ScanNode::prepare_stats(self, funcs, cx) } fn split_hints(&self) -> Option<&[u64]> { @@ -478,14 +574,13 @@ impl DynScanNode for T { } } -/// Executable value read plan for one pushed expression. -pub trait ReadPlan: 'static + Send + Sync { - /// The per-query state this read plan executes against. - type State: Send + Sync + 'static; - - /// Create this read plan's per-file/query global state. - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult; - +/// Prepared value read for one pushed expression. 
+/// +/// A `PreparedRead` is the scan-level runtime handle for a fixed read route. It +/// may hold child prepared reads and initializes route-scoped state once per +/// prepared file scan; each `read_scoped` call executes that route for one +/// morsel row scope. +pub trait PreparedRead: 'static + Send + Sync { /// Read the live rows of `range`, with [`RowScope`] defining output /// cardinality (`selection`) and meaningful-value demand (`demand`). fn read_scoped<'a>( @@ -493,7 +588,6 @@ pub trait ReadPlan: 'static + Send + Sync { range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult>; @@ -502,101 +596,81 @@ pub trait ReadPlan: 'static + Send + Sync { &self, _range: Range, _rows: RowScope<'_>, - _state: &Self::State, _cx: &mut SegmentPlanCtx, ) -> VortexResult { Ok(SegmentRequests::unknown()) } /// Release state behind the completed-row frontier. - fn release(&self, _frontier: u64, _state: &Self::State) -> VortexResult<()> { + fn release(&self, _frontier: u64) -> VortexResult<()> { Ok(()) } /// Compact description for plan display. - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "read") } } -/// Object-safe view of a [`ReadPlan`]. -pub trait DynReadPlan: Send + Sync { - /// Create this read plan's per-file/query state, type-erased. - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult; - - /// Read rows in a selection/demand scope. - fn read_scoped<'a>( - &'a self, +impl dyn PreparedRead { + /// Create a morsel-level read task for this prepared read. + pub fn begin_read( + self: Arc, range: Range, - rows: RowScope<'a>, + rows: OwnedRowScope, + ) -> VortexResult> { + Ok(Box::new(DefaultReadTask { + read: self, + range, + rows, + })) + } +} + +/// A morsel-level read task. 
+pub trait ReadTask: Send { + /// Return scheduler-visible segment requests needed for this task, when known exactly. + fn segment_requests(&self, cx: &mut SegmentPlanCtx) -> VortexResult; + + /// Execute the read task. + fn read<'a>( + self: Box, io: &'a FileReader, - state: &'a ScanState, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult>; +} - /// Return scheduler-visible segment requests needed for this read, when known exactly. - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - state: &ScanState, - cx: &mut SegmentPlanCtx, - ) -> VortexResult; - - /// Release state behind the completed-row frontier. - fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()>; - - /// Compact description for plan display. - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; +struct DefaultReadTask { + read: PreparedReadRef, + range: Range, + rows: OwnedRowScope, } -impl DynReadPlan for T { - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(Arc::new(ReadPlan::init_state(self, cx)?)) +impl ReadTask for DefaultReadTask { + fn segment_requests(&self, cx: &mut SegmentPlanCtx) -> VortexResult { + self.read + .segment_requests(self.range.clone(), self.rows.as_scope(), cx) } - fn read_scoped<'a>( - &'a self, - range: Range, - rows: RowScope<'a>, + fn read<'a>( + self: Box, io: &'a FileReader, - state: &'a ScanState, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { - let state = match downcast_erased_state::(state) { - Ok(state) => state, - Err(e) => return Box::pin(async move { Err(e) }), - }; - ReadPlan::read_scoped(self, range, rows, io, state, local) - } - - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - state: &ScanState, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let state = downcast_erased_state::(state)?; - ReadPlan::segment_requests(self, range, rows, state, cx) - } - - fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { - let state = 
downcast_erased_state::(state)?; - ReadPlan::release(self, frontier, state) - } - - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ReadPlan::fmt_plan(self, f) + Box::pin(async move { + self.read + .read_scoped(self.range, self.rows.as_scope(), io, local) + .await + }) } } -/// Executable split plan for one pushed expression. -pub trait SplitPlan: 'static + Send + Sync { - /// The per-query state this split plan executes against. +/// Prepared split discovery for one pushed expression. +pub trait PreparedSplit: 'static + Send + Sync { + /// The per-query state this prepared split executes against. type State: Send + Sync + 'static; - /// Create this split plan's per-query state. + /// Create this prepared split's per-query state. fn init_state(&self, ctx: &VortexSession) -> VortexResult; /// Return natural row ranges inside `range`. @@ -608,17 +682,17 @@ pub trait SplitPlan: 'static + Send + Sync { ) -> BoxFuture<'a, VortexResult>>>; /// Compact description for plan display. - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "splits") } } -/// Object-safe view of a [`SplitPlan`]. -pub trait DynSplitPlan: Send + Sync { - /// Create this split plan's per-query state, type-erased. +/// Object-safe view of a [`PreparedSplit`]. +pub trait DynPreparedSplit: Send + Sync { + /// Create this prepared split's per-query state, type-erased. fn init_state(&self, ctx: &VortexSession) -> VortexResult; - /// Execute the planned split query. + /// Execute the prepared split query. fn splits<'a>( &'a self, range: Range, @@ -627,12 +701,12 @@ pub trait DynSplitPlan: Send + Sync { ) -> BoxFuture<'a, VortexResult>>>; /// Compact description for plan display. 
- fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; } -impl DynSplitPlan for T { +impl DynPreparedSplit for T { fn init_state(&self, ctx: &VortexSession) -> VortexResult { - Ok(Arc::new(SplitPlan::init_state(self, ctx)?)) + Ok(Arc::new(PreparedSplit::init_state(self, ctx)?)) } fn splits<'a>( @@ -645,25 +719,25 @@ impl DynSplitPlan for T { Ok(state) => state, Err(e) => return Box::pin(async move { Err(e) }), }; - SplitPlan::splits(self, range, io, state) + PreparedSplit::splits(self, range, io, state) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - SplitPlan::fmt_plan(self, f) + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + PreparedSplit::fmt_prepared(self, f) } } -struct HintSplitPlan { +struct HintPreparedSplit { hints: Vec, } -impl HintSplitPlan { +impl HintPreparedSplit { fn new(hints: Vec) -> Self { Self { hints } } } -impl SplitPlan for HintSplitPlan { +impl PreparedSplit for HintPreparedSplit { type State = (); fn init_state(&self, _ctx: &VortexSession) -> VortexResult { @@ -696,23 +770,23 @@ impl SplitPlan for HintSplitPlan { }) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "hint_splits") } } -/// Executable ungrouped aggregate plan for one pushed expression. -pub trait AggregatePlan: 'static + Send + Sync { - /// The per-query state this aggregate plan executes against. +/// Prepared ungrouped aggregate for one pushed expression. +pub trait PreparedAggregate: 'static + Send + Sync { + /// The per-query state this prepared aggregate executes against. type State: Send + Sync + 'static; - /// Create this aggregate plan's per-query state. + /// Create this prepared aggregate's per-query state. fn init_state(&self, ctx: &VortexSession) -> VortexResult; /// Answer ungrouped aggregates over every row of `range`. 
/// - /// Returns one [`AggregateAnswer`] per planned function. `None` means - /// this plan cannot answer any function for this range and the caller + /// Returns one [`AggregateAnswer`] per prepared function. `None` means + /// this prepared aggregate cannot answer any function for this range and the caller /// should read and accumulate the range normally. fn aggregate_partial<'a>( &'a self, @@ -722,17 +796,17 @@ pub trait AggregatePlan: 'static + Send + Sync { ) -> BoxFuture<'a, VortexResult>>>; /// Compact description for plan display. - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "aggregate") } } -/// Object-safe view of an [`AggregatePlan`]. -pub trait DynAggregatePlan: Send + Sync { - /// Create this aggregate plan's per-query state, type-erased. +/// Object-safe view of a [`PreparedAggregate`]. +pub trait DynPreparedAggregate: Send + Sync { + /// Create this prepared aggregate's per-query state, type-erased. fn init_state(&self, ctx: &VortexSession) -> VortexResult; - /// Execute the planned aggregates. + /// Execute the prepared aggregates. fn aggregate_partial<'a>( &'a self, range: Range, @@ -741,12 +815,12 @@ pub trait DynAggregatePlan: Send + Sync { ) -> BoxFuture<'a, VortexResult>>>; /// Compact description for plan display. 
- fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; } -impl DynAggregatePlan for T { +impl DynPreparedAggregate for T { fn init_state(&self, ctx: &VortexSession) -> VortexResult { - Ok(Arc::new(AggregatePlan::init_state(self, ctx)?)) + Ok(Arc::new(PreparedAggregate::init_state(self, ctx)?)) } fn aggregate_partial<'a>( @@ -759,26 +833,26 @@ impl DynAggregatePlan for T { Ok(state) => state, Err(e) => return Box::pin(async move { Err(e) }), }; - AggregatePlan::aggregate_partial(self, range, io, state) + PreparedAggregate::aggregate_partial(self, range, io, state) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - AggregatePlan::fmt_plan(self, f) + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + PreparedAggregate::fmt_prepared(self, f) } } -/// Executable metadata statistics plan for one pushed expression. -pub trait StatsPlan: 'static + Send + Sync { - /// The per-query state this statistics plan executes against. +/// Prepared metadata statistics for one pushed expression. +pub trait PreparedStats: 'static + Send + Sync { + /// The per-query state this prepared statistics handle executes against. type State: Send + Sync + 'static; - /// Create this statistics plan's per-query state. + /// Create this prepared statistics handle's per-query state. fn init_state(&self, ctx: &VortexSession) -> VortexResult; /// Answer aggregate-function statistics over every row of `range`. /// /// The returned vector is positional against the functions passed to - /// [`ScanNode::plan_stats`]. Each element is exact, inexact, or absent for the requested + /// [`ScanNode::prepare_stats`]. Each element is exact, inexact, or absent for the requested /// aggregate function over `range`. Implementations must not read row values merely to improve /// an estimate. 
fn stats<'a>( @@ -789,17 +863,17 @@ pub trait StatsPlan: 'static + Send + Sync { ) -> BoxFuture<'a, VortexResult>>>; /// Compact description for plan display. - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "stats") } } -/// Object-safe view of a [`StatsPlan`]. -pub trait DynStatsPlan: Send + Sync { - /// Create this statistics plan's per-query state, type-erased. +/// Object-safe view of a [`PreparedStats`]. +pub trait DynPreparedStats: Send + Sync { + /// Create this prepared statistics handle's per-query state, type-erased. fn init_state(&self, ctx: &VortexSession) -> VortexResult; - /// Execute the planned statistics query. + /// Execute the prepared statistics query. fn stats<'a>( &'a self, range: Range, @@ -808,12 +882,12 @@ pub trait DynStatsPlan: Send + Sync { ) -> BoxFuture<'a, VortexResult>>>; /// Compact description for plan display. - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; } -impl DynStatsPlan for T { +impl DynPreparedStats for T { fn init_state(&self, ctx: &VortexSession) -> VortexResult { - Ok(Arc::new(StatsPlan::init_state(self, ctx)?)) + Ok(Arc::new(PreparedStats::init_state(self, ctx)?)) } fn stats<'a>( @@ -826,11 +900,11 @@ impl DynStatsPlan for T { Ok(state) => state, Err(e) => return Box::pin(async move { Err(e) }), }; - StatsPlan::stats(self, range, io, state) + PreparedStats::stats(self, range, io, state) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - StatsPlan::fmt_plan(self, f) + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + PreparedStats::fmt_prepared(self, f) } } @@ -878,10 +952,10 @@ pub struct StructValueState { validity: Option, } -struct StructValueReadPlan { +struct StructValuePreparedRead { node: Arc, - fields: Vec, - validity: Option, + fields: Vec, + validity: Option, } impl ScanNode for 
StructValueScanNode { @@ -901,14 +975,14 @@ impl ScanNode for StructValueScanNode { Ok(StructValueState { fields, validity }) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { let fields = self .fields .iter() .map(|field| { Arc::clone(field) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("struct field did not produce a read plan")) + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("struct field did not produce a prepared read")) }) .collect::>>()?; let validity = self @@ -916,11 +990,11 @@ impl ScanNode for StructValueScanNode { .as_ref() .map(|validity| { Arc::clone(validity) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("struct validity did not produce a read plan")) + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("struct validity did not produce a prepared read")) }) .transpose()?; - Ok(Some(Arc::new(StructValueReadPlan { + Ok(Some(Arc::new(StructValuePreparedRead { node: self, fields, validity, @@ -948,56 +1022,25 @@ impl ScanNode for StructValueScanNode { } } -impl ReadPlan for StructValueReadPlan { - type State = StructValueState; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - let fields = self - .fields - .iter() - .map(|field| field.init_state(cx)) - .collect::>>()?; - let validity = self - .validity - .as_ref() - .map(|validity| validity.init_state(cx)) - .transpose()?; - Ok(StructValueState { fields, validity }) - } - +impl PreparedRead for StructValuePreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { - if self.node.fields.len() != state.fields.len() { - vortex_bail!( - "struct value state length {} does not match field count {}", - state.fields.len(), - self.node.fields.len() - ); - } let mut arrays = Vec::with_capacity(self.fields.len()); - for (field, state) in 
self.fields.iter().zip(&state.fields) { - arrays.push( - field - .read_scoped(range.clone(), rows, io, state.as_ref(), local) - .await?, - ); + for field in &self.fields { + arrays.push(field.read_scoped(range.clone(), rows, io, local).await?); } - let validity = match (&self.validity, &state.validity) { - (Some(validity), Some(state)) => { - let array = validity - .read_scoped(range, rows, io, state.as_ref(), local) - .await?; + let validity = match &self.validity { + Some(validity) => { + let array = validity.read_scoped(range, rows, io, local).await?; Validity::Array(array) } - (None, None) => Validity::NonNullable, - _ => vortex_bail!("struct value validity plan/state mismatch"), + None => Validity::NonNullable, }; Ok(StructArray::try_new( self.node.names.clone(), @@ -1013,45 +1056,32 @@ impl ReadPlan for StructValueReadPlan { &self, range: Range, rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { - if self.node.fields.len() != state.fields.len() { - vortex_bail!( - "struct value state length {} does not match field count {}", - state.fields.len(), - self.node.fields.len() - ); - } - let mut requests = SegmentRequests::none(); - for (field, state) in self.fields.iter().zip(&state.fields) { - requests.extend(field.segment_requests(range.clone(), rows, state.as_ref(), cx)?); + for field in &self.fields { + requests.extend(field.segment_requests(range.clone(), rows, cx)?); if requests.is_unknown() { return Ok(requests); } } - match (&self.validity, &state.validity) { - (Some(validity), Some(state)) => { - requests.extend(validity.segment_requests(range, rows, state.as_ref(), cx)?); - } - (None, None) => {} - _ => vortex_bail!("struct value validity plan/state mismatch"), + if let Some(validity) = &self.validity { + requests.extend(validity.segment_requests(range, rows, cx)?); } Ok(requests) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - for (field, state) in 
self.fields.iter().zip(&state.fields) { - field.release(frontier, state.as_ref())?; + fn release(&self, frontier: u64) -> VortexResult<()> { + for field in &self.fields { + field.release(frontier)?; } - if let (Some(validity), Some(state)) = (&self.validity, &state.validity) { - validity.release(frontier, state.as_ref())?; + if let Some(validity) = &self.validity { + validity.release(frontier)?; } Ok(()) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { ScanNode::fmt_chain(self.node.as_ref(), f) } } @@ -1069,9 +1099,9 @@ impl ApplyScanNode { } } -struct ApplyReadPlan { +struct ApplyPreparedRead { node: Arc, - input: ReadPlanRef, + input: PreparedReadRef, } impl ScanNode for ApplyScanNode { @@ -1081,11 +1111,11 @@ impl ScanNode for ApplyScanNode { cx.init_node(&self.input) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { let input = Arc::clone(&self.input) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("apply input did not produce a read plan"))?; - Ok(Some(Arc::new(ApplyReadPlan { node: self, input }))) + .prepare_read(cx)? 
+ .ok_or_else(|| vortex_err!("apply input did not produce a prepared read"))?; + Ok(Some(Arc::new(ApplyPreparedRead { node: self, input }))) } fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { @@ -1101,26 +1131,16 @@ impl ScanNode for ApplyScanNode { } } -impl ReadPlan for ApplyReadPlan { - type State = ScanStateRef; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - self.input.init_state(cx) - } - +impl PreparedRead for ApplyPreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { - let input = self - .input - .read_scoped(range, rows, io, state.as_ref(), local) - .await?; + let input = self.input.read_scoped(range, rows, io, local).await?; input.apply(&self.node.expr)?.execute::(local) }) } @@ -1129,17 +1149,16 @@ impl ReadPlan for ApplyReadPlan { &self, range: Range, rows: RowScope<'_>, - state: &Self::State, cx: &mut SegmentPlanCtx, ) -> VortexResult { - self.input.segment_requests(range, rows, state.as_ref(), cx) + self.input.segment_requests(range, rows, cx) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.input.release(frontier, state.as_ref()) + fn release(&self, frontier: u64) -> VortexResult<()> { + self.input.release(frontier) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { ScanNode::fmt_chain(self.node.as_ref(), f) } } @@ -1172,10 +1191,10 @@ pub struct MaskState { validity: ScanStateRef, } -struct MaskReadPlan { +struct MaskPreparedRead { node: Arc, - input: ReadPlanRef, - validity: ReadPlanRef, + input: PreparedReadRef, + validity: PreparedReadRef, } impl ScanNode for MaskScanNode { @@ -1188,14 +1207,14 @@ impl ScanNode for MaskScanNode { }) } - fn plan_read(self: Arc, cx: &mut PlanCtx) -> VortexResult> { + fn prepare_read(self: 
Arc, cx: &mut PrepareCtx) -> VortexResult> { let input = Arc::clone(&self.input) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("mask input did not produce a read plan"))?; + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("mask input did not produce a prepared read"))?; let validity = Arc::clone(&self.validity) - .plan_read(cx)? - .ok_or_else(|| vortex_err!("mask validity did not produce a read plan"))?; - Ok(Some(Arc::new(MaskReadPlan { + .prepare_read(cx)? + .ok_or_else(|| vortex_err!("mask validity did not produce a prepared read"))?; + Ok(Some(Arc::new(MaskPreparedRead { node: self, input, validity, @@ -1217,80 +1236,53 @@ impl ScanNode for MaskScanNode { } } -impl ReadPlan for MaskReadPlan { - type State = MaskState; - - fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(MaskState { - input: self.input.init_state(cx)?, - validity: self.validity.init_state(cx)?, - }) - } - +impl PreparedRead for MaskPreparedRead { fn read_scoped<'a>( &'a self, range: Range, rows: RowScope<'a>, io: &'a FileReader, - state: &'a Self::State, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { let input = self .input - .read_scoped(range.clone(), rows, io, state.input.as_ref(), local) - .await?; - let validity = self - .validity - .read_scoped(range, rows, io, state.validity.as_ref(), local) + .read_scoped(range.clone(), rows, io, local) .await?; + let validity = self.validity.read_scoped(range, rows, io, local).await?; input.mask(validity)?.execute::(local) }) } - fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { - self.input.release(frontier, state.input.as_ref())?; - self.validity.release(frontier, state.validity.as_ref()) + fn release(&self, frontier: u64) -> VortexResult<()> { + self.input.release(frontier)?; + self.validity.release(frontier) } - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 
ScanNode::fmt_chain(self.node.as_ref(), f) } } -/// Executable predicate evidence for one planned predicate expression. -pub trait EvidencePlan: 'static + Send + Sync { - /// The per-query state this evidence plan executes against. - type State: Send + Sync + 'static; - - /// Create this evidence plan's per-query state. - fn init_state(&self, ctx: &VortexSession) -> VortexResult; - - /// Produce evidence for the planned predicate over `req.range`. +/// Prepared predicate evidence for one predicate expression. +pub trait PreparedEvidence: 'static + Send + Sync { + /// Produce evidence for the prepared predicate over `req.range`. fn evidence<'a>( &'a self, req: &'a EvidenceRequest<'a>, io: &'a FileReader, - state: &'a Self::State, ) -> BoxFuture<'a, VortexResult>>; /// Return scheduler-visible segment requests needed for this evidence, when known exactly. fn segment_requests( &self, _req: &EvidenceRequest<'_>, - _state: &Self::State, _cx: &mut SegmentPlanCtx, ) -> VortexResult { Ok(SegmentRequests::unknown()) } - /// A key for sharing this plan's state with sibling evidence plans - /// in the same file. The default keeps one state per planned route. - fn state_cache_key(&self) -> Option { - None - } - - /// Whether this plan is cheap enough to re-run immediately before a + /// Whether this handle is cheap enough to re-run immediately before a /// projection read when a dynamic predicate boundary changes while /// the morsel is in flight. fn recheck_before_projection(&self) -> bool { @@ -1298,80 +1290,51 @@ pub trait EvidencePlan: 'static + Send + Sync { } /// Compact description for plan display. - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "evidence") } } -/// Object-safe view of an [`EvidencePlan`]. -pub trait DynEvidencePlan: Send + Sync { - /// Create this evidence plan's per-query state, type-erased. 
- fn init_state(&self, ctx: &VortexSession) -> VortexResult; +impl dyn PreparedEvidence { + /// Create a morsel-level evidence task for this prepared evidence handle. + pub fn begin_evidence( + self: Arc, + req: OwnedEvidenceRequest, + ) -> VortexResult> { + Ok(Box::new(DefaultEvidenceTask { + evidence: self, + req, + })) + } +} + +/// A morsel-level evidence task. +pub trait EvidenceTask: Send { + /// Return scheduler-visible segment requests needed for this task, when known exactly. + fn segment_requests(&self, cx: &mut SegmentPlanCtx) -> VortexResult; - /// Produce evidence for the planned predicate over `req.range`. + /// Execute the evidence task. fn evidence<'a>( - &'a self, - req: &'a EvidenceRequest<'a>, + self: Box, io: &'a FileReader, - state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>; +} - /// Return scheduler-visible segment requests needed for this evidence, when known exactly. - fn segment_requests( - &self, - req: &EvidenceRequest<'_>, - state: &ScanState, - cx: &mut SegmentPlanCtx, - ) -> VortexResult; - - /// A key for sharing this plan's state with sibling evidence plans. - fn state_cache_key(&self) -> Option; - - /// Whether this plan should run in the projection recheck pass. - fn recheck_before_projection(&self) -> bool; - - /// Compact description for plan display. 
- fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; +struct DefaultEvidenceTask { + evidence: PreparedEvidenceRef, + req: OwnedEvidenceRequest, } -impl DynEvidencePlan for T { - fn init_state(&self, ctx: &VortexSession) -> VortexResult { - Ok(Arc::new(EvidencePlan::init_state(self, ctx)?)) +impl EvidenceTask for DefaultEvidenceTask { + fn segment_requests(&self, cx: &mut SegmentPlanCtx) -> VortexResult { + self.evidence.segment_requests(&self.req.as_request(), cx) } fn evidence<'a>( - &'a self, - req: &'a EvidenceRequest<'a>, + self: Box, io: &'a FileReader, - state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>> { - let state = match downcast_erased_state::(state) { - Ok(state) => state, - Err(e) => return Box::pin(async move { Err(e) }), - }; - EvidencePlan::evidence(self, req, io, state) - } - - fn segment_requests( - &self, - req: &EvidenceRequest<'_>, - state: &ScanState, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let state = downcast_erased_state::(state)?; - EvidencePlan::segment_requests(self, req, state, cx) - } - - fn state_cache_key(&self) -> Option { - EvidencePlan::state_cache_key(self) - } - - fn recheck_before_projection(&self) -> bool { - EvidencePlan::recheck_before_projection(self) - } - - fn fmt_plan(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - EvidencePlan::fmt_plan(self, f) + Box::pin(async move { self.evidence.evidence(&self.req.as_request(), io).await }) } } @@ -1451,16 +1414,19 @@ mod tests { Ok(()) } - fn plan_read(self: Arc, _cx: &mut PlanCtx) -> VortexResult> { + fn prepare_read( + self: Arc, + _cx: &mut PrepareCtx, + ) -> VortexResult> { Ok(None) } - fn plan_stats( + fn prepare_stats( self: Arc, funcs: &[AggregateFnRef], - _cx: &mut PlanCtx, - ) -> VortexResult> { - Ok(Some(Arc::new(TestStatsPlan { len: funcs.len() }))) + _cx: &mut PrepareCtx, + ) -> VortexResult> { + Ok(Some(Arc::new(TestPreparedStats { len: funcs.len() }))) } fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -1468,11 
+1434,11 @@ mod tests { } } - struct TestStatsPlan { + struct TestPreparedStats { len: usize, } - impl StatsPlan for TestStatsPlan { + impl PreparedStats for TestPreparedStats { type State = (); fn init_state(&self, _ctx: &VortexSession) -> VortexResult { @@ -1506,7 +1472,7 @@ mod tests { let funcs = vec![Min.bind(EmptyOptions), Max.bind(EmptyOptions)]; let plan = node - .plan_stats(&funcs, &mut PlanCtx::new(session.clone()))? + .prepare_stats(&funcs, &mut PrepareCtx::new(session.clone()))? .ok_or_else(|| vortex_err!("test scan node did not return a stats plan"))?; let state = plan.init_state(&session)?; let io = FileReader::new(Arc::new(TestSegments::default()), session); diff --git a/vortex-layout/src/scan/v2/request.rs b/vortex-layout/src/scan/v2/request.rs index 1b1dd047d3e..1551838668e 100644 --- a/vortex-layout/src/scan/v2/request.rs +++ b/vortex-layout/src/scan/v2/request.rs @@ -6,8 +6,8 @@ //! Expansion produces layout-local [`ScanNode`](super::node::ScanNode) //! trees. Predicate, projection, aggregate, and dynamic-filter handling //! then push expressions into those nodes and ask the resulting nodes for -//! executable plans. Evidence requests are the per-morsel inputs to those -//! already-planned evidence handles. +//! prepared runtime handles. Evidence requests are the per-morsel inputs to +//! those prepared evidence handles. use std::ops::Range; @@ -31,8 +31,8 @@ pub enum EvidenceMode { /// Scan2 no longer carries predicates through expansion. Layout scan vtables /// must expose expression behavior through /// [`ScanNode::try_push_expr`](super::node::ScanNode::try_push_expr), -/// [`ScanNode::plan_read`](super::node::ScanNode::plan_read), and -/// [`ScanNode::plan_evidence`](super::node::ScanNode::plan_evidence). +/// [`ScanNode::prepare_read`](super::node::ScanNode::prepare_read), and +/// [`ScanNode::prepare_evidence`](super::node::ScanNode::prepare_evidence). 
#[derive(Debug, Default)] pub struct NodeRequest; @@ -43,8 +43,35 @@ impl NodeRequest { } } -/// A runtime evidence request: one planned predicate expression, scoped +/// A runtime evidence request: one prepared predicate expression, scoped /// to the producer's row domain, over one row range. +#[derive(Clone, Debug)] +pub struct OwnedEvidenceRequest { + /// The predicate's stable id within this scan. + pub id: PredicateId, + /// The predicate's version. + pub version: PredicateVersion, + /// The predicate with `root()` rebased to the producer's rows. + pub predicate: Expression, + /// The rows evidence is requested for, in the producer's coordinates. + pub range: Range, + /// Which evidence pass is requesting fragments. + pub mode: EvidenceMode, +} + +impl OwnedEvidenceRequest { + /// Borrow this owned request for a prepared evidence handle. + pub fn as_request(&self) -> EvidenceRequest<'_> { + EvidenceRequest { + id: self.id, + version: self.version, + predicate: &self.predicate, + range: self.range.clone(), + mode: self.mode, + } + } +} + #[derive(Debug)] pub struct EvidenceRequest<'a> { /// The predicate's stable id within this scan. 
From 3b0fcfa23e57330b20df4b41b655309bbe1f3c81 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 20:40:34 -0400 Subject: [PATCH 15/48] Run PR benchmarks with scan2 Signed-off-by: Nicholas Gates --- .github/workflows/bench-pr.yml | 2 ++ .github/workflows/sql-benchmarks.yml | 6 ++++++ .github/workflows/sql-pr.yml | 1 + 3 files changed, 9 insertions(+) diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index cfab4f7fd3b..dc9755b2cd1 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -76,6 +76,7 @@ jobs: shell: bash env: RUST_BACKTRACE: full + VORTEX_SCAN_IMPL: "v2" VORTEX_EXPERIMENTAL_PATCHED_ARRAY: "1" FLAT_LAYOUT_INLINE_ARRAY_NODE: "1" run: | @@ -129,3 +130,4 @@ jobs: secrets: inherit with: mode: "pr" + vortex_scan_impl: "v2" diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index b46fe455a34..1b0ae4deb6c 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -10,6 +10,11 @@ on: required: false type: string default: i7i.metal-24xl + vortex_scan_impl: + required: false + type: string + default: "" + description: "Optional VORTEX_SCAN_IMPL value for Vortex file scans" benchmark_matrix: required: false type: string @@ -282,6 +287,7 @@ jobs: bench: timeout-minutes: 120 env: + VORTEX_SCAN_IMPL: ${{ inputs.vortex_scan_impl }} VORTEX_EXPERIMENTAL_PATCHED_ARRAY: "1" FLAT_LAYOUT_INLINE_ARRAY_NODE: "1" # Makes python output nicer diff --git a/.github/workflows/sql-pr.yml b/.github/workflows/sql-pr.yml index 420425a4321..dd3097c0059 100644 --- a/.github/workflows/sql-pr.yml +++ b/.github/workflows/sql-pr.yml @@ -22,3 +22,4 @@ jobs: secrets: inherit with: mode: "pr" + vortex_scan_impl: "v2" From 26183941e776694464f01c8f69b21770c9723a3c Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 21:59:01 -0400 Subject: [PATCH 16/48] Move scan plan runtime into vortex-scan Rename the runtime scan node API to ScanPlan 
and move the plan and segment primitives into vortex-scan. Layout v2 now expands directly through layout.new_scan_plan with a plan ScanRequest, and the docs describe the v2 path as the layout scan model. Signed-off-by: Nicholas Gates --- Cargo.lock | 1 + docs/concepts/file-format.md | 2 +- docs/concepts/layouts.md | 93 +++-- docs/concepts/scanning.md | 174 ++++++---- docs/developer-guide/extending/index.md | 4 +- .../extending/writing-a-layout.md | 165 ++++++++- .../integrations/datafusion.md | 37 +- docs/developer-guide/integrations/duckdb.md | 17 +- docs/developer-guide/internals/io.md | 29 +- .../internals/scan-scheduler.md | 140 ++++---- .../internals/serialization.md | 5 +- docs/developer-guide/internals/session.md | 15 +- docs/developer-guide/internals/vtables.md | 18 +- docs/specs/file-format.md | 3 +- vortex-datafusion/src/persistent/format.rs | 6 +- vortex-datafusion/src/persistent/opener.rs | 8 +- vortex-datafusion/src/v2/source.rs | 2 +- vortex-file/src/file.rs | 28 +- vortex-file/src/multi/mod.rs | 4 +- vortex-file/src/multi/scan_v2.rs | 245 +++++++------- vortex-file/src/scan_v1_v2_differential.rs | 10 +- vortex-file/src/tests.rs | 4 +- vortex-layout/src/layout_v2.rs | 95 +++--- vortex-layout/src/scan/v2/layouts/chunked.rs | 113 ++++--- vortex-layout/src/scan/v2/layouts/dict.rs | 62 ++-- vortex-layout/src/scan/v2/layouts/flat.rs | 38 +-- vortex-layout/src/scan/v2/layouts/struct_.rs | 76 +++-- vortex-layout/src/scan/v2/layouts/zoned.rs | 93 +++-- vortex-layout/src/scan/v2/mod.rs | 12 +- vortex-layout/src/segments/mod.rs | 42 +-- vortex-scan/Cargo.toml | 6 +- vortex-scan/src/lib.rs | 3 + .../v2 => vortex-scan/src/plan}/evidence.rs | 12 +- .../v2/node.rs => vortex-scan/src/plan/mod.rs | 320 +++++++++--------- .../v2 => vortex-scan/src/plan}/request.rs | 23 +- vortex-scan/src/scheduler.rs | 2 +- vortex-scan/src/segments/mod.rs | 47 +++ .../src/segments/scheduled.rs | 10 +- .../src/segments/source.rs | 3 +- 39 files changed, 1102 insertions(+), 865 
deletions(-) rename {vortex-layout/src/scan/v2 => vortex-scan/src/plan}/evidence.rs (97%) rename vortex-layout/src/scan/v2/node.rs => vortex-scan/src/plan/mod.rs (82%) rename {vortex-layout/src/scan/v2 => vortex-scan/src/plan}/request.rs (79%) create mode 100644 vortex-scan/src/segments/mod.rs rename {vortex-layout => vortex-scan}/src/segments/scheduled.rs (99%) rename {vortex-layout => vortex-scan}/src/segments/source.rs (89%) diff --git a/Cargo.lock b/Cargo.lock index 3f7b7002c87..1e8f646028f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10167,6 +10167,7 @@ dependencies = [ "futures", "parking_lot", "roaring", + "rustc-hash", "tracing", "vortex-array", "vortex-buffer", diff --git a/docs/concepts/file-format.md b/docs/concepts/file-format.md index 36a6fe0a935..f97276840fc 100644 --- a/docs/concepts/file-format.md +++ b/docs/concepts/file-format.md @@ -5,7 +5,7 @@ The writer accepts a stream of Vortex arrays, applies a layout strategy to organ and serializes the layout and its segments into a single file. The bulk of the file format specification describes the representation of the footer bytes such that the -layout tree can be reconstructed for scans. +layout tree can be reconstructed and expanded into scan plans. See the [Vortex File Format Specification](../specs/file-format.md) for full details. diff --git a/docs/concepts/layouts.md b/docs/concepts/layouts.md index f6503b3330e..9399a46670c 100644 --- a/docs/concepts/layouts.md +++ b/docs/concepts/layouts.md @@ -1,48 +1,79 @@ # Layouts -Layouts are the out-of-memory equivalent of [Vortex arrays](/concepts/arrays). They are similarly hierarchical, -with an associated vtable, metadata, dtype, children, and lazy buffers known as "segments". +Layouts are the out-of-memory equivalent of [Vortex arrays](/concepts/arrays). A layout describes +how a logical array is organized across children and file segments so that scans can read only the +data they need. -The tree-structure of a layout can be serialized and persisted. 
During deserialization, the layout is bound to a -segment source that can lazily fetch the data buffers as needed. This abstraction allows Vortex to implement highly -efficient columnar scans over any block storage including local disk, object stores, remote caches like Redis, -Postgres block storage, and more. +The serialized layout tree is stored in a file footer. During deserialization, Vortex resolves each +layout encoding ID through the session's layout registry and constructs a typed `Layout<V>`: -In fact, the [Vortex file format](/concepts/file-format) is just a serialized layout tree with the data segments -stored in the same file. +- common fields are hoisted into `Layout<V>`: dtype, row count, child access, and segment IDs; +- layout-specific metadata lives in `V::LayoutData`; +- the erased `LayoutRef` lets heterogeneous layout nodes live in one tree; and +- child layouts are materialized lazily from the footer FlatBuffer when a scan route asks for them. + +A layout does not execute a scan directly. Its vtable expands the typed layout into a +[`ScanPlan`](scanning.md), and the scan runtime prepares evidence, predicate, projection, +statistics, and aggregate work from that node tree. ## Built-in Layouts -As with arrays, Vortex provides a number of built-in layouts, and users can define their own custom layouts. +As with arrays, Vortex provides a number of built-in layouts, and users can define their own custom +layouts. + +| Name | Description | +|--------------------|--------------------------------------------------------------------------------------------------------| +| `FlatLayout` | Stores one serialized Vortex array in one segment. | +| `StructLayout` | Stores named child layouts corresponding to fields of a struct dtype. | +| `ChunkedLayout` | Stores row-wise partitioned child layouts and exposes chunk boundaries as natural scan splits. | +| `DictionaryLayout` | Stores dictionary values in one child and row-domain codes in another child. 
| +| `ZonedLayout` | Stores a data child plus zone statistics that can produce predicate evidence before reading row data. | + +## Layout Children + +Child relationships are part of the layout contract. A child can be: + +- a field child, such as one column of a struct; +- a chunk child, covering a row range of the parent; +- a transparent child, such as the data child of a zoned wrapper; or +- an auxiliary child, such as a validity bitmap, dictionary values, or zone statistics. + +The parent vtable defines each child's expected dtype and relationship. This lets Vortex validate +lazy child access without deserializing the entire tree up front. + +## Layouts and Segments + +Layouts refer to data buffers by `SegmentId`. A segment source, such as a Vortex file or an +in-memory buffer, maps those logical segment IDs to bytes. This indirection keeps the layout tree +independent of where the bytes live: local disk, object storage, an embedded buffer, or a remote +cache can all back the same logical layout structure. -| Name | Description | -|--------------------|---------------------------------------------------------------------------------------------------------| -| `FlatLayout` | A layout that holds a single serialized Vortex array. | -| `StructLayout` | A layout that holds a collection of named child layouts, corresponding to an associated `StructDType`. | -| `ChunkedLayout ` | A layout that holds a collection of row-wise partitioned child layouts. | -| `DictionaryLayout` | A layout that shares a single dictionary of values with a child layout holding indices. | -| `ZonedLayout` | A layout that stores a zone-map of statistics to perform filter pruning. | +The scan path asks prepared reads and prepared evidence handles for segment requests when the +requests are known exactly. The segment source handles caching, coalescing, and in-flight +deduplication. 
+## Example: Parquet Row Groups -### Example: Parquet Row Groups +Layouts can be composed together in arbitrary hierarchical structures. This allows writers to model +the performance characteristics of other file formats or storage systems. -Layouts can be composed together in arbitrary hierarchical structures. This allows users of Vortex to configure -writers that model the performance characteristics of other file formats or storage systems. +As an example, a Parquet-like layout could use: -As an example, suppose we want to replicate the behavior of Parquet row groups in Vortex. We would define a layout that -looked something like: +- `ChunkedLayout(ChunkBy::RowCount(100_000))` at the top level for row groups. +- `StructLayout` inside each row group to split data by column. +- `ChunkedLayout(ChunkBy::CompressedSize(64k))` inside each column for page-like pieces. +- `FlatLayout` leaves that store serialized array chunks. -* `ChunkedLayout(ChunkBy::RowCount(100_000))` - at the top-level, we define row-groups of at most 100k rows. - * `StructLayout` - Parquet then splits the row group into individual columns known as column chunks. - * `ChunkedLayout(ChunkBy::CompressedSize(64k))` - finally, each column chunk is split into pages by compressed - size. +The scan runtime would still see one `ScanPlan` tree. Column projections route through the struct +node, row-range work routes through chunked nodes, and leaf reads touch only the flat segments needed +for the current morsel. ## Layout Strategies -A `LayoutStrategy` defines how to construct a layout tree from a stream of Vortex arrays. These strategies can -partition arrays by column, by row-groups, or by any other arbitrary scheme. Some strategies compute pruning stats, -others apply compression to the data. +A `LayoutStrategy` defines how to construct a layout tree from a stream of Vortex arrays. Strategies +can partition arrays by column, by row range, by size, or by any other scheme. 
Some strategies +compute pruning statistics, and others choose compression or buffering policies for leaf data. -For segment sinks that are locality-aware, such as a Vortex file, layout strategies can make use of sequence IDs. -These are powerful logical clocks that allow layouts to parallelize writes and compression tasks while maintaining -full control and determinism over where segments are written into the file. +For segment sinks that are locality-aware, such as a Vortex file, layout strategies can use sequence +IDs. These logical clocks let layouts parallelize writes and compression tasks while retaining +deterministic control over where segments are written. diff --git a/docs/concepts/scanning.md b/docs/concepts/scanning.md index 393b7d2d83c..890bc315157 100644 --- a/docs/concepts/scanning.md +++ b/docs/concepts/scanning.md @@ -1,93 +1,131 @@ -# Scan API +# Scanning + +Vortex scans are built around the layout tree stored in a file footer. A scan opens the file, +deserializes the root layout, expands that layout into a `ScanPlan` tree, and prepares executable +runtime handles for predicates, projections, statistics, and aggregates. + +The query engine sees a standard scan request: a projection, an optional filter, ordering +requirements, limits, and split preferences. The layout and scan layers decide how to satisfy that +request with the least data movement. + +```text +footer layout bytes + | + v +LayoutRef / Layout + | + v +ScanPlan tree + | + +-- push expressions into layout-local row domains + +-- prepare predicate evidence + +-- prepare residual predicate reads + +-- prepare projection reads + +-- prepare statistics and aggregate answers + | + v +morsel execution -> array batches +``` -:::{note} -The Scan API is on the roadmap and under active development. The core `Source` trait and scan pipeline -are functional, but the full API surface is not yet fully defined or implemented. 
-::: +## Layout Expansion -The Vortex Scan API defines a standard interface between data storage and query engines. It solves the -N x M problem of having N different storage backends and M different query engines by providing a common -interface that both sides can implement against. +Each layout encoding has a layout vtable. The serialized form stores common fields such as dtype, +row count, child layouts, and segment IDs. Deserialization hoists those common fields into +`Layout<V>` and leaves only layout-specific metadata in `V::LayoutData`. -``` - Storage Query Engines - ─────── ───────────── +The layout vtable's scan hook expands a `Layout<V>` into a `ScanPlan`. This keeps serialized layout +concerns separate from runtime execution: layouts describe the physical organization of data, +whereas `ScanPlan`s expose what that organization can do during a scan. - Vortex Files ──► ┌──────────────┐ ──► DuckDB - Parquet Files ──► │ Scan API │ ──► DataFusion - Iceberg Tables ──► └──────────────┘ ──► Spark -``` +Layout children are lazy. Accessing a child validates the dtype expected by the parent and +materializes that child from the same footer FlatBuffer only when a scan route actually needs it. +For example, a struct layout does not need to deserialize every column child when the query reads +only a few fields. + +## Scan Plans -Storage backends implement the `Source` trait for reads. Query engines issue a scan request -describing the filter and projection to push down, and the source returns a stream of -independently-executable splits that can be run concurrently to produce result arrays. An -equivalent `Sink` trait exists for the write path, accepting an array stream and writing it to -the underlying storage. +A `ScanPlan` is an immutable runtime view of a layout. 
It can: -## Motivation +- push an expression into the plan's row domain; +- prepare value reads for the plan's root value; +- prepare predicate evidence; +- provide natural split hints; +- answer statistics or partial aggregates from metadata; and +- release cached state behind a completed row frontier. -Traditional data integrations require each storage backend and query engine to agree on a common -interchange format, typically Apache Arrow. This means the storage backend must fully decompress its -data into Arrow arrays, even if the query engine could operate on the compressed representation -directly. +Pushing an expression returns another `ScanPlan` whose `root()` value is that expression. A struct +plan can route `field("a")` to the child for column `a`; a dictionary plan can apply some +expressions once over dictionary values and reuse the result with per-row codes; a generic apply +plan handles expressions that cannot be pushed into a specialized layout. -The Vortex Scan API avoids this by allowing data to flow between storage and query engines in its -native compressed encoding. For example, the DuckDB integration can receive FSST-encoded string -arrays directly from a Vortex file and pass them into DuckDB's own internal FSST format without -any decompression step. +## Prepared Runtime Handles -## Source +Planning a scan creates prepared handles from the `ScanPlan` tree: -A **Source** represents any scannable tabular data. It accepts a scan request (filter, projection, -limit) and returns a stream of independently-executable splits. An equivalent **Sink** interface -exists for the write path, allowing query engines to both read from and write to any storage -backend through a single pair of interfaces. +- `PreparedEvidence` produces evidence fragments for one predicate expression. +- `PreparedRead` reads one pushed projection or residual predicate expression. +- `PreparedStats` and `PreparedAggregate` answer metadata-backed statistics and aggregates. 
+- `PreparedSplit` reports row ranges that are natural units of scan work. -### Splits +Prepared handles are scan-level runtime objects. They can hold child prepared handles and shared +state, but they do not choose the next row range themselves. The scan driver chooses explicit +morsel ranges and asks prepared handles to work on those ranges. -A source produces splits, each representing an independent unit of work that can be executed in -parallel. A split typically corresponds to a range of rows in a layout, such as a chunk or a set -of row-group partitions. +Each morsel carries a `RowScope`: -Each split carries size and row count estimates that query engines use for scheduling decisions. -Splits can also be serialized for distributed execution across remote workers. +- `selection` says which rows in the requested range remain live. +- `demand` says which selected rows need meaningful values from this operation. -### Remote Sources +This lets a projection skip data that no longer affects output, while still preserving output +cardinality for selected rows. -A source may front remote storage rather than local files. In this case, the split's execution -issues a remote call and receives the result over the network. The -[Vortex IPC format](../specs/ipc-format.md) can be used as the wire protocol for these calls, allowing -compressed arrays to be transferred without decompression. This gives remote sources the same -zero-decompression benefits as local scans -- the data stays in its compressed encoding end-to-end, -from remote storage through the network and into the query engine. +## Predicate Evidence -## Filter Pushdown +Predicates are decomposed into independent expressions. Before reading row data for a predicate, +the scan asks available prepared evidence handles whether metadata can prove something about the +requested rows. -Filter expressions are decomposed into individual conjuncts (AND-separated terms) and evaluated -independently. 
The scan tracks the selectivity of each conjunct using a probabilistic sketch -and dynamically reorders them so that the most selective predicates are evaluated first. This -means that as a scan progresses, it learns the most efficient evaluation order for the filter. +Evidence is a statement over the row domain. A zone map can prove that a range cannot match a +predicate; file or layout statistics can prove that a predicate is already satisfied; other +evidence sources can leave a range unknown. Unknown rows continue to residual predicate reads, +which materialize only the columns needed to compute the predicate exactly. -Filters are evaluated in two stages. First, pruning evaluation uses statistics stored in a -`ZonedLayout` auxiliary `zones` child to eliminate entire row zones without reading the underlying -data child. These pruning predicates are falsification checks derived from the original filter, for -example by comparing a zone's min/max values against the requested predicate. Second, filter -evaluation materializes only the filter-referenced columns and computes a row mask of matching -rows. +Prepared evidence handles are expected to be cheap relative to projection reads. They should use +layout metadata, statistics, indexes, or already-prepared shared state rather than speculatively +reading large data columns. Cheap evidence can also opt into a final `recheck_before_projection` +pass, which is useful when dynamic filters change while a morsel is in flight. ## Projection Pushdown -Projection expressions describe the output schema of the scan. The scan analyzes the projection -and filter expressions to compute two field masks: which columns are needed for filtering, and -which are needed for the final output. Only the union of these columns is read from storage. +Projection pushdown is expression pushdown through the `ScanPlan` tree. 
The scan prepares reads only +for the requested output expressions, and each layout decides how much of its child tree those reads +need. + +For a struct layout, field access routes to the named child and avoids unrelated columns. For a +chunked layout, the read is sliced by chunk and only overlapping chunks with demanded rows are +visited. For a dictionary layout, values can be shared across row ranges while codes are read for +the requested rows. + +## State and Caches + +The scan path uses several layers of state: + +- The segment source owns physical I/O, coalescing, segment caching, and in-flight segment + deduplication. +- The expanded `ScanPlan` tree is immutable and safe to share. +- `PrepareCtx` owns a prepared-state cache for scan/file-level state shared by prepared reads, + evidence, aggregate, and stats handles. +- A layout plan can create child-local prepared-state caches so repeated pushes into the same child + share decoded dictionaries, zone tables, or other expensive setup without leaking state across + unrelated row domains. +- Morsel tasks own only the row range and masks needed for that operation. -Columns needed exclusively for filtering are discarded after the filter mask is computed, so they -never appear in the output stream. This separation ensures minimal data movement throughout the -pipeline. +When ordered scans advance, prepared reads and scan plans receive a release frontier. Layouts use +that frontier to drop caches that only cover rows that cannot be read again. ## Query Engine Integration -Query engines integrate with the Scan API by translating their internal plan representations into -scan requests and consuming the resulting array stream in their preferred format. Integrations -exist for DuckDB, DataFusion, Spark, and Trino, with each engine converting its native filter -and projection representations into Vortex [expressions](expressions.md). 
+Query engines translate their native expressions into Vortex expressions and submit a scan request. +Vortex handles layout expansion, evidence, residual predicates, projection reads, and array +production. Integrations then export the produced Vortex arrays to the engine's preferred batch +format, such as Arrow `RecordBatch`es for DataFusion or DuckDB `DataChunk`s for DuckDB. diff --git a/docs/developer-guide/extending/index.md b/docs/developer-guide/extending/index.md index ff004a69bc5..315f9170cfe 100644 --- a/docs/developer-guide/extending/index.md +++ b/docs/developer-guide/extending/index.md @@ -13,8 +13,8 @@ The following topics are planned for this section: and Arrow interoperability. - **Writing an Encoding** -- implementing a custom array encoding with compression and decompression logic. -- **Writing a Layout** -- implementing the LayoutReader and LayoutWriter traits for custom - on-disk data organizations. +- **Writing a Layout** -- implementing a layout vtable, lazy child contracts, and ScanPlan + expansion for custom on-disk data organizations. - **Writing a Compute Function** -- the dispatch model, implementing kernels, vtable registration, and testing. diff --git a/docs/developer-guide/extending/writing-a-layout.md b/docs/developer-guide/extending/writing-a-layout.md index 112f7c6a80d..738061a7e8c 100644 --- a/docs/developer-guide/extending/writing-a-layout.md +++ b/docs/developer-guide/extending/writing-a-layout.md @@ -1,13 +1,160 @@ # Writing a Layout -:::{warning} -This page is under construction. -::: +A Vortex layout plugin describes serialized layout metadata and how that layout expands into the +scan runtime. Layout plugins do not implement a separate reader trait. Instead, they implement the +layout vtable, deserialize layout-specific data into `Layout`, and return a `ScanPlan` for +runtime reads. 
-Planned content: +## Layout Vtable -- What a layout is and when you need a custom one -- Implementing the LayoutReader trait -- Implementing the LayoutWriter trait -- Registering layouts with a session -- Testing layout implementations +The layout vtable lives in `vortex_layout::layout_v2`. Its shape follows the same plugin pattern as +the other Vortex vtables: + +- `Layout<V>` is the typed layout handle. +- `LayoutRef` is the type-erased layout handle. +- `LayoutParts<V>` constructs typed layouts from common fields plus `V::LayoutData`. +- `DynLayout` is private erased dispatch plumbing. +- `LayoutVTablePlugin` is the registry object used for ID-based deserialization. + +Common fields are hoisted out of the plugin-specific data. The vtable receives dtype, row count, +segment IDs, lazy child access, and layout metadata during deserialization, but returns only the +layout-specific `LayoutData`. + +```rust +use vortex_layout::layout_v2; +use vortex_layout::{LayoutChildType, LayoutId}; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; +use vortex_session::VortexSession; + +#[derive(Clone, Debug)] +pub struct MyLayout; + +#[derive(Clone, Debug)] +pub struct MyLayoutData { + // layout-specific metadata +} + +impl layout_v2::VTable for MyLayout { + type LayoutData = MyLayoutData; + + fn id(&self) -> LayoutId { + LayoutId::new("example.my_layout") + } + + fn deserialize( + &self, + args: &layout_v2::LayoutDeserializeArgs<'_>, + ) -> vortex_error::VortexResult { + // Parse args.metadata and validate args.segment_ids / args.children. + Ok(MyLayoutData {}) + } + + fn child_dtype( + layout: layout_v2::Layout, + idx: usize, + ) -> vortex_error::VortexResult { + // Return the dtype expected for child `idx`. 
+ Ok(layout.dtype().clone()) + } + + fn child_type( + _layout: layout_v2::Layout, + idx: usize, + ) -> vortex_error::VortexResult { + Ok(LayoutChildType::Transparent(format!("child-{idx}").into())) + } + + fn new_scan_plan( + layout: layout_v2::Layout, + req: &mut ScanRequest, + session: &VortexSession, + ) -> vortex_error::VortexResult { + // Expand the layout into a runtime ScanPlan. + todo!() + } +} +``` + +## Deserialization + +`LayoutDeserializeArgs` contains the common serialized fields: + +- `dtype`: the logical dtype of this layout; +- `row_count`: the number of rows in this layout's row domain; +- `metadata`: plugin-specific metadata bytes; +- `segment_ids`: logical segments referenced directly by this layout; +- `children`: lazy child access; +- `array_ctx`: the array read context captured from the file footer. + +Use `deserialize` to validate invariants that are local to the layout. For example, a flat layout +requires exactly one segment ID, and a chunked layout verifies that child row counts add up to the +parent row count. + +Do not eagerly deserialize children unless the layout metadata itself requires it. Child access is +intentionally lazy so projection and predicate pushdown can avoid unrelated branches of a wide +layout tree. + +## Child Contracts + +`child_dtype` and `child_type` define the contract between a parent layout and its children. The +scan path calls `layout.child(idx)`, which asks the parent for the expected dtype and then +materializes that child from the footer. + +Use `LayoutChildType` to describe how child rows relate to parent rows: + +- `Field(name)` for struct fields; +- `Chunk((idx, offset))` for row-range chunks; +- `Transparent(name)` for wrappers whose data child shares the parent row domain; +- `Auxiliary(name)` for metadata or support children such as validity, dictionary values, or zone + statistics. + +These relationships are used by debugging tools, split planning, and scan expansion. 
+ +## ScanPlan Expansion + +`new_scan_plan` turns a typed layout into an immutable runtime `ScanPlan`. The plan should hold +layout metadata and child plan references, not per-morsel state. Runtime state belongs in prepared +handles or state caches created during preparation. + +A `ScanPlan` implementation can specialize: + +- `try_push_expr` to route expressions into children or rewrite them into a cheaper row domain; +- `prepare_read` to produce a `PreparedRead` for the plan's root value; +- `prepare_evidence` to produce cheap predicate evidence from metadata or indexes; +- `prepare_stats` and `prepare_aggregate_partial` for metadata-backed answers; +- `split_hints` to expose natural morsel boundaries; and +- `release` to drop caches behind the completed-row frontier. + +The layout vtable expands child layouts by calling `child.new_scan_plan(req, session)`. Pass the +same `ScanRequest` through for children in the same row domain, and use a fresh +`ScanRequest::empty()` for children in independent row domains such as dictionary values or zone +statistics. This keeps the layout plugin responsible for its local structure while the scan runtime +owns predicate ordering, morsel execution, and output assembly. + +## State Placement + +Keep state at the narrowest level that can safely reuse it: + +- `ScanPlan` stores immutable structure only. +- `PrepareCtx::shared_state` stores scan/file-level prepared state shared across prepared reads, + evidence, statistics, and aggregate handles. +- Layouts with independent child row domains can create child-local prepared-state caches so one + child shares dictionaries, zone tables, or decoded setup without colliding with another child. +- `ReadTask` and `EvidenceTask` own only one morsel's range and masks. +- Segment bytes belong to the segment source and segment cache, not to layout plans. 
+ +This separation lets a scan clone and prepare many pushed expressions while still sharing expensive +setup where the row domain is the same. + +## Registration + +Register layout vtables through the session's layout registry: + +```rust +use vortex_layout::LayoutSessionExt; + +session.layouts().register_v2(MyLayout); +``` + +The session resolves serialized layout IDs through this registry when opening a Vortex file. diff --git a/docs/developer-guide/integrations/datafusion.md b/docs/developer-guide/integrations/datafusion.md index 93f45be0a8d..52d421ecb91 100644 --- a/docs/developer-guide/integrations/datafusion.md +++ b/docs/developer-guide/integrations/datafusion.md @@ -21,9 +21,9 @@ discovered file becomes a `PartitionedFile` that DataFusion assigns to execution Vortex implements the `FileOpener` trait to open individual files on demand as DataFusion's executor schedules them. -Layout readers are cached across partitions using a shared concurrent map keyed by file path. -This avoids redundant footer parsing when the same file is accessed by multiple partitions or -repeated queries. +Opened file metadata and scan preparation state are shared across partitions where possible, keyed +by file path. This avoids redundant footer parsing and repeated layout expansion when the same file +is accessed by multiple partitions or repeated queries. ## Threading Model @@ -33,24 +33,25 @@ runtime handle. All I/O -- file opens, segment reads, object store fetches -- is Tokio tasks and scheduled across Tokio's multi-threaded executor. DataFusion's physical executor manages parallelism by assigning partitions to its own task pool. -Each partition opens its files and drives a `ScanBuilder` that returns an async stream of -record batches. Multiple partitions execute concurrently, with DataFusion controlling the degree -of parallelism. +Each partition opens its files and drives a Vortex file scan backed by layout expansion and +`ScanPlan` prepared reads. 
The scan returns an async stream of record batches. Multiple partitions +execute concurrently, with DataFusion controlling the degree of parallelism. ## Filter and Projection Pushdown The integration converts DataFusion physical expressions into Vortex expressions using an `ExpressionConvertor` trait. Supported predicates (comparisons, LIKE, IS NULL, IN lists, casts) -are pushed into the Vortex scan where they participate in pruning and filter evaluation at the -layout level. Unsupported predicates remain in the DataFusion plan and are evaluated after the -scan. +are pushed into the Vortex scan where they participate in layout-level evidence, pruning, and +residual filter evaluation. Unsupported predicates remain in the DataFusion plan and are evaluated +after the scan. Filter pushdown operates at two levels. The full predicate is used to prune entire files before they are opened, using file-level statistics. The subset of predicates that Vortex can evaluate efficiently is pushed into the per-file scan for row-level filtering. Projection pushdown maps DataFusion's requested column indices to Vortex field names and passes -them as a projection expression to the scan. Only the requested columns are read from storage. +them as projection expressions to the scan. Struct layouts route those expressions to the requested +field children, so only the requested columns are read from storage. The integration supports pluggable expression conversion via a custom `ExpressionConvertor`, allowing engine-specific rewrites or schema adaptation when file schemas diverge from the table @@ -61,15 +62,9 @@ schema. Vortex arrays produced by the scan are converted to Arrow `RecordBatch`es for consumption by DataFusion. Batches are sliced to respect DataFusion's configured batch size preference. -## Future Work +## Dynamic Filters -The current integration builds directly on the `ScanBuilder` and layout reader APIs. 
Future work -will migrate it to use the [Scan API](/concepts/scanning) `Source` trait, which will simplify -the integration by providing a standard interface for file discovery, partitioning, and pushdown -that is shared across all engine integrations. - -Other planned improvements include projection expression pushdown, which would allow DataFusion -to push complex projection expressions (such as extracting nested struct fields) into the Vortex -scan rather than materializing entire columns and projecting afterwards. Additionally, better -support for dynamic expressions would enable use-cases like top-k queries, where the scan's -filter expression is updated during execution as the query engine discovers tighter bounds. +Dynamic expressions support use-cases like top-k queries, where the query engine discovers tighter +bounds during execution. When a dynamic predicate version changes, cheap prepared evidence handles can recheck +in-flight morsels before projection so the scan avoids reading output rows that are no longer +needed. diff --git a/docs/developer-guide/integrations/duckdb.md b/docs/developer-guide/integrations/duckdb.md index 060f5fca973..77501bebd0a 100644 --- a/docs/developer-guide/integrations/duckdb.md +++ b/docs/developer-guide/integrations/duckdb.md @@ -42,13 +42,15 @@ the [runtime documentation](../internals/async-runtime.md) for more on this trad DuckDB's planner pushes filter predicates into the scan via the `pushdown_complex_filter` callback. These are converted from DuckDB's bound expression representation into Vortex expressions and stored alongside any table filter expressions. During scanning, the combined -filter is applied to the `ScanBuilder` for each file. +filter is pushed into the Vortex file scan for each file. Files can be pruned entirely before opening if their statistics prove that no rows can match -the filter. +the filter. 
For opened files, layout-level evidence can prune row ranges before residual +predicate reads materialize row data. Projection pushdown maps DuckDB's requested column indices to Vortex field names and passes -them as a projection expression to the scan. +them as projection expressions to the scan. Struct layouts route those expressions to field +children, so unrelated columns are not read. ## Data Export @@ -61,9 +63,8 @@ canonical (Arrow-compatible) conversion before export. Results are exported in chunks matching DuckDB's standard vector size to align with its vectorized execution model. -## Future Work +## Scan Runtime -The current integration builds directly on the `ScanBuilder`, layout reader, and file APIs. -Future work will migrate it to use the [Scan API](/concepts/scanning) `Source` trait, unifying -file discovery, multi-file coordination, and pushdown behind a single interface shared across -all engine integrations. +DuckDB workers consume chunks from a shared scan stream. Vortex opens files, expands layouts into +ScanPlan trees, prepares evidence and projection reads, and exports produced arrays into DuckDB's +native vector format. diff --git a/docs/developer-guide/internals/io.md b/docs/developer-guide/internals/io.md index da5c99e6333..9871093af46 100644 --- a/docs/developer-guide/internals/io.md +++ b/docs/developer-guide/internals/io.md @@ -80,10 +80,9 @@ callers request the same segment simultaneously. `SegmentId` is scoped to one file's footer segment map. A shared cache used across several opened files must include file/source identity in its effective key to avoid collisions between, for -example, `SegmentId(0)` in two different files. The current `SegmentSource` adapter is installed -per opened file, but the `SegmentCache` trait key is still a raw `SegmentId`; a cache object shared -across unrelated files must namespace entries itself. The scheduler-aware path should make this -namespace explicit with a source-scoped segment cache key. 
+example, `SegmentId(0)` in two different files. The `SegmentSource` adapter is installed per opened +file, and scheduler-aware segment requests carry source identity when scan work is coordinated +across sources. ## Backend Adaptation @@ -101,15 +100,13 @@ compatibility. ## Scan Scheduler Integration -The ScanNode scheduler design keeps `VortexReadAt` as the common adapter for positional byte -sources, but makes segment-future creation explicit. A prepared `VortexFile` binds layout -`SegmentId`s to a registered segment source, and scheduled morsel futures request segments through a -scheduler context. The scheduler sees the source ID, segment ID, byte size, and priority metadata, -but not physical byte locations. Cacheable segment reads carry a source-scoped segment cache key. - -The intermediate model lets constructing a morsel future register the `SegmentFuture`s it will later -await, while the scheduler owns an in-flight future cache keyed by `(SegmentSourceId, SegmentId)`. -The end state can make this stricter by requiring execution to read only from a submitted segment -resolver. In both models, the segment source remains responsible for segment-cache lookup, -backend-specific physical coalescing, and submission. See [Scan Scheduler](scan-scheduler.md) for -the target design. +The ScanPlan scheduler keeps `VortexReadAt` as the common adapter for positional byte sources, but +makes segment requests visible to scan planning. A prepared `VortexFile` binds layout `SegmentId`s +to a segment source, and morsel tasks can report the segment requests they need before execution. +The scheduler sees the source ID, segment ID, byte size, and priority metadata, but not physical +byte locations. Cacheable segment reads carry a source-scoped segment cache key. + +Prepared reads and evidence tasks request segments through the segment source. 
The source remains +responsible for segment-cache lookup, backend-specific physical coalescing, in-flight +deduplication, and submission. See [Scan Scheduler](scan-scheduler.md) for scheduler resource +coordination. diff --git a/docs/developer-guide/internals/scan-scheduler.md b/docs/developer-guide/internals/scan-scheduler.md index de93fa353c2..9f9ed111ec7 100644 --- a/docs/developer-guide/internals/scan-scheduler.md +++ b/docs/developer-guide/internals/scan-scheduler.md @@ -1,11 +1,11 @@ # Scan Scheduler :::{note} -This is an implementation design for the ScanNode-backed scan path. It describes the scheduler -shape the V2 scan should grow into, not the current behavior of the released scan API. +This is an implementation design for scheduler-aware ScanPlan execution. It describes the resource +coordination shape that the scan runtime is growing toward. ::: -The ScanNode scan path needs a resource scheduler that can coordinate work across files, partitions, +The ScanPlan scan path needs a resource scheduler that can coordinate work across files, partitions, and concurrent scans. The scheduler should be explicit and embeddable: a host engine can share one scheduler across many scans to enforce global limits, or create a fresh scheduler for each query to isolate resource usage. @@ -15,7 +15,7 @@ for query semantics. The existing `DataSource` / `ScanRequest` / `DataSourceScan` API remains the public query-engine boundary for this phase. The scheduler and morsel runtime sit behind that boundary, so the first -implementation can improve V2 execution without introducing a second scan API that mostly duplicates +implementation can improve scan execution without introducing a second scan API that mostly duplicates the current one. ## Goals @@ -24,7 +24,7 @@ the current one. - Allow DataFusion users to choose a shared scheduler, a new scheduler per query, or an unbounded mode. - Give DuckDB a simple global scheduler owned by the extension session. 
-- Keep ScanNode planning and morsel ordering local to each scan. +- Keep ScanPlan planning and morsel ordering local to each scan. - Make I/O planning explicit enough that future evidence, predicate, and projection reads can be deduplicated, batched, and prioritized without relying on hidden unpolled futures inside layout readers. @@ -40,11 +40,11 @@ the current one. - Do not put query semantics, filter ordering, evidence planning, or output ordering into the global scheduler. - Do not replace the `DataSource` scan API in the first scheduler implementation. If the public API - changes later, it should be because the V2 runtime needs capabilities that cannot be added + changes later, it should be because the ScanPlan runtime needs capabilities that cannot be added compatibly to `ScanRequest` or `DataSourceScan`. - Do not require every scan integration to expose the same configuration surface immediately. - Do not solve cluster-wide distributed admission control. The scheduler is process-local. -- Do not design an opaque I/O path in the first implementation. If a future custom `ScanNode` needs +- Do not design an opaque I/O path in the first implementation. If a future custom `ScanPlan` needs non-segment I/O, add that as a small extension point next to `SegmentRequest`. ## Core Model @@ -60,12 +60,11 @@ There are three layers: priority, metrics, and per-scan limits. 3. Per-scan `MorselScanRuntime` - Owns the ScanNode graph, evidence/read/aggregate plans, morsel queue, row ordering, limit + Owns the ScanPlan graph, evidence/read/aggregate plans, morsel queue, row ordering, limit handling, dynamic filters, and the choice of which work is useful next. `DataSource::scan` constructs this per-scan runtime internally and returns the existing -`DataSourceScan` wrapper. Query engines should not need to know whether a data source is implemented -by the legacy `LayoutReader` path or the V2 ScanNode runtime. +`DataSourceScan` wrapper. 
Query engines do not need to know the internal ScanPlan topology. The scheduler decides whether work may run. The per-scan runtime decides what work should run. @@ -152,13 +151,13 @@ pub trait ScanSchedulerSessionExt: SessionExt { } ``` -The default should be `Unbounded` initially, so enabling the V2 scan does not silently introduce new +The default can be `Unbounded` initially, so adopting the scheduler does not silently introduce new resource limits. Integrations can opt into bounded scheduling explicitly. The scheduler types should live in `vortex-scan`, not `vortex-layout`, because the resource policy -belongs to the scan API layer and should be reusable by non-layout sources. ScanNode-specific code in +belongs to the scan API layer and should be reusable by non-layout sources. ScanPlan-specific code in `vortex-layout` can consume tickets and permits through the public scan scheduler API without making -the scheduler understand layout-specific node types. +the scheduler understand layout-specific plan types. ## DataFusion Integration @@ -183,9 +182,9 @@ impl VortexDataSourceBuilder { The same options should be available on `VortexTable` and `VortexFormatFactory` so users who register tables through DataFusion's listing format path can still control scheduling. -For the current V2 DataFusion path, `DataSource::open` creates a single Vortex scan for partition -zero. A per-query scheduler can therefore be resolved immediately before calling -`DataSourceRef::scan`. If DataFusion later produces multiple Vortex scan nodes for one query and +For DataFusion, `DataSource::open` creates a single Vortex scan for partition zero. A per-query +scheduler can therefore be resolved immediately before calling +`DataSourceRef::scan`. 
If DataFusion later produces multiple Vortex scan plans for one query and those scans should share a per-query scheduler, the integration should propagate a scheduler through DataFusion's `TaskContext` or another query-scoped extension and use that as the provider result. @@ -210,7 +209,6 @@ Benchmark environment variables can map onto these APIs, but they should not be surface: ```text -VORTEX_SCAN_IMPL=v2 VORTEX_SCAN_SCHEDULER=unbounded|shared|per-query VORTEX_SCAN_MAX_MORSEL_SLOTS=... ``` @@ -239,9 +237,9 @@ keeps the scheduler explicit and testable. Scan work should acquire scheduler permits before consuming bounded resources. -The first implementation should not require every `ReadPlan`, `EvidencePlan`, or `AggregatePlan` +The first implementation should not require every `PreparedRead`, `PreparedEvidence`, or `PreparedAggregate` to expose pending I/O, decoded-size estimates, or cost statistics. Those estimates are useful, but -they are also hard to get right and would make the initial ScanNode API more rigid. The V2 runtime +they are also hard to get right and would make the initial ScanPlan API more rigid. The scan runtime already knows the coarse unit of scheduling: the morsel. The MVP scheduler should admit morsels and let each admitted morsel run its evidence/read/aggregate pipeline internally. @@ -274,14 +272,14 @@ Richer byte/task fields can be added once the runtime has instrumentation showin limits matter in practice: ```rust -pub struct PlanCostHint { +pub struct PreparedCostHint { pub estimated_io_bytes: Option, pub estimated_decoded_bytes: Option, pub estimated_cpu_units: Option, } ``` -If those hints are added, they should remain advisory. A plan that does not provide hints should +If those hints are added, they should remain advisory. A prepared handle that does not provide hints should still be schedulable with default morsel accounting. `WorkPermit` is RAII. Dropping it releases every reserved resource. 
This is required for early @@ -315,11 +313,8 @@ This will let the scan reserve from estimates first, then correct accounting aft ## Explicit Segment Request Model -The legacy layout reader gets useful coalescing from a side effect: creating a `SegmentFuture` -registers the underlying read with `FileSegmentSource`, even if the future has not been polled yet. -That makes future reads visible to the I/O stream, but it hides I/O shape from the scan scheduler. - -The ScanNode path should make this boundary explicit. Layouts still refer to logical segments by +The ScanPlan path makes segment requests explicit enough for scheduling while keeping physical I/O +inside the segment source. Layouts still refer to logical segments by `SegmentId`, and the scheduler should stay at that same abstraction level. It should know which registered source owns the segment and roughly how many bytes the segment costs, but it should not need the segment's physical byte location: @@ -344,7 +339,7 @@ pub enum ScanIoPhase { ``` `SegmentId` is not a physical I/O address. It is a layout-local reference. A `VortexFile` binds -that reference when it instantiates a ScanNode tree: +that reference when it instantiates a ScanPlan tree: ```text footer segment map + opened byte source @@ -354,7 +349,7 @@ SegmentId -> SegmentInfo { bytes, cacheability, source-local metadata } ``` For normal Vortex files, the source is the `VortexReadAt` returned by `VortexOpenOptions` or -`FileSystem::open_read`. For a custom ScanNode, the source might be an HTTP range reader, an +`FileSystem::open_read`. For a custom ScanPlan, the source might be an HTTP range reader, an in-memory reader, or another backend that can provide segment payloads. The first implementation should only support segment requests. A future non-segment I/O hook can be @@ -401,12 +396,12 @@ deduplication. 
The minimum guarantee is scan-local identity: all requests with t `SegmentSourceId` target the same registered source and may be deduped or batched together. For a prepared `VortexFile`, source registration happens during file preparation, before layout -plans produce runtime segment requests. Layout nodes should not know how a file was opened. A flat -layout can continue to store `segment_id`; the prepared file state translates that ID to a +plans produce runtime segment requests. Layout-specific plans should not know how a file was +opened. A flat layout can continue to store `segment_id`; the prepared file state translates that ID to a `SegmentRequest` using the bound segment table. -Custom ScanNodes that own independent I/O register their own sources during preparation or state -initialization. For example, an HTTP-backed node can register a source that maps `SegmentId`s to +Custom ScanPlans that own independent I/O register their own sources during preparation or state +initialization. For example, an HTTP-backed plan can register a source that maps `SegmentId`s to HTTP range requests internally and produce `SegmentRequest`s against the returned `SegmentSourceId`. @@ -433,7 +428,7 @@ impl ScheduleCtx<'_> { `request_segment` is synchronous. It dedupes by `(SegmentSourceId, SegmentId)`, submits to the registered source when needed, and returns a shared future for the logical segment payload. Adjacent -morsels and different plans that touch the same segment receive clones of the same shared future +morsels and different prepared handles that touch the same segment receive clones of the same shared future while it remains in flight. The scheduler should therefore: @@ -447,8 +442,8 @@ The scheduler should therefore: - submit ordered batches or windows of segment requests to each source. For the current intermediate implementation, scans without a pushed-down limit should default to an -unbounded planning window and a bounded launch window. 
This deliberately mirrors the useful V1 -behavior: constructing the planned morsel registers segment futures for the whole scan, while only +unbounded planning window and a bounded launch window. Constructing the planned morsel registers +segment futures for the scan window, while only the launch window controls how many morsels are actively polled and decoded. Ordered scans use the same planning and launch machinery, but projection completions are buffered behind an ordered emission frontier. Scans with a pushed-down limit should continue using a `1/1` plan/launch window @@ -541,16 +536,16 @@ but it does not need to own eviction to schedule scans correctly. ## Scheduled Morsel Futures -`ScanNode` and `Plan` serve different purposes: +`ScanPlan` and prepared handles serve different purposes: -- `ScanNode` is the expanded layout tree with capabilities. It answers whether a layout can push an +- `ScanPlan` is the expanded layout tree with capabilities. It answers whether a layout can push an expression, produce evidence, read values, split work, or answer statistics. -- `Plan` is a reusable compiled route through that tree for one purpose, such as reading a - projection expression or producing one predicate's evidence. It should not own frontier state and - should not have an `execute_next(len)` API. +- A prepared handle is a reusable compiled route through that tree for one purpose, such as reading + a projection expression or producing one predicate's evidence. It should not own frontier state + and should not have an `execute_next(len)` API. -The drive/cursor owns frontier state and chooses explicit morsel ranges. Plans execute explicit -work: +The drive/cursor owns frontier state and chooses explicit morsel ranges. 
Prepared handles execute +explicit work: ```rust pub struct MorselScope<'a> { @@ -565,7 +560,7 @@ pub struct ScheduledRead<'a> { pub future: BoxFuture<'a, VortexResult>, } -pub trait ReadPlan { +pub trait ScheduledPreparedRead { fn schedule_morsel<'a>( &'a self, scope: MorselScope<'a>, @@ -609,14 +604,15 @@ sharpens later work. Predicate reads should be ordered by expected selectivity p reads should stay near the accepted-row frontier so the scan does not retain an entire filtered stream before emitting output. -## End-State Plan Introspection +## End-State Prepared-Handle Introspection The stricter end state can add explicit request introspection on top of scheduled morsel futures. -In that model, plans describe the segments they would need before execution, the scheduler submits -those requests, and execution receives a resolver backed by the submitted request set: +In that model, prepared handles describe the segments they would need before execution, the +scheduler submits those requests, and execution receives a resolver backed by the submitted request +set: ```rust -pub trait ReadPlan { +pub trait PreparedRead { fn segment_requests( &self, range: Range, @@ -628,7 +624,7 @@ pub trait ReadPlan { } } -pub trait EvidencePlan { +pub trait PreparedEvidence { fn segment_requests( &self, req: &EvidenceRequest<'_>, @@ -640,17 +636,17 @@ pub trait EvidencePlan { } ``` -Leaf plans can provide exact requests. A flat leaf reports the segment bound to its `segment_id`. +Leaf prepared handles can provide exact requests. A flat leaf reports the segment bound to its `segment_id`. Zoned evidence reports the shared stats-table setup read separately from cheap per-morsel probes. -Struct and apply plans compose child requests. Chunked plans use `selection` and `demand` to include +Struct and apply prepared reads compose child requests. 
Chunked prepared reads use `selection` and `demand` to include only the chunks that actually require data, preserving the current selected-but-undemanded behavior where default filler can be produced without expanding or reading a child. -In strict mode, plans that return `unknown` cannot use the strict resolver without falling back to +In strict mode, prepared handles that return `unknown` cannot use the strict resolver without falling back to an explicit late request path. That fallback should be observable in metrics and should eventually disappear from core layouts. This is why the scheduled-morsel-future model is the better -intermediate step: it makes I/O registration authoritative without requiring every plan to expose a -perfect request set on day one. +intermediate step: it makes I/O registration authoritative without requiring every prepared handle +to expose a perfect request set on day one. ## Morsel Pipeline @@ -697,7 +693,7 @@ layout-node behavior. The scheduler-aware runtime should own the per-file morsel frontier. Each prepared file tracks the set of morsels that may still read state. When a morsel is emitted or pruned, the runtime advances -the contiguous completed frontier and calls release hooks on read plans and scan nodes. +the contiguous completed frontier and calls release hooks on prepared reads and scan plans. This is required for lookahead. Without a frontier, running evidence and predicate work far ahead can leave decoded chunks, flat arrays, zone maps, and masks retained longer than intended. The release @@ -722,7 +718,7 @@ The first implementation should control active execution: This intentionally approximates the current scan behavior: scans without a pushed-down limit can run several morsels concurrently. Ordered scans keep the same work window, but emit projection results through an ordered frontier. Scans with a pushed-down limit should run with a narrower launch -window. 
The default launch window should mirror the existing `ScanBuilder` concurrency factor: +window. The default launch window should be proportional to available scan parallelism: ```text no limit: max_morsels_in_flight = 4 * available_parallelism @@ -731,8 +727,7 @@ limit: max_morsels_in_flight = 1 The shared scheduler can apply the same window globally, per scan, or both. For example, a DataFusion user can choose one shared scheduler with `4 * available_parallelism` total morsel slots -to cap the whole process, or create a new scheduler per query to preserve the old per-query -behavior. +to cap the whole process, or create a new scheduler per query to isolate resource accounting. Later implementations can add: @@ -763,7 +758,7 @@ query semantics. Weighted fair scheduling can be added later if the per-scan win ## Morsel Runtime -The V2 scan should move toward an explicit per-scan runtime. The MVP can still execute one whole +The scan should move toward an explicit per-scan runtime. The MVP can still execute one whole morsel after acquiring one coarse scheduler permit, but the runtime boundary should be chosen so it can later split a morsel into evidence, predicate, projection, emit, and release work without changing the public `DataSource` API. @@ -777,8 +772,9 @@ pub struct MorselScanRuntime { } ``` -`ScanRuntimePlan` is internal to the V2 implementation. It contains the files, expanded ScanNode -trees, pushed expressions, evidence plans, read plans, aggregate plans, and reusable per-file state. +`ScanRuntimePlan` is internal to the scan implementation. It contains the files, expanded ScanPlan +trees, pushed expressions, prepared evidence handles, prepared reads, prepared aggregate handles, +and reusable per-file state. It is not a replacement public scan API. 
MVP execution loop: @@ -808,7 +804,7 @@ while output is still required: advance the per-file frontier and release state behind it ``` -The scheduler should not know that the work is "zoned evidence" or "dict read plan". It should see +The scheduler should not know that the work is "zoned evidence" or "dict prepared read". It should see resource classes, source IDs, segment requests, slot counts, cancellation state, and priorities. The per-scan runtime maps layout-specific plan behavior into those generic scheduler inputs. @@ -853,13 +849,13 @@ them through tracing or debug logs first. Include `ScanScheduler`, `ScanSchedulerConfig`, `ScanSchedulerProvider`, `ScanSchedulerSession`, `ScanTicket`, `WorkRequest`, and `WorkPermit`. -2. Wire the V2 scan to register one ticket per `DataSource::scan` call. - Store the ticket and scheduler in the V2 `DataSourceScan` so all partitions from the same scan +2. Wire the scan to register one ticket per `DataSource::scan` call. + Store the ticket and scheduler in the `DataSourceScan` so all partitions from the same scan share one resource view. -3. Add permits around V2 morsel execution. - Start with one scheduler slot per in-flight morsel. Do not require `ReadPlan`, `EvidencePlan`, - or `AggregatePlan` to expose cost estimates in the MVP. Keep byte accounting and output batch +3. Add permits around morsel execution. + Start with one scheduler slot per in-flight morsel. Do not require `PreparedRead`, + `PreparedEvidence`, or `PreparedAggregate` to expose cost estimates in the MVP. Keep byte accounting and output batch memory accounting out of the MVP. 4. Add DataFusion builder controls. @@ -885,8 +881,8 @@ them through tracing or debug logs first. Key it by `(SegmentSourceId, SegmentId)`. `ScheduleCtx::request_segment` should synchronously submit or reuse the logical segment request and return a shared future for the segment payload. -10. Convert plan execution to scheduled morsel future construction. 
- `ReadPlan` and `EvidencePlan` should expose synchronous future constructors for explicit +10. Convert prepared-handle execution to scheduled morsel future construction. + `PreparedRead` and `PreparedEvidence` should expose synchronous future constructors for explicit morsel ranges. Constructing the future registers all segment futures it will await. The drive can then construct work ahead until byte/frontier/memory thresholds are full and decide which futures to poll. @@ -907,7 +903,7 @@ them through tracing or debug logs first. release. Use observed selectivity and I/O cost to reprioritize predicate work within each scan. 14. Drive the morsel frontier. - Track completed/pruned morsels per file and call read-plan/scan-node release hooks as the + Track completed/pruned morsels per file and call prepared-read/scan-plan release hooks as the contiguous frontier advances. 15. Add strict end-state segment resolution. @@ -918,12 +914,12 @@ them through tracing or debug logs first. ## Open Questions -- Should the default scheduler remain unbounded permanently, or should V2 eventually use bounded +- Should the default scheduler remain unbounded permanently, or should ScanPlan scans eventually use bounded defaults? -- How should DataFusion propagate one per-query scheduler across several Vortex scan nodes in the +- How should DataFusion propagate one per-query scheduler across several Vortex scan plans in the same physical plan? - Should scheduler config be part of the public stable scan API or remain integration-specific until - the V2 scan is more mature? + the ScanPlan scan is more mature? - How should output batch memory be accounted once ownership moves into DataFusion or DuckDB? - Should segment cache memory share the scheduler's decoded/intermediate budget, or have a separate cache budget coordinated by the same scheduler? 
diff --git a/docs/developer-guide/internals/serialization.md b/docs/developer-guide/internals/serialization.md index 11cfe720c47..7f5628d3be1 100644 --- a/docs/developer-guide/internals/serialization.md +++ b/docs/developer-guide/internals/serialization.md @@ -88,8 +88,9 @@ The postscript locates four regions by offset and length: configs. The layout FlatBuffer is a tree of `Layout` nodes, each containing an encoding ID, row count, -metadata, child layouts, and segment indices. This tree is deserialized and bound to a segment -source to create a `LayoutReader` that can lazily fetch data on demand. +metadata, child layouts, and segment indices. This tree is deserialized into lazy `LayoutRef` +nodes. During a scan, layout vtables expand those nodes into a `ScanPlan` tree that requests +segments from the bound segment source on demand. ## FlatBuffers diff --git a/docs/developer-guide/internals/session.md b/docs/developer-guide/internals/session.md index 5aff27845af..489a1c10cba 100644 --- a/docs/developer-guide/internals/session.md +++ b/docs/developer-guide/internals/session.md @@ -30,7 +30,7 @@ Each Vortex crate defines a session variable that holds a registry for its exten | `DTypeSession` | `vortex-array` | Extension dtype vtables (Date, Time, ...) | | `ArraySession` | `vortex-array` | Array encoding vtables (ALP, FSST, ...) | | `ScalarFnSession` | `vortex-array` | Scalar function vtables | -| `LayoutSession` | `vortex-layout` | Layout encoding vtables (Flat, Chunked, ...) | +| `LayoutSession` | `vortex-layout` | Layout vtable plugins (Flat, Chunked, ...) 
| | `RuntimeSession` | `vortex-io` | Async runtime handle | | `CudaSession` | `vortex-cuda` | CUDA context, kernels, and stream pool | @@ -47,8 +47,8 @@ Plugins register with the session by accessing the relevant component and callin // Register a custom array encoding session.arrays().register(MyEncoding); -// Register a custom layout -session.layouts().register(MyLayout::encoding()); +// Register a custom layout vtable +session.layouts().register_v2(MyLayout); // Register a custom scalar function session.scalar_fns().register(MyScalarFnVTable); @@ -61,7 +61,7 @@ to register all built-in encodings. ## Explicit Passing Sessions are passed explicitly through constructors and method arguments. This means every API -that needs access to registries -- file readers, writers, scan builders, layout readers -- receives +that needs access to registries -- file readers, writers, scan sources, layout vtables -- receives the session directly rather than reaching for global state. ```rust @@ -75,8 +75,11 @@ session.write_options() .write(&mut file, array_stream) .await?; -// Scanning a layout -ScanBuilder::new(session.clone(), layout_reader) +// Scanning a file +let stream = session.open_options() + .open_path("data.vortex") + .await? + .scan()? .with_filter(expr) .into_array_stream()?; ``` diff --git a/docs/developer-guide/internals/vtables.md b/docs/developer-guide/internals/vtables.md index 77283ea012d..0e73bca2fda 100644 --- a/docs/developer-guide/internals/vtables.md +++ b/docs/developer-guide/internals/vtables.md @@ -206,11 +206,21 @@ Currently uses `VTable` (unqualified), `VTableAdapter`, `DynExprVTable` (sealed and `ExprVTable` (confusingly, the erased ref). Needs renaming to `ExprVTable`, `DynExpr`, `ExprRef`. Introduce `Expr` data struct, remove `VTableAdapter`. -### Layout -- Not started +### Layout -- Implemented for serialized scan layouts -Currently uses `VTable` (unqualified), `LayoutAdapter`, and `Layout` (sealed trait doubling -as public API). 
Needs renaming to `LayoutVTable`, `DynLayout`, `LayoutRef`. Introduce -`Layout` data struct, remove `LayoutAdapter`. +The scan layout path follows this pattern in `vortex_layout::layout_v2`: + +- `layout_v2::VTable` is the layout vtable implemented by layout plugins. +- `Layout` is the typed layout handle with common fields hoisted: dtype, row count, segment IDs, + and lazy child access. +- `V::LayoutData` stores only layout-specific metadata. +- `LayoutRef` is the public type-erased layout handle. +- `DynLayout` is private erased dispatch plumbing. +- `LayoutVTablePlugin` is the registry object used for ID-based footer deserialization. + +The layout vtable also owns scan expansion through `new_scan_plan`. This keeps serialized layout +metadata and runtime scan behavior registered at the same plugin point: deserializing a layout +produces `Layout`, and scanning it expands that typed layout into a `ScanPlan`. ### Array -- Not started diff --git a/docs/specs/file-format.md b/docs/specs/file-format.md index 425294f2d99..8704a510e40 100644 --- a/docs/specs/file-format.md +++ b/docs/specs/file-format.md @@ -75,8 +75,9 @@ valid to store a `Float64` array, a `Boolean` array, or any other root data type ## Footer The footer is a flat buffer serialized `Footer` object. This object contains all the information required to -load the root `Layout` object into a usable `LayoutReader`). +deserialize the root `Layout` object into a `LayoutRef` and bind its segment IDs to file byte ranges. For example, it contains the locations, compression schemes, encryption schemes, and required alignment of all segments in the file. +The scan runtime expands that root layout into a `ScanPlan` tree when a query is executed. 
:::{literalinclude} ../../vortex-flatbuffers/flatbuffers/vortex-file/footer.fbs :start-after: [footer] diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index ec79cad35cd..ad333b6ba6d 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -471,7 +471,7 @@ impl FileFormat for VortexFormat { file_metadata_cache.put(&object.location, entry); } - return infer_scan_node_stats(&table_schema, &vxf).await; + return infer_scan_plan_stats(&table_schema, &vxf).await; } // Try to get entry metadata first @@ -675,7 +675,7 @@ impl FileFormat for VortexFormat { } } -async fn infer_scan_node_stats(table_schema: &SchemaRef, vxf: &VortexFile) -> DFResult { +async fn infer_scan_plan_stats(table_schema: &SchemaRef, vxf: &VortexFile) -> DFResult { let struct_dtype = vxf .dtype() .as_struct_fields_opt() @@ -696,7 +696,7 @@ async fn infer_scan_node_stats(table_schema: &SchemaRef, vxf: &VortexFile) -> DF } let stats = vxf - .scan_node_statistics_many(&requested_exprs, &funcs) + .scan_plan_statistics_many(&requested_exprs, &funcs) .await .map_err(|e| DataFusionError::Execution(format!("Failed to infer scan2 stats: {e}")))?; for (column_idx, stats) in requested_columns.into_iter().zip(stats) { diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 9b8b418942b..ebb52b18dd9 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -373,7 +373,7 @@ impl FileOpener for VortexOpener { ), None => scan_projection.clone(), }; - let morsels = scan_node_morsel_ranges_for_file( + let morsels = scan_plan_morsel_ranges_for_file( natural_split_ranges.as_ref(), &file.object_meta.location, &vxf, @@ -403,7 +403,7 @@ impl FileOpener for VortexOpener { Field::new_struct("", stream_schema.fields().clone(), false); let file_location = file.object_meta.location.clone(); let array_stream = vxf - 
.scan_node_stream(ScanRequest { + .scan_plan_stream(ScanRequest { projection: scan_projection, filter, row_range, @@ -668,12 +668,12 @@ fn compute_natural_split_ranges(layout_reader: &dyn LayoutReader) -> DFResult]>>, path: &Path, file: &VortexFile, diff --git a/vortex-datafusion/src/v2/source.rs b/vortex-datafusion/src/v2/source.rs index 3b190e6ff84..bcb609fb58f 100644 --- a/vortex-datafusion/src/v2/source.rs +++ b/vortex-datafusion/src/v2/source.rs @@ -367,7 +367,7 @@ impl VortexDataSource { /// current projection, pushed filters, ordering hints, and row limit. /// /// For unordered scans without a limit, this integration reports DataFusion's -/// requested partition count when the wrapped source supports ScanNode morsel +/// requested partition count when the wrapped source supports ScanPlan morsel /// partitioning. The async morsel plan is still built lazily in [`DataSource::open`], /// so partitions beyond the discovered morsel count produce empty streams. /// Ordered and limited scans use one output partition so the source can preserve diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index 3d534551391..e728a775231 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -215,37 +215,37 @@ impl VortexFile { )) } - /// Execute a ScanNode-backed V2 scan for this file. - pub fn scan_node_stream(&self, request: ScanRequest) -> VortexResult { - scan_v2::scan_node_file_stream(self.clone(), request) + /// Execute a ScanPlan-backed scan for this file. + pub fn scan_plan_stream(&self, request: ScanRequest) -> VortexResult { + scan_v2::scan_plan_file_stream(self.clone(), request) } - /// Return ScanNode-backed aggregate-function statistics for this file. - pub async fn scan_node_statistics( + /// Return ScanPlan-backed aggregate-function statistics for this file. 
+ pub async fn scan_plan_statistics( &self, expr: &Expression, funcs: &[AggregateFnRef], ) -> VortexResult>> { - scan_v2::scan_node_file_statistics(self.clone(), expr, funcs).await + scan_v2::scan_plan_file_statistics(self.clone(), expr, funcs).await } - /// Return ScanNode-backed aggregate-function statistics for several expressions in this file. - pub async fn scan_node_statistics_many( + /// Return ScanPlan-backed aggregate-function statistics for several expressions in this file. + pub async fn scan_plan_statistics_many( &self, exprs: &[Expression], funcs: &[AggregateFnRef], ) -> VortexResult>>> { - scan_v2::scan_node_file_statistics_many(self.clone(), exprs, funcs).await + scan_v2::scan_plan_file_statistics_many(self.clone(), exprs, funcs).await } - /// Return ScanNode natural row split ranges for this file. - pub fn scan_node_splits(&self) -> VortexResult>> { - scan_v2::scan_node_file_splits(self) + /// Return ScanPlan natural row split ranges for this file. + pub fn scan_plan_splits(&self) -> VortexResult>> { + scan_v2::scan_plan_file_splits(self) } - /// Plan ScanNode natural row split ranges for a projected scan of this file. + /// Plan ScanPlan natural row split ranges for a projected scan of this file. pub async fn plan_splits(&self, projection: &Expression) -> VortexResult>> { - scan_v2::scan_node_file_plan_splits(self.clone(), projection).await + scan_v2::scan_plan_file_plan_splits(self.clone(), projection).await } /// Returns `true` if file-level statistics prove the expression cannot diff --git a/vortex-file/src/multi/mod.rs b/vortex-file/src/multi/mod.rs index b451baac9ed..0813ade16ed 100644 --- a/vortex-file/src/multi/mod.rs +++ b/vortex-file/src/multi/mod.rs @@ -206,10 +206,10 @@ impl MultiFileDataSource { /// Build the [`DataSource`] selected by `VORTEX_SCAN_IMPL`. /// /// The default is the existing LayoutReader-backed scan. Setting - /// `VORTEX_SCAN_IMPL=v2` (or `scan2`/`scan3`/`native`) builds the ScanNode-backed V2 scan. 
+ /// `VORTEX_SCAN_IMPL=v2` (or `scan2`/`scan3`/`native`) builds the ScanPlan-backed V2 scan. pub async fn build_data_source(self) -> VortexResult { if scan2_enabled()? { - Ok(Arc::new(scan_v2::build_scan_node_data_source(self).await?)) + Ok(Arc::new(scan_v2::build_scan_plan_data_source(self).await?)) } else { Ok(Arc::new(self.build().await?)) } diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index f005aa0c7e4..ed3d3b08e72 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! ScanNode-backed multi-file data source. +//! ScanPlan-backed multi-file data source. use std::any::Any; use std::collections::BTreeMap; @@ -46,37 +46,7 @@ use vortex_io::filesystem::FileListing; use vortex_io::filesystem::FileSystemRef; use vortex_io::runtime::Handle; use vortex_io::session::RuntimeSessionExt; -use vortex_layout::scan::v2::evidence::EvidenceFragment; -use vortex_layout::scan::v2::evidence::PredicateEvidence; -use vortex_layout::scan::v2::evidence::PredicateEvidenceKind; -use vortex_layout::scan::v2::evidence::PredicateId; -use vortex_layout::scan::v2::evidence::PredicateVersion; -use vortex_layout::scan::v2::node::ExpandCtx; -use vortex_layout::scan::v2::node::FileReader; -use vortex_layout::scan::v2::node::OwnedRowScope; -use vortex_layout::scan::v2::node::PrepareCtx; -use vortex_layout::scan::v2::node::PreparedAggregateRef; -use vortex_layout::scan::v2::node::PreparedEvidenceRef; -use vortex_layout::scan::v2::node::PreparedReadRef; -use vortex_layout::scan::v2::node::PreparedStats; -use vortex_layout::scan::v2::node::PreparedStatsRef; -use vortex_layout::scan::v2::node::PushCtx; -use vortex_layout::scan::v2::node::ScanNode; -use vortex_layout::scan::v2::node::ScanNodeRef; -use vortex_layout::scan::v2::node::ScanStateRef; -use vortex_layout::scan::v2::node::StateCtx; -use 
vortex_layout::scan::v2::request::EvidenceMode; -use vortex_layout::scan::v2::request::NodeRequest; -use vortex_layout::scan::v2::request::OwnedEvidenceRequest; use vortex_layout::scan::v2::validate_temporal_comparisons; -use vortex_layout::segments::ScanIoPhase; -use vortex_layout::segments::ScheduledSegmentSource; -use vortex_layout::segments::ScheduledSegmentSourceReader; -use vortex_layout::segments::SegmentFutureCache; -use vortex_layout::segments::SegmentPlanCtx; -use vortex_layout::segments::SegmentRequests; -use vortex_layout::segments::SubmittedSegmentRequests; -use vortex_layout::segments::submit_segment_requests_cached; use vortex_mask::Mask; use vortex_metrics::MetricsRegistry; use vortex_scan::DataSource; @@ -88,13 +58,42 @@ use vortex_scan::PartitionStream; use vortex_scan::PlannedMorselScan; use vortex_scan::PlannedMorselScanRef; use vortex_scan::ScanMeta; -use vortex_scan::ScanRequest; +use vortex_scan::ScanRequest as DataSourceScanRequest; use vortex_scan::ScanScheduler; use vortex_scan::ScanSchedulerSessionExt; use vortex_scan::ScanTicket; use vortex_scan::SegmentSourceId; use vortex_scan::SegmentSourceMeta; use vortex_scan::WorkRequest; +use vortex_scan::plan::FileReader; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedAggregateRef; +use vortex_scan::plan::PreparedEvidenceRef; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStats; +use vortex_scan::plan::PreparedStatsRef; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::evidence::EvidenceFragment; +use vortex_scan::plan::evidence::PredicateEvidence; +use vortex_scan::plan::evidence::PredicateEvidenceKind; +use vortex_scan::plan::evidence::PredicateId; +use vortex_scan::plan::evidence::PredicateVersion; +use vortex_scan::plan::request::EvidenceMode; +use 
vortex_scan::plan::request::OwnedEvidenceRequest; +use vortex_scan::plan::request::ScanRequest; +use vortex_scan::segments::ScanIoPhase; +use vortex_scan::segments::ScheduledSegmentSource; +use vortex_scan::segments::ScheduledSegmentSourceReader; +use vortex_scan::segments::SegmentFutureCache; +use vortex_scan::segments::SegmentPlanCtx; +use vortex_scan::segments::SegmentRequests; +use vortex_scan::segments::SubmittedSegmentRequests; +use vortex_scan::segments::submit_segment_requests_cached; use vortex_scan::selection::Selection; use vortex_session::VortexSession; use vortex_utils::parallelism::get_available_parallelism; @@ -114,15 +113,15 @@ const DEFAULT_EVIDENCE_MORSEL_WINDOW: usize = 8; /// (filter-first) rather than the whole morsel. Mirrors the V1 flat-reader threshold. const EXPR_EVAL_THRESHOLD: f64 = 0.2; -struct FileStatsScanNode { - data: ScanNodeRef, +struct FileStatsScanPlan { + data: ScanPlanRef, stats: Arc, fields: StructFields, row_count: u64, } -struct FileStatsExprScanNode { - data: ScanNodeRef, +struct FileStatsExprScanPlan { + data: ScanPlanRef, stats: Arc, field_idx: usize, field_dtype: DType, @@ -136,9 +135,9 @@ struct FilePreparedStats { funcs: Vec, } -impl FileStatsScanNode { +impl FileStatsScanPlan { fn try_new( - data: ScanNodeRef, + data: ScanPlanRef, stats: Arc, dtype: &DType, row_count: u64, @@ -160,25 +159,25 @@ impl FileStatsScanNode { } } -impl ScanNode for FileStatsScanNode { +impl ScanPlan for FileStatsScanPlan { type State = ScanStateRef; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - cx.init_node(&self.data) + cx.init_plan(&self.data) } fn try_push_expr( self: Arc, expr: &Expression, cx: &mut PushCtx, - ) -> VortexResult> { + ) -> VortexResult> { let Some(data) = Arc::clone(&self.data).try_push_expr(expr, cx)? 
else { return Ok(None); }; let Some((field_idx, _name, field_dtype)) = self.pushed_field(expr) else { return Ok(Some(data)); }; - Ok(Some(Arc::new(FileStatsExprScanNode { + Ok(Some(Arc::new(FileStatsExprScanPlan { data, stats: Arc::clone(&self.stats), field_idx, @@ -220,18 +219,18 @@ impl ScanNode for FileStatsScanNode { } } -impl ScanNode for FileStatsExprScanNode { +impl ScanPlan for FileStatsExprScanPlan { type State = ScanStateRef; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - cx.init_node(&self.data) + cx.init_plan(&self.data) } fn try_push_expr( self: Arc, expr: &Expression, cx: &mut PushCtx, - ) -> VortexResult> { + ) -> VortexResult> { Arc::clone(&self.data).try_push_expr(expr, cx) } @@ -380,9 +379,9 @@ fn scalar_precision_to_value(precision: Precision) -> Precision VortexResult { +) -> VortexResult { if builder.glob_sources.is_empty() { vortex_bail!("MultiFileDataSource requires at least one glob pattern"); } @@ -429,7 +428,7 @@ pub(super) async fn build_scan_node_data_source( let factories: Vec> = all_files[1..] 
.iter() .map(|(file, fs)| { - Arc::new(ScanNodeFileFactory { + Arc::new(ScanPlanFileFactory { fs: Arc::clone(fs), file: file.clone(), session: builder.session.clone(), @@ -439,7 +438,7 @@ pub(super) async fn build_scan_node_data_source( }) .collect(); - Ok(ScanNodeDataSource::new_with_first( + Ok(ScanPlanDataSource::new_with_first( first_file, factories, &builder.session, @@ -451,7 +450,7 @@ trait VortexFileFactory: 'static + Send + Sync { async fn open(&self) -> VortexResult>; } -struct ScanNodeFileFactory { +struct ScanPlanFileFactory { fs: FileSystemRef, file: FileListing, session: VortexSession, @@ -460,7 +459,7 @@ struct ScanNodeFileFactory { } #[async_trait] -impl VortexFileFactory for ScanNodeFileFactory { +impl VortexFileFactory for ScanPlanFileFactory { async fn open(&self) -> VortexResult> { let file = open_file( &self.fs, @@ -474,20 +473,20 @@ impl VortexFileFactory for ScanNodeFileFactory { } } -enum ScanNodeChild { +enum ScanPlanChild { Opened(VortexFile), Deferred(Arc), } -/// Multi-file data source backed by scan2 ScanNode plans. -pub struct ScanNodeDataSource { +/// Multi-file data source backed by scan2 ScanPlan plans. 
+pub struct ScanPlanDataSource { dtype: DType, session: VortexSession, - children: Vec, + children: Vec, concurrency: usize, } -impl ScanNodeDataSource { +impl ScanPlanDataSource { fn new_with_first( first: VortexFile, remaining: Vec>, @@ -497,8 +496,8 @@ impl ScanNodeDataSource { let concurrency = get_available_parallelism().unwrap_or(DEFAULT_CONCURRENCY); let mut children = Vec::with_capacity(1 + remaining.len()); - children.push(ScanNodeChild::Opened(first)); - children.extend(remaining.into_iter().map(ScanNodeChild::Deferred)); + children.push(ScanPlanChild::Opened(first)); + children.extend(remaining.into_iter().map(ScanPlanChild::Deferred)); Self { dtype, @@ -514,11 +513,11 @@ impl ScanNodeDataSource { .iter() .enumerate() .map(|(idx, child)| match child { - ScanNodeChild::Opened(file) => { + ScanPlanChild::Opened(file) => { let file = file.clone(); async move { Ok(Some((idx, file))) }.boxed() } - ScanNodeChild::Deferred(factory) => { + ScanPlanChild::Deferred(factory) => { let factory = Arc::clone(factory); async move { factory @@ -553,7 +552,7 @@ impl ScanNodeDataSource { } #[async_trait] -impl DataSource for ScanNodeDataSource { +impl DataSource for ScanPlanDataSource { fn dtype(&self) -> &DType { &self.dtype } @@ -565,11 +564,11 @@ impl DataSource for ScanNodeDataSource { for child in &self.children { match child { - ScanNodeChild::Opened(file) => { + ScanPlanChild::Opened(file) => { opened_count += 1; sum = sum.saturating_add(file.row_count()); } - ScanNodeChild::Deferred(_) => { + ScanPlanChild::Deferred(_) => { deferred_count += 1; } } @@ -595,12 +594,12 @@ impl DataSource for ScanNodeDataSource { _data: &[u8], _session: &VortexSession, ) -> VortexResult { - vortex_bail!("ScanNodeDataSource partitions are not yet serializable") + vortex_bail!("ScanPlanDataSource partitions are not yet serializable") } async fn plan_morsel_partitions( &self, - scan_request: ScanRequest, + scan_request: DataSourceScanRequest, target_partitions: usize, ) -> VortexResult> 
{ if scan_request.ordered || scan_request.limit.is_some() { @@ -628,7 +627,7 @@ impl DataSource for ScanNodeDataSource { else { continue; }; - let prepared = Arc::new(PreparedScanNodeFile::try_new(file, request, &ticket)?); + let prepared = Arc::new(PreparedScanPlanFile::try_new(file, request, &ticket)?); let ranges = prepared.splits()?; if ranges.is_empty() { continue; @@ -647,7 +646,7 @@ impl DataSource for ScanNodeDataSource { for (prepared, ranges) in planned_files { for range in ranges { let partition = morsel_idx % partition_count; - partitions[partition].push(PlannedScanNodeMorsel { + partitions[partition].push(PlannedScanPlanMorsel { prepared: Arc::clone(&prepared), range, }); @@ -659,7 +658,7 @@ impl DataSource for ScanNodeDataSource { let (morsel_plan_window, morsel_launch_window) = morsel_windows(&scheduler, false, has_runtime_evidence, default_window); - Ok(Some(Arc::new(PlannedScanNodeScan { + Ok(Some(Arc::new(PlannedScanPlanScan { dtype, partitions, scheduler, @@ -669,7 +668,7 @@ impl DataSource for ScanNodeDataSource { }))) } - async fn scan(&self, scan_request: ScanRequest) -> VortexResult { + async fn scan(&self, scan_request: DataSourceScanRequest) -> VortexResult { let meta = ScanMeta { label: Some("scan2".to_string()), }; @@ -685,14 +684,14 @@ impl DataSource for ScanNodeDataSource { for child in &self.children { match child { - ScanNodeChild::Opened(file) => ready.push_back(file.clone()), - ScanNodeChild::Deferred(factory) => deferred.push_back(Arc::clone(factory)), + ScanPlanChild::Opened(file) => ready.push_back(file.clone()), + ScanPlanChild::Deferred(factory) => deferred.push_back(Arc::clone(factory)), } } let dtype = scan_request.projection.return_dtype(&self.dtype)?; - Ok(Box::new(ScanNodeDataSourceScan { + Ok(Box::new(ScanPlanDataSourceScan { dtype, request: scan_request, ready, @@ -712,10 +711,10 @@ impl DataSource for ScanNodeDataSource { if self.children.len() != 1 { return Ok(absent_statistics(funcs)); } - let 
ScanNodeChild::Opened(file) = &self.children[0] else { + let ScanPlanChild::Opened(file) = &self.children[0] else { return Ok(absent_statistics(funcs)); }; - scan_node_file_statistics(file.clone(), expr, funcs).await + scan_plan_file_statistics(file.clone(), expr, funcs).await } async fn field_statistics(&self, field_path: &FieldPath) -> VortexResult { @@ -750,9 +749,9 @@ impl DataSource for ScanNodeDataSource { } } -struct ScanNodeDataSourceScan { +struct ScanPlanDataSourceScan { dtype: DType, - request: ScanRequest, + request: DataSourceScanRequest, ready: VecDeque, deferred: VecDeque>, handle: Handle, @@ -761,7 +760,7 @@ struct ScanNodeDataSourceScan { ticket: ScanTicket, } -impl DataSourceScan for ScanNodeDataSourceScan { +impl DataSourceScan for ScanPlanDataSourceScan { fn dtype(&self) -> &DType { &self.dtype } @@ -845,7 +844,7 @@ impl DataSourceScan for ScanNodeDataSourceScan { fn file_partition( partition_idx: usize, file: VortexFile, - request: ScanRequest, + request: DataSourceScanRequest, scheduler: Arc, ticket: ScanTicket, ) -> VortexResult> { @@ -853,7 +852,7 @@ fn file_partition( return Ok(None); }; - Ok(Some(Box::new(ScanNodePartition { + Ok(Some(Box::new(ScanPlanPartition { file, request, index: partition_idx, @@ -862,9 +861,9 @@ fn file_partition( }))) } -pub(crate) fn scan_node_file_stream( +pub(crate) fn scan_plan_file_stream( file: VortexFile, - request: ScanRequest, + request: DataSourceScanRequest, ) -> VortexResult { let dtype = request.projection.return_dtype(file.dtype())?; let meta = ScanMeta { @@ -886,16 +885,16 @@ pub(crate) fn scan_node_file_stream( partition.execute() } -pub(crate) async fn scan_node_file_statistics( +pub(crate) async fn scan_plan_file_statistics( file: VortexFile, expr: &Expression, funcs: &[AggregateFnRef], ) -> VortexResult>> { - let mut stats = scan_node_file_statistics_many(file, std::slice::from_ref(expr), funcs).await?; + let mut stats = scan_plan_file_statistics_many(file, std::slice::from_ref(expr), 
funcs).await?; Ok(stats.pop().unwrap_or_else(|| absent_statistics(funcs))) } -pub(crate) async fn scan_node_file_statistics_many( +pub(crate) async fn scan_plan_file_statistics_many( file: VortexFile, exprs: &[Expression], funcs: &[AggregateFnRef], @@ -921,13 +920,13 @@ pub(crate) async fn scan_node_file_statistics_many( Ok(result) } -pub(crate) fn scan_node_file_splits(file: &VortexFile) -> VortexResult>> { +pub(crate) fn scan_plan_file_splits(file: &VortexFile) -> VortexResult>> { let session = file.session().clone(); let root = expand_file_root(file, &session)?; split_ranges_from_node(&root, file.row_count()) } -pub(crate) async fn scan_node_file_plan_splits( +pub(crate) async fn scan_plan_file_plan_splits( file: VortexFile, projection: &Expression, ) -> VortexResult>> { @@ -943,7 +942,7 @@ pub(crate) async fn scan_node_file_plan_splits( .await } -fn split_ranges_from_node(node: &ScanNodeRef, row_count: u64) -> VortexResult>> { +fn split_ranges_from_node(node: &ScanPlanRef, row_count: u64) -> VortexResult>> { let mut points = vec![0, row_count]; if let Some(hints) = node.split_hints() { points.extend( @@ -964,21 +963,21 @@ fn split_ranges_from_node(node: &ScanNodeRef, row_count: u64) -> VortexResult VortexResult { - let mut node_request = NodeRequest::empty(); +fn expand_file_root(file: &VortexFile, session: &VortexSession) -> VortexResult { + let mut plan_request = ScanRequest::empty(); let layout = file .footer() .layout2() .ok_or_else(|| vortex_err!("scan2 requires a v2 footer layout"))?; - let root = ExpandCtx::new(session.clone()).expand(layout, &mut node_request)?; + let root = layout.new_scan_plan(&mut plan_request, session)?; Ok(match file.footer().statistics().cloned() { - Some(stats) => FileStatsScanNode::try_new( + Some(stats) => FileStatsScanPlan::try_new( Arc::clone(&root), Arc::new(stats), file.dtype(), file.row_count(), ) - .map(|node| Arc::new(node) as ScanNodeRef) + .map(|node| Arc::new(node) as ScanPlanRef) .unwrap_or(root), None => root, }) @@ 
-987,8 +986,8 @@ fn expand_file_root(file: &VortexFile, session: &VortexSession) -> VortexResult< fn file_scan_request( partition_idx: usize, file: &VortexFile, - request: ScanRequest, -) -> VortexResult> { + request: DataSourceScanRequest, +) -> VortexResult> { let partition_idx_u64 = partition_idx as u64; if let Some(range) = &request.partition_range && !range.contains(&partition_idx_u64) @@ -1019,7 +1018,7 @@ fn file_scan_request( return Ok(None); } - Ok(Some(ScanRequest { + Ok(Some(DataSourceScanRequest { row_range: Some(row_range), ..request })) @@ -1103,7 +1102,7 @@ struct PlannedMorselWork { } struct MorselState { - prepared: Arc, + prepared: Arc, range: Range, selected: Mask, evidence: Vec>, @@ -1112,7 +1111,7 @@ struct MorselState { } struct PartitionWorkSchedulerState { - pending: VecDeque, + pending: VecDeque, morsels: Vec>, active_morsels: usize, next_morsel_id: usize, @@ -1172,7 +1171,7 @@ fn morsel_windows( } fn partition_work_stream( - morsels: Vec, + morsels: Vec, scheduler: Arc, ticket: ScanTicket, ordered: bool, @@ -1516,15 +1515,15 @@ impl PartitionWorkSchedulerState { } } -struct ScanNodePartition { +struct ScanPlanPartition { file: VortexFile, - request: ScanRequest, + request: DataSourceScanRequest, index: usize, scheduler: Arc, ticket: ScanTicket, } -impl Partition for ScanNodePartition { +impl Partition for ScanPlanPartition { fn as_any(&self) -> &dyn Any { self } @@ -1556,7 +1555,7 @@ impl Partition for ScanNodePartition { } fn execute(self: Box) -> VortexResult { - let ScanNodePartition { + let ScanPlanPartition { file, request, index: _, @@ -1564,7 +1563,7 @@ impl Partition for ScanNodePartition { ticket, } = *self; - let prepared = Arc::new(PreparedScanNodeFile::try_new(file, request, &ticket)?); + let prepared = Arc::new(PreparedScanPlanFile::try_new(file, request, &ticket)?); let dtype = prepared.dtype.clone(); let ranges = prepared.splits()?; let ordered = prepared.ordered; @@ -1577,7 +1576,7 @@ impl Partition for ScanNodePartition { 
); let morsels = ranges .into_iter() - .map(|range| PlannedScanNodeMorsel { + .map(|range| PlannedScanPlanMorsel { prepared: Arc::clone(&prepared), range, }) @@ -1598,9 +1597,9 @@ impl Partition for ScanNodePartition { } } -struct PlannedScanNodeScan { +struct PlannedScanPlanScan { dtype: DType, - partitions: Vec>, + partitions: Vec>, scheduler: Arc, ticket: ScanTicket, morsel_plan_window: usize, @@ -1608,12 +1607,12 @@ struct PlannedScanNodeScan { } #[derive(Clone)] -struct PlannedScanNodeMorsel { - prepared: Arc, +struct PlannedScanPlanMorsel { + prepared: Arc, range: Range, } -impl PlannedMorselScan for PlannedScanNodeScan { +impl PlannedMorselScan for PlannedScanPlanScan { fn dtype(&self) -> &DType { &self.dtype } @@ -1630,19 +1629,19 @@ impl PlannedMorselScan for PlannedScanNodeScan { ); } - Ok(Box::new(PlannedScanNodePartition { + Ok(Box::new(PlannedScanPlanPartition { planned: self, index: partition, })) } } -struct PlannedScanNodePartition { - planned: Arc, +struct PlannedScanPlanPartition { + planned: Arc, index: usize, } -impl Partition for PlannedScanNodePartition { +impl Partition for PlannedScanPlanPartition { fn as_any(&self) -> &dyn Any { self } @@ -1673,7 +1672,7 @@ impl Partition for PlannedScanNodePartition { } fn execute(self: Box) -> VortexResult { - let PlannedScanNodePartition { planned, index } = *self; + let PlannedScanPlanPartition { planned, index } = *self; let morsels = planned.partitions[index].clone(); let dtype = planned.dtype.clone(); let scheduler = Arc::clone(&planned.scheduler); @@ -1693,7 +1692,7 @@ impl Partition for PlannedScanNodePartition { } } -struct PreparedScanNodeFile { +struct PreparedScanPlanFile { session: VortexSession, reader: FileReader, dtype: DType, @@ -1704,7 +1703,7 @@ struct PreparedScanNodeFile { segment_source_id: SegmentSourceId, scheduled_segment_source: Arc, segment_future_cache: Arc, - root: ScanNodeRef, + root: ScanPlanRef, projection: PreparedReadRef, predicates: Vec, } @@ -1720,8 +1719,12 @@ struct 
RegisteredScheduledSegmentSource { source: Arc, } -impl PreparedScanNodeFile { - fn try_new(file: VortexFile, request: ScanRequest, ticket: &ScanTicket) -> VortexResult { +impl PreparedScanPlanFile { + fn try_new( + file: VortexFile, + request: DataSourceScanRequest, + ticket: &ScanTicket, + ) -> VortexResult { let session = file.session().clone(); let dtype = request.projection.return_dtype(file.dtype())?; let projection = request.projection.optimize_recursive(file.dtype())?; @@ -2063,11 +2066,11 @@ impl PreparedScanNodeFile { } fn push_expr( - root: &ScanNodeRef, + root: &ScanPlanRef, expr: &Expression, dtype: &DType, session: &VortexSession, -) -> VortexResult { +) -> VortexResult { validate_temporal_comparisons(expr, dtype)?; Arc::clone(root) .try_push_expr(expr, &mut PushCtx::new(session.clone()))? @@ -2075,7 +2078,7 @@ fn push_expr( } fn prepare_read( - root: &ScanNodeRef, + root: &ScanPlanRef, expr: &Expression, dtype: &DType, session: &VortexSession, diff --git a/vortex-file/src/scan_v1_v2_differential.rs b/vortex-file/src/scan_v1_v2_differential.rs index a21fef9a18d..a8c89c6d4e1 100644 --- a/vortex-file/src/scan_v1_v2_differential.rs +++ b/vortex-file/src/scan_v1_v2_differential.rs @@ -2,12 +2,12 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors //! Differential tests that scan the same [`ScanRequest`] through both the V1 -//! (LayoutReader-based) and V2 (ScanNode-based) scan paths and assert the +//! (LayoutReader-based) and V2 (ScanPlan-based) scan paths and assert the //! outputs are identical. //! //! V1 is driven through [`VortexFile::scan`] + //! [`ScanBuilder::into_array_stream`]; V2 is driven directly through -//! [`VortexFile::scan_node_stream`]. Neither side flips the process-global +//! [`VortexFile::scan_plan_stream`]. Neither side flips the process-global //! `VORTEX_SCAN_IMPL` env var, so the two implementations run side by side in //! the same test process. 
@@ -51,7 +51,7 @@ use crate::WriteOptionsSessionExt; static SESSION: LazyLock = LazyLock::new(crate::tests::new_test_session); /// Write `array` to an in-memory Vortex file, optionally with file statistics -/// (which exercises the V2 `FileStatsScanNode` path and V1 `FileStatsLayoutReader`). +/// (which exercises the V2 `FileStatsScanPlan` path and V1 `FileStatsLayoutReader`). async fn write_file(array: ArrayRef, with_stats: bool) -> VortexResult { let mut buf = ByteBufferMut::empty(); if with_stats { @@ -82,9 +82,9 @@ async fn scan_v1(file: &VortexFile, request: &ScanRequest) -> VortexResult VortexResult { - file.scan_node_stream(request.clone())?.read_all().await + file.scan_plan_stream(request.clone())?.read_all().await } /// Scan the same request through both paths and assert the outputs are equal. diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 45689f57b67..a095a436b3c 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -108,7 +108,7 @@ fn exact_u64_stat(stat: &Precision) -> Option { } #[test] -fn multi_file_scan_node_data_source_filters_and_projects() -> VortexResult<()> { +fn multi_file_scan_plan_data_source_filters_and_projects() -> VortexResult<()> { use vortex_io::runtime::BlockingRuntime; use vortex_io::runtime::single::SingleThreadRuntime; use vortex_io::session::RuntimeSessionExt; @@ -278,7 +278,7 @@ fn multi_file_scan_node_data_source_filters_and_projects() -> VortexResult<()> { ) .await? 
.ok_or_else(|| { - vortex_error::vortex_err!("scan node data source must plan morsel partitions") + vortex_error::vortex_err!("scan plan data source must plan morsel partitions") })?; assert_eq!(planned.partition_count(), 2); diff --git a/vortex-layout/src/layout_v2.rs b/vortex-layout/src/layout_v2.rs index 2e828b0cff9..9740ed7b0c8 100644 --- a/vortex-layout/src/layout_v2.rs +++ b/vortex-layout/src/layout_v2.rs @@ -27,6 +27,9 @@ use vortex_error::vortex_ensure; use vortex_error::vortex_err; use vortex_flatbuffers::FlatBuffer; use vortex_flatbuffers::layout; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; +use vortex_session::VortexSession; use vortex_session::registry::ReadContext; use vortex_session::registry::Registry; @@ -38,9 +41,6 @@ use crate::scan::v2::layouts::dict as scan_dict; use crate::scan::v2::layouts::flat as scan_flat; use crate::scan::v2::layouts::struct_ as scan_struct; use crate::scan::v2::layouts::zoned as scan_zoned; -use crate::scan::v2::node::ExpandCtx; -use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentId; /// A reference-counted, type-erased v2 layout. @@ -95,12 +95,12 @@ pub trait VTable: 'static + Clone + Send + Sync + Debug { /// Returns the relationship between child `idx` and its parent. fn child_type(layout: Layout, idx: usize) -> VortexResult; - /// Expand this layout into a scan2 node. - fn new_scan_node( + /// Expand this layout into a physical scan plan. + fn new_scan_plan( layout: Layout, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult; + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult; } /// Object-safe plugin for deserializing v2 layouts by ID. 
@@ -338,8 +338,11 @@ trait DynLayout: 'static + Send + Sync + Debug { fn dyn_child_type(&self, idx: usize) -> VortexResult; - fn dyn_new_scan_node(&self, req: &mut NodeRequest, cx: &ExpandCtx) - -> VortexResult; + fn dyn_new_scan_plan( + &self, + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult; } impl LayoutRef { @@ -393,13 +396,13 @@ impl LayoutRef { self.0.dyn_child_type(idx) } - /// Expand this layout into a scan2 node. - pub fn new_scan_node( + /// Expand this layout into a physical scan plan. + pub fn new_scan_plan( &self, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - self.0.dyn_new_scan_node(req, cx) + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + self.0.dyn_new_scan_plan(req, session) } /// Returns an iterator over child row offsets. @@ -451,12 +454,12 @@ impl DynLayout for Layout { V::child_type(self.clone(), idx) } - fn dyn_new_scan_node( + fn dyn_new_scan_plan( &self, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - V::new_scan_node(self.clone(), req, cx) + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + V::new_scan_plan(self.clone(), req, session) } } @@ -764,12 +767,12 @@ impl VTable for Flat { vortex_bail!("Flat layout has no child {idx}") } - fn new_scan_node( + fn new_scan_plan( layout: Layout, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - scan_flat::new_scan_node(layout, req, cx) + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_flat::new_scan_plan(layout, req, session) } } @@ -825,12 +828,12 @@ impl VTable for Chunked { Ok(LayoutChildType::Chunk((idx, offset))) } - fn new_scan_node( + fn new_scan_plan( layout: Layout, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - scan_chunked::new_scan_node(layout, req, cx) + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_chunked::new_scan_plan(layout, req, session) } } @@ -885,12 +888,12 @@ impl 
VTable for Struct { } } - fn new_scan_node( + fn new_scan_plan( layout: Layout, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - scan_struct::new_scan_node(layout, req, cx) + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_struct::new_scan_plan(layout, req, session) } } @@ -948,12 +951,12 @@ impl VTable for Dict { } } - fn new_scan_node( + fn new_scan_plan( layout: Layout, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - scan_dict::new_scan_node(layout, req, cx) + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_dict::new_scan_plan(layout, req, session) } } @@ -1024,11 +1027,11 @@ impl VTable for Zoned { } } - fn new_scan_node( + fn new_scan_plan( layout: Layout, - req: &mut NodeRequest, - cx: &ExpandCtx, - ) -> VortexResult { - scan_zoned::new_scan_node(layout, req, cx) + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_zoned::new_scan_plan(layout, req, session) } } diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index 547b2166628..bb8a152775f 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -40,45 +40,44 @@ use vortex_array::scalar::Scalar; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; +use vortex_scan::plan::AggregateAnswer; +use vortex_scan::plan::FileReader; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedAggregate; +use vortex_scan::plan::PreparedAggregateRef; +use vortex_scan::plan::PreparedEvidence; +use vortex_scan::plan::PreparedEvidenceRef; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateCacheRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::RowScope; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use 
vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::evidence::EvidenceFragment; +use vortex_scan::plan::request::EvidenceMode; +use vortex_scan::plan::request::EvidenceRequest; +use vortex_scan::plan::request::ScanRequest; use vortex_session::VortexSession; use crate::layout_v2::Chunked; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; -use crate::scan::v2::evidence::EvidenceFragment; -use crate::scan::v2::node::AggregateAnswer; -use crate::scan::v2::node::ExpandCtx; -use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::PrepareCtx; -use crate::scan::v2::node::PreparedAggregate; -use crate::scan::v2::node::PreparedAggregateRef; -use crate::scan::v2::node::PreparedEvidence; -use crate::scan::v2::node::PreparedEvidenceRef; -use crate::scan::v2::node::PreparedRead; -use crate::scan::v2::node::PreparedReadRef; -use crate::scan::v2::node::PreparedStateCacheRef; -use crate::scan::v2::node::PreparedStateKey; -use crate::scan::v2::node::PushCtx; -use crate::scan::v2::node::RowScope; -use crate::scan::v2::node::ScanNode; -use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::ScanStateRef; -use crate::scan::v2::node::StateCtx; -use crate::scan::v2::request::EvidenceMode; -use crate::scan::v2::request::EvidenceRequest; -use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; -pub(crate) fn new_scan_node( +pub(crate) fn new_scan_plan( layout: Layout, - _req: &mut NodeRequest, - cx: &ExpandCtx, -) -> VortexResult { - Ok(Arc::new(ChunkedScanNode { + _req: &mut ScanRequest, + session: &VortexSession, +) -> VortexResult { + Ok(Arc::new(ChunkedScanPlan { layout: layout.to_layout(), offsets: layout.data().chunk_offsets().to_vec(), - cx: cx.clone(), + session: session.clone(), children: Mutex::new(FxHashMap::default()), })) } @@ -86,17 +85,17 @@ pub(crate) fn new_scan_node( /// Reads a chunked layout: cumulative chunk offsets /// 
(`offsets.len() == chunks + 1`), with chunk children expanded lazily /// through their own layout vtables. -pub struct ChunkedScanNode { +pub struct ChunkedScanPlan { layout: LayoutRef, offsets: Vec, - cx: ExpandCtx, + session: VortexSession, /// Lazily expanded chunk nodes, shared across queries. - children: Mutex>, + children: Mutex>, } /// Per-query states of the lazily expanded chunk nodes. Chunk states /// behind the scan's morsel frontier are dropped by -/// [`ScanNode::release`], so a long scan retains the working set, not +/// [`ScanPlan::release`], so a long scan retains the working set, not /// every chunk it touched. #[derive(Default)] pub struct ChunkedScanState { @@ -116,11 +115,11 @@ pub struct ChunkedScanState { /// Chunk children remain lazy: this node records the expression once and /// replays expression pushdown into each concrete child only when a read, /// evidence request, or aggregate touches that chunk. -pub struct ChunkedExprScanNode { - chunked: Arc, +pub struct ChunkedExprScanPlan { + chunked: Arc, expr: Expression, dtype: DType, - children: Mutex>, + children: Mutex>, } /// Per-query states of lazily pushed chunk children. @@ -132,13 +131,13 @@ pub struct ChunkedExprScanState { } struct ChunkedPreparedEvidence { - node: Arc, + node: Arc, state: Arc, } enum ChunkedAggregateNode { - Root(Arc), - Expr(Arc), + Root(Arc), + Expr(Arc), } struct ChunkedPreparedAggregate { @@ -149,12 +148,12 @@ struct ChunkedPreparedAggregate { } struct ChunkedPreparedRead { - node: Arc, + node: Arc, state: Arc, } struct ChunkedExprPreparedRead { - node: Arc, + node: Arc, state: Arc, } @@ -206,22 +205,26 @@ impl ChunkedEvidenceState { } } -impl ChunkedScanNode { +impl ChunkedScanPlan { fn scan_state(&self, cx: &mut PrepareCtx) -> VortexResult> { let key = PreparedStateKey::new::(self as *const Self as *const () as usize); cx.shared_state(key, || Ok(ChunkedScanState::default())) } - /// The scan node for chunk `idx`, expanding it on first use. 
Lazy + /// The scan plan for chunk `idx`, expanding it on first use. Lazy /// expansion is independent of pushed predicate expressions. - fn child(&self, idx: usize) -> VortexResult { + fn child(&self, idx: usize) -> VortexResult { if let Some(hit) = self.children.lock().get(&idx) { return Ok(Arc::clone(hit)); } - let node = self.cx.expand_free(&self.layout.child(idx)?)?; - self.children.lock().insert(idx, Arc::clone(&node)); - Ok(node) + let mut req = ScanRequest::empty(); + let plan = self + .layout + .child(idx)? + .new_scan_plan(&mut req, &self.session)?; + self.children.lock().insert(idx, Arc::clone(&plan)); + Ok(plan) } /// The planned value read for chunk `idx`, creating it on first use. @@ -252,8 +255,8 @@ impl ChunkedScanNode { } } -impl ChunkedExprScanNode { - fn new(chunked: Arc, expr: Expression, dtype: DType) -> Self { +impl ChunkedExprScanPlan { + fn new(chunked: Arc, expr: Expression, dtype: DType) -> Self { Self { chunked, expr, @@ -262,7 +265,7 @@ impl ChunkedExprScanNode { } } - fn child(&self, idx: usize, session: &VortexSession) -> VortexResult { + fn child(&self, idx: usize, session: &VortexSession) -> VortexResult { if let Some(hit) = self.children.lock().get(&idx) { return Ok(Arc::clone(hit)); } @@ -313,7 +316,7 @@ impl ChunkedAggregateNode { } } - fn child(&self, idx: usize, io: &FileReader) -> VortexResult { + fn child(&self, idx: usize, io: &FileReader) -> VortexResult { match self { Self::Root(node) => node.child(idx), Self::Expr(node) => node.child(idx, io.session()), @@ -450,7 +453,7 @@ impl PreparedAggregate for ChunkedPreparedAggregate { } } -impl ScanNode for ChunkedScanNode { +impl ScanPlan for ChunkedScanPlan { type State = ChunkedScanState; fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { @@ -466,12 +469,12 @@ impl ScanNode for ChunkedScanNode { self: Arc, expr: &Expression, _cx: &mut PushCtx, - ) -> VortexResult> { + ) -> VortexResult> { if is_root(expr) { return Ok(Some(self)); } let dtype = 
expr.return_dtype(self.layout.dtype())?; - Ok(Some(Arc::new(ChunkedExprScanNode::new( + Ok(Some(Arc::new(ChunkedExprScanPlan::new( self, expr.clone(), dtype, @@ -482,7 +485,7 @@ impl ScanNode for ChunkedScanNode { self: Arc, cx: &mut PrepareCtx, ) -> VortexResult> { - let node = Arc::new(ChunkedExprScanNode::new( + let node = Arc::new(ChunkedExprScanPlan::new( Arc::clone(&self), root(), self.layout.dtype().clone(), @@ -733,7 +736,7 @@ impl PreparedRead for ChunkedPreparedRead { } } -impl ScanNode for ChunkedExprScanNode { +impl ScanPlan for ChunkedExprScanPlan { type State = ChunkedExprScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index a136a7cd19f..b6b2a22b9cc 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -44,46 +44,46 @@ use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::AllOr; use vortex_mask::Mask; +use vortex_scan::plan::FileReader; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::RowScope; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::request::ScanRequest; +use vortex_session::VortexSession; use crate::layout_v2::Dict; use crate::layout_v2::Layout; use crate::layouts::SharedArrayFuture; -use crate::scan::v2::node::ExpandCtx; -use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::PrepareCtx; -use crate::scan::v2::node::PreparedRead; -use crate::scan::v2::node::PreparedReadRef; -use crate::scan::v2::node::PreparedStateKey; -use crate::scan::v2::node::PushCtx; -use crate::scan::v2::node::RowScope; -use crate::scan::v2::node::ScanNode; -use crate::scan::v2::node::ScanNodeRef; -use 
crate::scan::v2::node::StateCtx; -use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; -pub(crate) fn new_scan_node( +pub(crate) fn new_scan_plan( layout: Layout, - _req: &mut NodeRequest, - cx: &ExpandCtx, -) -> VortexResult { + _req: &mut ScanRequest, + session: &VortexSession, +) -> VortexResult { let values = layout.child(0)?; let codes = layout.child(1)?; - Ok(Arc::new(DictScanNode { + Ok(Arc::new(DictScanPlan { values_len: values.row_count(), // Values and codes live in other row domains. - values: cx.expand_free(&values)?, - codes: cx.expand_free(&codes)?, + values: values.new_scan_plan(&mut ScanRequest::empty(), session)?, + codes: codes.new_scan_plan(&mut ScanRequest::empty(), session)?, })) } /// Reads a dict layout: shared values (another row domain, read once per /// query) plus a codes chain in this node's row domain. -pub struct DictScanNode { - values: ScanNodeRef, +pub struct DictScanPlan { + values: ScanPlanRef, values_len: u64, - codes: ScanNodeRef, + codes: ScanPlanRef, } /// Per-query dictionary caches: the shared values relation and cached @@ -117,20 +117,20 @@ impl Default for DictSharedState { } /// A pushed scalar expression over a dictionary value. -struct DictExprScanNode { - dict: Arc, +struct DictExprScanPlan { + dict: Arc, expr: Expression, } struct DictPreparedRead { - node: Arc, + node: Arc, state: Arc, values_read: PreparedReadRef, codes_read: PreparedReadRef, } struct DictExprPreparedRead { - node: Arc, + node: Arc, state: Arc, values_read: PreparedReadRef, codes_read: PreparedReadRef, @@ -162,7 +162,7 @@ fn sparse_value_expr_candidate(expr: &Expression, values_len: u64, rows: RowScop sparse_dict_candidate(values_len, rows) && value_expr_is_expensive(expr) } -impl DictScanNode { +impl DictScanPlan { /// The values relation wrapped in a `SharedArray`, read once per query. 
fn values( &self, @@ -213,7 +213,7 @@ impl DictScanNode { } } -impl ScanNode for DictScanNode { +impl ScanPlan for DictScanPlan { type State = DictScanState; fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { @@ -224,11 +224,11 @@ impl ScanNode for DictScanNode { self: Arc, expr: &Expression, _cx: &mut PushCtx, - ) -> VortexResult> { + ) -> VortexResult> { if is_root(expr) { Ok(Some(self)) } else { - Ok(Some(Arc::new(DictExprScanNode { + Ok(Some(Arc::new(DictExprScanPlan { dict: self, expr: expr.clone(), }))) @@ -272,7 +272,7 @@ impl ScanNode for DictScanNode { } } -impl ScanNode for DictExprScanNode { +impl ScanPlan for DictExprScanPlan { type State = DictScanState; fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 9c3d6732d3a..386334d1192 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -21,37 +21,37 @@ use vortex_array::serde::SerializedArray; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; +use vortex_scan::plan::FileReader; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::RowScope; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::request::ScanRequest; +use vortex_session::VortexSession; use crate::layout_v2::Flat; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; -use crate::scan::v2::node::ExpandCtx; -use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::PrepareCtx; -use crate::scan::v2::node::PreparedRead; -use crate::scan::v2::node::PreparedReadRef; -use crate::scan::v2::node::PreparedStateKey; -use crate::scan::v2::node::RowScope; -use crate::scan::v2::node::ScanNode; -use 
crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::StateCtx; -use crate::scan::v2::request::NodeRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; -pub(crate) fn new_scan_node( +pub(crate) fn new_scan_plan( layout: Layout, - _req: &mut NodeRequest, - _cx: &ExpandCtx, -) -> VortexResult { - Ok(Arc::new(FlatScanNode { + _req: &mut ScanRequest, + _session: &VortexSession, +) -> VortexResult { + Ok(Arc::new(FlatScanPlan { layout: layout.to_layout(), })) } /// Reads a flat layout: fetches its segment once per query, parses it /// into a (lazy) array, and slices per request. -pub struct FlatScanNode { +pub struct FlatScanPlan { layout: LayoutRef, } @@ -64,11 +64,11 @@ pub struct FlatScanState { } struct FlatPreparedRead { - node: Arc, + node: Arc, state: Arc, } -impl ScanNode for FlatScanNode { +impl ScanPlan for FlatScanPlan { type State = FlatScanState; fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index 8e261b167d7..064e76ad322 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -25,52 +25,56 @@ use vortex_array::scalar_fn::fns::root::Root; use vortex_array::scalar_fn::fns::select::Select; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_scan::plan::ApplyScanPlan; +use vortex_scan::plan::MaskScanPlan; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::StructValueScanPlan; +use vortex_scan::plan::request::ScanRequest; +use vortex_session::VortexSession; use crate::LayoutChildType; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::layout_v2::Struct; -use crate::scan::v2::node::ApplyScanNode; -use 
crate::scan::v2::node::ExpandCtx; -use crate::scan::v2::node::MaskScanNode; -use crate::scan::v2::node::PrepareCtx; -use crate::scan::v2::node::PreparedReadRef; -use crate::scan::v2::node::PushCtx; -use crate::scan::v2::node::ScanNode; -use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::StateCtx; -use crate::scan::v2::node::StructValueScanNode; use crate::scan::v2::referenced_fields; -use crate::scan::v2::request::NodeRequest; use crate::scan::v2::struct_fields; -pub(crate) fn new_scan_node( +pub(crate) fn new_scan_plan( layout: Layout, - _req: &mut NodeRequest, - cx: &ExpandCtx, -) -> VortexResult { + _req: &mut ScanRequest, + session: &VortexSession, +) -> VortexResult { let validity = layout .dtype() .is_nullable() - .then(|| cx.expand(&layout.child(0)?, &mut NodeRequest::empty())) + .then(|| { + layout + .child(0)? + .new_scan_plan(&mut ScanRequest::empty(), session) + }) .transpose()?; - Ok(Arc::new(StructScanNode { + Ok(Arc::new(StructScanPlan { layout: layout.to_layout(), - cx: cx.clone(), + session: session.clone(), children: Mutex::new(FxHashMap::default()), validity, })) } -/// Plans struct field expressions through child scan nodes. -pub struct StructScanNode { +/// Plans struct field expressions through child scan plans. 
+pub struct StructScanPlan { layout: LayoutRef, - cx: ExpandCtx, - children: Mutex>, - validity: Option, + session: VortexSession, + children: Mutex>, + validity: Option, } -impl ScanNode for StructScanNode { +impl ScanPlan for StructScanPlan { type State = (); fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult<()> { @@ -81,7 +85,7 @@ impl ScanNode for StructScanNode { self: Arc, expr: &Expression, cx: &mut PushCtx, - ) -> VortexResult> { + ) -> VortexResult> { let scope = struct_fields(self.layout.dtype())?; if is_root(expr) { return self.push_struct(scope.names().clone(), cx).map(Some); @@ -109,7 +113,7 @@ impl ScanNode for StructScanNode { return Ok(self.apply_validity(child.try_push_expr(&scoped, cx)?)); } let input = self.push_struct(fields.clone().into(), cx)?; - Ok(Some(Arc::new(ApplyScanNode::new(input, expr.clone())))) + Ok(Some(Arc::new(ApplyScanPlan::new(input, expr.clone())))) } fn prepare_read( @@ -124,23 +128,23 @@ impl ScanNode for StructScanNode { } } -impl StructScanNode { +impl StructScanPlan { /// Apply this struct's validity to a pushed single-field node. /// /// The single-field fast paths route straight to a child node, bypassing /// the parent struct's validity. When the struct is nullable we wrap the - /// child in a [`MaskScanNode`] so the parent's null mask is applied to the + /// child in a [`MaskScanPlan`] so the parent's null mask is applied to the /// child result, mirroring the v1 struct reader's `array.mask(validity)`. 
- fn apply_validity(&self, pushed: Option) -> Option { + fn apply_validity(&self, pushed: Option) -> Option { match (pushed, &self.validity) { (Some(node), Some(validity)) => { - Some(Arc::new(MaskScanNode::new(node, Arc::clone(validity)))) + Some(Arc::new(MaskScanPlan::new(node, Arc::clone(validity)))) } (pushed, _) => pushed, } } - fn child_field(&self, name: &FieldName) -> VortexResult { + fn child_field(&self, name: &FieldName) -> VortexResult { if let Some(hit) = self.children.lock().get(name) { return Ok(Arc::clone(hit)); } @@ -148,8 +152,10 @@ impl StructScanNode { if let Ok(LayoutChildType::Field(field)) = self.layout.child_type(idx) && field == *name { - let mut req = NodeRequest::empty(); - let child = self.cx.expand(&self.layout.child(idx)?, &mut req)?; + let child = self + .layout + .child(idx)? + .new_scan_plan(&mut ScanRequest::empty(), &self.session)?; let mut children = self.children.lock(); return Ok(Arc::clone(children.entry(name.clone()).or_insert(child))); } @@ -157,7 +163,7 @@ impl StructScanNode { vortex_bail!("field {name} not found in struct layout") } - fn push_struct(&self, names: FieldNames, cx: &mut PushCtx) -> VortexResult { + fn push_struct(&self, names: FieldNames, cx: &mut PushCtx) -> VortexResult { let fields = names .iter() .map(|name| { @@ -167,7 +173,7 @@ impl StructScanNode { .ok_or_else(|| vortex_error::vortex_err!("field {name} did not push root")) }) .collect::>>()?; - Ok(Arc::new(StructValueScanNode::new( + Ok(Arc::new(StructValueScanPlan::new( names, fields, self.validity.clone(), diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 0a145e5c273..3d9c2fa5fcd 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -42,6 +42,27 @@ use vortex_array::scalar::Scalar; use vortex_error::VortexResult; use vortex_error::vortex_err; use vortex_mask::Mask; +use vortex_scan::plan::AggregateAnswer; +use vortex_scan::plan::FileReader; 
+use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedAggregate; +use vortex_scan::plan::PreparedAggregateRef; +use vortex_scan::plan::PreparedEvidence; +use vortex_scan::plan::PreparedEvidenceRef; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::RowScope; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::evidence::EvidenceFragment; +use vortex_scan::plan::evidence::PredicateEvidenceKind; +use vortex_scan::plan::read_dense; +use vortex_scan::plan::request::EvidenceRequest; +use vortex_scan::plan::request::ScanRequest; use vortex_session::VortexSession; use crate::layout_v2::Layout; @@ -49,43 +70,21 @@ use crate::layout_v2::Zoned; use crate::layouts::zoned::MAX_IS_TRUNCATED; use crate::layouts::zoned::MIN_IS_TRUNCATED; use crate::layouts::zoned::zone_map::ZoneMap; -use crate::scan::v2::evidence::EvidenceFragment; -use crate::scan::v2::evidence::PredicateEvidenceKind; -use crate::scan::v2::node::AggregateAnswer; -use crate::scan::v2::node::ExpandCtx; -use crate::scan::v2::node::FileReader; -use crate::scan::v2::node::PrepareCtx; -use crate::scan::v2::node::PreparedAggregate; -use crate::scan::v2::node::PreparedAggregateRef; -use crate::scan::v2::node::PreparedEvidence; -use crate::scan::v2::node::PreparedEvidenceRef; -use crate::scan::v2::node::PreparedRead; -use crate::scan::v2::node::PreparedReadRef; -use crate::scan::v2::node::PreparedStateKey; -use crate::scan::v2::node::PushCtx; -use crate::scan::v2::node::RowScope; -use crate::scan::v2::node::ScanNode; -use crate::scan::v2::node::ScanNodeRef; -use crate::scan::v2::node::ScanStateRef; -use crate::scan::v2::node::StateCtx; -use crate::scan::v2::node::read_dense; -use crate::scan::v2::request::EvidenceRequest; -use crate::scan::v2::request::NodeRequest; use 
crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; -pub(crate) fn new_scan_node( +pub(crate) fn new_scan_plan( layout: Layout, - req: &mut NodeRequest, - cx: &ExpandCtx, -) -> VortexResult { + req: &mut ScanRequest, + session: &VortexSession, +) -> VortexResult { let zones = layout.child(1)?; - Ok(Arc::new(ZonedScanNode { + Ok(Arc::new(ZonedScanPlan { // The data child preserves this node's rows: pass the // expansion request through. - data: cx.expand(&layout.child(0)?, req)?, + data: layout.child(0)?.new_scan_plan(req, session)?, nzones: zones.row_count(), - zones: cx.expand_free(&zones)?, + zones: zones.new_scan_plan(&mut ScanRequest::empty(), session)?, column_dtype: layout.dtype().clone(), zone_len: layout.data().zone_len() as u64, row_count: layout.row_count(), @@ -95,10 +94,10 @@ pub(crate) fn new_scan_node( /// Reads a zoned layout by delegating to its data child; produces /// per-zone predicate evidence from the stats table. -pub struct ZonedScanNode { - data: ScanNodeRef, +pub struct ZonedScanPlan { + data: ScanPlanRef, /// The zones child (per-zone stats table), read through its own layout vtable. - zones: ScanNodeRef, + zones: ScanPlanRef, nzones: u64, column_dtype: DType, zone_len: u64, @@ -142,23 +141,23 @@ struct ZonedPreparedEvidence { /// Planned ungrouped aggregate over a zoned node's root value. struct ZonedPreparedAggregate { - node: Arc, + node: Arc, state: Arc, zones_read: PreparedReadRef, funcs: Vec, } struct ZonedPreparedRead { - node: Arc, + node: Arc, data: PreparedReadRef, } /// A pushed scalar expression through a zoned wrapper. Reads delegate to /// the pushed data-child expression; evidence combines zone-map proof for /// the expression with any child evidence for the same pushed value. 
-struct ZonedExprScanNode { - data: ScanNodeRef, - zones: ScanNodeRef, +struct ZonedExprScanPlan { + data: ScanPlanRef, + zones: ScanPlanRef, nzones: u64, column_dtype: DType, zone_len: u64, @@ -170,7 +169,7 @@ struct ZonedExprScanNode { } struct ZonedExprPreparedRead { - node: Arc, + node: Arc, data: PreparedReadRef, } @@ -222,7 +221,7 @@ impl ZonedScanState { } } -impl ZonedScanNode { +impl ZonedScanPlan { fn shared_zone_state(&self, cx: &mut PrepareCtx) -> VortexResult> { let key = PreparedStateKey::new::(Arc::as_ptr(&self.zones) as *const () as usize); @@ -665,11 +664,11 @@ impl PreparedEvidence for ZonedPreparedEvidence { } } -impl ScanNode for ZonedScanNode { +impl ScanPlan for ZonedScanPlan { type State = ZonedScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(Self::empty_state_with_data(cx.init_node(&self.data)?)) + Ok(Self::empty_state_with_data(cx.init_plan(&self.data)?)) } fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { @@ -683,7 +682,7 @@ impl ScanNode for ZonedScanNode { self: Arc, expr: &Expression, cx: &mut PushCtx, - ) -> VortexResult> { + ) -> VortexResult> { if is_root(expr) { return Ok(Some(self)); } @@ -699,7 +698,7 @@ impl ScanNode for ZonedScanNode { } else { (None, None) }; - Ok(Some(Arc::new(ZonedExprScanNode { + Ok(Some(Arc::new(ZonedExprScanPlan { data, zones: Arc::clone(&self.zones), nzones: self.nzones, @@ -841,12 +840,12 @@ impl PreparedAggregate for ZonedPreparedAggregate { } } -impl ScanNode for ZonedExprScanNode { +impl ScanPlan for ZonedExprScanPlan { type State = ZonedScanState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(ZonedScanNode::empty_state_with_data( - cx.init_node(&self.data)?, + Ok(ZonedScanPlan::empty_state_with_data( + cx.init_plan(&self.data)?, )) } @@ -866,7 +865,7 @@ impl ScanNode for ZonedExprScanNode { let key = PreparedStateKey::new::( Arc::as_ptr(&self.zones) as *const () as usize, ); - let state = cx.shared_state(key, || 
Ok(ZonedScanNode::empty_state()))?; + let state = cx.shared_state(key, || Ok(ZonedScanPlan::empty_state()))?; let zones_read = Arc::clone(&self.zones) .prepare_read(cx)? .ok_or_else(|| vortex_err!("zoned stats child did not produce a prepared read"))?; diff --git a/vortex-layout/src/scan/v2/mod.rs b/vortex-layout/src/scan/v2/mod.rs index 1d6f9b78c35..8f8f97fa10c 100644 --- a/vortex-layout/src/scan/v2/mod.rs +++ b/vortex-layout/src/scan/v2/mod.rs @@ -1,17 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Scan2 layout-node machinery. +//! Scan2 layout plan machinery. //! //! This module contains the layout-tree expansion vtables and executable -//! [`ScanNode`](node::ScanNode) plans used by the alternate scan implementation. +//! [`ScanPlan`](vortex_scan::plan::ScanPlan) plans used by the alternate scan implementation. -pub mod evidence; -pub mod request; pub mod session; pub(crate) mod layouts; -pub mod node; use vortex_array::dtype::DType; use vortex_array::dtype::FieldName; use vortex_array::dtype::StructFields; @@ -22,6 +19,8 @@ use vortex_array::scalar_fn::fns::binary::Binary; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; +pub use vortex_scan::plan::evidence; +pub use vortex_scan::plan::request; /// Environment variable selecting the file scan implementation. /// @@ -29,7 +28,8 @@ use vortex_error::vortex_err; /// /// - `v1`, `scan`, `scan_builder`, `scan-builder`, `layout-reader`, or unset: use the /// existing LayoutReader-based scan. -/// - `v2`, `scan2`, `scan3`, or `native`: use the scan2 [`node::ScanNode`] implementation. +/// - `v2`, `scan2`, `scan3`, or `native`: use the scan2 +/// [`ScanPlan`](vortex_scan::plan::ScanPlan) implementation. pub const SCAN_IMPL_ENV: &str = "VORTEX_SCAN_IMPL"; /// Returns whether the scan2 implementation should be used by scan data sources. 
diff --git a/vortex-layout/src/segments/mod.rs b/vortex-layout/src/segments/mod.rs index 493629f27e3..e3f7047ea40 100644 --- a/vortex-layout/src/segments/mod.rs +++ b/vortex-layout/src/segments/mod.rs @@ -2,55 +2,15 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors mod cache; -mod scheduled; mod shared; mod sink; -mod source; #[cfg(any(test, feature = "_test-harness"))] mod test; -use std::fmt::Display; -use std::ops::Deref; - pub use cache::*; -pub use scheduled::*; pub use shared::*; pub use sink::*; -pub use source::*; #[cfg(any(test, feature = "_test-harness"))] pub use test::*; -use vortex_error::VortexError; - -/// The identifier for a single segment. -// TODO(ngates): should this be a `[u8]` instead? Allowing for arbitrary segment identifiers? -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct SegmentId(u32); - -impl From for SegmentId { - fn from(value: u32) -> Self { - Self(value) - } -} - -impl TryFrom for SegmentId { - type Error = VortexError; - - fn try_from(value: usize) -> Result { - Ok(Self::from(u32::try_from(value)?)) - } -} - -impl Deref for SegmentId { - type Target = u32; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl Display for SegmentId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "SegmentId({})", self.0) - } -} +pub use vortex_scan::segments::*; diff --git a/vortex-scan/Cargo.toml b/vortex-scan/Cargo.toml index c8144160a39..5c0cf465f5e 100644 --- a/vortex-scan/Cargo.toml +++ b/vortex-scan/Cargo.toml @@ -22,14 +22,18 @@ vortex-buffer = { workspace = true } vortex-error = { workspace = true } vortex-mask = { workspace = true } vortex-session = { workspace = true } -vortex-utils = { workspace = true } +vortex-utils = { workspace = true, features = ["dashmap"] } async-lock = { workspace = true } async-trait = { workspace = true } futures = { workspace = true } parking_lot = { workspace = true } roaring = { workspace = true } +rustc-hash 
= { workspace = true } tracing = { workspace = true } +[dev-dependencies] +futures = { workspace = true, features = ["executor"] } + [lints] workspace = true diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index 9c78e58a612..3a7d583d791 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -22,8 +22,10 @@ //! * We should add a way for the client to negotiate capabilities with the data source, for //! example which encodings it knows about. +pub mod plan; pub mod row_mask; pub mod scheduler; +pub mod segments; pub mod selection; use std::any::Any; @@ -44,6 +46,7 @@ pub use scheduler::SegmentSourceId; pub use scheduler::SegmentSourceMeta; pub use scheduler::WorkPermit; pub use scheduler::WorkRequest; +pub use segments::*; use selection::Selection; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; diff --git a/vortex-layout/src/scan/v2/evidence.rs b/vortex-scan/src/plan/evidence.rs similarity index 97% rename from vortex-layout/src/scan/v2/evidence.rs rename to vortex-scan/src/plan/evidence.rs index 2440708cd1f..86786f87d27 100644 --- a/vortex-layout/src/scan/v2/evidence.rs +++ b/vortex-scan/src/plan/evidence.rs @@ -3,14 +3,12 @@ //! Predicate evidence: coverage-bearing answers for prepared predicates. //! -//! A scan2 predicate is answered at runtime by *evidence fragments*: +//! A scan predicate is answered at runtime by *evidence fragments*: //! row ranges paired with what a producer proves about the -//! predicate over them (plan 017 SP1). The whole-morsel verdicts of the -//! v1 scan (`RangeClassification`) become the degenerate case of one -//! fragment covering the morsel; finer coverage is first-class, so a zone -//! map can prove interior zones while leaving edge rows unknown, and an -//! index can return sparse row masks without forcing the whole morsel -//! down the same path. +//! predicate over them. A whole-morsel verdict is the degenerate case of one +//! 
fragment covering the morsel; finer coverage is first-class, so a zone map +//! can prove interior zones while leaving edge rows unknown, and an index can +//! return sparse row masks without forcing the whole morsel down the same path. //! //! Exactness is explicit in the returned evidence kind. //! [`PredicateEvidenceKind::ExactMask`] proves both selected and rejected diff --git a/vortex-layout/src/scan/v2/node.rs b/vortex-scan/src/plan/mod.rs similarity index 82% rename from vortex-layout/src/scan/v2/node.rs rename to vortex-scan/src/plan/mod.rs index 38905ada810..23e24ede89d 100644 --- a/vortex-layout/src/scan/v2/node.rs +++ b/vortex-scan/src/plan/mod.rs @@ -1,23 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! The scan2 tree: immutable per-layout nodes with value, proof, and mask -//! capabilities (plan 017). +//! Physical scan plans with value, proof, and mask capabilities. //! -//! Like the v1 scan, a file's layout tree expands into one node per -//! layout through v2 layout-vtable scan expansion, and the typed traits here are -//! author-facing: the engine works through the blanket-implemented -//! [`DynScanNode`] adapter. Three things are new: +//! A [`ScanPlan`] is immutable physical scan structure. Layouts are one way to +//! instantiate scan plans, but the runtime traits in this module are not tied to +//! serialized layouts. Engines work through the blanket-implemented +//! [`DynScanPlan`] adapter: //! -//! - expansion is *negotiation*: layout scan vtables see the scoped scan request before -//! expression pushdown prepares reads and evidence (see [`super::request`]); -//! - expression pushdown returns another scan node whose root value is +//! - expression pushdown returns another scan plan whose root value is //! the pushed expression, so reads and evidence are prepared from -//! `root()` of that node instead of reparsing expressions; and +//! 
`root()` of that plan instead of reparsing expressions; and //! - executable prepared reads use one scoped primitive: selection //! controls output cardinality, and demand controls which selected rows //! must contain meaningful values. +pub mod evidence; +pub mod request; + use std::any::TypeId; use std::fmt; use std::ops::Range; @@ -46,16 +46,14 @@ use vortex_error::vortex_err; use vortex_mask::Mask; use vortex_session::VortexSession; -use crate::layout_v2::LayoutRef; -use crate::scan::v2::evidence::EvidenceFragment; -use crate::scan::v2::request::EvidenceRequest; -use crate::scan::v2::request::NodeRequest; -use crate::scan::v2::request::OwnedEvidenceRequest; +use self::evidence::EvidenceFragment; +use self::request::EvidenceRequest; +use self::request::OwnedEvidenceRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; use crate::segments::SegmentSource; -/// Per-file/query IO context for scan2 reads. +/// Per-file/query IO context for scan plan reads. #[derive(Clone)] pub struct FileReader { segments: Arc, @@ -79,14 +77,14 @@ impl FileReader { } } -/// A scan2 node's per-file/query global state, type-erased. +/// A scan plan's per-file/query global state, type-erased. pub type ScanState = dyn std::any::Any + Send + Sync; -/// A reference to a scan2 node's per-file/query global state. +/// A reference to a scan plan's per-file/query global state. pub type ScanStateRef = Arc; -/// A reference-counted, type-erased scan2 node. -pub type ScanNodeRef = Arc; +/// A reference-counted, type-erased scan plan. +pub type ScanPlanRef = Arc; /// A reference-counted, type-erased prepared evidence handle. pub type PreparedEvidenceRef = Arc; @@ -103,7 +101,7 @@ pub type PreparedAggregateRef = Arc; /// A reference-counted, type-erased prepared metadata statistics handle. 
pub type PreparedStatsRef = Arc; -/// Per-file/query cache of scan-node global state while a file's planned +/// Per-file/query cache of scan-plan global state while a file's planned /// reads are initialized. pub type ScanStateCache = FxHashMap; @@ -113,10 +111,12 @@ pub struct PushCtx { } impl PushCtx { + /// Create an expression-pushdown context for one scan session. pub fn new(session: VortexSession) -> Self { Self { session } } + /// Return the scan session used while pushing expressions. pub fn session(&self) -> &VortexSession { &self.session } @@ -142,6 +142,7 @@ impl PrepareCtx { } } + /// Return the scan session used while preparing runtime handles. pub fn session(&self) -> &VortexSession { &self.session } @@ -151,6 +152,7 @@ impl PrepareCtx { Arc::clone(&self.state_cache) } + /// Return shared prepared state for `key`, initializing it on first use. pub fn shared_state( &mut self, key: PreparedStateKey, @@ -192,6 +194,7 @@ pub struct PreparedStateKey { } impl PreparedStateKey { + /// Create a key scoped by the caller's concrete state type and numeric identity. pub fn new(key: usize) -> Self { Self { type_id: TypeId::of::(), @@ -200,41 +203,44 @@ impl PreparedStateKey { } } -/// Context for initializing type-erased scan-node state used by the remaining -/// node-level release and non-read prepared paths. +/// Context for initializing type-erased scan-plan state used by release and +/// non-read prepared paths. pub struct StateCtx<'a> { session: &'a VortexSession, - node_cache: &'a mut ScanStateCache, + plan_cache: &'a mut ScanStateCache, } impl<'a> StateCtx<'a> { - pub fn new(session: &'a VortexSession, node_cache: &'a mut ScanStateCache) -> Self { + /// Create a state-initialization context backed by a scan-plan state cache. + pub fn new(session: &'a VortexSession, plan_cache: &'a mut ScanStateCache) -> Self { Self { session, - node_cache, + plan_cache, } } + /// Return the scan session used while initializing plan state. 
pub fn session(&self) -> &VortexSession { self.session } - pub fn init_node(&mut self, node: &ScanNodeRef) -> VortexResult { - let key = scan_node_key(node); - if let Some(hit) = self.node_cache.get(&key) { + /// Initialize or reuse state for a child plan. + pub fn init_plan(&mut self, plan: &ScanPlanRef) -> VortexResult { + let key = scan_plan_key(plan); + if let Some(hit) = self.plan_cache.get(&key) { return Ok(Arc::clone(hit)); } - let state = node.init_state(self)?; - self.node_cache.insert(key, Arc::clone(&state)); + let state = plan.init_state(self)?; + self.plan_cache.insert(key, Arc::clone(&state)); Ok(state) } } -fn scan_node_key(node: &ScanNodeRef) -> usize { - Arc::as_ptr(node) as *const () as usize +fn scan_plan_key(plan: &ScanPlanRef) -> usize { + Arc::as_ptr(plan) as *const () as usize } -/// One operation's row scope in a scan2 node's input row domain. +/// One operation's row scope in a scan plan's input row domain. #[derive(Clone, Copy, Debug)] pub struct RowScope<'a> { /// Rows still semantically live in the input domain. @@ -244,6 +250,7 @@ pub struct RowScope<'a> { } impl<'a> RowScope<'a> { + /// Create a scope where every selected row is demanded. pub fn selected(selection: &'a Mask) -> Self { Self { selection, @@ -251,6 +258,7 @@ impl<'a> RowScope<'a> { } } + /// Create a scope, validating that demand is a subset of selection. pub fn try_new(selection: &'a Mask, demand: &'a Mask) -> VortexResult { if selection.len() != demand.len() { vortex_bail!( @@ -265,6 +273,7 @@ impl<'a> RowScope<'a> { Ok(Self { selection, demand }) } + /// Return whether every selected row is demanded. pub fn demands_all_selected(self) -> bool { std::ptr::eq(self.selection, self.demand) || self.demand.true_count() == self.selection.true_count() @@ -279,6 +288,7 @@ pub struct OwnedRowScope { } impl OwnedRowScope { + /// Create an owned scope where every selected row is demanded. 
pub fn selected(selection: Mask) -> Self { Self { demand: selection.clone(), @@ -286,11 +296,13 @@ impl OwnedRowScope { } } + /// Create an owned scope, validating that demand is a subset of selection. pub fn try_new(selection: Mask, demand: Mask) -> VortexResult { RowScope::try_new(&selection, &demand)?; Ok(Self { selection, demand }) } + /// Borrow this owned scope as a [`RowScope`]. pub fn as_scope(&self) -> RowScope<'_> { RowScope { selection: &self.selection, @@ -315,52 +327,52 @@ pub struct AggregateAnswer { /// provably all-null spans). pub partial: Option, /// Rows of the requested range the statistics could not answer, as - /// disjoint ascending spans in this node's row coordinates. + /// disjoint ascending spans in this plan's row coordinates. pub residual: Vec>, } -/// A node in the expanded scan2 tree. +/// A plan in a physical scan tree. /// -/// A `ScanNode` is immutable physical scan structure: layout metadata, child node +/// A `ScanPlan` is immutable physical scan structure: metadata, child plan /// references, pushdown behavior, and split hints. Runtime caches live in state /// objects created while preparing reads, evidence, statistics, and aggregates for /// a file scan. -pub trait ScanNode: 'static + Send + Sync { - /// Per-file/query node state: decoded arrays, decoded index state, child node states, and - /// other frontier-released caches shared by prepared handles for this node. +pub trait ScanPlan: 'static + Send + Sync { + /// Per-file/query plan state: decoded arrays, decoded index state, child plan states, and + /// other frontier-released caches shared by prepared handles for this plan. type State: Send + Sync + 'static; - /// Create this node's per-file/query state. + /// Create this plan's per-file/query state. fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult; - /// Try to push `expr` into this node's row domain. The returned node's + /// Try to push `expr` into this plan's row domain. 
The returned plan's /// root value is exactly `expr` in the input row domain. /// - /// The default accepts `root()` as this node and otherwise builds a - /// generic scalar-apply node over this node's root value. Layouts + /// The default accepts `root()` as this plan and otherwise builds a + /// generic scalar-apply plan over this plan's root value. Layouts /// specialize when they can route or rewrite the expression, e.g. /// struct field access or list-offset functions. fn try_push_expr( self: Arc, expr: &Expression, _cx: &mut PushCtx, - ) -> VortexResult> + ) -> VortexResult> where Self: Sized, { if is_root(expr) { Ok(Some(self)) } else { - Ok(Some(Arc::new(ApplyScanNode::new(self, expr.clone())))) + Ok(Some(Arc::new(ApplyScanPlan::new(self, expr.clone())))) } } - /// Prepare value reads for this node's root value. + /// Prepare value reads for this plan's root value. fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult>; - /// Prepare natural row splits for this node's root value. + /// Prepare natural row splits for this plan's root value. /// - /// The default converts this node's cheap split hints into an executable handle. Nodes can + /// The default converts this plan's cheap split hints into an executable handle. Plans can /// override this when split discovery needs request-specific state, I/O, or cost estimates. fn prepare_splits( self: Arc, @@ -374,7 +386,7 @@ pub trait ScanNode: 'static + Send + Sync { .map(|hints| Arc::new(HintPreparedSplit::new(hints.to_vec())) as PreparedSplitRef)) } - /// Prepare predicate evidence for this node's root boolean value. + /// Prepare predicate evidence for this plan's root boolean value. /// /// Preparation performs no IO and returns a direct executable handle. The /// handle may precompute expression rewrites or accepted predicate @@ -389,11 +401,11 @@ pub trait ScanNode: 'static + Send + Sync { Ok(Vec::new()) } - /// Prepare ungrouped aggregates over this node's root value. 
+ /// Prepare ungrouped aggregates over this plan's root value. /// /// The returned handle answers all `funcs` together over a runtime row /// range, producing one [`AggregateAnswer`] per function. `None` means - /// this node cannot answer these aggregates from layout metadata and + /// this plan cannot answer these aggregates from layout metadata and /// the caller should read rows normally. fn prepare_aggregate_partial( self: Arc, @@ -406,10 +418,10 @@ pub trait ScanNode: 'static + Send + Sync { Ok(None) } - /// Prepare metadata statistics for this node's root value. + /// Prepare metadata statistics for this plan's root value. /// /// The returned handle answers the requested aggregate functions positionally over runtime row - /// ranges using metadata only. `None` means this node cannot answer these functions from + /// ranges using metadata only. `None` means this plan cannot answer these functions from /// metadata. fn prepare_stats( self: Arc, @@ -440,7 +452,7 @@ pub trait ScanNode: 'static + Send + Sync { } /// Read every row in `range` through a prepared read. -pub(crate) fn read_dense<'a>( +pub fn read_dense<'a>( read: &'a PreparedReadRef, range: Range, io: &'a FileReader, @@ -462,87 +474,87 @@ fn range_len(range: &Range) -> VortexResult { usize::try_from(len).map_err(|_| vortex_err!("read range exceeds usize")) } -/// Object-safe view of a [`ScanNode`]. Blanket-implemented; never by +/// Object-safe view of a [`ScanPlan`]. Blanket-implemented; never by /// hand. -pub trait DynScanNode: Send + Sync { - /// Create this node's per-file/query state, type-erased. +pub trait DynScanPlan: Send + Sync { + /// Create this plan's per-file/query state, type-erased. fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult; - /// Try to push an expression into this node's row domain. + /// Try to push an expression into this plan's row domain. 
fn try_push_expr( self: Arc, expr: &Expression, cx: &mut PushCtx, - ) -> VortexResult>; + ) -> VortexResult>; - /// Prepare value reads for this node's root value. + /// Prepare value reads for this plan's root value. fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult>; - /// Prepare natural row splits for this node's root value. + /// Prepare natural row splits for this plan's root value. fn prepare_splits( self: Arc, cx: &mut PrepareCtx, ) -> VortexResult>; - /// Prepare predicate evidence for this node's root boolean value. + /// Prepare predicate evidence for this plan's root boolean value. fn prepare_evidence( self: Arc, cx: &mut PrepareCtx, ) -> VortexResult>; - /// Prepare ungrouped aggregates for this node's root value. + /// Prepare ungrouped aggregates for this plan's root value. fn prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], cx: &mut PrepareCtx, ) -> VortexResult>; - /// Prepare metadata statistics for this node's root value. + /// Prepare metadata statistics for this plan's root value. fn prepare_stats( self: Arc, funcs: &[AggregateFnRef], cx: &mut PrepareCtx, ) -> VortexResult>; - /// Preferred morsel boundaries (see [`ScanNode::split_hints`]). + /// Preferred morsel boundaries (see [`ScanPlan::split_hints`]). fn split_hints(&self) -> Option<&[u64]>; - /// Release state behind the frontier (see [`ScanNode::release`]). + /// Release state behind the frontier (see [`ScanPlan::release`]). fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()>; /// Reader-chain description for plan display. 
fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result; } -impl DynScanNode for T { +impl DynScanPlan for T { fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - Ok(Arc::new(ScanNode::init_state(self, cx)?)) + Ok(Arc::new(ScanPlan::init_state(self, cx)?)) } fn try_push_expr( self: Arc, expr: &Expression, cx: &mut PushCtx, - ) -> VortexResult> { - ScanNode::try_push_expr(self, expr, cx) + ) -> VortexResult> { + ScanPlan::try_push_expr(self, expr, cx) } fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { - ScanNode::prepare_read(self, cx) + ScanPlan::prepare_read(self, cx) } fn prepare_splits( self: Arc, cx: &mut PrepareCtx, ) -> VortexResult> { - ScanNode::prepare_splits(self, cx) + ScanPlan::prepare_splits(self, cx) } fn prepare_evidence( self: Arc, cx: &mut PrepareCtx, ) -> VortexResult> { - ScanNode::prepare_evidence(self, cx) + ScanPlan::prepare_evidence(self, cx) } fn prepare_aggregate_partial( @@ -550,7 +562,7 @@ impl DynScanNode for T { funcs: &[AggregateFnRef], cx: &mut PrepareCtx, ) -> VortexResult> { - ScanNode::prepare_aggregate_partial(self, funcs, cx) + ScanPlan::prepare_aggregate_partial(self, funcs, cx) } fn prepare_stats( @@ -558,19 +570,19 @@ impl DynScanNode for T { funcs: &[AggregateFnRef], cx: &mut PrepareCtx, ) -> VortexResult> { - ScanNode::prepare_stats(self, funcs, cx) + ScanPlan::prepare_stats(self, funcs, cx) } fn split_hints(&self) -> Option<&[u64]> { - ScanNode::split_hints(self) + ScanPlan::split_hints(self) } fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { - ScanNode::release(self, frontier, downcast_state::(state)?) + ScanPlan::release(self, frontier, downcast_state::(state)?) } fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ScanNode::fmt_chain(self, f) + ScanPlan::fmt_chain(self, f) } } @@ -852,7 +864,7 @@ pub trait PreparedStats: 'static + Send + Sync { /// Answer aggregate-function statistics over every row of `range`. 
/// /// The returned vector is positional against the functions passed to - /// [`ScanNode::prepare_stats`]. Each element is exact, inexact, or absent for the requested + /// [`ScanPlan::prepare_stats`]. Each element is exact, inexact, or absent for the requested /// aggregate function over `range`. Implementations must not read row values merely to improve /// an estimate. fn stats<'a>( @@ -908,17 +920,18 @@ impl DynPreparedStats for T { } } -/// Virtual node that assembles a struct root value from child nodes in +/// Virtual plan that assembles a struct root value from child plans in /// the same row domain. -pub struct StructValueScanNode { +pub struct StructValueScanPlan { names: FieldNames, - fields: Vec, - validity: Option, + fields: Vec, + validity: Option, split_hints: OnceLock>>, } -impl StructValueScanNode { - pub fn new(names: FieldNames, fields: Vec, validity: Option) -> Self { +impl StructValueScanPlan { + /// Create a virtual plan that assembles a struct from child field plans. + pub fn new(names: FieldNames, fields: Vec, validity: Option) -> Self { Self { names, fields, @@ -946,31 +959,31 @@ impl StructValueScanNode { } } -/// Per-query state for a virtual struct-value node. +/// Per-query state for a virtual struct-value plan. 
pub struct StructValueState { fields: Vec, validity: Option, } struct StructValuePreparedRead { - node: Arc, + plan: Arc, fields: Vec, validity: Option, } -impl ScanNode for StructValueScanNode { +impl ScanPlan for StructValueScanPlan { type State = StructValueState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { let fields = self .fields .iter() - .map(|field| cx.init_node(field)) + .map(|field| cx.init_plan(field)) .collect::>>()?; let validity = self .validity .as_ref() - .map(|validity| cx.init_node(validity)) + .map(|validity| cx.init_plan(validity)) .transpose()?; Ok(StructValueState { fields, validity }) } @@ -995,7 +1008,7 @@ impl ScanNode for StructValueScanNode { }) .transpose()?; Ok(Some(Arc::new(StructValuePreparedRead { - node: self, + plan: self, fields, validity, }))) @@ -1043,7 +1056,7 @@ impl PreparedRead for StructValuePreparedRead { None => Validity::NonNullable, }; Ok(StructArray::try_new( - self.node.names.clone(), + self.plan.names.clone(), arrays, rows.selection.true_count(), validity, @@ -1082,40 +1095,41 @@ impl PreparedRead for StructValuePreparedRead { } fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ScanNode::fmt_chain(self.node.as_ref(), f) + ScanPlan::fmt_chain(self.plan.as_ref(), f) } } -/// Virtual node that applies a scalar expression to another node's root +/// Virtual plan that applies a scalar expression to another plan's root /// value. -pub struct ApplyScanNode { - input: ScanNodeRef, +pub struct ApplyScanPlan { + input: ScanPlanRef, expr: Expression, } -impl ApplyScanNode { - pub fn new(input: ScanNodeRef, expr: Expression) -> Self { +impl ApplyScanPlan { + /// Create a virtual plan that applies `expr` to `input`. 
+ pub fn new(input: ScanPlanRef, expr: Expression) -> Self { Self { input, expr } } } struct ApplyPreparedRead { - node: Arc, + plan: Arc, input: PreparedReadRef, } -impl ScanNode for ApplyScanNode { +impl ScanPlan for ApplyScanPlan { type State = ScanStateRef; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { - cx.init_node(&self.input) + cx.init_plan(&self.input) } fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { let input = Arc::clone(&self.input) .prepare_read(cx)? .ok_or_else(|| vortex_err!("apply input did not produce a prepared read"))?; - Ok(Some(Arc::new(ApplyPreparedRead { node: self, input }))) + Ok(Some(Arc::new(ApplyPreparedRead { plan: self, input }))) } fn release(&self, frontier: u64, state: &Self::State) -> VortexResult<()> { @@ -1141,7 +1155,7 @@ impl PreparedRead for ApplyPreparedRead { ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { let input = self.input.read_scoped(range, rows, io, local).await?; - input.apply(&self.node.expr)?.execute::(local) + input.apply(&self.plan.expr)?.execute::(local) }) } @@ -1159,51 +1173,51 @@ impl PreparedRead for ApplyPreparedRead { } fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ScanNode::fmt_chain(self.node.as_ref(), f) + ScanPlan::fmt_chain(self.plan.as_ref(), f) } } -/// Virtual node that applies a parent struct's validity to another node's root +/// Virtual plan that applies a parent struct's validity to another plan's root /// value. /// /// Reads the `input` value and a non-nullable boolean `validity` array in the /// same row domain and produces `mask(input, validity)`: rows where validity is -/// false become null. This mirrors the v1 struct reader's `array.mask(validity)` -/// behaviour when a single field is projected out of a nullable struct. -pub struct MaskScanNode { - input: ScanNodeRef, - validity: ScanNodeRef, +/// false become null. This preserves parent-struct validity when a single field +/// is projected out of a nullable struct. 
+pub struct MaskScanPlan { + input: ScanPlanRef, + validity: ScanPlanRef, } -impl MaskScanNode { - /// Create a node that masks `input` with a parent struct's `validity`. +impl MaskScanPlan { + /// Create a plan that masks `input` with a parent struct's `validity`. /// /// `validity` must read a non-nullable boolean array in the same row domain /// as `input` (the struct layout's validity child). - pub fn new(input: ScanNodeRef, validity: ScanNodeRef) -> Self { + pub fn new(input: ScanPlanRef, validity: ScanPlanRef) -> Self { Self { input, validity } } } -/// Per-query state for a [`MaskScanNode`]. +/// Per-query state for a [`MaskScanPlan`]. pub struct MaskState { input: ScanStateRef, validity: ScanStateRef, } struct MaskPreparedRead { - node: Arc, + plan: Arc, input: PreparedReadRef, validity: PreparedReadRef, } -impl ScanNode for MaskScanNode { +impl ScanPlan for MaskScanPlan { type State = MaskState; fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { Ok(MaskState { - input: cx.init_node(&self.input)?, - validity: cx.init_node(&self.validity)?, + input: cx.init_plan(&self.input)?, + validity: cx.init_plan(&self.validity)?, }) } @@ -1215,7 +1229,7 @@ impl ScanNode for MaskScanNode { .prepare_read(cx)? 
.ok_or_else(|| vortex_err!("mask validity did not produce a prepared read"))?; Ok(Some(Arc::new(MaskPreparedRead { - node: self, + plan: self, input, validity, }))) @@ -1260,7 +1274,7 @@ impl PreparedRead for MaskPreparedRead { } fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - ScanNode::fmt_chain(self.node.as_ref(), f) + ScanPlan::fmt_chain(self.plan.as_ref(), f) } } @@ -1341,57 +1355,22 @@ impl EvidenceTask for DefaultEvidenceTask { fn downcast_erased_state(state: &ScanState) -> VortexResult<&T> { state.downcast_ref::().ok_or_else(|| { vortex_err!( - "scan2 state type mismatch: expected {}", + "scan plan state type mismatch: expected {}", std::any::type_name::() ) }) } -/// Recover a node's concrete file/query global state from its erased form. -pub(crate) fn downcast_state(state: &ScanState) -> VortexResult<&T::State> { +/// Recover a plan's concrete file/query global state from its erased form. +pub(crate) fn downcast_state(state: &ScanState) -> VortexResult<&T::State> { state.downcast_ref::().ok_or_else(|| { vortex_err!( - "scan2 state type mismatch: expected {}", + "scan plan state type mismatch: expected {}", std::any::type_name::() ) }) } -/// Expands layout encodings through their vtable-provided scan2 nodes. -/// Scan vtables recurse into child layouts through -/// [`ExpandCtx::expand`] (passing the scoped request through -/// row-preserving children) or [`ExpandCtx::expand_free`] (for children -/// in another row domain, and for lazy runtime expansion). -#[derive(Clone)] -pub struct ExpandCtx { - session: VortexSession, -} - -impl ExpandCtx { - /// An expansion context carrying the session used by scan nodes. - pub fn new(session: VortexSession) -> Self { - Self { session } - } - - /// The session scan nodes are expanded with. - pub fn session(&self) -> &VortexSession { - &self.session - } - - /// Expand `layout` through its encoding's scan2 vtable, - /// negotiating `req` on the way down. 
- pub fn expand(&self, layout: &LayoutRef, req: &mut NodeRequest) -> VortexResult { - layout.new_scan_node(req, self) - } - - /// Expand `layout` with an empty request: for children in another row - /// domain (dictionary values, zone tables, index postings) and for - /// chunk children expanded lazily at runtime. - pub fn expand_free(&self, layout: &LayoutRef) -> VortexResult { - self.expand(layout, &mut NodeRequest::empty()) - } -} - #[cfg(test)] mod tests { use std::sync::Arc; @@ -1400,14 +1379,23 @@ mod tests { use vortex_array::aggregate_fn::EmptyOptions; use vortex_array::aggregate_fn::fns::max::Max; use vortex_array::aggregate_fn::fns::min::Min; + use vortex_array::buffer::BufferHandle; use vortex_array::dtype::Nullability; + use vortex_buffer::ByteBuffer; use super::*; - use crate::segments::TestSegments; + + struct TestSegments; + + impl SegmentSource for TestSegments { + fn request(&self, _id: crate::segments::SegmentId) -> crate::segments::SegmentFuture { + Box::pin(async { Ok(BufferHandle::new_host(ByteBuffer::from(Vec::::new()))) }) + } + } struct TestStatsNode; - impl ScanNode for TestStatsNode { + impl ScanPlan for TestStatsNode { type State = (); fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { @@ -1468,14 +1456,14 @@ mod tests { #[test] fn stats_plan_erasure_preserves_positional_results() -> VortexResult<()> { let session = VortexSession::empty(); - let node: ScanNodeRef = Arc::new(TestStatsNode); + let plan_root: ScanPlanRef = Arc::new(TestStatsNode); let funcs = vec![Min.bind(EmptyOptions), Max.bind(EmptyOptions)]; - let plan = node + let plan = plan_root .prepare_stats(&funcs, &mut PrepareCtx::new(session.clone()))? 
- .ok_or_else(|| vortex_err!("test scan node did not return a stats plan"))?; + .ok_or_else(|| vortex_err!("test scan plan did not return a stats plan"))?; let state = plan.init_state(&session)?; - let io = FileReader::new(Arc::new(TestSegments::default()), session); + let io = FileReader::new(Arc::new(TestSegments), session); let stats = futures::executor::block_on(plan.stats(10..20, &io, state.as_ref()))?; assert_eq!(stats.len(), funcs.len()); diff --git a/vortex-layout/src/scan/v2/request.rs b/vortex-scan/src/plan/request.rs similarity index 79% rename from vortex-layout/src/scan/v2/request.rs rename to vortex-scan/src/plan/request.rs index 1551838668e..863c10a6314 100644 --- a/vortex-layout/src/scan/v2/request.rs +++ b/vortex-scan/src/plan/request.rs @@ -1,11 +1,11 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Runtime evidence requests for scan2. +//! Runtime evidence requests for scan plans. //! -//! Expansion produces layout-local [`ScanNode`](super::node::ScanNode) +//! Expansion produces layout-local [`ScanPlan`](super::ScanPlan) //! trees. Predicate, projection, aggregate, and dynamic-filter handling -//! then push expressions into those nodes and ask the resulting nodes for +//! then push expressions into those plans and ask the resulting plans for //! prepared runtime handles. Evidence requests are the per-morsel inputs to //! those prepared evidence handles. @@ -13,8 +13,8 @@ use std::ops::Range; use vortex_array::expr::Expression; -use crate::scan::v2::evidence::PredicateId; -use crate::scan::v2::evidence::PredicateVersion; +use super::evidence::PredicateId; +use super::evidence::PredicateVersion; /// Runtime evidence pass kind. #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -28,15 +28,15 @@ pub enum EvidenceMode { /// Expansion-time context reserved for layout-local scan setup. /// -/// Scan2 no longer carries predicates through expansion. 
Layout scan vtables +/// Layout expansion does not carry predicates directly. Layout scan vtables /// must expose expression behavior through -/// [`ScanNode::try_push_expr`](super::node::ScanNode::try_push_expr), -/// [`ScanNode::prepare_read`](super::node::ScanNode::prepare_read), and -/// [`ScanNode::prepare_evidence`](super::node::ScanNode::prepare_evidence). +/// [`ScanPlan::try_push_expr`](super::ScanPlan::try_push_expr), +/// [`ScanPlan::prepare_read`](super::ScanPlan::prepare_read), and +/// [`ScanPlan::prepare_evidence`](super::ScanPlan::prepare_evidence). #[derive(Debug, Default)] -pub struct NodeRequest; +pub struct ScanRequest; -impl NodeRequest { +impl ScanRequest { /// A request with no relation-scoped predicate payload. pub fn empty() -> Self { Self @@ -72,6 +72,7 @@ impl OwnedEvidenceRequest { } } +/// Borrowed runtime evidence request for a prepared evidence handle. #[derive(Debug)] pub struct EvidenceRequest<'a> { /// The predicate's stable id within this scan. diff --git a/vortex-scan/src/scheduler.rs b/vortex-scan/src/scheduler.rs index 4203c8bdf0e..f30d40ecbd0 100644 --- a/vortex-scan/src/scheduler.rs +++ b/vortex-scan/src/scheduler.rs @@ -3,7 +3,7 @@ //! Coarse-grained resource scheduling for scans. //! -//! The scheduler deliberately starts with one primitive: a slot permit. The V2 ScanNode runtime +//! The scheduler deliberately starts with one primitive: a slot permit. The ScanPlan runtime //! uses one slot per in-flight morsel, which is enough to preserve the existing scan concurrency //! model while giving integrations a shared object they can use to bound concurrent work across //! scans. diff --git a/vortex-scan/src/segments/mod.rs b/vortex-scan/src/segments/mod.rs new file mode 100644 index 00000000000..939ae412718 --- /dev/null +++ b/vortex-scan/src/segments/mod.rs @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
Segment sources and scheduler-visible segment request planning. + +mod scheduled; +mod source; + +use std::fmt::Display; +use std::ops::Deref; + +pub use scheduled::*; +pub use source::*; +use vortex_error::VortexError; + +/// The identifier for a single logical segment. +// TODO(ngates): should this be a `[u8]` instead? Allowing for arbitrary segment identifiers? +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct SegmentId(u32); + +impl From for SegmentId { + fn from(value: u32) -> Self { + Self(value) + } +} + +impl TryFrom for SegmentId { + type Error = VortexError; + + fn try_from(value: usize) -> Result { + Ok(Self::from(u32::try_from(value)?)) + } +} + +impl Deref for SegmentId { + type Target = u32; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Display for SegmentId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "SegmentId({})", self.0) + } +} diff --git a/vortex-layout/src/segments/scheduled.rs b/vortex-scan/src/segments/scheduled.rs similarity index 99% rename from vortex-layout/src/segments/scheduled.rs rename to vortex-scan/src/segments/scheduled.rs index 3dffa5a1ef8..b4befd95d00 100644 --- a/vortex-layout/src/segments/scheduled.rs +++ b/vortex-scan/src/segments/scheduled.rs @@ -15,12 +15,12 @@ use vortex_error::VortexError; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_err; -use vortex_scan::SegmentSourceId; use vortex_session::VortexSession; use vortex_utils::aliases::dash_map::DashMap; use vortex_utils::aliases::dash_map::Entry; use vortex_utils::aliases::hash_set::HashSet; +use crate::scheduler::SegmentSourceId; use crate::segments::SegmentFuture; use crate::segments::SegmentId; use crate::segments::SegmentSource; @@ -109,7 +109,7 @@ impl CancelGroup { /// A scheduler-visible request for one logical segment payload. /// /// The first scheduler API intentionally only models segment payloads. 
If a future custom -/// `ScanNode` needs opaque or non-segment I/O, add that request shape next to this type rather +/// `ScanPlan` needs opaque or non-segment I/O, add that request shape next to this type rather /// than smuggling physical locations into `SegmentRequest`. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct SegmentRequest { @@ -723,11 +723,11 @@ mod tests { use parking_lot::Mutex; use vortex_array::buffer::BufferHandle; use vortex_buffer::ByteBuffer; - use vortex_scan::ScanMeta; - use vortex_scan::ScanScheduler; - use vortex_scan::SegmentSourceMeta; use super::*; + use crate::ScanMeta; + use crate::ScanScheduler; + use crate::SegmentSourceMeta; struct TestSegmentSource; diff --git a/vortex-layout/src/segments/source.rs b/vortex-scan/src/segments/source.rs similarity index 89% rename from vortex-layout/src/segments/source.rs rename to vortex-scan/src/segments/source.rs index a48a79b2889..9ff8a901c93 100644 --- a/vortex-layout/src/segments/source.rs +++ b/vortex-scan/src/segments/source.rs @@ -6,10 +6,11 @@ use vortex_array::buffer::BufferHandle; use vortex_error::VortexResult; use crate::segments::SegmentId; + /// Static future resolving to a segment byte buffer. pub type SegmentFuture = BoxFuture<'static, VortexResult>; -/// A trait for providing segment data to a [`crate::LayoutReader`]. +/// A trait for providing logical segment data to a scan plan. pub trait SegmentSource: 'static + Send + Sync { /// Request a segment, returning a future that will eventually resolve to the segment data. 
fn request(&self, id: SegmentId) -> SegmentFuture; From ea9ef647ed8e17f3959e55d5074f3d7f646cad39 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 23:37:02 -0400 Subject: [PATCH 17/48] Fix struct binding Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/patched/mod.rs | 2 +- vortex-file/src/multi/scan_v2.rs | 93 ++++++++++++++++++-- vortex-file/src/scan_v1_v2_differential.rs | 13 +++ vortex-layout/src/scan/v2/layouts/dict.rs | 3 +- vortex-layout/src/scan/v2/layouts/struct_.rs | 59 ++++++++++++- vortex-layout/src/scan/v2/mod.rs | 2 + 6 files changed, 160 insertions(+), 12 deletions(-) diff --git a/vortex-array/src/arrays/patched/mod.rs b/vortex-array/src/arrays/patched/mod.rs index 0d4a4dbdb5a..a88628e5587 100644 --- a/vortex-array/src/arrays/patched/mod.rs +++ b/vortex-array/src/arrays/patched/mod.rs @@ -43,7 +43,7 @@ //! `indices` and `values` are aligned and accessed together. //! //! ```text -//! +//! //! chunk 0 chunk 0 chunk 0 chunk 0 chunk 0 chunk 0 //! lane 0 lane 1 lane 2 lane 3 lane 4 lane 5 //! 
┌────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐ diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index e9d93aef5b3..ce8d5544f6a 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -33,6 +33,7 @@ use vortex_array::expr::stats::Precision; use vortex_array::expr::stats::Stat; use vortex_array::scalar::Scalar; use vortex_array::scalar::ScalarValue; +use vortex_array::scalar_fn::fns::dynamic::DynamicExprUpdates; use vortex_array::scalar_fn::fns::get_item::GetItem; use vortex_array::scalar_fn::fns::root::Root; use vortex_array::stats::StatsSet; @@ -47,6 +48,7 @@ use vortex_io::filesystem::FileSystemRef; use vortex_io::runtime::Handle; use vortex_io::session::RuntimeSessionExt; use vortex_layout::scan::v2::validate_temporal_comparisons; +use vortex_layout::scan::v2::with_row_idx; use vortex_mask::Mask; use vortex_metrics::MetricsRegistry; use vortex_scan::DataSource; @@ -968,6 +970,7 @@ fn expand_file_root(file: &VortexFile, session: &VortexSession) -> VortexResult< .layout2() .ok_or_else(|| vortex_err!("scan2 requires a v2 footer layout"))?; let root = layout.new_scan_plan(&mut plan_request, session)?; + let root = with_row_idx(root, file.dtype().clone(), 0); Ok(match file.footer().statistics().cloned() { Some(stats) => FileStatsScanPlan::try_new( Arc::clone(&root), @@ -1106,6 +1109,7 @@ struct MorselState { evidence: Vec>, pending_evidence: usize, next_predicate: usize, + next_recheck_predicate: usize, } struct PartitionWorkSchedulerState { @@ -1388,6 +1392,12 @@ impl PartitionWorkSchedulerState { return Ok(()); } if morsel.next_predicate >= morsel.prepared.predicates.len() { + if self.enqueue_recheck_evidence(morsel_id)? 
{ + return Ok(()); + } + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(()); + }; let projection = morsel.prepared.plan_projection_work( morsel_id, morsel.range.clone(), @@ -1414,6 +1424,8 @@ impl PartitionWorkSchedulerState { morsel_id, predicate_idx, morsel.range.clone(), + morsel.prepared.predicates[predicate_idx].version(), + EvidenceMode::Normal, )?; let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { @@ -1426,7 +1438,7 @@ impl PartitionWorkSchedulerState { let evidence = PredicateEvidence::new( morsel.prepared.predicates[predicate_idx].id, - PredicateVersion::STATIC, + morsel.prepared.predicates[predicate_idx].version(), morsel.range.clone(), )?; let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { @@ -1452,6 +1464,7 @@ impl PartitionWorkSchedulerState { predicate_idx, morsel.range.clone(), need, + morsel.prepared.predicates[predicate_idx].version(), )?; let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { return Ok(()); @@ -1462,6 +1475,49 @@ impl PartitionWorkSchedulerState { } } + fn enqueue_recheck_evidence(&mut self, morsel_id: usize) -> VortexResult { + loop { + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(false); + }; + if morsel.next_recheck_predicate >= morsel.prepared.predicates.len() { + return Ok(false); + } + + let predicate_idx = morsel.next_recheck_predicate; + let predicate = &morsel.prepared.predicates[predicate_idx]; + let current_version = predicate.version(); + let evidence_version = morsel.evidence[predicate_idx] + .as_ref() + .map(PredicateEvidence::version) + .unwrap_or(PredicateVersion::STATIC); + + if predicate.dynamic_updates.is_some() + && predicate.has_recheck_evidence() + && current_version != evidence_version + { + let work = morsel.prepared.plan_evidence_work( + morsel_id, + predicate_idx, + morsel.range.clone(), + current_version, + 
EvidenceMode::RecheckBeforeProjection, + )?; + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(false); + }; + morsel.pending_evidence = morsel.pending_evidence.saturating_add(1); + self.evidence_queue.push_back(work); + return Ok(true); + } + + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(false); + }; + morsel.next_recheck_predicate = morsel.next_recheck_predicate.saturating_add(1); + } + } + fn finish_empty_morsel(&mut self, morsel_id: usize) -> Option { if self.finish_morsel(morsel_id) && self.ordered { self.completed_morsels @@ -1709,10 +1765,26 @@ struct PreparedScanPlanFile { struct PreparedPredicate { id: PredicateId, expr: Expression, + dynamic_updates: Option, read: PreparedReadRef, evidence: Vec, } +impl PreparedPredicate { + fn version(&self) -> PredicateVersion { + self.dynamic_updates + .as_ref() + .map(|updates| PredicateVersion::new(updates.version())) + .unwrap_or(PredicateVersion::STATIC) + } + + fn has_recheck_evidence(&self) -> bool { + self.evidence + .iter() + .any(|plan| plan.recheck_before_projection()) + } +} + struct RegisteredScheduledSegmentSource { source: Arc, } @@ -1774,9 +1846,11 @@ impl PreparedScanPlanFile { .prepare_read(&mut prepare_ctx)? 
.ok_or_else(|| vortex_err!("scan2 could not plan predicate read {expr}"))?; let evidence = pushed.prepare_evidence(&mut prepare_ctx)?; + let dynamic_updates = DynamicExprUpdates::new(&expr); Ok(PreparedPredicate { id, expr, + dynamic_updates, read, evidence, }) @@ -1842,6 +1916,7 @@ impl PreparedScanPlanFile { evidence: (0..self.predicates.len()).map(|_| None).collect(), pending_evidence: 0, next_predicate: 0, + next_recheck_predicate: 0, }; Ok(Some(PlannedMorselWork { @@ -1855,18 +1930,23 @@ impl PreparedScanPlanFile { morsel_id: usize, predicate_idx: usize, range: Range, + version: PredicateVersion, + mode: EvidenceMode, ) -> VortexResult { let predicate = &self.predicates[predicate_idx]; let mut registered = SubmittedSegmentRequests::default(); let req = OwnedEvidenceRequest { id: predicate.id, - version: PredicateVersion::STATIC, + version, predicate: predicate.expr.clone(), range: range.clone(), - mode: EvidenceMode::Normal, + mode, }; let mut tasks = Vec::with_capacity(predicate.evidence.len()); for plan in &predicate.evidence { + if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { + continue; + } let task = Arc::clone(plan).begin_evidence(req.clone())?; let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); let requests = task.segment_requests(&mut segment_ctx)?; @@ -1881,8 +1961,7 @@ impl PreparedScanPlanFile { registered, async move { let predicate = &prepared.predicates[predicate_idx]; - let mut acc = - PredicateEvidence::new(predicate.id, PredicateVersion::STATIC, range.clone())?; + let mut acc = PredicateEvidence::new(predicate.id, version, range.clone())?; for task in tasks { for fragment in task.evidence(&prepared.reader).await? 
{ acc.absorb(fragment)?; @@ -1908,6 +1987,7 @@ impl PreparedScanPlanFile { predicate_idx: usize, range: Range, need: Mask, + version: PredicateVersion, ) -> VortexResult { let len = range_len(&range)?; let predicate = &self.predicates[predicate_idx]; @@ -1964,8 +2044,7 @@ impl PreparedScanPlanFile { } let pass = &result & &need; let exact = !&need | &pass; - let mut evidence = - PredicateEvidence::new(predicate.id, PredicateVersion::STATIC, range.clone())?; + let mut evidence = PredicateEvidence::new(predicate.id, version, range.clone())?; evidence.absorb(EvidenceFragment::new( range, PredicateEvidenceKind::ExactMask(exact), diff --git a/vortex-file/src/scan_v1_v2_differential.rs b/vortex-file/src/scan_v1_v2_differential.rs index a8c89c6d4e1..dd05d722478 100644 --- a/vortex-file/src/scan_v1_v2_differential.rs +++ b/vortex-file/src/scan_v1_v2_differential.rs @@ -33,6 +33,8 @@ use vortex_array::expr::Expression; use vortex_array::expr::get_item; use vortex_array::expr::gt; use vortex_array::expr::lit; +use vortex_array::expr::merge; +use vortex_array::expr::pack; use vortex_array::expr::root; use vortex_array::expr::select; use vortex_array::stats::PRUNING_STATS; @@ -41,6 +43,7 @@ use vortex_array::validity::Validity; use vortex_buffer::ByteBufferMut; use vortex_buffer::buffer; use vortex_error::VortexResult; +use vortex_layout::layouts::row_idx::row_idx; use vortex_scan::ScanRequest; use vortex_session::VortexSession; @@ -312,6 +315,16 @@ async fn differential_multi_conjunct_dense() -> VortexResult<()> { assert_v1_eq_v2(&file, request(root(), Some(multi_conjunct_filter()))).await } +#[tokio::test] +async fn differential_single_field_merge_select_projection() -> VortexResult<()> { + let file = write_file(flat_primitive(false), true).await?; + let projection = merge([ + pack([("file_row_number", row_idx())], Nullability::NonNullable), + select(["numbers"], root()), + ]); + assert_v1_eq_v2(&file, request(projection, None)).await +} + /// Reproduces the struct-null bug: 
projecting a single deep field out of a /// nullable nested struct must apply the parent struct's validity. The V2 /// single-field fast path previously bypassed `self.validity`. diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 96fd059db99..18fedfa5ed9 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -140,10 +140,11 @@ struct DictExprPreparedRead { } fn value_expr_is_expensive(expr: &Expression) -> bool { + // TODO: Move this cost classification onto ScalarFnVTable instead of matching function IDs + // here. matches!( expr.id().as_str(), "vortex.like" - | "vortex.byte_length" | "vortex.list.contains" | "vortex.dynamic" | "vortex.variant_get" diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index ab994af23cc..229a3079eb5 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -100,13 +100,14 @@ impl ScanPlan for StructScanPlan { return self.push_struct(names, cx).map(Some); } if let Some(pack) = expr.as_opt::() - && pack.names.len() == 1 - && expr.child(0).is::() + && is_direct_field_projection(expr, &pack.names) { return self.push_struct(pack.names.clone(), cx).map(Some); } let fields = referenced_fields(expr, &scope); - if let [name] = fields.as_slice() { + if let [name] = fields.as_slice() + && can_push_as_single_field(expr, name) + { let scoped = replace(expr.clone(), &get_item(name.clone(), root()), root()); let child = self.child_field(name)?; return Ok(self.apply_validity(child.try_push_expr(&scoped, cx)?)); @@ -184,3 +185,55 @@ fn root_field(expr: &Expression) -> Option<&FieldName> { let name = expr.as_opt::()?; expr.child(0).is::().then_some(name) } + +fn is_direct_field_projection(expr: &Expression, names: &FieldNames) -> bool { + if names.len() != expr.children().len() { + return false; + } + + names + .iter() + 
.zip(expr.children().iter()) + .all(|(name, child)| root_field(child).is_some_and(|field| field == name)) +} + +fn can_push_as_single_field(expr: &Expression, name: &FieldName) -> bool { + if let Some(field) = root_field(expr) { + return field == name; + } + if expr.is::() { + return false; + } + expr.children() + .iter() + .all(|child| can_push_as_single_field(child, name)) +} + +#[cfg(test)] +mod tests { + use vortex_array::dtype::Nullability; + use vortex_array::expr::pack; + + use super::*; + + #[test] + fn pack_of_root_field_is_direct_projection() { + let expr = pack( + [("labels", get_item("labels", root()))], + Nullability::NonNullable, + ); + let pack = expr.as_opt::().expect("pack expression"); + + assert!(is_direct_field_projection(&expr, &pack.names)); + assert!(can_push_as_single_field(&expr, &FieldName::from("labels"))); + } + + #[test] + fn pack_of_root_is_not_child_field_projection() { + let expr = pack([("labels", root())], Nullability::NonNullable); + let pack = expr.as_opt::().expect("pack expression"); + + assert!(!is_direct_field_projection(&expr, &pack.names)); + assert!(!can_push_as_single_field(&expr, &FieldName::from("labels"))); + } +} diff --git a/vortex-layout/src/scan/v2/mod.rs b/vortex-layout/src/scan/v2/mod.rs index 8f8f97fa10c..bb182ed3de2 100644 --- a/vortex-layout/src/scan/v2/mod.rs +++ b/vortex-layout/src/scan/v2/mod.rs @@ -9,6 +9,8 @@ pub mod session; pub(crate) mod layouts; +mod row_idx; +pub use row_idx::with_row_idx; use vortex_array::dtype::DType; use vortex_array::dtype::FieldName; use vortex_array::dtype::StructFields; From f7b42b7160f3d5324a2f65cf72185fc8cc6a6904 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jun 2026 23:37:05 -0400 Subject: [PATCH 18/48] Fix struct binding Signed-off-by: Nicholas Gates --- vortex-layout/src/scan/v2/row_idx.rs | 309 +++++++++++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 vortex-layout/src/scan/v2/row_idx.rs diff --git 
a/vortex-layout/src/scan/v2/row_idx.rs b/vortex-layout/src/scan/v2/row_idx.rs new file mode 100644 index 00000000000..c65dcb5449f --- /dev/null +++ b/vortex-layout/src/scan/v2/row_idx.rs @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt; +use std::ops::Range; +use std::sync::Arc; + +use futures::future::BoxFuture; +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldName; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::expr::Expression; +use vortex_array::expr::is_root; +use vortex_array::expr::root; +use vortex_array::expr::transform::PartitionedExpr; +use vortex_array::expr::transform::partition; +use vortex_array::expr::transform::replace; +use vortex_array::scalar::PValue; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_scan::plan::ApplyScanPlan; +use vortex_scan::plan::FileReader; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::RowScope; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::StructValueScanPlan; +use vortex_scan::plan::default_try_push_expr; +use vortex_scan::segments::SegmentPlanCtx; +use vortex_scan::segments::SegmentRequests; +use vortex_sequence::Sequence; +use vortex_sequence::SequenceArray; + +use crate::layouts::row_idx::RowIdx; +use crate::layouts::row_idx::row_idx; + +pub fn with_row_idx(root: ScanPlanRef, dtype: DType, row_offset: u64) -> ScanPlanRef { + Arc::new(RowIdxScanPlan { + child: root, + dtype, + row_offset, + }) +} + +struct RowIdxScanPlan { + child: ScanPlanRef, + dtype: DType, + row_offset: u64, +} + 
+enum Partitioning { + RowIdx(Expression), + Child(Expression), + Partitioned(Arc>), +} + +#[derive(Clone, PartialEq, Eq, Hash)] +enum Partition { + RowIdx, + Child, +} + +impl Partition { + fn name(&self) -> &str { + match self { + Partition::RowIdx => "row_idx", + Partition::Child => "child", + } + } +} + +impl From for FieldName { + fn from(value: Partition) -> Self { + FieldName::from(value.name()) + } +} + +impl fmt::Display for Partition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.name()) + } +} + +impl RowIdxScanPlan { + fn partition_expr(&self, expr: &Expression) -> VortexResult { + if !contains_row_idx(expr) { + return Ok(Partitioning::Child(expr.clone())); + } + + let mut partitioned = partition(expr.clone(), &self.dtype, |expr| { + if expr.is::() { + vec![Partition::RowIdx] + } else if is_root(expr) { + vec![Partition::Child] + } else { + vec![] + } + })?; + + if partitioned.partitions.len() == 1 { + return Ok(match &partitioned.partition_annotations[0] { + Partition::RowIdx => { + Partitioning::RowIdx(replace(expr.clone(), &row_idx(), root())) + } + Partition::Child => Partitioning::Child(expr.clone()), + }); + } + + partitioned.partitions = partitioned + .partitions + .into_iter() + .map(|p| replace(p, &row_idx(), root())) + .collect(); + + Ok(Partitioning::Partitioned(Arc::new(partitioned))) + } +} + +impl ScanPlan for RowIdxScanPlan { + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { + cx.init_plan(&self.child) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + cx: &mut PushCtx, + ) -> VortexResult> { + match self.partition_expr(expr)? 
{ + Partitioning::RowIdx(expr) => Ok(Some(Arc::new(RowIdxExprScanPlan::try_new( + self.row_offset, + expr, + )?))), + Partitioning::Child(expr) => Arc::clone(&self.child).try_push_expr(&expr, cx), + Partitioning::Partitioned(partitioned) => { + let mut fields = Vec::with_capacity(partitioned.partitions.len()); + for (expr, annotation) in partitioned + .partitions + .iter() + .zip(partitioned.partition_annotations.iter()) + { + let field = match annotation { + Partition::RowIdx => { + Arc::new(RowIdxExprScanPlan::try_new(self.row_offset, expr.clone())?) + as ScanPlanRef + } + Partition::Child => Arc::clone(&self.child) + .try_push_expr(expr, cx)? + .ok_or_else(|| { + vortex_error::vortex_err!( + "row_idx child partition did not push expression {expr}" + ) + })?, + }; + fields.push(field); + } + let input = Arc::new(StructValueScanPlan::new( + partitioned.partition_names.clone(), + fields, + None, + )); + Ok(Some(Arc::new(ApplyScanPlan::new( + input, + partitioned.root.clone(), + )))) + } + } + } + + fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { + Arc::clone(&self.child).prepare_read(cx) + } + + fn release(&self, frontier: u64, state: &vortex_scan::plan::ScanState) -> VortexResult<()> { + self.child.release(frontier, state) + } + + fn split_hints(&self) -> Option<&[u64]> { + self.child.split_hints() + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "row_idx:")?; + self.child.fmt_chain(f) + } +} + +struct RowIdxExprScanPlan { + row_offset: u64, + expr: Expression, + dtype: DType, +} + +impl RowIdxExprScanPlan { + fn try_new(row_offset: u64, expr: Expression) -> VortexResult { + let dtype = expr.return_dtype(&row_idx_dtype())?; + Ok(Self { + row_offset, + expr, + dtype, + }) + } +} + +struct RowIdxPreparedRead { + plan: Arc, +} + +impl ScanPlan for RowIdxExprScanPlan { + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(Arc::new(())) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + 
_cx: &mut PushCtx, + ) -> VortexResult> { + default_try_push_expr(self, expr) + } + + fn prepare_read( + self: Arc, + _cx: &mut PrepareCtx, + ) -> VortexResult> { + Ok(Some(Arc::new(RowIdxPreparedRead { plan: self }))) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "row_idx({})", self.expr) + } +} + +impl PreparedRead for RowIdxPreparedRead { + fn read_scoped<'a>( + &'a self, + range: Range, + rows: RowScope<'a>, + _io: &'a FileReader, + local: &'a mut ExecutionCtx, + ) -> BoxFuture<'a, VortexResult> { + Box::pin(async move { + let dense = idx_array(self.plan.row_offset, &range).into_array(); + if rows.selection.len() != dense.len() { + vortex_bail!( + "selection length {} does not match row_idx range length {}", + rows.selection.len(), + dense.len() + ); + } + if rows.demand.len() != dense.len() { + vortex_bail!( + "demand length {} does not match row_idx range length {}", + rows.demand.len(), + dense.len() + ); + } + let selected = if rows.selection.all_true() { + dense + } else { + dense.filter(rows.selection.clone())? 
+ }; + selected.apply(&self.plan.expr)?.execute::(local) + }) + } + + fn segment_requests( + &self, + _range: Range, + _rows: RowScope<'_>, + _cx: &mut SegmentPlanCtx, + ) -> VortexResult { + Ok(SegmentRequests::none()) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "row_idx({}) -> {}", self.plan.expr, self.plan.dtype) + } +} + +fn idx_array(row_offset: u64, row_range: &Range) -> SequenceArray { + Sequence::try_new( + PValue::U64(row_offset + row_range.start), + PValue::U64(1), + PType::U64, + Nullability::NonNullable, + usize::try_from(row_range.end - row_range.start) + .vortex_expect("row range length must fit in usize"), + ) + .vortex_expect("failed to create row index array") +} + +fn row_idx_dtype() -> DType { + DType::Primitive(PType::U64, Nullability::NonNullable) +} + +fn contains_row_idx(expr: &Expression) -> bool { + expr.is::() || expr.children().iter().any(contains_row_idx) +} From 962c099c70ec15ab772f5d8d4084ccb22c5a3d96 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 20 Jun 2026 12:23:07 -0400 Subject: [PATCH 19/48] Chunk DuckDB exports on chunk boundaries Signed-off-by: Nicholas Gates --- vortex-duckdb/src/exporter/chunked.rs | 145 +++++++++++++++++++++++++ vortex-duckdb/src/exporter/dict.rs | 3 +- vortex-duckdb/src/exporter/mod.rs | 28 ++++- vortex-duckdb/src/exporter/struct_.rs | 6 + vortex-duckdb/src/exporter/validity.rs | 4 + 5 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 vortex-duckdb/src/exporter/chunked.rs diff --git a/vortex-duckdb/src/exporter/chunked.rs b/vortex-duckdb/src/exporter/chunked.rs new file mode 100644 index 00000000000..6e6b6642030 --- /dev/null +++ b/vortex-duckdb/src/exporter/chunked.rs @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex::array::ExecutionCtx; +use vortex::array::IntoArray; +use vortex::array::arrays::ChunkedArray; +use 
vortex::array::arrays::chunked::ChunkedArrayExt; +use vortex::error::VortexResult; +use vortex::error::vortex_ensure; + +use crate::duckdb::VectorRef; +use crate::exporter::ColumnExporter; +use crate::exporter::ConversionCache; +use crate::exporter::canonical; +use crate::exporter::new_array_exporter; + +struct ChunkedExporter { + chunk_offsets: Vec, + chunks: Vec>, +} + +pub(crate) fn new_exporter_with_flatten( + array: ChunkedArray, + cache: &ConversionCache, + ctx: &mut ExecutionCtx, + flatten: bool, +) -> VortexResult> { + if flatten { + return canonical::new_exporter(array.into_array(), cache, ctx); + } + + let chunk_offsets = array.chunk_offsets().to_vec(); + let chunks = array + .chunks() + .iter() + .map(|chunk| new_array_exporter(chunk.clone(), cache, ctx)) + .collect::>>()?; + + Ok(Box::new(ChunkedExporter { + chunk_offsets, + chunks, + })) +} + +impl ChunkedExporter { + fn chunk_index(&self, offset: usize) -> usize { + self.chunk_offsets + .partition_point(|&chunk_offset| chunk_offset <= offset) + .saturating_sub(1) + } +} + +impl ColumnExporter for ChunkedExporter { + fn preferred_batch_len(&self, offset: usize, max_len: usize) -> usize { + if max_len == 0 || self.chunks.is_empty() { + return 0; + } + + let chunk_idx = self.chunk_index(offset); + let chunk_start = self.chunk_offsets[chunk_idx]; + let chunk_end = self.chunk_offsets[chunk_idx + 1]; + let len = (chunk_end - offset).min(max_len); + self.chunks[chunk_idx].preferred_batch_len(offset - chunk_start, len) + } + + fn export( + &self, + offset: usize, + len: usize, + vector: &mut VectorRef, + ctx: &mut ExecutionCtx, + ) -> VortexResult<()> { + if len == 0 { + return Ok(()); + } + + let chunk_idx = self.chunk_index(offset); + let chunk_start = self.chunk_offsets[chunk_idx]; + let chunk_end = self.chunk_offsets[chunk_idx + 1]; + let offset_in_chunk = offset - chunk_start; + vortex_ensure!( + offset + len <= chunk_end, + "chunked DuckDB export range {offset}..{} crosses chunk boundary at 
{chunk_end}", + offset + len + ); + + self.chunks[chunk_idx].export(offset_in_chunk, len, vector, ctx) + } +} + +#[cfg(test)] +mod tests { + use vortex::array::IntoArray; + use vortex::array::VortexSessionExecute; + use vortex::array::arrays::ChunkedArray; + use vortex::array::arrays::DictArray; + use vortex::array::arrays::StructArray; + use vortex::array::arrays::VarBinViewArray; + use vortex::buffer::buffer; + use vortex::error::VortexResult; + + use crate::SESSION; + use crate::duckdb::DataChunk; + use crate::duckdb::LogicalType; + use crate::exporter::ArrayExporter; + use crate::exporter::ConversionCache; + + #[test] + fn chunked_exporter_emits_chunk_aligned_vectors() -> VortexResult<()> { + let values0 = VarBinViewArray::from_iter_str(["a", "b"]).into_array(); + let chunk0 = DictArray::try_new(buffer![0u8, 1].into_array(), values0)?.into_array(); + let dtype = chunk0.dtype().clone(); + + let values1 = VarBinViewArray::from_iter_str(["c", "d", "e"]).into_array(); + let chunk1 = DictArray::try_new(buffer![0u8, 1, 2].into_array(), values1)?.into_array(); + + let field = ChunkedArray::try_new(vec![chunk0, chunk1], dtype)?.into_array(); + let array = StructArray::from_fields(&[("field", field)])?; + let mut exporter = ArrayExporter::try_new( + &array, + &ConversionCache::default(), + SESSION.create_execution_ctx(), + )?; + let mut chunk = DataChunk::new([LogicalType::varchar()]); + + assert!(exporter.export(&mut chunk, None, None)?); + assert_eq!( + format!("{}", String::try_from(&*chunk)?), + r#"Chunk - [1 Columns] +- DICTIONARY VARCHAR: 2 = [ a, b] +"# + ); + + assert!(exporter.export(&mut chunk, None, None)?); + assert_eq!( + format!("{}", String::try_from(&*chunk)?), + r#"Chunk - [1 Columns] +- DICTIONARY VARCHAR: 3 = [ c, d, e] +"# + ); + + assert!(!exporter.export(&mut chunk, None, None)?); + Ok(()) + } +} diff --git a/vortex-duckdb/src/exporter/dict.rs b/vortex-duckdb/src/exporter/dict.rs index cba2f85591f..45aa724dd86 100644 --- 
a/vortex-duckdb/src/exporter/dict.rs +++ b/vortex-duckdb/src/exporter/dict.rs @@ -25,6 +25,7 @@ use crate::exporter::all_invalid; use crate::exporter::cache::ConversionCache; use crate::exporter::constant; use crate::exporter::new_array_exporter; +use crate::exporter::new_array_exporter_with_flatten; struct DictExporter { // Store the dictionary values once and export the same dictionary with each codes chunk. @@ -104,7 +105,7 @@ pub(crate) fn new_exporter_with_flatten( None => { // Create a new reusable dictionary for the values. let mut reusable_dict = ReusableDict::new(values.dtype().try_into()?, values.len()); - new_array_exporter(values.clone(), cache, ctx)?.export( + new_array_exporter_with_flatten(values.clone(), cache, ctx, true)?.export( 0, values.len(), reusable_dict.vector(), diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs index 7282594ec45..1dad7dc9cda 100644 --- a/vortex-duckdb/src/exporter/mod.rs +++ b/vortex-duckdb/src/exporter/mod.rs @@ -5,6 +5,7 @@ mod all_invalid; mod bool; mod cache; mod canonical; +mod chunked; mod constant; mod decimal; mod dict; @@ -26,6 +27,7 @@ pub use cache::ConversionCache; pub use decimal::precision_to_duckdb_storage_size; use vortex::array::ArrayRef; use vortex::array::ExecutionCtx; +use vortex::array::arrays::Chunked; use vortex::array::arrays::Constant; use vortex::array::arrays::Dict; use vortex::array::arrays::List; @@ -37,6 +39,7 @@ use vortex::encodings::sequence::Sequence; use vortex::error::VortexExpect; use vortex::error::VortexResult; use vortex::error::vortex_bail; +use vortex::error::vortex_ensure; use crate::duckdb::DataChunkRef; use crate::duckdb::VectorRef; @@ -96,8 +99,18 @@ impl ArrayExporter { vortex_bail!("Expected {expected_cols} columns in output chunk, got {chunk_cols}"); } - let chunk_len = duckdb_vector_size().min(self.remaining); let position = self.array_len - self.remaining; + let mut chunk_len = duckdb_vector_size().min(self.remaining); + if !zero_projection 
{ + for field in &self.fields { + chunk_len = field.preferred_batch_len(position, chunk_len); + } + } + vortex_ensure!( + chunk_len > 0, + "column exporter returned zero rows for non-empty export" + ); + self.remaining -= chunk_len; chunk.set_len(chunk_len); @@ -156,6 +169,14 @@ impl ArrayExporter { /// the offset, len and `WritableVector` as options. Not sure what it should return though? /// This would allow Vortex extension authors to plug into the DuckDB exporter system. pub trait ColumnExporter: 'static { + /// Preferred number of rows to export next, capped by `max_len`. + /// + /// Exporters that preserve physical structure can use this to keep a DuckDB vector inside a + /// natural boundary, such as a chunked-array child boundary. + fn preferred_batch_len(&self, _offset: usize, max_len: usize) -> usize { + max_len + } + /// Export the given range of data from the Vortex array to the DuckDB vector. fn export( &self, @@ -201,6 +222,11 @@ fn new_array_exporter_with_flatten( Err(array) => array, }; + let array = match array.try_downcast::() { + Ok(array) => return chunked::new_exporter_with_flatten(array, cache, ctx, flatten), + Err(array) => array, + }; + let array = match array.try_downcast::() { Ok(array) => return list::new_exporter(array, cache, ctx), Err(array) => array, diff --git a/vortex-duckdb/src/exporter/struct_.rs b/vortex-duckdb/src/exporter/struct_.rs index e9aca03a3bb..dbe65dd90d0 100644 --- a/vortex-duckdb/src/exporter/struct_.rs +++ b/vortex-duckdb/src/exporter/struct_.rs @@ -61,6 +61,12 @@ pub(crate) fn new_exporter( } impl ColumnExporter for StructExporter { + fn preferred_batch_len(&self, offset: usize, max_len: usize) -> usize { + self.children + .iter() + .fold(max_len, |len, child| child.preferred_batch_len(offset, len)) + } + fn export( &self, offset: usize, diff --git a/vortex-duckdb/src/exporter/validity.rs b/vortex-duckdb/src/exporter/validity.rs index 429699e9f55..13774b33f27 100644 --- a/vortex-duckdb/src/exporter/validity.rs +++ 
b/vortex-duckdb/src/exporter/validity.rs @@ -70,6 +70,10 @@ pub(crate) fn new_exporter( } impl ColumnExporter for ValidityExporter { + fn preferred_batch_len(&self, offset: usize, max_len: usize) -> usize { + self.exporter.preferred_batch_len(offset, max_len) + } + fn export( &self, offset: usize, From 45373cd34b23b1842e30633399c5d255dc004e3f Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 20 Jun 2026 20:26:40 -0400 Subject: [PATCH 20/48] Optimize V2 scan stats and literals Signed-off-by: Nicholas Gates --- .agents/skills/bench-performance/SKILL.md | 8 + encodings/fsst/src/compute/like.rs | 23 +- encodings/fsst/src/dfa/mod.rs | 4 +- .../cpp/include/duckdb_vx/table_function.h | 2 + vortex-duckdb/cpp/table_function.cpp | 3 +- vortex-duckdb/src/column_statistics.rs | 8 +- vortex-duckdb/src/ffi.rs | 6 + vortex-duckdb/src/projection.rs | 7 + vortex-duckdb/src/table_function.rs | 102 ++++++- vortex-file/src/file.rs | 19 ++ vortex-file/src/multi/scan_v2.rs | 74 ++++- vortex-layout/src/scan/v2/layouts/flat.rs | 4 +- vortex-layout/src/scan/v2/layouts/struct_.rs | 4 + vortex-scan/src/plan/mod.rs | 273 +++++++++++++++++- 14 files changed, 492 insertions(+), 45 deletions(-) diff --git a/.agents/skills/bench-performance/SKILL.md b/.agents/skills/bench-performance/SKILL.md index 821f0e022ee..99ef33fa5e9 100644 --- a/.agents/skills/bench-performance/SKILL.md +++ b/.agents/skills/bench-performance/SKILL.md @@ -45,6 +45,14 @@ Do not wait for a deep code read before showing benchmark comparisons or first s - engine/format target(s), for example `datafusion:vortex` versus `datafusion:parquet`; - runtime environment toggles, if the branch exposes any. + If the checkout is an agent worktree, keep benchmark data in the canonical checkout cache rather + than downloading or generating it inside the worktree. 
Prefer a `file://` data URL that points at + `/Users/ngates/git/vortex/vortex-bench/data/...` (or the user's main checkout equivalent), for + example `--opt remote-data-dir=file:///Users/ngates/git/vortex/vortex-bench/data/clickbench_partitioned/` + when the benchmark supports `remote-data-dir`. For local-only suites such as `statpopgen`, run + from the main checkout or arrange the suite's `vortex-bench/data//...` path to reuse that + canonical cache before generating data. + 3. Run a small comparable benchmark through `vx-bench`: ```bash diff --git a/encodings/fsst/src/compute/like.rs b/encodings/fsst/src/compute/like.rs index ae2402a6e51..2da1410eb25 100644 --- a/encodings/fsst/src/compute/like.rs +++ b/encodings/fsst/src/compute/like.rs @@ -3,10 +3,12 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; +use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; use vortex_array::arrays::varbin::VarBinArrayExt; use vortex_array::match_each_integer_ptype; use vortex_array::scalar_fn::fns::like::LikeKernel; @@ -15,9 +17,13 @@ use vortex_error::VortexResult; use crate::FSST; use crate::FSSTArrayExt; +use crate::canonical::canonicalize_fsst; use crate::dfa::FsstMatcher; +use crate::dfa::LikeKind; use crate::dfa::dfa_scan_to_bitbuf; +const DECODE_CONTAINS_MAX_NEEDLE_LEN: usize = 16; + impl LikeKernel for FSST { fn like( array: ArrayView<'_, Self>, @@ -47,9 +53,24 @@ impl LikeKernel for FSST { return Ok(None); }; + let like_kind = LikeKind::parse(pattern_bytes); + if let Some(LikeKind::Contains(needle)) = like_kind + && !needle.is_empty() + && needle.len() <= DECODE_CONTAINS_MAX_NEEDLE_LEN + { + // For short substring patterns, bulk FSST decode plus Arrow's memmem-backed LIKE is + // faster than walking the compressed stream through the byte-at-a-time DFA. 
+ let decoded = canonicalize_fsst(array, ctx)?; + let result = vortex_array::scalar_fn::fns::like::Like + .try_new_array(array.len(), options, [decoded, pattern.clone()])? + .into_array() + .execute::(ctx)? + .into_bool(); + return Ok(Some(result.into_array())); + } + let symbols = array.symbols(); let symbol_lengths = array.symbol_lengths(); - let Some(matcher) = FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)? else { diff --git a/encodings/fsst/src/dfa/mod.rs b/encodings/fsst/src/dfa/mod.rs index 5f67f92997e..2fcafb19cfc 100644 --- a/encodings/fsst/src/dfa/mod.rs +++ b/encodings/fsst/src/dfa/mod.rs @@ -211,7 +211,7 @@ impl FsstMatcher { } /// The subset of LIKE patterns we can handle without decompression. -enum LikeKind<'a> { +pub(crate) enum LikeKind<'a> { /// `prefix%` Prefix(Cow<'a, [u8]>), /// `%needle%` @@ -219,7 +219,7 @@ enum LikeKind<'a> { } impl<'a> LikeKind<'a> { - fn parse(pattern: &'a [u8]) -> Option { + pub(crate) fn parse(pattern: &'a [u8]) -> Option { Self::parse_prefix(pattern).or_else(|| Self::parse_contains(pattern)) } diff --git a/vortex-duckdb/cpp/include/duckdb_vx/table_function.h b/vortex-duckdb/cpp/include/duckdb_vx/table_function.h index 550e1cf3635..0f87ceb3d88 100644 --- a/vortex-duckdb/cpp/include/duckdb_vx/table_function.h +++ b/vortex-duckdb/cpp/include/duckdb_vx/table_function.h @@ -86,6 +86,8 @@ typedef struct { typedef struct { idx_t estimated_cardinality; bool has_estimated_cardinality; + idx_t max_cardinality; + bool has_max_cardinality; } duckdb_vx_node_statistics; typedef struct { diff --git a/vortex-duckdb/cpp/table_function.cpp b/vortex-duckdb/cpp/table_function.cpp index f18557a2d11..c8aaa5cc5af 100644 --- a/vortex-duckdb/cpp/table_function.cpp +++ b/vortex-duckdb/cpp/table_function.cpp @@ -298,7 +298,8 @@ unique_ptr c_cardinality(ClientContext &, const FunctionData *bi auto out = make_uniq(); out->has_estimated_cardinality = stats.has_estimated_cardinality; out->estimated_cardinality = 
stats.estimated_cardinality; - out->has_max_cardinality = false; + out->has_max_cardinality = stats.has_max_cardinality; + out->max_cardinality = stats.max_cardinality; return out; } diff --git a/vortex-duckdb/src/column_statistics.rs b/vortex-duckdb/src/column_statistics.rs index 0ef6717b638..1aad42e619e 100644 --- a/vortex-duckdb/src/column_statistics.rs +++ b/vortex-duckdb/src/column_statistics.rs @@ -7,7 +7,6 @@ use vortex::array::aggregate_fn::EmptyOptions; use vortex::array::aggregate_fn::fns::max::Max; use vortex::array::aggregate_fn::fns::min::Min; use vortex::array::aggregate_fn::fns::null_count::NullCount; -use vortex::array::aggregate_fn::fns::sum::Sum; use vortex::array::aggregate_fn::fns::uncompressed_size_in_bytes::UncompressedSizeInBytes; use vortex::array::stats::StatsSet; use vortex::dtype::DType; @@ -22,14 +21,13 @@ use crate::duckdb::Value; const MIN_INDEX: usize = 0; const MAX_INDEX: usize = 1; -const NULL_COUNT_INDEX: usize = 3; -const BYTE_SIZE_INDEX: usize = 4; +const NULL_COUNT_INDEX: usize = 2; +const BYTE_SIZE_INDEX: usize = 3; pub fn column_statistics_aggregate_fns() -> Vec { vec![ Min.bind(EmptyOptions), Max.bind(EmptyOptions), - Sum.bind(EmptyOptions), NullCount.bind(EmptyOptions), UncompressedSizeInBytes.bind(EmptyOptions), ] @@ -75,7 +73,7 @@ impl ColumnStatistics { } } -#[derive(Default)] +#[derive(Clone, Default)] pub struct ColumnStatisticsAggregate { pub min: Option, pub max: Option, diff --git a/vortex-duckdb/src/ffi.rs b/vortex-duckdb/src/ffi.rs index a07ae3b17a4..880c3f7f01c 100644 --- a/vortex-duckdb/src/ffi.rs +++ b/vortex-duckdb/src/ffi.rs @@ -172,6 +172,12 @@ pub unsafe extern "C-unwind" fn duckdb_table_function_cardinality( match cardinality(bind_data) { Cardinality::Unknown => {} + Cardinality::Exact(c) => { + node_stats.has_estimated_cardinality = true; + node_stats.estimated_cardinality = c as _; + node_stats.has_max_cardinality = true; + node_stats.max_cardinality = c as _; + } Cardinality::Estimate(c) => { 
node_stats.has_estimated_cardinality = true; node_stats.estimated_cardinality = c as _; diff --git a/vortex-duckdb/src/projection.rs b/vortex-duckdb/src/projection.rs index 4521115666c..ba7df81af79 100644 --- a/vortex-duckdb/src/projection.rs +++ b/vortex-duckdb/src/projection.rs @@ -50,6 +50,7 @@ pub struct Projection { pub projection: Expression, pub file_index_column_pos: Option, pub file_row_number_column_pos: Option, + pub is_zero_column: bool, } impl Projection { @@ -106,6 +107,10 @@ impl Projection { real_column_count += 1; } + let is_zero_column = real_column_count == 0 + && file_index_column_pos.is_none() + && file_row_number_column_pos.is_none(); + // Duckdb can request less columns than there are in table i.e. [0, 1] with // 5 columns total. is_star &= real_column_count == column_fields.len() as u64; @@ -123,6 +128,7 @@ impl Projection { projection, file_index_column_pos, file_row_number_column_pos, + is_zero_column, }; } @@ -185,6 +191,7 @@ impl Projection { projection, file_index_column_pos, file_row_number_column_pos, + is_zero_column, } } } diff --git a/vortex-duckdb/src/table_function.rs b/vortex-duckdb/src/table_function.rs index ffd3f281a7a..328f2ae7a8b 100644 --- a/vortex-duckdb/src/table_function.rs +++ b/vortex-duckdb/src/table_function.rs @@ -5,26 +5,32 @@ use std::cmp::max; use std::fmt::Formatter; use std::fmt::{self}; use std::sync::Arc; +use std::sync::OnceLock; use std::sync::atomic::AtomicBool; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use custom_labels::CURRENT_LABELSET; use futures::StreamExt; +use futures::stream; use itertools::Itertools; use num_traits::AsPrimitive; use static_assertions::assert_impl_all; use tracing::debug; use vortex::array::ArrayRef; use vortex::array::Canonical; +use vortex::array::IntoArray; use vortex::array::VortexSessionExecute as _; use vortex::array::arrays::ScalarFn; use vortex::array::arrays::Struct; use vortex::array::arrays::StructArray; use 
vortex::array::arrays::scalar_fn::ScalarFnArrayExt; use vortex::array::optimizer::ArrayOptimizer; +use vortex::array::validity::Validity; +use vortex::dtype::FieldNames; use vortex::error::VortexExpect; use vortex::error::VortexResult; +use vortex::error::vortex_err; use vortex::expr::Expression; use vortex::expr::col; use vortex::expr::stats::Precision; @@ -40,6 +46,7 @@ use vortex::scalar_fn::fns::operators::Operator; use vortex::scalar_fn::fns::pack::Pack; use vortex::scan::DataSourceRef; use vortex::scan::ScanRequest; +use vortex::scan::selection::Selection; use vortex_utils::parallelism::get_available_parallelism; use crate::RUNTIME; @@ -56,6 +63,7 @@ use crate::duckdb::DuckdbStringMapRef; use crate::duckdb::ExpressionRef; use crate::duckdb::TableInitInput; use crate::duckdb::Value; +use crate::duckdb::duckdb_vector_size; use crate::exporter::ArrayExporter; use crate::exporter::ConversionCache; use crate::multi_file::bind_multi_file_scan; @@ -69,6 +77,7 @@ pub struct TableFunctionBind { statistics_source: Option>, filter_exprs: Vec, column_fields: Vec, + column_statistics: Arc>>>, // There exists at least one non-optional table filter or at least one // complex filter is pushed down. has_non_optional_filter: AtomicBool, @@ -83,6 +92,7 @@ impl Clone for TableFunctionBind { // filter_exprs are consumed once in `init_global`. filter_exprs: vec![], column_fields: self.column_fields.clone(), + column_statistics: Arc::clone(&self.column_statistics), has_non_optional_filter: AtomicBool::new( self.has_non_optional_filter.load(Ordering::Relaxed), ), @@ -142,6 +152,8 @@ pub struct PartitionData { pub enum Cardinality { /// Unknown number of rows Unknown, + /// The exact number of rows. + Exact(u64), /// An estimate of the number of rows. 
Estimate(u64), } @@ -156,6 +168,11 @@ pub fn bind(input: &BindInputRef, result: &mut BindResultRef) -> VortexResult>(), + ), column_fields, has_non_optional_filter: AtomicBool::new(false), }) @@ -172,6 +189,7 @@ pub fn init_global(init_input: &TableInitInput) -> VortexResult VortexResult VortexResult TableFunctionGlobal { + TableFunctionGlobal { + iterator: zero_column_iterator(row_count), + batch_id: AtomicU64::new(0), + bytes_total: Arc::new(AtomicU64::new(row_count)), + bytes_read: AtomicU64::new(0), + file_index_column_pos: None, + file_row_number_column_pos: None, + } +} + +fn zero_column_iterator(row_count: u64) -> DataSourceIterator { + let vector_size = u64::try_from(duckdb_vector_size()) + .unwrap_or(u64::MAX) + .max(1); + RUNTIME.block_on_stream_thread_safe(move |_handle| { + let cache = Arc::new(ConversionCache::default()); + stream::unfold((row_count, cache), move |(remaining, cache)| async move { + if remaining == 0 { + return None; + } + let batch_len = remaining.min(vector_size); + let item = usize::try_from(batch_len) + .map_err(|_| vortex_err!("zero-column batch length exceeds usize")) + .and_then(zero_column_array) + .map(|array| (array, Arc::clone(&cache))); + Some((item, (remaining - batch_len, cache))) + }) + }) +} + +fn zero_column_array(len: usize) -> VortexResult { + Ok( + StructArray::try_new(FieldNames::empty(), Vec::new(), len, Validity::NonNullable)? + .into_array(), + ) +} + pub fn init_local(global: &TableFunctionGlobal) -> TableFunctionLocal { unsafe { use custom_labels::sys; @@ -454,6 +522,18 @@ pub fn pushdown_projection_expression( /// Get column-wise statistics. Available only if we're reading a single file. pub fn statistics(bind_data: &TableFunctionBind, column_index: usize) -> Option { let dtype = bind_data.column_fields[column_index].dtype.clone(); + let stats_aggregate = bind_data + .column_statistics + .get(column_index)? 
+ .get_or_init(|| column_statistics_aggregate(bind_data, column_index)) + .as_ref()?; + Some(ColumnStatistics::from(stats_aggregate, dtype)) +} + +fn column_statistics_aggregate( + bind_data: &TableFunctionBind, + column_index: usize, +) -> Option { if let Some(statistics_source) = bind_data.statistics_source.as_ref() { let children = statistics_source.children(); // Otherwise we'd have to open all files eagerly which is a performance @@ -470,8 +550,7 @@ pub fn statistics(bind_data: &TableFunctionBind, column_index: usize) -> Option< Some(inner) => inner.file_stats().stats_sets(), None => return None, }; - let stats_aggregate = ColumnStatisticsAggregate::new(&stats_sets[column_index]); - return Some(ColumnStatistics::from(&stats_aggregate, dtype)); + return Some(ColumnStatisticsAggregate::new(&stats_sets[column_index])); } let name = &bind_data.column_fields[column_index].name; @@ -483,8 +562,7 @@ pub fn statistics(bind_data: &TableFunctionBind, column_index: usize) -> Option< .statistics(&col(name.as_str()), &funcs), ) .ok()?; - let stats_aggregate = ColumnStatisticsAggregate::from_aggregate_stats(&stats); - Some(ColumnStatistics::from(&stats_aggregate, dtype)) + Some(ColumnStatisticsAggregate::from_aggregate_stats(&stats)) } /// Duckdb requires post-filter cardinality estimates, otherwise join planner @@ -498,12 +576,18 @@ pub fn statistics(bind_data: &TableFunctionBind, column_index: usize) -> Option< /// here. const DEFAULT_SELECTIVITY: f64 = 0.2; pub fn cardinality(bind_data: &TableFunctionBind) -> Cardinality { + let has_non_optional_filter = bind_data.has_non_optional_filter.load(Ordering::Relaxed); match bind_data.data_source.row_count() { - Precision::Exact(v) | Precision::Inexact(v) => { - if !bind_data.has_non_optional_filter.load(Ordering::Relaxed) { - // Although we may have an exact upper bound here, reporting - // it as exact has a negative performance impact on tpcds as - // it's not a real post-filter calculation. 
+ Precision::Exact(v) => { + if !has_non_optional_filter { + return Cardinality::Exact(v); + } + let post_cardinality = v as f64 * DEFAULT_SELECTIVITY; + let post_cardinality: u64 = post_cardinality.as_(); + Cardinality::Estimate(max(1, post_cardinality)) + } + Precision::Inexact(v) => { + if !has_non_optional_filter { return Cardinality::Estimate(v); } let post_cardinality = v as f64 * DEFAULT_SELECTIVITY; diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index ebf88f967aa..0e833134389 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -30,6 +30,7 @@ use vortex_layout::segments::SegmentInfo; use vortex_layout::segments::SegmentSource; use vortex_scan::DataSourceRef; use vortex_scan::ScanRequest; +use vortex_scan::plan::ScanPlanRef; use vortex_session::VortexSession; use crate::FileStatistics; @@ -55,6 +56,8 @@ pub struct VortexFile { session: VortexSession, /// None id LayoutReader caching is turned off layout_reader_cache: Option>>, + /// Shared cache for the v2 physical scan plan root. + scan_plan_root_cache: Arc>, } fn layout_reader( @@ -100,6 +103,7 @@ impl VortexFile { scheduled_segment_source, session, layout_reader_cache: None, + scan_plan_root_cache: Arc::new(OnceLock::new()), } } @@ -111,6 +115,7 @@ impl VortexFile { scheduled_segment_source: self.scheduled_segment_source, session: self.session, layout_reader_cache: Some(OnceLock::new()), + scan_plan_root_cache: self.scan_plan_root_cache, } } @@ -184,6 +189,20 @@ impl VortexFile { } } + pub(crate) fn scan_plan_root(&self) -> VortexResult { + if let Some(root) = self.scan_plan_root_cache.get() { + return Ok(Arc::clone(root)); + } + + let root = scan_v2::build_file_scan_plan_root(self)?; + if self.scan_plan_root_cache.set(Arc::clone(&root)).is_err() + && let Some(root) = self.scan_plan_root_cache.get() + { + return Ok(Arc::clone(root)); + } + Ok(root) + } + /// Create a [`DataSource`](vortex_scan::DataSource) from this file for scanning. 
/// /// Wraps the file's layout reader with [`FileStatsLayoutReader`] (when file-level diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index ce8d5544f6a..12a1fe6abd6 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -199,6 +199,33 @@ impl ScanPlan for FileStatsScanPlan { Arc::clone(&self.data).prepare_evidence(cx) } + fn prepare_field_stats( + self: Arc, + field_path: &FieldPath, + funcs: &[AggregateFnRef], + cx: &mut PrepareCtx, + ) -> VortexResult> { + if field_path.parts().len() != 1 { + return Arc::clone(&self.data).prepare_field_stats(field_path, funcs, cx); + } + let Some(name) = field_path.parts()[0].as_name() else { + return Arc::clone(&self.data).prepare_field_stats(field_path, funcs, cx); + }; + let Some(field_idx) = self.fields.find(name) else { + return Ok(None); + }; + let Some(field_dtype) = self.fields.field_by_index(field_idx) else { + return Ok(None); + }; + let stats = self.stats.stats_sets()[field_idx].clone(); + Ok(Some(Arc::new(FilePreparedStats { + stats, + field_dtype, + row_count: self.row_count, + funcs: funcs.to_vec(), + }))) + } + fn prepare_aggregate_partial( self: Arc, funcs: &[AggregateFnRef], @@ -254,11 +281,15 @@ impl ScanPlan for FileStatsExprScanPlan { Arc::clone(&self.data).prepare_aggregate_partial(funcs, cx) } - fn prepare_stats( + fn prepare_field_stats( self: Arc, + field_path: &FieldPath, funcs: &[AggregateFnRef], - _cx: &mut PrepareCtx, + cx: &mut PrepareCtx, ) -> VortexResult> { + if !field_path.is_root() { + return Arc::clone(&self.data).prepare_field_stats(field_path, funcs, cx); + } let stats = self.stats.stats_sets()[self.field_idx].clone(); Ok(Some(Arc::new(FilePreparedStats { stats, @@ -330,6 +361,13 @@ fn root_field(expr: &Expression) -> Option<&FieldName> { expr.child(0).is::().then_some(name) } +fn root_field_path(expr: &Expression) -> Option { + if expr.is::() { + return Some(FieldPath::root()); + } + 
root_field(expr).cloned().map(FieldPath::from_name) +} + /// Static cost estimate for a filter conjunct, used to order predicate evaluation cheapest-first. /// /// We sum a per-node cost over the whole expression tree. Primitive comparisons, null checks and @@ -900,14 +938,25 @@ pub(crate) async fn scan_plan_file_statistics_many( funcs: &[AggregateFnRef], ) -> VortexResult>>> { let session = file.session().clone(); - let root = expand_file_root(&file, &session)?; + let root = file.scan_plan_root()?; let reader = FileReader::new(file.segment_source(), session); let mut result = Vec::with_capacity(exprs.len()); for expr in exprs { - let pushed = push_expr(&root, expr, file.dtype(), reader.session())?; - let Some(plan) = - pushed.prepare_stats(funcs, &mut PrepareCtx::new(reader.session().clone()))? - else { + let plan = if let Some(field_path) = root_field_path(expr) { + Arc::clone(&root).prepare_field_stats( + &field_path, + funcs, + &mut PrepareCtx::new(reader.session().clone()), + )? + } else { + let pushed = push_expr(&root, expr, file.dtype(), reader.session())?; + pushed.prepare_field_stats( + &FieldPath::root(), + funcs, + &mut PrepareCtx::new(reader.session().clone()), + )? + }; + let Some(plan) = plan else { result.push(absent_statistics(funcs)); continue; }; @@ -921,8 +970,7 @@ pub(crate) async fn scan_plan_file_statistics_many( } pub(crate) fn scan_plan_file_splits(file: &VortexFile) -> VortexResult>> { - let session = file.session().clone(); - let root = expand_file_root(file, &session)?; + let root = file.scan_plan_root()?; split_ranges_from_node(&root, file.row_count()) } @@ -931,7 +979,7 @@ pub(crate) async fn scan_plan_file_plan_splits( projection: &Expression, ) -> VortexResult>> { let session = file.session().clone(); - let root = expand_file_root(&file, &session)?; + let root = file.scan_plan_root()?; let pushed = push_expr(&root, projection, file.dtype(), &session)?; let Some(plan) = pushed.prepare_splits(&mut PrepareCtx::new(session.clone()))? 
else { return Ok(std::iter::once(0..file.row_count()).collect()); @@ -963,13 +1011,13 @@ fn split_ranges_from_node(node: &ScanPlanRef, row_count: u64) -> VortexResult VortexResult { +pub(crate) fn build_file_scan_plan_root(file: &VortexFile) -> VortexResult { let mut plan_request = ScanRequest::empty(); let layout = file .footer() .layout2() .ok_or_else(|| vortex_err!("scan2 requires a v2 footer layout"))?; - let root = layout.new_scan_plan(&mut plan_request, session)?; + let root = layout.new_scan_plan(&mut plan_request, file.session())?; let root = with_row_idx(root, file.dtype().clone(), 0); Ok(match file.footer().statistics().cloned() { Some(stats) => FileStatsScanPlan::try_new( @@ -1803,7 +1851,7 @@ impl PreparedScanPlanFile { .map(|filter| filter.optimize_recursive(file.dtype())) .transpose()?; - let root = expand_file_root(&file, &session)?; + let root = file.scan_plan_root()?; let registered_source = Arc::new(RegisteredScheduledSegmentSource { source: file.scheduled_segment_source(), }); diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 9d5f483f1c2..2f456237318 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -17,6 +17,8 @@ use futures::future::BoxFuture; use parking_lot::Mutex; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::SliceArray; use vortex_array::expr::Expression; use vortex_array::serde::SerializedArray; use vortex_error::VortexResult; @@ -203,5 +205,5 @@ pub(crate) fn slice_to_range(array: ArrayRef, range: &Range) -> VortexResul if start == 0 && end == array.len() { return Ok(array); } - array.slice(start..end) + Ok(SliceArray::try_new(array, start..end)?.into_array()) } diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index 229a3079eb5..5d28894ae4a 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ 
b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -35,6 +35,7 @@ use vortex_scan::plan::ScanPlanRef; use vortex_scan::plan::ScanStateRef; use vortex_scan::plan::StateCtx; use vortex_scan::plan::StructValueScanPlan; +use vortex_scan::plan::literal_scan_plan; use vortex_scan::plan::request::ScanRequest; use vortex_session::VortexSession; @@ -86,6 +87,9 @@ impl ScanPlan for StructScanPlan { cx: &mut PushCtx, ) -> VortexResult> { let scope = struct_fields(self.layout.dtype())?; + if let Some(literal) = literal_scan_plan(expr, self.layout.row_count()) { + return Ok(Some(literal)); + } if is_root(expr) { return self.push_struct(scope.names().clone(), cx).map(Some); } diff --git a/vortex-scan/src/plan/mod.rs b/vortex-scan/src/plan/mod.rs index 7841872b490..3f4fe0b2705 100644 --- a/vortex-scan/src/plan/mod.rs +++ b/vortex-scan/src/plan/mod.rs @@ -31,13 +31,21 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::arrays::ConstantArray; use vortex_array::arrays::StructArray; use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::Field; use vortex_array::dtype::FieldNames; +use vortex_array::dtype::FieldPath; +use vortex_array::dtype::Nullability; use vortex_array::expr::Expression; +use vortex_array::expr::get_item; use vortex_array::expr::is_root; +use vortex_array::expr::root; use vortex_array::expr::stats::Precision; +use vortex_array::expr::stats::Stat; use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::literal::Literal; use vortex_array::validity::Validity; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -393,17 +401,30 @@ pub trait ScanPlan: 'static + Send + Sync { Ok(None) } - /// Prepare metadata statistics for this plan's root value. + /// Prepare metadata statistics for a field path rooted at this plan's root value. 
/// - /// The returned handle answers the requested aggregate functions positionally over runtime row - /// ranges using metadata only. `None` means this plan cannot answer these functions from - /// metadata. - fn prepare_stats( + /// The root path means statistics for this plan's root value. Non-root field paths default to + /// expression pushdown followed by preparing stats for the pushed root value. The returned + /// handle answers the requested aggregate functions positionally over runtime row ranges using + /// metadata only. `None` means this plan cannot answer these functions from metadata. + fn prepare_field_stats( self: Arc, - _funcs: &[AggregateFnRef], - _cx: &mut PrepareCtx, + field_path: &FieldPath, + funcs: &[AggregateFnRef], + cx: &mut PrepareCtx, ) -> VortexResult> { - Ok(None) + if field_path.is_root() { + return Ok(None); + } + let Some(expr) = field_path_expr(field_path) else { + return Ok(None); + }; + let Some(pushed) = + Arc::clone(&self).try_push_expr(&expr, &mut PushCtx::new(cx.session().clone()))? + else { + return Ok(None); + }; + pushed.prepare_field_stats(&FieldPath::root(), funcs, cx) } /// Preferred morsel boundaries (chunk edges), for alignment hints. @@ -435,6 +456,184 @@ pub fn default_try_push_expr( } } +/// Return a scan plan for a scalar literal expression. +pub fn literal_scan_plan(expr: &Expression, row_count: u64) -> Option { + let Some(scalar) = expr.as_opt::() else { + return None; + }; + Some(Arc::new(LiteralScanPlan::new(scalar.clone(), row_count)) as ScanPlanRef) +} + +fn field_path_expr(field_path: &FieldPath) -> Option { + let mut expr = root(); + for field in field_path.parts() { + let Field::Name(name) = field else { + return None; + }; + expr = get_item(name.clone(), expr); + } + Some(expr) +} + +/// Virtual plan that reads a scalar literal in any row domain. 
+pub struct LiteralScanPlan { + scalar: Scalar, + row_count: u64, +} + +impl LiteralScanPlan { + /// Create a plan that produces `scalar` for every selected row. + pub fn new(scalar: Scalar, row_count: u64) -> Self { + Self { scalar, row_count } + } +} + +struct LiteralPreparedRead { + scalar: Scalar, + row_count: u64, +} + +struct LiteralPreparedStats { + scalar: Scalar, + row_count: u64, + funcs: Vec, +} + +impl ScanPlan for LiteralScanPlan { + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { + Ok(Arc::new(())) + } + + fn try_push_expr( + self: Arc, + expr: &Expression, + _cx: &mut PushCtx, + ) -> VortexResult> { + if let Some(literal) = literal_scan_plan(expr, self.row_count) { + return Ok(Some(literal)); + } + default_try_push_expr(self, expr) + } + + fn prepare_read( + self: Arc, + _cx: &mut PrepareCtx, + ) -> VortexResult> { + Ok(Some(Arc::new(LiteralPreparedRead { + scalar: self.scalar.clone(), + row_count: self.row_count, + }))) + } + + fn prepare_field_stats( + self: Arc, + field_path: &FieldPath, + funcs: &[AggregateFnRef], + _cx: &mut PrepareCtx, + ) -> VortexResult> { + if !field_path.is_root() { + return Ok(None); + } + Ok(Some(Arc::new(LiteralPreparedStats { + scalar: self.scalar.clone(), + row_count: self.row_count, + funcs: funcs.to_vec(), + }))) + } + + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "literal({}, rows={})", self.scalar, self.row_count) + } +} + +impl PreparedRead for LiteralPreparedRead { + fn read_scoped<'a>( + &'a self, + range: Range, + rows: RowScope<'a>, + _io: &'a FileReader, + _local: &'a mut ExecutionCtx, + ) -> BoxFuture<'a, VortexResult> { + Box::pin(async move { + check_scan_range(&range, self.row_count)?; + Ok(ConstantArray::new(self.scalar.clone(), rows.selection.true_count()).into_array()) + }) + } + + fn segment_requests( + &self, + _range: Range, + _rows: RowScope<'_>, + _cx: &mut SegmentPlanCtx, + ) -> VortexResult { + Ok(SegmentRequests::none()) + } + + fn 
fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "literal") + } +} + +impl PreparedStats for LiteralPreparedStats { + fn init_state(&self, _ctx: &VortexSession) -> VortexResult { + Ok(Arc::new(())) + } + + fn stats<'a>( + &'a self, + range: Range, + _io: &'a FileReader, + _state: &'a ScanState, + ) -> BoxFuture<'a, VortexResult>>> { + Box::pin(async move { + check_scan_range(&range, self.row_count)?; + self.funcs + .iter() + .map(|func| self.stat_for_func(func, range.end - range.start)) + .collect() + }) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "literal_stats") + } +} + +impl LiteralPreparedStats { + fn stat_for_func(&self, func: &AggregateFnRef, len: u64) -> VortexResult> { + let Some(stat) = Stat::from_aggregate_fn(func) else { + return Ok(Precision::Absent); + }; + let Some(dtype) = func.return_dtype(self.scalar.dtype()) else { + return Ok(Precision::Absent); + }; + let value = match stat { + Stat::Min | Stat::Max => { + if len == 0 { + return Ok(Precision::Absent); + } + if self.scalar.value().is_some() { + self.scalar.cast(&dtype)? + } else if dtype.is_nullable() { + Scalar::null(dtype) + } else { + return Ok(Precision::Absent); + } + } + Stat::NullCount => Scalar::primitive( + if self.scalar.value().is_none() { + len + } else { + 0 + }, + Nullability::NonNullable, + ), + _ => return Ok(Precision::Absent), + }; + Ok(Precision::exact(value)) + } +} + /// Read every row in `range` through a prepared read. 
pub fn read_dense<'a>( read: &'a PreparedReadRef, @@ -458,6 +657,17 @@ fn range_len(range: &Range) -> VortexResult { usize::try_from(len).map_err(|_| vortex_err!("read range exceeds usize")) } +fn check_scan_range(range: &Range, row_count: u64) -> VortexResult<()> { + if range.start > range.end || range.end > row_count { + vortex_bail!( + "scan row range {:?} is out of bounds for row count {}", + range, + row_count + ); + } + range_len(range).map(|_| ()) +} + /// Prepared value read for one pushed expression. /// /// A `PreparedRead` is the scan-level runtime handle for a fixed read route. It @@ -645,9 +855,9 @@ pub trait PreparedStats: 'static + Send + Sync { /// Answer aggregate-function statistics over every row of `range`. /// /// The returned vector is positional against the functions passed to - /// [`ScanPlan::prepare_stats`]. Each element is exact, inexact, or absent for the requested - /// aggregate function over `range`. Implementations must not read row values merely to improve - /// an estimate. + /// [`ScanPlan::prepare_field_stats`]. Each element is exact, inexact, or absent for the + /// requested aggregate function over `range`. Implementations must not read row values merely + /// to improve an estimate. 
fn stats<'a>( &'a self, range: Range, @@ -1132,8 +1342,10 @@ mod tests { use vortex_array::aggregate_fn::EmptyOptions; use vortex_array::aggregate_fn::fns::max::Max; use vortex_array::aggregate_fn::fns::min::Min; + use vortex_array::arrays::Constant; use vortex_array::buffer::BufferHandle; use vortex_array::dtype::Nullability; + use vortex_array::expr::lit; use vortex_buffer::ByteBuffer; use super::*; @@ -1158,6 +1370,9 @@ mod tests { expr: &Expression, _cx: &mut PushCtx, ) -> VortexResult> { + if let Some(literal) = literal_scan_plan(expr, 20) { + return Ok(Some(literal)); + } default_try_push_expr(self, expr) } @@ -1168,11 +1383,15 @@ mod tests { Ok(None) } - fn prepare_stats( + fn prepare_field_stats( self: Arc, + field_path: &FieldPath, funcs: &[AggregateFnRef], _cx: &mut PrepareCtx, ) -> VortexResult> { + if !field_path.is_root() { + return Ok(None); + } Ok(Some(Arc::new(TestPreparedStats { len: funcs.len() }))) } @@ -1217,7 +1436,11 @@ mod tests { let funcs = vec![Min.bind(EmptyOptions), Max.bind(EmptyOptions)]; let plan = plan_root - .prepare_stats(&funcs, &mut PrepareCtx::new(session.clone()))? + .prepare_field_stats( + &FieldPath::root(), + &funcs, + &mut PrepareCtx::new(session.clone()), + )? .ok_or_else(|| vortex_err!("test scan plan did not return a stats plan"))?; let state = plan.init_state(&session)?; let io = FileReader::new(Arc::new(TestSegments), session); @@ -1229,4 +1452,28 @@ mod tests { Ok(()) } + + #[test] + fn literal_pushdown_prepares_without_input_read() -> VortexResult<()> { + let session = VortexSession::empty(); + let plan_root: ScanPlanRef = Arc::new(TestStatsNode); + let literal = lit(42i32); + + let plan = plan_root + .try_push_expr(&literal, &mut PushCtx::new(session.clone()))? + .ok_or_else(|| vortex_err!("literal expression was not pushed"))?; + let read = plan + .prepare_read(&mut PrepareCtx::new(session.clone()))? 
+ .ok_or_else(|| vortex_err!("literal scan plan did not return a prepared read"))?; + let io = FileReader::new(Arc::new(TestSegments), session); + let array = futures::executor::block_on(read_dense(&read, 10..15, &io))?; + let constant = array + .as_opt::() + .ok_or_else(|| vortex_err!("literal read did not produce a constant array"))?; + + assert_eq!(array.len(), 5); + assert_eq!(constant.scalar(), &Scalar::from(42i32)); + + Ok(()) + } } From 2225c16e79790f54562e0cc069435720dff9ed4a Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sat, 20 Jun 2026 22:56:54 -0400 Subject: [PATCH 21/48] Fix struct binding Signed-off-by: Nicholas Gates --- encodings/fsst/src/kernel.rs | 36 +++++++ .../src/arrays/chunked/compute/kernel.rs | 3 - vortex-array/src/arrays/dict/compute/rules.rs | 102 ++++++++++++++++++ vortex-array/src/arrays/dict/vtable/kernel.rs | 2 + vortex-array/src/executor.rs | 34 ++++++ vortex-file/src/multi/scan_v2.rs | 35 +++--- 6 files changed, 194 insertions(+), 18 deletions(-) diff --git a/encodings/fsst/src/kernel.rs b/encodings/fsst/src/kernel.rs index 942182097fa..8f36aa8a901 100644 --- a/encodings/fsst/src/kernel.rs +++ b/encodings/fsst/src/kernel.rs @@ -38,14 +38,20 @@ mod tests { use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::ConstantArray; use vortex_array::arrays::FilterArray; use vortex_array::arrays::PrimitiveArray; + use vortex_array::arrays::SharedArray; + use vortex_array::arrays::VarBinArray; use vortex_array::arrays::varbin::builder::VarBinBuilder; use vortex_array::assert_arrays_eq; + use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::expr::byte_length; use vortex_array::expr::root; + use vortex_array::scalar_fn::fns::operators::Operator; use vortex_error::VortexResult; use vortex_mask::Mask; use vortex_session::VortexSession; @@ -237,4 +243,34 @@ 
mod tests { assert_arrays_eq!(result, expected); Ok(()) } + + #[test] + fn test_shared_fsst_parent_kernels() -> VortexResult<()> { + let session = vortex_array::array_session(); + crate::initialize(&session); + let mut ctx = session.create_execution_ctx(); + + let varbin = VarBinArray::from_iter( + ["hello", "", "world!!"].map(Some), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + let len = varbin.len(); + let dtype = varbin.dtype().clone(); + let fsst = fsst_compress(varbin, len, &dtype, &compressor, &mut ctx).into_array(); + let shared = SharedArray::new(fsst).into_array(); + + let lengths = shared.clone().apply(&byte_length(root()))?; + assert_arrays_eq!(lengths, PrimitiveArray::from_iter(vec![5u64, 0, 7])); + + let not_empty = shared + .binary( + ConstantArray::new("", shared.len()).into_array(), + Operator::NotEq, + )? + .execute::(&mut ctx)?; + assert_arrays_eq!(not_empty, BoolArray::from_iter([true, false, true])); + + Ok(()) + } } diff --git a/vortex-array/src/arrays/chunked/compute/kernel.rs b/vortex-array/src/arrays/chunked/compute/kernel.rs index db0042105cd..4d897afb067 100644 --- a/vortex-array/src/arrays/chunked/compute/kernel.rs +++ b/vortex-array/src/arrays/chunked/compute/kernel.rs @@ -5,10 +5,8 @@ use vortex_session::VortexSession; use crate::ArrayVTable; use crate::arrays::Chunked; -use crate::arrays::Dict; use crate::arrays::Filter; use crate::arrays::Slice; -use crate::arrays::dict::TakeExecuteAdaptor; use crate::arrays::filter::FilterExecuteAdaptor; use crate::arrays::slice::SliceExecuteAdaptor; use crate::optimizer::kernels::ArrayKernelsExt; @@ -23,6 +21,5 @@ pub(crate) fn initialize(session: &VortexSession) { kernels.register_execute_parent_kernel(Filter.id(), Chunked, FilterExecuteAdaptor(Chunked)); kernels.register_execute_parent_kernel(Mask.id(), Chunked, MaskExecuteAdaptor(Chunked)); kernels.register_execute_parent_kernel(Slice.id(), Chunked, SliceExecuteAdaptor(Chunked)); - 
kernels.register_execute_parent_kernel(Dict.id(), Chunked, TakeExecuteAdaptor(Chunked)); kernels.register_execute_parent_kernel(Zip.id(), Chunked, ZipExecuteAdaptor(Chunked)); } diff --git a/vortex-array/src/arrays/dict/compute/rules.rs b/vortex-array/src/arrays/dict/compute/rules.rs index b4804218e37..80fb02d5520 100644 --- a/vortex-array/src/arrays/dict/compute/rules.rs +++ b/vortex-array/src/arrays/dict/compute/rules.rs @@ -9,12 +9,15 @@ use crate::EqMode; use crate::IntoArray; use crate::array::ArrayView; use crate::array::VTable; +use crate::arrays::Chunked; +use crate::arrays::ChunkedArray; use crate::arrays::Constant; use crate::arrays::ConstantArray; use crate::arrays::Dict; use crate::arrays::DictArray; use crate::arrays::ScalarFn; use crate::arrays::ScalarFnArray; +use crate::arrays::chunked::ChunkedArrayExt; use crate::arrays::dict::DictArrayExt; use crate::arrays::dict::DictArraySlotsExt; use crate::arrays::filter::FilterReduceAdaptor; @@ -37,11 +40,59 @@ pub(crate) const PARENT_RULES: ParentRuleSet = ParentRuleSet::new(&[ ParentRuleSet::lift(&CastReduceAdaptor(Dict)), ParentRuleSet::lift(&MaskReduceAdaptor(Dict)), ParentRuleSet::lift(&LikeReduceAdaptor(Dict)), + ParentRuleSet::lift(&DictionaryChunkedValuesPullUpRule), ParentRuleSet::lift(&DictionaryScalarFnValuesPushDownRule), ParentRuleSet::lift(&DictionaryScalarFnCodesPullUpRule), ParentRuleSet::lift(&SliceReduceAdaptor(Dict)), ]); +/// Pull a common dictionary values array above chunked dictionary codes. +/// +/// Rewrites `Chunked>` into `Dict, values>` only when +/// every child dictionary shares the exact same values array allocation. 
+#[derive(Debug)] +struct DictionaryChunkedValuesPullUpRule; + +impl ArrayParentReduceRule for DictionaryChunkedValuesPullUpRule { + type Parent = Chunked; + + fn reduce_parent( + &self, + array: ArrayView<'_, Dict>, + parent: ArrayView<'_, Chunked>, + _child_idx: usize, + ) -> VortexResult> { + let values = array.values(); + let codes_dtype = array.codes().dtype().clone(); + let mut code_chunks = Vec::with_capacity(parent.nchunks()); + let mut all_values_referenced = array.has_all_values_referenced(); + + for chunk in parent.iter_chunks() { + let Some(dict) = chunk.as_opt::() else { + return Ok(None); + }; + if dict.codes().dtype() != &codes_dtype { + return Ok(None); + } + if !ArrayRef::ptr_eq(dict.values(), values) { + return Ok(None); + } + all_values_referenced |= dict.has_all_values_referenced(); + code_chunks.push(dict.codes().clone()); + } + + let codes = ChunkedArray::try_new(code_chunks, codes_dtype)?.into_array(); + let dict = DictArray::try_new(codes, values.clone())?; + let dict = if all_values_referenced { + unsafe { dict.set_all_values_referenced(true) } + } else { + dict + }; + + Ok(Some(dict.into_array())) + } +} + /// Push down a scalar function to run only over the values of a dictionary array. 
#[derive(Debug)] struct DictionaryScalarFnValuesPushDownRule; @@ -214,16 +265,67 @@ mod tests { use vortex_buffer::buffer; use vortex_error::VortexResult; + use crate::ArrayRef; use crate::IntoArray; use crate::arrays::BoolArray; + use crate::arrays::Chunked; + use crate::arrays::ChunkedArray; use crate::arrays::Dict; use crate::arrays::DictArray; + use crate::arrays::PrimitiveArray; + use crate::arrays::chunked::ChunkedArrayExt; use crate::arrays::dict::DictArrayExt; + use crate::arrays::dict::DictArraySlotsExt; use crate::arrays::scalar_fn::ScalarFnFactoryExt; + use crate::assert_arrays_eq; use crate::optimizer::ArrayOptimizer; use crate::scalar_fn::EmptyOptions; use crate::scalar_fn::fns::not::Not; + #[test] + fn chunked_dict_with_shared_values_pulls_values_up() -> VortexResult<()> { + let values = buffer![10u32, 20, 30].into_array(); + let chunk0 = DictArray::try_new(buffer![0u8, 1].into_array(), values.clone())?.into_array(); + let chunk1 = + DictArray::try_new(buffer![2u8, 0, 1].into_array(), values.clone())?.into_array(); + let array = + ChunkedArray::try_new(vec![chunk0, chunk1], values.dtype().clone())?.into_array(); + + let optimized = array.optimize()?; + let dict = optimized.as_::(); + let codes = dict.codes().as_::(); + + assert!(ArrayRef::ptr_eq(dict.values(), &values)); + assert_eq!(codes.nchunks(), 2); + assert_arrays_eq!( + optimized, + PrimitiveArray::from_iter([10u32, 20, 30, 10, 20]) + ); + + Ok(()) + } + + #[test] + fn chunked_dict_with_distinct_values_stays_chunked() -> VortexResult<()> { + let values0 = buffer![10u32, 20, 30].into_array(); + let values1 = buffer![10u32, 20, 30].into_array(); + let chunk0 = + DictArray::try_new(buffer![0u8, 1].into_array(), values0.clone())?.into_array(); + let chunk1 = DictArray::try_new(buffer![2u8, 0, 1].into_array(), values1)?.into_array(); + let array = + ChunkedArray::try_new(vec![chunk0, chunk1], values0.dtype().clone())?.into_array(); + + let optimized = array.optimize()?; + + 
assert!(optimized.is::()); + assert_arrays_eq!( + optimized, + PrimitiveArray::from_iter([10u32, 20, 30, 10, 20]) + ); + + Ok(()) + } + #[test] fn scalar_fn_values_pushdown_preserves_all_values_referenced() -> VortexResult<()> { let dict = unsafe { diff --git a/vortex-array/src/arrays/dict/vtable/kernel.rs b/vortex-array/src/arrays/dict/vtable/kernel.rs index ab750f7d663..79659af18dd 100644 --- a/vortex-array/src/arrays/dict/vtable/kernel.rs +++ b/vortex-array/src/arrays/dict/vtable/kernel.rs @@ -4,6 +4,7 @@ use vortex_session::VortexSession; use crate::ArrayVTable; +use crate::arrays::Chunked; use crate::arrays::Dict; use crate::arrays::dict::TakeExecuteAdaptor; use crate::optimizer::kernels::ArrayKernelsExt; @@ -16,6 +17,7 @@ use crate::scalar_fn::fns::fill_null::FillNullExecuteAdaptor; pub(crate) fn initialize(session: &VortexSession) { let kernels = session.kernels(); kernels.register_execute_parent_kernel(Binary.id(), Dict, CompareExecuteAdaptor(Dict)); + kernels.register_execute_parent_kernel(Dict.id(), Chunked, TakeExecuteAdaptor(Chunked)); kernels.register_execute_parent_kernel(Dict.id(), Dict, TakeExecuteAdaptor(Dict)); kernels.register_execute_parent_kernel(FillNull.id(), Dict, FillNullExecuteAdaptor(Dict)); } diff --git a/vortex-array/src/executor.rs b/vortex-array/src/executor.rs index 515e6f7398e..bd8f0f115fb 100644 --- a/vortex-array/src/executor.rs +++ b/vortex-array/src/executor.rs @@ -34,6 +34,9 @@ use crate::ArrayRef; use crate::Canonical; use crate::IntoArray; use crate::array::ArrayId; +use crate::arrays::ScalarFn; +use crate::arrays::Shared; +use crate::arrays::shared::SharedArrayExt; use crate::builders::ArrayBuilder; use crate::builders::builder_with_capacity_in; use crate::dtype::DType; @@ -568,6 +571,37 @@ fn execute_parent_for_child( slot_idx: usize, kernels: &ParentExecutionKernels, ctx: &mut ExecutionCtx, +) -> VortexResult> { + if let Some(result) = execute_parent_for_exact_child(parent, child, slot_idx, kernels, ctx)? 
{ + return Ok(Some(result)); + } + + // Shared is a transparent cache wrapper. For scalar-function parents, try kernels against the + // wrapped source/current array before forcing Shared to canonicalize and populate its cache. + if parent.is::() { + let mut current = child.clone(); + while let Some(source) = current + .as_opt::() + .map(|shared| shared.current_array_ref().clone()) + { + if let Some(result) = + execute_parent_for_exact_child(parent, &source, slot_idx, kernels, ctx)? + { + return Ok(Some(result)); + } + current = source; + } + } + + Ok(None) +} + +fn execute_parent_for_exact_child( + parent: &ArrayRef, + child: &ArrayRef, + slot_idx: usize, + kernels: &ParentExecutionKernels, + ctx: &mut ExecutionCtx, ) -> VortexResult> { let key = execute_parent_key(parent.encoding_id(), child.encoding_id()); if let Some(plugins) = kernels.get(&key) { diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 12a1fe6abd6..d37fdad7f28 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -1805,7 +1805,7 @@ struct PreparedScanPlanFile { segment_source_id: SegmentSourceId, scheduled_segment_source: Arc, segment_future_cache: Arc, - root: ScanPlanRef, + split_hints: Option>, projection: PreparedReadRef, predicates: Vec, } @@ -1873,8 +1873,12 @@ impl PreparedScanPlanFile { ); let mut prepare_ctx = PrepareCtx::new(session.clone()); - let projection_plan = - prepare_read(&root, &projection, file.dtype(), &session, &mut prepare_ctx)?; + let projection_pushed = push_expr(&root, &projection, file.dtype(), &session)?; + let mut split_hints = Vec::new(); + extend_split_hints(&projection_pushed, &mut split_hints); + let projection_plan = Arc::clone(&projection_pushed) + .prepare_read(&mut prepare_ctx)? + .ok_or_else(|| vortex_err!("scan2 could not plan read for expression {projection}"))?; // Run cheap, likely-selective conjuncts first so an expensive residual (e.g. 
an FSST `LIKE`) // only evaluates over the rows that survive the cheaper predicates. AND is commutative, so @@ -1890,6 +1894,7 @@ impl PreparedScanPlanFile { u32::try_from(idx).map_err(|_| vortex_err!("too many predicates"))?, ); let pushed = push_expr(&root, &expr, file.dtype(), &session)?; + extend_split_hints(&pushed, &mut split_hints); let read = Arc::clone(&pushed) .prepare_read(&mut prepare_ctx)? .ok_or_else(|| vortex_err!("scan2 could not plan predicate read {expr}"))?; @@ -1918,7 +1923,7 @@ impl PreparedScanPlanFile { segment_source_id, scheduled_segment_source, segment_future_cache, - root, + split_hints: normalize_split_hints(split_hints), projection: projection_plan, predicates, }) @@ -2158,7 +2163,7 @@ impl PreparedScanPlanFile { fn splits(&self) -> VortexResult>> { let mut points = vec![self.row_range.start]; - if let Some(hints) = self.root.split_hints() { + if let Some(hints) = &self.split_hints { points.extend( hints .iter() @@ -2204,16 +2209,16 @@ fn push_expr( .ok_or_else(|| vortex_err!("scan2 could not push expression {expr}")) } -fn prepare_read( - root: &ScanPlanRef, - expr: &Expression, - dtype: &DType, - session: &VortexSession, - cx: &mut PrepareCtx, -) -> VortexResult { - push_expr(root, expr, dtype, session)? - .prepare_read(cx)? 
- .ok_or_else(|| vortex_err!("scan2 could not plan read for expression {expr}")) +fn extend_split_hints(plan: &ScanPlanRef, points: &mut Vec) { + if let Some(hints) = plan.split_hints() { + points.extend_from_slice(hints); + } +} + +fn normalize_split_hints(mut hints: Vec) -> Option> { + hints.sort_unstable(); + hints.dedup(); + (!hints.is_empty()).then_some(hints) } fn check_range(range: &Range, row_count: u64) -> VortexResult<()> { From 2615c19c7c7e4b4806e0fd3204db5c3b3b804aa7 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sun, 21 Jun 2026 18:53:10 -0400 Subject: [PATCH 22/48] Improve sparse OnPair scan projection Signed-off-by: Nicholas Gates --- .../experimental/onpair/src/compute/like.rs | 471 ++++++++++++++++++ .../experimental/onpair/src/compute/mod.rs | 1 + encodings/experimental/onpair/src/kernel.rs | 3 + vortex-array/src/arrays/filter/kernel.rs | 9 +- vortex-array/src/arrays/filter/rules.rs | 31 +- vortex-array/src/arrays/shared/vtable.rs | 8 + vortex-array/src/executor.rs | 25 +- vortex-array/src/scalar_fn/fns/like/kernel.rs | 45 ++ vortex-layout/src/scan/v2/layouts/dict.rs | 30 +- vortex-scan/src/plan/mod.rs | 12 +- 10 files changed, 607 insertions(+), 28 deletions(-) create mode 100644 encodings/experimental/onpair/src/compute/like.rs diff --git a/encodings/experimental/onpair/src/compute/like.rs b/encodings/experimental/onpair/src/compute/like.rs new file mode 100644 index 00000000000..fcad2d68a3b --- /dev/null +++ b/encodings/experimental/onpair/src/compute/like.rs @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use memchr::memmem::Finder; +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_buffer::BitBuffer; +use vortex_error::VortexResult; +use 
vortex_error::vortex_ensure; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::OnPairArraySlotsExt; +use crate::decode::code_boundary_at; +use crate::decode::collect_widened; + +#[derive(Clone, Copy)] +enum SimpleLike<'a> { + All, + Exact(&'a [u8]), + Prefix(&'a [u8]), + Suffix(&'a [u8]), + Contains(&'a [u8]), +} + +impl LikeKernel for OnPair { + fn like( + array: ArrayView<'_, Self>, + pattern: &ArrayRef, + options: LikeOptions, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let Some(pattern_scalar) = pattern.as_constant() else { + return Ok(None); + }; + if options.case_insensitive { + return Ok(None); + } + + let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() { + let Some(v) = s.value() else { + return Ok(None); + }; + v.as_ref() + } else if let Some(b) = pattern_scalar.as_binary_opt() { + let Some(v) = b.value() else { + return Ok(None); + }; + v + } else { + return Ok(None); + }; + let Some(parsed) = parse_simple_like(pattern_bytes) else { + return Ok(None); + }; + + let codes_offsets = array.codes_offsets(); + let code_start = code_boundary_at(codes_offsets, 0, ctx)?; + let code_end = code_boundary_at(codes_offsets, array.len(), ctx)?; + vortex_ensure!( + code_start <= code_end, + "OnPair codes_offsets must be nondecreasing" + ); + vortex_ensure!( + code_end <= array.codes().len(), + "OnPair codes_offsets end {} exceeds codes len {}", + code_end, + array.codes().len() + ); + + let codes = collect_widened::(&array.codes().slice(code_start..code_end)?, ctx)?; + let code_offsets = collect_widened::(codes_offsets, ctx)?; + let dict_offsets = collect_widened::(array.dict_offsets(), ctx)?; + let dict_bytes = array.dict_bytes(); + let dict_bytes = dict_bytes.as_slice(); + let mut tail = Vec::new(); + let mut scratch = Vec::new(); + let finder = match parsed { + SimpleLike::Contains(needle) => Some(Finder::new(needle)), + _ => None, + }; + + let bits = BitBuffer::collect_bool(array.len(), |row| { + let matched = match parsed { + 
SimpleLike::All => true, + SimpleLike::Exact(needle) => row_matches_exact( + row_codes(&code_offsets, &codes, code_start, row), + dict_bytes, + dict_offsets.as_slice(), + needle, + ), + SimpleLike::Prefix(needle) => row_matches_prefix( + row_codes(&code_offsets, &codes, code_start, row), + dict_bytes, + dict_offsets.as_slice(), + needle, + ), + SimpleLike::Suffix(needle) => row_matches_suffix( + row_codes(&code_offsets, &codes, code_start, row), + dict_bytes, + dict_offsets.as_slice(), + needle, + &mut tail, + ), + SimpleLike::Contains(needle) => row_matches_contains( + row_codes(&code_offsets, &codes, code_start, row), + dict_bytes, + dict_offsets.as_slice(), + needle, + finder + .as_ref() + .expect("contains pattern has a memmem finder"), + &mut tail, + &mut scratch, + ), + }; + matched ^ options.negated + }); + + let validity = array + .array_validity() + .union_nullability(pattern_scalar.dtype().nullability()); + Ok(Some(BoolArray::new(bits, validity).into_array())) + } +} + +fn parse_simple_like(pattern: &[u8]) -> Option> { + if pattern.is_empty() { + return Some(SimpleLike::Exact(b"")); + } + if pattern.iter().any(|&b| matches!(b, b'_' | b'\\')) { + return None; + } + + let Some(first_literal) = pattern.iter().position(|&b| b != b'%') else { + return Some(SimpleLike::All); + }; + let last_literal = pattern.iter().rposition(|&b| b != b'%')? 
+ 1; + let literal = &pattern[first_literal..last_literal]; + if literal.contains(&b'%') { + return None; + } + + match (first_literal == 0, last_literal == pattern.len()) { + (true, true) => Some(SimpleLike::Exact(literal)), + (true, false) => Some(SimpleLike::Prefix(literal)), + (false, true) => Some(SimpleLike::Suffix(literal)), + (false, false) => Some(SimpleLike::Contains(literal)), + } +} + +fn row_codes<'a>( + code_offsets: &[u64], + codes: &'a [u16], + code_start: usize, + row: usize, +) -> &'a [u16] { + let start = code_offsets[row] as usize - code_start; + let end = code_offsets[row + 1] as usize - code_start; + &codes[start..end] +} + +fn token_bytes<'a>(dict_bytes: &'a [u8], dict_offsets: &[u32], code: u16) -> &'a [u8] { + let code = usize::from(code); + let start = dict_offsets[code] as usize; + let end = dict_offsets[code + 1] as usize; + &dict_bytes[start..end] +} + +fn row_matches_exact( + codes: &[u16], + dict_bytes: &[u8], + dict_offsets: &[u32], + needle: &[u8], +) -> bool { + let mut matched = 0; + for &code in codes { + let token = token_bytes(dict_bytes, dict_offsets, code); + if matched + token.len() > needle.len() { + return false; + } + if token != &needle[matched..matched + token.len()] { + return false; + } + matched += token.len(); + } + matched == needle.len() +} + +fn row_matches_prefix( + codes: &[u16], + dict_bytes: &[u8], + dict_offsets: &[u32], + needle: &[u8], +) -> bool { + if needle.is_empty() { + return true; + } + + let mut matched = 0; + for &code in codes { + let token = token_bytes(dict_bytes, dict_offsets, code); + let take = (needle.len() - matched).min(token.len()); + if token[..take] != needle[matched..matched + take] { + return false; + } + matched += take; + if matched == needle.len() { + return true; + } + } + false +} + +fn row_matches_suffix( + codes: &[u16], + dict_bytes: &[u8], + dict_offsets: &[u32], + needle: &[u8], + tail: &mut Vec, +) -> bool { + if needle.is_empty() { + return true; + } + + let mut total_len 
= 0; + tail.clear(); + for &code in codes { + let token = token_bytes(dict_bytes, dict_offsets, code); + total_len += token.len(); + append_tail(tail, token, needle.len()); + } + total_len >= needle.len() && tail.as_slice() == needle +} + +fn row_matches_contains( + codes: &[u16], + dict_bytes: &[u8], + dict_offsets: &[u32], + needle: &[u8], + finder: &Finder<'_>, + tail: &mut Vec, + scratch: &mut Vec, +) -> bool { + if needle.is_empty() { + return true; + } + + tail.clear(); + for &code in codes { + let token = token_bytes(dict_bytes, dict_offsets, code); + if finder.find(token).is_some() { + return true; + } + if !tail.is_empty() { + scratch.clear(); + scratch.extend_from_slice(tail); + scratch.extend_from_slice(token); + if finder.find(scratch).is_some() { + return true; + } + } + append_tail(tail, token, needle.len() - 1); + } + false +} + +fn append_tail(tail: &mut Vec, bytes: &[u8], max_len: usize) { + if max_len == 0 { + return; + } + if bytes.len() >= max_len { + tail.clear(); + tail.extend_from_slice(&bytes[bytes.len() - max_len..]); + return; + } + let overflow = tail.len() + bytes.len(); + if overflow > max_len { + tail.drain(..overflow - max_len); + } + tail.extend_from_slice(bytes); +} + +#[cfg(test)] +mod tests { + use std::sync::LazyLock; + + use vortex_array::ArrayRef; + use vortex_array::Canonical; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::ScalarFn; + use vortex_array::arrays::SharedArray; + use vortex_array::arrays::VarBinArray; + use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; + use vortex_array::assert_arrays_eq; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::scalar_fn::fns::like::Like; + use vortex_array::scalar_fn::fns::like::LikeOptions; + use vortex_error::VortexResult; + use vortex_mask::Mask; + use vortex_session::VortexSession; + + use 
crate::OnPair; + use crate::compress::DEFAULT_DICT12_CONFIG; + use crate::compress::onpair_compress; + + static SESSION: LazyLock = LazyLock::new(|| { + let session = vortex_array::array_session(); + crate::initialize(&session); + session + }); + + fn run_like( + values: &[Option<&str>], + pattern: &str, + options: LikeOptions, + ) -> VortexResult { + let input = + VarBinArray::from_iter(values.iter().copied(), DType::Utf8(Nullability::Nullable)); + let len = input.len(); + let dtype = input.dtype().clone(); + let array = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG)?.into_array(); + let pattern = ConstantArray::new(pattern, len).into_array(); + let result = Like + .try_new_array(len, options, [array, pattern])? + .into_array() + .execute::(&mut SESSION.create_execution_ctx())? + .into_bool(); + Ok(result) + } + + #[test] + fn like_contains() -> VortexResult<()> { + let result = run_like( + &[ + Some("https://google.example"), + Some("no match"), + Some("prefix Google suffix"), + None, + ], + "%Google%", + LikeOptions::default(), + )?; + assert_arrays_eq!( + &result, + &BoolArray::from_iter([Some(false), Some(false), Some(true), None]) + ); + Ok(()) + } + + #[test] + fn like_prefix_suffix_exact_and_negated() -> VortexResult<()> { + let values = [ + Some("2020-10-01"), + Some("2020-11-01"), + Some("x-2020-10-01"), + Some(""), + ]; + assert_arrays_eq!( + &run_like(&values, "2020-10-%", LikeOptions::default())?, + &BoolArray::from_iter([Some(true), Some(false), Some(false), Some(false)]) + ); + assert_arrays_eq!( + &run_like(&values, "%-01", LikeOptions::default())?, + &BoolArray::from_iter([Some(true), Some(true), Some(true), Some(false)]) + ); + assert_arrays_eq!( + &run_like(&values, "2020-10-01", LikeOptions::default())?, + &BoolArray::from_iter([Some(true), Some(false), Some(false), Some(false)]) + ); + assert_arrays_eq!( + &run_like( + &values, + "%2020%", + LikeOptions { + negated: true, + case_insensitive: false, + }, + )?, + 
&BoolArray::from_iter([Some(false), Some(false), Some(false), Some(true)]) + ); + Ok(()) + } + + #[test] + fn like_filtered_onpair_stays_lazy_after_one_step() -> VortexResult<()> { + let input = VarBinArray::from_iter( + [ + Some("Google alpha"), + Some("irrelevant"), + Some("Google beta"), + Some("other"), + ], + DType::Utf8(Nullability::Nullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let array = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG)?.into_array(); + let filtered = array.filter(Mask::from_iter([true, false, true, false]))?; + let pattern = ConstantArray::new("%Google%", filtered.len()).into_array(); + let like = Like + .try_new_array(filtered.len(), LikeOptions::default(), [filtered, pattern])? + .into_array(); + + let stepped = like.execute::(&mut SESSION.create_execution_ctx())?; + assert!(stepped.is::()); + assert!(stepped.children()[0].is::()); + + let result = stepped + .execute::(&mut SESSION.create_execution_ctx())? + .into_bool(); + assert_arrays_eq!(&result, &BoolArray::from_iter([Some(true), Some(true)])); + Ok(()) + } + + #[test] + fn filter_shared_onpair_stays_encoded_after_one_step() -> VortexResult<()> { + let input = VarBinArray::from_iter( + [ + Some("Google alpha"), + Some("irrelevant"), + Some("Google beta"), + Some("other"), + ], + DType::Utf8(Nullability::Nullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let array = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG)?.into_array(); + let shared = SharedArray::new(array).into_array(); + let filtered = shared.filter(Mask::from_iter([true, false, true, false]))?; + + let stepped = filtered.execute::(&mut SESSION.create_execution_ctx())?; + assert!(stepped.is::()); + assert_eq!(stepped.len(), 2); + Ok(()) + } + + #[test] + fn filter_sliced_onpair_stays_encoded_after_one_step() -> VortexResult<()> { + let input = VarBinArray::from_iter( + [ + Some("prefix"), + Some("Google alpha"), + Some("irrelevant"), + 
Some("Google beta"), + Some("suffix"), + ], + DType::Utf8(Nullability::Nullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let array = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG)?.into_array(); + let sliced = array.slice(1..4)?; + let filtered = sliced.filter(Mask::from_iter([true, false, true]))?; + + let stepped = filtered.execute::(&mut SESSION.create_execution_ctx())?; + assert!(stepped.is::()); + assert_eq!(stepped.len(), 2); + Ok(()) + } +} diff --git a/encodings/experimental/onpair/src/compute/mod.rs b/encodings/experimental/onpair/src/compute/mod.rs index 4ad5f48f578..46cf8bf8bab 100644 --- a/encodings/experimental/onpair/src/compute/mod.rs +++ b/encodings/experimental/onpair/src/compute/mod.rs @@ -5,4 +5,5 @@ mod byte_length; mod cast; mod compare; mod filter; +mod like; mod slice; diff --git a/encodings/experimental/onpair/src/kernel.rs b/encodings/experimental/onpair/src/kernel.rs index 8863d750a72..ed216bfa904 100644 --- a/encodings/experimental/onpair/src/kernel.rs +++ b/encodings/experimental/onpair/src/kernel.rs @@ -10,6 +10,8 @@ use vortex_array::scalar_fn::fns::binary::Binary; use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; use vortex_array::scalar_fn::fns::byte_length::ByteLength; use vortex_array::scalar_fn::fns::byte_length::ByteLengthExecuteAdaptor; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeExecuteAdaptor; use vortex_session::VortexSession; use crate::OnPair; @@ -24,4 +26,5 @@ pub(super) fn initialize(session: &VortexSession) { OnPair, ByteLengthExecuteAdaptor(OnPair), ); + kernels.register_execute_parent_kernel(Like.id(), OnPair, LikeExecuteAdaptor(OnPair)); } diff --git a/vortex-array/src/arrays/filter/kernel.rs b/vortex-array/src/arrays/filter/kernel.rs index 21bd225bf55..4213d692509 100644 --- a/vortex-array/src/arrays/filter/kernel.rs +++ b/vortex-array/src/arrays/filter/kernel.rs @@ -26,11 +26,14 @@ use 
crate::kernel::ExecuteParentKernel; use crate::matcher::Matcher; use crate::optimizer::kernels::ArrayKernelsExt; use crate::optimizer::rules::ArrayParentReduceRule; +use crate::scalar_fn::ScalarFnVTable; +use crate::scalar_fn::fns::like::Like; +use crate::scalar_fn::fns::like::LikeFilterExecuteAdaptor; pub(crate) fn initialize(session: &VortexSession) { - session - .kernels() - .register_execute_parent_kernel(Dict.id(), Filter, TakeExecuteAdaptor(Filter)); + let kernels = session.kernels(); + kernels.register_execute_parent_kernel(Dict.id(), Filter, TakeExecuteAdaptor(Filter)); + kernels.register_execute_parent_kernel(Like.id(), Filter, LikeFilterExecuteAdaptor); } pub trait FilterReduce: VTable { diff --git a/vortex-array/src/arrays/filter/rules.rs b/vortex-array/src/arrays/filter/rules.rs index ffa5c64bd61..68031459f73 100644 --- a/vortex-array/src/arrays/filter/rules.rs +++ b/vortex-array/src/arrays/filter/rules.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use vortex_error::VortexResult; +use vortex_mask::AllOr; use vortex_mask::Mask; use crate::ArrayRef; @@ -9,11 +10,13 @@ use crate::Canonical; use crate::IntoArray; use crate::array::ArrayView; use crate::arrays::Filter; +use crate::arrays::Slice; use crate::arrays::Struct; use crate::arrays::StructArray; use crate::arrays::filter::FilterArrayExt; use crate::arrays::filter::FilterReduce; use crate::arrays::filter::FilterReduceAdaptor; +use crate::arrays::slice::SliceArrayExt; use crate::arrays::struct_::StructDataParts; use crate::optimizer::rules::ArrayReduceRule; use crate::optimizer::rules::ParentRuleSet; @@ -23,7 +26,7 @@ pub(super) const PARENT_RULES: ParentRuleSet = ParentRuleSet::new(&[ParentRuleSet::lift(&FilterReduceAdaptor(Filter))]); pub(super) const RULES: ReduceRuleSet = - ReduceRuleSet::new(&[&TrivialFilterRule, &FilterStructRule]); + ReduceRuleSet::new(&[&TrivialFilterRule, &FilterSliceRule, &FilterStructRule]); impl FilterReduce for Filter { fn filter(array: 
ArrayView<'_, Self>, mask: &Mask) -> VortexResult> { @@ -47,6 +50,32 @@ impl ArrayReduceRule for TrivialFilterRule { } } +/// A reduce rule that pushes a filter through a slice by expanding the +/// slice-local mask back into the child row domain. +#[derive(Debug)] +struct FilterSliceRule; + +impl ArrayReduceRule for FilterSliceRule { + fn reduce(&self, array: ArrayView<'_, Filter>) -> VortexResult> { + let mask = array.filter_mask(); + let Some(slice) = array.child().as_opt::() else { + return Ok(None); + }; + let range = slice.slice_range(); + let child_len = slice.child().len(); + let child_mask = match mask.indices() { + AllOr::All => Mask::from_slices(child_len, vec![(range.start, range.end)]), + AllOr::None => Mask::new_false(child_len), + AllOr::Some(indices) => Mask::from_indices( + child_len, + indices.iter().copied().map(|idx| range.start + idx), + ), + }; + + Ok(Some(slice.child().filter(child_mask)?)) + } +} + /// A reduce rule that pushes a filter down into the fields of a StructArray. 
#[derive(Debug)] struct FilterStructRule; diff --git a/vortex-array/src/arrays/shared/vtable.rs b/vortex-array/src/arrays/shared/vtable.rs index 3c3a09216d2..fc03255d785 100644 --- a/vortex-array/src/arrays/shared/vtable.rs +++ b/vortex-array/src/arrays/shared/vtable.rs @@ -113,6 +113,14 @@ impl VTable for Shared { .get_or_compute(|source| source.clone().execute::(ctx)) .map(ExecutionResult::done) } + + fn reduce_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult> { + array.current_array_ref().reduce_parent(parent, child_idx) + } } impl OperationsVTable for Shared { fn scalar_at( diff --git a/vortex-array/src/executor.rs b/vortex-array/src/executor.rs index bd8f0f115fb..c96e19c4679 100644 --- a/vortex-array/src/executor.rs +++ b/vortex-array/src/executor.rs @@ -34,7 +34,6 @@ use crate::ArrayRef; use crate::Canonical; use crate::IntoArray; use crate::array::ArrayId; -use crate::arrays::ScalarFn; use crate::arrays::Shared; use crate::arrays::shared::SharedArrayExt; use crate::builders::ArrayBuilder; @@ -576,21 +575,19 @@ fn execute_parent_for_child( return Ok(Some(result)); } - // Shared is a transparent cache wrapper. For scalar-function parents, try kernels against the - // wrapped source/current array before forcing Shared to canonicalize and populate its cache. - if parent.is::() { - let mut current = child.clone(); - while let Some(source) = current - .as_opt::() - .map(|shared| shared.current_array_ref().clone()) + // Shared is a transparent cache wrapper. Try kernels against the wrapped source/current array + // before forcing Shared to canonicalize and populate its cache. + let mut current = child.clone(); + while let Some(source) = current + .as_opt::() + .map(|shared| shared.current_array_ref().clone()) + { + if let Some(result) = + execute_parent_for_exact_child(parent, &source, slot_idx, kernels, ctx)? { - if let Some(result) = - execute_parent_for_exact_child(parent, &source, slot_idx, kernels, ctx)? 
- { - return Ok(Some(result)); - } - current = source; + return Ok(Some(result)); } + current = source; } Ok(None) diff --git a/vortex-array/src/scalar_fn/fns/like/kernel.rs b/vortex-array/src/scalar_fn/fns/like/kernel.rs index b3b683212ff..e62f41d9f92 100644 --- a/vortex-array/src/scalar_fn/fns/like/kernel.rs +++ b/vortex-array/src/scalar_fn/fns/like/kernel.rs @@ -6,9 +6,12 @@ use vortex_error::VortexResult; use crate::ArrayRef; use crate::ExecutionCtx; +use crate::IntoArray; use crate::array::ArrayView; use crate::array::VTable; +use crate::arrays::Filter; use crate::arrays::ScalarFn; +use crate::arrays::ScalarFnArray; use crate::arrays::scalar_fn::ExactScalarFn; use crate::arrays::scalar_fn::ScalarFnArrayExt; use crate::arrays::scalar_fn::ScalarFnArrayView; @@ -105,3 +108,45 @@ where ::like(array, pattern, options, ctx) } } + +/// Adaptor that executes a filtered input before evaluating LIKE. +/// +/// This preserves sparse row demand for `LIKE(Filter(child), constant)`: the filter executes into a +/// filtered child first, then the regular child-specific LIKE execute-parent kernel can run over +/// only the selected rows. 
+#[derive(Default, Debug)] +pub struct LikeFilterExecuteAdaptor; + +impl ExecuteParentKernel for LikeFilterExecuteAdaptor { + type Parent = ExactScalarFn; + + fn execute_parent( + &self, + array: ArrayView<'_, Filter>, + parent: ScalarFnArrayView<'_, LikeExpr>, + child_idx: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + if child_idx != 0 { + return Ok(None); + } + let scalar_fn_array = parent + .as_opt::() + .vortex_expect("ExactScalarFn matcher confirmed ScalarFnArray"); + let filtered = array.array().clone().execute::(ctx)?; + let children = scalar_fn_array + .iter_children() + .enumerate() + .map(|(idx, child)| { + if idx == child_idx { + filtered.clone() + } else { + child.clone() + } + }) + .collect(); + Ok(Some( + ScalarFnArray::try_new(scalar_fn_array.scalar_fn().clone(), children)?.into_array(), + )) + } +} diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 18fedfa5ed9..c0c684666ea 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -166,6 +166,27 @@ fn sparse_value_expr_candidate(expr: &Expression, values_len: u64, rows: RowScop sparse_dict_candidate(values_len, rows) && value_expr_is_expensive(expr) } +fn value_expr_candidate(expr: &Expression, values_len: u64, rows: RowScope<'_>) -> bool { + if sparse_value_expr_candidate(expr, values_len, rows) { + return false; + } + if !value_expr_is_expensive(expr) { + return true; + } + + let Ok(values_len) = usize::try_from(values_len) else { + return false; + }; + let demand = rows.demand.true_count(); + // Dense scans will usually touch every morsel in this dictionary. Since value-domain + // expressions are cached per DictScanState, allow a small amount of look-ahead instead of + // repeatedly evaluating expensive predicates over decoded row values. 
+ values_len <= demand + || (rows.selection.all_true() + && rows.demand.all_true() + && values_len <= demand.saturating_mul(4)) +} + impl DictScanPlan { /// The values relation wrapped in a `SharedArray`, read once per query. fn values( @@ -648,12 +669,9 @@ impl PreparedRead for DictExprPreparedRead { Box::pin(async move { let sparse_candidate = sparse_value_expr_candidate(&self.node.expr, self.node.dict.values_len, rows); - let value_expr = if !sparse_candidate - && (!value_expr_is_expensive(&self.node.expr) - || matches!( - usize::try_from(self.node.dict.values_len), - Ok(values_len) if values_len <= rows.demand.true_count() - )) { + let value_candidate = + value_expr_candidate(&self.node.expr, self.node.dict.values_len, rows); + let value_expr = if value_candidate { self.value_expr(io, &self.state, local).await? } else { None diff --git a/vortex-scan/src/plan/mod.rs b/vortex-scan/src/plan/mod.rs index 3c519fa4c99..34b2f14cd23 100644 --- a/vortex-scan/src/plan/mod.rs +++ b/vortex-scan/src/plan/mod.rs @@ -24,6 +24,7 @@ use std::sync::Arc; use std::sync::OnceLock; use futures::future::BoxFuture; +use futures::future::try_join_all; use parking_lot::Mutex; use rustc_hash::FxHashMap; use vortex_array::ArrayRef; @@ -1000,10 +1001,13 @@ impl PreparedRead for StructValuePreparedRead { local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { - let mut arrays = Vec::with_capacity(self.fields.len()); - for field in &self.fields { - arrays.push(field.read_scoped(range.clone(), rows, io, local).await?); - } + let session = local.session().clone(); + let arrays = try_join_all(self.fields.iter().map(|field| { + let range = range.clone(); + let mut child_ctx = session.create_execution_ctx(); + async move { field.read_scoped(range, rows, io, &mut child_ctx).await } + })) + .await?; let validity = match &self.validity { Some(validity) => { let array = validity.read_scoped(range, rows, io, local).await?; From 5c6e8249584b27e1ce2afa420d2b5353b940fd5a Mon 
Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sun, 21 Jun 2026 19:23:53 -0400 Subject: [PATCH 23/48] Reduce V2 duplicate scan requests Signed-off-by: Nicholas Gates --- vortex-datafusion/src/persistent/opener.rs | 31 ++++++++++++-- vortex-datafusion/src/persistent/source.rs | 5 +++ vortex-file/src/file.rs | 19 +++++++++ vortex-file/src/multi/scan_v2.rs | 10 +++-- vortex-layout/src/scan/v2/layouts/flat.rs | 47 ++++++++++++++++------ vortex-scan/src/plan/mod.rs | 14 +++++++ 6 files changed, 106 insertions(+), 20 deletions(-) diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 4d28282e1f5..ba045ede70f 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -103,6 +103,8 @@ pub(crate) struct VortexOpener { pub layout_readers: Arc>>, /// Shared full-file natural split ranges keyed by file path. pub natural_split_ranges: Arc]>>>, + /// Shared V2 file handles keyed by file path. + pub vortex_files: Arc>>, /// Whether the query has output ordering specified pub has_output_ordering: bool, @@ -139,6 +141,7 @@ impl FileOpener for VortexOpener { let limit = self.limit; let layout_readers = Arc::clone(&self.layout_readers); let natural_split_ranges = Arc::clone(&self.natural_split_ranges); + let vortex_files = Arc::clone(&self.vortex_files); let has_output_ordering = self.has_output_ordering; let scan_concurrency = self.scan_concurrency; @@ -216,10 +219,24 @@ impl FileOpener for VortexOpener { open_opts = open_opts.with_footer(footer); } - let vxf = open_opts - .open_read(reader) - .await - .map_err(|e| exec_datafusion_err!("Failed to open Vortex file {e}"))?; + let vxf = if let Some(hit) = vortex_files.get(&file.object_meta.location) { + Arc::clone(hit.value()) + } else { + let opened = Arc::new( + open_opts + .open_read(reader) + .await + .map_err(|e| exec_datafusion_err!("Failed to open Vortex file {e}"))?, + ); + + match 
vortex_files.entry(file.object_meta.location.clone()) { + Entry::Occupied(entry) => Arc::clone(entry.get()), + Entry::Vacant(entry) => { + entry.insert(Arc::clone(&opened)); + opened + } + } + }; // On a miss, cache the parsed footer so other partitions and later executions // skip the footer fetch and parse. `infer_schema`/`infer_stats` also populate @@ -915,6 +932,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -1111,6 +1129,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -1199,6 +1218,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -1357,6 +1377,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -1418,6 +1439,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: 
Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, @@ -1628,6 +1650,7 @@ mod tests { metrics_registry: Arc::new(DefaultMetricsRegistry::default()), layout_readers: Default::default(), natural_split_ranges: Default::default(), + vortex_files: Default::default(), has_output_ordering: false, expression_convertor: Arc::new(DefaultExpressionConvertor::default()), file_metadata_cache: None, diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index 3c0bd5355b7..74f4d5520b0 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -32,6 +32,7 @@ use object_store::ObjectStore; use object_store::path::Path; use vortex::error::VortexExpect; use vortex::file::VORTEX_FILE_EXTENSION; +use vortex::file::VortexFile; use vortex::layout::LayoutReader; use vortex::layout::scan::v2::scan2_enabled; use vortex::metrics::DefaultMetricsRegistry; @@ -200,6 +201,8 @@ pub struct VortexSource { layout_readers: Arc>>, /// Shared full-file natural split ranges keyed by path. natural_split_ranges: Arc]>>>, + /// Shared V2 file handles keyed by path. 
+ vortex_files: Arc>>, expression_convertor: Arc, pub(crate) vortex_reader_factory: Option>, pub(crate) ordered: bool, @@ -233,6 +236,7 @@ impl VortexSource { _unused_df_metrics: Default::default(), layout_readers: Arc::new(DashMap::default()), natural_split_ranges: Arc::new(DashMap::default()), + vortex_files: Arc::new(DashMap::default()), expression_convertor: Arc::new(DefaultExpressionConvertor::default()), vortex_reader_factory: None, vx_metrics_registry: Arc::new(DefaultMetricsRegistry::default()), @@ -367,6 +371,7 @@ impl VortexSource { metrics_registry: Arc::clone(&self.vx_metrics_registry), layout_readers: Arc::clone(&self.layout_readers), natural_split_ranges: Arc::clone(&self.natural_split_ranges), + vortex_files: Arc::clone(&self.vortex_files), has_output_ordering: !base_config.output_ordering.is_empty() || self.ordered, expression_convertor: Arc::clone(&self.expression_convertor), file_metadata_cache: self.file_metadata_cache.clone(), diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index 0e833134389..68d9853139b 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -30,7 +30,10 @@ use vortex_layout::segments::SegmentInfo; use vortex_layout::segments::SegmentSource; use vortex_scan::DataSourceRef; use vortex_scan::ScanRequest; +use vortex_scan::plan::PreparedStateCache; +use vortex_scan::plan::PreparedStateCacheRef; use vortex_scan::plan::ScanPlanRef; +use vortex_scan::segments::SegmentFutureCache; use vortex_session::VortexSession; use crate::FileStatistics; @@ -58,6 +61,10 @@ pub struct VortexFile { layout_reader_cache: Option>>, /// Shared cache for the v2 physical scan plan root. scan_plan_root_cache: Arc>, + /// Shared cache for v2 prepared state across row-range scans of this file. + scan_plan_state_cache: PreparedStateCacheRef, + /// Shared cache for v2 in-flight segment futures across row-range scans of this file. 
+ scan_plan_segment_future_cache: Arc, } fn layout_reader( @@ -104,6 +111,8 @@ impl VortexFile { session, layout_reader_cache: None, scan_plan_root_cache: Arc::new(OnceLock::new()), + scan_plan_state_cache: Arc::new(PreparedStateCache::default()), + scan_plan_segment_future_cache: Arc::new(SegmentFutureCache::new()), } } @@ -116,6 +125,8 @@ impl VortexFile { session: self.session, layout_reader_cache: Some(OnceLock::new()), scan_plan_root_cache: self.scan_plan_root_cache, + scan_plan_state_cache: self.scan_plan_state_cache, + scan_plan_segment_future_cache: self.scan_plan_segment_future_cache, } } @@ -203,6 +214,14 @@ impl VortexFile { Ok(root) } + pub(crate) fn scan_plan_state_cache(&self) -> PreparedStateCacheRef { + Arc::clone(&self.scan_plan_state_cache) + } + + pub(crate) fn scan_plan_segment_future_cache(&self) -> Arc { + Arc::clone(&self.scan_plan_segment_future_cache) + } + /// Create a [`DataSource`](vortex_scan::DataSource) from this file for scanning. /// /// Wraps the file's layout reader with [`FileStatsLayoutReader`] (when file-level diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index d37fdad7f28..c177ae98fcc 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -1089,8 +1089,9 @@ impl Work { ) -> Self { let known_bytes = registered.bytes(); let future = async move { - let _registered = registered; - future.await + let result = future.await; + drop(registered); + result } .boxed(); Self { @@ -1862,7 +1863,7 @@ impl PreparedScanPlanFile { }, ); let scheduled_segment_source = Arc::clone(®istered_source.source); - let segment_future_cache = Arc::new(SegmentFutureCache::new()); + let segment_future_cache = file.scan_plan_segment_future_cache(); let reader = FileReader::new( Arc::new(ScheduledSegmentSourceReader::new( segment_source_id, @@ -1872,7 +1873,8 @@ impl PreparedScanPlanFile { session.clone(), ); - let mut prepare_ctx = PrepareCtx::new(session.clone()); + let mut prepare_ctx = + 
PrepareCtx::with_state_cache(session.clone(), file.scan_plan_state_cache()); let projection_pushed = push_expr(&root, &projection, file.dtype(), &session)?; let mut split_hints = Vec::new(); extend_split_hints(&projection_pushed, &mut split_hints); diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 2f456237318..f33d2103e4e 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -13,6 +13,7 @@ use std::fmt; use std::ops::Range; use std::sync::Arc; +use futures::FutureExt; use futures::future::BoxFuture; use parking_lot::Mutex; use vortex_array::ArrayRef; @@ -21,6 +22,7 @@ use vortex_array::IntoArray; use vortex_array::arrays::SliceArray; use vortex_array::expr::Expression; use vortex_array::serde::SerializedArray; +use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; @@ -44,6 +46,7 @@ use vortex_session::VortexSession; use crate::layout_v2::Flat; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; +use crate::layouts::SharedArrayFuture; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; @@ -63,12 +66,10 @@ pub struct FlatScanPlan { layout: LayoutRef, } -/// Per-query cache of the parsed (still lazy) array. Concurrent decodes -/// are benign: the segment fetch is deduplicated by the shared segment -/// source, and last-write-wins on the parsed array. +/// Per-query cache of the parsed (still lazy) array. 
#[derive(Default)] pub struct FlatScanState { - array: Mutex>, + array: Mutex>, } struct FlatPreparedRead { @@ -76,6 +77,27 @@ struct FlatPreparedRead { state: Arc, } +impl FlatScanPlan { + fn array(&self, io: &FileReader, state: &FlatScanState) -> SharedArrayFuture { + if let Some(hit) = state.array.lock().clone() { + return hit; + } + + let mut guard = state.array.lock(); + if let Some(hit) = guard.clone() { + return hit; + } + + let layout = self.layout.clone(); + let io = io.clone(); + let future = async move { decode_flat(&layout, &io).await.map_err(Arc::new) } + .boxed() + .shared(); + *guard = Some(future.clone()); + future + } +} + impl ScanPlan for FlatScanPlan { fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(FlatScanState::default())) @@ -90,7 +112,10 @@ impl ScanPlan for FlatScanPlan { } fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { - let key = PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); + let flat = self.layout.as_opt::().ok_or_else(|| { + vortex_err!("expected flat layout, got {}", self.layout.encoding_id()) + })?; + let key = PreparedStateKey::new::(*flat.data().segment_id() as usize); let state = cx.shared_state(key, || Ok(FlatScanState::default()))?; Ok(Some(Arc::new(FlatPreparedRead { node: self, state }))) } @@ -120,13 +145,11 @@ impl PreparedRead for FlatPreparedRead { _local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { - let array = if let Some(hit) = self.state.array.lock().clone() { - hit - } else { - let decoded = decode_flat(&self.node.layout, io).await?; - *self.state.array.lock() = Some(decoded.clone()); - decoded - }; + let array = self + .node + .array(io, &self.state) + .await + .map_err(VortexError::from)?; let dense = slice_to_range(array, &range)?; if rows.selection.len() != dense.len() { vortex_bail!( diff --git a/vortex-scan/src/plan/mod.rs b/vortex-scan/src/plan/mod.rs index 34b2f14cd23..14fb5cd48bf 100644 --- 
a/vortex-scan/src/plan/mod.rs +++ b/vortex-scan/src/plan/mod.rs @@ -1242,6 +1242,20 @@ impl PreparedRead for MaskPreparedRead { }) } + fn segment_requests( + &self, + range: Range, + rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + let mut requests = self.input.segment_requests(range.clone(), rows, cx)?; + if requests.is_unknown() { + return Ok(requests); + } + requests.extend(self.validity.segment_requests(range, rows, cx)?); + Ok(requests) + } + fn release(&self, frontier: u64) -> VortexResult<()> { self.input.release(frontier)?; self.validity.release(frontier) From e557c0c6ff6edaeb327d21caf9c3b68e22a8704f Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sun, 21 Jun 2026 22:45:07 -0400 Subject: [PATCH 24/48] Simplify scan task read scheduling Signed-off-by: Nicholas Gates --- vortex-datafusion/src/persistent/opener.rs | 2 +- vortex-file/src/file.rs | 21 - vortex-file/src/multi/scan_v2.rs | 425 +++++++++------- vortex-file/src/segments/source.rs | 15 + vortex-layout/src/scan/v2/layouts/chunked.rs | 14 +- vortex-layout/src/scan/v2/layouts/dict.rs | 12 +- vortex-layout/src/scan/v2/layouts/flat.rs | 8 +- vortex-layout/src/scan/v2/layouts/zoned.rs | 24 +- vortex-layout/src/scan/v2/row_idx.rs | 4 +- vortex-layout/src/segments/cache.rs | 5 + vortex-layout/src/segments/shared.rs | 9 + vortex-layout/src/segments/test.rs | 9 + vortex-scan/src/lib.rs | 2 - vortex-scan/src/plan/mod.rs | 108 ++-- vortex-scan/src/scheduler.rs | 145 ------ vortex-scan/src/segments/scheduled.rs | 502 +++---------------- vortex-scan/src/segments/source.rs | 4 + 17 files changed, 463 insertions(+), 846 deletions(-) diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index ba045ede70f..561848b74d6 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -685,7 +685,7 @@ fn compute_natural_split_ranges(layout_reader: &dyn LayoutReader) -> DFResult, - /// Scheduled view of the 
same segment source. - scheduled_segment_source: Arc, /// The Vortex session used to open this file. session: VortexSession, /// None id LayoutReader caching is turned off @@ -95,19 +90,9 @@ impl VortexFile { segment_source: Arc, session: VortexSession, ) -> Self { - let segment_infos: Arc<[SegmentInfo]> = footer - .segment_map() - .iter() - .map(|segment| SegmentInfo::cacheable(u64::from(segment.length))) - .collect::>() - .into(); - let scheduled_segment_source: Arc = Arc::new( - ScheduledSegmentSourceAdapter::new(Arc::clone(&segment_source), segment_infos), - ); Self { footer, segment_source, - scheduled_segment_source, session, layout_reader_cache: None, scan_plan_root_cache: Arc::new(OnceLock::new()), @@ -121,7 +106,6 @@ impl VortexFile { Self { footer: self.footer, segment_source: self.segment_source, - scheduled_segment_source: self.scheduled_segment_source, session: self.session, layout_reader_cache: Some(OnceLock::new()), scan_plan_root_cache: self.scan_plan_root_cache, @@ -160,11 +144,6 @@ impl VortexFile { Arc::clone(&self.segment_source) } - /// Return the scheduler-aware segment source for this file. - pub fn scheduled_segment_source(&self) -> Arc { - Arc::clone(&self.scheduled_segment_source) - } - /// Returns a reference to the Vortex session used to open this file. 
pub fn session(&self) -> &VortexSession { &self.session diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index c177ae98fcc..25fee82cd3e 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -64,10 +64,7 @@ use vortex_scan::ScanRequest as DataSourceScanRequest; use vortex_scan::ScanScheduler; use vortex_scan::ScanSchedulerSessionExt; use vortex_scan::ScanTicket; -use vortex_scan::SegmentSourceId; -use vortex_scan::SegmentSourceMeta; use vortex_scan::WorkRequest; -use vortex_scan::plan::FileReader; use vortex_scan::plan::OwnedRowScope; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedAggregateRef; @@ -76,6 +73,7 @@ use vortex_scan::plan::PreparedReadRef; use vortex_scan::plan::PreparedStats; use vortex_scan::plan::PreparedStatsRef; use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; use vortex_scan::plan::ScanPlan; use vortex_scan::plan::ScanPlanRef; use vortex_scan::plan::ScanState; @@ -90,14 +88,14 @@ use vortex_scan::plan::evidence::PredicateVersion; use vortex_scan::plan::request::EvidenceMode; use vortex_scan::plan::request::OwnedEvidenceRequest; use vortex_scan::plan::request::ScanRequest; +use vortex_scan::segments::CachedSegmentSource; use vortex_scan::segments::ScanIoPhase; -use vortex_scan::segments::ScheduledSegmentSource; -use vortex_scan::segments::ScheduledSegmentSourceReader; +use vortex_scan::segments::ScanRead; use vortex_scan::segments::SegmentFutureCache; use vortex_scan::segments::SegmentPlanCtx; use vortex_scan::segments::SegmentRequests; -use vortex_scan::segments::SubmittedSegmentRequests; -use vortex_scan::segments::submit_segment_requests_cached; +use vortex_scan::segments::SegmentSource; +use vortex_scan::segments::register_segment_reads_cached; use vortex_scan::selection::Selection; use vortex_session::VortexSession; use vortex_utils::parallelism::get_available_parallelism; @@ -322,7 +320,7 @@ impl PreparedStats for FilePreparedStats { fn 
stats<'a>( &'a self, range: Range, - _io: &'a FileReader, + _io: &'a ReadContext, _state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>> { Box::pin(async move { @@ -665,14 +663,15 @@ impl DataSource for ScanPlanDataSource { else { continue; }; - let prepared = Arc::new(PreparedScanPlanFile::try_new(file, request, &ticket)?); - let ranges = prepared.splits()?; + let prepared = Arc::new(PreparedScanPlan::try_new(&file, request)?); + let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket)?); + let ranges = execution.splits()?; if ranges.is_empty() { continue; } - has_runtime_evidence |= prepared.has_runtime_evidence(); + has_runtime_evidence |= execution.has_runtime_evidence(); total_morsels = total_morsels.saturating_add(ranges.len()); - planned_files.push((prepared, ranges)); + planned_files.push((execution, ranges)); } // The physical plan may expose more engine partitions than we can fill with morsels. @@ -681,11 +680,11 @@ impl DataSource for ScanPlanDataSource { let partition_count = total_morsels.min(target_partitions); let mut partitions = vec![Vec::new(); partition_count]; let mut morsel_idx = 0usize; - for (prepared, ranges) in planned_files { + for (execution, ranges) in planned_files { for range in ranges { let partition = morsel_idx % partition_count; partitions[partition].push(PlannedScanPlanMorsel { - prepared: Arc::clone(&prepared), + execution: Arc::clone(&execution), range, }); morsel_idx = morsel_idx.saturating_add(1); @@ -939,7 +938,7 @@ pub(crate) async fn scan_plan_file_statistics_many( ) -> VortexResult>>> { let session = file.session().clone(); let root = file.scan_plan_root()?; - let reader = FileReader::new(file.segment_source(), session); + let reader = ReadContext::new(file.segment_source(), session); let mut result = Vec::with_capacity(exprs.len()); for expr in exprs { let plan = if let Some(field_path) = root_field_path(expr) { @@ -984,7 +983,7 @@ pub(crate) async fn scan_plan_file_plan_splits( let Some(plan) = 
pushed.prepare_splits(&mut PrepareCtx::new(session.clone()))? else { return Ok(std::iter::once(0..file.row_count()).collect()); }; - let reader = FileReader::new(file.segment_source(), session.clone()); + let reader = ReadContext::new(file.segment_source(), session.clone()); let state = plan.init_state(&session)?; plan.splits(0..file.row_count(), &reader, state.as_ref()) .await @@ -1084,16 +1083,9 @@ impl Work { fn new( phase: ScanIoPhase, handle: Handle, - registered: SubmittedSegmentRequests, + known_bytes: u64, future: BoxFuture<'static, VortexResult>, ) -> Self { - let known_bytes = registered.bytes(); - let future = async move { - let result = future.await; - drop(registered); - result - } - .boxed(); Self { phase, known_bytes, @@ -1152,7 +1144,7 @@ struct PlannedMorselWork { } struct MorselState { - prepared: Arc, + execution: Arc, range: Range, selected: Mask, evidence: Vec>, @@ -1211,13 +1203,7 @@ fn morsel_windows( .config() .morsel_plan_window() .map(|window| window.max(launch_window).max(1)) - .unwrap_or_else(|| { - if has_runtime_evidence { - launch_window - } else { - usize::MAX - } - }); + .unwrap_or(usize::MAX); (plan_window, launch_window) } @@ -1326,7 +1312,7 @@ impl PartitionWorkSchedulerState { return Ok(()); }; let morsel_id = self.next_morsel_id; - let Some(planned) = morsel.prepared.plan_morsel(morsel_id, morsel.range)? else { + let Some(planned) = morsel.execution.plan_morsel(morsel_id, morsel.range)? else { return Ok(()); }; self.next_morsel_id = self.next_morsel_id.saturating_add(1); @@ -1440,14 +1426,14 @@ impl PartitionWorkSchedulerState { if morsel.pending_evidence != 0 { return Ok(()); } - if morsel.next_predicate >= morsel.prepared.predicates.len() { + if morsel.next_predicate >= morsel.execution.predicates.len() { if self.enqueue_recheck_evidence(morsel_id)? 
{ return Ok(()); } let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { return Ok(()); }; - let projection = morsel.prepared.plan_projection_work( + let projection = morsel.execution.plan_projection_work( morsel_id, morsel.range.clone(), morsel.selected.clone(), @@ -1464,16 +1450,16 @@ impl PartitionWorkSchedulerState { let predicate_idx = morsel.next_predicate; if morsel.evidence[predicate_idx].is_none() { let should_probe = { - let predicate = &morsel.prepared.predicates[predicate_idx]; + let predicate = &morsel.execution.predicates[predicate_idx]; !predicate.evidence.is_empty() && morsel.selected.density() >= EXPR_EVAL_THRESHOLD }; if should_probe { - let work = morsel.prepared.plan_evidence_work( + let work = morsel.execution.plan_evidence_work( morsel_id, predicate_idx, morsel.range.clone(), - morsel.prepared.predicates[predicate_idx].version(), + morsel.execution.predicates[predicate_idx].version(), EvidenceMode::Normal, )?; let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) @@ -1486,8 +1472,8 @@ impl PartitionWorkSchedulerState { } let evidence = PredicateEvidence::new( - morsel.prepared.predicates[predicate_idx].id, - morsel.prepared.predicates[predicate_idx].version(), + morsel.execution.predicates[predicate_idx].id, + morsel.execution.predicates[predicate_idx].version(), morsel.range.clone(), )?; let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { @@ -1508,12 +1494,12 @@ impl PartitionWorkSchedulerState { continue; } - let work = morsel.prepared.plan_predicate_work( + let work = morsel.execution.plan_predicate_work( morsel_id, predicate_idx, morsel.range.clone(), need, - morsel.prepared.predicates[predicate_idx].version(), + morsel.execution.predicates[predicate_idx].version(), )?; let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { return Ok(()); @@ -1529,12 +1515,12 @@ impl PartitionWorkSchedulerState { let Some(morsel) = 
self.morsels.get(morsel_id).and_then(Option::as_ref) else { return Ok(false); }; - if morsel.next_recheck_predicate >= morsel.prepared.predicates.len() { + if morsel.next_recheck_predicate >= morsel.execution.predicates.len() { return Ok(false); } let predicate_idx = morsel.next_recheck_predicate; - let predicate = &morsel.prepared.predicates[predicate_idx]; + let predicate = &morsel.execution.predicates[predicate_idx]; let current_version = predicate.version(); let evidence_version = morsel.evidence[predicate_idx] .as_ref() @@ -1545,7 +1531,7 @@ impl PartitionWorkSchedulerState { && predicate.has_recheck_evidence() && current_version != evidence_version { - let work = morsel.prepared.plan_evidence_work( + let work = morsel.execution.plan_evidence_work( morsel_id, predicate_idx, morsel.range.clone(), @@ -1666,21 +1652,22 @@ impl Partition for ScanPlanPartition { ticket, } = *self; - let prepared = Arc::new(PreparedScanPlanFile::try_new(file, request, &ticket)?); - let dtype = prepared.dtype.clone(); - let ranges = prepared.splits()?; - let ordered = prepared.ordered; + let prepared = Arc::new(PreparedScanPlan::try_new(&file, request)?); + let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket)?); + let dtype = execution.plan.dtype().clone(); + let ranges = execution.splits()?; + let ordered = execution.plan.ordered(); let default_window = get_available_parallelism().unwrap_or(1) * 4; let (plan_window, launch_window) = morsel_windows( &scheduler, - prepared.limit_remaining.is_some(), - prepared.has_runtime_evidence(), + execution.limit_remaining.is_some(), + execution.has_runtime_evidence(), default_window, ); let morsels = ranges .into_iter() .map(|range| PlannedScanPlanMorsel { - prepared: Arc::clone(&prepared), + execution: Arc::clone(&execution), range, }) .collect::>(); @@ -1711,7 +1698,7 @@ struct PlannedScanPlanScan { #[derive(Clone)] struct PlannedScanPlanMorsel { - prepared: Arc, + execution: Arc, range: Range, } @@ -1759,8 +1746,9 @@ impl 
Partition for PlannedScanPlanPartition { for morsel in &self.planned.partitions[self.index] { let range_len = morsel.range.end - morsel.range.start; - row_count = row_count.saturating_add(morsel.prepared.selection.row_count(range_len)); - has_filter |= !morsel.prepared.predicates.is_empty(); + row_count = + row_count.saturating_add(morsel.execution.plan.selection().row_count(range_len)); + has_filter |= morsel.execution.plan.has_filter(); } if has_filter { @@ -1795,23 +1783,37 @@ impl Partition for PlannedScanPlanPartition { } } -struct PreparedScanPlanFile { - session: VortexSession, - reader: FileReader, +struct PreparedScanPlan { + // Request-level physical plan after pushdown. This must stay free of per-scan IO state. dtype: DType, row_range: Range, selection: Selection, ordered: bool, + limit: Option, + split_hints: Option>, + projection: ScanPlanRef, + predicates: Vec, +} + +struct PreparedPredicatePlan { + id: PredicateId, + expr: Expression, + plan: ScanPlanRef, +} + +struct ScanExecution { + // Runtime instantiation of a prepared plan: source binding, prepared handles, and scan state. 
+ session: VortexSession, + reader: ReadContext, + plan: Arc, limit_remaining: Option, - segment_source_id: SegmentSourceId, - scheduled_segment_source: Arc, + segment_source: Arc, segment_future_cache: Arc, - split_hints: Option>, projection: PreparedReadRef, - predicates: Vec, + predicates: Vec, } -struct PreparedPredicate { +struct ExecutionPredicate { id: PredicateId, expr: Expression, dynamic_updates: Option, @@ -1819,7 +1821,7 @@ struct PreparedPredicate { evidence: Vec, } -impl PreparedPredicate { +impl ExecutionPredicate { fn version(&self) -> PredicateVersion { self.dynamic_updates .as_ref() @@ -1834,16 +1836,8 @@ impl PreparedPredicate { } } -struct RegisteredScheduledSegmentSource { - source: Arc, -} - -impl PreparedScanPlanFile { - fn try_new( - file: VortexFile, - request: DataSourceScanRequest, - ticket: &ScanTicket, - ) -> VortexResult { +impl PreparedScanPlan { + fn try_new(file: &VortexFile, request: DataSourceScanRequest) -> VortexResult { let session = file.session().clone(); let dtype = request.projection.return_dtype(file.dtype())?; let projection = request.projection.optimize_recursive(file.dtype())?; @@ -1853,34 +1847,9 @@ impl PreparedScanPlanFile { .transpose()?; let root = file.scan_plan_root()?; - let registered_source = Arc::new(RegisteredScheduledSegmentSource { - source: file.scheduled_segment_source(), - }); - let segment_source_id = ticket.register_segment_source( - Arc::clone(®istered_source), - SegmentSourceMeta { - label: Some("vortex-file".to_string()), - }, - ); - let scheduled_segment_source = Arc::clone(®istered_source.source); - let segment_future_cache = file.scan_plan_segment_future_cache(); - let reader = FileReader::new( - Arc::new(ScheduledSegmentSourceReader::new( - segment_source_id, - Arc::clone(&scheduled_segment_source), - Arc::clone(&segment_future_cache), - )), - session.clone(), - ); - - let mut prepare_ctx = - PrepareCtx::with_state_cache(session.clone(), file.scan_plan_state_cache()); let projection_pushed = 
push_expr(&root, &projection, file.dtype(), &session)?; let mut split_hints = Vec::new(); extend_split_hints(&projection_pushed, &mut split_hints); - let projection_plan = Arc::clone(&projection_pushed) - .prepare_read(&mut prepare_ctx)? - .ok_or_else(|| vortex_err!("scan2 could not plan read for expression {projection}"))?; // Run cheap, likely-selective conjuncts first so an expensive residual (e.g. an FSST `LIKE`) // only evaluates over the rows that survive the cheaper predicates. AND is commutative, so @@ -1897,57 +1866,166 @@ impl PreparedScanPlanFile { ); let pushed = push_expr(&root, &expr, file.dtype(), &session)?; extend_split_hints(&pushed, &mut split_hints); - let read = Arc::clone(&pushed) - .prepare_read(&mut prepare_ctx)? - .ok_or_else(|| vortex_err!("scan2 could not plan predicate read {expr}"))?; - let evidence = pushed.prepare_evidence(&mut prepare_ctx)?; - let dynamic_updates = DynamicExprUpdates::new(&expr); - Ok(PreparedPredicate { + Ok(PreparedPredicatePlan { id, expr, - dynamic_updates, - read, - evidence, + plan: pushed, }) }) .collect::>>()?; Ok(Self { - session, - reader, dtype, row_range: request .row_range .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?, selection: request.selection, ordered: request.ordered, - limit_remaining: request.limit.map(AtomicU64::new), - segment_source_id, - scheduled_segment_source, - segment_future_cache, + limit: request.limit, split_hints: normalize_split_hints(split_hints), - projection: projection_plan, + projection: projection_pushed, + predicates, + }) + } + + fn dtype(&self) -> &DType { + &self.dtype + } + + fn selection(&self) -> &Selection { + &self.selection + } + + fn ordered(&self) -> bool { + self.ordered + } + + fn limit(&self) -> Option { + self.limit + } + + fn predicates(&self) -> &[PreparedPredicatePlan] { + &self.predicates + } + + fn has_filter(&self) -> bool { + !self.predicates.is_empty() + } + + fn projection(&self) -> &ScanPlanRef { + &self.projection + } + + fn 
splits(&self) -> VortexResult>> { + let mut points = vec![self.row_range.start]; + if let Some(hints) = &self.split_hints { + points.extend( + hints + .iter() + .copied() + .filter(|&hint| self.row_range.start < hint && hint < self.row_range.end), + ); + } + if points.len() == 1 { + let mut next = self + .row_range + .start + .saturating_add(FALLBACK_SPLIT_SIZE) + .min(self.row_range.end); + while next < self.row_range.end { + points.push(next); + next = next + .saturating_add(FALLBACK_SPLIT_SIZE) + .min(self.row_range.end); + } + } + points.push(self.row_range.end); + points.sort_unstable(); + points.dedup(); + Ok(points + .windows(2) + .filter_map(|window| { + let range = window[0]..window[1]; + (range.start < range.end).then_some(range) + }) + .collect()) + } +} + +impl ScanExecution { + fn try_new( + file: VortexFile, + plan: Arc, + _ticket: &ScanTicket, + ) -> VortexResult { + let session = file.session().clone(); + let segment_source = file.segment_source(); + let segment_future_cache = file.scan_plan_segment_future_cache(); + let reader = ReadContext::new( + Arc::new(CachedSegmentSource::new( + Arc::clone(&segment_source), + Arc::clone(&segment_future_cache), + )), + session.clone(), + ); + + let mut prepare_ctx = + PrepareCtx::with_state_cache(session.clone(), file.scan_plan_state_cache()); + let projection = Arc::clone(plan.projection()) + .prepare_read(&mut prepare_ctx)? + .ok_or_else(|| vortex_err!("scan2 could not plan read for pushed projection"))?; + let predicates = plan + .predicates() + .iter() + .map(|predicate| { + let read = Arc::clone(&predicate.plan) + .prepare_read(&mut prepare_ctx)? 
+ .ok_or_else(|| { + vortex_err!("scan2 could not plan predicate read {}", predicate.expr) + })?; + let evidence = Arc::clone(&predicate.plan).prepare_evidence(&mut prepare_ctx)?; + let dynamic_updates = DynamicExprUpdates::new(&predicate.expr); + Ok(ExecutionPredicate { + id: predicate.id, + expr: predicate.expr.clone(), + dynamic_updates, + read, + evidence, + }) + }) + .collect::>>()?; + + let limit_remaining = plan.limit().map(AtomicU64::new); + + Ok(Self { + session, + reader, + plan, + limit_remaining, + segment_source, + segment_future_cache, + projection, predicates, }) } fn segment_plan_ctx(&self, phase: ScanIoPhase) -> SegmentPlanCtx { - SegmentPlanCtx::new( - self.segment_source_id, - Arc::clone(&self.scheduled_segment_source), - self.session.clone(), - ) - .with_phase(phase) + SegmentPlanCtx::new(Arc::clone(&self.segment_source), self.session.clone()) + .with_phase(phase) } - fn submit_segment_requests(&self, requests: SegmentRequests) -> SubmittedSegmentRequests { - submit_segment_requests_cached( + fn register_segment_reads(&self, requests: SegmentRequests) -> Vec { + register_segment_reads_cached( self.segment_future_cache.as_ref(), - self.scheduled_segment_source.as_ref(), + self.segment_source.as_ref(), requests, ) } + fn known_read_bytes(reads: &[ScanRead]) -> u64 { + reads.iter().map(|read| read.request.bytes).sum() + } + fn has_runtime_evidence(&self) -> bool { self.predicates .iter() @@ -1959,13 +2037,13 @@ impl PreparedScanPlanFile { _morsel_id: usize, range: Range, ) -> VortexResult> { - let selected = self.selection.row_mask(&range).mask().clone(); + let selected = self.plan.selection().row_mask(&range).mask().clone(); if selected.all_false() { return Ok(None); } let state = MorselState { - prepared: Arc::clone(self), + execution: Arc::clone(self), range, selected, evidence: (0..self.predicates.len()).map(|_| None).collect(), @@ -1989,7 +2067,6 @@ impl PreparedScanPlanFile { mode: EvidenceMode, ) -> VortexResult { let predicate = 
&self.predicates[predicate_idx]; - let mut registered = SubmittedSegmentRequests::default(); let req = OwnedEvidenceRequest { id: predicate.id, version, @@ -1997,28 +2074,30 @@ impl PreparedScanPlanFile { range: range.clone(), mode, }; + let mut known_bytes = 0u64; let mut tasks = Vec::with_capacity(predicate.evidence.len()); for plan in &predicate.evidence { if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { continue; } - let task = Arc::clone(plan).begin_evidence(req.clone())?; let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); - let requests = task.segment_requests(&mut segment_ctx)?; - registered.extend(self.submit_segment_requests(requests)); + let requests = plan.segment_requests(&req.as_request(), &mut segment_ctx)?; + let reads = self.register_segment_reads(requests); + known_bytes = known_bytes.saturating_add(Self::known_read_bytes(&reads)); + let task = Arc::clone(plan).create_task(req.clone(), reads)?; tasks.push(task); } - let prepared = Arc::clone(self); + let execution = Arc::clone(self); Ok(Work::new( ScanIoPhase::EvidenceProbe, self.session.handle(), - registered, + known_bytes, async move { - let predicate = &prepared.predicates[predicate_idx]; + let predicate = &execution.predicates[predicate_idx]; let mut acc = PredicateEvidence::new(predicate.id, version, range.clone())?; for task in tasks { - for fragment in task.evidence(&prepared.reader).await? { + for fragment in task.evidence(&execution.reader).await? { acc.absorb(fragment)?; } if acc.all_false() { @@ -2052,19 +2131,23 @@ impl PreparedScanPlanFile { } else { OwnedRowScope::try_new(Mask::new_true(len), need.clone())? 
}; - let task = Arc::clone(&predicate.read).begin_read(range.clone(), rows)?; let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::PredicateRead); - let requests = task.segment_requests(&mut segment_ctx)?; - let registered = self.submit_segment_requests(requests); - - let prepared = Arc::clone(self); + let requests = + predicate + .read + .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; + let reads = self.register_segment_reads(requests); + let known_bytes = Self::known_read_bytes(&reads); + let task = Arc::clone(&predicate.read).create_task(range.clone(), rows, reads)?; + + let execution = Arc::clone(self); Ok(Work::new( ScanIoPhase::PredicateRead, self.session.handle(), - registered, + known_bytes, async move { - let predicate = &prepared.predicates[predicate_idx]; - let mut ctx = prepared.session.create_execution_ctx(); + let predicate = &execution.predicates[predicate_idx]; + let mut ctx = execution.session.create_execution_ctx(); // Filter-first: when few rows are demanded, read with selection = `need` so the leaf // returns the compacted (filtered) array and an expensive residual (e.g. an FSST // `LIKE`) evaluates over only `need.true_count()` rows. The compacted verdict is @@ -2072,7 +2155,7 @@ impl PreparedScanPlanFile { // mask identical to the dense path's `result & need`. Mirrors V1's flat-reader gate. let result = if compact { let compact = task - .read(&prepared.reader, &mut ctx) + .read(&execution.reader, &mut ctx) .await? .null_as_false() .execute(&mut ctx)?; @@ -2086,7 +2169,7 @@ impl PreparedScanPlanFile { need.intersect_by_rank(&compact) } else { task - .read(&prepared.reader, &mut ctx) + .read(&execution.reader, &mut ctx) .await? .null_as_false() .execute(&mut ctx)? 
@@ -2140,21 +2223,24 @@ impl PreparedScanPlanFile { ); } - let task = - Arc::clone(&self.projection).begin_read(range, OwnedRowScope::selected(selected))?; + let rows = OwnedRowScope::selected(selected); let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::ProjectionRead); - let requests = task.segment_requests(&mut segment_ctx)?; - let registered = self.submit_segment_requests(requests); - - let prepared = Arc::clone(self); + let requests = + self.projection + .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; + let reads = self.register_segment_reads(requests); + let known_bytes = Self::known_read_bytes(&reads); + let task = Arc::clone(&self.projection).create_task(range, rows, reads)?; + + let execution = Arc::clone(self); Ok(Some( Work::new( ScanIoPhase::ProjectionRead, self.session.handle(), - registered, + known_bytes, async move { - let mut ctx = prepared.session.create_execution_ctx(); - let array = task.read(&prepared.reader, &mut ctx).await?; + let mut ctx = execution.session.create_execution_ctx(); + let array = task.read(&execution.reader, &mut ctx).await?; Ok(ProjectionWorkOutput { morsel_id, array }) } .boxed(), @@ -2164,38 +2250,7 @@ impl PreparedScanPlanFile { } fn splits(&self) -> VortexResult>> { - let mut points = vec![self.row_range.start]; - if let Some(hints) = &self.split_hints { - points.extend( - hints - .iter() - .copied() - .filter(|&hint| self.row_range.start < hint && hint < self.row_range.end), - ); - } - if points.len() == 1 { - let mut next = self - .row_range - .start - .saturating_add(FALLBACK_SPLIT_SIZE) - .min(self.row_range.end); - while next < self.row_range.end { - points.push(next); - next = next - .saturating_add(FALLBACK_SPLIT_SIZE) - .min(self.row_range.end); - } - } - points.push(self.row_range.end); - points.sort_unstable(); - points.dedup(); - Ok(points - .windows(2) - .filter_map(|window| { - let range = window[0]..window[1]; - (range.start < range.end).then_some(range) - }) - .collect()) + 
self.plan.splits() } } diff --git a/vortex-file/src/segments/source.rs b/vortex-file/src/segments/source.rs index c44cbf0e411..0ac13257e93 100644 --- a/vortex-file/src/segments/source.rs +++ b/vortex-file/src/segments/source.rs @@ -23,6 +23,7 @@ use vortex_io::VortexReadAt; use vortex_io::runtime::Handle; use vortex_layout::segments::SegmentFuture; use vortex_layout::segments::SegmentId; +use vortex_layout::segments::SegmentInfo; use vortex_layout::segments::SegmentSource; use vortex_metrics::Counter; use vortex_metrics::Histogram; @@ -147,6 +148,13 @@ impl FileSegmentSource { } impl SegmentSource for FileSegmentSource { + fn segment_info(&self, id: SegmentId) -> VortexResult { + self.segments + .get(*id as usize) + .map(|spec| SegmentInfo::cacheable(u64::from(spec.length))) + .ok_or_else(|| vortex_err!("Missing segment: {}", id)) + } + fn request(&self, id: SegmentId) -> SegmentFuture { // We eagerly register the read request here assuming the behaviour of [`FileSegmentSource`], where // coalescing becomes effective prior to the future being polled. 
@@ -283,6 +291,13 @@ impl BufferSegmentSource { } impl SegmentSource for BufferSegmentSource { + fn segment_info(&self, id: SegmentId) -> VortexResult { + self.segments + .get(*id as usize) + .map(|spec| SegmentInfo::cacheable(u64::from(spec.length))) + .ok_or_else(|| vortex_err!("Missing segment: {}", id)) + } + fn request(&self, id: SegmentId) -> SegmentFuture { let spec = match self.segments.get(*id as usize) { Some(spec) => spec, diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index fa6dd3aaaef..04b03625e97 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -41,7 +41,6 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_scan::plan::AggregateAnswer; -use vortex_scan::plan::FileReader; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedAggregate; use vortex_scan::plan::PreparedAggregateRef; @@ -52,6 +51,7 @@ use vortex_scan::plan::PreparedReadRef; use vortex_scan::plan::PreparedStateCacheRef; use vortex_scan::plan::PreparedStateKey; use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; use vortex_scan::plan::RowScope; use vortex_scan::plan::ScanPlan; use vortex_scan::plan::ScanPlanRef; @@ -319,7 +319,7 @@ impl ChunkedAggregateNode { } } - fn child(&self, idx: usize, io: &FileReader) -> VortexResult { + fn child(&self, idx: usize, io: &ReadContext) -> VortexResult { match self { Self::Root(node) => node.child(idx), Self::Expr(node) => node.child(idx, io.session()), @@ -332,7 +332,7 @@ impl ChunkedPreparedAggregate { &self, idx: usize, state: &ChunkedAggregateState, - io: &FileReader, + io: &ReadContext, ) -> VortexResult> { if let Some(hit) = state.children.lock().get(&idx) { return Ok(hit.clone()); @@ -359,7 +359,7 @@ impl PreparedAggregate for ChunkedPreparedAggregate { fn aggregate_partial<'a>( &'a self, range: Range, - io: &'a FileReader, + io: &'a ReadContext, 
state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>> { Box::pin(async move { @@ -558,7 +558,7 @@ impl PreparedRead for ChunkedPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local_ctx: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { @@ -837,7 +837,7 @@ impl PreparedRead for ChunkedExprPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local_ctx: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { @@ -1022,7 +1022,7 @@ impl PreparedEvidence for ChunkedPreparedEvidence { fn evidence<'a>( &'a self, req: &'a EvidenceRequest<'a>, - io: &'a FileReader, + io: &'a ReadContext, ) -> BoxFuture<'a, VortexResult>> { Box::pin(async move { if req.range.start >= req.range.end { diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index c0c684666ea..5971949fe85 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -44,12 +44,12 @@ use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::AllOr; use vortex_mask::Mask; -use vortex_scan::plan::FileReader; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedRead; use vortex_scan::plan::PreparedReadRef; use vortex_scan::plan::PreparedStateKey; use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; use vortex_scan::plan::RowScope; use vortex_scan::plan::ScanPlan; use vortex_scan::plan::ScanPlanRef; @@ -192,7 +192,7 @@ impl DictScanPlan { fn values( &self, values_read: PreparedReadRef, - io: &FileReader, + io: &ReadContext, state: &DictScanState, ) -> SharedArrayFuture { if let Some(hit) = state.shared.values.lock().clone() { @@ -340,7 +340,7 @@ impl PreparedRead for DictPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, 
VortexResult> { Box::pin(async move { @@ -424,7 +424,7 @@ impl PreparedRead for DictPreparedRead { impl DictExprPreparedRead { async fn value_expr( &self, - io: &FileReader, + io: &ReadContext, state: &DictScanState, local: &mut ExecutionCtx, ) -> VortexResult> { @@ -479,7 +479,7 @@ impl DictExprPreparedRead { async fn sparse_expr( &self, codes: ArrayRef, - io: &FileReader, + io: &ReadContext, local: &mut ExecutionCtx, ) -> VortexResult> { let values_len = usize::try_from(self.node.dict.values_len) @@ -663,7 +663,7 @@ impl PreparedRead for DictExprPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index f33d2103e4e..a8c97a7b1e8 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -26,12 +26,12 @@ use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; -use vortex_scan::plan::FileReader; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedRead; use vortex_scan::plan::PreparedReadRef; use vortex_scan::plan::PreparedStateKey; use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; use vortex_scan::plan::RowScope; use vortex_scan::plan::ScanPlan; use vortex_scan::plan::ScanPlanRef; @@ -78,7 +78,7 @@ struct FlatPreparedRead { } impl FlatScanPlan { - fn array(&self, io: &FileReader, state: &FlatScanState) -> SharedArrayFuture { + fn array(&self, io: &ReadContext, state: &FlatScanState) -> SharedArrayFuture { if let Some(hit) = state.array.lock().clone() { return hit; } @@ -141,7 +141,7 @@ impl PreparedRead for FlatPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, _local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { 
Box::pin(async move { @@ -202,7 +202,7 @@ impl PreparedRead for FlatPreparedRead { } } -pub(crate) async fn decode_flat(layout: &LayoutRef, io: &FileReader) -> VortexResult { +pub(crate) async fn decode_flat(layout: &LayoutRef, io: &ReadContext) -> VortexResult { let Some(flat) = layout.as_opt::() else { vortex_bail!("expected flat layout, got {}", layout.encoding_id()); }; diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 2b82546baac..31a50abe1b2 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -43,7 +43,6 @@ use vortex_error::VortexResult; use vortex_error::vortex_err; use vortex_mask::Mask; use vortex_scan::plan::AggregateAnswer; -use vortex_scan::plan::FileReader; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedAggregate; use vortex_scan::plan::PreparedAggregateRef; @@ -53,6 +52,7 @@ use vortex_scan::plan::PreparedRead; use vortex_scan::plan::PreparedReadRef; use vortex_scan::plan::PreparedStateKey; use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; use vortex_scan::plan::RowScope; use vortex_scan::plan::ScanPlan; use vortex_scan::plan::ScanPlanRef; @@ -256,7 +256,7 @@ impl ZonedScanPlan { async fn table( &self, zones_read: &PreparedReadRef, - io: &FileReader, + io: &ReadContext, state: &ZonedScanState, ) -> VortexResult> { if let Some(hit) = state.table.lock().clone() { @@ -276,7 +276,7 @@ impl ZonedScanPlan { &self, stat: Stat, zones_read: &PreparedReadRef, - io: &FileReader, + io: &ReadContext, state: &ZonedScanState, ) -> VortexResult>> { if let Some(hit) = state.stat_columns.lock().get(&stat) { @@ -320,7 +320,7 @@ impl ZonedScanPlan { span: &ZoneSpan, func: &AggregateFnRef, zones_read: &PreparedReadRef, - io: &FileReader, + io: &ReadContext, state: &ZonedScanState, ctx: &mut ExecutionCtx, ) -> VortexResult> { @@ -480,7 +480,7 @@ impl ZonedScanPlan { range: Range, funcs: &'a [AggregateFnRef], zones_read: &'a 
PreparedReadRef, - io: &'a FileReader, + io: &'a ReadContext, state: &'a ZonedScanState, ) -> BoxFuture<'a, VortexResult>>> { Box::pin(async move { @@ -528,7 +528,7 @@ impl ZonedScanPlan { impl ZonedPreparedEvidence { async fn table( &self, - io: &FileReader, + io: &ReadContext, state: &ZonedScanState, ) -> VortexResult> { if let Some(hit) = state.table.lock().clone() { @@ -543,7 +543,7 @@ impl ZonedPreparedEvidence { async fn zone_map( &self, - io: &FileReader, + io: &ReadContext, state: &ZonedScanState, ) -> VortexResult> { if let Some(hit) = state.zone_map.lock().clone() { @@ -579,7 +579,7 @@ impl ZonedPreparedEvidence { async fn predicate_masks( &self, - io: &FileReader, + io: &ReadContext, state: &ZonedScanState, ) -> VortexResult> { if let Some(hit) = state.masks.lock().get(&self.predicate) { @@ -628,7 +628,7 @@ impl PreparedEvidence for ZonedPreparedEvidence { fn evidence<'a>( &'a self, req: &'a EvidenceRequest<'a>, - io: &'a FileReader, + io: &'a ReadContext, ) -> BoxFuture<'a, VortexResult>> { Box::pin(async move { let mut fragments = Vec::new(); @@ -824,7 +824,7 @@ impl PreparedRead for ZonedPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { self.data.read_scoped(range, rows, io, local) @@ -856,7 +856,7 @@ impl PreparedAggregate for ZonedPreparedAggregate { fn aggregate_partial<'a>( &'a self, range: Range, - io: &'a FileReader, + io: &'a ReadContext, _state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>> { self.node @@ -938,7 +938,7 @@ impl PreparedRead for ZonedExprPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { self.data.read_scoped(range, rows, io, local) diff --git a/vortex-layout/src/scan/v2/row_idx.rs b/vortex-layout/src/scan/v2/row_idx.rs index c65dcb5449f..f859d8656d1 100644 --- 
a/vortex-layout/src/scan/v2/row_idx.rs +++ b/vortex-layout/src/scan/v2/row_idx.rs @@ -24,11 +24,11 @@ use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_scan::plan::ApplyScanPlan; -use vortex_scan::plan::FileReader; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedRead; use vortex_scan::plan::PreparedReadRef; use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; use vortex_scan::plan::RowScope; use vortex_scan::plan::ScanPlan; use vortex_scan::plan::ScanPlanRef; @@ -246,7 +246,7 @@ impl PreparedRead for RowIdxPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - _io: &'a FileReader, + _io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { diff --git a/vortex-layout/src/segments/cache.rs b/vortex-layout/src/segments/cache.rs index dceda90b5a6..096fdc5cda1 100644 --- a/vortex-layout/src/segments/cache.rs +++ b/vortex-layout/src/segments/cache.rs @@ -22,6 +22,7 @@ use vortex_metrics::MetricsRegistry; use crate::segments::SegmentFuture; use crate::segments::SegmentId; +use crate::segments::SegmentInfo; use crate::segments::SegmentSource; static NEXT_SEGMENT_CACHE_SOURCE_ID: AtomicU64 = AtomicU64::new(0); @@ -194,6 +195,10 @@ impl SegmentCacheSourceAdapter { } impl SegmentSource for SegmentCacheSourceAdapter { + fn segment_info(&self, id: SegmentId) -> VortexResult { + self.source.segment_info(id) + } + fn request(&self, id: SegmentId) -> SegmentFuture { let key = SegmentCacheKey::new(self.source_id, id); let cache = Arc::clone(&self.cache); diff --git a/vortex-layout/src/segments/shared.rs b/vortex-layout/src/segments/shared.rs index c794daf608e..62abdda5857 100644 --- a/vortex-layout/src/segments/shared.rs +++ b/vortex-layout/src/segments/shared.rs @@ -16,6 +16,7 @@ use vortex_utils::aliases::dash_map::Entry; use crate::segments::SegmentFuture; use crate::segments::SegmentId; +use crate::segments::SegmentInfo; use 
crate::segments::SegmentSource; /// A [`SegmentSource`] that allows multiple requesters to await the same underlying segment @@ -38,6 +39,10 @@ impl SharedSegmentSource { } impl SegmentSource for SharedSegmentSource { + fn segment_info(&self, id: SegmentId) -> vortex_error::VortexResult { + self.inner.segment_info(id) + } + fn request(&self, id: SegmentId) -> SegmentFuture { loop { match self.in_flight.entry(id) { @@ -83,6 +88,10 @@ mod tests { } impl SegmentSource for CountingSegmentSource { + fn segment_info(&self, id: SegmentId) -> vortex_error::VortexResult { + self.segments.segment_info(id) + } + fn request(&self, id: SegmentId) -> SegmentFuture { self.request_count.fetch_add(1, Ordering::SeqCst); self.segments.request(id) diff --git a/vortex-layout/src/segments/test.rs b/vortex-layout/src/segments/test.rs index d880d15cc1a..9d49996f704 100644 --- a/vortex-layout/src/segments/test.rs +++ b/vortex-layout/src/segments/test.rs @@ -15,6 +15,7 @@ use vortex_error::vortex_err; use crate::segments::SegmentFuture; use crate::segments::SegmentId; +use crate::segments::SegmentInfo; use crate::segments::SegmentSink; use crate::segments::SegmentSource; use crate::sequence::SequenceId; @@ -26,6 +27,14 @@ pub struct TestSegments { } impl SegmentSource for TestSegments { + fn segment_info(&self, id: SegmentId) -> VortexResult { + self.segments + .lock() + .get(*id as usize) + .map(|segment| SegmentInfo::non_cacheable(segment.len() as u64)) + .ok_or_else(|| vortex_err!("Segment not found")) + } + fn request(&self, id: SegmentId) -> SegmentFuture { let buffer = self.segments.lock().get(*id as usize).cloned(); async move { diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index 3a7d583d791..7838857281b 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -42,8 +42,6 @@ pub use scheduler::ScanSchedulerSession; pub use scheduler::ScanSchedulerSessionExt; pub use scheduler::ScanTicket; pub use scheduler::ScanWorkClass; -pub use scheduler::SegmentSourceId; 
-pub use scheduler::SegmentSourceMeta; pub use scheduler::WorkPermit; pub use scheduler::WorkRequest; pub use segments::*; diff --git a/vortex-scan/src/plan/mod.rs b/vortex-scan/src/plan/mod.rs index 14fb5cd48bf..024e59d37f1 100644 --- a/vortex-scan/src/plan/mod.rs +++ b/vortex-scan/src/plan/mod.rs @@ -57,19 +57,20 @@ use vortex_session::VortexSession; use self::evidence::EvidenceFragment; use self::request::EvidenceRequest; use self::request::OwnedEvidenceRequest; +use crate::segments::ScanRead; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; use crate::segments::SegmentSource; -/// Per-file/query IO context for scan plan reads. +/// Execution context for legacy prepared read calls. #[derive(Clone)] -pub struct FileReader { +pub struct ReadContext { segments: Arc, session: VortexSession, } -impl FileReader { - /// Create a reader context from a segment source and session. +impl ReadContext { + /// Create a read context from a segment source and session. pub fn new(segments: Arc, session: VortexSession) -> Self { Self { segments, session } } @@ -550,7 +551,7 @@ impl PreparedRead for LiteralPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - _io: &'a FileReader, + _io: &'a ReadContext, _local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { @@ -581,7 +582,7 @@ impl PreparedStats for LiteralPreparedStats { fn stats<'a>( &'a self, range: Range, - _io: &'a FileReader, + _io: &'a ReadContext, _state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>> { Box::pin(async move { @@ -637,14 +638,14 @@ impl LiteralPreparedStats { pub fn read_dense<'a>( read: &'a PreparedReadRef, range: Range, - io: &'a FileReader, + io: &'a ReadContext, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { let len = range_len(&range)?; let rows = OwnedRowScope::selected(Mask::new_true(len)); let mut local = io.session().create_execution_ctx(); - let task = Arc::clone(read).begin_read(range, rows)?; - task.read(io, &mut 
local).await + read.read_scoped(range, rows.as_scope(), io, &mut local) + .await }) } @@ -680,7 +681,7 @@ pub trait PreparedRead: 'static + Send + Sync { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult>; @@ -707,28 +708,30 @@ pub trait PreparedRead: 'static + Send + Sync { impl dyn PreparedRead { /// Create a morsel-level read task for this prepared read. - pub fn begin_read( + pub fn create_task( self: Arc, range: Range, rows: OwnedRowScope, + reads: Vec, ) -> VortexResult> { Ok(Box::new(DefaultReadTask { read: self, range, rows, + reads, })) } } /// A morsel-level read task. pub trait ReadTask: Send { - /// Return scheduler-visible segment requests needed for this task, when known exactly. - fn segment_requests(&self, cx: &mut SegmentPlanCtx) -> VortexResult; + /// Registered reads needed by this task. + fn reads(&self) -> &[ScanRead]; /// Execute the read task. fn read<'a>( self: Box, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult>; } @@ -737,23 +740,29 @@ struct DefaultReadTask { read: PreparedReadRef, range: Range, rows: OwnedRowScope, + reads: Vec, } impl ReadTask for DefaultReadTask { - fn segment_requests(&self, cx: &mut SegmentPlanCtx) -> VortexResult { - self.read - .segment_requests(self.range.clone(), self.rows.as_scope(), cx) + fn reads(&self) -> &[ScanRead] { + &self.reads } fn read<'a>( self: Box, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { - self.read - .read_scoped(self.range, self.rows.as_scope(), io, local) - .await + let Self { + read, + range, + rows, + reads, + } = *self; + let result = read.read_scoped(range, rows.as_scope(), io, local).await; + drop(reads); + result }) } } @@ -767,7 +776,7 @@ pub trait PreparedSplit: 'static + Send + Sync { fn splits<'a>( &'a self, range: Range, - io: &'a 
FileReader, + io: &'a ReadContext, state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>>; @@ -795,7 +804,7 @@ impl PreparedSplit for HintPreparedSplit { fn splits<'a>( &'a self, range: Range, - _io: &'a FileReader, + _io: &'a ReadContext, _state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>> { Box::pin(async move { @@ -836,7 +845,7 @@ pub trait PreparedAggregate: 'static + Send + Sync { fn aggregate_partial<'a>( &'a self, range: Range, - io: &'a FileReader, + io: &'a ReadContext, state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>>; @@ -860,7 +869,7 @@ pub trait PreparedStats: 'static + Send + Sync { fn stats<'a>( &'a self, range: Range, - io: &'a FileReader, + io: &'a ReadContext, state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>>; @@ -997,7 +1006,7 @@ impl PreparedRead for StructValuePreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { @@ -1117,7 +1126,7 @@ impl PreparedRead for ApplyPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { @@ -1229,7 +1238,7 @@ impl PreparedRead for MaskPreparedRead { &'a self, range: Range, rows: RowScope<'a>, - io: &'a FileReader, + io: &'a ReadContext, local: &'a mut ExecutionCtx, ) -> BoxFuture<'a, VortexResult> { Box::pin(async move { @@ -1272,7 +1281,7 @@ pub trait PreparedEvidence: 'static + Send + Sync { fn evidence<'a>( &'a self, req: &'a EvidenceRequest<'a>, - io: &'a FileReader, + io: &'a ReadContext, ) -> BoxFuture<'a, VortexResult>>; /// Return scheduler-visible segment requests needed for this evidence, when known exactly. @@ -1299,44 +1308,56 @@ pub trait PreparedEvidence: 'static + Send + Sync { impl dyn PreparedEvidence { /// Create a morsel-level evidence task for this prepared evidence handle. 
- pub fn begin_evidence( + pub fn create_task( self: Arc, req: OwnedEvidenceRequest, + reads: Vec, ) -> VortexResult> { Ok(Box::new(DefaultEvidenceTask { evidence: self, req, + reads, })) } } /// A morsel-level evidence task. pub trait EvidenceTask: Send { - /// Return scheduler-visible segment requests needed for this task, when known exactly. - fn segment_requests(&self, cx: &mut SegmentPlanCtx) -> VortexResult; + /// Registered reads needed by this task. + fn reads(&self) -> &[ScanRead]; /// Execute the evidence task. fn evidence<'a>( self: Box, - io: &'a FileReader, + io: &'a ReadContext, ) -> BoxFuture<'a, VortexResult>>; } struct DefaultEvidenceTask { evidence: PreparedEvidenceRef, req: OwnedEvidenceRequest, + reads: Vec, } impl EvidenceTask for DefaultEvidenceTask { - fn segment_requests(&self, cx: &mut SegmentPlanCtx) -> VortexResult { - self.evidence.segment_requests(&self.req.as_request(), cx) + fn reads(&self) -> &[ScanRead] { + &self.reads } fn evidence<'a>( self: Box, - io: &'a FileReader, + io: &'a ReadContext, ) -> BoxFuture<'a, VortexResult>> { - Box::pin(async move { self.evidence.evidence(&self.req.as_request(), io).await }) + Box::pin(async move { + let Self { + evidence, + req, + reads, + } = *self; + let result = evidence.evidence(&req.as_request(), io).await; + drop(reads); + result + }) } } @@ -1369,6 +1390,13 @@ mod tests { struct TestSegments; impl SegmentSource for TestSegments { + fn segment_info( + &self, + _id: crate::segments::SegmentId, + ) -> VortexResult { + Ok(crate::segments::SegmentInfo::non_cacheable(0)) + } + fn request(&self, _id: crate::segments::SegmentId) -> crate::segments::SegmentFuture { Box::pin(async { Ok(BufferHandle::new_host(ByteBuffer::from(Vec::::new()))) }) } @@ -1428,7 +1456,7 @@ mod tests { fn stats<'a>( &'a self, range: Range, - _io: &'a FileReader, + _io: &'a ReadContext, _state: &'a ScanState, ) -> BoxFuture<'a, VortexResult>>> { Box::pin(async move { @@ -1462,7 +1490,7 @@ mod tests { )? 
.ok_or_else(|| vortex_err!("test scan plan did not return a stats plan"))?; let state = plan.init_state(&session)?; - let io = FileReader::new(Arc::new(TestSegments), session); + let io = ReadContext::new(Arc::new(TestSegments), session); let stats = futures::executor::block_on(plan.stats(10..20, &io, state.as_ref()))?; assert_eq!(stats.len(), funcs.len()); @@ -1484,7 +1512,7 @@ mod tests { let read = plan .prepare_read(&mut PrepareCtx::new(session.clone()))? .ok_or_else(|| vortex_err!("literal scan plan did not return a prepared read"))?; - let io = FileReader::new(Arc::new(TestSegments), session); + let io = ReadContext::new(Arc::new(TestSegments), session); let array = futures::executor::block_on(read_dense(&read, 10..15, &io))?; let constant = array .as_opt::() diff --git a/vortex-scan/src/scheduler.rs b/vortex-scan/src/scheduler.rs index a4c53c6d379..53b0e443761 100644 --- a/vortex-scan/src/scheduler.rs +++ b/vortex-scan/src/scheduler.rs @@ -17,13 +17,11 @@ use std::sync::atomic::Ordering; use async_lock::Semaphore; use async_lock::SemaphoreGuardArc; -use parking_lot::Mutex; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_session::SessionExt; use vortex_session::SessionVar; use vortex_session::VortexSession; -use vortex_utils::aliases::hash_map::HashMap; use vortex_utils::parallelism::get_available_parallelism; const DEFAULT_MORSEL_CONCURRENCY_FACTOR: usize = 4; @@ -172,7 +170,6 @@ impl ScanScheduler { .per_scan_slots .map(|slots| Arc::new(Semaphore::new(slots))), per_scan_slot_limit: self.config.per_scan_slots, - segment_sources: Arc::new(Mutex::new(SegmentSourceRegistry::default())), } } @@ -263,39 +260,6 @@ pub struct ScanMeta { pub label: Option, } -/// Scheduler-local identity for a registered segment source. -/// -/// The identity is scoped to one [`ScanTicket`]. 
A shared scheduler may later associate this with a -/// stable cross-scan source key for cache reuse or metrics, but correctness must not depend on two -/// tickets allocating the same value for the same object. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] -pub struct SegmentSourceId(u64); - -impl SegmentSourceId { - /// Return the integer value of this scheduler-local source id. - pub fn get(self) -> u64 { - self.0 - } -} - -/// Metadata attached to a registered segment source. -#[derive(Clone, Debug, Default, PartialEq, Eq)] -pub struct SegmentSourceMeta { - /// Optional human-readable label used for diagnostics and future metrics. - pub label: Option, -} - -#[derive(Default)] -struct SegmentSourceRegistry { - next_id: u64, - sources: HashMap, -} - -struct SegmentSourceEntry { - source: Arc, - meta: SegmentSourceMeta, -} - /// A logical scan registered with a scheduler. #[derive(Clone)] pub struct ScanTicket { @@ -303,7 +267,6 @@ pub struct ScanTicket { cancelled: Arc, per_scan_slots: Option>, per_scan_slot_limit: Option, - segment_sources: Arc>, } impl fmt::Debug for ScanTicket { @@ -312,7 +275,6 @@ impl fmt::Debug for ScanTicket { .field("id", &self.id) .field("cancelled", &self.is_cancelled()) .field("per_scan_slot_limit", &self.per_scan_slot_limit) - .field("segment_source_count", &self.segment_source_count()) .finish_non_exhaustive() } } @@ -332,60 +294,6 @@ impl ScanTicket { pub fn is_cancelled(&self) -> bool { self.cancelled.load(Ordering::Acquire) } - - /// Register a segment source and return its scan-local source id. - pub fn register_segment_source( - &self, - source: Arc, - meta: SegmentSourceMeta, - ) -> SegmentSourceId - where - S: Any + Send + Sync, - { - let source: Arc = source; - self.register_erased_segment_source(source, meta) - } - - /// Register an already-erased segment source and return its scan-local source id. 
- pub fn register_erased_segment_source( - &self, - source: Arc, - meta: SegmentSourceMeta, - ) -> SegmentSourceId { - let mut registry = self.segment_sources.lock(); - let id = SegmentSourceId(registry.next_id); - registry.next_id = registry.next_id.saturating_add(1); - registry - .sources - .insert(id, SegmentSourceEntry { source, meta }); - id - } - - /// Return the metadata for a registered segment source. - pub fn segment_source_meta(&self, id: SegmentSourceId) -> Option { - let registry = self.segment_sources.lock(); - registry.sources.get(&id).map(|entry| entry.meta.clone()) - } - - /// Return a registered segment source downcast to the requested concrete type. - /// - /// This is intentionally typed at the call site: the scheduler stores sources opaquely, while - /// the scan runtime decides which concrete source trait or adapter it expects. - pub fn segment_source(&self, id: SegmentSourceId) -> Option> - where - S: Any + Send + Sync, - { - let source = { - let registry = self.segment_sources.lock(); - Arc::clone(&registry.sources.get(&id)?.source) - }; - source.downcast::().ok() - } - - fn segment_source_count(&self) -> usize { - let registry = self.segment_sources.lock(); - registry.sources.len() - } } /// A request to acquire scheduler slots for one scan work item. 
@@ -504,56 +412,3 @@ pub trait ScanSchedulerSessionExt: SessionExt { } impl ScanSchedulerSessionExt for S {} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use vortex_error::VortexResult; - use vortex_error::vortex_err; - - use super::*; - - struct TestSegmentSource { - label: &'static str, - } - - #[test] - fn segment_source_registration_is_scan_local() -> VortexResult<()> { - let scheduler = ScanScheduler::unbounded(); - let scan_a = scheduler.register_scan(ScanMeta::default()); - let scan_b = scheduler.register_scan(ScanMeta::default()); - - let source_a = Arc::new(TestSegmentSource { label: "a" }); - let source_b = Arc::new(TestSegmentSource { label: "b" }); - - let id_a0 = scan_a.register_segment_source( - source_a, - SegmentSourceMeta { - label: Some("source-a".to_string()), - }, - ); - let id_a1 = scan_a.register_segment_source( - Arc::new(TestSegmentSource { label: "a1" }), - SegmentSourceMeta::default(), - ); - let id_b0 = scan_b.register_segment_source(source_b, SegmentSourceMeta::default()); - - assert_eq!(id_a0.get(), 0); - assert_eq!(id_a1.get(), 1); - assert_eq!(id_b0.get(), 0); - - let meta = scan_a - .segment_source_meta(id_a0) - .ok_or_else(|| vortex_err!("missing segment source metadata"))?; - assert_eq!(meta.label.as_deref(), Some("source-a")); - - let source = scan_a - .segment_source::(id_a0) - .ok_or_else(|| vortex_err!("missing registered segment source"))?; - assert_eq!(source.label, "a"); - assert!(scan_b.segment_source::(id_a1).is_none()); - - Ok(()) - } -} diff --git a/vortex-scan/src/segments/scheduled.rs b/vortex-scan/src/segments/scheduled.rs index b4befd95d00..83dd728eb94 100644 --- a/vortex-scan/src/segments/scheduled.rs +++ b/vortex-scan/src/segments/scheduled.rs @@ -14,13 +14,11 @@ use vortex_error::SharedVortexResult; use vortex_error::VortexError; use vortex_error::VortexExpect; use vortex_error::VortexResult; -use vortex_error::vortex_err; use vortex_session::VortexSession; use vortex_utils::aliases::dash_map::DashMap; 
use vortex_utils::aliases::dash_map::Entry; use vortex_utils::aliases::hash_set::HashSet; -use crate::scheduler::SegmentSourceId; use crate::segments::SegmentFuture; use crate::segments::SegmentId; use crate::segments::SegmentSource; @@ -113,8 +111,6 @@ impl CancelGroup { /// than smuggling physical locations into `SegmentRequest`. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct SegmentRequest { - /// Registered source that owns the segment. - pub source: SegmentSourceId, /// Logical segment id within the source. pub segment: SegmentId, /// Number of bytes in the logical segment payload. @@ -130,35 +126,27 @@ pub struct SegmentRequest { /// Dedupe key for exact segment requests. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct SegmentRequestKey { - /// Registered source that owns the segment. - pub source: SegmentSourceId, /// Logical segment id within the source. pub segment: SegmentId, } impl SegmentRequestKey { /// Create a key for deduping exact segment requests. - pub fn new(source: SegmentSourceId, segment: SegmentId) -> Self { - Self { source, segment } + pub fn new(segment: SegmentId) -> Self { + Self { segment } } } impl From<&SegmentRequest> for SegmentRequestKey { fn from(request: &SegmentRequest) -> Self { - Self::new(request.source, request.segment) + Self::new(request.segment) } } impl SegmentRequest { /// Create a segment request from source, segment metadata, and phase. - pub fn new( - source: SegmentSourceId, - segment: SegmentId, - info: SegmentInfo, - phase: ScanIoPhase, - ) -> Self { + pub fn new(segment: SegmentId, info: SegmentInfo, phase: ScanIoPhase) -> Self { Self { - source, segment, bytes: info.bytes, phase, @@ -231,8 +219,7 @@ impl SegmentRequests { /// Context used by plans when producing scheduler-visible segment requests. 
#[derive(Clone)] pub struct SegmentPlanCtx { - source_id: SegmentSourceId, - source: Arc, + source: Arc, session: VortexSession, phase: ScanIoPhase, priority: ScanPriority, @@ -242,7 +229,6 @@ pub struct SegmentPlanCtx { impl fmt::Debug for SegmentPlanCtx { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SegmentPlanCtx") - .field("source_id", &self.source_id) .field("phase", &self.phase) .field("priority", &self.priority) .field("cancel_group", &self.cancel_group) @@ -252,13 +238,8 @@ impl fmt::Debug for SegmentPlanCtx { impl SegmentPlanCtx { /// Create a request-planning context for a registered segment source. - pub fn new( - source_id: SegmentSourceId, - source: Arc, - session: VortexSession, - ) -> Self { + pub fn new(source: Arc, session: VortexSession) -> Self { Self { - source_id, source, session, phase: ScanIoPhase::default(), @@ -267,13 +248,8 @@ impl SegmentPlanCtx { } } - /// Return the registered source id used for requests created by this context. - pub fn source_id(&self) -> SegmentSourceId { - self.source_id - } - - /// Return the scheduled source used to resolve segment metadata. - pub fn source(&self) -> &Arc { + /// Return the source used to resolve segment metadata. + pub fn source(&self) -> &Arc { &self.source } @@ -302,7 +278,7 @@ impl SegmentPlanCtx { /// Create a segment request with this context's source and scheduling metadata. pub fn request(&self, segment: SegmentId, info: SegmentInfo) -> SegmentRequest { - SegmentRequest::new(self.source_id, segment, info, self.phase) + SegmentRequest::new(segment, info, self.phase) .with_priority(self.priority) .with_cancel_group(self.cancel_group) } @@ -314,59 +290,15 @@ impl SegmentPlanCtx { } } -/// Backend capabilities relevant to scheduled segment submission. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] -pub struct SegmentSourceCapabilities { - /// Maximum number of logical segment requests preferred in one batch. 
- pub max_batch_len: Option, - /// Maximum number of logical segment bytes preferred in one batch. - pub max_batch_bytes: Option, - /// Whether the backend can observe best-effort cancellation. - pub supports_cancellation: bool, -} - -/// A batch of segment requests submitted to one scheduled segment source. -#[derive(Clone, Debug, Default, PartialEq, Eq)] -pub struct SegmentBatch { - requests: Vec, -} - -impl SegmentBatch { - /// Create a batch from requests that all target the same source. - pub fn new(requests: Vec) -> Self { - Self { requests } - } - - /// Borrow the requests in this batch. - pub fn requests(&self) -> &[SegmentRequest] { - &self.requests - } - - /// Consume this batch and return its requests. - pub fn into_requests(self) -> Vec { - self.requests - } - - /// Return the number of requests in this batch. - pub fn len(&self) -> usize { - self.requests.len() - } - - /// Return whether this batch contains no requests. - pub fn is_empty(&self) -> bool { - self.requests.is_empty() - } -} - -/// One logical segment result returned by a scheduled source submission. -pub struct SegmentHandle { +/// One logical segment read registered for a scan task. +pub struct ScanRead { /// The logical request this handle resolves. pub request: SegmentRequest, /// Future resolving to the requested segment payload. pub future: SegmentFuture, } -impl SegmentHandle { +impl ScanRead { /// Create a handle for one logical segment request. pub fn new(request: SegmentRequest, future: SegmentFuture) -> Self { Self { request, future } @@ -392,11 +324,7 @@ impl SegmentFutureCache { } /// Request one segment from a scheduled source, reusing an in-flight future when present. 
- pub fn request_segment( - &self, - source: &dyn ScheduledSegmentSource, - request: SegmentRequest, - ) -> SegmentHandle { + pub fn request_segment(&self, source: &dyn SegmentSource, request: SegmentRequest) -> ScanRead { if let Some(handle) = self.cached_handle(request) { return handle; } @@ -410,13 +338,7 @@ impl SegmentFutureCache { entry.remove(); } Entry::Vacant(entry) => { - let Some(handle) = source - .submit(SegmentBatch::new(vec![request])) - .into_iter() - .next() - else { - return missing_segment_handle(request); - }; + let handle = ScanRead::new(request, source.request(request.segment)); let shared = handle.future.map_err(Arc::new).boxed().shared(); entry.insert( shared @@ -429,14 +351,10 @@ impl SegmentFutureCache { } } - /// Submit exact segment requests to a source, returning strong handles that keep them alive. - pub fn submit( - &self, - source: &dyn ScheduledSegmentSource, - requests: SegmentRequests, - ) -> SubmittedSegmentRequests { + /// Register exact segment reads with a source, returning handles that keep the futures alive. 
+ pub fn register(&self, source: &dyn SegmentSource, requests: SegmentRequests) -> Vec { let Some(requests) = requests.into_exact() else { - return SubmittedSegmentRequests::default(); + return Vec::new(); }; let mut seen: HashSet = HashSet::default(); @@ -454,10 +372,10 @@ impl SegmentFutureCache { } handles.extend(self.submit_misses(source, misses)); - SubmittedSegmentRequests::new(handles) + handles } - fn cached_handle(&self, request: SegmentRequest) -> Option { + fn cached_handle(&self, request: SegmentRequest) -> Option { let key = SegmentRequestKey::from(&request); let future = self.in_flight.get(&key)?.upgrade()?; Some(shared_segment_handle(request, future)) @@ -465,36 +383,18 @@ impl SegmentFutureCache { fn submit_misses( &self, - source: &dyn ScheduledSegmentSource, + source: &dyn SegmentSource, misses: Vec, - ) -> Vec { - let capabilities = source.capabilities(); - let mut handles = Vec::new(); - let mut batch = Vec::new(); - let mut batch_bytes = 0_u64; - for request in misses { - let len_limit_reached = capabilities - .max_batch_len - .is_some_and(|max_len| !batch.is_empty() && batch.len() >= max_len); - let bytes_limit_reached = capabilities.max_batch_bytes.is_some_and(|max_bytes| { - !batch.is_empty() && batch_bytes.saturating_add(request.bytes) > max_bytes - }); - if len_limit_reached || bytes_limit_reached { - handles.extend(self.insert_submitted( - source.submit(SegmentBatch::new(std::mem::take(&mut batch))), - )); - batch_bytes = 0; - } - batch_bytes = batch_bytes.saturating_add(request.bytes); - batch.push(request); - } - if !batch.is_empty() { - handles.extend(self.insert_submitted(source.submit(SegmentBatch::new(batch)))); - } - handles + ) -> Vec { + self.insert_submitted( + misses + .into_iter() + .map(|request| ScanRead::new(request, source.request(request.segment))) + .collect(), + ) } - fn insert_submitted(&self, handles: Vec) -> Vec { + fn insert_submitted(&self, handles: Vec) -> Vec { handles .into_iter() .map(|handle| { @@ -512,125 
+412,48 @@ impl SegmentFutureCache { } } -/// Submitted segment request handles. -/// -/// Holding this value keeps pre-submitted segment futures alive. The existing file-backed source -/// registers reads when those futures are created, so a later layout read can coalesce with them -/// even if this value never awaits the handles directly. -#[derive(Default)] -pub struct SubmittedSegmentRequests { - handles: Vec, - bytes: u64, -} - -impl SubmittedSegmentRequests { - /// Create a submitted request set from handles. - pub fn new(handles: Vec) -> Self { - let bytes = handles - .iter() - .map(|handle| handle.request.bytes) - .sum::(); - Self { handles, bytes } - } - - /// Borrow submitted segment handles. - pub fn handles(&self) -> &[SegmentHandle] { - &self.handles - } - - /// Return the number of submitted segment handles. - pub fn len(&self) -> usize { - self.handles.len() - } - - /// Return the total logical segment bytes represented by these handles. - pub fn bytes(&self) -> u64 { - self.bytes - } - - /// Return whether no segment handles were submitted. - pub fn is_empty(&self) -> bool { - self.handles.is_empty() - } - - /// Extend this submitted request set with another, keeping all handles alive. - pub fn extend(&mut self, other: SubmittedSegmentRequests) { - self.bytes = self.bytes.saturating_add(other.bytes); - self.handles.extend(other.handles); - } -} - -/// Submit exact segment requests to a source after deduping by `(source, segment)`. -/// -/// Unknown request sets are left to the normal lazy read path and submit no work. The source still -/// owns physical coalescing; this helper only removes duplicate logical segment requests. -pub fn submit_segment_requests( - source: &dyn ScheduledSegmentSource, - requests: SegmentRequests, -) -> SubmittedSegmentRequests { - SegmentFutureCache::new().submit(source, requests) -} - -/// Submit exact segment requests through a shared in-flight future cache. 
-pub fn submit_segment_requests_cached( +/// Register exact segment reads through a shared in-flight future cache. +pub fn register_segment_reads_cached( cache: &SegmentFutureCache, - source: &dyn ScheduledSegmentSource, + source: &dyn SegmentSource, requests: SegmentRequests, -) -> SubmittedSegmentRequests { - cache.submit(source, requests) +) -> Vec { + cache.register(source, requests) } -fn shared_segment_handle( - request: SegmentRequest, - future: Shared, -) -> SegmentHandle { - SegmentHandle::new(request, future.map_err(VortexError::from).boxed()) -} - -fn missing_segment_handle(request: SegmentRequest) -> SegmentHandle { - SegmentHandle::new( - request, - async move { - Err(vortex_err!( - "scheduled source did not return a handle for segment {}", - request.segment - )) - } - .boxed(), - ) +fn shared_segment_handle(request: SegmentRequest, future: Shared) -> ScanRead { + ScanRead::new(request, future.map_err(VortexError::from).boxed()) } -/// Segment-source view backed by a [`ScheduledSegmentSource`] and [`SegmentFutureCache`]. -pub struct ScheduledSegmentSourceReader { - source_id: SegmentSourceId, - source: Arc, +/// Segment-source view backed by another source and a [`SegmentFutureCache`]. +pub struct CachedSegmentSource { + source: Arc, cache: Arc, phase: ScanIoPhase, } -impl ScheduledSegmentSourceReader { - /// Create a segment source reader using projection reads as the default late-request phase. - pub fn new( - source_id: SegmentSourceId, - source: Arc, - cache: Arc, - ) -> Self { +impl CachedSegmentSource { + /// Create a cached source using projection reads as the default late-request phase. + pub fn new(source: Arc, cache: Arc) -> Self { Self { - source_id, source, cache, phase: ScanIoPhase::ProjectionRead, } } - /// Return a copy of this reader with a different phase for late segment requests. + /// Return a copy of this source with a different phase for late segment requests. 
pub fn with_phase(mut self, phase: ScanIoPhase) -> Self { self.phase = phase; self } } -impl SegmentSource for ScheduledSegmentSourceReader { +impl SegmentSource for CachedSegmentSource { + fn segment_info(&self, id: SegmentId) -> VortexResult { + self.source.segment_info(id) + } + fn request(&self, id: SegmentId) -> SegmentFuture { let info = match self.source.segment_info(id) { Ok(info) => info, @@ -639,80 +462,12 @@ impl SegmentSource for ScheduledSegmentSourceReader { self.cache .request_segment( self.source.as_ref(), - SegmentRequest::new(self.source_id, id, info, self.phase), + SegmentRequest::new(id, info, self.phase), ) .future } } -/// Source that accepts explicit scheduler-visible segment batches. -pub trait ScheduledSegmentSource: Send + Sync + 'static { - /// Return scheduler-visible metadata for a segment. - fn segment_info(&self, id: SegmentId) -> VortexResult; - - /// Return backend capabilities relevant to scheduling and batching. - fn capabilities(&self) -> SegmentSourceCapabilities { - SegmentSourceCapabilities::default() - } - - /// Submit a batch of segment requests to this source. - fn submit(&self, batch: SegmentBatch) -> Vec; -} - -/// Adapter that exposes an existing [`SegmentSource`] as a scheduled segment source. -pub struct ScheduledSegmentSourceAdapter { - source: Arc, - segments: Arc<[SegmentInfo]>, - capabilities: SegmentSourceCapabilities, -} - -impl ScheduledSegmentSourceAdapter { - /// Create a scheduled adapter over an existing segment source. - pub fn new(source: Arc, segments: Arc<[SegmentInfo]>) -> Self { - Self { - source, - segments, - capabilities: SegmentSourceCapabilities::default(), - } - } - - /// Return a copy of this adapter with explicit capabilities. - pub fn with_capabilities(mut self, capabilities: SegmentSourceCapabilities) -> Self { - self.capabilities = capabilities; - self - } - - /// Return the wrapped segment source. 
- pub fn source(&self) -> &Arc { - &self.source - } -} - -impl ScheduledSegmentSource for ScheduledSegmentSourceAdapter { - fn segment_info(&self, id: SegmentId) -> VortexResult { - let idx = usize::try_from(*id).map_err(|_| vortex_err!("segment id exceeds usize"))?; - self.segments - .get(idx) - .copied() - .ok_or_else(|| vortex_err!("missing segment: {}", id)) - } - - fn capabilities(&self) -> SegmentSourceCapabilities { - self.capabilities - } - - fn submit(&self, batch: SegmentBatch) -> Vec { - batch - .into_requests() - .into_iter() - .map(|request| { - let future = self.source.request(request.segment); - SegmentHandle::new(request, future) - }) - .collect() - } -} - #[cfg(test)] mod tests { use std::sync::atomic::AtomicUsize; @@ -725,28 +480,12 @@ mod tests { use vortex_buffer::ByteBuffer; use super::*; - use crate::ScanMeta; - use crate::ScanScheduler; - use crate::SegmentSourceMeta; - - struct TestSegmentSource; - - impl SegmentSource for TestSegmentSource { - fn request(&self, id: SegmentId) -> SegmentFuture { - async move { - let id = u8::try_from(*id).map_err(|_| vortex_err!("segment id exceeds u8"))?; - Ok(BufferHandle::new_host(ByteBuffer::from(vec![id]))) - } - .boxed() - } - } - - struct CountingScheduledSegmentSource { + struct CountingSegmentSource { info: SegmentInfo, submit_count: AtomicUsize, } - impl CountingScheduledSegmentSource { + impl CountingSegmentSource { fn new(info: SegmentInfo) -> Self { Self { info, @@ -759,12 +498,12 @@ mod tests { } } - struct BatchingScheduledSegmentSource { + struct CountingMissSegmentSource { info: SegmentInfo, batches: Mutex>, } - impl BatchingScheduledSegmentSource { + impl CountingMissSegmentSource { fn new(info: SegmentInfo) -> Self { Self { info, @@ -777,157 +516,78 @@ mod tests { } } - impl ScheduledSegmentSource for CountingScheduledSegmentSource { + impl SegmentSource for CountingSegmentSource { fn segment_info(&self, _id: SegmentId) -> VortexResult { Ok(self.info) } - fn submit(&self, batch: 
SegmentBatch) -> Vec { - self.submit_count.fetch_add(batch.len(), Ordering::Relaxed); - batch - .into_requests() - .into_iter() - .map(|request| { - let future = - async move { Ok(BufferHandle::new_host(ByteBuffer::from(vec![0]))) } - .boxed(); - SegmentHandle::new(request, future) - }) - .collect() + fn request(&self, _id: SegmentId) -> SegmentFuture { + self.submit_count.fetch_add(1, Ordering::Relaxed); + async move { Ok(BufferHandle::new_host(ByteBuffer::from(vec![0]))) }.boxed() } } - impl ScheduledSegmentSource for BatchingScheduledSegmentSource { + impl SegmentSource for CountingMissSegmentSource { fn segment_info(&self, _id: SegmentId) -> VortexResult { Ok(self.info) } - fn capabilities(&self) -> SegmentSourceCapabilities { - SegmentSourceCapabilities { - max_batch_len: Some(2), - max_batch_bytes: Some(16), - supports_cancellation: false, - } - } - - fn submit(&self, batch: SegmentBatch) -> Vec { - self.batches.lock().push(batch.len()); - batch - .into_requests() - .into_iter() - .map(|request| { - let future = - async move { Ok(BufferHandle::new_host(ByteBuffer::from(vec![0]))) } - .boxed(); - SegmentHandle::new(request, future) - }) - .collect() + fn request(&self, _id: SegmentId) -> SegmentFuture { + self.batches.lock().push(1); + async move { Ok(BufferHandle::new_host(ByteBuffer::from(vec![0]))) }.boxed() } } #[test] - fn adapter_reports_metadata_and_submits_handles() -> VortexResult<()> { - let adapter = Arc::new(ScheduledSegmentSourceAdapter::new( - Arc::new(TestSegmentSource), - vec![SegmentInfo::cacheable(4), SegmentInfo::cacheable(8)].into(), - )); - let scheduler = ScanScheduler::unbounded(); - let ticket = scheduler.register_scan(ScanMeta::default()); - let source_id = - ticket.register_segment_source(Arc::clone(&adapter), SegmentSourceMeta::default()); - - let scheduled: Arc = - Arc::::clone(&adapter); - let ctx = SegmentPlanCtx::new(source_id, scheduled, VortexSession::empty()) - .with_phase(ScanIoPhase::PredicateRead); - let request = 
ctx.request_for_segment(SegmentId::from(1))?; - - assert_eq!(request.bytes, 8); - assert_eq!(request.phase, ScanIoPhase::PredicateRead); - - let mut handles = adapter.submit(SegmentBatch::new(vec![request])); - let handle = handles - .pop() - .ok_or_else(|| vortex_err!("scheduled adapter did not return a handle"))?; - assert_eq!(handle.request.segment, SegmentId::from(1)); - assert_eq!(block_on(handle.future)?.as_host().len(), 1); - - Ok(()) - } - - #[test] - fn submit_segment_requests_dedupes_exact_segments() -> VortexResult<()> { - let scheduler = ScanScheduler::unbounded(); - let ticket = scheduler.register_scan(ScanMeta::default()); - let source = Arc::new(CountingScheduledSegmentSource::new(SegmentInfo::cacheable( - 8, - ))); - let source_id = - ticket.register_segment_source(Arc::clone(&source), SegmentSourceMeta::default()); - let scheduled: Arc = - Arc::::clone(&source); - let ctx = SegmentPlanCtx::new(source_id, scheduled, VortexSession::empty()); + fn register_segment_reads_dedupes_exact_segments() -> VortexResult<()> { + let source = Arc::new(CountingSegmentSource::new(SegmentInfo::cacheable(8))); + let segment_source: Arc = Arc::::clone(&source); + let ctx = SegmentPlanCtx::new(segment_source, VortexSession::empty()); let request = ctx.request_for_segment(SegmentId::from(0))?; - let submitted = submit_segment_requests( + let reads = SegmentFutureCache::new().register( source.as_ref(), SegmentRequests::exact(vec![request, request]), ); - assert_eq!(submitted.len(), 1); + assert_eq!(reads.len(), 1); assert_eq!(source.submit_count(), 1); Ok(()) } #[test] - fn submit_segment_requests_respects_batch_capabilities() -> VortexResult<()> { - let scheduler = ScanScheduler::unbounded(); - let ticket = scheduler.register_scan(ScanMeta::default()); - let source = Arc::new(BatchingScheduledSegmentSource::new(SegmentInfo::cacheable( - 8, - ))); - let source_id = - ticket.register_segment_source(Arc::clone(&source), SegmentSourceMeta::default()); - let scheduled: Arc = 
- Arc::::clone(&source); - let ctx = SegmentPlanCtx::new(source_id, scheduled, VortexSession::empty()); + fn register_segment_reads_registers_each_miss() -> VortexResult<()> { + let source = Arc::new(CountingMissSegmentSource::new(SegmentInfo::cacheable(8))); + let segment_source: Arc = + Arc::::clone(&source); + let ctx = SegmentPlanCtx::new(segment_source, VortexSession::empty()); let requests = (0..5) .map(|segment| ctx.request_for_segment(SegmentId::from(segment))) .collect::>>()?; - let submitted = submit_segment_requests(source.as_ref(), SegmentRequests::exact(requests)); + let reads = + SegmentFutureCache::new().register(source.as_ref(), SegmentRequests::exact(requests)); - assert_eq!(submitted.len(), 5); - assert_eq!(source.batches(), vec![2, 2, 1]); + assert_eq!(reads.len(), 5); + assert_eq!(source.batches(), vec![1, 1, 1, 1, 1]); Ok(()) } #[test] fn segment_future_cache_reuses_prefetched_segment() -> VortexResult<()> { - let scheduler = ScanScheduler::unbounded(); - let ticket = scheduler.register_scan(ScanMeta::default()); - let source = Arc::new(CountingScheduledSegmentSource::new(SegmentInfo::cacheable( - 8, - ))); - let source_id = - ticket.register_segment_source(Arc::clone(&source), SegmentSourceMeta::default()); - let scheduled: Arc = - Arc::::clone(&source); - let ctx = SegmentPlanCtx::new(source_id, Arc::clone(&scheduled), VortexSession::empty()); + let source = Arc::new(CountingSegmentSource::new(SegmentInfo::cacheable(8))); + let segment_source: Arc = Arc::::clone(&source); + let ctx = SegmentPlanCtx::new(Arc::clone(&segment_source), VortexSession::empty()); let request = ctx.request_for_segment(SegmentId::from(0))?; let cache = Arc::new(SegmentFutureCache::new()); - let submitted = submit_segment_requests_cached( - cache.as_ref(), - source.as_ref(), - SegmentRequests::exact(vec![request]), - ); - let reader = ScheduledSegmentSourceReader::new(source_id, scheduled, Arc::clone(&cache)); + let reads = cache.register(source.as_ref(), 
SegmentRequests::exact(vec![request])); + let reader = CachedSegmentSource::new(segment_source, Arc::clone(&cache)); let read = reader.request(SegmentId::from(0)); - assert_eq!(submitted.len(), 1); + assert_eq!(reads.len(), 1); assert_eq!(source.submit_count(), 1); assert_eq!(block_on(read)?.as_host().len(), 1); assert_eq!(source.submit_count(), 1); diff --git a/vortex-scan/src/segments/source.rs b/vortex-scan/src/segments/source.rs index 9ff8a901c93..df6c5569ba8 100644 --- a/vortex-scan/src/segments/source.rs +++ b/vortex-scan/src/segments/source.rs @@ -6,12 +6,16 @@ use vortex_array::buffer::BufferHandle; use vortex_error::VortexResult; use crate::segments::SegmentId; +use crate::segments::SegmentInfo; /// Static future resolving to a segment byte buffer. pub type SegmentFuture = BoxFuture<'static, VortexResult>; /// A trait for providing logical segment data to a scan plan. pub trait SegmentSource: 'static + Send + Sync { + /// Return scheduler-visible metadata for a segment. + fn segment_info(&self, id: SegmentId) -> VortexResult; + /// Request a segment, returning a future that will eventually resolve to the segment data. 
fn request(&self, id: SegmentId) -> SegmentFuture; } From 33433bc5b6672f41db7a96cfe7743f3e9669ebec Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sun, 21 Jun 2026 23:11:54 -0400 Subject: [PATCH 25/48] Port split planning to scan2 Signed-off-by: Nicholas Gates --- vortex-datafusion/src/persistent/opener.rs | 95 +++------ vortex-file/src/multi/scan_v2.rs | 236 +++++++++++++++------ vortex-layout/src/scan/v2/layouts/zoned.rs | 4 + 3 files changed, 212 insertions(+), 123 deletions(-) diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 561848b74d6..3f3da596a3f 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -39,10 +39,8 @@ use tracing::Instrument; use vortex::array::VortexSessionExecute; use vortex::array::arrow::ArrowSessionExt; use vortex::dtype::FieldMask; -use vortex::dtype::Nullability; use vortex::error::VortexError; use vortex::error::VortexExpect; -use vortex::expr::pack; use vortex::file::OpenOptionsSessionExt; use vortex::file::VortexFile; use vortex::io::InstrumentedReadAt; @@ -372,36 +370,20 @@ impl FileOpener for VortexOpener { if byte_range.start == 0 && byte_range.end == file.object_meta.size { None } else { - // Distribute the scan's own morsels across DataFusion's byte-range - // file_groups. The morsels are the units the scan actually reads (read-column - // chunk hints, or the 100k-row fallback for single-chunk columns), so each - // morsel lands wholly in one partition: no collapse onto a single partition - // (which serialized the probe), and no chunk straddling a partition boundary - // (which would re-decode it). V2 needs this because it parallelizes the - // scan/probe ACROSS DataFusion partitions, unlike V1 which fans out within - // one partition. 
- let read_expr = match &filter { - Some(filter) => pack( - [ - ("projection", scan_projection.clone()), - ("filter", filter.clone()), - ], - Nullability::NonNullable, - ), - None => scan_projection.clone(), - }; - let morsels = scan_plan_morsel_ranges_for_file( + // Distribute V2 natural split ranges across DataFusion's byte-range + // file_groups. V2 prepares morsels from these same split ranges, so each + // DataFusion partition owns whole V2 morsels instead of slicing through + // chunk boundaries and forcing duplicate decode work. + let split_ranges = scan_plan_split_ranges_for_file( natural_split_ranges.as_ref(), &file.object_meta.location, &vxf, - &read_expr, - ) - .await?; + )?; let Some(row_range) = split_aligned_row_range( byte_range, file.object_meta.size, - morsels.as_ref(), + split_ranges.as_ref(), ) else { return Ok(stream::empty().boxed()); }; @@ -680,50 +662,37 @@ fn compute_natural_split_ranges(layout_reader: &dyn LayoutReader) -> DFResult]>>, +/// Get or create V2 natural split ranges for a file. These ranges are produced by the file's V2 +/// ScanPlan root and cached per path so every byte-range partition sees the same row boundaries. +fn scan_plan_split_ranges_for_file( + split_ranges_cache: &DashMap]>>, path: &Path, file: &VortexFile, - read_expr: &vortex::expr::Expression, ) -> DFResult]>> { - if let Some(ranges) = morsel_ranges.get(path) { + if let Some(ranges) = split_ranges_cache.get(path) { return Ok(Arc::clone(ranges.value())); } - let chunks = file - .plan_splits(read_expr) - .await - .map_err(|e| exec_datafusion_err!("Failed to compute Vortex scan2 splits: {e}"))?; - - let ranges: Arc<[Range]> = if chunks.len() > 1 { - chunks.into() - } else { - // Single chunk (or none): mirror the scan's fallback of FALLBACK_SPLIT_SIZE-row morsels so - // a single large chunk still spreads across partitions. 
- let row_count = file.row_count(); - let mut ranges = Vec::new(); - let mut start = 0u64; - while start < row_count { - let end = start - .saturating_add(SCAN_FALLBACK_SPLIT_SIZE) - .min(row_count); - ranges.push(start..end); - start = end; - } - ranges.into() - }; - - match morsel_ranges.entry(path.clone()) { + let ranges: Arc<[Range]> = file + .scan_plan_splits() + .map_err(|e| exec_datafusion_err!("Failed to compute Vortex scan2 splits: {e}"))? + .into(); + tracing::debug!( + target: "vortex_datafusion::persistent::opener", + path = %path, + split_count = ranges.len(), + first_split = ?ranges.first(), + last_split = ?ranges.last(), + "scan2 file split ranges" + ); + tracing::trace!( + target: "vortex_datafusion::persistent::opener", + path = %path, + ?ranges, + "scan2 file split range detail" + ); + + match split_ranges_cache.entry(path.clone()) { Entry::Occupied(entry) => Ok(Arc::clone(entry.get())), Entry::Vacant(entry) => { entry.insert(Arc::clone(&ranges)); diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 25fee82cd3e..cd97957a671 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -108,7 +108,9 @@ use crate::VortexFile; use crate::VortexOpenOptions; const DEFAULT_CONCURRENCY: usize = 8; -const FALLBACK_SPLIT_SIZE: u64 = 100_000; +const IDEAL_SPLIT_SIZE: u64 = 100_000; +const MAX_SELECTION_RANGE_SIZE: u64 = IDEAL_SPLIT_SIZE / 25; +const MIN_SELECTION_GAP_BETWEEN_RANGES: u64 = IDEAL_SPLIT_SIZE / 2; const DEFAULT_EVIDENCE_MORSEL_WINDOW: usize = 8; /// Below this demanded-row density, evaluate a residual predicate over only the demanded rows @@ -970,7 +972,7 @@ pub(crate) async fn scan_plan_file_statistics_many( pub(crate) fn scan_plan_file_splits(file: &VortexFile) -> VortexResult>> { let root = file.scan_plan_root()?; - split_ranges_from_node(&root, file.row_count()) + Ok(split_ranges_from_node(&root, file.row_count())) } pub(crate) async fn scan_plan_file_plan_splits( @@ -989,25 
+991,13 @@ pub(crate) async fn scan_plan_file_plan_splits( .await } -fn split_ranges_from_node(node: &ScanPlanRef, row_count: u64) -> VortexResult>> { - let mut points = vec![0, row_count]; +fn split_ranges_from_node(node: &ScanPlanRef, row_count: u64) -> Vec> { + let mut points = Vec::new(); if let Some(hints) = node.split_hints() { - points.extend( - hints - .iter() - .copied() - .filter(|&hint| 0 < hint && hint < row_count), - ); + points.extend_from_slice(hints); } - points.sort_unstable(); - points.dedup(); - Ok(points - .windows(2) - .filter_map(|window| { - let range = window[0]..window[1]; - (range.start < range.end).then_some(range) - }) - .collect()) + let points = normalize_split_points(row_count, points); + natural_split_ranges(&points, None) } pub(crate) fn build_file_scan_plan_root(file: &VortexFile) -> VortexResult { @@ -1786,11 +1776,10 @@ impl Partition for PlannedScanPlanPartition { struct PreparedScanPlan { // Request-level physical plan after pushdown. This must stay free of per-scan IO state. 
dtype: DType, - row_range: Range, selection: Selection, ordered: bool, limit: Option, - split_hints: Option>, + splits: Vec>, projection: ScanPlanRef, predicates: Vec, } @@ -1874,15 +1863,20 @@ impl PreparedScanPlan { }) .collect::>>()?; + let row_range = request + .row_range + .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; + let selection = request.selection; + let (splits, split_kind) = + prepare_split_ranges(file.row_count(), &row_range, &selection, split_hints); + trace_prepared_splits(&row_range, &splits, split_kind, filter.is_some()); + Ok(Self { dtype, - row_range: request - .row_range - .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?, - selection: request.selection, + selection, ordered: request.ordered, limit: request.limit, - split_hints: normalize_split_hints(split_hints), + splits, projection: projection_pushed, predicates, }) @@ -1917,38 +1911,7 @@ impl PreparedScanPlan { } fn splits(&self) -> VortexResult>> { - let mut points = vec![self.row_range.start]; - if let Some(hints) = &self.split_hints { - points.extend( - hints - .iter() - .copied() - .filter(|&hint| self.row_range.start < hint && hint < self.row_range.end), - ); - } - if points.len() == 1 { - let mut next = self - .row_range - .start - .saturating_add(FALLBACK_SPLIT_SIZE) - .min(self.row_range.end); - while next < self.row_range.end { - points.push(next); - next = next - .saturating_add(FALLBACK_SPLIT_SIZE) - .min(self.row_range.end); - } - } - points.push(self.row_range.end); - points.sort_unstable(); - points.dedup(); - Ok(points - .windows(2) - .filter_map(|window| { - let range = window[0]..window[1]; - (range.start < range.end).then_some(range) - }) - .collect()) + Ok(self.splits.clone()) } } @@ -2272,10 +2235,163 @@ fn extend_split_hints(plan: &ScanPlanRef, points: &mut Vec) { } } -fn normalize_split_hints(mut hints: Vec) -> Option> { +#[derive(Clone, Copy, Debug)] +enum PreparedSplitKind { + SelectionRanges, + Natural, +} + +fn 
prepare_split_ranges( + row_count: u64, + row_range: &Range, + selection: &Selection, + split_hints: Vec, +) -> (Vec>, PreparedSplitKind) { + let explicit_row_range = explicit_row_range(row_count, row_range); + if let Some(ranges) = selection_split_ranges(selection, explicit_row_range) { + return (ranges, PreparedSplitKind::SelectionRanges); + } + + let file_range = 0..row_count; + let selection_range = intersect_ranges(Some(&file_range), selection_bounding_range(selection)); + let bounded_range = intersect_ranges(explicit_row_range, selection_range); + let points = normalize_split_points(row_count, split_hints); + ( + natural_split_ranges(&points, bounded_range.as_ref()), + PreparedSplitKind::Natural, + ) +} + +fn explicit_row_range<'a>(row_count: u64, row_range: &'a Range) -> Option<&'a Range> { + (row_range.start != 0 || row_range.end != row_count).then_some(row_range) +} + +fn selection_split_ranges( + selection: &Selection, + row_range: Option<&Range>, +) -> Option>> { + let Selection::IncludeByIndex(buffer) = selection else { + return None; + }; + if row_range.is_some() { + return None; + } + + let indices = buffer.as_slice(); + if indices.is_empty() { + return Some(Vec::new()); + } + debug_assert!(indices.is_sorted()); + + let mut ranges = Vec::with_capacity((indices.len() as u64 / MAX_SELECTION_RANGE_SIZE) as usize); + let mut curr_start = indices[0]; + let mut curr_end = indices[0].saturating_add(1); + for &idx in &indices[1..] 
{ + let idx_end = idx.saturating_add(1); + let new_range_size = idx_end.saturating_sub(curr_start); + let gap = idx_end.saturating_sub(curr_end); + if new_range_size >= MAX_SELECTION_RANGE_SIZE { + if gap >= MIN_SELECTION_GAP_BETWEEN_RANGES { + ranges.push(curr_start..curr_end); + curr_start = idx; + curr_end = idx_end; + } else { + return None; + } + } else { + curr_end = idx_end; + } + } + ranges.push(curr_start..curr_end); + Some(ranges) +} + +fn selection_bounding_range(selection: &Selection) -> Option> { + match selection { + Selection::IncludeByIndex(buffer) => { + let indices = buffer.as_slice(); + indices + .first() + .zip(indices.last()) + .map(|(&first, &last)| first..last.saturating_add(1)) + } + Selection::IncludeRoaring(roaring) if !roaring.is_empty() => { + Some(roaring.min()?..roaring.max()?.saturating_add(1)) + } + _ => None, + } +} + +fn intersect_ranges(left: Option<&Range>, right: Option>) -> Option> { + match (left, right) { + (Some(left), Some(right)) => Some(left.start.max(right.start)..left.end.min(right.end)), + (Some(left), None) => Some(left.clone()), + (None, Some(right)) => Some(right), + (None, None) => None, + } +} + +fn normalize_split_points(row_count: u64, mut hints: Vec) -> Vec { + hints.push(0); + hints.push(row_count); + hints.retain(|&hint| hint <= row_count); hints.sort_unstable(); hints.dedup(); - (!hints.is_empty()).then_some(hints) + hints +} + +fn natural_split_ranges(split_points: &[u64], row_range: Option<&Range>) -> Vec> { + let points = if let Some(row_range) = row_range { + if row_range.start >= row_range.end { + return Vec::new(); + } + let mut points = Vec::new(); + points.push(row_range.start); + points.extend( + split_points + .iter() + .copied() + .filter(|&point| row_range.start < point && point < row_range.end), + ); + points.push(row_range.end); + points.sort_unstable(); + points.dedup(); + points + } else { + split_points.to_vec() + }; + + points + .windows(2) + .filter_map(|window| { + let range = 
window[0]..window[1]; + (range.start < range.end).then_some(range) + }) + .collect() +} + +fn trace_prepared_splits( + row_range: &Range, + splits: &[Range], + split_kind: PreparedSplitKind, + has_filter: bool, +) { + tracing::debug!( + target: "vortex_file::scan_v2", + ?split_kind, + split_count = splits.len(), + row_start = row_range.start, + row_end = row_range.end, + first_split = ?splits.first(), + last_split = ?splits.last(), + has_filter, + "prepared scan2 splits" + ); + tracing::trace!( + target: "vortex_file::scan_v2", + ?splits, + "prepared scan2 split ranges" + ); } fn check_range(range: &Range, row_count: u64) -> VortexResult<()> { diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 31a50abe1b2..06f139526cb 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -928,6 +928,10 @@ impl ScanPlan for ZonedExprScanPlan { self.data.release(frontier, state.data.as_ref()) } + fn split_hints(&self) -> Option<&[u64]> { + self.data.split_hints() + } + fn fmt_chain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "zoned_expr({})", self.expr) } From 0b6e79eda96e34c447d4f0bacca33fbb0f769506 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 22 Jun 2026 00:40:20 -0400 Subject: [PATCH 26/48] Improve V2 scan partition scheduling Signed-off-by: Nicholas Gates --- benchmarks/datafusion-bench/src/lib.rs | 16 +- vortex-datafusion/src/persistent/opener.rs | 129 ++++--- vortex-file/src/file.rs | 9 + vortex-file/src/multi/scan_v2.rs | 386 +++++++++++++++------ vortex-scan/src/scheduler.rs | 17 + 5 files changed, 395 insertions(+), 162 deletions(-) diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index 9de41eaebce..ce588f31ddb 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -145,7 +145,7 @@ fn vortex_session_from_env() -> anyhow::Result { } fn 
scan_scheduler_config_from_env() -> anyhow::Result { - Ok(std::env::var("VORTEX_SCAN_MAX_MORSEL_SLOTS") + let config = std::env::var("VORTEX_SCAN_MAX_MORSEL_SLOTS") .ok() .map(|value| { value @@ -154,7 +154,19 @@ fn scan_scheduler_config_from_env() -> anyhow::Result { .map_err(|e| anyhow::anyhow!("invalid scan scheduler slot count {value}: {e}")) }) .transpose()? - .unwrap_or_else(ScanSchedulerConfig::default_morsel_slots)) + .unwrap_or_else(ScanSchedulerConfig::default_morsel_slots); + + Ok(std::env::var("VORTEX_SCAN_MAX_READ_BYTES") + .ok() + .map(|value| { + value + .parse::() + .map_err(|e| anyhow::anyhow!("invalid scan scheduler byte budget {value}: {e}")) + }) + .transpose()? + .map_or(config.clone(), |bytes| { + config.with_read_byte_budget(Some(bytes)) + })) } fn vortex_table_options() -> VortexTableOptions { diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 3f3da596a3f..b190d1cbb42 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -370,20 +370,15 @@ impl FileOpener for VortexOpener { if byte_range.start == 0 && byte_range.end == file.object_meta.size { None } else { - // Distribute V2 natural split ranges across DataFusion's byte-range - // file_groups. V2 prepares morsels from these same split ranges, so each - // DataFusion partition owns whole V2 morsels instead of slicing through - // chunk boundaries and forcing duplicate decode work. - let split_ranges = scan_plan_split_ranges_for_file( - natural_split_ranges.as_ref(), - &file.object_meta.location, - &vxf, - )?; - - let Some(row_range) = split_aligned_row_range( + // DataFusion partitions a single file by byte ranges. V2 may expose only + // coarse top-level split hints, so assigning whole natural splits here can + // collapse many byte ranges into a few row ranges. 
Slice proportionally by + // row count; the V2 scan plan will still split the resulting row range into + // layout-aware morsels during preparation. + let Some(row_range) = byte_range_to_row_range( byte_range, file.object_meta.size, - split_ranges.as_ref(), + vxf.row_count(), ) else { return Ok(stream::empty().boxed()); }; @@ -662,43 +657,34 @@ fn compute_natural_split_ranges(layout_reader: &dyn LayoutReader) -> DFResult]>>, - path: &Path, - file: &VortexFile, -) -> DFResult]>> { - if let Some(ranges) = split_ranges_cache.get(path) { - return Ok(Arc::clone(ranges.value())); +fn byte_range_to_row_range( + byte_range: Range, + total_size: u64, + row_count: u64, +) -> Option> { + if byte_range.start >= byte_range.end || total_size == 0 || row_count == 0 { + return None; } - let ranges: Arc<[Range]> = file - .scan_plan_splits() - .map_err(|e| exec_datafusion_err!("Failed to compute Vortex scan2 splits: {e}"))? - .into(); - tracing::debug!( - target: "vortex_datafusion::persistent::opener", - path = %path, - split_count = ranges.len(), - first_split = ?ranges.first(), - last_split = ?ranges.last(), - "scan2 file split ranges" - ); - tracing::trace!( - target: "vortex_datafusion::persistent::opener", - path = %path, - ?ranges, - "scan2 file split range detail" - ); - - match split_ranges_cache.entry(path.clone()) { - Entry::Occupied(entry) => Ok(Arc::clone(entry.get())), - Entry::Vacant(entry) => { - entry.insert(Arc::clone(&ranges)); - Ok(ranges) - } + let start_byte = byte_range.start.min(total_size); + let end_byte = byte_range.end.min(total_size); + if start_byte >= end_byte { + return None; } + + let start = byte_to_row(start_byte, total_size, row_count); + let end = if end_byte == total_size { + row_count + } else { + byte_to_row(end_byte, total_size, row_count) + }; + + (start < end).then_some(start..end) +} + +fn byte_to_row(byte: u64, total_size: u64, row_count: u64) -> u64 { + let row = (u128::from(byte) * u128::from(row_count)) / u128::from(total_size); + 
u64::try_from(row).vortex_expect("byte-to-row projection should fit into u64") } /// Translate a DataFusion byte range to the contiguous natural split ranges it owns. @@ -807,6 +793,57 @@ mod tests { static SESSION: LazyLock = LazyLock::new(VortexSession::default); + #[rstest] + #[case(0..10, 100, 50, Some(0..5))] + #[case(10..20, 100, 50, Some(5..10))] + #[case(90..100, 100, 50, Some(45..50))] + #[case(100..110, 100, 50, None)] + #[case(0..1, 100, 50, None)] + fn test_byte_range_to_row_range( + #[case] byte_range: Range, + #[case] total_size: u64, + #[case] row_count: u64, + #[case] expected: Option>, + ) { + assert_eq!( + byte_range_to_row_range(byte_range, total_size, row_count), + expected + ); + } + + #[test] + fn test_byte_ranges_cover_rows_exactly_once() { + let total_size = 179_114_706; + let row_count = 6_001_215; + let partitions = 18; + let byte_ranges = (0..partitions) + .map(|idx| { + let start = idx * total_size / partitions; + let end = (idx + 1) * total_size / partitions; + start..end + }) + .collect::>(); + + let row_ranges = byte_ranges + .into_iter() + .filter_map(|byte_range| byte_range_to_row_range(byte_range, total_size, row_count)) + .collect::>(); + + assert_eq!(row_ranges.len(), partitions as usize); + assert_eq!(row_ranges.first().map(|range| range.start), Some(0)); + assert_eq!(row_ranges.last().map(|range| range.end), Some(row_count)); + assert_eq!( + row_ranges + .iter() + .map(|range| range.end - range.start) + .sum::(), + row_count + ); + for (left, right) in row_ranges.iter().tuple_windows() { + assert_eq!(left.end, right.start); + } + } + #[rstest] #[case(0..3, 10, vec![0..2, 2..5, 5..10], Some(0..2))] #[case(3..7, 10, vec![0..2, 2..5, 5..10], Some(2..5))] diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index 6f1483e9f88..4b7ae04b664 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -36,6 +36,7 @@ use vortex_session::VortexSession; use crate::FileStatistics; use crate::footer::Footer; use 
crate::multi::scan_v2; +use crate::multi::scan_v2::PreparedScanPlanCache; use crate::pruning::can_prune_file_stats; use crate::v2::FileStatsLayoutReader; @@ -60,6 +61,8 @@ pub struct VortexFile { scan_plan_state_cache: PreparedStateCacheRef, /// Shared cache for v2 in-flight segment futures across row-range scans of this file. scan_plan_segment_future_cache: Arc, + /// Shared cache for v2 request-level prepared plans across row-range scans of this file. + scan_plan_prepared_cache: Arc, } fn layout_reader( @@ -98,6 +101,7 @@ impl VortexFile { scan_plan_root_cache: Arc::new(OnceLock::new()), scan_plan_state_cache: Arc::new(PreparedStateCache::default()), scan_plan_segment_future_cache: Arc::new(SegmentFutureCache::new()), + scan_plan_prepared_cache: Arc::new(PreparedScanPlanCache::default()), } } @@ -111,6 +115,7 @@ impl VortexFile { scan_plan_root_cache: self.scan_plan_root_cache, scan_plan_state_cache: self.scan_plan_state_cache, scan_plan_segment_future_cache: self.scan_plan_segment_future_cache, + scan_plan_prepared_cache: self.scan_plan_prepared_cache, } } @@ -201,6 +206,10 @@ impl VortexFile { Arc::clone(&self.scan_plan_segment_future_cache) } + pub(crate) fn scan_plan_prepared_cache(&self) -> Arc { + Arc::clone(&self.scan_plan_prepared_cache) + } + /// Create a [`DataSource`](vortex_scan::DataSource) from this file for scanning. 
/// /// Wraps the file's layout reader with [`FileStatsLayoutReader`] (when file-level diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index cd97957a671..7d317a4e1b6 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -5,6 +5,8 @@ use std::any::Any; use std::collections::BTreeMap; +use std::collections::HashMap; +use std::collections::HashSet; use std::collections::VecDeque; use std::fmt; use std::ops::Range; @@ -19,6 +21,7 @@ use futures::TryStreamExt; use futures::future::BoxFuture; use futures::stream; use futures::stream::FuturesUnordered; +use parking_lot::Mutex; use tracing::Instrument; use vortex_array::ArrayRef; use vortex_array::VortexSessionExecute; @@ -64,7 +67,6 @@ use vortex_scan::ScanRequest as DataSourceScanRequest; use vortex_scan::ScanScheduler; use vortex_scan::ScanSchedulerSessionExt; use vortex_scan::ScanTicket; -use vortex_scan::WorkRequest; use vortex_scan::plan::OwnedRowScope; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedAggregateRef; @@ -93,6 +95,7 @@ use vortex_scan::segments::ScanIoPhase; use vortex_scan::segments::ScanRead; use vortex_scan::segments::SegmentFutureCache; use vortex_scan::segments::SegmentPlanCtx; +use vortex_scan::segments::SegmentRequestKey; use vortex_scan::segments::SegmentRequests; use vortex_scan::segments::SegmentSource; use vortex_scan::segments::register_segment_reads_cached; @@ -111,8 +114,6 @@ const DEFAULT_CONCURRENCY: usize = 8; const IDEAL_SPLIT_SIZE: u64 = 100_000; const MAX_SELECTION_RANGE_SIZE: u64 = IDEAL_SPLIT_SIZE / 25; const MIN_SELECTION_GAP_BETWEEN_RANGES: u64 = IDEAL_SPLIT_SIZE / 2; -const DEFAULT_EVIDENCE_MORSEL_WINDOW: usize = 8; - /// Below this demanded-row density, evaluate a residual predicate over only the demanded rows /// (filter-first) rather than the whole morsel. Mirrors the V1 flat-reader threshold. 
const EXPR_EVAL_THRESHOLD: f64 = 0.2; @@ -659,19 +660,23 @@ impl DataSource for ScanPlanDataSource { let mut planned_files = Vec::new(); let mut total_morsels = 0usize; - let mut has_runtime_evidence = false; for (partition_idx, file) in self.open_files(false).await? { let Some(request) = file_scan_request(partition_idx, &file, scan_request.clone())? else { continue; }; - let prepared = Arc::new(PreparedScanPlan::try_new(&file, request)?); + let row_range = request + .row_range + .clone() + .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; + let prepared = file + .scan_plan_prepared_cache() + .get_or_prepare(&file, &request)?; let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket)?); - let ranges = execution.splits()?; + let ranges = execution.splits(&row_range)?; if ranges.is_empty() { continue; } - has_runtime_evidence |= execution.has_runtime_evidence(); total_morsels = total_morsels.saturating_add(ranges.len()); planned_files.push((execution, ranges)); } @@ -693,9 +698,8 @@ impl DataSource for ScanPlanDataSource { } } - let default_window = get_available_parallelism().unwrap_or(1).saturating_mul(4); - let (morsel_plan_window, morsel_launch_window) = - morsel_windows(&scheduler, false, has_runtime_evidence, default_window); + let morsel_plan_window = morsel_plan_window(&scheduler, false); + let read_byte_budget = read_byte_budget(&scheduler); Ok(Some(Arc::new(PlannedScanPlanScan { dtype, @@ -703,7 +707,7 @@ impl DataSource for ScanPlanDataSource { scheduler, ticket, morsel_plan_window, - morsel_launch_window, + read_byte_budget, }))) } @@ -1064,7 +1068,7 @@ fn file_scan_request( struct Work { phase: ScanIoPhase, - known_bytes: u64, + reads: Vec, handle: Handle, future: BoxFuture<'static, VortexResult>, } @@ -1073,12 +1077,12 @@ impl Work { fn new( phase: ScanIoPhase, handle: Handle, - known_bytes: u64, + reads: Vec, future: BoxFuture<'static, VortexResult>, ) -> Self { Self { phase, - known_bytes, + reads, handle, future, } @@ 
-1092,21 +1096,37 @@ impl Work { QueuedWork { morsel_id, phase: self.phase, - known_bytes: self.known_bytes, + reads: self.reads, handle: self.handle, future: async move { self.future.await.map(map) }.boxed(), } } } +#[derive(Clone, Copy)] +struct WorkRead { + key: SegmentRequestKey, + bytes: u64, +} + struct QueuedWork { morsel_id: usize, phase: ScanIoPhase, - known_bytes: u64, + reads: Vec, handle: Handle, future: BoxFuture<'static, VortexResult>, } +struct ActiveRead { + bytes: u64, + refs: usize, +} + +struct LaunchedWorkOutput { + reads: Vec, + output: VortexResult, +} + struct EvidenceWorkOutput { morsel_id: usize, predicate_idx: usize, @@ -1143,6 +1163,50 @@ struct MorselState { next_recheck_predicate: usize, } +#[derive(Default)] +pub(crate) struct PreparedScanPlanCache { + plans: Mutex>>, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +struct PreparedScanPlanKey { + projection: Expression, + filter: Option, + ordered: bool, + limit: Option, +} + +impl PreparedScanPlanKey { + fn try_new(request: &DataSourceScanRequest) -> Option { + matches!(request.selection, Selection::All).then(|| Self { + projection: request.projection.clone(), + filter: request.filter.clone(), + ordered: request.ordered, + limit: request.limit, + }) + } +} + +impl PreparedScanPlanCache { + fn get_or_prepare( + &self, + file: &VortexFile, + request: &DataSourceScanRequest, + ) -> VortexResult> { + let Some(key) = PreparedScanPlanKey::try_new(request) else { + return PreparedScanPlan::try_new(file, request).map(Arc::new); + }; + + if let Some(plan) = self.plans.lock().get(&key) { + return Ok(Arc::clone(plan)); + } + + let plan = Arc::new(PreparedScanPlan::try_new(file, request)?); + let mut plans = self.plans.lock(); + Ok(Arc::clone(plans.entry(key).or_insert(plan))) + } +} + struct PartitionWorkSchedulerState { pending: VecDeque, morsels: Vec>, @@ -1152,13 +1216,13 @@ struct PartitionWorkSchedulerState { evidence_queue: VecDeque, predicate_queue: VecDeque, projection_queue: 
VecDeque, - in_flight: FuturesUnordered>>, + in_flight: FuturesUnordered>, completed_morsels: BTreeMap, - scheduler: Arc, - ticket: ScanTicket, ordered: bool, plan_window: usize, - launch_window: usize, + read_byte_budget: u64, + active_read_bytes: u64, + active_reads: HashMap, phase_cursor: usize, } @@ -1169,41 +1233,28 @@ const WEIGHTED_PHASES: &[ScanIoPhase] = &[ ScanIoPhase::ProjectionRead, ]; -fn morsel_windows( - scheduler: &ScanScheduler, - limited: bool, - has_runtime_evidence: bool, - default_window: usize, -) -> (usize, usize) { +fn morsel_plan_window(scheduler: &ScanScheduler, limited: bool) -> usize { if limited { - return (1, 1); + return 1; } - let launch_window = scheduler - .config() - .morsel_launch_window() - .unwrap_or_else(|| { - if has_runtime_evidence { - default_window.min(DEFAULT_EVIDENCE_MORSEL_WINDOW) - } else { - default_window - } - }) - .max(1); - let plan_window = scheduler + + scheduler .config() .morsel_plan_window() - .map(|window| window.max(launch_window).max(1)) - .unwrap_or(usize::MAX); - (plan_window, launch_window) + .unwrap_or(usize::MAX) +} + +fn read_byte_budget(scheduler: &ScanScheduler) -> u64 { + scheduler.config().read_byte_budget().unwrap_or(u64::MAX) } fn partition_work_stream( morsels: Vec, - scheduler: Arc, - ticket: ScanTicket, + _scheduler: Arc, + _ticket: ScanTicket, ordered: bool, plan_window: usize, - launch_window: usize, + read_byte_budget: u64, ) -> impl futures::Stream> + Send + 'static { let state = PartitionWorkSchedulerState { pending: VecDeque::from(morsels), @@ -1216,11 +1267,11 @@ fn partition_work_stream( projection_queue: VecDeque::new(), in_flight: FuturesUnordered::new(), completed_morsels: BTreeMap::new(), - scheduler, - ticket, ordered, plan_window, - launch_window, + read_byte_budget, + active_read_bytes: 0, + active_reads: HashMap::new(), phase_cursor: 0, }; @@ -1237,10 +1288,7 @@ fn partition_work_stream( } } - while state.in_flight.len() < state.launch_window { - let Some(work) = 
state.pop_next_work() else { - break; - }; + while let Some(work) = state.pop_next_admissible_work() { state.launch(work); } @@ -1261,12 +1309,14 @@ fn partition_work_stream( } match state.in_flight.next().await { - Some(Ok(output)) => match state.complete_work(output) { - Ok(Some(array)) => return Some((Ok(array), state)), - Ok(None) => continue, - Err(error) => return Some((Err(error), state)), - }, - Some(Err(error)) => return Some((Err(error), state)), + Some(output) => { + state.release_reads(&output.reads); + match output.output.and_then(|output| state.complete_work(output)) { + Ok(Some(array)) => return Some((Ok(array), state)), + Ok(None) => continue, + Err(error) => return Some((Err(error), state)), + } + } None if state.is_done() => return None, None => continue, } @@ -1274,6 +1324,40 @@ fn partition_work_stream( }) } +fn can_admit_work( + active_reads: &HashMap, + active_read_bytes: u64, + read_byte_budget: u64, + in_flight_empty: bool, + work: &QueuedWork, +) -> bool { + let incremental = incremental_read_bytes(active_reads, &work.reads); + incremental == 0 + || active_read_bytes.saturating_add(incremental) <= read_byte_budget + || in_flight_empty +} + +fn incremental_read_bytes( + active_reads: &HashMap, + reads: &[WorkRead], +) -> u64 { + let mut seen = HashSet::new(); + reads + .iter() + .filter(|read| seen.insert(read.key) && !active_reads.contains_key(&read.key)) + .map(|read| read.bytes) + .sum() +} + +fn read_bytes(reads: &[WorkRead]) -> u64 { + let mut seen = HashSet::new(); + reads + .iter() + .filter(|read| seen.insert(read.key)) + .map(|read| read.bytes) + .sum() +} + impl PartitionWorkSchedulerState { fn clear(&mut self) { self.pending.clear(); @@ -1285,6 +1369,8 @@ impl PartitionWorkSchedulerState { self.projection_queue.clear(); self.in_flight = FuturesUnordered::new(); self.completed_morsels.clear(); + self.active_read_bytes = 0; + self.active_reads.clear(); } fn is_done(&self) -> bool { @@ -1321,11 +1407,11 @@ impl 
PartitionWorkSchedulerState { Ok(()) } - fn pop_next_work(&mut self) -> Option { + fn pop_next_admissible_work(&mut self) -> Option { for _ in 0..WEIGHTED_PHASES.len() { let phase = WEIGHTED_PHASES[self.phase_cursor % WEIGHTED_PHASES.len()]; self.phase_cursor = self.phase_cursor.wrapping_add(1); - if let Some(work) = self.pop_phase_work(phase) { + if let Some(work) = self.pop_phase_admissible_work(phase) { return Some(work); } } @@ -1335,48 +1421,112 @@ impl PartitionWorkSchedulerState { ScanIoPhase::ProjectionRead, ] .into_iter() - .find_map(|phase| self.pop_phase_work(phase)) + .find_map(|phase| self.pop_phase_admissible_work(phase)) } - fn pop_phase_work(&mut self, phase: ScanIoPhase) -> Option { + fn pop_phase_admissible_work(&mut self, phase: ScanIoPhase) -> Option { + let active_reads = &self.active_reads; + let active_read_bytes = self.active_read_bytes; + let read_byte_budget = self.read_byte_budget; + let in_flight_empty = self.in_flight.is_empty(); + let morsels = &self.morsels; let queue = match phase { ScanIoPhase::EvidenceProbe | ScanIoPhase::EvidenceSetup => &mut self.evidence_queue, ScanIoPhase::PredicateRead => &mut self.predicate_queue, ScanIoPhase::ProjectionRead | ScanIoPhase::AggregateRead => &mut self.projection_queue, }; - while let Some(work) = queue.pop_front() { - if self - .morsels + let len = queue.len(); + for _ in 0..len { + let Some(work) = queue.pop_front() else { + break; + }; + if morsels .get(work.morsel_id) .and_then(Option::as_ref) - .is_some() + .is_none() { + continue; + } + if can_admit_work( + active_reads, + active_read_bytes, + read_byte_budget, + in_flight_empty, + &work, + ) { return Some(work); } + queue.push_back(work); } None } fn launch(&mut self, work: QueuedWork) { - let scheduler = Arc::clone(&self.scheduler); - let ticket = self.ticket.clone(); + let reads = self.admit_reads(&work.reads); + let phase = work.phase; + let bytes = read_bytes(&reads); self.in_flight.push( work.handle .spawn( async move { - let _permit 
= scheduler.acquire(&ticket, WorkRequest::morsel()).await?; - work.future.await + let output = work.future.await; + LaunchedWorkOutput { reads, output } } .instrument(tracing::trace_span!( "scan2_work", - phase = ?work.phase, - known_bytes = work.known_bytes, + phase = ?phase, + read_bytes = bytes, )), ) .boxed(), ); } + fn admit_reads(&mut self, reads: &[WorkRead]) -> Vec { + let mut admitted = Vec::with_capacity(reads.len()); + let mut seen = HashSet::new(); + for read in reads { + if !seen.insert(read.key) { + continue; + } + match self.active_reads.entry(read.key) { + std::collections::hash_map::Entry::Occupied(mut entry) => { + let active = entry.get_mut(); + active.refs = active.refs.saturating_add(1); + } + std::collections::hash_map::Entry::Vacant(entry) => { + self.active_read_bytes = self.active_read_bytes.saturating_add(read.bytes); + entry.insert(ActiveRead { + bytes: read.bytes, + refs: 1, + }); + } + } + admitted.push(*read); + } + admitted + } + + fn release_reads(&mut self, reads: &[WorkRead]) { + let mut seen = HashSet::new(); + for read in reads { + if !seen.insert(read.key) { + continue; + } + let std::collections::hash_map::Entry::Occupied(mut entry) = + self.active_reads.entry(read.key) + else { + continue; + }; + if entry.get().refs > 1 { + entry.get_mut().refs -= 1; + } else { + let active = entry.remove(); + self.active_read_bytes = self.active_read_bytes.saturating_sub(active.bytes); + } + } + } + fn complete_work(&mut self, output: WorkOutput) -> VortexResult> { match output { WorkOutput::Evidence(output) => self.complete_evidence(output), @@ -1642,18 +1792,19 @@ impl Partition for ScanPlanPartition { ticket, } = *self; - let prepared = Arc::new(PreparedScanPlan::try_new(&file, request)?); + let row_range = request + .row_range + .clone() + .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; + let prepared = file + .scan_plan_prepared_cache() + .get_or_prepare(&file, &request)?; let execution = 
Arc::new(ScanExecution::try_new(file, prepared, &ticket)?); let dtype = execution.plan.dtype().clone(); - let ranges = execution.splits()?; + let ranges = execution.splits(&row_range)?; let ordered = execution.plan.ordered(); - let default_window = get_available_parallelism().unwrap_or(1) * 4; - let (plan_window, launch_window) = morsel_windows( - &scheduler, - execution.limit_remaining.is_some(), - execution.has_runtime_evidence(), - default_window, - ); + let plan_window = morsel_plan_window(&scheduler, execution.limit_remaining.is_some()); + let read_byte_budget = read_byte_budget(&scheduler); let morsels = ranges .into_iter() .map(|range| PlannedScanPlanMorsel { @@ -1668,7 +1819,7 @@ impl Partition for ScanPlanPartition { ticket, ordered, plan_window, - launch_window, + read_byte_budget, ); Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( @@ -1683,7 +1834,7 @@ struct PlannedScanPlanScan { scheduler: Arc, ticket: ScanTicket, morsel_plan_window: usize, - morsel_launch_window: usize, + read_byte_budget: u64, } #[derive(Clone)] @@ -1764,7 +1915,7 @@ impl Partition for PlannedScanPlanPartition { ticket, false, planned.morsel_plan_window, - planned.morsel_launch_window, + planned.read_byte_budget, ); Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( @@ -1779,7 +1930,8 @@ struct PreparedScanPlan { selection: Selection, ordered: bool, limit: Option, - splits: Vec>, + row_count: u64, + split_hints: Vec, projection: ScanPlanRef, predicates: Vec, } @@ -1826,12 +1978,13 @@ impl ExecutionPredicate { } impl PreparedScanPlan { - fn try_new(file: &VortexFile, request: DataSourceScanRequest) -> VortexResult { + fn try_new(file: &VortexFile, request: &DataSourceScanRequest) -> VortexResult { let session = file.session().clone(); let dtype = request.projection.return_dtype(file.dtype())?; let projection = request.projection.optimize_recursive(file.dtype())?; let filter = request .filter + .clone() .map(|filter| filter.optimize_recursive(file.dtype())) .transpose()?; @@ -1863,20 
+2016,13 @@ impl PreparedScanPlan { }) .collect::>>()?; - let row_range = request - .row_range - .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; - let selection = request.selection; - let (splits, split_kind) = - prepare_split_ranges(file.row_count(), &row_range, &selection, split_hints); - trace_prepared_splits(&row_range, &splits, split_kind, filter.is_some()); - Ok(Self { dtype, - selection, + selection: request.selection.clone(), ordered: request.ordered, limit: request.limit, - splits, + row_count: file.row_count(), + split_hints, projection: projection_pushed, predicates, }) @@ -1910,8 +2056,16 @@ impl PreparedScanPlan { &self.projection } - fn splits(&self) -> VortexResult>> { - Ok(self.splits.clone()) + fn splits(&self, row_range: &Range) -> VortexResult>> { + check_range(row_range, self.row_count)?; + let (splits, split_kind) = prepare_split_ranges( + self.row_count, + row_range, + &self.selection, + self.split_hints.clone(), + ); + trace_prepared_splits(row_range, &splits, split_kind, self.has_filter()); + Ok(splits) } } @@ -1985,14 +2139,18 @@ impl ScanExecution { ) } - fn known_read_bytes(reads: &[ScanRead]) -> u64 { - reads.iter().map(|read| read.request.bytes).sum() - } - - fn has_runtime_evidence(&self) -> bool { - self.predicates + fn work_reads(reads: &[ScanRead]) -> Vec { + let mut seen = HashSet::new(); + reads .iter() - .any(|predicate| !predicate.evidence.is_empty()) + .filter_map(|read| { + let key = SegmentRequestKey::from(&read.request); + seen.insert(key).then_some(WorkRead { + key, + bytes: read.request.bytes, + }) + }) + .collect() } fn plan_morsel( @@ -2037,7 +2195,7 @@ impl ScanExecution { range: range.clone(), mode, }; - let mut known_bytes = 0u64; + let mut work_reads = Vec::new(); let mut tasks = Vec::with_capacity(predicate.evidence.len()); for plan in &predicate.evidence { if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { @@ -2046,7 +2204,7 @@ impl ScanExecution { let mut 
segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); let requests = plan.segment_requests(&req.as_request(), &mut segment_ctx)?; let reads = self.register_segment_reads(requests); - known_bytes = known_bytes.saturating_add(Self::known_read_bytes(&reads)); + work_reads.extend(Self::work_reads(&reads)); let task = Arc::clone(plan).create_task(req.clone(), reads)?; tasks.push(task); } @@ -2055,7 +2213,7 @@ impl ScanExecution { Ok(Work::new( ScanIoPhase::EvidenceProbe, self.session.handle(), - known_bytes, + work_reads, async move { let predicate = &execution.predicates[predicate_idx]; let mut acc = PredicateEvidence::new(predicate.id, version, range.clone())?; @@ -2100,14 +2258,14 @@ impl ScanExecution { .read .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; let reads = self.register_segment_reads(requests); - let known_bytes = Self::known_read_bytes(&reads); + let work_reads = Self::work_reads(&reads); let task = Arc::clone(&predicate.read).create_task(range.clone(), rows, reads)?; let execution = Arc::clone(self); Ok(Work::new( ScanIoPhase::PredicateRead, self.session.handle(), - known_bytes, + work_reads, async move { let predicate = &execution.predicates[predicate_idx]; let mut ctx = execution.session.create_execution_ctx(); @@ -2192,7 +2350,7 @@ impl ScanExecution { self.projection .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; let reads = self.register_segment_reads(requests); - let known_bytes = Self::known_read_bytes(&reads); + let work_reads = Self::work_reads(&reads); let task = Arc::clone(&self.projection).create_task(range, rows, reads)?; let execution = Arc::clone(self); @@ -2200,7 +2358,7 @@ impl ScanExecution { Work::new( ScanIoPhase::ProjectionRead, self.session.handle(), - known_bytes, + work_reads, async move { let mut ctx = execution.session.create_execution_ctx(); let array = task.read(&execution.reader, &mut ctx).await?; @@ -2212,8 +2370,8 @@ impl ScanExecution { )) } - fn splits(&self) -> 
VortexResult>> { - self.plan.splits() + fn splits(&self, row_range: &Range) -> VortexResult>> { + self.plan.splits(row_range) } } diff --git a/vortex-scan/src/scheduler.rs b/vortex-scan/src/scheduler.rs index 53b0e443761..04fbe3f68ee 100644 --- a/vortex-scan/src/scheduler.rs +++ b/vortex-scan/src/scheduler.rs @@ -25,6 +25,7 @@ use vortex_session::VortexSession; use vortex_utils::parallelism::get_available_parallelism; const DEFAULT_MORSEL_CONCURRENCY_FACTOR: usize = 4; +const DEFAULT_READ_BYTE_BUDGET: u64 = 256 * 1024 * 1024; /// Configuration for a [`ScanScheduler`]. #[derive(Clone, Debug, PartialEq, Eq)] @@ -33,6 +34,7 @@ pub struct ScanSchedulerConfig { per_scan_slots: Option, morsel_plan_window: Option, morsel_launch_window: Option, + read_byte_budget: Option, } impl ScanSchedulerConfig { @@ -43,6 +45,7 @@ impl ScanSchedulerConfig { per_scan_slots: None, morsel_plan_window: None, morsel_launch_window: None, + read_byte_budget: None, } } @@ -57,6 +60,7 @@ impl ScanSchedulerConfig { per_scan_slots: Some(slots), morsel_plan_window: None, morsel_launch_window: Some(slots), + read_byte_budget: Some(DEFAULT_READ_BYTE_BUDGET), } } @@ -74,6 +78,14 @@ impl ScanSchedulerConfig { self } + /// Return a copy with the maximum number of unfetched read bytes allowed in flight per scan. + /// + /// `None` means scan task launch is not capped by bytes. + pub fn with_read_byte_budget(mut self, bytes: Option) -> Self { + self.read_byte_budget = bytes.map(|bytes| bytes.max(1)); + self + } + /// Create a scheduler configuration matching the current unordered scan concurrency factor. pub fn default_morsel_slots() -> Self { Self::morsel_slots(default_morsel_slots()) @@ -105,6 +117,11 @@ impl ScanSchedulerConfig { pub fn morsel_launch_window(&self) -> Option { self.morsel_launch_window } + + /// Returns the configured per-scan unfetched-read byte budget. 
+ pub fn read_byte_budget(&self) -> Option { + self.read_byte_budget + } } impl Default for ScanSchedulerConfig { From 942ba841d2be290a7cca3be0ec312f1c686dcf69 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 22 Jun 2026 13:35:14 -0400 Subject: [PATCH 27/48] Tune scan task scheduling Remove the filter-through-slice reduction that expands projection masks into child-domain masks, and keep the scan scheduler changes that make task reads explicit and budgeted by read dependencies. Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/filter/rules.rs | 31 +- vortex-file/src/file.rs | 9 - vortex-file/src/multi/scan_v2.rs | 1460 ++++++++++++------ vortex-layout/src/scan/v2/layouts/struct_.rs | 71 +- vortex-layout/src/scan/v2/layouts/zoned.rs | 10 + vortex-scan/src/lib.rs | 2 + vortex-scan/src/plan/mod.rs | 55 + vortex-scan/src/task.rs | 915 +++++++++++ 8 files changed, 2030 insertions(+), 523 deletions(-) create mode 100644 vortex-scan/src/task.rs diff --git a/vortex-array/src/arrays/filter/rules.rs b/vortex-array/src/arrays/filter/rules.rs index 68031459f73..ffa5c64bd61 100644 --- a/vortex-array/src/arrays/filter/rules.rs +++ b/vortex-array/src/arrays/filter/rules.rs @@ -2,7 +2,6 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use vortex_error::VortexResult; -use vortex_mask::AllOr; use vortex_mask::Mask; use crate::ArrayRef; @@ -10,13 +9,11 @@ use crate::Canonical; use crate::IntoArray; use crate::array::ArrayView; use crate::arrays::Filter; -use crate::arrays::Slice; use crate::arrays::Struct; use crate::arrays::StructArray; use crate::arrays::filter::FilterArrayExt; use crate::arrays::filter::FilterReduce; use crate::arrays::filter::FilterReduceAdaptor; -use crate::arrays::slice::SliceArrayExt; use crate::arrays::struct_::StructDataParts; use crate::optimizer::rules::ArrayReduceRule; use crate::optimizer::rules::ParentRuleSet; @@ -26,7 +23,7 @@ pub(super) const PARENT_RULES: ParentRuleSet = 
ParentRuleSet::new(&[ParentRuleSet::lift(&FilterReduceAdaptor(Filter))]); pub(super) const RULES: ReduceRuleSet = - ReduceRuleSet::new(&[&TrivialFilterRule, &FilterSliceRule, &FilterStructRule]); + ReduceRuleSet::new(&[&TrivialFilterRule, &FilterStructRule]); impl FilterReduce for Filter { fn filter(array: ArrayView<'_, Self>, mask: &Mask) -> VortexResult> { @@ -50,32 +47,6 @@ impl ArrayReduceRule for TrivialFilterRule { } } -/// A reduce rule that pushes a filter through a slice by expanding the -/// slice-local mask back into the child row domain. -#[derive(Debug)] -struct FilterSliceRule; - -impl ArrayReduceRule for FilterSliceRule { - fn reduce(&self, array: ArrayView<'_, Filter>) -> VortexResult> { - let mask = array.filter_mask(); - let Some(slice) = array.child().as_opt::() else { - return Ok(None); - }; - let range = slice.slice_range(); - let child_len = slice.child().len(); - let child_mask = match mask.indices() { - AllOr::All => Mask::from_slices(child_len, vec![(range.start, range.end)]), - AllOr::None => Mask::new_false(child_len), - AllOr::Some(indices) => Mask::from_indices( - child_len, - indices.iter().copied().map(|idx| range.start + idx), - ), - }; - - Ok(Some(slice.child().filter(child_mask)?)) - } -} - /// A reduce rule that pushes a filter down into the fields of a StructArray. #[derive(Debug)] struct FilterStructRule; diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index 4b7ae04b664..6f1483e9f88 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -36,7 +36,6 @@ use vortex_session::VortexSession; use crate::FileStatistics; use crate::footer::Footer; use crate::multi::scan_v2; -use crate::multi::scan_v2::PreparedScanPlanCache; use crate::pruning::can_prune_file_stats; use crate::v2::FileStatsLayoutReader; @@ -61,8 +60,6 @@ pub struct VortexFile { scan_plan_state_cache: PreparedStateCacheRef, /// Shared cache for v2 in-flight segment futures across row-range scans of this file. 
scan_plan_segment_future_cache: Arc, - /// Shared cache for v2 request-level prepared plans across row-range scans of this file. - scan_plan_prepared_cache: Arc, } fn layout_reader( @@ -101,7 +98,6 @@ impl VortexFile { scan_plan_root_cache: Arc::new(OnceLock::new()), scan_plan_state_cache: Arc::new(PreparedStateCache::default()), scan_plan_segment_future_cache: Arc::new(SegmentFutureCache::new()), - scan_plan_prepared_cache: Arc::new(PreparedScanPlanCache::default()), } } @@ -115,7 +111,6 @@ impl VortexFile { scan_plan_root_cache: self.scan_plan_root_cache, scan_plan_state_cache: self.scan_plan_state_cache, scan_plan_segment_future_cache: self.scan_plan_segment_future_cache, - scan_plan_prepared_cache: self.scan_plan_prepared_cache, } } @@ -206,10 +201,6 @@ impl VortexFile { Arc::clone(&self.scan_plan_segment_future_cache) } - pub(crate) fn scan_plan_prepared_cache(&self) -> Arc { - Arc::clone(&self.scan_plan_prepared_cache) - } - /// Create a [`DataSource`](vortex_scan::DataSource) from this file for scanning. 
/// /// Wraps the file's layout reader with [`FileStatsLayoutReader`] (when file-level diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 7d317a4e1b6..710453b211a 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -5,8 +5,6 @@ use std::any::Any; use std::collections::BTreeMap; -use std::collections::HashMap; -use std::collections::HashSet; use std::collections::VecDeque; use std::fmt; use std::ops::Range; @@ -67,6 +65,7 @@ use vortex_scan::ScanRequest as DataSourceScanRequest; use vortex_scan::ScanScheduler; use vortex_scan::ScanSchedulerSessionExt; use vortex_scan::ScanTicket; +use vortex_scan::plan::EvidenceScope; use vortex_scan::plan::OwnedRowScope; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedAggregateRef; @@ -95,11 +94,16 @@ use vortex_scan::segments::ScanIoPhase; use vortex_scan::segments::ScanRead; use vortex_scan::segments::SegmentFutureCache; use vortex_scan::segments::SegmentPlanCtx; -use vortex_scan::segments::SegmentRequestKey; use vortex_scan::segments::SegmentRequests; use vortex_scan::segments::SegmentSource; use vortex_scan::segments::register_segment_reads_cached; use vortex_scan::selection::Selection; +use vortex_scan::task::FutureScanTask; +use vortex_scan::task::ScanTaskBox; +use vortex_scan::task::ScanTaskLane; +use vortex_scan::task::ScanTaskQueue; +use vortex_scan::task::ScanTaskRead; +use vortex_scan::task::scan_task_read_bytes; use vortex_session::VortexSession; use vortex_utils::parallelism::get_available_parallelism; @@ -117,6 +121,8 @@ const MIN_SELECTION_GAP_BETWEEN_RANGES: u64 = IDEAL_SPLIT_SIZE / 2; /// Below this demanded-row density, evaluate a residual predicate over only the demanded rows /// (filter-first) rather than the whole morsel. Mirrors the V1 flat-reader threshold. 
const EXPR_EVAL_THRESHOLD: f64 = 0.2; +const INLINE_ZERO_READ_EVIDENCE_MAX_PRIORITY: u64 = 100_150; +const SCAN_SCOPE_MIN_PREDICATE_COST: u64 = 100; struct FileStatsScanPlan { data: ScanPlanRef, @@ -669,9 +675,7 @@ impl DataSource for ScanPlanDataSource { .row_range .clone() .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; - let prepared = file - .scan_plan_prepared_cache() - .get_or_prepare(&file, &request)?; + let prepared = Arc::new(PreparedScanPlan::try_new(&file, &request)?); let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket)?); let ranges = execution.splits(&row_range)?; if ranges.is_empty() { @@ -706,6 +710,7 @@ impl DataSource for ScanPlanDataSource { partitions, scheduler, ticket, + handle: self.session.handle(), morsel_plan_window, read_byte_budget, }))) @@ -894,10 +899,16 @@ fn file_partition( let Some(request) = file_scan_request(partition_idx, &file, request)? else { return Ok(None); }; + let row_range = request + .row_range + .clone() + .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; + let prepared = Arc::new(PreparedScanPlan::try_new(&file, &request)?); Ok(Some(Box::new(ScanPlanPartition { file, - request, + prepared, + row_range, index: partition_idx, scheduler, ticket, @@ -1066,71 +1077,34 @@ fn file_scan_request( })) } -struct Work { - phase: ScanIoPhase, - reads: Vec, - handle: Handle, - future: BoxFuture<'static, VortexResult>, -} - -impl Work { - fn new( - phase: ScanIoPhase, - handle: Handle, - reads: Vec, - future: BoxFuture<'static, VortexResult>, - ) -> Self { - Self { - phase, - reads, - handle, - future, - } - } - - fn into_queued( - self, - morsel_id: usize, - map: impl FnOnce(T) -> WorkOutput + Send + 'static, - ) -> QueuedWork { - QueuedWork { - morsel_id, - phase: self.phase, - reads: self.reads, - handle: self.handle, - future: async move { self.future.await.map(map) }.boxed(), - } - } -} - -#[derive(Clone, Copy)] -struct WorkRead { - key: SegmentRequestKey, - bytes: u64, -} 
- -struct QueuedWork { - morsel_id: usize, - phase: ScanIoPhase, - reads: Vec, - handle: Handle, - future: BoxFuture<'static, VortexResult>, -} - -struct ActiveRead { - bytes: u64, - refs: usize, -} +type QueuedWork = ScanTaskBox; struct LaunchedWorkOutput { - reads: Vec, + lane: ScanTaskLane, + reads: Vec, output: VortexResult, } struct EvidenceWorkOutput { morsel_id: usize, predicate_idx: usize, - evidence: PredicateEvidence, + version: PredicateVersion, + source: EvidenceWorkSource, + fragments: Vec, +} + +struct ScanEvidenceWorkOutput { + execution: Arc, + morsel_id: usize, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + fragments: Option>, +} + +enum EvidenceWorkSource { + Provider, + Predicate { input_rows: usize, pass_rows: usize }, } struct ProjectionWorkOutput { @@ -1140,6 +1114,7 @@ struct ProjectionWorkOutput { enum WorkOutput { Evidence(EvidenceWorkOutput), + ScanEvidence(ScanEvidenceWorkOutput), Projection(ProjectionWorkOutput), } @@ -1158,53 +1133,45 @@ struct MorselState { range: Range, selected: Mask, evidence: Vec>, - pending_evidence: usize, - next_predicate: usize, + pending_evidence: Vec, + pending_scan_evidence: Vec, + scan_evidence_generation: Vec, + predicate_queued: Vec, + predicate_done: Vec, next_recheck_predicate: usize, + projection_queued: bool, } #[derive(Default)] -pub(crate) struct PreparedScanPlanCache { - plans: Mutex>>, +struct ScanEvidenceStore { + predicates: Vec, } -#[derive(Clone, Debug, PartialEq, Eq, Hash)] -struct PreparedScanPlanKey { - projection: Expression, - filter: Option, - ordered: bool, - limit: Option, +#[derive(Default)] +struct PredicateScanEvidenceStore { + generation: u64, + providers: Vec, } -impl PreparedScanPlanKey { - fn try_new(request: &DataSourceScanRequest) -> Option { - matches!(request.selection, Selection::All).then(|| Self { - projection: request.projection.clone(), - filter: request.filter.clone(), - ordered: request.ordered, - limit: request.limit, - }) - } 
+#[derive(Default)] +struct ScanEvidenceSlot { + version: Option, + pending: Option, + fragments: Vec, + waiters: Vec>, } -impl PreparedScanPlanCache { - fn get_or_prepare( - &self, - file: &VortexFile, - request: &DataSourceScanRequest, - ) -> VortexResult> { - let Some(key) = PreparedScanPlanKey::try_new(request) else { - return PreparedScanPlan::try_new(file, request).map(Arc::new); - }; - - if let Some(plan) = self.plans.lock().get(&key) { - return Ok(Arc::clone(plan)); - } +enum ScanEvidenceAction { + Ready, + Pending, + Prepare, + Wait(oneshot::Receiver<()>), +} - let plan = Arc::new(PreparedScanPlan::try_new(file, request)?); - let mut plans = self.plans.lock(); - Ok(Arc::clone(plans.entry(key).or_insert(plan))) - } +#[derive(Default)] +struct PredicateRuntimeStats { + input_rows: u64, + rejected_rows: u64, } struct PartitionWorkSchedulerState { @@ -1213,26 +1180,14 @@ struct PartitionWorkSchedulerState { active_morsels: usize, next_morsel_id: usize, next_emit_morsel_id: usize, - evidence_queue: VecDeque, - predicate_queue: VecDeque, - projection_queue: VecDeque, + task_queue: ScanTaskQueue, in_flight: FuturesUnordered>, completed_morsels: BTreeMap, + handle: Handle, ordered: bool, plan_window: usize, - read_byte_budget: u64, - active_read_bytes: u64, - active_reads: HashMap, - phase_cursor: usize, } -const WEIGHTED_PHASES: &[ScanIoPhase] = &[ - ScanIoPhase::EvidenceProbe, - ScanIoPhase::EvidenceProbe, - ScanIoPhase::PredicateRead, - ScanIoPhase::ProjectionRead, -]; - fn morsel_plan_window(scheduler: &ScanScheduler, limited: bool) -> usize { if limited { return 1; @@ -1252,27 +1207,31 @@ fn partition_work_stream( morsels: Vec, _scheduler: Arc, _ticket: ScanTicket, + handle: Handle, ordered: bool, plan_window: usize, read_byte_budget: u64, ) -> impl futures::Stream> + Send + 'static { + tracing::debug!( + target: "vortex_file::scan_v2", + morsel_count = morsels.len(), + ordered, + plan_window, + read_byte_budget, + "created scan2 task stream" + ); let state = 
PartitionWorkSchedulerState { pending: VecDeque::from(morsels), morsels: Vec::new(), active_morsels: 0, next_morsel_id: 0, next_emit_morsel_id: 0, - evidence_queue: VecDeque::new(), - predicate_queue: VecDeque::new(), - projection_queue: VecDeque::new(), + task_queue: ScanTaskQueue::new(read_byte_budget), in_flight: FuturesUnordered::new(), completed_morsels: BTreeMap::new(), + handle, ordered, plan_window, - read_byte_budget, - active_read_bytes: 0, - active_reads: HashMap::new(), - phase_cursor: 0, }; stream::unfold(state, |mut state| async move { @@ -1288,21 +1247,20 @@ fn partition_work_stream( } } - while let Some(work) = state.pop_next_admissible_work() { - state.launch(work); - } + while state.launch_next_admissible_work() {} if state.in_flight.is_empty() { if state.is_done() { return None; } let error = vortex_err!( - "scan2 work scheduler stalled: {} active morsels, {} pending morsels, {} evidence work items, {} predicate work items, {} projection work items", + "scan2 work scheduler stalled: {} active morsels, {} pending morsels, {} evidence work items, {} predicate work items, {} projection work items, {} active read bytes", state.active_morsels, state.pending.len(), - state.evidence_queue.len(), - state.predicate_queue.len(), - state.projection_queue.len() + state.task_queue.evidence_len(), + state.task_queue.predicate_len(), + state.task_queue.projection_len(), + state.task_queue.active_read_bytes() ); state.clear(); return Some((Err(error), state)); @@ -1310,7 +1268,7 @@ fn partition_work_stream( match state.in_flight.next().await { Some(output) => { - state.release_reads(&output.reads); + state.release_reads(output.lane, &output.reads); match output.output.and_then(|output| state.complete_work(output)) { Ok(Some(array)) => return Some((Ok(array), state)), Ok(None) => continue, @@ -1324,61 +1282,21 @@ fn partition_work_stream( }) } -fn can_admit_work( - active_reads: &HashMap, - active_read_bytes: u64, - read_byte_budget: u64, - in_flight_empty: bool, 
- work: &QueuedWork, -) -> bool { - let incremental = incremental_read_bytes(active_reads, &work.reads); - incremental == 0 - || active_read_bytes.saturating_add(incremental) <= read_byte_budget - || in_flight_empty -} - -fn incremental_read_bytes( - active_reads: &HashMap, - reads: &[WorkRead], -) -> u64 { - let mut seen = HashSet::new(); - reads - .iter() - .filter(|read| seen.insert(read.key) && !active_reads.contains_key(&read.key)) - .map(|read| read.bytes) - .sum() -} - -fn read_bytes(reads: &[WorkRead]) -> u64 { - let mut seen = HashSet::new(); - reads - .iter() - .filter(|read| seen.insert(read.key)) - .map(|read| read.bytes) - .sum() -} - impl PartitionWorkSchedulerState { fn clear(&mut self) { self.pending.clear(); self.morsels.clear(); self.active_morsels = 0; self.next_emit_morsel_id = 0; - self.evidence_queue.clear(); - self.predicate_queue.clear(); - self.projection_queue.clear(); + self.task_queue.clear(); self.in_flight = FuturesUnordered::new(); self.completed_morsels.clear(); - self.active_read_bytes = 0; - self.active_reads.clear(); } fn is_done(&self) -> bool { self.pending.is_empty() && self.active_morsels == 0 - && self.evidence_queue.is_empty() - && self.predicate_queue.is_empty() - && self.projection_queue.is_empty() + && self.task_queue.is_empty() && self.in_flight.is_empty() && self.completed_morsels.is_empty() } @@ -1397,257 +1315,411 @@ impl PartitionWorkSchedulerState { self.morsels.resize_with(morsel_id + 1, || None); } self.morsels[morsel_id] = Some(planned.state); - self.evidence_queue.extend(planned.evidence); - if self.morsels[morsel_id] - .as_ref() - .is_some_and(|morsel| morsel.pending_evidence == 0) - { - self.enqueue_next_predicate_or_projection(morsel_id)?; - } + self.task_queue.extend(planned.evidence); + self.enqueue_ready_work(morsel_id)?; Ok(()) } - fn pop_next_admissible_work(&mut self) -> Option { - for _ in 0..WEIGHTED_PHASES.len() { - let phase = WEIGHTED_PHASES[self.phase_cursor % WEIGHTED_PHASES.len()]; - 
self.phase_cursor = self.phase_cursor.wrapping_add(1); - if let Some(work) = self.pop_phase_admissible_work(phase) { - return Some(work); - } - } - [ - ScanIoPhase::EvidenceProbe, - ScanIoPhase::PredicateRead, - ScanIoPhase::ProjectionRead, - ] - .into_iter() - .find_map(|phase| self.pop_phase_admissible_work(phase)) - } - - fn pop_phase_admissible_work(&mut self, phase: ScanIoPhase) -> Option { - let active_reads = &self.active_reads; - let active_read_bytes = self.active_read_bytes; - let read_byte_budget = self.read_byte_budget; + fn launch_next_admissible_work(&mut self) -> bool { let in_flight_empty = self.in_flight.is_empty(); let morsels = &self.morsels; - let queue = match phase { - ScanIoPhase::EvidenceProbe | ScanIoPhase::EvidenceSetup => &mut self.evidence_queue, - ScanIoPhase::PredicateRead => &mut self.predicate_queue, - ScanIoPhase::ProjectionRead | ScanIoPhase::AggregateRead => &mut self.projection_queue, + let Some(task) = self + .task_queue + .pop_next_admissible(in_flight_empty, |morsel_id| { + morsels.get(morsel_id).and_then(Option::as_ref).is_some() + }) + else { + return false; }; - let len = queue.len(); - for _ in 0..len { - let Some(work) = queue.pop_front() else { - break; - }; - if morsels - .get(work.morsel_id) - .and_then(Option::as_ref) - .is_none() - { - continue; - } - if can_admit_work( - active_reads, - active_read_bytes, - read_byte_budget, - in_flight_empty, - &work, - ) { - return Some(work); - } - queue.push_back(work); - } - None - } - - fn launch(&mut self, work: QueuedWork) { - let reads = self.admit_reads(&work.reads); - let phase = work.phase; - let bytes = read_bytes(&reads); - self.in_flight.push( - work.handle - .spawn( - async move { - let output = work.future.await; - LaunchedWorkOutput { reads, output } - } - .instrument(tracing::trace_span!( - "scan2_work", - phase = ?phase, - read_bytes = bytes, - )), - ) - .boxed(), - ); + let (task, lane, reads) = task.into_parts(); + self.launch_admitted(task, lane, reads); + 
true } - fn admit_reads(&mut self, reads: &[WorkRead]) -> Vec { - let mut admitted = Vec::with_capacity(reads.len()); - let mut seen = HashSet::new(); - for read in reads { - if !seen.insert(read.key) { - continue; + fn launch_admitted(&mut self, work: QueuedWork, lane: ScanTaskLane, reads: Vec) { + let phase = work.phase(); + let priority = work.priority(); + let bytes = scan_task_read_bytes(&reads); + let future = work.into_future(); + let future = async move { + let output = future.await; + LaunchedWorkOutput { + lane, + reads, + output, } - match self.active_reads.entry(read.key) { - std::collections::hash_map::Entry::Occupied(mut entry) => { - let active = entry.get_mut(); - active.refs = active.refs.saturating_add(1); - } - std::collections::hash_map::Entry::Vacant(entry) => { - self.active_read_bytes = self.active_read_bytes.saturating_add(read.bytes); - entry.insert(ActiveRead { - bytes: read.bytes, - refs: 1, - }); + } + .instrument(tracing::trace_span!( + "scan2_work", + phase = ?phase, + lane = ?lane, + read_bytes = bytes, + )); + let inline_zero_read = bytes == 0 + && match phase { + ScanIoPhase::EvidenceProbe | ScanIoPhase::EvidenceSetup => { + priority <= INLINE_ZERO_READ_EVIDENCE_MAX_PRIORITY } - } - admitted.push(*read); + ScanIoPhase::PredicateRead + | ScanIoPhase::ProjectionRead + | ScanIoPhase::AggregateRead => false, + }; + if inline_zero_read { + self.in_flight.push(future.boxed()); + } else { + self.in_flight.push(self.handle.spawn(future).boxed()); } - admitted } - fn release_reads(&mut self, reads: &[WorkRead]) { - let mut seen = HashSet::new(); - for read in reads { - if !seen.insert(read.key) { - continue; - } - let std::collections::hash_map::Entry::Occupied(mut entry) = - self.active_reads.entry(read.key) - else { - continue; - }; - if entry.get().refs > 1 { - entry.get_mut().refs -= 1; - } else { - let active = entry.remove(); - self.active_read_bytes = self.active_read_bytes.saturating_sub(active.bytes); - } - } + fn release_reads(&mut 
self, lane: ScanTaskLane, reads: &[ScanTaskRead]) { + self.task_queue.release_reads(lane, reads); } fn complete_work(&mut self, output: WorkOutput) -> VortexResult> { match output { WorkOutput::Evidence(output) => self.complete_evidence(output), + WorkOutput::ScanEvidence(output) => self.complete_scan_evidence(output), WorkOutput::Projection(output) => { Ok(self.finish_output_morsel(output.morsel_id, output.array)) } } } - fn complete_evidence(&mut self, output: EvidenceWorkOutput) -> VortexResult> { - let Some(morsel) = self + fn complete_scan_evidence( + &mut self, + output: ScanEvidenceWorkOutput, + ) -> VortexResult> { + if let Some(morsel) = self .morsels .get_mut(output.morsel_id) .and_then(Option::as_mut) - else { - return Ok(None); - }; - morsel.pending_evidence = morsel.pending_evidence.saturating_sub(1); - morsel.selected = &morsel.selected & output.evidence.maybe(); - if morsel.selected.all_false() || output.evidence.all_false() { - return Ok(self.finish_empty_morsel(output.morsel_id)); + && let Some(pending) = morsel.pending_scan_evidence.get_mut(output.predicate_idx) + { + *pending = pending.saturating_sub(1); } - if let Some(slot) = morsel.evidence.get_mut(output.predicate_idx) { - *slot = Some(output.evidence); + + if let Some(fragments) = output.fragments { + output.execution.record_scan_evidence( + output.predicate_idx, + output.evidence_idx, + output.version, + fragments, + )?; } - if morsel.pending_evidence == 0 { - self.enqueue_next_predicate_or_projection(output.morsel_id)?; + + let affected = self + .morsels + .iter() + .enumerate() + .filter_map(|(morsel_id, morsel)| { + morsel + .as_ref() + .filter(|morsel| Arc::ptr_eq(&morsel.execution, &output.execution)) + .map(|_| morsel_id) + }) + .collect::>(); + + for morsel_id in affected { + if self + .morsels + .get(morsel_id) + .and_then(Option::as_ref) + .is_none() + { + continue; + } + if self.refresh_morsel_scan_evidence(morsel_id, output.predicate_idx)? 
{ + if let Some(array) = self.finish_empty_morsel(morsel_id) { + return Ok(Some(array)); + } + } else { + self.enqueue_ready_work(morsel_id)?; + } } Ok(None) } - fn enqueue_next_predicate_or_projection(&mut self, morsel_id: usize) -> VortexResult<()> { - loop { - let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { - return Ok(()); - }; - if morsel.pending_evidence != 0 { - return Ok(()); - } - if morsel.next_predicate >= morsel.execution.predicates.len() { - if self.enqueue_recheck_evidence(morsel_id)? { - return Ok(()); - } - let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { - return Ok(()); - }; - let projection = morsel.execution.plan_projection_work( - morsel_id, - morsel.range.clone(), - morsel.selected.clone(), - )?; - match projection { - Some(work) => self.projection_queue.push_back(work), - None => { - self.finish_empty_morsel(morsel_id); - } - } - return Ok(()); + fn refresh_all_scan_evidence(&mut self, morsel_id: usize) -> VortexResult { + let Some(predicate_count) = self + .morsels + .get(morsel_id) + .and_then(Option::as_ref) + .map(|morsel| morsel.execution.predicates.len()) + else { + return Ok(false); + }; + + for predicate_idx in 0..predicate_count { + if self.refresh_morsel_scan_evidence(morsel_id, predicate_idx)? 
{ + return Ok(true); } + } + Ok(false) + } - let predicate_idx = morsel.next_predicate; - if morsel.evidence[predicate_idx].is_none() { - let should_probe = { - let predicate = &morsel.execution.predicates[predicate_idx]; - !predicate.evidence.is_empty() - && morsel.selected.density() >= EXPR_EVAL_THRESHOLD - }; - if should_probe { - let work = morsel.execution.plan_evidence_work( - morsel_id, - predicate_idx, - morsel.range.clone(), - morsel.execution.predicates[predicate_idx].version(), - EvidenceMode::Normal, - )?; - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) + fn refresh_morsel_scan_evidence( + &mut self, + morsel_id: usize, + predicate_idx: usize, + ) -> VortexResult { + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(false); + }; + let predicate = &morsel.execution.predicates[predicate_idx]; + let version = predicate.version(); + let (generation, fragments) = + morsel + .execution + .scan_evidence_fragments(predicate_idx, version, &morsel.range)?; + let Some(seen_generation) = morsel.scan_evidence_generation.get_mut(predicate_idx) else { + vortex_bail!("missing scan evidence generation slot {predicate_idx}"); + }; + if generation <= *seen_generation { + return Ok(false); + } + *seen_generation = generation; + + let Some(slot) = morsel.evidence.get_mut(predicate_idx) else { + vortex_bail!("missing predicate evidence slot {predicate_idx}"); + }; + if slot + .as_ref() + .is_none_or(|evidence| evidence.version() != version) + { + *slot = Some(PredicateEvidence::new( + predicate.id, + version, + morsel.range.clone(), + )?); + } + let evidence = slot + .as_mut() + .ok_or_else(|| vortex_err!("missing predicate evidence after initialization"))?; + for fragment in fragments { + evidence.absorb(fragment)?; + } + let maybe = evidence.maybe().clone(); + let all_false = evidence.all_false(); + morsel.selected = &morsel.selected & &maybe; + Ok(morsel.selected.all_false() || all_false) + } + + 
fn complete_evidence(&mut self, output: EvidenceWorkOutput) -> VortexResult> { + let mut record_predicate = None; + let finish_empty = { + let Some(morsel) = self + .morsels + .get_mut(output.morsel_id) + .and_then(Option::as_mut) + else { + return Ok(None); + }; + match output.source { + EvidenceWorkSource::Provider => { + let Some(pending) = morsel.pending_evidence.get_mut(output.predicate_idx) else { - return Ok(()); + vortex_bail!("missing predicate evidence count {}", output.predicate_idx); }; - morsel.pending_evidence = morsel.pending_evidence.saturating_add(1); - self.evidence_queue.push_back(work); - return Ok(()); + *pending = pending.saturating_sub(1); } - - let evidence = PredicateEvidence::new( - morsel.execution.predicates[predicate_idx].id, - morsel.execution.predicates[predicate_idx].version(), + EvidenceWorkSource::Predicate { + input_rows, + pass_rows, + } => { + let Some(queued) = morsel.predicate_queued.get_mut(output.predicate_idx) else { + vortex_bail!("missing predicate queued slot {}", output.predicate_idx); + }; + *queued = false; + let Some(done) = morsel.predicate_done.get_mut(output.predicate_idx) else { + vortex_bail!("missing predicate done slot {}", output.predicate_idx); + }; + *done = true; + record_predicate = Some(( + Arc::clone(&morsel.execution), + output.predicate_idx, + input_rows, + pass_rows, + )); + } + } + let predicate = &morsel.execution.predicates[output.predicate_idx]; + let Some(slot) = morsel.evidence.get_mut(output.predicate_idx) else { + vortex_bail!("missing predicate evidence slot {}", output.predicate_idx); + }; + if slot + .as_ref() + .is_none_or(|evidence| evidence.version() != output.version) + { + *slot = Some(PredicateEvidence::new( + predicate.id, + output.version, morsel.range.clone(), - )?; - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { - return Ok(()); - }; - morsel.evidence[predicate_idx] = Some(evidence); - continue; + )?); } - let evidence = 
morsel.evidence[predicate_idx].as_ref().ok_or_else(|| { - vortex_err!("missing evidence for predicate {predicate_idx} before residual read") - })?; - let need = &morsel.selected & &evidence.unproven(); - if need.all_false() { - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { - return Ok(()); - }; - morsel.next_predicate = morsel.next_predicate.saturating_add(1); - continue; + let evidence = slot + .as_mut() + .ok_or_else(|| vortex_err!("missing predicate evidence after initialization"))?; + for fragment in output.fragments { + evidence.absorb(fragment)?; } + let maybe = evidence.maybe().clone(); + let all_false = evidence.all_false(); + morsel.selected = &morsel.selected & &maybe; + morsel.selected.all_false() || all_false + }; + + if let Some((execution, predicate_idx, input_rows, pass_rows)) = record_predicate + && !execution.has_dynamic_predicates() + { + execution.record_predicate_result(predicate_idx, input_rows, pass_rows); + } + + if finish_empty { + return Ok(self.finish_empty_morsel(output.morsel_id)); + } + + self.enqueue_ready_work(output.morsel_id)?; + Ok(None) + } + + fn enqueue_ready_work(&mut self, morsel_id: usize) -> VortexResult<()> { + if self.refresh_all_scan_evidence(morsel_id)? { + self.finish_empty_morsel(morsel_id); + return Ok(()); + } + if let Some((predicate_idx, need, priority)) = self.choose_ready_predicate(morsel_id)? 
{ + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(()); + }; let work = morsel.execution.plan_predicate_work( morsel_id, predicate_idx, morsel.range.clone(), need, morsel.execution.predicates[predicate_idx].version(), + priority, )?; let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { return Ok(()); }; - morsel.next_predicate = predicate_idx.saturating_add(1); - self.predicate_queue.push_back(work); + morsel.predicate_queued[predicate_idx] = true; + self.task_queue.push(work); return Ok(()); } + + let ready_to_project = self + .morsels + .get(morsel_id) + .and_then(Option::as_ref) + .is_some_and(|morsel| { + !morsel.projection_queued + && morsel.pending_evidence.iter().all(|pending| *pending == 0) + && morsel + .pending_scan_evidence + .iter() + .all(|pending| *pending == 0) + && morsel.predicate_queued.iter().all(|queued| !*queued) + && morsel.predicate_done.iter().all(|done| *done) + }); + if !ready_to_project { + return Ok(()); + } + + if self.enqueue_recheck_evidence(morsel_id)? 
{ + return Ok(()); + } + + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(()); + }; + let projection = morsel.execution.plan_projection_work( + morsel_id, + morsel.range.clone(), + morsel.selected.clone(), + )?; + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(()); + }; + morsel.projection_queued = true; + match projection { + Some(work) => self.task_queue.push(work), + None => { + self.finish_empty_morsel(morsel_id); + } + } + Ok(()) + } + + fn choose_ready_predicate( + &mut self, + morsel_id: usize, + ) -> VortexResult> { + loop { + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(None); + }; + if morsel.predicate_queued.iter().any(|queued| *queued) { + return Ok(None); + } + let dynamic_scan = morsel.execution.has_dynamic_predicates(); + if dynamic_scan + && (morsel.pending_evidence.iter().any(|pending| *pending != 0) + || morsel + .pending_scan_evidence + .iter() + .any(|pending| *pending != 0)) + { + return Ok(None); + } + + let mut best: Option<(u64, usize, Mask)> = None; + let mut advanced = false; + for predicate_idx in 0..morsel.execution.predicates.len() { + if morsel.predicate_done[predicate_idx] + || morsel.predicate_queued[predicate_idx] + || morsel.pending_evidence[predicate_idx] != 0 + || morsel.pending_scan_evidence[predicate_idx] != 0 + { + continue; + } + if morsel.evidence[predicate_idx].is_none() { + let predicate = &morsel.execution.predicates[predicate_idx]; + morsel.evidence[predicate_idx] = Some(PredicateEvidence::new( + predicate.id, + predicate.version(), + morsel.range.clone(), + )?); + } + let evidence = morsel.evidence[predicate_idx].as_ref().ok_or_else(|| { + vortex_err!( + "missing evidence for predicate {predicate_idx} before residual read" + ) + })?; + let need = &morsel.selected & &evidence.unproven(); + if need.all_false() { + morsel.predicate_done[predicate_idx] = true; + advanced = true; + 
continue; + } + let priority = if dynamic_scan { + u64::try_from(predicate_idx).unwrap_or(u64::MAX) + } else { + morsel + .execution + .predicate_priority(predicate_idx, need.true_count()) + }; + if best.as_ref().is_none_or(|(best_priority, best_idx, _)| { + (priority, predicate_idx) < (*best_priority, *best_idx) + }) { + best = Some((priority, predicate_idx, need)); + } + } + if advanced { + continue; + } + return Ok(best.map(|(priority, predicate_idx, need)| (predicate_idx, need, priority))); + } } fn enqueue_recheck_evidence(&mut self, morsel_id: usize) -> VortexResult { @@ -1666,11 +1738,42 @@ impl PartitionWorkSchedulerState { .as_ref() .map(PredicateEvidence::version) .unwrap_or(PredicateVersion::STATIC); + let has_dynamic = predicate.dynamic_updates.is_some(); + let has_scan_recheck_evidence = predicate.has_scan_recheck_evidence(); + let has_morsel_recheck_evidence = predicate.has_morsel_recheck_evidence(); - if predicate.dynamic_updates.is_some() - && predicate.has_recheck_evidence() - && current_version != evidence_version - { + if has_dynamic && has_scan_recheck_evidence && current_version != evidence_version { + let work = morsel.execution.plan_scan_evidence_work( + morsel_id, + predicate_idx, + current_version, + EvidenceMode::RecheckBeforeProjection, + )?; + if !work.is_empty() { + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) + else { + return Ok(false); + }; + morsel.pending_scan_evidence[predicate_idx] = + morsel.pending_scan_evidence[predicate_idx].saturating_add(work.len()); + self.task_queue.extend(work); + return Ok(true); + } + if self.refresh_morsel_scan_evidence(morsel_id, predicate_idx)? 
{ + self.finish_empty_morsel(morsel_id); + return Ok(true); + } + } + + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(false); + }; + let evidence_version = morsel.evidence[predicate_idx] + .as_ref() + .map(PredicateEvidence::version) + .unwrap_or(PredicateVersion::STATIC); + + if has_dynamic && has_morsel_recheck_evidence && current_version != evidence_version { let work = morsel.execution.plan_evidence_work( morsel_id, predicate_idx, @@ -1678,11 +1781,20 @@ impl PartitionWorkSchedulerState { current_version, EvidenceMode::RecheckBeforeProjection, )?; + if work.is_empty() { + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) + else { + return Ok(false); + }; + morsel.next_recheck_predicate = morsel.next_recheck_predicate.saturating_add(1); + continue; + } let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { return Ok(false); }; - morsel.pending_evidence = morsel.pending_evidence.saturating_add(1); - self.evidence_queue.push_back(work); + morsel.pending_evidence[predicate_idx] = + morsel.pending_evidence[predicate_idx].saturating_add(work.len()); + self.task_queue.extend(work); return Ok(true); } @@ -1746,7 +1858,8 @@ impl PartitionWorkSchedulerState { struct ScanPlanPartition { file: VortexFile, - request: DataSourceScanRequest, + prepared: Arc, + row_range: Range, index: usize, scheduler: Arc, ticket: ScanTicket, @@ -1762,17 +1875,14 @@ impl Partition for ScanPlanPartition { } fn row_count(&self) -> Precision { - let Some(row_range) = self.request.row_range.as_ref() else { - return Precision::Absent; - }; - let row_count = row_range.end - row_range.start; - let row_count = self.request.selection.row_count(row_count); + let row_count = self.row_range.end - self.row_range.start; + let row_count = self.prepared.selection().row_count(row_count); let row_count = self - .request - .limit + .prepared + .limit() .map_or(row_count, |limit| row_count.min(limit)); - if 
self.request.filter.is_some() { + if self.prepared.has_filter() { Precision::inexact(row_count) } else { Precision::exact(row_count) @@ -1786,20 +1896,15 @@ impl Partition for ScanPlanPartition { fn execute(self: Box) -> VortexResult { let ScanPlanPartition { file, - request, + prepared, + row_range, index: _, scheduler, ticket, } = *self; - let row_range = request - .row_range - .clone() - .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; - let prepared = file - .scan_plan_prepared_cache() - .get_or_prepare(&file, &request)?; let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket)?); + let handle = execution.session.handle(); let dtype = execution.plan.dtype().clone(); let ranges = execution.splits(&row_range)?; let ordered = execution.plan.ordered(); @@ -1817,6 +1922,7 @@ impl Partition for ScanPlanPartition { morsels, scheduler, ticket, + handle, ordered, plan_window, read_byte_budget, @@ -1833,6 +1939,7 @@ struct PlannedScanPlanScan { partitions: Vec>, scheduler: Arc, ticket: ScanTicket, + handle: Handle, morsel_plan_window: usize, read_byte_budget: u64, } @@ -1909,10 +2016,12 @@ impl Partition for PlannedScanPlanPartition { let dtype = planned.dtype.clone(); let scheduler = Arc::clone(&planned.scheduler); let ticket = planned.ticket.clone(); + let handle = planned.handle.clone(); let stream = partition_work_stream( morsels, scheduler, ticket, + handle, false, planned.morsel_plan_window, planned.read_byte_budget, @@ -1952,11 +2061,14 @@ struct ScanExecution { segment_future_cache: Arc, projection: PreparedReadRef, predicates: Vec, + predicate_stats: Mutex>, + scan_evidence: Mutex, } struct ExecutionPredicate { id: PredicateId, expr: Expression, + static_cost: u64, dynamic_updates: Option, read: PreparedReadRef, evidence: Vec, @@ -1970,10 +2082,16 @@ impl ExecutionPredicate { .unwrap_or(PredicateVersion::STATIC) } - fn has_recheck_evidence(&self) -> bool { + fn has_morsel_recheck_evidence(&self) -> bool { self.evidence .iter() - 
.any(|plan| plan.recheck_before_projection()) + .any(|plan| plan.scope() == EvidenceScope::Morsel && plan.recheck_before_projection()) + } + + fn has_scan_recheck_evidence(&self) -> bool { + self.evidence + .iter() + .any(|plan| plan.scope() == EvidenceScope::Scan && plan.recheck_before_projection()) } } @@ -2105,12 +2223,29 @@ impl ScanExecution { Ok(ExecutionPredicate { id: predicate.id, expr: predicate.expr.clone(), + static_cost: predicate_cost(&predicate.expr), dynamic_updates, read, evidence, }) }) .collect::>>()?; + let predicate_stats = (0..predicates.len()) + .map(|_| PredicateRuntimeStats::default()) + .collect(); + let scan_evidence = ScanEvidenceStore { + predicates: predicates + .iter() + .map(|predicate| PredicateScanEvidenceStore { + generation: 0, + providers: predicate + .evidence + .iter() + .map(|_| ScanEvidenceSlot::default()) + .collect(), + }) + .collect(), + }; let limit_remaining = plan.limit().map(AtomicU64::new); @@ -2123,6 +2258,8 @@ impl ScanExecution { segment_future_cache, projection, predicates, + predicate_stats: Mutex::new(predicate_stats), + scan_evidence: Mutex::new(scan_evidence), }) } @@ -2139,23 +2276,48 @@ impl ScanExecution { ) } - fn work_reads(reads: &[ScanRead]) -> Vec { - let mut seen = HashSet::new(); - reads + fn predicate_priority(&self, predicate_idx: usize, demand_rows: usize) -> u64 { + let predicate = &self.predicates[predicate_idx]; + let static_cost = predicate.static_cost.max(1); + let demand_rows = u64::try_from(demand_rows).unwrap_or(u64::MAX).max(1); + let stats = self.predicate_stats.lock(); + let stats = &stats[predicate_idx]; + let rejection_per_mille = if stats.input_rows >= 1024 { + stats.rejected_rows.saturating_mul(1000) / stats.input_rows.max(1) + } else { + // Before feedback exists, preserve the existing static cheap-first ordering while still + // giving every predicate a nonzero expected benefit. 
+ 500 + } + .max(1); + let expected_rejected = demand_rows.saturating_mul(rejection_per_mille) / 1000; + static_cost.saturating_mul(1_000_000) / expected_rejected.max(1) + } + + fn has_dynamic_predicates(&self) -> bool { + self.predicates .iter() - .filter_map(|read| { - let key = SegmentRequestKey::from(&read.request); - seen.insert(key).then_some(WorkRead { - key, - bytes: read.request.bytes, - }) - }) - .collect() + .any(|predicate| predicate.dynamic_updates.is_some()) + } + + fn record_predicate_result(&self, predicate_idx: usize, input_rows: usize, pass_rows: usize) { + let input_rows = u64::try_from(input_rows).unwrap_or(u64::MAX); + let pass_rows = u64::try_from(pass_rows).unwrap_or(u64::MAX); + let rejected_rows = input_rows.saturating_sub(pass_rows); + let mut stats = self.predicate_stats.lock(); + let stats = &mut stats[predicate_idx]; + stats.input_rows = stats.input_rows.saturating_add(input_rows); + stats.rejected_rows = stats.rejected_rows.saturating_add(rejected_rows); + } + + fn use_scan_scope_evidence(&self, predicate_idx: usize, mode: EvidenceMode) -> bool { + mode == EvidenceMode::RecheckBeforeProjection + || self.predicates[predicate_idx].static_cost >= SCAN_SCOPE_MIN_PREDICATE_COST } fn plan_morsel( self: &Arc, - _morsel_id: usize, + morsel_id: usize, range: Range, ) -> VortexResult> { let selected = self.plan.selection().row_mask(&range).mask().clone(); @@ -2163,20 +2325,308 @@ impl ScanExecution { return Ok(None); } + let mut evidence = Vec::new(); + let mut pending_evidence = Vec::with_capacity(self.predicates.len()); + let mut pending_scan_evidence = Vec::with_capacity(self.predicates.len()); + for predicate_idx in 0..self.predicates.len() { + let version = self.predicates[predicate_idx].version(); + let scan_work = self.plan_scan_evidence_work( + morsel_id, + predicate_idx, + version, + EvidenceMode::Normal, + )?; + pending_scan_evidence.push(scan_work.len()); + evidence.extend(scan_work); + + let morsel_work = self.plan_evidence_work( + 
morsel_id, + predicate_idx, + range.clone(), + version, + EvidenceMode::Normal, + )?; + pending_evidence.push(morsel_work.len()); + evidence.extend(morsel_work); + } + let state = MorselState { execution: Arc::clone(self), range, selected, evidence: (0..self.predicates.len()).map(|_| None).collect(), - pending_evidence: 0, - next_predicate: 0, + pending_evidence, + pending_scan_evidence, + scan_evidence_generation: vec![0; self.predicates.len()], + predicate_queued: vec![false; self.predicates.len()], + predicate_done: vec![false; self.predicates.len()], next_recheck_predicate: 0, + projection_queued: false, }; - Ok(Some(PlannedMorselWork { - state, - evidence: Vec::new(), - })) + Ok(Some(PlannedMorselWork { state, evidence })) + } + + fn reserve_scan_evidence( + &self, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + create_waiter: bool, + ) -> VortexResult { + let mut store = self.scan_evidence.lock(); + let slot = store + .predicates + .get_mut(predicate_idx) + .and_then(|predicate| predicate.providers.get_mut(evidence_idx)) + .ok_or_else(|| { + vortex_err!( + "missing scan evidence slot for predicate {predicate_idx} provider {evidence_idx}" + ) + })?; + if slot.version == Some(version) { + return Ok(ScanEvidenceAction::Ready); + } + if slot.pending == Some(version) { + if !create_waiter { + return Ok(ScanEvidenceAction::Pending); + } + let (send, recv) = oneshot::channel(); + slot.waiters.push(send); + return Ok(ScanEvidenceAction::Wait(recv)); + } + + // Wake waiters for any older version. They will observe the version change and + // re-enter planning for the current dynamic boundary. 
+ if slot.pending.is_some() || slot.version.is_some() { + wake_scan_evidence_waiters(slot); + } + slot.pending = Some(version); + Ok(ScanEvidenceAction::Prepare) + } + + fn clear_scan_evidence_pending( + &self, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + ) { + let mut store = self.scan_evidence.lock(); + let Some(slot) = store + .predicates + .get_mut(predicate_idx) + .and_then(|predicate| predicate.providers.get_mut(evidence_idx)) + else { + return; + }; + if slot.pending == Some(version) { + slot.pending = None; + wake_scan_evidence_waiters(slot); + } + } + + fn scan_evidence_provider_ready( + &self, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + ) -> bool { + self.scan_evidence + .lock() + .predicates + .get(predicate_idx) + .and_then(|predicate| predicate.providers.get(evidence_idx)) + .is_some_and(|slot| slot.version == Some(version)) + } + + fn record_scan_evidence( + &self, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + mut fragments: Vec, + ) -> VortexResult { + fragments.sort_by_key(|fragment| (fragment.rows.start, fragment.rows.end)); + let mut store = self.scan_evidence.lock(); + let predicate = store + .predicates + .get_mut(predicate_idx) + .ok_or_else(|| vortex_err!("missing scan evidence predicate slot {predicate_idx}"))?; + let slot = predicate.providers.get_mut(evidence_idx).ok_or_else(|| { + vortex_err!( + "missing scan evidence provider slot {evidence_idx} for predicate {predicate_idx}" + ) + })?; + + if slot.pending != Some(version) && slot.version != Some(version) { + wake_scan_evidence_waiters(slot); + return Ok(false); + } + + slot.version = Some(version); + slot.pending = None; + slot.fragments = fragments; + predicate.generation = predicate.generation.saturating_add(1); + wake_scan_evidence_waiters(slot); + Ok(true) + } + + fn scan_evidence_fragments( + &self, + predicate_idx: usize, + version: PredicateVersion, + range: &Range, + ) -> 
VortexResult<(u64, Vec)> { + let store = self.scan_evidence.lock(); + let Some(predicate) = store.predicates.get(predicate_idx) else { + vortex_bail!("missing scan evidence predicate slot {predicate_idx}"); + }; + let generation = predicate.generation; + let mut fragments = Vec::new(); + for slot in &predicate.providers { + if slot.version == Some(version) { + push_overlapping_fragments(&slot.fragments, range, &mut fragments)?; + } + } + Ok((generation, fragments)) + } + + fn plan_scan_evidence_work( + self: &Arc, + morsel_id: usize, + predicate_idx: usize, + version: PredicateVersion, + mode: EvidenceMode, + ) -> VortexResult> { + if !self.use_scan_scope_evidence(predicate_idx, mode) { + return Ok(Vec::new()); + } + + let predicate = &self.predicates[predicate_idx]; + let predicate_idx_u32 = + u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; + let mut work = Vec::new(); + for (evidence_idx, plan) in predicate.evidence.iter().enumerate() { + if plan.scope() != EvidenceScope::Scan { + continue; + } + if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { + continue; + } + + let evidence_idx_u32 = + u32::try_from(evidence_idx).map_err(|_| vortex_err!("too many evidence plans"))?; + let priority = plan + .cost( + &OwnedEvidenceRequest { + id: predicate.id, + version, + predicate: predicate.expr.clone(), + range: 0..self.plan.row_count, + mode, + } + .as_request(), + ) + .priority(0, mode == EvidenceMode::RecheckBeforeProjection) + .saturating_add(predicate.static_cost); + + let create_waiter = mode == EvidenceMode::RecheckBeforeProjection; + match self.reserve_scan_evidence(predicate_idx, evidence_idx, version, create_waiter)? 
{ + ScanEvidenceAction::Ready => {} + ScanEvidenceAction::Pending => {} + ScanEvidenceAction::Wait(waiter) => { + let execution = Arc::clone(self); + work.push( + FutureScanTask::new_in_lane( + morsel_id, + ScanIoPhase::EvidenceProbe, + ScanTaskLane::ScanEvidence { + predicate_idx: predicate_idx_u32, + evidence_idx: evidence_idx_u32, + }, + Vec::new(), + async move { + if !execution.scan_evidence_provider_ready( + predicate_idx, + evidence_idx, + version, + ) && execution.predicates[predicate_idx].version() == version + { + let _ = waiter.await; + } + Ok(WorkOutput::ScanEvidence(ScanEvidenceWorkOutput { + execution, + morsel_id, + predicate_idx, + evidence_idx, + version, + fragments: None, + })) + } + .boxed(), + ) + .with_priority(priority) + .boxed(), + ); + } + ScanEvidenceAction::Prepare => { + let req = OwnedEvidenceRequest { + id: predicate.id, + version, + predicate: predicate.expr.clone(), + range: 0..self.plan.row_count, + mode, + }; + let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); + let result = (|| { + let requests = + plan.segment_requests(&req.as_request(), &mut segment_ctx)?; + let reads = self.register_segment_reads(requests); + let work_reads = ScanTaskRead::from_scan_reads(&reads); + let priority = plan + .cost(&req.as_request()) + .priority( + scan_task_read_bytes(&work_reads), + mode == EvidenceMode::RecheckBeforeProjection, + ) + .saturating_add(predicate.static_cost); + let task = Arc::clone(plan).create_task(req, reads)?; + let execution = Arc::clone(self); + Ok(FutureScanTask::new_in_lane( + morsel_id, + ScanIoPhase::EvidenceProbe, + ScanTaskLane::ScanEvidence { + predicate_idx: predicate_idx_u32, + evidence_idx: evidence_idx_u32, + }, + work_reads, + async move { + let fragments = task.evidence(&execution.reader).await?; + Ok(WorkOutput::ScanEvidence(ScanEvidenceWorkOutput { + execution, + morsel_id, + predicate_idx, + evidence_idx, + version, + fragments: Some(fragments), + })) + } + .boxed(), + ) + 
.with_priority(priority) + .boxed()) + })(); + match result { + Ok(task) => work.push(task), + Err(error) => { + self.clear_scan_evidence_pending(predicate_idx, evidence_idx, version); + return Err(error); + } + } + } + } + } + Ok(work) } fn plan_evidence_work( @@ -2186,54 +2636,68 @@ impl ScanExecution { range: Range, version: PredicateVersion, mode: EvidenceMode, - ) -> VortexResult { + ) -> VortexResult> { let predicate = &self.predicates[predicate_idx]; let req = OwnedEvidenceRequest { id: predicate.id, version, predicate: predicate.expr.clone(), - range: range.clone(), + range, mode, }; - let mut work_reads = Vec::new(); - let mut tasks = Vec::with_capacity(predicate.evidence.len()); - for plan in &predicate.evidence { + let predicate_idx_u32 = + u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; + let mut work = Vec::with_capacity(predicate.evidence.len()); + for (evidence_idx, plan) in predicate.evidence.iter().enumerate() { + if plan.scope() == EvidenceScope::Scan + && self.use_scan_scope_evidence(predicate_idx, mode) + { + continue; + } if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { continue; } + let evidence_idx_u32 = + u32::try_from(evidence_idx).map_err(|_| vortex_err!("too many evidence plans"))?; let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); let requests = plan.segment_requests(&req.as_request(), &mut segment_ctx)?; let reads = self.register_segment_reads(requests); - work_reads.extend(Self::work_reads(&reads)); + let work_reads = ScanTaskRead::from_scan_reads(&reads); + let priority = plan + .cost(&req.as_request()) + .priority( + scan_task_read_bytes(&work_reads), + mode == EvidenceMode::RecheckBeforeProjection, + ) + .saturating_add(predicate.static_cost); let task = Arc::clone(plan).create_task(req.clone(), reads)?; - tasks.push(task); - } - - let execution = Arc::clone(self); - Ok(Work::new( - ScanIoPhase::EvidenceProbe, - self.session.handle(), - 
work_reads, - async move { - let predicate = &execution.predicates[predicate_idx]; - let mut acc = PredicateEvidence::new(predicate.id, version, range.clone())?; - for task in tasks { - for fragment in task.evidence(&execution.reader).await? { - acc.absorb(fragment)?; - } - if acc.all_false() { - break; - } - } - Ok(EvidenceWorkOutput { + let execution = Arc::clone(self); + work.push( + FutureScanTask::new_in_lane( morsel_id, - predicate_idx, - evidence: acc, - }) - } - .boxed(), - ) - .into_queued(morsel_id, WorkOutput::Evidence)) + ScanIoPhase::EvidenceProbe, + ScanTaskLane::Evidence { + predicate_idx: predicate_idx_u32, + evidence_idx: evidence_idx_u32, + }, + work_reads, + async move { + let fragments = task.evidence(&execution.reader).await?; + Ok(WorkOutput::Evidence(EvidenceWorkOutput { + morsel_id, + predicate_idx, + version, + source: EvidenceWorkSource::Provider, + fragments, + })) + } + .boxed(), + ) + .with_priority(priority) + .boxed(), + ); + } + Ok(work) } fn plan_predicate_work( @@ -2243,6 +2707,7 @@ impl ScanExecution { range: Range, need: Mask, version: PredicateVersion, + priority: u64, ) -> VortexResult { let len = range_len(&range)?; let predicate = &self.predicates[predicate_idx]; @@ -2258,16 +2723,20 @@ impl ScanExecution { .read .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; let reads = self.register_segment_reads(requests); - let work_reads = Self::work_reads(&reads); + let work_reads = ScanTaskRead::from_scan_reads(&reads); let task = Arc::clone(&predicate.read).create_task(range.clone(), rows, reads)?; let execution = Arc::clone(self); - Ok(Work::new( + let predicate_idx_u32 = + u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; + Ok(FutureScanTask::new_in_lane( + morsel_id, ScanIoPhase::PredicateRead, - self.session.handle(), + ScanTaskLane::Predicate { + predicate_idx: predicate_idx_u32, + }, work_reads, async move { - let predicate = &execution.predicates[predicate_idx]; let mut ctx = 
execution.session.create_execution_ctx(); // Filter-first: when few rows are demanded, read with selection = `need` so the leaf // returns the compacted (filtered) array and an expensive residual (e.g. an FSST @@ -2302,21 +2771,27 @@ impl ScanExecution { ); } let pass = &result & &need; + let input_rows = need.true_count(); + let pass_rows = pass.true_count(); let exact = !&need | &pass; - let mut evidence = PredicateEvidence::new(predicate.id, version, range.clone())?; - evidence.absorb(EvidenceFragment::new( - range, - PredicateEvidenceKind::ExactMask(exact), - ))?; - Ok(EvidenceWorkOutput { + Ok(WorkOutput::Evidence(EvidenceWorkOutput { morsel_id, predicate_idx, - evidence, - }) + version, + source: EvidenceWorkSource::Predicate { + input_rows, + pass_rows, + }, + fragments: vec![EvidenceFragment::new( + range, + PredicateEvidenceKind::ExactMask(exact), + )], + })) } .boxed(), ) - .into_queued(morsel_id, WorkOutput::Evidence)) + .with_priority(priority) + .boxed()) } fn plan_projection_work( @@ -2350,23 +2825,27 @@ impl ScanExecution { self.projection .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; let reads = self.register_segment_reads(requests); - let work_reads = Self::work_reads(&reads); + let work_reads = ScanTaskRead::from_scan_reads(&reads); let task = Arc::clone(&self.projection).create_task(range, rows, reads)?; let execution = Arc::clone(self); Ok(Some( - Work::new( + FutureScanTask::new_in_lane( + morsel_id, ScanIoPhase::ProjectionRead, - self.session.handle(), + ScanTaskLane::Projection, work_reads, async move { let mut ctx = execution.session.create_execution_ctx(); let array = task.read(&execution.reader, &mut ctx).await?; - Ok(ProjectionWorkOutput { morsel_id, array }) + Ok(WorkOutput::Projection(ProjectionWorkOutput { + morsel_id, + array, + })) } .boxed(), ) - .into_queued(morsel_id, WorkOutput::Projection), + .boxed(), )) } @@ -2375,6 +2854,61 @@ impl ScanExecution { } } +fn push_overlapping_fragments( + fragments: 
&[EvidenceFragment], + range: &Range, + output: &mut Vec, +) -> VortexResult<()> { + let start = fragments + .partition_point(|fragment| fragment.rows.start < range.start) + .saturating_sub(1); + for fragment in &fragments[start..] { + if fragment.rows.start >= range.end { + break; + } + if let Some(fragment) = slice_evidence_fragment(fragment, range)? { + output.push(fragment); + } + } + Ok(()) +} + +fn wake_scan_evidence_waiters(slot: &mut ScanEvidenceSlot) { + for waiter in slot.waiters.drain(..) { + drop(waiter.send(())); + } +} + +fn slice_evidence_fragment( + fragment: &EvidenceFragment, + range: &Range, +) -> VortexResult> { + let rows = fragment.rows.start.max(range.start)..fragment.rows.end.min(range.end); + if rows.start >= rows.end { + return Ok(None); + } + if rows == fragment.rows { + return Ok(Some(fragment.clone())); + } + + let local = usize::try_from(rows.start - fragment.rows.start) + .map_err(|_| vortex_err!("evidence fragment exceeds usize"))? + ..usize::try_from(rows.end - fragment.rows.start) + .map_err(|_| vortex_err!("evidence fragment exceeds usize"))?; + let kind = match &fragment.kind { + PredicateEvidenceKind::AllFalse => PredicateEvidenceKind::AllFalse, + PredicateEvidenceKind::AllTrue => PredicateEvidenceKind::AllTrue, + PredicateEvidenceKind::Unknown => PredicateEvidenceKind::Unknown, + PredicateEvidenceKind::ExactMask(mask) => { + PredicateEvidenceKind::ExactMask(mask.slice(local)) + } + PredicateEvidenceKind::CandidateMask(mask) => { + PredicateEvidenceKind::CandidateMask(mask.slice(local)) + } + }; + Ok(Some(EvidenceFragment::new(rows, kind))) +} + fn push_expr( root: &ScanPlanRef, expr: &Expression, @@ -2420,7 +2954,7 @@ fn prepare_split_ranges( ) } -fn explicit_row_range<'a>(row_count: u64, row_range: &'a Range) -> Option<&'a Range> { +fn explicit_row_range(row_count: u64, row_range: &Range) -> Option<&Range> { (row_range.start != 0 || row_range.end != row_count).then_some(row_range) } diff --git 
a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index 5d28894ae4a..98a37bc444a 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -11,9 +11,9 @@ use std::fmt; use std::sync::Arc; use parking_lot::Mutex; -use rustc_hash::FxHashMap; use vortex_array::dtype::FieldName; use vortex_array::dtype::FieldNames; +use vortex_array::dtype::StructFields; use vortex_array::expr::Expression; use vortex_array::expr::get_item; use vortex_array::expr::is_root; @@ -39,7 +39,6 @@ use vortex_scan::plan::literal_scan_plan; use vortex_scan::plan::request::ScanRequest; use vortex_session::VortexSession; -use crate::LayoutChildType; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::layout_v2::Struct; @@ -60,10 +59,15 @@ pub(crate) fn new_scan_plan( .new_scan_plan(&mut ScanRequest::empty(), session) }) .transpose()?; + let fields = struct_fields(layout.dtype())?; + let children = Mutex::new(vec![None; fields.nfields()]); + let field_child_offset = usize::from(layout.dtype().is_nullable()); Ok(Arc::new(StructScanPlan { layout: layout.to_layout(), session: session.clone(), - children: Mutex::new(FxHashMap::default()), + fields, + children, + field_child_offset, validity, })) } @@ -72,7 +76,9 @@ pub(crate) fn new_scan_plan( pub struct StructScanPlan { layout: LayoutRef, session: VortexSession, - children: Mutex>, + fields: StructFields, + children: Mutex>>, + field_child_offset: usize, validity: Option, } @@ -86,7 +92,7 @@ impl ScanPlan for StructScanPlan { expr: &Expression, cx: &mut PushCtx, ) -> VortexResult> { - let scope = struct_fields(self.layout.dtype())?; + let scope = &self.fields; if let Some(literal) = literal_scan_plan(expr, self.layout.row_count()) { return Ok(Some(literal)); } @@ -149,29 +155,52 @@ impl StructScanPlan { } fn child_field(&self, name: &FieldName) -> VortexResult { - if let Some(hit) = self.children.lock().get(name) { + let 
Some(field_idx) = self.fields.find(name) else { + vortex_bail!("field {name} not found in struct layout") + }; + self.child_field_by_index(field_idx, name) + } + + fn child_field_by_index( + &self, + field_idx: usize, + name: &FieldName, + ) -> VortexResult { + let mut children = self.children.lock(); + let Some(slot) = children.get_mut(field_idx) else { + vortex_bail!("field {name} not found in struct layout") + }; + if let Some(hit) = slot { return Ok(Arc::clone(hit)); } - for idx in 0..self.layout.nchildren() { - if let Ok(LayoutChildType::Field(field)) = self.layout.child_type(idx) - && field == *name - { - let child = self - .layout - .child(idx)? - .new_scan_plan(&mut ScanRequest::empty(), &self.session)?; - let mut children = self.children.lock(); - return Ok(Arc::clone(children.entry(name.clone()).or_insert(child))); - } - } - vortex_bail!("field {name} not found in struct layout") + + let child_idx = field_idx + self.field_child_offset; + let child = self + .layout + .child(child_idx)? + .new_scan_plan(&mut ScanRequest::empty(), &self.session)?; + *slot = Some(Arc::clone(&child)); + Ok(child) } fn push_struct(&self, names: FieldNames, cx: &mut PushCtx) -> VortexResult { + let field_indices = if names == self.fields.names() { + (0..names.len()).collect::>() + } else { + names + .iter() + .map(|name| { + self.fields + .find(name) + .ok_or_else(|| vortex_error::vortex_err!("field {name} not found")) + }) + .collect::>>()? + }; let fields = names .iter() - .map(|name| { - let child = self.child_field(name)?; + .zip(field_indices) + .map(|(name, field_idx)| { + let child = self.child_field_by_index(field_idx, name)?; child .try_push_expr(&root(), cx)? 
.ok_or_else(|| vortex_error::vortex_err!("field {name} did not push root")) diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 06f139526cb..bdf1fbe04fe 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -43,6 +43,8 @@ use vortex_error::VortexResult; use vortex_error::vortex_err; use vortex_mask::Mask; use vortex_scan::plan::AggregateAnswer; +use vortex_scan::plan::EvidenceCost; +use vortex_scan::plan::EvidenceScope; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedAggregate; use vortex_scan::plan::PreparedAggregateRef; @@ -686,6 +688,14 @@ impl PreparedEvidence for ZonedPreparedEvidence { true } + fn cost(&self, _req: &EvidenceRequest<'_>) -> EvidenceCost { + EvidenceCost::Metadata + } + + fn scope(&self) -> EvidenceScope { + EvidenceScope::Scan + } + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "zoned") } diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index 7838857281b..ffdae740dcb 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -27,6 +27,7 @@ pub mod row_mask; pub mod scheduler; pub mod segments; pub mod selection; +pub mod task; use std::any::Any; use std::ops::Range; @@ -46,6 +47,7 @@ pub use scheduler::WorkPermit; pub use scheduler::WorkRequest; pub use segments::*; use selection::Selection; +pub use task::*; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; use vortex_array::dtype::FieldPath; diff --git a/vortex-scan/src/plan/mod.rs b/vortex-scan/src/plan/mod.rs index 024e59d37f1..d1c1044b857 100644 --- a/vortex-scan/src/plan/mod.rs +++ b/vortex-scan/src/plan/mod.rs @@ -1275,6 +1275,51 @@ impl PreparedRead for MaskPreparedRead { } } +/// Static cost class for a predicate-evidence provider. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum EvidenceCost { + /// Cheap metadata evidence, such as zone maps. 
+ Metadata, + /// Index-like evidence that may touch more state than metadata but avoids row decoding. + Index, + /// Evidence that performs row-level compute. + Compute, + /// Evidence with unknown cost. + Unknown, +} + +/// Natural execution scope for a predicate-evidence provider. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub enum EvidenceScope { + /// Evidence is produced independently for each morsel range. + #[default] + Morsel, + /// Evidence is produced over the scan row domain and consumed by many morsels. + Scan, +} + +impl EvidenceCost { + /// Convert this cost class and estimated read bytes into a scheduling priority. + /// + /// Lower priorities run first. The returned value is intentionally coarse; runtime selectivity + /// feedback should dominate once a lane has observations. + pub fn priority(self, read_bytes: u64, dynamic_recheck: bool) -> u64 { + let base = match self { + Self::Metadata => 1_000, + Self::Index => 10_000, + Self::Unknown => 100_000, + Self::Compute => 1_000_000, + }; + let read_penalty = read_bytes / 1024; + let priority = base + read_penalty; + if dynamic_recheck { + priority / 2 + } else { + priority + } + } +} + /// Prepared predicate evidence for one predicate expression. pub trait PreparedEvidence: 'static + Send + Sync { /// Produce evidence for the prepared predicate over `req.range`. @@ -1300,6 +1345,16 @@ pub trait PreparedEvidence: 'static + Send + Sync { false } + /// Static cost class used by the scan scheduler when ordering evidence tasks. + fn cost(&self, _req: &EvidenceRequest<'_>) -> EvidenceCost { + EvidenceCost::Unknown + } + + /// Return the natural execution scope for this evidence provider. + fn scope(&self) -> EvidenceScope { + EvidenceScope::Morsel + } + /// Compact description for plan display. 
fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "evidence") diff --git a/vortex-scan/src/task.rs b/vortex-scan/src/task.rs new file mode 100644 index 00000000000..f68a1185e36 --- /dev/null +++ b/vortex-scan/src/task.rs @@ -0,0 +1,915 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scheduler-visible scan tasks. +//! +//! A scan task is a morsel-level runtime unit with explicit read dependencies. +//! Layouts and scan-plan adapters still decide what a task means, while the +//! scheduler can reason about phase, read bytes, and deduplication before the +//! task future is launched. + +use std::collections::BTreeMap; +use std::collections::VecDeque; + +use futures::future::BoxFuture; +use vortex_error::VortexResult; +use vortex_utils::aliases::hash_map::Entry; +use vortex_utils::aliases::hash_map::HashMap; +use vortex_utils::aliases::hash_set::HashSet; + +use crate::segments::ScanIoPhase; +use crate::segments::ScanRead; +use crate::segments::SegmentRequestKey; + +/// Fine-grained scheduling lane for a scan task. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum ScanTaskLane { + /// Scan-domain metadata/index evidence shared by all morsels. + ScanEvidence { + /// Predicate slot this evidence task belongs to. + predicate_idx: u32, + /// Evidence-provider slot within the predicate. + evidence_idx: u32, + }, + /// Metadata/index evidence for one predicate. + Evidence { + /// Predicate slot this evidence task belongs to. + predicate_idx: u32, + /// Evidence-provider slot within the predicate. + evidence_idx: u32, + }, + /// Exact residual evaluation for one predicate. + Predicate { + /// Predicate slot this residual-read task belongs to. + predicate_idx: u32, + }, + /// Final projected data read. + Projection, + /// Aggregate read. + Aggregate, +} + +impl ScanTaskLane { + /// Default lane for callers that only know the high-level phase. 
+ pub fn from_phase(phase: ScanIoPhase) -> Self { + match phase { + ScanIoPhase::EvidenceProbe | ScanIoPhase::EvidenceSetup => Self::Evidence { + predicate_idx: 0, + evidence_idx: 0, + }, + ScanIoPhase::PredicateRead => Self::Predicate { predicate_idx: 0 }, + ScanIoPhase::ProjectionRead => Self::Projection, + ScanIoPhase::AggregateRead => Self::Aggregate, + } + } + + fn group(self) -> ScanTaskGroup { + match self { + Self::ScanEvidence { .. } => ScanTaskGroup::Evidence, + Self::Evidence { .. } => ScanTaskGroup::Evidence, + Self::Predicate { .. } => ScanTaskGroup::Predicate, + Self::Projection | Self::Aggregate => ScanTaskGroup::Projection, + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum ScanTaskGroup { + Predicate, + Projection, + Evidence, +} + +impl ScanTaskGroup { + fn idx(self) -> usize { + match self { + Self::Predicate => 0, + Self::Projection => 1, + Self::Evidence => 2, + } + } +} + +/// Scheduler-visible read dependency for one scan task. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct ScanTaskRead { + /// Dedupe key for the logical read. + pub key: SegmentRequestKey, + /// Number of bytes this read contributes if it is not already active. + pub bytes: u64, +} + +impl ScanTaskRead { + /// Convert registered segment reads into scheduler-visible task reads. + pub fn from_scan_reads(reads: &[ScanRead]) -> Vec { + let mut seen = HashSet::new(); + reads + .iter() + .filter_map(|read| { + let key = SegmentRequestKey::from(&read.request); + seen.insert(key).then_some(Self { + key, + bytes: read.request.bytes, + }) + }) + .collect() + } +} + +/// A morsel-level scan task with explicit read dependencies. +pub trait ScanTask: Send { + /// Morsel identifier this task belongs to. + fn morsel_id(&self) -> usize; + + /// High-level scan phase for scheduling. + fn phase(&self) -> ScanIoPhase; + + /// Fine-grained scheduling lane. + fn lane(&self) -> ScanTaskLane; + + /// Logical reads required by this task. 
+ fn reads(&self) -> &[ScanTaskRead]; + + /// Scheduling priority within this task's group. Lower values run first. + fn priority(&self) -> u64; + + /// Execute this task. + fn into_future(self: Box) -> BoxFuture<'static, VortexResult>; +} + +/// Boxed scan task. +pub type ScanTaskBox = Box>; + +/// A scan task backed by an already-constructed future. +pub struct FutureScanTask { + morsel_id: usize, + phase: ScanIoPhase, + lane: ScanTaskLane, + reads: Vec, + priority: u64, + future: BoxFuture<'static, VortexResult>, +} + +impl FutureScanTask { + /// Default scheduling priority for tasks without a more specific estimate. + pub const DEFAULT_PRIORITY: u64 = 1_000_000; + + /// Create a future-backed scan task. + pub fn new( + morsel_id: usize, + phase: ScanIoPhase, + reads: Vec, + future: BoxFuture<'static, VortexResult>, + ) -> Self { + Self::new_in_lane( + morsel_id, + phase, + ScanTaskLane::from_phase(phase), + reads, + future, + ) + } + + /// Create a future-backed scan task in a specific scheduling lane. + pub fn new_in_lane( + morsel_id: usize, + phase: ScanIoPhase, + lane: ScanTaskLane, + reads: Vec, + future: BoxFuture<'static, VortexResult>, + ) -> Self { + Self { + morsel_id, + phase, + lane, + reads, + priority: Self::DEFAULT_PRIORITY, + future, + } + } + + /// Return this task with an explicit scheduling priority. + pub fn with_priority(mut self, priority: u64) -> Self { + self.priority = priority; + self + } + + /// Box this task behind the [`ScanTask`] trait. 
+ pub fn boxed(self) -> ScanTaskBox + where + T: 'static, + { + Box::new(self) + } +} + +impl ScanTask for FutureScanTask { + fn morsel_id(&self) -> usize { + self.morsel_id + } + + fn phase(&self) -> ScanIoPhase { + self.phase + } + + fn lane(&self) -> ScanTaskLane { + self.lane + } + + fn reads(&self) -> &[ScanTaskRead] { + &self.reads + } + + fn priority(&self) -> u64 { + self.priority + } + + fn into_future(self: Box) -> BoxFuture<'static, VortexResult> { + self.future + } +} + +/// A task admitted for launch, including the reads that contributed to the active byte budget. +pub struct AdmittedScanTask { + task: ScanTaskBox, + lane: ScanTaskLane, + admitted_reads: Vec, +} + +impl AdmittedScanTask { + /// Create an admitted task. + fn new(task: ScanTaskBox, admitted_reads: Vec) -> Self { + let lane = task.lane(); + Self { + task, + lane, + admitted_reads, + } + } + + /// Scheduling lane for this launched task. + pub fn lane(&self) -> ScanTaskLane { + self.lane + } + + /// Borrow reads admitted for this task. + pub fn admitted_reads(&self) -> &[ScanTaskRead] { + &self.admitted_reads + } + + /// Consume this value into the task and admitted reads. + pub fn into_parts(self) -> (ScanTaskBox, ScanTaskLane, Vec) { + (self.task, self.lane, self.admitted_reads) + } +} + +#[derive(Clone, Copy, Debug)] +struct ActiveRead { + bytes: u64, + refs: usize, +} + +/// Queue of scheduler-visible scan tasks with byte-budgeted read admission. +pub struct ScanTaskQueue { + evidence_queues: BTreeMap<(u32, u32), VecDeque>>, + predicate_queues: BTreeMap>>, + projection_queue: VecDeque>, + read_byte_budget: u64, + active_read_bytes: u64, + active_group_read_bytes: [u64; 3], + active_reads: HashMap, +} + +const FRONTIER_SLACK_MORSELS: usize = 4; + +impl ScanTaskQueue { + /// Create an empty task queue with an in-flight read byte budget. 
+ pub fn new(read_byte_budget: u64) -> Self { + Self { + evidence_queues: BTreeMap::new(), + predicate_queues: BTreeMap::new(), + projection_queue: VecDeque::new(), + read_byte_budget, + active_read_bytes: 0, + active_group_read_bytes: [0; 3], + active_reads: HashMap::new(), + } + } + + /// Push a task into the queue for its phase. + pub fn push(&mut self, task: ScanTaskBox) { + let phase = task.phase(); + let lane = task.lane(); + tracing::trace!( + target: "vortex_scan::task", + morsel_id = task.morsel_id(), + ?phase, + ?lane, + read_count = task.reads().len(), + read_bytes = scan_task_read_bytes(task.reads()), + "queued scan task" + ); + match lane { + ScanTaskLane::ScanEvidence { + predicate_idx, + evidence_idx, + } + | ScanTaskLane::Evidence { + predicate_idx, + evidence_idx, + } => self + .evidence_queues + .entry((predicate_idx, evidence_idx)) + .or_default() + .push_back(task), + ScanTaskLane::Predicate { predicate_idx } => self + .predicate_queues + .entry(predicate_idx) + .or_default() + .push_back(task), + ScanTaskLane::Projection | ScanTaskLane::Aggregate => { + self.projection_queue.push_back(task) + } + } + } + + /// Extend this queue with tasks. + pub fn extend(&mut self, tasks: impl IntoIterator>) { + for task in tasks { + self.push(task); + } + } + + /// Return whether no tasks are queued. + pub fn is_empty(&self) -> bool { + self.evidence_queues.values().all(VecDeque::is_empty) + && self.predicate_queues.values().all(VecDeque::is_empty) + && self.projection_queue.is_empty() + } + + /// Clear queued tasks and active read accounting. + pub fn clear(&mut self) { + self.evidence_queues.clear(); + self.predicate_queues.clear(); + self.projection_queue.clear(); + self.active_read_bytes = 0; + self.active_group_read_bytes = [0; 3]; + self.active_reads.clear(); + } + + /// Number of queued evidence tasks. + pub fn evidence_len(&self) -> usize { + self.evidence_queues.values().map(VecDeque::len).sum() + } + + /// Number of queued predicate-read tasks. 
+ pub fn predicate_len(&self) -> usize { + self.predicate_queues.values().map(VecDeque::len).sum() + } + + /// Number of queued projection-read tasks. + pub fn projection_len(&self) -> usize { + self.projection_queue.len() + } + + /// Number of currently active logical read bytes. + pub fn active_read_bytes(&self) -> u64 { + self.active_read_bytes + } + + /// Pop the next task admitted by the frontier policy and read byte budget. + pub fn pop_next_admissible( + &mut self, + in_flight_empty: bool, + mut is_live_morsel: impl FnMut(usize) -> bool, + ) -> Option> { + self.drop_dead_heads(&mut is_live_morsel); + let frontier = self.frontier_morsel()?; + + for (group, enforce_target) in [ + (ScanTaskGroup::Evidence, true), + (ScanTaskGroup::Predicate, true), + (ScanTaskGroup::Projection, true), + (ScanTaskGroup::Predicate, false), + (ScanTaskGroup::Projection, false), + (ScanTaskGroup::Evidence, false), + ] { + if let Some(task) = + self.pop_group_admissible(group, enforce_target, in_flight_empty, frontier) + { + return Some(task); + } + } + + None + } + + fn pop_group_admissible( + &mut self, + group: ScanTaskGroup, + enforce_target: bool, + in_flight_empty: bool, + frontier: usize, + ) -> Option> { + if enforce_target && !self.group_has_budget(group, 0, in_flight_empty) { + return None; + } + + let active_reads = &self.active_reads; + let active_read_bytes = self.active_read_bytes; + let read_byte_budget = self.read_byte_budget; + + match group { + ScanTaskGroup::Predicate => { + let mut best = None; + for (idx, queue) in &self.predicate_queues { + let Some(task) = queue.front() else { + continue; + }; + let score = TaskScore::new( + active_reads, + task.reads(), + task.priority(), + task.morsel_id(), + *idx, + ); + if !score.within_frontier(frontier) { + continue; + } + if !can_admit_task( + active_read_bytes, + read_byte_budget, + in_flight_empty, + score.incremental_read_bytes, + ) || (enforce_target + && !self.group_has_budget(group, score.read_bytes, 
in_flight_empty)) + { + continue; + } + if best.is_none_or(|(_, best_score)| score < best_score) { + best = Some((*idx, score)); + } + } + let (idx, _) = best?; + let task = self.predicate_queues.get_mut(&idx)?.pop_front()?; + Some(self.admit_task(task)) + } + ScanTaskGroup::Projection => { + let task = self.projection_queue.front()?; + let score = TaskScore::new( + active_reads, + task.reads(), + task.priority(), + task.morsel_id(), + 0, + ); + if !score.within_frontier(frontier) { + return None; + } + if !can_admit_task( + active_read_bytes, + read_byte_budget, + in_flight_empty, + score.incremental_read_bytes, + ) || (enforce_target + && !self.group_has_budget(group, score.read_bytes, in_flight_empty)) + { + return None; + } + let task = self.projection_queue.pop_front()?; + Some(self.admit_task(task)) + } + ScanTaskGroup::Evidence => { + let mut best = None; + for (idx, queue) in &self.evidence_queues { + let Some(task) = queue.front() else { + continue; + }; + let score = TaskScore::new( + active_reads, + task.reads(), + task.priority(), + task.morsel_id(), + idx.0, + ); + if !score.within_frontier(frontier) { + continue; + } + if !can_admit_task( + active_read_bytes, + read_byte_budget, + in_flight_empty, + score.incremental_read_bytes, + ) || (enforce_target + && !self.group_has_budget(group, score.read_bytes, in_flight_empty)) + { + continue; + } + if best.is_none_or(|(_, best_score)| score < best_score) { + best = Some((*idx, score)); + } + } + let (idx, _) = best?; + let task = self.evidence_queues.get_mut(&idx)?.pop_front()?; + Some(self.admit_task(task)) + } + } + } + + fn drop_dead_heads(&mut self, is_live_morsel: &mut impl FnMut(usize) -> bool) { + drop_dead_heads_from_map(&mut self.evidence_queues, &mut |task| { + !matches!(task.lane(), ScanTaskLane::ScanEvidence { .. 
}) + && !is_live_morsel(task.morsel_id()) + }); + drop_dead_heads_from_map(&mut self.predicate_queues, &mut |task| { + !is_live_morsel(task.morsel_id()) + }); + while self + .projection_queue + .front() + .is_some_and(|task| !is_live_morsel(task.morsel_id())) + { + self.projection_queue.pop_front(); + } + } + + fn frontier_morsel(&self) -> Option { + self.evidence_queues + .values() + .filter_map(|queue| queue.front().map(|task| task.morsel_id())) + .chain( + self.predicate_queues + .values() + .filter_map(|queue| queue.front().map(|task| task.morsel_id())), + ) + .chain(self.projection_queue.front().map(|task| task.morsel_id())) + .min() + } + + fn group_target_bytes(&self, group: ScanTaskGroup) -> u64 { + if self.read_byte_budget == u64::MAX { + return u64::MAX; + } + + let projection = (self.read_byte_budget / 8).max(1); + let evidence = (self.read_byte_budget / 8).max(1); + match group { + ScanTaskGroup::Predicate => self + .read_byte_budget + .saturating_sub(projection) + .saturating_sub(evidence) + .max(1), + ScanTaskGroup::Projection => projection, + ScanTaskGroup::Evidence => evidence, + } + } + + fn group_has_budget( + &self, + group: ScanTaskGroup, + task_read_bytes: u64, + in_flight_empty: bool, + ) -> bool { + let active = self.active_group_read_bytes[group.idx()]; + let target = self.group_target_bytes(group); + active < target || active.saturating_add(task_read_bytes) <= target || in_flight_empty + } + + fn admit_task(&mut self, task: ScanTaskBox) -> AdmittedScanTask { + let phase = task.phase(); + let lane = task.lane(); + let group = lane.group(); + let morsel_id = task.morsel_id(); + let read_count = task.reads().len(); + let read_bytes = scan_task_read_bytes(task.reads()); + let incremental_read_bytes = incremental_read_bytes(&self.active_reads, task.reads()); + let mut admitted = Vec::with_capacity(task.reads().len()); + let mut seen = HashSet::new(); + for read in task.reads() { + if !seen.insert(read.key) { + continue; + } + match 
self.active_reads.entry(read.key) { + Entry::Occupied(mut entry) => { + let active = entry.get_mut(); + active.refs = active.refs.saturating_add(1); + } + Entry::Vacant(entry) => { + self.active_read_bytes = self.active_read_bytes.saturating_add(read.bytes); + entry.insert(ActiveRead { + bytes: read.bytes, + refs: 1, + }); + } + } + admitted.push(*read); + } + self.active_group_read_bytes[group.idx()] = self.active_group_read_bytes[group.idx()] + .saturating_add(scan_task_read_bytes(&admitted)); + tracing::trace!( + target: "vortex_scan::task", + morsel_id, + ?phase, + ?lane, + read_count, + read_bytes, + incremental_read_bytes, + active_read_bytes = self.active_read_bytes, + "admitted scan task" + ); + AdmittedScanTask::new(task, admitted) + } + + /// Release reads admitted for a completed launched task. + pub fn release_reads(&mut self, lane: ScanTaskLane, reads: &[ScanTaskRead]) { + let released_bytes = scan_task_read_bytes(reads); + self.active_group_read_bytes[lane.group().idx()] = + self.active_group_read_bytes[lane.group().idx()].saturating_sub(released_bytes); + let mut seen = HashSet::new(); + for read in reads { + if !seen.insert(read.key) { + continue; + } + let Entry::Occupied(mut entry) = self.active_reads.entry(read.key) else { + continue; + }; + if entry.get().refs > 1 { + entry.get_mut().refs -= 1; + } else { + let active = entry.remove(); + self.active_read_bytes = self.active_read_bytes.saturating_sub(active.bytes); + } + } + tracing::trace!( + target: "vortex_scan::task", + ?lane, + read_count = reads.len(), + released_bytes, + active_read_bytes = self.active_read_bytes, + "released scan task reads" + ); + } +} + +fn drop_dead_heads_from_map( + queues: &mut BTreeMap>>, + should_drop: &mut impl FnMut(&ScanTaskBox) -> bool, +) { + let keys = queues.keys().copied().collect::>(); + for key in keys { + let Some(queue) = queues.get_mut(&key) else { + continue; + }; + while queue.front().is_some_and(&mut *should_drop) { + queue.pop_front(); + } + if 
queue.is_empty() { + queues.remove(&key); + } + } +} + +fn can_admit_task( + active_read_bytes: u64, + read_byte_budget: u64, + in_flight_empty: bool, + incremental_read_bytes: u64, +) -> bool { + incremental_read_bytes == 0 + || active_read_bytes.saturating_add(incremental_read_bytes) <= read_byte_budget + || in_flight_empty +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +struct TaskScore { + priority: u64, + incremental_read_bytes: u64, + read_bytes: u64, + morsel_id: usize, + lane_idx: u32, +} + +impl TaskScore { + fn new( + active_reads: &HashMap, + reads: &[ScanTaskRead], + priority: u64, + morsel_id: usize, + lane_idx: u32, + ) -> Self { + Self { + priority, + incremental_read_bytes: incremental_read_bytes(active_reads, reads), + read_bytes: scan_task_read_bytes(reads), + morsel_id, + lane_idx, + } + } + + fn within_frontier(&self, frontier: usize) -> bool { + self.morsel_id <= frontier.saturating_add(FRONTIER_SLACK_MORSELS) + } +} + +fn incremental_read_bytes( + active_reads: &HashMap, + reads: &[ScanTaskRead], +) -> u64 { + let mut seen = HashSet::new(); + reads + .iter() + .filter(|read| seen.insert(read.key) && !active_reads.contains_key(&read.key)) + .map(|read| read.bytes) + .sum() +} + +/// Count each unique task read once. 
+pub fn scan_task_read_bytes(reads: &[ScanTaskRead]) -> u64 { + let mut seen = HashSet::new(); + reads + .iter() + .filter(|read| seen.insert(read.key)) + .map(|read| read.bytes) + .sum() +} + +#[cfg(test)] +mod tests { + use futures::FutureExt; + + use super::*; + use crate::segments::SegmentId; + + fn read(segment: u32, bytes: u64) -> ScanTaskRead { + ScanTaskRead { + key: SegmentRequestKey::new(SegmentId::from(segment)), + bytes, + } + } + + fn task(morsel_id: usize, phase: ScanIoPhase, reads: Vec) -> ScanTaskBox<()> { + FutureScanTask::new(morsel_id, phase, reads, async { Ok(()) }.boxed()).boxed() + } + + fn task_in_lane( + morsel_id: usize, + phase: ScanIoPhase, + lane: ScanTaskLane, + reads: Vec, + ) -> ScanTaskBox<()> { + FutureScanTask::new_in_lane(morsel_id, phase, lane, reads, async { Ok(()) }.boxed()).boxed() + } + + fn prioritized_task_in_lane( + morsel_id: usize, + phase: ScanIoPhase, + lane: ScanTaskLane, + reads: Vec, + priority: u64, + ) -> ScanTaskBox<()> { + FutureScanTask::new_in_lane(morsel_id, phase, lane, reads, async { Ok(()) }.boxed()) + .with_priority(priority) + .boxed() + } + + #[test] + fn queue_admits_by_incremental_read_budget() { + let mut queue = ScanTaskQueue::new(100); + queue.push(task(0, ScanIoPhase::EvidenceProbe, vec![read(1, 80)])); + queue.push(task(0, ScanIoPhase::ProjectionRead, vec![read(2, 80)])); + + let evidence = queue + .pop_next_admissible(true, |_| true) + .expect("evidence task should be admitted"); + assert_eq!(queue.active_read_bytes(), 80); + assert!(queue.pop_next_admissible(false, |_| true).is_none()); + + queue.release_reads(evidence.lane(), evidence.admitted_reads()); + let projection = queue + .pop_next_admissible(false, |_| true) + .expect("projection task should be admitted after release"); + assert_eq!(queue.active_read_bytes(), 80); + queue.release_reads(projection.lane(), projection.admitted_reads()); + assert_eq!(queue.active_read_bytes(), 0); + } + + #[test] + fn queue_dedupes_reads_within_one_task() 
{ + let duplicate = read(1, 40); + let mut queue = ScanTaskQueue::new(100); + queue.push(task( + 0, + ScanIoPhase::ProjectionRead, + vec![duplicate, duplicate], + )); + + let admitted = queue + .pop_next_admissible(true, |_| true) + .expect("task should be admitted"); + assert_eq!(admitted.admitted_reads().len(), 1); + assert_eq!(queue.active_read_bytes(), 40); + } + + #[test] + fn queue_preserves_frontier_within_lane() { + let mut queue = ScanTaskQueue::new(100); + queue.push(task(0, ScanIoPhase::EvidenceProbe, vec![read(1, 90)])); + queue.push(task(1, ScanIoPhase::EvidenceProbe, vec![read(2, 10)])); + + let next = queue + .pop_next_admissible(true, |_| true) + .expect("one task should be admitted"); + let (task, _lane, reads) = next.into_parts(); + assert_eq!(task.morsel_id(), 0); + assert_eq!(reads, vec![read(1, 90)]); + } + + #[test] + fn queue_runs_evidence_before_ready_predicate() { + let mut queue = ScanTaskQueue::new(100); + queue.push(task_in_lane( + 0, + ScanIoPhase::EvidenceProbe, + ScanTaskLane::Evidence { + predicate_idx: 0, + evidence_idx: 0, + }, + vec![read(1, 10)], + )); + queue.push(task_in_lane( + 0, + ScanIoPhase::PredicateRead, + ScanTaskLane::Predicate { predicate_idx: 0 }, + vec![read(2, 10)], + )); + + let next = queue + .pop_next_admissible(true, |_| true) + .expect("one task should be admitted"); + let (task, lane, reads) = next.into_parts(); + assert_eq!(task.morsel_id(), 0); + assert_eq!( + lane, + ScanTaskLane::Evidence { + predicate_idx: 0, + evidence_idx: 0, + } + ); + assert_eq!(reads, vec![read(1, 10)]); + } + + #[test] + fn queue_keeps_scan_evidence_for_dead_anchor_morsel() { + let mut queue = ScanTaskQueue::new(100); + queue.push(task_in_lane( + 0, + ScanIoPhase::EvidenceProbe, + ScanTaskLane::ScanEvidence { + predicate_idx: 0, + evidence_idx: 0, + }, + vec![read(1, 10)], + )); + + let next = queue + .pop_next_admissible(true, |_| false) + .expect("scan-scope evidence task should not be dropped with its anchor morsel"); + let 
(task, lane, reads) = next.into_parts(); + assert_eq!(task.morsel_id(), 0); + assert_eq!( + lane, + ScanTaskLane::ScanEvidence { + predicate_idx: 0, + evidence_idx: 0, + } + ); + assert_eq!(reads, vec![read(1, 10)]); + } + + #[test] + fn queue_prefers_lower_priority_within_group() { + let mut queue = ScanTaskQueue::new(100); + queue.push(prioritized_task_in_lane( + 0, + ScanIoPhase::PredicateRead, + ScanTaskLane::Predicate { predicate_idx: 0 }, + vec![read(1, 10)], + 100, + )); + queue.push(prioritized_task_in_lane( + 0, + ScanIoPhase::PredicateRead, + ScanTaskLane::Predicate { predicate_idx: 1 }, + vec![read(2, 10)], + 10, + )); + + let next = queue + .pop_next_admissible(true, |_| true) + .expect("one task should be admitted"); + let (task, lane, reads) = next.into_parts(); + assert_eq!(task.morsel_id(), 0); + assert_eq!(lane, ScanTaskLane::Predicate { predicate_idx: 1 }); + assert_eq!(reads, vec![read(2, 10)]); + } +} From 79ecea44859a42cd0252aa6ba902b658a6b85f61 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 22 Jun 2026 17:11:35 -0400 Subject: [PATCH 28/48] Tune scan2 dictionary projection Signed-off-by: Nicholas Gates --- vortex-file/src/multi/scan_v2.rs | 37 ++++- vortex-file/src/strategy.rs | 13 +- vortex-layout/src/scan/v2/layouts/dict.rs | 186 ++++++++++++++++++++++ vortex-scan/src/task.rs | 48 ++++++ 4 files changed, 277 insertions(+), 7 deletions(-) diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 710453b211a..3e84b847154 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -1178,6 +1178,8 @@ struct PartitionWorkSchedulerState { pending: VecDeque, morsels: Vec>, active_morsels: usize, + has_dynamic_predicates: bool, + in_flight_projection_tasks: usize, next_morsel_id: usize, next_emit_morsel_id: usize, task_queue: ScanTaskQueue, @@ -1212,18 +1214,24 @@ fn partition_work_stream( plan_window: usize, read_byte_budget: u64, ) -> impl futures::Stream> + Send + 'static { + let 
has_dynamic_predicates = morsels + .iter() + .any(|morsel| morsel.execution.has_dynamic_predicates()); tracing::debug!( target: "vortex_file::scan_v2", morsel_count = morsels.len(), ordered, plan_window, read_byte_budget, + has_dynamic_predicates, "created scan2 task stream" ); let state = PartitionWorkSchedulerState { pending: VecDeque::from(morsels), morsels: Vec::new(), active_morsels: 0, + has_dynamic_predicates, + in_flight_projection_tasks: 0, next_morsel_id: 0, next_emit_morsel_id: 0, task_queue: ScanTaskQueue::new(read_byte_budget), @@ -1287,6 +1295,7 @@ impl PartitionWorkSchedulerState { self.pending.clear(); self.morsels.clear(); self.active_morsels = 0; + self.in_flight_projection_tasks = 0; self.next_emit_morsel_id = 0; self.task_queue.clear(); self.in_flight = FuturesUnordered::new(); @@ -1322,13 +1331,17 @@ impl PartitionWorkSchedulerState { fn launch_next_admissible_work(&mut self) -> bool { let in_flight_empty = self.in_flight.is_empty(); + // Backlogged output should stop speculative projection for dynamic scans, but not the + // single projection needed to unblock an otherwise idle ordered stream. 
+ let projection_admissible = !self.has_dynamic_predicates + || (self.in_flight_projection_tasks == 0 && !self.has_completed_output_backlog()) + || in_flight_empty; let morsels = &self.morsels; - let Some(task) = self - .task_queue - .pop_next_admissible(in_flight_empty, |morsel_id| { - morsels.get(morsel_id).and_then(Option::as_ref).is_some() - }) - else { + let Some(task) = self.task_queue.pop_next_admissible_with_projection_gate( + in_flight_empty, + projection_admissible, + |morsel_id| morsels.get(morsel_id).and_then(Option::as_ref).is_some(), + ) else { return false; }; let (task, lane, reads) = task.into_parts(); @@ -1369,10 +1382,16 @@ impl PartitionWorkSchedulerState { } else { self.in_flight.push(self.handle.spawn(future).boxed()); } + if matches!(lane, ScanTaskLane::Projection) { + self.in_flight_projection_tasks = self.in_flight_projection_tasks.saturating_add(1); + } } fn release_reads(&mut self, lane: ScanTaskLane, reads: &[ScanTaskRead]) { self.task_queue.release_reads(lane, reads); + if matches!(lane, ScanTaskLane::Projection) { + self.in_flight_projection_tasks = self.in_flight_projection_tasks.saturating_sub(1); + } } fn complete_work(&mut self, output: WorkOutput) -> VortexResult> { @@ -1854,6 +1873,12 @@ impl PartitionWorkSchedulerState { } } } + + fn has_completed_output_backlog(&self) -> bool { + self.completed_morsels + .values() + .any(|morsel| matches!(morsel, CompletedMorsel::Output(_))) + } } struct ScanPlanPartition { diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index ed58f32e11d..1701653481a 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -32,6 +32,8 @@ use vortex_array::dtype::FieldPath; use vortex_btrblocks::BtrBlocksCompressorBuilder; use vortex_btrblocks::SchemeExt; use vortex_btrblocks::schemes::integer::IntDictScheme; +#[cfg(feature = "unstable_encodings")] +use vortex_btrblocks::schemes::string::OnPairScheme; use vortex_bytebool::ByteBool; use 
vortex_datetime_parts::DateTimeParts; use vortex_decimal_byte_parts::DecimalByteParts; @@ -155,8 +157,17 @@ impl Default for WriteStrategyBuilder { /// Create a new empty builder. It can be further configured, /// and then finally built yielding the [`LayoutStrategy`]. fn default() -> Self { + #[cfg_attr(not(feature = "unstable_encodings"), allow(unused_mut))] + let mut compressor = BtrBlocksCompressorBuilder::default(); + #[cfg(feature = "unstable_encodings")] + { + // OnPair currently optimizes for compressed size, but its string predicate kernels are + // not yet competitive with FSST for the scan-heavy default file format. + compressor = compressor.exclude_schemes([OnPairScheme.id()]); + } + Self { - compressor: CompressorConfig::BtrBlocks(BtrBlocksCompressorBuilder::default()), + compressor: CompressorConfig::BtrBlocks(compressor), row_block_size: 8192, field_writers: HashMap::new(), allow_encodings: Some(ALLOWED_ENCODINGS.clone()), diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 5971949fe85..1c001a84ca4 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -66,6 +66,10 @@ use crate::layouts::SharedArrayFuture; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; +const DENSE_REMAP_MAX_VALUES: usize = 1 << 20; +const DENSE_REMAP_VALUES_PER_CODE: usize = 4; +const UNREFERENCED_VALUE: usize = usize::MAX; + pub(crate) fn new_scan_plan( layout: Layout, _req: &mut ScanRequest, @@ -553,6 +557,10 @@ where Code: NativePType + TryFrom, usize: TryFrom, { + if use_dense_value_rank_map(codes.len(), valid.true_count(), values_len) { + return compact_codes_and_value_selection_dense(codes, validity, valid, values_len); + } + let referenced = referenced_values(codes, valid, values_len)?; if referenced.is_empty() || referenced.len() == values_len { return Ok(None); @@ -564,6 +572,73 @@ where Ok(Some((compact_codes, value_selection))) } +fn 
use_dense_value_rank_map(codes_len: usize, valid_count: usize, values_len: usize) -> bool { + values_len <= DENSE_REMAP_MAX_VALUES + && values_len <= valid_count.saturating_mul(DENSE_REMAP_VALUES_PER_CODE) + && values_len <= codes_len.saturating_mul(DENSE_REMAP_VALUES_PER_CODE) +} + +fn compact_codes_and_value_selection_dense( + codes: &[Code], + validity: Validity, + valid: &Mask, + values_len: usize, +) -> VortexResult> +where + Code: NativePType + TryFrom, + usize: TryFrom, +{ + let mut rank_by_value = vec![UNREFERENCED_VALUE; values_len]; + mark_referenced_values(codes, valid, values_len, &mut rank_by_value)?; + + let mut referenced = Vec::with_capacity(valid.true_count().min(values_len)); + let mut rank = 0; + for (value_idx, value_rank) in rank_by_value.iter_mut().enumerate() { + if *value_rank != UNREFERENCED_VALUE { + *value_rank = rank; + referenced.push(value_idx); + rank += 1; + } + } + + if referenced.is_empty() || referenced.len() == values_len { + return Ok(None); + } + + let compact = remap_codes_dense(codes, valid, values_len, &rank_by_value)?; + let value_selection = Mask::from_indices(values_len, referenced); + let compact_codes = PrimitiveArray::new(compact.freeze(), validity).into_array(); + Ok(Some((compact_codes, value_selection))) +} + +fn mark_referenced_values( + codes: &[Code], + valid: &Mask, + values_len: usize, + rank_by_value: &mut [usize], +) -> VortexResult<()> +where + Code: Copy + fmt::Display, + usize: TryFrom, +{ + match valid.bit_buffer() { + AllOr::All => { + for &code in codes { + let idx = checked_code_index(code, values_len)?; + rank_by_value[idx] = 0; + } + } + AllOr::None => {} + AllOr::Some(mask) => { + for idx in mask.set_indices() { + let value_idx = checked_code_index(codes[idx], values_len)?; + rank_by_value[value_idx] = 0; + } + } + } + Ok(()) +} + fn referenced_values( codes: &[Code], valid: &Mask, @@ -626,6 +701,40 @@ where Ok(compact) } +fn remap_codes_dense( + codes: &[Code], + valid: &Mask, + values_len: usize, 
+ rank_by_value: &[usize], +) -> VortexResult> +where + Code: Copy + Default + fmt::Display + TryFrom, + usize: TryFrom, +{ + let mut compact = BufferMut::::with_capacity(codes.len()); + match valid.bit_buffer() { + AllOr::All => { + for &code in codes { + compact.push(compact_code_dense(code, values_len, rank_by_value)?); + } + } + AllOr::None => compact.extend(std::iter::repeat_n(Code::default(), codes.len())), + AllOr::Some(mask) => { + let mut valid_indices = mask.set_indices(); + let mut next_valid = valid_indices.next(); + for (idx, &code) in codes.iter().enumerate() { + if next_valid == Some(idx) { + compact.push(compact_code_dense(code, values_len, rank_by_value)?); + next_valid = valid_indices.next(); + } else { + compact.push(Code::default()); + } + } + } + } + Ok(compact) +} + fn checked_code_index(code: Code, values_len: usize) -> VortexResult where Code: Copy + fmt::Display, @@ -642,6 +751,27 @@ where Ok(idx) } +fn compact_code_dense( + code: Code, + values_len: usize, + rank_by_value: &[usize], +) -> VortexResult +where + Code: Copy + fmt::Display + TryFrom, + usize: TryFrom, +{ + let idx = checked_code_index(code, values_len)?; + let rank = rank_by_value[idx]; + if rank == UNREFERENCED_VALUE { + vortex_bail!("dictionary code {idx} missing from sparse referenced value map"); + } + Code::try_from(rank).map_err(|_| { + vortex_err!( + "sparse dictionary code rank {rank} cannot be represented by original code type" + ) + }) +} + fn compact_code(code: Code, values_len: usize, referenced: &[usize]) -> VortexResult where Code: Copy + fmt::Display + TryFrom, @@ -740,3 +870,59 @@ impl PreparedRead for DictExprPreparedRead { self.node.fmt_chain(f) } } + +#[cfg(test)] +mod tests { + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + use vortex_error::VortexResult; + use vortex_mask::Mask; + + use 
super::compact_codes_and_value_selection_typed; + + #[test] + fn dense_compaction_preserves_sparse_value_order_and_validity() -> VortexResult<()> { + let validity = Validity::from_iter([true, false, true, true, true, true]); + let valid = validity.execute_mask(6, &mut LEGACY_SESSION.create_execution_ctx())?; + let (compact_codes, value_selection) = compact_codes_and_value_selection_typed::( + &[7, 9, 3, 7, 1, 3], + validity, + &valid, + 8, + )? + .expect("sparse dict compaction should be available"); + + assert_eq!(value_selection, Mask::from_indices(8, [1, 3, 7])); + let compact_codes = + compact_codes.execute::(&mut LEGACY_SESSION.create_execution_ctx())?; + assert_eq!(compact_codes.as_slice::(), &[2, 0, 1, 2, 0, 1]); + assert_eq!( + compact_codes + .validity()? + .execute_mask(6, &mut LEGACY_SESSION.create_execution_ctx())?, + Mask::from_indices(6, [0, 2, 3, 4, 5]) + ); + + Ok(()) + } + + #[test] + fn dense_compaction_returns_none_when_all_values_referenced() -> VortexResult<()> { + let validity = Validity::NonNullable; + let valid = validity.execute_mask(4, &mut LEGACY_SESSION.create_execution_ctx())?; + assert!( + compact_codes_and_value_selection_typed::( + buffer![2u8, 0, 1, 3].as_slice(), + validity, + &valid, + 4, + )? + .is_none() + ); + + Ok(()) + } +} diff --git a/vortex-scan/src/task.rs b/vortex-scan/src/task.rs index f68a1185e36..1146c42cd9d 100644 --- a/vortex-scan/src/task.rs +++ b/vortex-scan/src/task.rs @@ -382,6 +382,20 @@ impl ScanTaskQueue { &mut self, in_flight_empty: bool, mut is_live_morsel: impl FnMut(usize) -> bool, + ) -> Option> { + self.pop_next_admissible_with_projection_gate(in_flight_empty, true, &mut is_live_morsel) + } + + /// Pop the next task admitted by the frontier policy and read byte budget, optionally + /// suppressing projection/aggregate work. 
+ /// + /// This is useful when a caller wants predicate/evidence run-ahead but must avoid producing + /// more output batches until downstream has consumed earlier projection results. + pub fn pop_next_admissible_with_projection_gate( + &mut self, + in_flight_empty: bool, + projection_admissible: bool, + mut is_live_morsel: impl FnMut(usize) -> bool, ) -> Option> { self.drop_dead_heads(&mut is_live_morsel); let frontier = self.frontier_morsel()?; @@ -394,6 +408,9 @@ impl ScanTaskQueue { (ScanTaskGroup::Projection, false), (ScanTaskGroup::Evidence, false), ] { + if group == ScanTaskGroup::Projection && !projection_admissible { + continue; + } if let Some(task) = self.pop_group_admissible(group, enforce_target, in_flight_empty, frontier) { @@ -858,6 +875,37 @@ mod tests { assert_eq!(reads, vec![read(1, 10)]); } + #[test] + fn queue_projection_gate_still_allows_predicates() { + let mut queue = ScanTaskQueue::new(100); + queue.push(task_in_lane( + 0, + ScanIoPhase::ProjectionRead, + ScanTaskLane::Projection, + vec![read(1, 10)], + )); + queue.push(task_in_lane( + 0, + ScanIoPhase::PredicateRead, + ScanTaskLane::Predicate { predicate_idx: 0 }, + vec![read(2, 10)], + )); + + let next = queue + .pop_next_admissible_with_projection_gate(true, false, |_| true) + .expect("predicate task should still be admitted while projection is gated"); + let (_task, lane, reads) = next.into_parts(); + assert_eq!(lane, ScanTaskLane::Predicate { predicate_idx: 0 }); + assert_eq!(reads, vec![read(2, 10)]); + + assert!( + queue + .pop_next_admissible_with_projection_gate(false, false, |_| true) + .is_none(), + "projection task should remain queued while projection is gated" + ); + } + #[test] fn queue_keeps_scan_evidence_for_dead_anchor_morsel() { let mut queue = ScanTaskQueue::new(100); From 9e2a53ebcfd0afb4f71c6b6d41328688d23a712c Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 22 Jun 2026 21:18:41 -0400 Subject: [PATCH 29/48] Fix sparse byte-view Arrow export Signed-off-by: 
Nicholas Gates --- benchmarks/datafusion-bench/src/lib.rs | 12 +- .../internals/scan-scheduler.md | 5 +- vortex-array/src/arrow/executor/byte.rs | 25 +++ vortex-array/src/arrow/executor/byte_view.rs | 19 +-- vortex-datafusion/src/persistent/opener.rs | 2 +- vortex-datafusion/src/v2/source.rs | 18 +- vortex-file/src/multi/mod.rs | 2 +- vortex-file/src/multi/scan_v2.rs | 156 ++++++++++++++---- vortex-layout/src/layout_v2.rs | 129 ++++++++++++++- vortex-layout/src/scan/v2/mod.rs | 6 +- vortex-scan/src/lib.rs | 2 - vortex-scan/src/scheduler.rs | 101 ++++++++++++ 12 files changed, 406 insertions(+), 71 deletions(-) diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index ce588f31ddb..9d6720aa04d 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -156,17 +156,19 @@ fn scan_scheduler_config_from_env() -> anyhow::Result { .transpose()? .unwrap_or_else(ScanSchedulerConfig::default_morsel_slots); - Ok(std::env::var("VORTEX_SCAN_MAX_READ_BYTES") + let read_byte_budget = std::env::var("VORTEX_SCAN_MAX_READ_BYTES") .ok() .map(|value| { value .parse::() .map_err(|e| anyhow::anyhow!("invalid scan scheduler byte budget {value}: {e}")) }) - .transpose()? 
- .map_or(config.clone(), |bytes| { - config.with_read_byte_budget(Some(bytes)) - })) + .transpose()?; + + Ok(match read_byte_budget { + Some(bytes) => config.with_read_byte_budget(Some(bytes)), + None => config, + }) } fn vortex_table_options() -> VortexTableOptions { diff --git a/docs/developer-guide/internals/scan-scheduler.md b/docs/developer-guide/internals/scan-scheduler.md index 9f9ed111ec7..591ef684685 100644 --- a/docs/developer-guide/internals/scan-scheduler.md +++ b/docs/developer-guide/internals/scan-scheduler.md @@ -179,8 +179,9 @@ impl VortexDataSourceBuilder { } ``` -The same options should be available on `VortexTable` and `VortexFormatFactory` so users who -register tables through DataFusion's listing format path can still control scheduling. +`VortexTable` should expose the same options when it builds a `VortexDataSource`. Listing-format +users that go through `VortexFormatFactory` should configure scheduling on the `VortexSession` +used by the factory; the factory does not currently carry a separate scheduler override. For DataFusion, `DataSource::open` creates a single Vortex scan for partition zero. 
A per-query scheduler can therefore be resolved immediately before calling diff --git a/vortex-array/src/arrow/executor/byte.rs b/vortex-array/src/arrow/executor/byte.rs index bace6665bca..b42af375c03 100644 --- a/vortex-array/src/arrow/executor/byte.rs +++ b/vortex-array/src/arrow/executor/byte.rs @@ -80,6 +80,8 @@ mod tests { use arrow_array::cast::AsArray; use arrow_schema::DataType; use rstest::rstest; + use vortex_error::VortexResult; + use vortex_mask::Mask; use crate::IntoArray; use crate::LEGACY_SESSION; @@ -179,4 +181,27 @@ mod tests { assert!(arrow.is_null(1)); assert!(!arrow.is_null(2)); } + + #[test] + fn filtered_utf8_view_export_does_not_retain_unselected_buffers() -> VortexResult<()> { + let unselected = "x".repeat(1 << 20); + let array = + VarBinViewArray::from_iter_str(["selected", unselected.as_str(), unselected.as_str()]); + let filtered = array + .into_array() + .filter(Mask::from_iter([true, false, false]))?; + + let arrow = filtered.execute_arrow( + Some(&DataType::Utf8View), + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + + assert_eq!(arrow.as_string_view().value(0), "selected"); + assert!( + arrow.get_array_memory_size() < unselected.len(), + "filtered export retained unselected payload: {} bytes", + arrow.get_array_memory_size() + ); + Ok(()) + } } diff --git a/vortex-array/src/arrow/executor/byte_view.rs b/vortex-array/src/arrow/executor/byte_view.rs index b88b1895d53..89f37681220 100644 --- a/vortex-array/src/arrow/executor/byte_view.rs +++ b/vortex-array/src/arrow/executor/byte_view.rs @@ -12,7 +12,6 @@ use vortex_error::VortexResult; use crate::ArrayRef; use crate::ExecutionCtx; use crate::arrays::VarBinViewArray; -use crate::arrow::executor::validity::to_arrow_null_buffer; use crate::arrow::null_buffer::to_null_buffer; use crate::builtins::ArrayBuiltins; use crate::dtype::DType; @@ -48,19 +47,8 @@ pub fn execute_varbinview_to_arrow( array: &VarBinViewArray, ctx: &mut ExecutionCtx, ) -> VortexResult { - let views = - 
ScalarBuffer::::from(array.views_handle().as_host().clone().into_arrow_buffer()); - let buffers: Vec<_> = array - .data_buffers() - .iter() - .map(|buffer| buffer.as_host().clone().into_arrow_buffer()) - .collect(); - let nulls = to_arrow_null_buffer(array.validity()?, array.len(), ctx)?; - - // SAFETY: our own VarBinView array is considered safe. - Ok(Arc::new(unsafe { - GenericByteViewArray::::new_unchecked(views, buffers, nulls) - })) + let compacted = array.compact_buffers()?; + canonical_varbinview_to_arrow::(&compacted, ctx) } pub(super) fn to_arrow_byte_view( @@ -73,6 +61,7 @@ pub(super) fn to_arrow_byte_view( // flexible since there's no prescribed nullability in Arrow types. let array = array.cast(DType::from_arrow((&T::DATA_TYPE, Nullability::Nullable)))?; + let array = array.execute::(ctx)?; let varbinview = array.execute::(ctx)?; - canonical_varbinview_to_arrow::(&varbinview, ctx) + execute_varbinview_to_arrow::(&varbinview, ctx) } diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index b190d1cbb42..0c68f20a848 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -829,7 +829,7 @@ mod tests { .filter_map(|byte_range| byte_range_to_row_range(byte_range, total_size, row_count)) .collect::>(); - assert_eq!(row_ranges.len(), partitions as usize); + assert_eq!(u64::try_from(row_ranges.len()), Ok(partitions)); assert_eq!(row_ranges.first().map(|range| range.start), Some(0)); assert_eq!(row_ranges.last().map(|range| range.end), Some(row_count)); assert_eq!( diff --git a/vortex-datafusion/src/v2/source.rs b/vortex-datafusion/src/v2/source.rs index bcb609fb58f..de624d54376 100644 --- a/vortex-datafusion/src/v2/source.rs +++ b/vortex-datafusion/src/v2/source.rs @@ -450,23 +450,29 @@ async fn scan_to_array_stream( scan_request: ScanRequest, num_partitions: usize, ) -> DFResult>> { + let ordered = scan_request.ordered; let scan = data_source .scan(scan_request) 
.await .map_err(|e| DataFusionError::External(Box::new(e)))?; // Each split.execute() returns a lazy stream whose early polls do preparation - // work (expression resolution, layout traversal, first I/O spawns). We use - // try_flatten_unordered to poll multiple split streams concurrently so that - // the next split is already warm when the current one finishes. + // work (expression resolution, layout traversal, first I/O spawns). Unordered + // scans can poll multiple split streams concurrently so the next split is + // already warm when the current one finishes; ordered scans must preserve + // partition order. let scan_streams = scan.partitions().map(|split_result| { let split = split_result?; split.execute() }); - Ok(scan_streams - .try_flatten_unordered(Some(num_partitions * 2)) - .boxed()) + if ordered { + Ok(scan_streams.try_flatten().boxed()) + } else { + Ok(scan_streams + .try_flatten_unordered(Some(num_partitions * 2)) + .boxed()) + } } impl DataSource for VortexDataSource { diff --git a/vortex-file/src/multi/mod.rs b/vortex-file/src/multi/mod.rs index 6d734174e4c..744bcbfc738 100644 --- a/vortex-file/src/multi/mod.rs +++ b/vortex-file/src/multi/mod.rs @@ -230,7 +230,7 @@ impl MultiFileDataSource { /// Build the [`DataSource`] selected by `VORTEX_SCAN_IMPL`. /// /// The default is the existing LayoutReader-backed scan. Setting - /// `VORTEX_SCAN_IMPL=v2` (or `scan2`/`scan3`/`native`) builds the ScanPlan-backed V2 scan. + /// `VORTEX_SCAN_IMPL=v2` (or `scan2`) builds the ScanPlan-backed V2 scan. pub async fn build_data_source(self) -> VortexResult { if scan2_enabled()? 
{ Ok(Arc::new(scan_v2::build_scan_plan_data_source(self).await?)) diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 3e84b847154..f4e1fb89222 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -676,7 +676,7 @@ impl DataSource for ScanPlanDataSource { .clone() .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; let prepared = Arc::new(PreparedScanPlan::try_new(&file, &request)?); - let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket)?); + let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket, None)?); let ranges = execution.splits(&row_range)?; if ranges.is_empty() { continue; @@ -730,14 +730,17 @@ impl DataSource for ScanPlanDataSource { let mut ready = VecDeque::new(); let mut deferred = VecDeque::new(); - for child in &self.children { + for (index, child) in self.children.iter().enumerate() { match child { - ScanPlanChild::Opened(file) => ready.push_back(file.clone()), - ScanPlanChild::Deferred(factory) => deferred.push_back(Arc::clone(factory)), + ScanPlanChild::Opened(file) => ready.push_back((index, file.clone())), + ScanPlanChild::Deferred(factory) => { + deferred.push_back((index, Arc::clone(factory))); + } } } let dtype = scan_request.projection.return_dtype(&self.dtype)?; + let limit_remaining = scan_request.limit.map(AtomicU64::new).map(Arc::new); Ok(Box::new(ScanPlanDataSourceScan { dtype, @@ -748,6 +751,7 @@ impl DataSource for ScanPlanDataSource { concurrency: self.concurrency, scheduler, ticket, + limit_remaining, })) } @@ -800,12 +804,13 @@ impl DataSource for ScanPlanDataSource { struct ScanPlanDataSourceScan { dtype: DType, request: DataSourceScanRequest, - ready: VecDeque, - deferred: VecDeque>, + ready: VecDeque<(usize, VortexFile)>, + deferred: VecDeque<(usize, Arc)>, handle: Handle, concurrency: usize, scheduler: Arc, ticket: ScanTicket, + limit_remaining: Option>, } impl DataSourceScan for ScanPlanDataSourceScan { @@ 
-832,16 +837,18 @@ impl DataSourceScan for ScanPlanDataSourceScan { concurrency, scheduler, ticket, + limit_remaining, } = *self; let ordered = request.ordered; let ready_stream = stream::iter(ready).map(Ok); - let spawned = stream::iter(deferred).map(move |factory| { + let spawned = stream::iter(deferred).map(move |(index, factory)| { handle.spawn(async move { factory .open() .instrument(tracing::info_span!("VortexFileFactory::open")) .await + .map(|file| file.map(|file| (index, file))) }) }); @@ -871,15 +878,16 @@ impl DataSourceScan for ScanPlanDataSourceScan { ready_stream .chain(deferred_stream) - .enumerate() - .filter_map(move |(index, file_result)| { + .filter_map(move |file_result| { let request = request.clone(); let scheduler = Arc::clone(&scheduler); let ticket = ticket.clone(); + let limit_remaining = limit_remaining.clone(); async move { match file_result { - Ok(file) => { - file_partition(index, file, request, scheduler, ticket).transpose() + Ok((index, file)) => { + file_partition(index, file, request, scheduler, ticket, limit_remaining) + .transpose() } Err(error) => Some(Err(error)), } @@ -895,6 +903,7 @@ fn file_partition( request: DataSourceScanRequest, scheduler: Arc, ticket: ScanTicket, + limit_remaining: Option>, ) -> VortexResult> { let Some(request) = file_scan_request(partition_idx, &file, request)? else { return Ok(None); @@ -912,6 +921,7 @@ fn file_partition( index: partition_idx, scheduler, ticket, + limit_remaining, }))) } @@ -930,7 +940,9 @@ pub(crate) fn scan_plan_file_stream( let scheduler = provider.scheduler_for_scan(&meta); let ticket = scheduler.register_scan(meta); - let Some(partition) = file_partition(0, file, request, scheduler, ticket)? else { + let limit_remaining = request.limit.map(AtomicU64::new).map(Arc::new); + let Some(partition) = file_partition(0, file, request, scheduler, ticket, limit_remaining)? 
+ else { return Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( dtype, stream::empty(), @@ -1888,6 +1900,7 @@ struct ScanPlanPartition { index: usize, scheduler: Arc, ticket: ScanTicket, + limit_remaining: Option>, } impl Partition for ScanPlanPartition { @@ -1926,9 +1939,15 @@ impl Partition for ScanPlanPartition { index: _, scheduler, ticket, + limit_remaining, } = *self; - let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket)?); + let execution = Arc::new(ScanExecution::try_new( + file, + prepared, + &ticket, + limit_remaining, + )?); let handle = execution.session.handle(); let dtype = execution.plan.dtype().clone(); let ranges = execution.splits(&row_range)?; @@ -2081,7 +2100,7 @@ struct ScanExecution { session: VortexSession, reader: ReadContext, plan: Arc, - limit_remaining: Option, + limit_remaining: Option>, segment_source: Arc, segment_future_cache: Arc, projection: PreparedReadRef, @@ -2217,6 +2236,7 @@ impl ScanExecution { file: VortexFile, plan: Arc, _ticket: &ScanTicket, + limit_remaining: Option>, ) -> VortexResult { let session = file.session().clone(); let segment_source = file.segment_source(); @@ -2272,8 +2292,6 @@ impl ScanExecution { .collect(), }; - let limit_remaining = plan.limit().map(AtomicU64::new); - Ok(Self { session, reader, @@ -3131,31 +3149,57 @@ fn range_len(range: &Range) -> VortexResult { } fn limit_mask(mask: Mask, remaining: &AtomicU64) -> VortexResult { - let available = remaining.load(Ordering::Relaxed); - if available == 0 { - return Ok(Mask::new_false(mask.len())); - } let true_count = mask.true_count(); - if true_count as u64 <= available { - remaining.fetch_sub(true_count as u64, Ordering::Relaxed); - return Ok(mask); - } - let take = usize::try_from(available).unwrap_or(usize::MAX); - remaining.store(0, Ordering::Relaxed); - Ok(Mask::from_indices( - mask.len(), - (0..mask.len()).filter(|idx| mask.value(*idx)).take(take), - )) + let true_count = + u64::try_from(true_count).map_err(|_| vortex_err!("mask 
count exceeds u64"))?; + + loop { + let available = remaining.load(Ordering::Acquire); + if available == 0 { + return Ok(Mask::new_false(mask.len())); + } + + let take = true_count.min(available); + if remaining + .compare_exchange_weak( + available, + available - take, + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_err() + { + continue; + } + + if take == true_count { + return Ok(mask); + } + + let take = usize::try_from(take).unwrap_or(usize::MAX); + return Ok(Mask::from_indices( + mask.len(), + (0..mask.len()).filter(|idx| mask.value(*idx)).take(take), + )); + } } #[cfg(test)] mod tests { + use std::sync::Arc; + use std::sync::atomic::AtomicU64; + use std::sync::atomic::Ordering; + use vortex_array::expr::get_item; use vortex_array::expr::like; use vortex_array::expr::lit; use vortex_array::expr::not_eq; use vortex_array::expr::root; + use vortex_error::VortexResult; + use vortex_error::vortex_err; + use vortex_mask::Mask; + use super::limit_mask; use super::predicate_cost; #[test] @@ -3169,4 +3213,56 @@ mod tests { predicate_cost(&expensive), ); } + + #[test] + fn limit_mask_consumes_full_mask_when_limit_allows() -> VortexResult<()> { + let remaining = AtomicU64::new(4); + + let selected = limit_mask(Mask::from_indices(6, [1, 2, 4]), &remaining)?; + + assert_eq!(selected.true_count(), 3); + assert!(selected.value(1)); + assert!(selected.value(2)); + assert!(selected.value(4)); + assert_eq!(remaining.load(Ordering::Acquire), 1); + Ok(()) + } + + #[test] + fn limit_mask_trims_mask_to_remaining_rows() -> VortexResult<()> { + let remaining = AtomicU64::new(2); + + let selected = limit_mask(Mask::from_indices(6, [1, 2, 4]), &remaining)?; + + assert_eq!(selected.true_count(), 2); + assert!(selected.value(1)); + assert!(selected.value(2)); + assert!(!selected.value(4)); + assert_eq!(remaining.load(Ordering::Acquire), 0); + Ok(()) + } + + #[test] + fn limit_mask_shared_counter_never_overselects() -> VortexResult<()> { + let remaining = Arc::new(AtomicU64::new(10)); 
+ + let handles = (0..16) + .map(|_| { + let remaining = Arc::clone(&remaining); + std::thread::spawn(move || limit_mask(Mask::new_true(8), &remaining)) + }) + .collect::>(); + + let mut selected_rows = 0; + for handle in handles { + let selected = handle + .join() + .map_err(|_| vortex_err!("limit mask worker thread panicked"))??; + selected_rows += selected.true_count(); + } + + assert_eq!(selected_rows, 10); + assert_eq!(remaining.load(Ordering::Acquire), 0); + Ok(()) + } } diff --git a/vortex-layout/src/layout_v2.rs b/vortex-layout/src/layout_v2.rs index 645992de52c..90147a7848e 100644 --- a/vortex-layout/src/layout_v2.rs +++ b/vortex-layout/src/layout_v2.rs @@ -678,7 +678,9 @@ fn metadata_bytes_field(metadata: &[u8], field_number: u64) -> VortexResult metadata.len() { vortex_bail!("metadata field extends past end of buffer"); } @@ -695,14 +697,20 @@ fn skip_proto_field(metadata: &[u8], offset: &mut usize, wire_type: u64) -> Vort read_varint(metadata, offset)?; } 1 => { - *offset += 8; + *offset = offset + .checked_add(8) + .ok_or_else(|| vortex_err!("metadata field offset overflow"))?; } 2 => { let len = usize::try_from(read_varint(metadata, offset)?)?; - *offset += len; + *offset = offset + .checked_add(len) + .ok_or_else(|| vortex_err!("metadata field offset overflow"))?; } 5 => { - *offset += 4; + *offset = offset + .checked_add(4) + .ok_or_else(|| vortex_err!("metadata field offset overflow"))?; } _ => vortex_bail!("unsupported protobuf wire type {wire_type}"), } @@ -818,10 +826,12 @@ impl VTable for Chunked { fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { EmptyMetadata::deserialize(args.metadata)?; - let mut chunk_offsets = Vec::with_capacity(args.children.nchildren() + 1); + let mut chunk_offsets: Vec = Vec::with_capacity(args.children.nchildren() + 1); chunk_offsets.push(0); for idx in 0..args.children.nchildren() { - let next = chunk_offsets[idx] + args.children.child_row_count(idx)?; + let next = chunk_offsets[idx] + 
.checked_add(args.children.child_row_count(idx)?) + .ok_or_else(|| vortex_err!("Chunked child row counts overflow"))?; chunk_offsets.push(next); } vortex_ensure!( @@ -836,6 +846,9 @@ impl VTable for Chunked { } fn child_type(layout: Layout, idx: usize) -> VortexResult { + if idx >= layout.nchildren() { + vortex_bail!("Chunked child index out of bounds: {idx}"); + } let offset = *layout .data() .chunk_offsets @@ -976,6 +989,110 @@ impl VTable for Dict { } } +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; + use vortex_session::VortexSession; + use vortex_session::registry::ReadContext; + + use super::*; + + #[derive(Debug)] + struct TestChildren { + row_counts: Vec, + } + + impl LayoutChildren for TestChildren { + fn child(&self, idx: usize, _dtype: &DType) -> VortexResult { + vortex_bail!("test child {idx} is not materialized") + } + + fn child_row_count(&self, idx: usize) -> VortexResult { + self.row_counts + .get(idx) + .copied() + .ok_or_else(|| vortex_err!("test child index out of bounds: {idx}")) + } + + fn nchildren(&self) -> usize { + self.row_counts.len() + } + } + + fn primitive_dtype() -> DType { + DType::Primitive(PType::I32, Nullability::NonNullable) + } + + fn read_context() -> ReadContext { + ReadContext::new([]) + } + + #[test] + fn metadata_bytes_field_rejects_length_overflow() { + let mut metadata = vec![0x0a]; + metadata.extend_from_slice(&u64::MAX.to_le_bytes()); + // Replace the fixed-width bytes with a protobuf varint for u64::MAX. 
+ metadata.truncate(1); + metadata.extend([0xff; 9]); + metadata.push(0x01); + + assert!(metadata_bytes_field(&metadata, 1).is_err()); + } + + #[test] + fn skip_proto_field_rejects_length_overflow() { + let mut metadata = vec![0x12]; + metadata.extend([0xff; 9]); + metadata.push(0x01); + + assert!(metadata_varint_field(&metadata, 1).is_err()); + } + + #[test] + fn chunked_deserialize_rejects_row_count_overflow() { + let dtype = primitive_dtype(); + let read_context = read_context(); + let session = VortexSession::empty(); + let args = LayoutDeserializeArgs { + dtype: &dtype, + row_count: 0, + metadata: &[], + segment_ids: Vec::new(), + children: Arc::new(TestChildren { + row_counts: vec![u64::MAX, 1], + }), + array_ctx: &read_context, + session: &session, + }; + + assert!(VTable::deserialize(&Chunked, &args).is_err()); + } + + #[test] + fn chunked_child_type_rejects_terminal_offset_index() { + let dtype = primitive_dtype(); + let layout = LayoutParts::new( + Chunked, + dtype, + 1, + Vec::new(), + Arc::new(TestChildren { + row_counts: vec![1], + }), + ChunkedData { + chunk_offsets: vec![0, 1], + }, + ) + .into_typed(); + + assert!(layout.child_type(1).is_err()); + } +} + /// V2 zoned layout vtable. #[derive(Clone, Debug)] pub struct Zoned; diff --git a/vortex-layout/src/scan/v2/mod.rs b/vortex-layout/src/scan/v2/mod.rs index bb182ed3de2..0cbb894be2c 100644 --- a/vortex-layout/src/scan/v2/mod.rs +++ b/vortex-layout/src/scan/v2/mod.rs @@ -30,7 +30,7 @@ pub use vortex_scan::plan::request; /// /// - `v1`, `scan`, `scan_builder`, `scan-builder`, `layout-reader`, or unset: use the /// existing LayoutReader-based scan. -/// - `v2`, `scan2`, `scan3`, or `native`: use the scan2 +/// - `v2` or `scan2`: use the scan2 /// [`ScanPlan`](vortex_scan::plan::ScanPlan) implementation. 
pub const SCAN_IMPL_ENV: &str = "VORTEX_SCAN_IMPL"; @@ -49,9 +49,9 @@ pub fn scan2_enabled() -> VortexResult { fn parse_scan_impl(value: &str) -> VortexResult { match value { "v1" | "scan" | "scan_builder" | "scan-builder" | "layout-reader" => Ok(false), - "v2" | "scan2" | "scan3" | "native" => Ok(true), + "v2" | "scan2" => Ok(true), other => vortex_bail!( - "{SCAN_IMPL_ENV} must be one of v1, scan, scan_builder, scan-builder, layout-reader, v2, scan2, scan3, or native, got {other:?}" + "{SCAN_IMPL_ENV} must be one of v1, scan, scan_builder, scan-builder, layout-reader, v2, or scan2, got {other:?}" ), } } diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index ffdae740dcb..5c517977895 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -45,9 +45,7 @@ pub use scheduler::ScanTicket; pub use scheduler::ScanWorkClass; pub use scheduler::WorkPermit; pub use scheduler::WorkRequest; -pub use segments::*; use selection::Selection; -pub use task::*; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; use vortex_array::dtype::FieldPath; diff --git a/vortex-scan/src/scheduler.rs b/vortex-scan/src/scheduler.rs index 04fbe3f68ee..1b50250833d 100644 --- a/vortex-scan/src/scheduler.rs +++ b/vortex-scan/src/scheduler.rs @@ -360,6 +360,107 @@ impl fmt::Debug for WorkPermit { } } +#[cfg(test)] +mod tests { + use std::pin::pin; + + use futures::FutureExt; + use futures::executor::block_on; + + use super::*; + + #[test] + fn permit_release_unblocks_waiting_work() -> VortexResult<()> { + block_on(async { + let scheduler = ScanScheduler::new(ScanSchedulerConfig::morsel_slots(1)); + let ticket = scheduler.register_scan(ScanMeta::default()); + let permit = scheduler.acquire(&ticket, WorkRequest::morsel()).await?; + + let waiting = scheduler.acquire(&ticket, WorkRequest::morsel()); + let mut waiting = pin!(waiting); + assert!(waiting.as_mut().now_or_never().is_none()); + + drop(permit); + let _permit = waiting.await?; + Ok(()) + }) + } 
+ + #[test] + fn cancelled_ticket_rejects_new_work() -> VortexResult<()> { + block_on(async { + let scheduler = ScanScheduler::new(ScanSchedulerConfig::morsel_slots(1)); + let ticket = scheduler.register_scan(ScanMeta::default()); + ticket.cancel(); + + assert!( + scheduler + .acquire(&ticket, WorkRequest::morsel()) + .await + .is_err() + ); + Ok(()) + }) + } + + #[test] + fn cancelled_waiter_releases_acquired_permit() -> VortexResult<()> { + block_on(async { + let scheduler = ScanScheduler::new(ScanSchedulerConfig::morsel_slots(1)); + let ticket = scheduler.register_scan(ScanMeta::default()); + let permit = scheduler.acquire(&ticket, WorkRequest::morsel()).await?; + + let waiting = scheduler.acquire(&ticket, WorkRequest::morsel()); + let mut waiting = pin!(waiting); + assert!(waiting.as_mut().now_or_never().is_none()); + + ticket.cancel(); + drop(permit); + assert!(waiting.await.is_err()); + + let next_ticket = scheduler.register_scan(ScanMeta::default()); + let _permit = scheduler + .acquire(&next_ticket, WorkRequest::morsel()) + .await?; + Ok(()) + }) + } + + #[test] + fn invalid_slot_counts_are_rejected() -> VortexResult<()> { + block_on(async { + let scheduler = ScanScheduler::new(ScanSchedulerConfig::morsel_slots(1)); + let ticket = scheduler.register_scan(ScanMeta::default()); + + assert!( + scheduler + .acquire( + &ticket, + WorkRequest { + class: ScanWorkClass::Morsel, + slots: 0, + }, + ) + .await + .is_err() + ); + assert!( + scheduler + .acquire( + &ticket, + WorkRequest { + class: ScanWorkClass::Morsel, + slots: 2, + }, + ) + .await + .is_err() + ); + Ok(()) + }) + } +} + /// Session state for scan scheduler configuration. 
#[derive(Clone, Debug)] pub struct ScanSchedulerSession { From f740e714e6152fbace467ee6f9bf409fa91e18e6 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 22 Jun 2026 21:54:50 -0400 Subject: [PATCH 30/48] Fix assertion contexts after develop merge Signed-off-by: Nicholas Gates --- .../experimental/onpair/src/compute/like.rs | 24 ++++++++++++++----- encodings/fsst/src/kernel.rs | 12 ++++++++-- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/encodings/experimental/onpair/src/compute/like.rs b/encodings/experimental/onpair/src/compute/like.rs index 6e0e23134be..84925e18d69 100644 --- a/encodings/experimental/onpair/src/compute/like.rs +++ b/encodings/experimental/onpair/src/compute/like.rs @@ -377,6 +377,7 @@ mod tests { #[test] fn like_contains() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); let result = run_like( &[ Some("https://google.example"), @@ -389,13 +390,15 @@ mod tests { )?; assert_arrays_eq!( &result, - &BoolArray::from_iter([Some(false), Some(false), Some(true), None]) + &BoolArray::from_iter([Some(false), Some(false), Some(true), None]), + &mut ctx ); Ok(()) } #[test] fn like_prefix_suffix_exact_and_negated() -> VortexResult<()> { + let mut ctx = SESSION.create_execution_ctx(); let values = [ Some("2020-10-01"), Some("2020-11-01"), @@ -404,15 +407,18 @@ mod tests { ]; assert_arrays_eq!( &run_like(&values, "2020-10-%", LikeOptions::default())?, - &BoolArray::from_iter([Some(true), Some(false), Some(false), Some(false)]) + &BoolArray::from_iter([Some(true), Some(false), Some(false), Some(false)]), + &mut ctx ); assert_arrays_eq!( &run_like(&values, "%-01", LikeOptions::default())?, - &BoolArray::from_iter([Some(true), Some(true), Some(true), Some(false)]) + &BoolArray::from_iter([Some(true), Some(true), Some(true), Some(false)]), + &mut ctx ); assert_arrays_eq!( &run_like(&values, "2020-10-01", LikeOptions::default())?, - &BoolArray::from_iter([Some(true), Some(false), Some(false), Some(false)]) + 
&BoolArray::from_iter([Some(true), Some(false), Some(false), Some(false)]), + &mut ctx ); assert_arrays_eq!( &run_like( @@ -423,7 +429,8 @@ mod tests { case_insensitive: false, }, )?, - &BoolArray::from_iter([Some(false), Some(false), Some(false), Some(true)]) + &BoolArray::from_iter([Some(false), Some(false), Some(false), Some(true)]), + &mut ctx ); Ok(()) } @@ -455,7 +462,12 @@ mod tests { let result = stepped .execute::(&mut SESSION.create_execution_ctx())? .into_bool(); - assert_arrays_eq!(&result, &BoolArray::from_iter([Some(true), Some(true)])); + let mut ctx = SESSION.create_execution_ctx(); + assert_arrays_eq!( + &result, + &BoolArray::from_iter([Some(true), Some(true)]), + &mut ctx + ); Ok(()) } diff --git a/encodings/fsst/src/kernel.rs b/encodings/fsst/src/kernel.rs index fab998adafc..95d25080af7 100644 --- a/encodings/fsst/src/kernel.rs +++ b/encodings/fsst/src/kernel.rs @@ -253,7 +253,11 @@ mod tests { let shared = SharedArray::new(fsst).into_array(); let lengths = shared.clone().apply(&byte_length(root()))?; - assert_arrays_eq!(lengths, PrimitiveArray::from_iter(vec![5u64, 0, 7])); + assert_arrays_eq!( + lengths, + PrimitiveArray::from_iter(vec![5u64, 0, 7]), + &mut ctx + ); let not_empty = shared .binary( @@ -261,7 +265,11 @@ mod tests { Operator::NotEq, )? .execute::(&mut ctx)?; - assert_arrays_eq!(not_empty, BoolArray::from_iter([true, false, true])); + assert_arrays_eq!( + not_empty, + BoolArray::from_iter([true, false, true]), + &mut ctx + ); Ok(()) } From 000485c3961168d4817ea9cbb1e4c822c57bf960 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Mon, 22 Jun 2026 22:55:26 -0400 Subject: [PATCH 31/48] Fix PolarSignals V2 scan regressions Optimize V2 flat slice results before returning them from scan reads so slices over constants collapse back to constants before Arrow export. Also avoid running expensive VarBinView buffer compaction analysis for dense byte-view exports while keeping sparse retained-buffer exports compact. 
Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/varbinview/compact.rs | 23 ++++++++++++------- vortex-layout/src/scan/v2/layouts/flat.rs | 5 +++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/vortex-array/src/arrays/varbinview/compact.rs b/vortex-array/src/arrays/varbinview/compact.rs index 2f6d459113f..9e962d41659 100644 --- a/vortex-array/src/arrays/varbinview/compact.rs +++ b/vortex-array/src/arrays/varbinview/compact.rs @@ -18,6 +18,9 @@ use crate::arrays::varbinview::Ref; use crate::builders::ArrayBuilder; use crate::builders::VarBinViewBuilder; +const DEFAULT_COMPACTION_THRESHOLD: f64 = 0.5; +const MIN_RETAINED_BYTES_PER_ROW_TO_CHECK_COMPACTION: u64 = 128; + impl VarBinViewArray { /// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This /// operation can be very expensive, in the worst case copying all existing string data into @@ -33,8 +36,7 @@ impl VarBinViewArray { return Ok(self.clone()); } - // Use selective compaction with threshold of 1.0 (compact any buffer with any waste) - self.compact_with_threshold(1.0) + self.compact_with_threshold(DEFAULT_COMPACTION_THRESHOLD) } fn should_compact(&self) -> VortexResult { @@ -50,12 +52,18 @@ impl VarBinViewArray { return Ok(true); } - let bytes_referenced: u64 = self.count_referenced_bytes()?; let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum(); + if buffer_total_bytes == 0 { + return Ok(true); + } - // If there is any wasted space, we want to repack. - // This is very aggressive. 
- Ok(bytes_referenced < buffer_total_bytes || buffer_total_bytes == 0) + let len = u64::try_from(self.len()).unwrap_or(u64::MAX); + if len > 0 && buffer_total_bytes / len <= MIN_RETAINED_BYTES_PER_ROW_TO_CHECK_COMPACTION { + return Ok(false); + } + + let bytes_referenced: u64 = self.count_referenced_bytes()?; + Ok((bytes_referenced as f64 / buffer_total_bytes as f64) < DEFAULT_COMPACTION_THRESHOLD) } /// Iterates over all valid, non-inlined views, calling the provided @@ -265,8 +273,7 @@ mod tests { .execute::(&mut LEGACY_SESSION.create_execution_ctx()) .unwrap(); - // Optimize the taken array - let optimized_array = taken_array.compact_buffers().unwrap(); + let optimized_array = taken_array.compact_with_threshold(1.0).unwrap(); // The optimized array should have exactly 1 buffer (consolidated) assert_eq!(optimized_array.data_buffers().len(), 1); diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index a8c97a7b1e8..46e12a58ef8 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -21,6 +21,7 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::SliceArray; use vortex_array::expr::Expression; +use vortex_array::optimizer::ArrayOptimizer; use vortex_array::serde::SerializedArray; use vortex_error::VortexError; use vortex_error::VortexResult; @@ -228,5 +229,7 @@ pub(crate) fn slice_to_range(array: ArrayRef, range: &Range) -> VortexResul if start == 0 && end == array.len() { return Ok(array); } - Ok(SliceArray::try_new(array, start..end)?.into_array()) + SliceArray::try_new(array, start..end)? 
+ .into_array() + .optimize() } From 80f0ddab2efeb1945c78427328b60d7d41d1a1a0 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 23 Jun 2026 21:46:29 -0400 Subject: [PATCH 32/48] Refactor scan tasks into continuation steps Signed-off-by: Nicholas Gates --- benchmarks/datafusion-bench/src/lib.rs | 25 +- .../extending/writing-a-layout.md | 4 +- .../internals/scan-scheduler.md | 1100 ++++------------- vortex-file/src/file.rs | 8 +- vortex-file/src/multi/scan_v2.rs | 775 ++++++++---- vortex-file/src/segments/source.rs | 10 +- vortex-layout/src/layout_v2.rs | 4 +- vortex-layout/src/scan/mod.rs | 1 + .../src/scan}/plan/evidence.rs | 0 .../src/scan}/plan/mod.rs | 745 ++++++++--- .../src/scan}/plan/request.rs | 0 vortex-layout/src/scan/v2/layouts/chunked.rs | 831 +++++++++---- vortex-layout/src/scan/v2/layouts/dict.rs | 863 ++++++++----- vortex-layout/src/scan/v2/layouts/flat.rs | 161 +-- vortex-layout/src/scan/v2/layouts/struct_.rs | 24 +- vortex-layout/src/scan/v2/layouts/zoned.rs | 270 ++-- vortex-layout/src/scan/v2/mod.rs | 9 +- vortex-layout/src/scan/v2/row_idx.rs | 108 +- vortex-layout/src/segments/mod.rs | 46 +- .../src/segments/scheduled.rs | 159 ++- .../src/segments/source.rs | 6 + vortex-scan/src/lib.rs | 3 +- vortex-scan/src/read.rs | 202 +++ vortex-scan/src/scheduler.rs | 20 +- vortex-scan/src/segments/mod.rs | 47 - vortex-scan/src/task.rs | 348 ++++-- 26 files changed, 3493 insertions(+), 2276 deletions(-) rename {vortex-scan/src => vortex-layout/src/scan}/plan/evidence.rs (100%) rename {vortex-scan/src => vortex-layout/src/scan}/plan/mod.rs (69%) rename {vortex-scan/src => vortex-layout/src/scan}/plan/request.rs (100%) rename {vortex-scan => vortex-layout}/src/segments/scheduled.rs (85%) rename {vortex-scan => vortex-layout}/src/segments/source.rs (74%) create mode 100644 vortex-scan/src/read.rs delete mode 100644 vortex-scan/src/segments/mod.rs diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index 
9d6720aa04d..d6026f25238 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -156,17 +156,32 @@ fn scan_scheduler_config_from_env() -> anyhow::Result { .transpose()? .unwrap_or_else(ScanSchedulerConfig::default_morsel_slots); - let read_byte_budget = std::env::var("VORTEX_SCAN_MAX_READ_BYTES") + let plan_window = std::env::var("VORTEX_SCAN_MORSEL_PLAN_WINDOW") .ok() .map(|value| { value - .parse::() - .map_err(|e| anyhow::anyhow!("invalid scan scheduler byte budget {value}: {e}")) + .parse::() + .map_err(|e| anyhow::anyhow!("invalid scan scheduler plan window {value}: {e}")) }) .transpose()?; - Ok(match read_byte_budget { - Some(bytes) => config.with_read_byte_budget(Some(bytes)), + let morsel_byte_budget = std::env::var("VORTEX_SCAN_MAX_MORSEL_BYTES") + .or_else(|_| std::env::var("VORTEX_SCAN_MAX_READ_BYTES")) + .ok() + .map(|value| { + value.parse::().map_err(|e| { + anyhow::anyhow!("invalid scan scheduler morsel byte budget {value}: {e}") + }) + }) + .transpose()?; + + let config = match plan_window { + Some(window) => config.with_morsel_plan_window(Some(window)), + None => config, + }; + + Ok(match morsel_byte_budget { + Some(bytes) => config.with_morsel_byte_budget(Some(bytes)), None => config, }) } diff --git a/docs/developer-guide/extending/writing-a-layout.md b/docs/developer-guide/extending/writing-a-layout.md index 738061a7e8c..1221fa22ee6 100644 --- a/docs/developer-guide/extending/writing-a-layout.md +++ b/docs/developer-guide/extending/writing-a-layout.md @@ -23,8 +23,8 @@ layout-specific `LayoutData`. 
```rust use vortex_layout::layout_v2; use vortex_layout::{LayoutChildType, LayoutId}; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::request::ScanRequest; +use vortex_layout::scan::plan::ScanPlanRef; +use vortex_layout::scan::plan::request::ScanRequest; use vortex_session::VortexSession; #[derive(Clone, Debug)] diff --git a/docs/developer-guide/internals/scan-scheduler.md b/docs/developer-guide/internals/scan-scheduler.md index 591ef684685..06126bcaa1c 100644 --- a/docs/developer-guide/internals/scan-scheduler.md +++ b/docs/developer-guide/internals/scan-scheduler.md @@ -1,930 +1,350 @@ # Scan Scheduler -:::{note} -This is an implementation design for scheduler-aware ScanPlan execution. It describes the resource -coordination shape that the scan runtime is growing toward. -::: - -The ScanPlan scan path needs a resource scheduler that can coordinate work across files, partitions, -and concurrent scans. The scheduler should be explicit and embeddable: a host engine can share one -scheduler across many scans to enforce global limits, or create a fresh scheduler for each query to -isolate resource usage. - -The design uses one shared `ScanScheduler` object for resource arbitration and one per-scan runtime -for query semantics. - -The existing `DataSource` / `ScanRequest` / `DataSourceScan` API remains the public query-engine -boundary for this phase. The scheduler and morsel runtime sit behind that boundary, so the first -implementation can improve scan execution without introducing a second scan API that mostly duplicates -the current one. - -## Goals - -- Bound scan resource usage across concurrent scans. -- Allow DataFusion users to choose a shared scheduler, a new scheduler per query, or an unbounded - mode. -- Give DuckDB a simple global scheduler owned by the extension session. -- Keep ScanPlan planning and morsel ordering local to each scan. 
-- Make I/O planning explicit enough that future evidence, predicate, and projection reads can be - deduplicated, batched, and prioritized without relying on hidden unpolled futures inside layout - readers. -- Keep storage backends pluggable: local files, object stores, HTTP range sources, memory buffers, - and future `io_uring`-backed sources should all fit behind the same scheduler-visible shape. -- Make cancellation and permit release reliable when a stream is dropped early. -- Keep scheduler APIs independent of layout internals so other `DataSource` implementations can use - the same resource controls. - -## Non-goals - -- Do not make a process-global singleton the only way to schedule scans. -- Do not put query semantics, filter ordering, evidence planning, or output ordering into the global - scheduler. -- Do not replace the `DataSource` scan API in the first scheduler implementation. If the public API - changes later, it should be because the ScanPlan runtime needs capabilities that cannot be added - compatibly to `ScanRequest` or `DataSourceScan`. -- Do not require every scan integration to expose the same configuration surface immediately. -- Do not solve cluster-wide distributed admission control. The scheduler is process-local. -- Do not design an opaque I/O path in the first implementation. If a future custom `ScanPlan` needs - non-segment I/O, add that as a small extension point next to `SegmentRequest`. - -## Core Model - -There are three layers: - -1. `ScanScheduler` - Arbitrates global resources such as I/O bytes, decoded bytes, request concurrency, decode task - concurrency, and per-scan fairness. - -2. `ScanTicket` - Represents one logical scan registered with a scheduler. It carries scan identity, cancellation, - priority, metrics, and per-scan limits. - -3. 
Per-scan `MorselScanRuntime` - Owns the ScanPlan graph, evidence/read/aggregate plans, morsel queue, row ordering, limit - handling, dynamic filters, and the choice of which work is useful next. - -`DataSource::scan` constructs this per-scan runtime internally and returns the existing -`DataSourceScan` wrapper. Query engines do not need to know the internal ScanPlan topology. - -The scheduler decides whether work may run. The per-scan runtime decides what work should run. +This document describes the current ScanPlan V2 scheduler and I/O pipeline. It is +an implementation guide, not a design sketch. + +The scheduler is split across three layers: + +- `vortex-scan::scheduler` owns the process/query-level scheduler object, + scheduler provider, scan tickets, and coarse configuration. +- `vortex-file::multi::scan_v2` owns the per-partition ScanPlan runtime. It + plans morsels, queues evidence/predicate/projection work, and decides which + queued task is useful next. +- `vortex-file::segments` and `vortex-file::read` own segment future + registration, logical read deduplication, physical range coalescing, and + backend request concurrency. + +The global scheduler is deliberately not a central work queue. It does not know +about predicates, layouts, row masks, or query semantics. The scan runtime makes +those decisions locally, then uses scheduler-visible read bytes and task lanes to +control how much work is launched. 
+ +## Execution Shape + +The normal DataFusion V2 path is: ```text -DataFusion / DuckDB +DataFusion DataSource::open(partition) | v -DataSource::scan(request) +VortexDataSource builds ScanRequest | v -resolve scheduler provider +DataSourceRef::plan_morsel_partitions or DataSourceRef::scan | v -ScanScheduler::register_scan(meta) -> ScanTicket +ScanSchedulerProvider::scheduler_for_scan | v -MorselScanRuntime +ScanScheduler::register_scan -> ScanTicket | - +-- plan next useful morsel - +-- acquire scheduler permits - +-- run evidence / read / decode / aggregate work - +-- release permits on completion or drop + v +partition_work_stream + | + +-- plan morsels into task queues + +-- register segment futures synchronously + +-- admit tasks by lane/frontier/read bytes + +-- poll task futures on the Vortex runtime + +-- emit arrays in ordered or unordered mode ``` -## Scheduler Ownership +`DataSource::plan_morsel_partitions` is used when the engine can consume many +output partitions. It opens files, asks each prepared file for split ranges, and +round-robins planned morsels across the engine-requested partition count. Each +partition then runs its own `partition_work_stream`, but planned morsels from the +same file share the same `ScanExecution`, `SegmentFutureCache`, and +`FileSegmentSource`. + +`DataSource::scan` is the fallback path. It yields file partitions and each file +partition creates its own `partition_work_stream`. -`ScanScheduler` is an ordinary shared object: +Limited scans force a morsel planning window of one because limit accounting is +owned by the scan runtime and must not speculatively consume rows far ahead of +the output frontier. -```rust -pub struct ScanScheduler { - config: ScanSchedulerConfig, - state: ScanSchedulerState, -} -``` +## Scheduler Objects -The object is normally used behind `Arc`. 
+`ScanSchedulerConfig` currently has these fields: -```rust -let scheduler = Arc::new(ScanScheduler::new(config)); -``` +- `global_slots`: optional process/query-wide slot limit. +- `per_scan_slots`: optional slot limit for each registered scan. +- `morsel_plan_window`: optional number of morsels a partition stream may plan + ahead. `None` means all pending morsels may be planned. +- `morsel_launch_window`: optional number of morsels intended to run + concurrently. This is configured but not currently consumed by `scan_v2`. +- `morsel_byte_budget`: optional per-partition active logical segment-byte budget. -Scheduler ownership is selected by a provider: +`ScanSchedulerProvider` chooses scheduler ownership: -```rust -pub enum ScanSchedulerProvider { - /// Use one scheduler for every scan that shares this provider. - Shared(Arc), +- `Unbounded`: create an unbounded scheduler for the scan. +- `Shared`: reuse one `Arc`. +- `PerScan`: create a fresh scheduler from the config for each logical scan. - /// Construct a new scheduler whenever a logical scan starts. - PerScan(ScanSchedulerConfig), +The default `VortexSession` provider is `Unbounded`. DuckDB installs a shared +default scheduler in the extension session. The DataFusion benchmark only +installs a scheduler when `VORTEX_SCAN_SCHEDULER` is set. - /// No resource limits. Useful as the compatibility default and for tests. - Unbounded, -} -``` +The `ScanScheduler::acquire` permit API exists and is tested, but V2 scan tasks +do not currently acquire permits before launching. In the current V2 runtime the +effective controls are the morsel planning window and the task queue morsel-byte +budget. Slot fields are still useful because `morsel_slots(n)` derives default +read-budgeted config, but the slots themselves are not yet an execution gate. -The provider is resolved when a logical scan starts, not when a table or data source is registered. 
-This matters for DataFusion, where a table can be registered once and executed many times. +## Planning Morsels -```rust -impl ScanSchedulerProvider { - pub fn scheduler_for_scan(&self, meta: &ScanMeta) -> Arc; -} -``` +`partition_work_stream` owns a `PartitionWorkSchedulerState`: -## Session Integration +- `pending`: planned morsel ranges not yet converted to runtime state. +- `morsels`: active morsel states indexed by morsel id. +- `task_queue`: queued evidence, predicate, projection, and aggregate tasks. +- `in_flight`: launched task futures. +- `completed_morsels`: ordered-output buffer. +- `plan_window`: maximum active planned morsels for this partition stream. -The scheduler provider should be stored on `VortexSession`, following the same pattern as -`RuntimeSession`. +On each stream poll, the runtime: -```rust -pub struct ScanSchedulerSession { - provider: Arc, -} +1. Emits already-completed output if possible. +2. Plans more morsels while `active_morsels < plan_window`. +3. Launches admissible queued tasks until the task queue refuses more work. +4. Waits for one launched task to complete. +5. Updates evidence, predicate masks, projection state, and read accounting. -pub trait ScanSchedulerSessionExt: SessionExt { - fn scan_scheduler_provider(&self) -> Arc; +Planning a morsel is synchronous. It creates initial evidence work and then calls +`enqueue_ready_work`. For scans without predicates, projection work is queued +immediately. For filtered scans, the runtime queues evidence first, then residual +predicate reads, then projection once all predicates are proven for the morsel. - fn with_scan_scheduler(self, scheduler: Arc) -> Self; +## Task Lanes - fn with_new_scan_scheduler_per_scan(self, config: ScanSchedulerConfig) -> Self; +`ScanTaskQueue` groups queued work into lanes: - fn with_unbounded_scan_scheduler(self) -> Self; -} -``` +- `ScanEvidence`: scan-domain evidence shared by all morsels for one predicate. 
+- `Evidence`: morsel-local evidence for one predicate. +- `Predicate`: exact residual predicate evaluation. +- `Projection`: final projected values. +- `Aggregate`: aggregate reads, grouped with projection. -The default can be `Unbounded` initially, so adopting the scheduler does not silently introduce new -resource limits. Integrations can opt into bounded scheduling explicitly. +Admission is not FIFO across all work. The queue tries groups in this order: -The scheduler types should live in `vortex-scan`, not `vortex-layout`, because the resource policy -belongs to the scan API layer and should be reusable by non-layout sources. ScanPlan-specific code in -`vortex-layout` can consume tickets and permits through the public scan scheduler API without making -the scheduler understand layout-specific plan types. +1. Evidence within its byte target. +2. Predicate within its byte target. +3. Projection within its byte target. +4. Predicate ignoring group target. +5. Projection ignoring group target. +6. Evidence ignoring group target. -## DataFusion Integration +All groups still obey the total morsel-byte budget unless the task contributes no +new bytes or the runtime has no launched work at all. The empty-in-flight escape +hatch prevents deadlock when one task is larger than the configured budget. -DataFusion should expose scheduler control in the table/source builders. +Within a group, lower priority wins, then lower incremental read bytes, then +lower total read bytes, then lower morsel id. The incremental byte score is +important because tasks reading the same active segment can be admitted without +increasing active physical-read pressure. 
-```rust -impl VortexDataSourceBuilder { - pub fn with_scan_scheduler(mut self, scheduler: Arc) -> Self; - - pub fn with_scan_scheduler_provider( - mut self, - provider: Arc, - ) -> Self; - - pub fn with_new_scan_scheduler_per_query( - mut self, - config: ScanSchedulerConfig, - ) -> Self; -} -``` +There is no fixed morsel read-ahead frontier. Morsels can vary substantially in +byte size and can overlap in their segment requests, so run-ahead is governed by +incremental active read bytes rather than by a count of morsels. A later morsel +with small or already-active reads may be admitted ahead of an earlier morsel +whose reads would exceed the active byte budget. -`VortexTable` should expose the same options when it builds a `VortexDataSource`. Listing-format -users that go through `VortexFormatFactory` should configure scheduling on the `VortexSession` -used by the factory; the factory does not currently carry a separate scheduler override. +For dynamic-predicate scans there is one extra gate: speculative projection is +suppressed while completed output is backlogged, except when there are no +launched tasks and one projection is needed to keep an ordered stream moving. +Evidence and predicate tasks are still admissible while projection is gated. This +favors avoiding wasted projection I/O over maximizing object-store request depth. -For DataFusion, `DataSource::open` creates a single Vortex scan for partition zero. A per-query -scheduler can therefore be resolved immediately before calling -`DataSourceRef::scan`. If DataFusion later produces multiple Vortex scan plans for one query and -those scans should share a per-query scheduler, the integration should propagate a scheduler through -DataFusion's `TaskContext` or another query-scoped extension and use that as the provider result. +## Morsel-Byte Budget -Recommended DataFusion modes: +`morsel_byte_budget` is per partition stream. 
It counts active logical segment +bytes for admitted tasks, deduped by `SegmentRequestKey`. If two launched tasks +await the same segment, only the first contributes bytes; the active entry keeps +a reference count until both tasks complete. -```rust -// One scheduler across an application, tenant, or SessionContext. -let scheduler = Arc::new(ScanScheduler::new(config)); -let source = VortexDataSource::builder(data_source, session) - .with_scan_scheduler(scheduler) - .build() - .await?; - -// A fresh scheduler each time this table is scanned. -let source = VortexDataSource::builder(data_source, session) - .with_new_scan_scheduler_per_query(config) - .build() - .await?; -``` - -Benchmark environment variables can map onto these APIs, but they should not be the primary control -surface: +When the budget is finite, the queue divides target bytes by group: ```text -VORTEX_SCAN_SCHEDULER=unbounded|shared|per-query -VORTEX_SCAN_MAX_MORSEL_SLOTS=... -``` - -## DuckDB Integration - -DuckDB can use one scheduler in the extension's global session. - -```rust -static SCAN_SCHEDULER: LazyLock> = - LazyLock::new(|| Arc::new(ScanScheduler::new(ScanSchedulerConfig::duckdb_default()))); - -static SESSION: LazyLock = LazyLock::new(|| { - let session = VortexSession::default() - .with_handle(RUNTIME.handle()) - .with_scan_scheduler(Arc::clone(&SCAN_SCHEDULER)); - vortex_geo::initialize(&session); - session -}); -``` - -This matches DuckDB's current extension shape: a global runtime and global Vortex session. It still -keeps the scheduler explicit and testable. - -## Work Requests and Permits - -Scan work should acquire scheduler permits before consuming bounded resources. - -The first implementation should not require every `PreparedRead`, `PreparedEvidence`, or `PreparedAggregate` -to expose pending I/O, decoded-size estimates, or cost statistics. Those estimates are useful, but -they are also hard to get right and would make the initial ScanPlan API more rigid. 
The scan runtime -already knows the coarse unit of scheduling: the morsel. The MVP scheduler should admit morsels and -let each admitted morsel run its evidence/read/aggregate pipeline internally. - -The MVP `WorkRequest` should be coarse: - -```rust -pub struct WorkRequest { - pub class: ScanWorkClass, - pub slots: u32, -} - -pub enum ScanWorkClass { - FileOpen, - Morsel, - OutputConversion, -} - -impl ScanScheduler { - pub fn register_scan(&self, meta: ScanMeta) -> ScanTicket; - - pub async fn acquire( - &self, - ticket: &ScanTicket, - request: WorkRequest, - ) -> VortexResult; -} +predicate: 6/8 of budget +projection: 1/8 of budget +evidence: 1/8 of budget ``` -Richer byte/task fields can be added once the runtime has instrumentation showing which resource -limits matter in practice: +These are soft group targets. The second pass can use any remaining total budget +for predicate, projection, or evidence, but no task can exceed the total budget +unless it is the only way to make progress. -```rust -pub struct PreparedCostHint { - pub estimated_io_bytes: Option, - pub estimated_decoded_bytes: Option, - pub estimated_cpu_units: Option, -} -``` - -If those hints are added, they should remain advisory. A prepared handle that does not provide hints should -still be schedulable with default morsel accounting. - -`WorkPermit` is RAII. Dropping it releases every reserved resource. This is required for early -limit termination, query cancellation, stream drop, and panic-safe cleanup. 
- -```rust -pub struct WorkPermit { - scheduler: Arc, - reservation: ReservationId, -} - -impl Drop for WorkPermit { - fn drop(&mut self) { - self.scheduler.release(self.reservation); - } -} -``` - -Once byte accounting exists, large work should be allowed to resize reservations after the actual -memory footprint is known: - -```rust -impl WorkPermit { - pub async fn grow_decoded_bytes(&mut self, bytes: u64) -> VortexResult<()>; - - pub fn shrink_decoded_bytes(&mut self, bytes: u64); -} -``` - -This will let the scan reserve from estimates first, then correct accounting after decoding. - -## Explicit Segment Request Model - -The ScanPlan path makes segment requests explicit enough for scheduling while keeping physical I/O -inside the segment source. Layouts still refer to logical segments by -`SegmentId`, and the scheduler should stay at that same abstraction level. It should know which -registered source owns the segment and roughly how many bytes the segment costs, but it should not -need the segment's physical byte location: - -```rust -pub struct SegmentRequest { - pub source: SegmentSourceId, - pub segment: SegmentId, - pub bytes: u64, - pub phase: ScanIoPhase, - pub priority: ScanPriority, - pub cancel_group: CancelGroup, -} - -pub enum ScanIoPhase { - EvidenceSetup, - EvidenceProbe, - PredicateRead, - ProjectionRead, - AggregateRead, -} -``` - -`SegmentId` is not a physical I/O address. It is a layout-local reference. A `VortexFile` binds -that reference when it instantiates a ScanPlan tree: +The default bounded config uses: ```text -footer segment map + opened byte source - | - v -SegmentId -> SegmentInfo { bytes, cacheability, source-local metadata } -``` - -For normal Vortex files, the source is the `VortexReadAt` returned by `VortexOpenOptions` or -`FileSystem::open_read`. For a custom ScanPlan, the source might be an HTTP range reader, an -in-memory reader, or another backend that can provide segment payloads. 
- -The first implementation should only support segment requests. A future non-segment I/O hook can be -added next to `SegmentRequest` if a custom source cannot present its work as segment payloads, but -leaving that hook out of the initial API keeps the scheduler boundary smaller. - -There are two stages for making this request model authoritative: - -1. **Intermediate: scheduled morsel futures.** - Constructing a morsel future synchronously registers the segment requests that future will later - await. The returned future owns those segment futures, so the scheduler can construct work ahead - of time, observe its byte cost, reorder or drop it, and only poll it when useful. - -2. **End state: strict scheduler-backed resolution.** - Plans describe their exact segment requests before execution, the scheduler/source submits them, - and execution reads through a context backed by the submitted request set. In this mode, reading - a segment that was not declared is an error or an explicitly metered late request. - -The intermediate stage preserves the useful old pre-registration behavior, but moves it out of -layout-reader side effects and into an explicit scheduler context. - -## Segment Source Registration - -Segment sources are registered against a scan ticket and receive scheduler-local identities: - -```rust -impl ScanTicket { - pub fn register_segment_source( - &self, - source: Arc, - meta: SegmentSourceMeta, - ) -> SegmentSourceId; -} -``` - -Equivalently, this can be spelled as `ScanScheduler::register_segment_source(&ticket, ...)` if the -implementation wants the scheduler to own all mutation directly. The ergonomic API should keep the -source tied to the ticket, because source identity is only meaningful within the scheduler context -that is arbitrating the scan. - -`SegmentSourceId` should be opaque. 
A shared scheduler may internally deduplicate physical sources -by an optional stable `SegmentSourceKey`, but correctness must not depend on that global -deduplication. The minimum guarantee is scan-local identity: all requests with the same -`SegmentSourceId` target the same registered source and may be deduped or batched together. - -For a prepared `VortexFile`, source registration happens during file preparation, before layout -plans produce runtime segment requests. Layout-specific plans should not know how a file was -opened. A flat layout can continue to store `segment_id`; the prepared file state translates that ID to a -`SegmentRequest` using the bound segment table. - -Custom ScanPlans that own independent I/O register their own sources during preparation or state -initialization. For example, an HTTP-backed plan can register a source that maps `SegmentId`s to -HTTP range requests internally and produce `SegmentRequest`s against the returned -`SegmentSourceId`. - -## Batching and Coalescing - -`ScanScheduler` should own logical scheduling, not physical coalescing. - -The intermediate scheduler should expose a scheduling context to plan execution constructors: - -```rust -pub struct ScheduleCtx<'a> { - ticket: &'a ScanTicket, - source_id: SegmentSourceId, - source: Arc, - in_flight: &'a SegmentFutureCache, -} - -impl ScheduleCtx<'_> { - pub fn request_for_segment(&self, segment: SegmentId) -> VortexResult; - - pub fn request_segment(&mut self, request: SegmentRequest) -> SegmentFuture; -} -``` - -`request_segment` is synchronous. It dedupes by `(SegmentSourceId, SegmentId)`, submits to the -registered source when needed, and returns a shared future for the logical segment payload. Adjacent -morsels and different prepared handles that touch the same segment receive clones of the same shared future -while it remains in flight. 
- -The scheduler should therefore: - -- construct scheduled morsel futures ahead of polling; -- let those constructors call `ScheduleCtx::request_segment` for the I/O they will later await; -- cache in-flight segment futures by `(SegmentSourceId, SegmentId)`; -- group pending reads by `SegmentSourceId`; -- prioritize grouped reads based on phase, frontier, memory pressure, cancellation, and observed - predicate selectivity; -- submit ordered batches or windows of segment requests to each source. - -For the current intermediate implementation, scans without a pushed-down limit should default to an -unbounded planning window and a bounded launch window. Constructing the planned morsel registers -segment futures for the scan window, while only -the launch window controls how many morsels are actively polled and decoded. Ordered scans use the -same planning and launch machinery, but projection completions are buffered behind an ordered -emission frontier. Scans with a pushed-down limit should continue using a `1/1` plan/launch window -until limit accounting can be preserved with a wider frontier. - -The `ScheduledSegmentSource` should own physical coalescing and submission: - -```rust -pub trait ScheduledSegmentSource: Send + Sync { - fn segment_info(&self, id: SegmentId) -> VortexResult; - - fn capabilities(&self) -> SegmentSourceCapabilities; - - fn submit(&self, batch: SegmentBatch) -> Vec; -} -``` - -Different backends make different tradeoffs. Local files, object stores, HTTP range readers, memory -buffers, and an `io_uring` implementation have different queue depths, alignment constraints, -cancellation behavior, request overheads, and tolerance for over-reading. The scheduler can hand a -source nearby segment requests in a useful order, but the source decides whether to merge their -underlying byte ranges into one physical request, issue them independently, or use a backend-specific -submission queue. 
- -The in-flight future cache is a scan/runtime data structure, not a decoded-data cache. Its job is to -avoid duplicate physical submission while scheduled morsels overlap. Once no scheduled future or -plan state retains interest, the in-flight entry may be dropped. Longer-lived reuse belongs either -in plan state, such as a decoded flat array or zoned stats table, or in the segment cache described -below. - -This preserves the existing `VortexReadAt` abstraction. A default `ReadAtSegmentSource` can wrap -`Arc` and use the source's `coalesce_config()` and `concurrency()` as physical -submission policy behind a file-backed `ScheduledSegmentSource`. The current `FileSegmentSource` -can then become a compatibility adapter over the same machinery rather than the scheduler-visible -abstraction. - -The important invariant is that dedupe and coalescing never cross `SegmentSourceId` unless an -implementation has proven that two IDs share the same physical source. A byte range on one HTTP -endpoint is not interchangeable with the same range on another endpoint. - -## Segment Cache - -The segment cache should cache segment payloads, not entire files and not arbitrary coalesced byte -ranges. The unit stored in the cache is the exact buffer a layout segment would receive after a -physical read has been sliced back to the segment boundary. - -The cache key must be source/file scoped. A raw `SegmentId` is only meaningful inside one footer's -segment map. Reusing `SegmentId(0)` across two files must not collide. A scheduler-aware key should -therefore include source identity plus the logical segment: - -```rust -pub struct SegmentCacheKey { - pub source: SegmentSourceId, - pub segment_id: SegmentId, -} -``` - -If a shared scheduler later wants cross-scan cache reuse for the same object, it can translate -`SegmentSourceId` to an optional stable `SegmentSourceKey`, such as an opened-file identity with -size and version metadata. That optimization is separate from correctness. 
The scan-local key is -sufficient for deduping and cache lookup within one prepared scan. - -Cache lookup should happen before physical I/O submission: - -1. The runtime produces a cacheable `SegmentRequest`. -2. The scheduler dedupes exact logical requests. -3. The source adapter checks the segment cache for each cacheable request. -4. Cache hits complete the logical segment request without consuming physical I/O queue depth. -5. Cache misses are submitted to the underlying `ScheduledSegmentSource`. -6. When a physical read completes, the source stores the exact segment slice, not the coalesced - super-range. - -This can be implemented as a cached source adapter: - -```rust -CachedSegmentSource { - cache: Arc, - inner: Arc, -} -``` - -The adapter mirrors today's `SegmentCacheSourceAdapter`, but it works with explicit -`SegmentRequest`s instead of `SegmentFuture`s. It also preserves the current -`InitialReadSegmentCache` behavior: when footer parsing already fetched bytes that cover whole -segments, the prepared file can seed those segment entries and avoid issuing later reads. - -Segment cache admission should remain a cache policy decision at first. The scheduler should observe -hits, misses, and stores for metrics, and it may eventually coordinate a shared cache memory budget, -but it does not need to own eviction to schedule scans correctly. - -## Scheduled Morsel Futures - -`ScanPlan` and prepared handles serve different purposes: - -- `ScanPlan` is the expanded layout tree with capabilities. It answers whether a layout can push an - expression, produce evidence, read values, split work, or answer statistics. -- A prepared handle is a reusable compiled route through that tree for one purpose, such as reading - a projection expression or producing one predicate's evidence. It should not own frontier state - and should not have an `execute_next(len)` API. - -The drive/cursor owns frontier state and chooses explicit morsel ranges. 
Prepared handles execute -explicit work: - -```rust -pub struct MorselScope<'a> { - pub range: Range, - pub rows: RowScope<'a>, -} - -pub struct ScheduledRead<'a> { - pub range: Range, - pub phase: ScanIoPhase, - pub bytes: u64, - pub future: BoxFuture<'a, VortexResult>, -} - -pub trait ScheduledPreparedRead { - fn schedule_morsel<'a>( - &'a self, - scope: MorselScope<'a>, - state: &'a Self::State, - cx: &'a mut ScheduleCtx<'_>, - local: &'a mut ExecutionCtx, - ) -> VortexResult>; -} -``` - -The exact Rust shape may differ, but the important property is that `schedule_morsel` is not an -`async fn`. It runs immediately, requests every segment the returned future will await, and returns -a future that may be polled later. For example, a flat leaf requests its segment before returning -the decode future: - -```rust -fn schedule_morsel(...) -> VortexResult> { - let request = cx.request_for_segment(flat.segment_id())?; - let segment = cx.request_segment(request); - Ok(ScheduledRead::new(async move { - let bytes = segment.await?; - decode_flat(bytes) - })) -} +DEFAULT_MORSEL_BYTE_BUDGET = 256 MiB ``` -This avoids a pure "declare requests, then ignore them during execution" layer. The scheduled -future captures the segment futures that define its I/O lifetime. Dropping an unpolled scheduled -morsel releases its interest; polling it later awaits the already-registered segment futures. - -The scheduler can construct scheduled work ahead until one or more thresholds are reached: - -- in-flight segment bytes; -- in-flight scheduled morsels; -- projected output or intermediate memory; -- maximum distance ahead of the contiguous morsel frontier. +`ScanSchedulerConfig::unbounded()` leaves this unset, which becomes `u64::MAX` +inside `partition_work_stream`. -It can then choose which scheduled futures to poll using phase, readiness, observed selectivity, -frontier pressure, and byte size. Evidence can run farther ahead because output is small and it -sharpens later work. 
Predicate reads should be ordered by expected selectivity per byte. Projection -reads should stay near the accepted-row frontier so the scan does not retain an entire filtered -stream before emitting output. +## Segment Requests -## End-State Prepared-Handle Introspection - -The stricter end state can add explicit request introspection on top of scheduled morsel futures. -In that model, prepared handles describe the segments they would need before execution, the -scheduler submits those requests, and execution receives a resolver backed by the submitted request -set: +Prepared reads and evidence providers expose segment requests before task launch. +The runtime turns those requests into `ScanRead` values with: ```rust -pub trait PreparedRead { - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - state: &Self::State, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::unknown()) - } -} - -pub trait PreparedEvidence { - fn segment_requests( - &self, - req: &EvidenceRequest<'_>, - state: &Self::State, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::unknown()) - } -} +register_segment_reads_cached(cache, source, requests) ``` -Leaf prepared handles can provide exact requests. A flat leaf reports the segment bound to its `segment_id`. -Zoned evidence reports the shared stats-table setup read separately from cheap per-morsel probes. -Struct and apply prepared reads compose child requests. Chunked prepared reads use `selection` and `demand` to include -only the chunks that actually require data, preserving the current selected-but-undemanded behavior -where default filler can be produced without expanding or reading a child. - -In strict mode, prepared handles that return `unknown` cannot use the strict resolver without falling back to -an explicit late request path. That fallback should be observable in metrics and should eventually -disappear from core layouts. 
This is why the scheduled-morsel-future model is the better -intermediate step: it makes I/O registration authoritative without requiring every prepared handle -to expose a perfect request set on day one. - -## Morsel Pipeline - -Long term, the per-scan runtime should manage a state machine per morsel: - -```text -Planned - -> EvidenceReady - -> PredicateReady - -> ProjectionReady - -> Emitted | Pruned - -> Released -``` - -Evidence, predicate, and projection work should be pipelined rather than globally phased: - -- Run shared evidence setup far ahead when it is cheap and likely to prune later work. -- Run evidence probes ahead within an evidence-memory budget. -- Schedule residual predicate reads using observed cost and selectivity. -- Schedule projection reads with output backpressure so the scan does not filter the entire stream - and retain all surviving masks before producing batches. -- Re-run cheap `recheck_before_projection` evidence immediately before projection when dynamic - predicate versions changed while the morsel was in flight. +This call is synchronous. For cache misses, it calls the underlying +`SegmentSource::request(segment)` immediately and stores a shared future in the +scan-local `SegmentFutureCache`. That means simply planning work registers the +logical reads with the file segment source before the task future is polled. -The runtime, not the global scheduler, owns predicate semantics. It tracks masks, predicate -versions, limits, output ordering, and aggregate state. The scheduler only sees resource classes, -segment requests, priorities, reservations, cancellation state, and source IDs. +The cache key is currently the logical `SegmentId`. That is sufficient inside one +`ScanExecution` because each execution has one bound file segment source. It is +not a cross-file or cross-scan cache key. -Predicate ordering should be adaptive. The runtime can keep per-predicate statistics such as: +`SegmentInfo` includes `bytes` and `cacheable`. 
The task scheduler currently uses +`bytes` for read-budget admission. The `cacheable` flag is not part of task +admission policy. -- evidence prune rate; -- residual selectivity; -- bytes read per evaluated row; -- latency; -- cache hit rate; -- downstream projection bytes avoided. +## Physical I/O -These observations feed future priority decisions. A residual predicate that is cheap and highly -selective should run before an expensive low-selectivity predicate. A predicate whose evidence setup -is already cached may become cheap enough to run earlier. These are per-scan policy decisions, not -layout-node behavior. +`FileSegmentSource` bridges logical segment requests to a `VortexReadAt` backend. +It has an internal event stream with these request states: -## Morsel Frontier +- registered: a segment future exists, but has not been polled; +- requested: the segment future has been polled; +- in-flight: the physical backend read has been submitted; +- resolved: the future has completed. -The scheduler-aware runtime should own the per-file morsel frontier. Each prepared file tracks the -set of morsels that may still read state. When a morsel is emitted or pruned, the runtime advances -the contiguous completed frontier and calls release hooks on prepared reads and scan plans. +Registered but unpolled requests are still visible to coalescing. When one +request is polled, `IoRequestStream` picks the earliest polled request and may +coalesce nearby registered or polled requests by physical offset. -This is required for lookahead. Without a frontier, running evidence and predicate work far ahead can -leave decoded chunks, flat arrays, zone maps, and masks retained longer than intended. 
The release -frontier lets layouts keep only the working set: +Physical coalescing is controlled by `VortexReadAt::coalesce_config()`: ```text -unfinished: [m3, m7, m8] -completed: [m0, m1, m2, m4, m5, m6] -frontier: end(m2) +in-memory: 8 KiB distance, 8 KiB max +local file: 1 MiB distance, 4 MiB max +object storage: 1 MiB distance, 16 MiB max ``` -The frontier advances only through contiguous completed morsels. Later completed morsels cannot -release earlier state until the gap closes. - -## Resources to Control - -The first implementation should control active execution: - -- Maximum morsels in flight per scan. -- Maximum morsels in flight across a shared scheduler. - -This intentionally approximates the current scan behavior: scans without a pushed-down limit can run -several morsels concurrently. Ordered scans keep the same work window, but emit projection results -through an ordered frontier. Scans with a pushed-down limit should run with a narrower launch -window. The default launch window should be proportional to available scan parallelism: +Physical request concurrency is controlled by `VortexReadAt::concurrency()`: ```text -no limit: max_morsels_in_flight = 4 * available_parallelism -limit: max_morsels_in_flight = 1 +ObjectStoreReadAt default concurrency = 192 ``` -The shared scheduler can apply the same window globally, per scan, or both. For example, a -DataFusion user can choose one shared scheduler with `4 * available_parallelism` total morsel slots -to cap the whole process, or create a new scheduler per query to isolate resource accounting. - -Later implementations can add: +This concurrency is below the scan task queue. The object-store layer can only +use that depth if the scan runtime has registered and polled enough segment +futures. -- I/O bytes in flight. -- Decoded/intermediate bytes in flight. -- Number of outstanding I/O operations. -- Number of decode/CPU tasks spawned by the scan path. -- Scheduler-aware segment cache admission. 
-- Per-scan weights and priorities. -- Storage-class-specific concurrency, such as separate local disk and object store limits. -- Output batch memory handoff, where permits live until the query engine consumes the batch. +## Object Store Behavior -Output memory is the hardest resource to account for because ownership leaves the scan runtime. -The first byte-accounting implementation should bound intermediate scan memory and treat output -batch accounting as a follow-up. +The current object-store path has good physical defaults but no automatic scan +scheduler preset: -## Fairness +- `ObjectStoreReadAt` uses object-store coalescing and high physical request + concurrency. +- DataFusion remote benchmarks create the `VortexSession` before registering the + object store URL, so the Vortex scheduler provider cannot infer S3/GCS from + the source URL. +- DuckDB uses a shared bounded scheduler by default, but `morsel_launch_window` + is not yet enforced by V2. +- DataFusion uses an unbounded scheduler unless benchmark environment variables + opt into a scheduler. -A shared scheduler must avoid letting one large scan submit enough work to starve smaller scans. -The initial policy should combine global limits with per-scan windows: +For object stores, the main risk is not the `ObjectStoreReadAt` queue depth. It +is failing to expose enough useful segment futures early enough, or exposing far +too many tiny/sparse reads without a workload-specific budget. The important +knobs are: -- Each `ScanTicket` has a maximum number of in-flight morsels. -- Global slot semaphores bound aggregate morsel concurrency. -- Work is admitted only when both the per-scan and global limits allow it. 
+- `morsel_plan_window`: how far ahead segment futures are registered; +- `morsel_byte_budget`: how many active logical segment bytes may be polled; +- physical coalescing distance/max size on the object-store reader; +- physical object-store request concurrency; +- DataFusion output partition count, which controls how many partition streams + run at once. -This is simpler than a centralized global work queue and avoids making the scheduler responsible for -query semantics. Weighted fair scheduling can be added later if the per-scan windows are not enough. - -## Morsel Runtime - -The scan should move toward an explicit per-scan runtime. The MVP can still execute one whole -morsel after acquiring one coarse scheduler permit, but the runtime boundary should be chosen so it -can later split a morsel into evidence, predicate, projection, emit, and release work without -changing the public `DataSource` API. - -```rust -pub struct MorselScanRuntime { - scheduler: Arc, - ticket: ScanTicket, - plan: ScanRuntimePlan, - state: ScanRuntimeState, -} -``` +The hardcoded frontier slack of four morsels can also matter for remote storage. +Even with an unbounded plan window, queued task admission does not run arbitrarily +far ahead of the lowest queued morsel. If each morsel produces only a few small +range reads, this can under-fill a high-latency object store after the initial +registration burst. -`ScanRuntimePlan` is internal to the scan implementation. It contains the files, expanded ScanPlan -trees, pushed expressions, prepared evidence handles, prepared reads, prepared aggregate handles, -and reusable per-file state. -It is not a replacement public scan API. 
+## Benchmark Knobs -MVP execution loop: +The DataFusion benchmark supports: ```text -while output is still required: - claim next morsel - acquire per-morsel scheduler permits - read evidence needed for pruning or satisfaction - update row selection - read residual filter columns if needed - evaluate residual filter - read projected values or update aggregate state - emit batch or aggregate partial - release permits +VORTEX_SCAN_SCHEDULER=unbounded|shared|per-query +VORTEX_SCAN_MAX_MORSEL_SLOTS=... +VORTEX_SCAN_MORSEL_PLAN_WINDOW=... +VORTEX_SCAN_MAX_MORSEL_BYTES=... ``` -Target execution loop: +`VORTEX_SCAN_MAX_MORSEL_SLOTS` currently feeds `ScanSchedulerConfig::morsel_slots`. +Because V2 does not enforce launch permits yet, this mainly selects a bounded +config with the default morsel-byte budget unless paired with explicit +morsel-byte budget configuration. -```text -while output is still required: - choose explicit morsel ranges from the drive/cursor - construct scheduled evidence/predicate/projection futures until budget is full - each scheduled future registers its segment requests through ScheduleCtx - poll the most useful scheduled futures based on phase, bytes, selectivity, and frontier - update evidence, predicate masks, projection demand, or output state as futures complete - advance the per-file frontier and release state behind it -``` +`VORTEX_SCAN_MAX_READ_BYTES` is accepted as a compatibility fallback for older +benchmark scripts. -The scheduler should not know that the work is "zoned evidence" or "dict prepared read". It should see -resource classes, source IDs, segment requests, slot counts, cancellation state, and priorities. -The per-scan runtime maps layout-specific plan behavior into those generic scheduler inputs. +Useful S3 sweeps should compare: -## Cancellation - -`ScanTicket` owns a cancellation token. +```text +# Current compatibility behavior. 
+VORTEX_SCAN_SCHEDULER=unbounded -```rust -impl ScanTicket { - pub fn cancel(&self); +# Bounded read pressure, one scheduler per query. +VORTEX_SCAN_SCHEDULER=per-query +VORTEX_SCAN_MAX_MORSEL_BYTES=268435456 - pub fn is_cancelled(&self) -> bool; -} +# Larger remote-storage byte window. +VORTEX_SCAN_SCHEDULER=per-query +VORTEX_SCAN_MAX_MORSEL_BYTES=1073741824 ``` -Cancellation should happen when: - -- The engine drops the stream. -- A limit has been satisfied. -- A scheduler admission wait is cancelled. -- The host engine explicitly cancels the query. - -Queued work must observe the ticket before starting. Running work must release permits on drop. - -## Metrics - -The scheduler should expose per-scheduler and per-scan metrics: - -- Permit wait time by resource. -- Morsels admitted, completed, cancelled, and skipped. -- Per-scan queue/admission delay. - -Later byte/task accounting should add bytes reserved, peak bytes reserved, I/O operations admitted, -and decode tasks admitted. - -DataFusion should attach these to the existing scan metrics path where possible. DuckDB can expose -them through tracing or debug logs first. - -## Implementation Plan - -1. Add scheduler API to `vortex-scan`. - Include `ScanScheduler`, `ScanSchedulerConfig`, `ScanSchedulerProvider`, `ScanSchedulerSession`, - `ScanTicket`, `WorkRequest`, and `WorkPermit`. - -2. Wire the scan to register one ticket per `DataSource::scan` call. - Store the ticket and scheduler in the `DataSourceScan` so all partitions from the same scan - share one resource view. - -3. Add permits around morsel execution. - Start with one scheduler slot per in-flight morsel. Do not require `PreparedRead`, - `PreparedEvidence`, or `PreparedAggregate` to expose cost estimates in the MVP. Keep byte accounting and output batch - memory accounting out of the MVP. - -4. Add DataFusion builder controls. - Support shared scheduler and per-query scheduler modes on `VortexDataSource`, `VortexTable`, and - `VortexFormatFactory`. 
- -5. Add DuckDB global scheduler. - Store a shared scheduler in the extension's global `VortexSession`. - -6. Add benchmark env vars. - Use them to compare unbounded, shared, and per-query scheduler modes under TPC-H and ClickBench. - -7. Add fairness and cancellation tests. - Tests should cover permit release on stream drop, per-scan windows, shared limits across two - scans, and per-query isolation. - -8. Add scheduler-scoped segment source registration. - A prepared `VortexFile` should register its opened segment source and keep a bound segment table - that maps `SegmentId` to `SegmentInfo`. Physical byte locations stay inside the registered - source. - -9. Add a scheduler-owned in-flight segment future cache. - Key it by `(SegmentSourceId, SegmentId)`. `ScheduleCtx::request_segment` should synchronously - submit or reuse the logical segment request and return a shared future for the segment payload. - -10. Convert prepared-handle execution to scheduled morsel future construction. - `PreparedRead` and `PreparedEvidence` should expose synchronous future constructors for explicit - morsel ranges. Constructing the future registers all segment futures it will await. The drive - can then construct work ahead until byte/frontier/memory thresholds are full and decide which - futures to poll. - -11. Route segment requests through source batches. - The per-scan runtime should dedupe logical segment reads, group pending requests by - `SegmentSourceId`, and submit ordered batches to the source. The default source adapter should - wrap `VortexReadAt` and own physical coalescing using that backend's `coalesce_config()` and - `concurrency()`. - -12. Move segment-cache lookup into the segment request path. - Add `SegmentCacheKey` to cacheable `SegmentRequest`s and implement a cached source adapter that - checks the segment cache before submitting physical I/O, then stores exact segment slices on - misses. Preserve initial-read cache seeding during file preparation. - -13. 
Split whole-morsel execution into pipeline work. - Add explicit morsel states for evidence, residual predicate reads, projection, emit, and - release. Use observed selectivity and I/O cost to reprioritize predicate work within each scan. - -14. Drive the morsel frontier. - Track completed/pruned morsels per file and call prepared-read/scan-plan release hooks as the - contiguous frontier advances. - -15. Add strict end-state segment resolution. - Add exact request introspection and a scheduler-backed read context. In strict mode, execution - reads only submitted segments from that context; undeclared segment reads are errors or explicit - late requests with metrics. Start with flat, zoned, dictionary, struct/apply, and chunked - composition, then remove late fallback from core layouts. - -## Open Questions - -- Should the default scheduler remain unbounded permanently, or should ScanPlan scans eventually use bounded - defaults? -- How should DataFusion propagate one per-query scheduler across several Vortex scan plans in the - same physical plan? -- Should scheduler config be part of the public stable scan API or remain integration-specific until - the ScanPlan scan is more mature? -- How should output batch memory be accounted once ownership moves into DataFusion or DuckDB? -- Should segment cache memory share the scheduler's decoded/intermediate budget, or have a separate - cache budget coordinated by the same scheduler? -- Should `SegmentSourceId` be strictly scan-local, or should a shared scheduler expose optional - cross-scan source keys for deduping reads against the same opened object? -- How much physical coalescing feedback should a `ScheduledSegmentSource` report back to the - scheduler for adaptive policy and metrics? +The most useful remote sweep is fixed read budget with plan windows such as 16, +64, 256, and unset/unbounded. Small plan windows are expected to reduce +coalescing and hurt S3 unless they also avoid substantial over-read. 
+ +An active-logical-read target was tested as an I/O-depth proxy and rejected: it +improved some FineWeb cases, but regressed local PolarSignals enough that it was +too indirect to use as a scheduler knob. + +## Tuning Guidance + +For local NVMe, keep the read budget moderate and rely on local filesystem +coalescing. Excessive read-ahead can increase memory pressure without hiding much +latency. + +For S3/GCS, prefer a larger byte budget and a large or unbounded plan window so +the file segment source can see adjacent registered requests and coalesce them. +If a query is highly selective and projection reads are sparse, validate the +coalesced-byte metrics before increasing the object-store coalescing max size. +If dynamic predicates are active, also compare projection-gated behavior against +object-store request depth: the gate is intended to avoid wasted projection I/O, +but it can reduce S3 latency hiding for projection-light queries. + +Use scan metrics to separate three failure modes: + +- low object-store request concurrency: not enough futures are being polled; +- low coalescing: not enough adjacent futures are registered before polling; +- excessive over-read: coalesced requests are much larger than useful projected + segment bytes. + +The scheduler today cannot distinguish those automatically. The next practical +tuning step is to sweep the plan window in the benchmark and, separately, +enforce `morsel_launch_window` with scheduler permits so slot configuration +matches runtime behavior. + +## Known Gaps + +- `morsel_launch_window`, `global_slots`, and `per_scan_slots` are not enforced + by `scan_v2` task launch. +- The benchmark can configure scheduler mode, plan window, and byte budget, but + not physical object-store coalescing or request concurrency. +- There is no automatic object-store scheduler preset. +- The scan runtime accounts logical segment bytes, not physical coalesced bytes. +- `SegmentInfo::cacheable` is not used by task admission. 
+- Output Arrow conversion is outside the scan task queue and has separate + buffering in the DataFusion adapter. diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index 6f1483e9f88..23f6bcf2081 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -22,15 +22,15 @@ use vortex_array::stream::SendableArrayStream; use vortex_error::VortexResult; use vortex_layout::LayoutReader; use vortex_layout::scan::layout::LayoutReaderDataSource; +use vortex_layout::scan::plan::PreparedStateCache; +use vortex_layout::scan::plan::PreparedStateCacheRef; +use vortex_layout::scan::plan::ScanPlanRef; use vortex_layout::scan::scan_builder::ScanBuilder; use vortex_layout::scan::split_by::SplitBy; +use vortex_layout::segments::SegmentFutureCache; use vortex_layout::segments::SegmentSource; use vortex_scan::DataSourceRef; use vortex_scan::ScanRequest; -use vortex_scan::plan::PreparedStateCache; -use vortex_scan::plan::PreparedStateCacheRef; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::segments::SegmentFutureCache; use vortex_session::VortexSession; use crate::FileStatistics; diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index f4e1fb89222..f04a5b29581 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -48,8 +48,42 @@ use vortex_io::filesystem::FileListing; use vortex_io::filesystem::FileSystemRef; use vortex_io::runtime::Handle; use vortex_io::session::RuntimeSessionExt; +use vortex_layout::scan::plan::EvidenceScope; +use vortex_layout::scan::plan::OwnedRowScope; +use vortex_layout::scan::plan::PrepareCtx; +use vortex_layout::scan::plan::PreparedAggregateRef; +use vortex_layout::scan::plan::PreparedEvidenceRef; +use vortex_layout::scan::plan::PreparedReadRef; +use vortex_layout::scan::plan::PreparedStats; +use vortex_layout::scan::plan::PreparedStatsRef; +use vortex_layout::scan::plan::PushCtx; +use vortex_layout::scan::plan::ReadContext; +use vortex_layout::scan::plan::ReadTask; 
+use vortex_layout::scan::plan::ReadTaskOutput; +use vortex_layout::scan::plan::ScanPlan; +use vortex_layout::scan::plan::ScanPlanRef; +use vortex_layout::scan::plan::ScanState; +use vortex_layout::scan::plan::ScanStateRef; +use vortex_layout::scan::plan::StateCtx; +use vortex_layout::scan::plan::downcast_state; +use vortex_layout::scan::plan::evidence::EvidenceFragment; +use vortex_layout::scan::plan::evidence::PredicateEvidence; +use vortex_layout::scan::plan::evidence::PredicateEvidenceKind; +use vortex_layout::scan::plan::evidence::PredicateId; +use vortex_layout::scan::plan::evidence::PredicateVersion; +use vortex_layout::scan::plan::request::EvidenceMode; +use vortex_layout::scan::plan::request::OwnedEvidenceRequest; +use vortex_layout::scan::plan::request::ScanRequest; use vortex_layout::scan::v2::validate_temporal_comparisons; use vortex_layout::scan::v2::with_row_idx; +use vortex_layout::segments::ReadResultsSegmentSource; +use vortex_layout::segments::ScanIoPhase; +use vortex_layout::segments::ScanRead; +use vortex_layout::segments::SegmentFutureCache; +use vortex_layout::segments::SegmentPlanCtx; +use vortex_layout::segments::SegmentRequests; +use vortex_layout::segments::SegmentSource; +use vortex_layout::segments::register_segment_reads_cached; use vortex_mask::Mask; use vortex_metrics::MetricsRegistry; use vortex_scan::DataSource; @@ -65,40 +99,13 @@ use vortex_scan::ScanRequest as DataSourceScanRequest; use vortex_scan::ScanScheduler; use vortex_scan::ScanSchedulerSessionExt; use vortex_scan::ScanTicket; -use vortex_scan::plan::EvidenceScope; -use vortex_scan::plan::OwnedRowScope; -use vortex_scan::plan::PrepareCtx; -use vortex_scan::plan::PreparedAggregateRef; -use vortex_scan::plan::PreparedEvidenceRef; -use vortex_scan::plan::PreparedReadRef; -use vortex_scan::plan::PreparedStats; -use vortex_scan::plan::PreparedStatsRef; -use vortex_scan::plan::PushCtx; -use vortex_scan::plan::ReadContext; -use vortex_scan::plan::ScanPlan; -use 
vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::ScanState; -use vortex_scan::plan::ScanStateRef; -use vortex_scan::plan::StateCtx; -use vortex_scan::plan::downcast_state; -use vortex_scan::plan::evidence::EvidenceFragment; -use vortex_scan::plan::evidence::PredicateEvidence; -use vortex_scan::plan::evidence::PredicateEvidenceKind; -use vortex_scan::plan::evidence::PredicateId; -use vortex_scan::plan::evidence::PredicateVersion; -use vortex_scan::plan::request::EvidenceMode; -use vortex_scan::plan::request::OwnedEvidenceRequest; -use vortex_scan::plan::request::ScanRequest; -use vortex_scan::segments::CachedSegmentSource; -use vortex_scan::segments::ScanIoPhase; -use vortex_scan::segments::ScanRead; -use vortex_scan::segments::SegmentFutureCache; -use vortex_scan::segments::SegmentPlanCtx; -use vortex_scan::segments::SegmentRequests; -use vortex_scan::segments::SegmentSource; -use vortex_scan::segments::register_segment_reads_cached; +use vortex_scan::read::ReadResults; +use vortex_scan::read::ReadStore; +use vortex_scan::read::ReadStoreRef; use vortex_scan::selection::Selection; -use vortex_scan::task::FutureScanTask; +use vortex_scan::task::ScanStep; +use vortex_scan::task::ScanStepResult; +use vortex_scan::task::ScanTask; use vortex_scan::task::ScanTaskBox; use vortex_scan::task::ScanTaskLane; use vortex_scan::task::ScanTaskQueue; @@ -703,7 +710,7 @@ impl DataSource for ScanPlanDataSource { } let morsel_plan_window = morsel_plan_window(&scheduler, false); - let read_byte_budget = read_byte_budget(&scheduler); + let morsel_byte_budget = morsel_byte_budget(&scheduler); Ok(Some(Arc::new(PlannedScanPlanScan { dtype, @@ -712,7 +719,7 @@ impl DataSource for ScanPlanDataSource { ticket, handle: self.session.handle(), morsel_plan_window, - read_byte_budget, + morsel_byte_budget, }))) } @@ -1094,7 +1101,7 @@ type QueuedWork = ScanTaskBox; struct LaunchedWorkOutput { lane: ScanTaskLane, reads: Vec, - output: VortexResult, + output: VortexResult, } struct 
EvidenceWorkOutput { @@ -1130,6 +1137,307 @@ enum WorkOutput { Projection(ProjectionWorkOutput), } +enum WorkPoll { + Ready(WorkOutput), + Pending(QueuedWork), +} + +struct ScanEvidenceWaitTask { + execution: Arc, + morsel_id: usize, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + lane: ScanTaskLane, + priority: u64, +} + +impl ScanTask for ScanEvidenceWaitTask { + fn morsel_id(&self) -> usize { + self.morsel_id + } + + fn phase(&self) -> ScanIoPhase { + ScanIoPhase::EvidenceProbe + } + + fn lane(&self) -> ScanTaskLane { + self.lane + } + + fn reads(&self) -> &[ScanTaskRead] { + &[] + } + + fn priority(&self) -> u64 { + self.priority + } + + fn into_step(self: Box) -> VortexResult> { + let task = *self; + let morsel_id = task.morsel_id; + let lane = task.lane; + let priority = task.priority; + Ok(ScanStep::new( + morsel_id, + ScanIoPhase::EvidenceProbe, + lane, + Vec::new(), + Vec::new(), + Vec::new(), + move |_| { + if !task.execution.scan_evidence_provider_ready( + task.predicate_idx, + task.evidence_idx, + task.version, + ) && task.execution.predicates[task.predicate_idx].version() == task.version + { + return Ok(ScanStepResult::Continue(Box::new(task))); + } + + Ok(ScanStepResult::Ready(WorkOutput::ScanEvidence( + ScanEvidenceWorkOutput { + execution: Arc::clone(&task.execution), + morsel_id: task.morsel_id, + predicate_idx: task.predicate_idx, + evidence_idx: task.evidence_idx, + version: task.version, + fragments: None, + }, + ))) + }, + ) + .with_priority(priority)) + } +} + +struct PredicateReadWorkTask { + execution: Arc, + task: Box, + reads: Vec, + morsel_id: usize, + predicate_idx: usize, + version: PredicateVersion, + range: Range, + need: Mask, + compact: bool, + len: usize, + priority: u64, + lane: ScanTaskLane, +} + +impl ScanTask for PredicateReadWorkTask { + fn morsel_id(&self) -> usize { + self.morsel_id + } + + fn phase(&self) -> ScanIoPhase { + ScanIoPhase::PredicateRead + } + + fn lane(&self) -> ScanTaskLane { + 
self.lane + } + + fn reads(&self) -> &[ScanTaskRead] { + &self.reads + } + + fn priority(&self) -> u64 { + self.priority + } + + fn into_step(self: Box) -> VortexResult> { + let task = *self; + let read_step = task.task.into_step()?; + let morsel_id = task.morsel_id; + let lane = task.lane; + let reads = task.reads.clone(); + let priority = task.priority; + Ok(ScanStep::new( + morsel_id, + ScanIoPhase::PredicateRead, + lane, + reads, + read_step.required_reads, + read_step.prefetch_reads, + move |results| { + let reader = task.execution.resolved_reader(results.clone()); + let mut ctx = task.execution.session.create_execution_ctx(); + let array = match read_step.continuation.run(&reader, &mut ctx, results)? { + ReadTaskOutput::Ready(array) => array, + ReadTaskOutput::Continue(read_task) => { + return Ok(ScanStepResult::Continue(Box::new(PredicateReadWorkTask { + execution: task.execution, + task: read_task, + reads: task.reads, + morsel_id: task.morsel_id, + predicate_idx: task.predicate_idx, + version: task.version, + range: task.range, + need: task.need, + compact: task.compact, + len: task.len, + priority: task.priority, + lane: task.lane, + }))); + } + }; + let result = if task.compact { + let compact = array.null_as_false().execute(&mut ctx)?; + if compact.len() != task.need.true_count() { + vortex_bail!( + "compacted residual result length {} does not match demanded row count {}", + compact.len(), + task.need.true_count() + ); + } + task.need.intersect_by_rank(&compact) + } else { + array.null_as_false().execute(&mut ctx)? 
+ }; + if result.len() != task.len { + vortex_bail!( + "residual result length {} does not match morsel length {}", + result.len(), + task.len + ); + } + let pass = &result & &task.need; + let input_rows = task.need.true_count(); + let pass_rows = pass.true_count(); + let exact = !&task.need | &pass; + Ok(ScanStepResult::Ready(WorkOutput::Evidence( + EvidenceWorkOutput { + morsel_id: task.morsel_id, + predicate_idx: task.predicate_idx, + version: task.version, + source: EvidenceWorkSource::Predicate { + input_rows, + pass_rows, + }, + fragments: vec![EvidenceFragment::new( + task.range.clone(), + PredicateEvidenceKind::ExactMask(exact), + )], + }, + ))) + }, + ) + .with_priority(priority)) + } +} + +struct ProjectionReadWorkTask { + execution: Arc, + task: Box, + reads: Vec, + morsel_id: usize, +} + +impl ScanTask for ProjectionReadWorkTask { + fn morsel_id(&self) -> usize { + self.morsel_id + } + + fn phase(&self) -> ScanIoPhase { + ScanIoPhase::ProjectionRead + } + + fn lane(&self) -> ScanTaskLane { + ScanTaskLane::Projection + } + + fn reads(&self) -> &[ScanTaskRead] { + &self.reads + } + + fn priority(&self) -> u64 { + ScanStep::::DEFAULT_PRIORITY + } + + fn into_step(self: Box) -> VortexResult> { + let task = *self; + let read_step = task.task.into_step()?; + let reads = task.reads.clone(); + Ok(ScanStep::new( + task.morsel_id, + ScanIoPhase::ProjectionRead, + ScanTaskLane::Projection, + reads, + read_step.required_reads, + read_step.prefetch_reads, + move |results| { + let reader = task.execution.resolved_reader(results.clone()); + let mut ctx = task.execution.session.create_execution_ctx(); + match read_step.continuation.run(&reader, &mut ctx, results)? 
{ + ReadTaskOutput::Ready(array) => Ok(ScanStepResult::Ready( + WorkOutput::Projection(ProjectionWorkOutput { + morsel_id: task.morsel_id, + array, + }), + )), + ReadTaskOutput::Continue(read_task) => { + Ok(ScanStepResult::Continue(Box::new(ProjectionReadWorkTask { + execution: task.execution, + task: read_task, + reads: task.reads, + morsel_id: task.morsel_id, + }))) + } + } + }, + )) + } +} + +async fn resolve_scan_reads(read_store: ReadStoreRef, reads: Vec) -> VortexResult<()> { + let mut pending_reads = FuturesUnordered::new(); + for read in reads { + let key = read.request.key; + if read_store.get(key).is_none() { + pending_reads.push(async move { read.future.await.map(|buffer| (key, buffer)) }); + } + } + while let Some(result) = pending_reads.next().await { + let (key, buffer) = result?; + read_store.insert(key, buffer); + } + Ok(()) +} + +fn prefetch_scan_reads(handle: &Handle, read_store: ReadStoreRef, reads: Vec) { + if reads.is_empty() { + return; + } + handle + .spawn(async move { + if let Err(error) = resolve_scan_reads(read_store, reads).await { + tracing::debug!( + target: "vortex_file::scan_v2", + ?error, + "scan2 prefetch read failed" + ); + } + }) + .detach(); +} + +async fn run_scan_task_step( + work: QueuedWork, + read_store: ReadStoreRef, + handle: Handle, +) -> VortexResult { + let mut step = work.into_step()?; + let (required_reads, prefetch_reads) = step.take_reads(); + prefetch_scan_reads(&handle, Arc::clone(&read_store), prefetch_reads); + resolve_scan_reads(Arc::clone(&read_store), required_reads).await?; + match step.continue_with(ReadResults::new(Arc::clone(&read_store)))? 
{ + ScanStepResult::Ready(output) => Ok(WorkPoll::Ready(output)), + ScanStepResult::Continue(work) => Ok(WorkPoll::Pending(work)), + } +} + enum CompletedMorsel { Empty, Output(ArrayRef), @@ -1170,14 +1478,13 @@ struct ScanEvidenceSlot { version: Option, pending: Option, fragments: Vec, - waiters: Vec>, } enum ScanEvidenceAction { Ready, Pending, Prepare, - Wait(oneshot::Receiver<()>), + Wait, } #[derive(Default)] @@ -1196,6 +1503,7 @@ struct PartitionWorkSchedulerState { next_emit_morsel_id: usize, task_queue: ScanTaskQueue, in_flight: FuturesUnordered>, + read_store: ReadStoreRef, completed_morsels: BTreeMap, handle: Handle, ordered: bool, @@ -1213,8 +1521,8 @@ fn morsel_plan_window(scheduler: &ScanScheduler, limited: bool) -> usize { .unwrap_or(usize::MAX) } -fn read_byte_budget(scheduler: &ScanScheduler) -> u64 { - scheduler.config().read_byte_budget().unwrap_or(u64::MAX) +fn morsel_byte_budget(scheduler: &ScanScheduler) -> u64 { + scheduler.config().morsel_byte_budget().unwrap_or(u64::MAX) } fn partition_work_stream( @@ -1224,7 +1532,7 @@ fn partition_work_stream( handle: Handle, ordered: bool, plan_window: usize, - read_byte_budget: u64, + morsel_byte_budget: u64, ) -> impl futures::Stream> + Send + 'static { let has_dynamic_predicates = morsels .iter() @@ -1234,7 +1542,7 @@ fn partition_work_stream( morsel_count = morsels.len(), ordered, plan_window, - read_byte_budget, + morsel_byte_budget, has_dynamic_predicates, "created scan2 task stream" ); @@ -1246,8 +1554,9 @@ fn partition_work_stream( in_flight_projection_tasks: 0, next_morsel_id: 0, next_emit_morsel_id: 0, - task_queue: ScanTaskQueue::new(read_byte_budget), + task_queue: ScanTaskQueue::new(morsel_byte_budget), in_flight: FuturesUnordered::new(), + read_store: Arc::new(ReadStore::new()), completed_morsels: BTreeMap::new(), handle, ordered, @@ -1289,9 +1598,16 @@ fn partition_work_stream( match state.in_flight.next().await { Some(output) => { state.release_reads(output.lane, &output.reads); - match 
output.output.and_then(|output| state.complete_work(output)) { - Ok(Some(array)) => return Some((Ok(array), state)), - Ok(None) => continue, + match output.output { + Ok(WorkPoll::Ready(output)) => match state.complete_work(output) { + Ok(Some(array)) => return Some((Ok(array), state)), + Ok(None) => continue, + Err(error) => return Some((Err(error), state)), + }, + Ok(WorkPoll::Pending(work)) => { + state.task_queue.push(work); + continue; + } Err(error) => return Some((Err(error), state)), } } @@ -1311,6 +1627,7 @@ impl PartitionWorkSchedulerState { self.next_emit_morsel_id = 0; self.task_queue.clear(); self.in_flight = FuturesUnordered::new(); + self.read_store = Arc::new(ReadStore::new()); self.completed_morsels.clear(); } @@ -1327,7 +1644,17 @@ impl PartitionWorkSchedulerState { return Ok(()); }; let morsel_id = self.next_morsel_id; + let range = morsel.range.clone(); let Some(planned) = morsel.execution.plan_morsel(morsel_id, morsel.range)? else { + tracing::trace!( + target: "vortex_file::scan_v2", + morsel_id, + range_start = range.start, + range_end = range.end, + pending_morsels = self.pending.len(), + active_morsels = self.active_morsels, + "scan2 skipped empty morsel" + ); return Ok(()); }; self.next_morsel_id = self.next_morsel_id.saturating_add(1); @@ -1336,8 +1663,22 @@ impl PartitionWorkSchedulerState { self.morsels.resize_with(morsel_id + 1, || None); } self.morsels[morsel_id] = Some(planned.state); + let evidence_len = planned.evidence.len(); self.task_queue.extend(planned.evidence); self.enqueue_ready_work(morsel_id)?; + tracing::trace!( + target: "vortex_file::scan_v2", + morsel_id, + range_start = range.start, + range_end = range.end, + pending_morsels = self.pending.len(), + active_morsels = self.active_morsels, + queued_evidence = evidence_len, + evidence_queue_len = self.task_queue.evidence_len(), + predicate_queue_len = self.task_queue.predicate_len(), + projection_queue_len = self.task_queue.projection_len(), + "scan2 planned morsel" + ); 
Ok(()) } @@ -1362,12 +1703,37 @@ impl PartitionWorkSchedulerState { } fn launch_admitted(&mut self, work: QueuedWork, lane: ScanTaskLane, reads: Vec) { + let morsel_id = work.morsel_id(); let phase = work.phase(); let priority = work.priority(); let bytes = scan_task_read_bytes(&reads); - let future = work.into_future(); + let read_count = reads.len(); + tracing::trace!( + target: "vortex_file::scan_v2", + morsel_id, + ?phase, + ?lane, + read_count, + read_bytes = bytes, + priority, + in_flight = self.in_flight.len(), + in_flight_projection_tasks = self.in_flight_projection_tasks, + active_morsels = self.active_morsels, + pending_morsels = self.pending.len(), + evidence_queue_len = self.task_queue.evidence_len(), + predicate_queue_len = self.task_queue.predicate_len(), + projection_queue_len = self.task_queue.projection_len(), + active_read_count = self.task_queue.active_read_count(), + active_read_bytes = self.task_queue.active_read_bytes(), + active_evidence_read_bytes = self.task_queue.active_evidence_read_bytes(), + active_predicate_read_bytes = self.task_queue.active_predicate_read_bytes(), + active_projection_read_bytes = self.task_queue.active_projection_read_bytes(), + "scan2 launching work" + ); + let read_store = Arc::clone(&self.read_store); + let handle = self.handle.clone(); let future = async move { - let output = future.await; + let output = run_scan_task_step(work, read_store, handle).await; LaunchedWorkOutput { lane, reads, @@ -1376,8 +1742,10 @@ impl PartitionWorkSchedulerState { } .instrument(tracing::trace_span!( "scan2_work", + morsel_id, phase = ?phase, lane = ?lane, + read_count, read_bytes = bytes, )); let inline_zero_read = bytes == 0 @@ -1953,7 +2321,7 @@ impl Partition for ScanPlanPartition { let ranges = execution.splits(&row_range)?; let ordered = execution.plan.ordered(); let plan_window = morsel_plan_window(&scheduler, execution.limit_remaining.is_some()); - let read_byte_budget = read_byte_budget(&scheduler); + let morsel_byte_budget 
= morsel_byte_budget(&scheduler); let morsels = ranges .into_iter() .map(|range| PlannedScanPlanMorsel { @@ -1969,7 +2337,7 @@ impl Partition for ScanPlanPartition { handle, ordered, plan_window, - read_byte_budget, + morsel_byte_budget, ); Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( @@ -1985,7 +2353,7 @@ struct PlannedScanPlanScan { ticket: ScanTicket, handle: Handle, morsel_plan_window: usize, - read_byte_budget: u64, + morsel_byte_budget: u64, } #[derive(Clone)] @@ -2068,7 +2436,7 @@ impl Partition for PlannedScanPlanPartition { handle, false, planned.morsel_plan_window, - planned.read_byte_budget, + planned.morsel_byte_budget, ); Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( @@ -2098,7 +2466,6 @@ struct PreparedPredicatePlan { struct ScanExecution { // Runtime instantiation of a prepared plan: source binding, prepared handles, and scan state. session: VortexSession, - reader: ReadContext, plan: Arc, limit_remaining: Option>, segment_source: Arc, @@ -2241,14 +2608,6 @@ impl ScanExecution { let session = file.session().clone(); let segment_source = file.segment_source(); let segment_future_cache = file.scan_plan_segment_future_cache(); - let reader = ReadContext::new( - Arc::new(CachedSegmentSource::new( - Arc::clone(&segment_source), - Arc::clone(&segment_future_cache), - )), - session.clone(), - ); - let mut prepare_ctx = PrepareCtx::with_state_cache(session.clone(), file.scan_plan_state_cache()); let projection = Arc::clone(plan.projection()) @@ -2294,7 +2653,6 @@ impl ScanExecution { Ok(Self { session, - reader, plan, limit_remaining, segment_source, @@ -2311,7 +2669,21 @@ impl ScanExecution { .with_phase(phase) } - fn register_segment_reads(&self, requests: SegmentRequests) -> Vec { + fn register_segment_reads(&self, requests: SegmentRequests) -> VortexResult> { + if requests.is_unknown() { + vortex_bail!("scan2 task produced unknown segment requests") + } + Ok(register_segment_reads_cached( + self.segment_future_cache.as_ref(), + 
self.segment_source.as_ref(), + requests, + )) + } + + fn register_prefetch_segment_reads(&self, requests: SegmentRequests) -> Vec { + if requests.is_unknown() { + return Vec::new(); + } register_segment_reads_cached( self.segment_future_cache.as_ref(), self.segment_source.as_ref(), @@ -2319,6 +2691,16 @@ impl ScanExecution { ) } + fn resolved_reader(&self, results: ReadResults) -> ReadContext { + ReadContext::new( + Arc::new(ReadResultsSegmentSource::new( + Arc::clone(&self.segment_source), + results, + )), + self.session.clone(), + ) + } + fn predicate_priority(&self, predicate_idx: usize, demand_rows: usize) -> u64 { let predicate = &self.predicates[predicate_idx]; let static_cost = predicate.static_cost.max(1); @@ -2434,16 +2816,11 @@ impl ScanExecution { if !create_waiter { return Ok(ScanEvidenceAction::Pending); } - let (send, recv) = oneshot::channel(); - slot.waiters.push(send); - return Ok(ScanEvidenceAction::Wait(recv)); + return Ok(ScanEvidenceAction::Wait); } - // Wake waiters for any older version. They will observe the version change and + // Any older version is superseded. Polling waiters observe the version change and // re-enter planning for the current dynamic boundary. 
- if slot.pending.is_some() || slot.version.is_some() { - wake_scan_evidence_waiters(slot); - } slot.pending = Some(version); Ok(ScanEvidenceAction::Prepare) } @@ -2464,7 +2841,6 @@ impl ScanExecution { }; if slot.pending == Some(version) { slot.pending = None; - wake_scan_evidence_waiters(slot); } } @@ -2502,7 +2878,6 @@ impl ScanExecution { })?; if slot.pending != Some(version) && slot.version != Some(version) { - wake_scan_evidence_waiters(slot); return Ok(false); } @@ -2510,7 +2885,6 @@ impl ScanExecution { slot.pending = None; slot.fragments = fragments; predicate.generation = predicate.generation.saturating_add(1); - wake_scan_evidence_waiters(slot); Ok(true) } @@ -2577,40 +2951,19 @@ impl ScanExecution { match self.reserve_scan_evidence(predicate_idx, evidence_idx, version, create_waiter)? { ScanEvidenceAction::Ready => {} ScanEvidenceAction::Pending => {} - ScanEvidenceAction::Wait(waiter) => { - let execution = Arc::clone(self); - work.push( - FutureScanTask::new_in_lane( - morsel_id, - ScanIoPhase::EvidenceProbe, - ScanTaskLane::ScanEvidence { - predicate_idx: predicate_idx_u32, - evidence_idx: evidence_idx_u32, - }, - Vec::new(), - async move { - if !execution.scan_evidence_provider_ready( - predicate_idx, - evidence_idx, - version, - ) && execution.predicates[predicate_idx].version() == version - { - let _ = waiter.await; - } - Ok(WorkOutput::ScanEvidence(ScanEvidenceWorkOutput { - execution, - morsel_id, - predicate_idx, - evidence_idx, - version, - fragments: None, - })) - } - .boxed(), - ) - .with_priority(priority) - .boxed(), - ); + ScanEvidenceAction::Wait => { + work.push(Box::new(ScanEvidenceWaitTask { + execution: Arc::clone(self), + morsel_id, + predicate_idx, + evidence_idx, + version, + lane: ScanTaskLane::ScanEvidence { + predicate_idx: predicate_idx_u32, + evidence_idx: evidence_idx_u32, + }, + priority, + }) as QueuedWork); } ScanEvidenceAction::Prepare => { let req = OwnedEvidenceRequest { @@ -2624,7 +2977,11 @@ impl ScanExecution { let 
result = (|| { let requests = plan.segment_requests(&req.as_request(), &mut segment_ctx)?; - let reads = self.register_segment_reads(requests); + let reads = self.register_segment_reads(requests)?; + let prefetch_requests = + plan.prefetch_segment_requests(&req.as_request(), &mut segment_ctx)?; + let prefetch_reads = + self.register_prefetch_segment_reads(prefetch_requests); let work_reads = ScanTaskRead::from_scan_reads(&reads); let priority = plan .cost(&req.as_request()) @@ -2633,9 +2990,9 @@ impl ScanExecution { mode == EvidenceMode::RecheckBeforeProjection, ) .saturating_add(predicate.static_cost); - let task = Arc::clone(plan).create_task(req, reads)?; + let task = Arc::clone(plan).create_task(req, Vec::new())?; let execution = Arc::clone(self); - Ok(FutureScanTask::new_in_lane( + Ok(ScanStep::new( morsel_id, ScanIoPhase::EvidenceProbe, ScanTaskLane::ScanEvidence { @@ -2643,18 +3000,22 @@ impl ScanExecution { evidence_idx: evidence_idx_u32, }, work_reads, - async move { - let fragments = task.evidence(&execution.reader).await?; - Ok(WorkOutput::ScanEvidence(ScanEvidenceWorkOutput { - execution, - morsel_id, - predicate_idx, - evidence_idx, - version, - fragments: Some(fragments), - })) - } - .boxed(), + reads, + prefetch_reads, + move |results| { + let reader = execution.resolved_reader(results); + let fragments = task.evidence(&reader)?; + Ok(ScanStepResult::Ready(WorkOutput::ScanEvidence( + ScanEvidenceWorkOutput { + execution, + morsel_id, + predicate_idx, + evidence_idx, + version, + fragments: Some(fragments), + }, + ))) + }, ) .with_priority(priority) .boxed()) @@ -2704,7 +3065,10 @@ impl ScanExecution { u32::try_from(evidence_idx).map_err(|_| vortex_err!("too many evidence plans"))?; let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); let requests = plan.segment_requests(&req.as_request(), &mut segment_ctx)?; - let reads = self.register_segment_reads(requests); + let reads = self.register_segment_reads(requests)?; + let 
prefetch_requests = + plan.prefetch_segment_requests(&req.as_request(), &mut segment_ctx)?; + let prefetch_reads = self.register_prefetch_segment_reads(prefetch_requests); let work_reads = ScanTaskRead::from_scan_reads(&reads); let priority = plan .cost(&req.as_request()) @@ -2713,10 +3077,10 @@ impl ScanExecution { mode == EvidenceMode::RecheckBeforeProjection, ) .saturating_add(predicate.static_cost); - let task = Arc::clone(plan).create_task(req.clone(), reads)?; + let task = Arc::clone(plan).create_task(req.clone(), Vec::new())?; let execution = Arc::clone(self); work.push( - FutureScanTask::new_in_lane( + ScanStep::new( morsel_id, ScanIoPhase::EvidenceProbe, ScanTaskLane::Evidence { @@ -2724,17 +3088,21 @@ impl ScanExecution { evidence_idx: evidence_idx_u32, }, work_reads, - async move { - let fragments = task.evidence(&execution.reader).await?; - Ok(WorkOutput::Evidence(EvidenceWorkOutput { - morsel_id, - predicate_idx, - version, - source: EvidenceWorkSource::Provider, - fragments, - })) - } - .boxed(), + reads, + prefetch_reads, + move |results| { + let reader = execution.resolved_reader(results); + let fragments = task.evidence(&reader)?; + Ok(ScanStepResult::Ready(WorkOutput::Evidence( + EvidenceWorkOutput { + morsel_id, + predicate_idx, + version, + source: EvidenceWorkSource::Provider, + fragments, + }, + ))) + }, ) .with_priority(priority) .boxed(), @@ -2765,76 +3133,41 @@ impl ScanExecution { predicate .read .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; - let reads = self.register_segment_reads(requests); + let reads = self.register_segment_reads(requests)?; + let prefetch_requests = predicate.read.prefetch_segment_requests( + range.clone(), + rows.as_scope(), + &mut segment_ctx, + )?; + let prefetch_reads = self.register_prefetch_segment_reads(prefetch_requests); let work_reads = ScanTaskRead::from_scan_reads(&reads); - let task = Arc::clone(&predicate.read).create_task(range.clone(), rows, reads)?; + let task = 
Arc::clone(&predicate.read).create_task( + range.clone(), + rows, + reads, + prefetch_reads, + &mut segment_ctx, + )?; let execution = Arc::clone(self); let predicate_idx_u32 = u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; - Ok(FutureScanTask::new_in_lane( + Ok(Box::new(PredicateReadWorkTask { + execution, + task, + reads: work_reads, morsel_id, - ScanIoPhase::PredicateRead, - ScanTaskLane::Predicate { + predicate_idx, + version, + range, + need, + compact, + len, + priority, + lane: ScanTaskLane::Predicate { predicate_idx: predicate_idx_u32, }, - work_reads, - async move { - let mut ctx = execution.session.create_execution_ctx(); - // Filter-first: when few rows are demanded, read with selection = `need` so the leaf - // returns the compacted (filtered) array and an expensive residual (e.g. an FSST - // `LIKE`) evaluates over only `need.true_count()` rows. The compacted verdict is - // scattered back into the morsel domain via `intersect_by_rank`, giving a full-length - // mask identical to the dense path's `result & need`. Mirrors V1's flat-reader gate. - let result = if compact { - let compact = task - .read(&execution.reader, &mut ctx) - .await? - .null_as_false() - .execute(&mut ctx)?; - if compact.len() != need.true_count() { - vortex_bail!( - "compacted residual result length {} does not match demanded row count {}", - compact.len(), - need.true_count() - ); - } - need.intersect_by_rank(&compact) - } else { - task - .read(&execution.reader, &mut ctx) - .await? - .null_as_false() - .execute(&mut ctx)? 
- }; - if result.len() != len { - vortex_bail!( - "residual result length {} does not match morsel length {len}", - result.len() - ); - } - let pass = &result & &need; - let input_rows = need.true_count(); - let pass_rows = pass.true_count(); - let exact = !&need | &pass; - Ok(WorkOutput::Evidence(EvidenceWorkOutput { - morsel_id, - predicate_idx, - version, - source: EvidenceWorkSource::Predicate { - input_rows, - pass_rows, - }, - fragments: vec![EvidenceFragment::new( - range, - PredicateEvidenceKind::ExactMask(exact), - )], - })) - } - .boxed(), - ) - .with_priority(priority) - .boxed()) + })) } fn plan_projection_work( @@ -2867,29 +3200,29 @@ impl ScanExecution { let requests = self.projection .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; - let reads = self.register_segment_reads(requests); + let reads = self.register_segment_reads(requests)?; + let prefetch_requests = self.projection.prefetch_segment_requests( + range.clone(), + rows.as_scope(), + &mut segment_ctx, + )?; + let prefetch_reads = self.register_prefetch_segment_reads(prefetch_requests); let work_reads = ScanTaskRead::from_scan_reads(&reads); - let task = Arc::clone(&self.projection).create_task(range, rows, reads)?; + let task = Arc::clone(&self.projection).create_task( + range, + rows, + reads, + prefetch_reads, + &mut segment_ctx, + )?; let execution = Arc::clone(self); - Ok(Some( - FutureScanTask::new_in_lane( - morsel_id, - ScanIoPhase::ProjectionRead, - ScanTaskLane::Projection, - work_reads, - async move { - let mut ctx = execution.session.create_execution_ctx(); - let array = task.read(&execution.reader, &mut ctx).await?; - Ok(WorkOutput::Projection(ProjectionWorkOutput { - morsel_id, - array, - })) - } - .boxed(), - ) - .boxed(), - )) + Ok(Some(Box::new(ProjectionReadWorkTask { + execution, + task, + reads: work_reads, + morsel_id, + }))) } fn splits(&self, row_range: &Range) -> VortexResult>> { @@ -2916,12 +3249,6 @@ fn push_overlapping_fragments( Ok(()) } -fn 
wake_scan_evidence_waiters(slot: &mut ScanEvidenceSlot) { - for waiter in slot.waiters.drain(..) { - drop(waiter.send(())); - } -} - fn slice_evidence_fragment( fragment: &EvidenceFragment, range: &Range, diff --git a/vortex-file/src/segments/source.rs b/vortex-file/src/segments/source.rs index 0ac13257e93..62ee04f6c0d 100644 --- a/vortex-file/src/segments/source.rs +++ b/vortex-file/src/segments/source.rs @@ -12,6 +12,7 @@ use futures::FutureExt; use futures::StreamExt; use futures::channel::mpsc; use futures::future; +use tracing::Instrument; use vortex_array::buffer::BufferHandle; use vortex_buffer::Alignment; use vortex_buffer::ByteBuffer; @@ -114,8 +115,15 @@ impl FileSegmentSource { .map(move |req| { let reader = reader.clone(); async move { + let offset = req.offset(); + let len = req.len(); let result = reader - .read_at(req.offset(), req.len(), req.alignment()) + .read_at(offset, len, req.alignment()) + .instrument(tracing::trace_span!( + "vortex_segment_read", + offset, + len, + )) .await; let result = result.and_then(|buffer| { if req.len() != buffer.len() { diff --git a/vortex-layout/src/layout_v2.rs b/vortex-layout/src/layout_v2.rs index 90147a7848e..246120a1e41 100644 --- a/vortex-layout/src/layout_v2.rs +++ b/vortex-layout/src/layout_v2.rs @@ -26,8 +26,6 @@ use vortex_error::vortex_ensure; use vortex_error::vortex_err; use vortex_flatbuffers::FlatBuffer; use vortex_flatbuffers::layout; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::request::ScanRequest; use vortex_session::VortexSession; use vortex_session::registry::ReadContext; use vortex_session::registry::Registry; @@ -40,6 +38,8 @@ use crate::layouts::zoned::ZonedMetadata; use crate::layouts::zoned::aggregate_fns_from_specs; use crate::layouts::zoned::aggregate_stats_table_dtype; use crate::layouts::zoned::legacy_stats_table_dtype; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::request::ScanRequest; use crate::scan::v2::layouts::chunked as scan_chunked; use 
crate::scan::v2::layouts::dict as scan_dict; use crate::scan::v2::layouts::flat as scan_flat; diff --git a/vortex-layout/src/scan/mod.rs b/vortex-layout/src/scan/mod.rs index ab003641eb5..55c57d51089 100644 --- a/vortex-layout/src/scan/mod.rs +++ b/vortex-layout/src/scan/mod.rs @@ -5,6 +5,7 @@ pub mod arrow; mod filter; pub mod layout; pub mod multi; +pub mod plan; pub mod repeated_scan; pub mod scan_builder; pub mod split_by; diff --git a/vortex-scan/src/plan/evidence.rs b/vortex-layout/src/scan/plan/evidence.rs similarity index 100% rename from vortex-scan/src/plan/evidence.rs rename to vortex-layout/src/scan/plan/evidence.rs diff --git a/vortex-scan/src/plan/mod.rs b/vortex-layout/src/scan/plan/mod.rs similarity index 69% rename from vortex-scan/src/plan/mod.rs rename to vortex-layout/src/scan/plan/mod.rs index d1c1044b857..eff61c27c0e 100644 --- a/vortex-scan/src/plan/mod.rs +++ b/vortex-layout/src/scan/plan/mod.rs @@ -24,7 +24,6 @@ use std::sync::Arc; use std::sync::OnceLock; use futures::future::BoxFuture; -use futures::future::try_join_all; use parking_lot::Mutex; use rustc_hash::FxHashMap; use vortex_array::ArrayRef; @@ -34,6 +33,7 @@ use vortex_array::VortexSessionExecute; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::StructArray; +use vortex_array::buffer::BufferHandle; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::Field; use vortex_array::dtype::FieldNames; @@ -52,17 +52,21 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::Mask; +use vortex_scan::read::ReadRequestKey; +use vortex_scan::read::ReadResults; +use vortex_scan::read::ScanRead; use vortex_session::VortexSession; +use vortex_utils::aliases::hash_set::HashSet; use self::evidence::EvidenceFragment; use self::request::EvidenceRequest; use self::request::OwnedEvidenceRequest; -use crate::segments::ScanRead; use crate::segments::SegmentPlanCtx; +use 
crate::segments::SegmentRequestKey; use crate::segments::SegmentRequests; use crate::segments::SegmentSource; -/// Execution context for legacy prepared read calls. +/// Execution context for prepared scan tasks. #[derive(Clone)] pub struct ReadContext { segments: Arc, @@ -80,6 +84,11 @@ impl ReadContext { &self.segments } + /// Return a segment that was resolved by the scan scheduler before execution. + pub fn segment(&self, id: crate::segments::SegmentId) -> VortexResult { + self.segments.resolved(id) + } + /// Session used to decode arrays and execute expressions. pub fn session(&self) -> &VortexSession { &self.session @@ -499,6 +508,13 @@ struct LiteralPreparedStats { funcs: Vec, } +struct LiteralReadTask { + scalar: Scalar, + row_count: u64, + range: Range, + len: usize, +} + impl ScanPlan for LiteralScanPlan { fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(())) @@ -547,19 +563,6 @@ impl ScanPlan for LiteralScanPlan { } impl PreparedRead for LiteralPreparedRead { - fn read_scoped<'a>( - &'a self, - range: Range, - rows: RowScope<'a>, - _io: &'a ReadContext, - _local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - check_scan_range(&range, self.row_count)?; - Ok(ConstantArray::new(self.scalar.clone(), rows.selection.true_count()).into_array()) - }) - } - fn segment_requests( &self, _range: Range, @@ -569,11 +572,39 @@ impl PreparedRead for LiteralPreparedRead { Ok(SegmentRequests::none()) } + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + _reads: Vec, + _prefetch_reads: Vec, + _cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + check_scan_range(&range, self.row_count)?; + Ok(Box::new(LiteralReadTask { + scalar: self.scalar.clone(), + row_count: self.row_count, + range, + len: rows.selection.true_count(), + })) + } + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "literal") } } +impl ReadTask for LiteralReadTask { + fn into_step(self: Box) -> 
VortexResult { + Ok(ReadStep::new(Vec::new(), Vec::new(), move |_, _, _| { + check_scan_range(&self.range, self.row_count)?; + Ok(ReadTaskOutput::Ready( + ConstantArray::new(self.scalar.clone(), self.len).into_array(), + )) + })) + } +} + impl PreparedStats for LiteralPreparedStats { fn init_state(&self, _ctx: &VortexSession) -> VortexResult { Ok(Arc::new(())) @@ -634,21 +665,6 @@ impl LiteralPreparedStats { } } -/// Read every row in `range` through a prepared read. -pub fn read_dense<'a>( - read: &'a PreparedReadRef, - range: Range, - io: &'a ReadContext, -) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - let len = range_len(&range)?; - let rows = OwnedRowScope::selected(Mask::new_true(len)); - let mut local = io.session().create_execution_ctx(); - read.read_scoped(range, rows.as_scope(), io, &mut local) - .await - }) -} - fn range_len(range: &Range) -> VortexResult { let len = range .end @@ -672,19 +688,8 @@ fn check_scan_range(range: &Range, row_count: u64) -> VortexResult<()> { /// /// A `PreparedRead` is the scan-level runtime handle for a fixed read route. It /// may hold child prepared reads and initializes route-scoped state once per -/// prepared file scan; each `read_scoped` call executes that route for one -/// morsel row scope. +/// prepared file scan; each morsel execution is represented as a [`ReadTask`]. pub trait PreparedRead: 'static + Send + Sync { - /// Read the live rows of `range`, with [`RowScope`] defining output - /// cardinality (`selection`) and meaningful-value demand (`demand`). - fn read_scoped<'a>( - &'a self, - range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult>; - /// Return scheduler-visible segment requests needed for this read, when known exactly. 
fn segment_requests( &self, @@ -695,6 +700,26 @@ pub trait PreparedRead: 'static + Send + Sync { Ok(SegmentRequests::unknown()) } + /// Return scheduler-visible segment requests that may be fetched speculatively. + fn prefetch_segment_requests( + &self, + _range: Range, + _rows: RowScope<'_>, + _cx: &mut SegmentPlanCtx, + ) -> VortexResult { + Ok(SegmentRequests::none()) + } + + /// Create a morsel-level read task for this prepared read. + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult>; + /// Release state behind the completed-row frontier. fn release(&self, _frontier: u64) -> VortexResult<()> { Ok(()) @@ -706,63 +731,325 @@ pub trait PreparedRead: 'static + Send + Sync { } } -impl dyn PreparedRead { - /// Create a morsel-level read task for this prepared read. - pub fn create_task( - self: Arc, - range: Range, - rows: OwnedRowScope, - reads: Vec, - ) -> VortexResult> { - Ok(Box::new(DefaultReadTask { - read: self, - range, - rows, - reads, - })) +/// Result of executing a morsel-level read task continuation. +pub enum ReadTaskOutput { + /// The task produced its final array. + Ready(ArrayRef), + /// The task needs another scheduler-admitted read step. + Continue(Box), +} + +/// Continuation called after a read step's required reads have resolved. +pub trait ReadContinuation: Send { + /// Execute the continuation. + fn run( + self: Box, + io: &ReadContext, + local: &mut ExecutionCtx, + results: ReadResults, + ) -> VortexResult; +} + +impl ReadContinuation for F +where + F: FnOnce(&ReadContext, &mut ExecutionCtx, ReadResults) -> VortexResult + Send, +{ + fn run( + self: Box, + io: &ReadContext, + local: &mut ExecutionCtx, + results: ReadResults, + ) -> VortexResult { + self(io, local, results) + } +} + +/// One scheduler-visible step of a layout read task. +pub struct ReadStep { + /// Reads that must resolve before the continuation runs. 
+ pub required_reads: Vec, + /// Reads that may be fetched speculatively while this step is queued. + pub prefetch_reads: Vec, + /// Continuation to execute after required reads resolve. + pub continuation: Box, +} + +impl ReadStep { + /// Create a read step. + pub fn new( + required_reads: Vec, + prefetch_reads: Vec, + continuation: impl FnOnce( + &ReadContext, + &mut ExecutionCtx, + ReadResults, + ) -> VortexResult + + Send + + 'static, + ) -> Self { + Self { + required_reads, + prefetch_reads, + continuation: Box::new(continuation), + } } } /// A morsel-level read task. pub trait ReadTask: Send { - /// Registered reads needed by this task. - fn reads(&self) -> &[ScanRead]; + /// Convert this task into its next scheduler-visible step. + fn into_step(self: Box) -> VortexResult; +} + +pub(crate) fn take_reads_for_requests( + registered: &mut [Option], + requests: SegmentRequests, +) -> VortexResult> { + let Some(requests) = requests.into_exact() else { + vortex_bail!("scan2 child task produced unknown segment requests") + }; + let keys = requests + .iter() + .map(|request| ReadRequestKey::from(SegmentRequestKey::from(request))) + .collect::>(); + Ok(registered + .iter_mut() + .filter_map(|read| { + if read + .as_ref() + .is_some_and(|read| keys.contains(&read.request.key)) + { + read.take() + } else { + None + } + }) + .collect()) +} - /// Execute the read task. 
- fn read<'a>( - self: Box, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult>; +enum StructReadPart { + Ready(ArrayRef), + Pending(Box), } -struct DefaultReadTask { - read: PreparedReadRef, - range: Range, - rows: OwnedRowScope, - reads: Vec, +struct StructReadTask { + names: FieldNames, + len: usize, + fields: Vec, + validity: Option, } -impl ReadTask for DefaultReadTask { - fn reads(&self) -> &[ScanRead] { - &self.reads +impl ReadTask for StructReadTask { + fn into_step(self: Box) -> VortexResult { + let Self { + names, + len, + fields, + validity, + } = *self; + let mut field_steps = Vec::with_capacity(fields.len()); + let mut step_fields = Vec::with_capacity(fields.len()); + let mut required_reads = Vec::new(); + let mut prefetch_reads = Vec::new(); + for field in fields { + match field { + StructReadPart::Ready(array) => step_fields.push(StructReadPart::Ready(array)), + StructReadPart::Pending(task) => { + let step = task.into_step()?; + required_reads.extend(step.required_reads); + prefetch_reads.extend(step.prefetch_reads); + field_steps.push((step_fields.len(), step.continuation)); + step_fields.push(StructReadPart::Pending(Box::new(DeferredReadTask))); + } + } + } + let (validity_step, step_validity) = match validity { + Some(StructReadPart::Ready(array)) => (None, Some(StructReadPart::Ready(array))), + Some(StructReadPart::Pending(task)) => { + let step = task.into_step()?; + required_reads.extend(step.required_reads); + prefetch_reads.extend(step.prefetch_reads); + ( + Some(step.continuation), + Some(StructReadPart::Pending(Box::new(DeferredReadTask))), + ) + } + None => (None, None), + }; + Ok(ReadStep::new( + required_reads, + prefetch_reads, + move |io, local, results| { + let session = local.session().clone(); + let mut fields = step_fields; + let mut pending = false; + for (idx, continuation) in field_steps { + let mut child_ctx = session.create_execution_ctx(); + match continuation.run(io, &mut child_ctx, 
results.clone())? { + ReadTaskOutput::Ready(array) => fields[idx] = StructReadPart::Ready(array), + ReadTaskOutput::Continue(task) => { + fields[idx] = StructReadPart::Pending(task); + pending = true; + } + } + } + let mut next_validity = step_validity; + let validity = match (next_validity, validity_step) { + (Some(StructReadPart::Ready(array)), _) => { + next_validity = Some(StructReadPart::Ready(array.clone())); + Validity::Array(array) + } + (Some(StructReadPart::Pending(_)), Some(continuation)) => { + match continuation.run(io, local, results)? { + ReadTaskOutput::Ready(array) => { + next_validity = Some(StructReadPart::Ready(array.clone())); + Validity::Array(array) + } + ReadTaskOutput::Continue(task) => { + next_validity = Some(StructReadPart::Pending(task)); + pending = true; + Validity::NonNullable + } + } + } + (None, _) => { + next_validity = None; + Validity::NonNullable + } + (Some(StructReadPart::Pending(_)), None) => { + vortex_bail!("struct validity continuation missing") + } + }; + if pending { + return Ok(ReadTaskOutput::Continue(Box::new(StructReadTask { + names, + len, + fields, + validity: next_validity, + }))); + } + let arrays = fields + .into_iter() + .map(|field| match field { + StructReadPart::Ready(array) => Ok(array), + StructReadPart::Pending(_) => { + vortex_bail!("struct field continuation missing") + } + }) + .collect::>>()?; + Ok(ReadTaskOutput::Ready( + StructArray::try_new(names, arrays, len, validity)?.into_array(), + )) + }, + )) } +} - fn read<'a>( - self: Box, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - let Self { - read, - range, - rows, - reads, - } = *self; - let result = read.read_scoped(range, rows.as_scope(), io, local).await; - drop(reads); - result +pub(crate) struct DeferredReadTask; + +impl ReadTask for DeferredReadTask { + fn into_step(self: Box) -> VortexResult { + vortex_bail!("deferred read task should be replaced before stepping") + } +} + 
+struct ApplyReadTask { + expr: Expression, + input: Box, +} + +impl ReadTask for ApplyReadTask { + fn into_step(self: Box) -> VortexResult { + let Self { expr, input } = *self; + let step = input.into_step()?; + Ok(ReadStep::new( + step.required_reads, + step.prefetch_reads, + move |io, local, results| match step.continuation.run(io, local, results)? { + ReadTaskOutput::Ready(input) => Ok(ReadTaskOutput::Ready( + input.apply(&expr)?.execute::(local)?, + )), + ReadTaskOutput::Continue(input) => { + Ok(ReadTaskOutput::Continue(Box::new(ApplyReadTask { + expr, + input, + }))) + } + }, + )) + } +} + +struct MaskReadTask { + input: Box, + validity: Box, +} + +impl ReadTask for MaskReadTask { + fn into_step(self: Box) -> VortexResult { + let Self { input, validity } = *self; + let input_step = input.into_step()?; + let validity_step = validity.into_step()?; + let mut required_reads = input_step.required_reads; + required_reads.extend(validity_step.required_reads); + let mut prefetch_reads = input_step.prefetch_reads; + prefetch_reads.extend(validity_step.prefetch_reads); + Ok(ReadStep::new( + required_reads, + prefetch_reads, + move |io, local, results| { + let input = match input_step.continuation.run(io, local, results.clone())? { + ReadTaskOutput::Ready(input) => input, + ReadTaskOutput::Continue(input) => { + return Ok(ReadTaskOutput::Continue(Box::new(MaskReadTask { + input, + validity: Box::new(StepReadTask::new(validity_step.continuation)), + }))); + } + }; + let validity = match validity_step.continuation.run(io, local, results)? 
{ + ReadTaskOutput::Ready(validity) => validity, + ReadTaskOutput::Continue(validity) => { + return Ok(ReadTaskOutput::Continue(Box::new(MaskReadTask { + input: Box::new(ReadyReadTask(input)), + validity, + }))); + } + }; + Ok(ReadTaskOutput::Ready( + input.mask(validity)?.execute::(local)?, + )) + }, + )) + } +} + +struct ReadyReadTask(ArrayRef); + +impl ReadTask for ReadyReadTask { + fn into_step(self: Box) -> VortexResult { + Ok(ReadStep::new(Vec::new(), Vec::new(), move |_, _, _| { + Ok(ReadTaskOutput::Ready(self.0)) + })) + } +} + +struct StepReadTask { + continuation: Box, +} + +impl StepReadTask { + fn new(continuation: Box) -> Self { + Self { continuation } + } +} + +impl ReadTask for StepReadTask { + fn into_step(self: Box) -> VortexResult { + Ok(ReadStep { + required_reads: Vec::new(), + prefetch_reads: Vec::new(), + continuation: self.continuation, }) } } @@ -1002,38 +1289,6 @@ impl ScanPlan for StructValueScanPlan { } impl PreparedRead for StructValuePreparedRead { - fn read_scoped<'a>( - &'a self, - range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - let session = local.session().clone(); - let arrays = try_join_all(self.fields.iter().map(|field| { - let range = range.clone(); - let mut child_ctx = session.create_execution_ctx(); - async move { field.read_scoped(range, rows, io, &mut child_ctx).await } - })) - .await?; - let validity = match &self.validity { - Some(validity) => { - let array = validity.read_scoped(range, rows, io, local).await?; - Validity::Array(array) - } - None => Validity::NonNullable, - }; - Ok(StructArray::try_new( - self.plan.names.clone(), - arrays, - rows.selection.true_count(), - validity, - )? 
- .into_array()) - }) - } - fn segment_requests( &self, range: Range, @@ -1053,6 +1308,84 @@ impl PreparedRead for StructValuePreparedRead { Ok(requests) } + fn prefetch_segment_requests( + &self, + range: Range, + rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + let mut requests = SegmentRequests::none(); + for field in &self.fields { + requests.extend(field.prefetch_segment_requests(range.clone(), rows, cx)?); + if requests.is_unknown() { + return Ok(requests); + } + } + if let Some(validity) = &self.validity { + requests.extend(validity.prefetch_segment_requests(range, rows, cx)?); + } + Ok(requests) + } + + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + let mut reads = reads.into_iter().map(Some).collect::>(); + let mut prefetch_reads = prefetch_reads.into_iter().map(Some).collect::>(); + let mut fields = Vec::with_capacity(self.fields.len()); + for field in &self.fields { + let field_reads = take_reads_for_requests( + &mut reads, + field.segment_requests(range.clone(), rows.as_scope(), cx)?, + )?; + let field_prefetch_reads = take_reads_for_requests( + &mut prefetch_reads, + field.prefetch_segment_requests(range.clone(), rows.as_scope(), cx)?, + )?; + fields.push(StructReadPart::Pending(Arc::clone(field).create_task( + range.clone(), + rows.clone(), + field_reads, + field_prefetch_reads, + cx, + )?)); + } + let validity = self + .validity + .as_ref() + .map(|validity| { + let validity_reads = take_reads_for_requests( + &mut reads, + validity.segment_requests(range.clone(), rows.as_scope(), cx)?, + )?; + let validity_prefetch_reads = take_reads_for_requests( + &mut prefetch_reads, + validity.prefetch_segment_requests(range.clone(), rows.as_scope(), cx)?, + )?; + Arc::clone(validity) + .create_task( + range.clone(), + rows.clone(), + validity_reads, + validity_prefetch_reads, + cx, + ) + .map(StructReadPart::Pending) + }) + 
.transpose()?; + Ok(Box::new(StructReadTask { + names: self.plan.names.clone(), + len: rows.selection.true_count(), + fields, + validity, + })) + } + fn release(&self, frontier: u64) -> VortexResult<()> { for field in &self.fields { field.release(frontier)?; @@ -1122,26 +1455,37 @@ impl ScanPlan for ApplyScanPlan { } impl PreparedRead for ApplyPreparedRead { - fn read_scoped<'a>( - &'a self, + fn segment_requests( + &self, range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - let input = self.input.read_scoped(range, rows, io, local).await?; - input.apply(&self.plan.expr)?.execute::(local) - }) + rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + self.input.segment_requests(range, rows, cx) } - fn segment_requests( + fn prefetch_segment_requests( &self, range: Range, rows: RowScope<'_>, cx: &mut SegmentPlanCtx, ) -> VortexResult { - self.input.segment_requests(range, rows, cx) + self.input.prefetch_segment_requests(range, rows, cx) + } + + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + let input = Arc::clone(&self.input).create_task(range, rows, reads, prefetch_reads, cx)?; + Ok(Box::new(ApplyReadTask { + expr: self.plan.expr.clone(), + input, + })) } fn release(&self, frontier: u64) -> VortexResult<()> { @@ -1234,23 +1578,6 @@ impl ScanPlan for MaskScanPlan { } impl PreparedRead for MaskPreparedRead { - fn read_scoped<'a>( - &'a self, - range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - let input = self - .input - .read_scoped(range.clone(), rows, io, local) - .await?; - let validity = self.validity.read_scoped(range, rows, io, local).await?; - input.mask(validity)?.execute::(local) - }) - } - fn segment_requests( &self, range: Range, @@ 
-1265,6 +1592,69 @@ impl PreparedRead for MaskPreparedRead { Ok(requests) } + fn prefetch_segment_requests( + &self, + range: Range, + rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + let mut requests = self + .input + .prefetch_segment_requests(range.clone(), rows, cx)?; + if requests.is_unknown() { + return Ok(requests); + } + requests.extend(self.validity.prefetch_segment_requests(range, rows, cx)?); + Ok(requests) + } + + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + let mut reads = reads.into_iter().map(Some).collect::>(); + let mut prefetch_reads = prefetch_reads.into_iter().map(Some).collect::>(); + let input_reads = take_reads_for_requests( + &mut reads, + self.input + .segment_requests(range.clone(), rows.as_scope(), cx)?, + )?; + let input_prefetch_reads = take_reads_for_requests( + &mut prefetch_reads, + self.input + .prefetch_segment_requests(range.clone(), rows.as_scope(), cx)?, + )?; + let validity_reads = take_reads_for_requests( + &mut reads, + self.validity + .segment_requests(range.clone(), rows.as_scope(), cx)?, + )?; + let validity_prefetch_reads = take_reads_for_requests( + &mut prefetch_reads, + self.validity + .prefetch_segment_requests(range.clone(), rows.as_scope(), cx)?, + )?; + let input = Arc::clone(&self.input).create_task( + range.clone(), + rows.clone(), + input_reads, + input_prefetch_reads, + cx, + )?; + let validity = Arc::clone(&self.validity).create_task( + range, + rows, + validity_reads, + validity_prefetch_reads, + cx, + )?; + Ok(Box::new(MaskReadTask { input, validity })) + } + fn release(&self, frontier: u64) -> VortexResult<()> { self.input.release(frontier)?; self.validity.release(frontier) @@ -1327,7 +1717,7 @@ pub trait PreparedEvidence: 'static + Send + Sync { &'a self, req: &'a EvidenceRequest<'a>, io: &'a ReadContext, - ) -> BoxFuture<'a, VortexResult>>; + ) -> VortexResult>; /// Return 
scheduler-visible segment requests needed for this evidence, when known exactly. fn segment_requests( @@ -1338,6 +1728,18 @@ pub trait PreparedEvidence: 'static + Send + Sync { Ok(SegmentRequests::unknown()) } + /// Return scheduler-visible segment requests that may be fetched speculatively. + /// + /// Prefetch requests must not be required for the immediate [`PreparedEvidence::evidence`] + /// execution path, because the scan scheduler may launch them without waiting for completion. + fn prefetch_segment_requests( + &self, + _req: &EvidenceRequest<'_>, + _cx: &mut SegmentPlanCtx, + ) -> VortexResult { + Ok(SegmentRequests::none()) + } + /// Whether this handle is cheap enough to re-run immediately before a /// projection read when a dynamic predicate boundary changes while /// the morsel is in flight. @@ -1382,10 +1784,7 @@ pub trait EvidenceTask: Send { fn reads(&self) -> &[ScanRead]; /// Execute the evidence task. - fn evidence<'a>( - self: Box, - io: &'a ReadContext, - ) -> BoxFuture<'a, VortexResult>>; + fn evidence(self: Box, io: &ReadContext) -> VortexResult>; } struct DefaultEvidenceTask { @@ -1399,20 +1798,15 @@ impl EvidenceTask for DefaultEvidenceTask { &self.reads } - fn evidence<'a>( - self: Box, - io: &'a ReadContext, - ) -> BoxFuture<'a, VortexResult>> { - Box::pin(async move { - let Self { - evidence, - req, - reads, - } = *self; - let result = evidence.evidence(&req.as_request(), io).await; - drop(reads); - result - }) + fn evidence(self: Box, io: &ReadContext) -> VortexResult> { + let Self { + evidence, + req, + reads, + } = *self; + let result = evidence.evidence(&req.as_request(), io); + drop(reads); + result } } @@ -1439,6 +1833,7 @@ mod tests { use vortex_array::dtype::Nullability; use vortex_array::expr::lit; use vortex_buffer::ByteBuffer; + use vortex_scan::read::ReadStore; use super::*; @@ -1568,7 +1963,19 @@ mod tests { .prepare_read(&mut PrepareCtx::new(session.clone()))? 
.ok_or_else(|| vortex_err!("literal scan plan did not return a prepared read"))?; let io = ReadContext::new(Arc::new(TestSegments), session); - let array = futures::executor::block_on(read_dense(&read, 10..15, &io))?; + let rows = OwnedRowScope::selected(Mask::new_true(5)); + let mut segment_ctx = SegmentPlanCtx::new(Arc::clone(io.segments()), io.session().clone()); + let task = read.create_task(10..15, rows, Vec::new(), Vec::new(), &mut segment_ctx)?; + let results = ReadResults::new(Arc::new(ReadStore::new())); + let step = task.into_step()?; + if !step.required_reads.is_empty() || !step.prefetch_reads.is_empty() { + vortex_bail!("literal read unexpectedly requested reads"); + } + let mut local = io.session().create_execution_ctx(); + let array = match step.continuation.run(&io, &mut local, results)? { + ReadTaskOutput::Ready(array) => array, + ReadTaskOutput::Continue(_) => vortex_bail!("literal read unexpectedly continued"), + }; let constant = array .as_opt::() .ok_or_else(|| vortex_err!("literal read did not produce a constant array"))?; diff --git a/vortex-scan/src/plan/request.rs b/vortex-layout/src/scan/plan/request.rs similarity index 100% rename from vortex-scan/src/plan/request.rs rename to vortex-layout/src/scan/plan/request.rs diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index 04b03625e97..f60f3fb388d 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -27,7 +27,6 @@ use futures::future::BoxFuture; use parking_lot::Mutex; use rustc_hash::FxHashMap; use vortex_array::ArrayRef; -use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::arrays::ChunkedArray; @@ -40,35 +39,41 @@ use vortex_array::scalar::Scalar; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; -use vortex_scan::plan::AggregateAnswer; -use 
vortex_scan::plan::PrepareCtx; -use vortex_scan::plan::PreparedAggregate; -use vortex_scan::plan::PreparedAggregateRef; -use vortex_scan::plan::PreparedEvidence; -use vortex_scan::plan::PreparedEvidenceRef; -use vortex_scan::plan::PreparedRead; -use vortex_scan::plan::PreparedReadRef; -use vortex_scan::plan::PreparedStateCacheRef; -use vortex_scan::plan::PreparedStateKey; -use vortex_scan::plan::PushCtx; -use vortex_scan::plan::ReadContext; -use vortex_scan::plan::RowScope; -use vortex_scan::plan::ScanPlan; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::ScanState; -use vortex_scan::plan::ScanStateRef; -use vortex_scan::plan::StateCtx; -use vortex_scan::plan::default_try_push_expr; -use vortex_scan::plan::downcast_state; -use vortex_scan::plan::evidence::EvidenceFragment; -use vortex_scan::plan::request::EvidenceMode; -use vortex_scan::plan::request::EvidenceRequest; -use vortex_scan::plan::request::ScanRequest; +use vortex_scan::read::ScanRead; use vortex_session::VortexSession; use crate::layout_v2::Chunked; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; +use crate::scan::plan::AggregateAnswer; +use crate::scan::plan::OwnedRowScope; +use crate::scan::plan::PrepareCtx; +use crate::scan::plan::PreparedAggregate; +use crate::scan::plan::PreparedAggregateRef; +use crate::scan::plan::PreparedEvidence; +use crate::scan::plan::PreparedEvidenceRef; +use crate::scan::plan::PreparedRead; +use crate::scan::plan::PreparedReadRef; +use crate::scan::plan::PreparedStateCacheRef; +use crate::scan::plan::PreparedStateKey; +use crate::scan::plan::PushCtx; +use crate::scan::plan::ReadContext; +use crate::scan::plan::ReadStep; +use crate::scan::plan::ReadTask; +use crate::scan::plan::ReadTaskOutput; +use crate::scan::plan::RowScope; +use crate::scan::plan::ScanPlan; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::ScanState; +use crate::scan::plan::ScanStateRef; +use crate::scan::plan::StateCtx; +use crate::scan::plan::default_try_push_expr; 
+use crate::scan::plan::downcast_state; +use crate::scan::plan::evidence::EvidenceFragment; +use crate::scan::plan::request::EvidenceMode; +use crate::scan::plan::request::EvidenceRequest; +use crate::scan::plan::request::ScanRequest; +use crate::scan::plan::take_reads_for_requests; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; @@ -160,6 +165,91 @@ struct ChunkedExprPreparedRead { state: Arc, } +enum ChunkedReadPart { + Ready(ArrayRef), + Pending { + expected_len: usize, + task: Box, + }, +} + +struct ChunkedReadTask { + dtype: DType, + parts: Vec, +} + +impl ReadTask for ChunkedReadTask { + fn into_step(self: Box) -> VortexResult { + let Self { dtype, parts } = *self; + let mut step_parts = Vec::with_capacity(parts.len()); + let mut continuations = Vec::new(); + let mut required_reads = Vec::new(); + let mut prefetch_reads = Vec::new(); + for part in parts { + match part { + ChunkedReadPart::Ready(array) => step_parts.push(ChunkedReadPart::Ready(array)), + ChunkedReadPart::Pending { expected_len, task } => { + let step = task.into_step()?; + required_reads.extend(step.required_reads); + prefetch_reads.extend(step.prefetch_reads); + continuations.push((step_parts.len(), expected_len, step.continuation)); + step_parts.push(ChunkedReadPart::Pending { + expected_len, + task: Box::new(crate::scan::plan::DeferredReadTask), + }); + } + } + } + Ok(ReadStep::new( + required_reads, + prefetch_reads, + move |io, local, results| { + let mut parts = step_parts; + let mut pending = false; + for (idx, expected_len, continuation) in continuations { + match continuation.run(io, local, results.clone())? 
{ + ReadTaskOutput::Ready(chunk) => { + if chunk.len() != expected_len { + vortex_bail!( + "scoped chunk read returned length {}, expected {}", + chunk.len(), + expected_len + ); + } + parts[idx] = ChunkedReadPart::Ready(chunk); + } + ReadTaskOutput::Continue(task) => { + parts[idx] = ChunkedReadPart::Pending { expected_len, task }; + pending = true; + } + } + } + if pending { + return Ok(ReadTaskOutput::Continue(Box::new(ChunkedReadTask { + dtype, + parts, + }))); + } + let mut arrays = parts + .into_iter() + .map(|part| match part { + ChunkedReadPart::Ready(array) => Ok(array), + ChunkedReadPart::Pending { .. } => { + vortex_bail!("chunked read part remained pending after step completion") + } + }) + .collect::>>()?; + let array = match arrays.len() { + 0 => vortex_bail!("chunked scoped read produced no parts"), + 1 => arrays.swap_remove(0), + _ => ChunkedArray::try_new(arrays, dtype)?.into_array(), + }; + Ok(ReadTaskOutput::Ready(array)) + }, + )) + } +} + struct ChunkedEvidenceState { chunked: Arc, children: Mutex>>, @@ -550,104 +640,111 @@ impl ScanPlan for ChunkedScanPlan { } impl PreparedRead for ChunkedPreparedRead { - /// The chunked scoped read: slice the selection and demand per - /// overlapping chunk, skip chunks whose selection is all-false, and - /// represent selected-but-undemanded chunks with dtype-default filler - /// without expanding or reading the child. 
- fn read_scoped<'a>( - &'a self, + fn create_task( + self: Arc, range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local_ctx: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - if range.start >= range.end { - vortex_bail!("empty chunked scoped read range"); - } - #[cfg(debug_assertions)] - { - let released = self.state.released.load(Ordering::Relaxed); - debug_assert!( - range.start >= released, - "chunked read {range:?} below the released frontier {released}" - ); - } - let range_len = usize::try_from(range.end - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - if rows.selection.len() != range_len { - vortex_bail!( - "selection length {} does not match range length {range_len}", - rows.selection.len() - ); - } - if rows.demand.len() != range_len { - vortex_bail!( - "demand length {} does not match range length {range_len}", - rows.demand.len() - ); - } - if rows.selection.all_false() { - return Ok( + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + if range.start >= range.end { + vortex_bail!("empty chunked scoped read range"); + } + #[cfg(debug_assertions)] + { + let released = self.state.released.load(Ordering::Relaxed); + debug_assert!( + range.start >= released, + "chunked read {range:?} below the released frontier {released}" + ); + } + let range_len = usize::try_from(range.end - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let row_scope = rows.as_scope(); + if row_scope.selection.len() != range_len { + vortex_bail!( + "selection length {} does not match range length {range_len}", + row_scope.selection.len() + ); + } + if row_scope.demand.len() != range_len { + vortex_bail!( + "demand length {} does not match range length {range_len}", + row_scope.demand.len() + ); + } + if row_scope.selection.all_false() { + return Ok(Box::new(ChunkedReadTask { + dtype: self.node.layout.dtype().clone(), + parts: 
vec![ChunkedReadPart::Ready( ConstantArray::new(Scalar::default_value(self.node.layout.dtype()), 0) .into_array(), - ); - } + )], + })); + } - let dtype = self.node.layout.dtype().clone(); - let dense_scope = rows.selection.all_true() && rows.demand.all_true(); - let selected_scope = !dense_scope && rows.demands_all_selected(); - let mut parts = Vec::new(); - let mut idx = self.node.first_chunk(range.start); - while idx + 1 < self.node.offsets.len() && self.node.offsets[idx] < range.end { - let chunk_start = self.node.offsets[idx]; - let chunk_end = self.node.offsets[idx + 1]; - let local = range.start.saturating_sub(chunk_start) - ..(range.end.min(chunk_end) - chunk_start); - let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let chunk_selection = rows.selection.slice(sel_start..sel_end); - idx += 1; - if chunk_selection.all_false() { - continue; - } - let chunk_demand = rows.demand.slice(sel_start..sel_end); - if chunk_demand.all_false() { - parts.push( - ConstantArray::new( - Scalar::default_value(&dtype), - chunk_selection.true_count(), - ) - .into_array(), - ); - continue; - } - let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, &self.state, io.session())?; - let chunk = if dense_scope || selected_scope { - read.read_scoped(local, RowScope::selected(&chunk_selection), io, local_ctx) - .await? - } else { - let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.read_scoped(local, chunk_rows, io, local_ctx).await? 
- }; - if chunk.len() != chunk_selection.true_count() { - vortex_bail!( - "scoped chunk read returned length {}, expected {}", - chunk.len(), - chunk_selection.true_count() - ); - } - parts.push(chunk); + let dtype = self.node.layout.dtype().clone(); + let mut reads = reads.into_iter().map(Some).collect::>(); + let mut prefetch_reads = prefetch_reads.into_iter().map(Some).collect::>(); + let dense_scope = row_scope.selection.all_true() && row_scope.demand.all_true(); + let selected_scope = !dense_scope && row_scope.demands_all_selected(); + let mut parts = Vec::new(); + let mut idx = self.node.first_chunk(range.start); + while idx + 1 < self.node.offsets.len() && self.node.offsets[idx] < range.end { + let chunk_start = self.node.offsets[idx]; + let chunk_end = self.node.offsets[idx + 1]; + let local = + range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); + let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let chunk_selection = row_scope.selection.slice(sel_start..sel_end); + idx += 1; + if chunk_selection.all_false() { + continue; } - match parts.len() { - 0 => vortex_bail!("chunked scoped read range {range:?} out of bounds"), - 1 => Ok(parts.swap_remove(0)), - _ => Ok(ChunkedArray::try_new(parts, dtype)?.into_array()), + let chunk_demand = row_scope.demand.slice(sel_start..sel_end); + if chunk_demand.all_false() { + parts.push(ChunkedReadPart::Ready( + ConstantArray::new(Scalar::default_value(&dtype), chunk_selection.true_count()) + .into_array(), + )); + continue; } - }) + let chunk_idx = idx - 1; + let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; + let chunk_rows = if dense_scope || selected_scope { + OwnedRowScope::selected(chunk_selection.clone()) + } else { + 
OwnedRowScope::try_new(chunk_selection.clone(), chunk_demand)? + }; + let chunk_scope = chunk_rows.as_scope(); + let chunk_reads = take_reads_for_requests( + &mut reads, + read.segment_requests(local.clone(), chunk_scope, cx)?, + )?; + let chunk_prefetch_reads = take_reads_for_requests( + &mut prefetch_reads, + read.prefetch_segment_requests(local.clone(), chunk_scope, cx)?, + )?; + let expected_len = chunk_selection.true_count(); + parts.push(ChunkedReadPart::Pending { + expected_len, + task: Arc::clone(&read).create_task( + local, + chunk_rows, + chunk_reads, + chunk_prefetch_reads, + cx, + )?, + }); + } + match parts.len() { + 0 => vortex_bail!("chunked scoped read range {range:?} out of bounds"), + _ => Ok(Box::new(ChunkedReadTask { dtype, parts })), + } } fn segment_requests( @@ -728,6 +825,76 @@ impl PreparedRead for ChunkedPreparedRead { Ok(requests) } + fn prefetch_segment_requests( + &self, + range: Range, + rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + if range.start >= range.end { + vortex_bail!("empty chunked scoped read range"); + } + let range_len = usize::try_from(range.end - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + if rows.selection.len() != range_len { + vortex_bail!( + "selection length {} does not match range length {range_len}", + rows.selection.len() + ); + } + if rows.demand.len() != range_len { + vortex_bail!( + "demand length {} does not match range length {range_len}", + rows.demand.len() + ); + } + if rows.selection.all_false() { + return Ok(SegmentRequests::none()); + } + + let dense_scope = rows.selection.all_true() && rows.demand.all_true(); + let selected_scope = !dense_scope && rows.demands_all_selected(); + let mut requests = SegmentRequests::none(); + let mut saw_overlap = false; + let mut idx = self.node.first_chunk(range.start); + while idx + 1 < self.node.offsets.len() && self.node.offsets[idx] < range.end { + saw_overlap = true; + let chunk_start = self.node.offsets[idx]; 
+ let chunk_end = self.node.offsets[idx + 1]; + let local = + range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); + let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let chunk_selection = rows.selection.slice(sel_start..sel_end); + idx += 1; + if chunk_selection.all_false() { + continue; + } + let chunk_demand = rows.demand.slice(sel_start..sel_end); + if chunk_demand.all_false() { + continue; + } + let chunk_idx = idx - 1; + let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; + let chunk_requests = if dense_scope || selected_scope { + read.prefetch_segment_requests(local, RowScope::selected(&chunk_selection), cx)? + } else { + let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; + read.prefetch_segment_requests(local, chunk_rows, cx)? 
+ }; + requests.extend(chunk_requests); + if requests.is_unknown() { + return Ok(requests); + } + } + if !saw_overlap { + vortex_bail!("chunked scoped read range {range:?} out of bounds"); + } + Ok(requests) + } + fn release(&self, frontier: u64) -> VortexResult<()> { self.node.release(frontier, &self.state) } @@ -833,100 +1000,117 @@ impl ScanPlan for ChunkedExprScanPlan { } impl PreparedRead for ChunkedExprPreparedRead { - fn read_scoped<'a>( - &'a self, + fn create_task( + self: Arc, range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local_ctx: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - if range.start >= range.end { - vortex_bail!("empty chunked scoped read range"); - } - #[cfg(debug_assertions)] - { - let released = self.state.released.load(Ordering::Relaxed); - debug_assert!( - range.start >= released, - "chunked expression read {range:?} below the released frontier {released}" - ); - } - let range_len = usize::try_from(range.end - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - if rows.selection.len() != range_len { - vortex_bail!( - "selection length {} does not match range length {range_len}", - rows.selection.len() - ); - } - if rows.demand.len() != range_len { - vortex_bail!( - "demand length {} does not match range length {range_len}", - rows.demand.len() - ); - } - if rows.selection.all_false() { - return Ok( + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + if range.start >= range.end { + vortex_bail!("empty chunked scoped read range"); + } + #[cfg(debug_assertions)] + { + let released = self.state.released.load(Ordering::Relaxed); + debug_assert!( + range.start >= released, + "chunked expression read {range:?} below the released frontier {released}" + ); + } + let range_len = usize::try_from(range.end - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let row_scope = rows.as_scope(); + if 
row_scope.selection.len() != range_len { + vortex_bail!( + "selection length {} does not match range length {range_len}", + row_scope.selection.len() + ); + } + if row_scope.demand.len() != range_len { + vortex_bail!( + "demand length {} does not match range length {range_len}", + row_scope.demand.len() + ); + } + if row_scope.selection.all_false() { + return Ok(Box::new(ChunkedReadTask { + dtype: self.node.dtype.clone(), + parts: vec![ChunkedReadPart::Ready( ConstantArray::new(Scalar::default_value(&self.node.dtype), 0).into_array(), - ); - } + )], + })); + } - let dense_scope = rows.selection.all_true() && rows.demand.all_true(); - let selected_scope = !dense_scope && rows.demands_all_selected(); - let mut parts = Vec::new(); - let mut idx = self.node.chunked.first_chunk(range.start); - while idx + 1 < self.node.chunked.offsets.len() - && self.node.chunked.offsets[idx] < range.end - { - let chunk_start = self.node.chunked.offsets[idx]; - let chunk_end = self.node.chunked.offsets[idx + 1]; - let local = range.start.saturating_sub(chunk_start) - ..(range.end.min(chunk_end) - chunk_start); - let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let chunk_selection = rows.selection.slice(sel_start..sel_end); - idx += 1; - if chunk_selection.all_false() { - continue; - } - let chunk_demand = rows.demand.slice(sel_start..sel_end); - if chunk_demand.all_false() { - parts.push( - ConstantArray::new( - Scalar::default_value(&self.node.dtype), - chunk_selection.true_count(), - ) - .into_array(), - ); - continue; - } - let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, &self.state, io.session())?; - let chunk = if dense_scope || selected_scope { - read.read_scoped(local, RowScope::selected(&chunk_selection), io, local_ctx) - .await? 
- } else { - let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.read_scoped(local, chunk_rows, io, local_ctx).await? - }; - if chunk.len() != chunk_selection.true_count() { - vortex_bail!( - "scoped chunk read returned length {}, expected {}", - chunk.len(), - chunk_selection.true_count() - ); - } - parts.push(chunk); + let mut reads = reads.into_iter().map(Some).collect::>(); + let mut prefetch_reads = prefetch_reads.into_iter().map(Some).collect::>(); + let dense_scope = row_scope.selection.all_true() && row_scope.demand.all_true(); + let selected_scope = !dense_scope && row_scope.demands_all_selected(); + let mut parts = Vec::new(); + let mut idx = self.node.chunked.first_chunk(range.start); + while idx + 1 < self.node.chunked.offsets.len() + && self.node.chunked.offsets[idx] < range.end + { + let chunk_start = self.node.chunked.offsets[idx]; + let chunk_end = self.node.chunked.offsets[idx + 1]; + let local = + range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); + let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let chunk_selection = row_scope.selection.slice(sel_start..sel_end); + idx += 1; + if chunk_selection.all_false() { + continue; } - match parts.len() { - 0 => vortex_bail!("chunked scoped read range {range:?} out of bounds"), - 1 => Ok(parts.swap_remove(0)), - _ => Ok(ChunkedArray::try_new(parts, self.node.dtype.clone())?.into_array()), + let chunk_demand = row_scope.demand.slice(sel_start..sel_end); + if chunk_demand.all_false() { + parts.push(ChunkedReadPart::Ready( + ConstantArray::new( + Scalar::default_value(&self.node.dtype), + chunk_selection.true_count(), + ) + .into_array(), + )); + continue; } - }) + let chunk_idx = idx - 1; + let read = self.node.child_read(chunk_idx, 
&self.state, cx.session())?; + let chunk_rows = if dense_scope || selected_scope { + OwnedRowScope::selected(chunk_selection.clone()) + } else { + OwnedRowScope::try_new(chunk_selection.clone(), chunk_demand)? + }; + let chunk_scope = chunk_rows.as_scope(); + let chunk_reads = take_reads_for_requests( + &mut reads, + read.segment_requests(local.clone(), chunk_scope, cx)?, + )?; + let chunk_prefetch_reads = take_reads_for_requests( + &mut prefetch_reads, + read.prefetch_segment_requests(local.clone(), chunk_scope, cx)?, + )?; + let expected_len = chunk_selection.true_count(); + parts.push(ChunkedReadPart::Pending { + expected_len, + task: Arc::clone(&read).create_task( + local, + chunk_rows, + chunk_reads, + chunk_prefetch_reads, + cx, + )?, + }); + } + match parts.len() { + 0 => vortex_bail!("chunked scoped read range {range:?} out of bounds"), + _ => Ok(Box::new(ChunkedReadTask { + dtype: self.node.dtype.clone(), + parts, + })), + } } fn segment_requests( @@ -1009,6 +1193,78 @@ impl PreparedRead for ChunkedExprPreparedRead { Ok(requests) } + fn prefetch_segment_requests( + &self, + range: Range, + rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + if range.start >= range.end { + vortex_bail!("empty chunked scoped read range"); + } + let range_len = usize::try_from(range.end - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + if rows.selection.len() != range_len { + vortex_bail!( + "selection length {} does not match range length {range_len}", + rows.selection.len() + ); + } + if rows.demand.len() != range_len { + vortex_bail!( + "demand length {} does not match range length {range_len}", + rows.demand.len() + ); + } + if rows.selection.all_false() { + return Ok(SegmentRequests::none()); + } + + let dense_scope = rows.selection.all_true() && rows.demand.all_true(); + let selected_scope = !dense_scope && rows.demands_all_selected(); + let mut requests = SegmentRequests::none(); + let mut saw_overlap = false; + let mut idx = 
self.node.chunked.first_chunk(range.start); + while idx + 1 < self.node.chunked.offsets.len() + && self.node.chunked.offsets[idx] < range.end + { + saw_overlap = true; + let chunk_start = self.node.chunked.offsets[idx]; + let chunk_end = self.node.chunked.offsets[idx + 1]; + let local = + range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); + let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) + .map_err(|_| vortex_err!("read range exceeds usize"))?; + let chunk_selection = rows.selection.slice(sel_start..sel_end); + idx += 1; + if chunk_selection.all_false() { + continue; + } + let chunk_demand = rows.demand.slice(sel_start..sel_end); + if chunk_demand.all_false() { + continue; + } + let chunk_idx = idx - 1; + let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; + let chunk_requests = if dense_scope || selected_scope { + read.prefetch_segment_requests(local, RowScope::selected(&chunk_selection), cx)? + } else { + let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; + read.prefetch_segment_requests(local, chunk_rows, cx)? 
+ }; + requests.extend(chunk_requests); + if requests.is_unknown() { + return Ok(requests); + } + } + if !saw_overlap { + vortex_bail!("chunked scoped read range {range:?} out of bounds"); + } + Ok(requests) + } + fn release(&self, frontier: u64) -> VortexResult<()> { self.node.release(frontier, &self.state) } @@ -1023,65 +1279,63 @@ impl PreparedEvidence for ChunkedPreparedEvidence { &'a self, req: &'a EvidenceRequest<'a>, io: &'a ReadContext, - ) -> BoxFuture<'a, VortexResult>> { - Box::pin(async move { - if req.range.start >= req.range.end { - return Ok(Vec::new()); - } - let mut fragments = Vec::new(); - let mut idx = self.node.chunked.first_chunk(req.range.start); - while idx + 1 < self.node.chunked.offsets.len() - && self.node.chunked.offsets[idx] < req.range.end - { - let chunk_start = self.node.chunked.offsets[idx]; - let chunk_end = self.node.chunked.offsets[idx + 1]; - let local = req.range.start.saturating_sub(chunk_start) - ..(req.range.end.min(chunk_end) - chunk_start); - let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; - let child_plans = if let Some(hit) = self.state.children.lock().get(&idx) { + ) -> VortexResult> { + if req.range.start >= req.range.end { + return Ok(Vec::new()); + } + let mut fragments = Vec::new(); + let mut idx = self.node.chunked.first_chunk(req.range.start); + while idx + 1 < self.node.chunked.offsets.len() + && self.node.chunked.offsets[idx] < req.range.end + { + let chunk_start = self.node.chunked.offsets[idx]; + let chunk_end = self.node.chunked.offsets[idx + 1]; + let local = req.range.start.saturating_sub(chunk_start) + ..(req.range.end.min(chunk_end) - chunk_start); + let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; + let child_plans = if let Some(hit) = self.state.children.lock().get(&idx) { + hit.clone() + } else if recheck { + if let Some(hit) = self.state.recheck_children.lock().get(&idx) { hit.clone() - } else if recheck { - if let Some(hit) = 
self.state.recheck_children.lock().get(&idx) { - hit.clone() - } else { - let node = self.node.child(idx, io.session())?; - let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, io.session()); - let plans = node.prepare_evidence(&mut plan_ctx)?; - let planned = plans - .into_iter() - .filter(|plan| plan.recheck_before_projection()) - .collect::>(); - let mut children = self.state.recheck_children.lock(); - children.entry(idx).or_insert(planned).clone() - } } else { let node = self.node.child(idx, io.session())?; let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, io.session()); - let planned = node.prepare_evidence(&mut plan_ctx)?; - let mut children = self.state.children.lock(); + let plans = node.prepare_evidence(&mut plan_ctx)?; + let planned = plans + .into_iter() + .filter(|plan| plan.recheck_before_projection()) + .collect::>(); + let mut children = self.state.recheck_children.lock(); children.entry(idx).or_insert(planned).clone() + } + } else { + let node = self.node.child(idx, io.session())?; + let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, io.session()); + let planned = node.prepare_evidence(&mut plan_ctx)?; + let mut children = self.state.children.lock(); + children.entry(idx).or_insert(planned).clone() + }; + if !child_plans.is_empty() { + let child_req = EvidenceRequest { + id: req.id, + version: req.version, + predicate: req.predicate, + range: local, + mode: req.mode, }; - if !child_plans.is_empty() { - let child_req = EvidenceRequest { - id: req.id, - version: req.version, - predicate: req.predicate, - range: local, - mode: req.mode, - }; - for plan in child_plans { - if recheck && !plan.recheck_before_projection() { - continue; - } - for fragment in plan.evidence(&child_req, io).await? { - fragments.push(translate_fragment(fragment, chunk_start)); - } + for plan in child_plans { + if recheck && !plan.recheck_before_projection() { + continue; + } + for fragment in plan.evidence(&child_req, io)? 
{ + fragments.push(translate_fragment(fragment, chunk_start)); } } - idx += 1; } - Ok(fragments) - }) + idx += 1; + } + Ok(fragments) } fn segment_requests( @@ -1149,6 +1403,71 @@ impl PreparedEvidence for ChunkedPreparedEvidence { Ok(requests) } + fn prefetch_segment_requests( + &self, + req: &EvidenceRequest<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + if req.range.start >= req.range.end { + return Ok(SegmentRequests::none()); + } + + let mut requests = SegmentRequests::none(); + let mut idx = self.node.chunked.first_chunk(req.range.start); + while idx + 1 < self.node.chunked.offsets.len() + && self.node.chunked.offsets[idx] < req.range.end + { + let chunk_start = self.node.chunked.offsets[idx]; + let chunk_end = self.node.chunked.offsets[idx + 1]; + let local = req.range.start.saturating_sub(chunk_start) + ..(req.range.end.min(chunk_end) - chunk_start); + let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; + let child_plans = if let Some(hit) = self.state.children.lock().get(&idx) { + hit.clone() + } else if recheck { + if let Some(hit) = self.state.recheck_children.lock().get(&idx) { + hit.clone() + } else { + let node = self.node.child(idx, cx.session())?; + let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, cx.session()); + let plans = node.prepare_evidence(&mut plan_ctx)?; + let planned = plans + .into_iter() + .filter(|plan| plan.recheck_before_projection()) + .collect::>(); + let mut children = self.state.recheck_children.lock(); + children.entry(idx).or_insert(planned).clone() + } + } else { + let node = self.node.child(idx, cx.session())?; + let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, cx.session()); + let planned = node.prepare_evidence(&mut plan_ctx)?; + let mut children = self.state.children.lock(); + children.entry(idx).or_insert(planned).clone() + }; + if !child_plans.is_empty() { + let child_req = EvidenceRequest { + id: req.id, + version: req.version, + predicate: req.predicate, + range: local, + 
mode: req.mode, + }; + for plan in child_plans { + if recheck && !plan.recheck_before_projection() { + continue; + } + requests.extend(plan.prefetch_segment_requests(&child_req, cx)?); + if requests.is_unknown() { + return Ok(requests); + } + } + } + idx += 1; + } + Ok(requests) + } + fn recheck_before_projection(&self) -> bool { true } diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 1c001a84ca4..1b38ef8fd02 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -17,19 +17,14 @@ use std::fmt; use std::ops::Range; use std::sync::Arc; -use futures::FutureExt; -use futures::future::BoxFuture; -use futures::try_join; use parking_lot::Mutex; use rustc_hash::FxHashMap; use vortex_array::ArrayRef; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::DictArray; use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::SharedArray; use vortex_array::dtype::DType; use vortex_array::dtype::NativePType; use vortex_array::expr::Expression; @@ -38,31 +33,34 @@ use vortex_array::match_each_integer_ptype; use vortex_array::optimizer::ArrayOptimizer; use vortex_array::validity::Validity; use vortex_buffer::BufferMut; -use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::AllOr; use vortex_mask::Mask; -use vortex_scan::plan::PrepareCtx; -use vortex_scan::plan::PreparedRead; -use vortex_scan::plan::PreparedReadRef; -use vortex_scan::plan::PreparedStateKey; -use vortex_scan::plan::PushCtx; -use vortex_scan::plan::ReadContext; -use vortex_scan::plan::RowScope; -use vortex_scan::plan::ScanPlan; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::ScanState; -use vortex_scan::plan::ScanStateRef; -use vortex_scan::plan::StateCtx; -use 
vortex_scan::plan::default_try_push_expr; -use vortex_scan::plan::request::ScanRequest; +use vortex_scan::read::ScanRead; use vortex_session::VortexSession; use crate::layout_v2::Dict; use crate::layout_v2::Layout; -use crate::layouts::SharedArrayFuture; +use crate::scan::plan::OwnedRowScope; +use crate::scan::plan::PrepareCtx; +use crate::scan::plan::PreparedRead; +use crate::scan::plan::PreparedReadRef; +use crate::scan::plan::PreparedStateKey; +use crate::scan::plan::PushCtx; +use crate::scan::plan::ReadStep; +use crate::scan::plan::ReadTask; +use crate::scan::plan::ReadTaskOutput; +use crate::scan::plan::RowScope; +use crate::scan::plan::ScanPlan; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::ScanState; +use crate::scan::plan::ScanStateRef; +use crate::scan::plan::StateCtx; +use crate::scan::plan::default_try_push_expr; +use crate::scan::plan::request::ScanRequest; +use crate::scan::plan::take_reads_for_requests; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; @@ -93,8 +91,7 @@ pub struct DictScanPlan { codes: ScanPlanRef, } -/// Per-query dictionary caches: the shared values relation and cached -/// value-domain expression results. +/// Per-query dictionary caches for value-domain expression results. 
#[derive(Clone)] pub struct DictScanState { shared: DictSharedState, @@ -102,7 +99,6 @@ pub struct DictScanState { #[derive(Clone)] struct DictSharedState { - values: Arc>>, value_exprs: Arc>>>, } @@ -117,7 +113,6 @@ impl DictScanState { impl Default for DictSharedState { fn default() -> Self { Self { - values: Arc::new(Mutex::new(None)), value_exprs: Arc::new(Mutex::new(FxHashMap::default())), } } @@ -131,7 +126,6 @@ struct DictExprScanPlan { struct DictPreparedRead { node: Arc, - state: Arc, values_read: PreparedReadRef, codes_read: PreparedReadRef, } @@ -192,50 +186,6 @@ fn value_expr_candidate(expr: &Expression, values_len: u64, rows: RowScope<'_>) } impl DictScanPlan { - /// The values relation wrapped in a `SharedArray`, read once per query. - fn values( - &self, - values_read: PreparedReadRef, - io: &ReadContext, - state: &DictScanState, - ) -> SharedArrayFuture { - if let Some(hit) = state.shared.values.lock().clone() { - return hit; - } - - let mut guard = state.shared.values.lock(); - if let Some(hit) = guard.clone() { - return hit; - } - - let values_len = self.values_len; - let io = io.clone(); - let future = async move { - let selection = - Mask::new_true(usize::try_from(values_len).map_err(|_| { - Arc::new(vortex_err!("dictionary values length exceeds usize")) - })?); - let mut local = io.session().create_execution_ctx(); - let values = values_read - .read_scoped( - 0..values_len, - RowScope::selected(&selection), - &io, - &mut local, - ) - .await - .map_err(Arc::new)?; - // The shared future single-flights IO. `SharedArray` separately memoizes execution of - // the full dictionary values across batches; sparse selected reads bypass this path. - Ok(SharedArray::new(values).into_array()) - } - .boxed() - .shared(); - - *guard = Some(future.clone()); - future - } - fn build_dict(&self, codes: ArrayRef, values: ArrayRef) -> VortexResult { // SAFETY: the codes and values children come from a validated dictionary layout. 
Ok(unsafe { DictArray::new_unchecked(codes, values) }.into_array()) @@ -267,8 +217,6 @@ impl ScanPlan for DictScanPlan { } fn prepare_read(self: Arc, cx: &mut PrepareCtx) -> VortexResult> { - let key = PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); - let state = cx.shared_state(key, || Ok(DictScanState::new()))?; let values_read = Arc::clone(&self.values) .prepare_read(cx)? .ok_or_else(|| vortex_err!("dictionary values did not produce a prepared read"))?; @@ -277,16 +225,12 @@ impl ScanPlan for DictScanPlan { .ok_or_else(|| vortex_err!("dictionary codes did not produce a prepared read"))?; Ok(Some(Arc::new(DictPreparedRead { node: self, - state, values_read, codes_read, }))) } - /// Codes live in this node's row domain and release with it. The - /// cached values relation and value-domain expression results stay: - /// they are read once per query by design and consulted by every - /// remaining morsel. + /// Codes live in this node's row domain and release with it. fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { let _ = (frontier, state); Ok(()) @@ -339,81 +283,236 @@ impl ScanPlan for DictExprScanPlan { } } -impl PreparedRead for DictPreparedRead { - fn read_scoped<'a>( - &'a self, - range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - if sparse_dict_candidate(self.node.values_len, rows) { - let codes = self - .codes_read - .read_scoped(range.clone(), rows, io, local) - .await?; - let values_len = usize::try_from(self.node.values_len) - .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?; - if let Some((compact_codes, value_selection)) = - compact_codes_and_value_selection(codes.clone(), values_len, local)? 
- { - let values = self - .values_read - .read_scoped( - 0..self.node.values_len, - RowScope::selected(&value_selection), - io, - local, - ) - .await?; - return self.node.build_dict(compact_codes, values)?.optimize(); - } +enum DictReadState { + Start, + SparseValues { + compact_codes: ArrayRef, + values: Option>, + }, + FullValues { + codes: ArrayRef, + values: Option>, + }, +} - let values = self - .node - .values(Arc::clone(&self.values_read), io, &self.state) - .await - .map_err(VortexError::from)?; - return self.node.build_dict(codes, values)?.optimize(); +struct DictReadTask { + read: Arc, + codes: Box, + value_reads: Vec>, + cx: SegmentPlanCtx, + state: DictReadState, +} + +impl ReadTask for DictReadTask { + fn into_step(self: Box) -> VortexResult { + let task = *self; + match task.state { + DictReadState::Start => { + let DictReadTask { + read, + codes, + value_reads, + cx, + state: _, + } = task; + let codes_step = codes.into_step()?; + Ok(ReadStep::new( + codes_step.required_reads, + codes_step.prefetch_reads, + move |io, local, results| match codes_step + .continuation + .run(io, local, results)? + { + ReadTaskOutput::Continue(codes) => { + Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { + read, + codes, + value_reads, + cx, + state: DictReadState::Start, + }))) + } + ReadTaskOutput::Ready(codes) => { + let mut task = DictReadTask { + read, + codes: Box::new(crate::scan::plan::DeferredReadTask), + value_reads, + cx, + state: DictReadState::Start, + }; + let rows = OwnedRowScope::selected(Mask::new_true(codes.len())); + if sparse_dict_candidate(task.read.node.values_len, rows.as_scope()) { + let values_len = usize::try_from(task.read.node.values_len) + .map_err(|_| { + vortex_err!("dictionary values length exceeds usize") + })?; + if let Some((compact_codes, value_selection)) = + compact_codes_and_value_selection( + codes.clone(), + values_len, + local, + )? 
+ { + let values = task + .create_values_task(RowScope::selected(&value_selection))?; + task.state = DictReadState::SparseValues { + compact_codes, + values: Some(values), + }; + return Ok(ReadTaskOutput::Continue(Box::new(task))); + } + } + let values = task.create_full_values_task()?; + task.state = DictReadState::FullValues { + codes, + values: Some(values), + }; + Ok(ReadTaskOutput::Continue(Box::new(task))) + } + }, + )) + } + DictReadState::SparseValues { + compact_codes, + mut values, + } => { + let values_task = values.take().ok_or_else(|| { + vortex_err!("dictionary sparse values task was not initialized") + })?; + let values_step = values_task.into_step()?; + let read = task.read; + let value_reads = task.value_reads; + let cx = task.cx; + Ok(ReadStep::new( + values_step.required_reads, + values_step.prefetch_reads, + move |io, local, results| match values_step + .continuation + .run(io, local, results)? + { + ReadTaskOutput::Continue(values) => { + Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { + read, + codes: Box::new(crate::scan::plan::DeferredReadTask), + value_reads, + cx, + state: DictReadState::SparseValues { + compact_codes, + values: Some(values), + }, + }))) + } + ReadTaskOutput::Ready(values) => Ok(ReadTaskOutput::Ready( + read.node.build_dict(compact_codes, values)?.optimize()?, + )), + }, + )) } + DictReadState::FullValues { codes, mut values } => { + let values_task = values.take().ok_or_else(|| { + vortex_err!("dictionary full values task was not initialized") + })?; + let values_step = values_task.into_step()?; + let read = task.read; + let value_reads = task.value_reads; + let cx = task.cx; + Ok(ReadStep::new( + values_step.required_reads, + values_step.prefetch_reads, + move |io, local, results| match values_step + .continuation + .run(io, local, results)? 
+ { + ReadTaskOutput::Continue(values) => { + Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { + read, + codes: Box::new(crate::scan::plan::DeferredReadTask), + value_reads, + cx, + state: DictReadState::FullValues { + codes, + values: Some(values), + }, + }))) + } + ReadTaskOutput::Ready(values) => Ok(ReadTaskOutput::Ready( + read.node.build_dict(codes, values)?.optimize()?, + )), + }, + )) + } + } + } +} +impl DictReadTask { + fn create_values_task(&mut self, rows: RowScope<'_>) -> VortexResult> { + let range = 0..self.read.node.values_len; + let requests = self + .read + .values_read + .segment_requests(range.clone(), rows, &mut self.cx)?; + let reads = take_reads_for_requests(&mut self.value_reads, requests)?; + let owned_rows = OwnedRowScope::try_new(rows.selection.clone(), rows.demand.clone())?; + Arc::clone(&self.read.values_read).create_task( + range, + owned_rows, + reads, + Vec::new(), + &mut self.cx, + ) + } - let values = async { - self.node - .values(Arc::clone(&self.values_read), io, &self.state) - .await - .map_err(VortexError::from) - }; - let codes = self.codes_read.read_scoped(range, rows, io, local); - let (values, codes) = try_join!(values, codes)?; - self.node.build_dict(codes, values)?.optimize() - }) + fn create_full_values_task(&mut self) -> VortexResult> { + let values_selection = Mask::new_true( + usize::try_from(self.read.node.values_len) + .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, + ); + self.create_values_task(RowScope::selected(&values_selection)) } +} +impl PreparedRead for DictPreparedRead { fn segment_requests( &self, range: Range, rows: RowScope<'_>, cx: &mut SegmentPlanCtx, ) -> VortexResult { - if sparse_dict_candidate(self.node.values_len, rows) { - return self.codes_read.segment_requests(range, rows, cx); - } + self.codes_read.segment_requests(range, rows, cx) + } + fn prefetch_segment_requests( + &self, + _range: Range, + _rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { let 
values_selection = Mask::new_true( usize::try_from(self.node.values_len) .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, ); - let mut requests = self.values_read.segment_requests( + self.values_read.segment_requests( 0..self.node.values_len, RowScope::selected(&values_selection), cx, - )?; - if requests.is_unknown() { - return Ok(requests); - } - requests.extend(self.codes_read.segment_requests(range, rows, cx)?); - Ok(requests) + ) + } + + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + Ok(Box::new(DictReadTask { + codes: Arc::clone(&self.codes_read).create_task(range, rows, reads, Vec::new(), cx)?, + read: self, + value_reads: prefetch_reads.into_iter().map(Some).collect(), + cx: cx.clone(), + state: DictReadState::Start, + })) } fn release(&self, frontier: u64) -> VortexResult<()> { @@ -425,106 +524,6 @@ impl PreparedRead for DictPreparedRead { } } -impl DictExprPreparedRead { - async fn value_expr( - &self, - io: &ReadContext, - state: &DictScanState, - local: &mut ExecutionCtx, - ) -> VortexResult> { - if let Some(hit) = state - .shared - .value_exprs - .lock() - .get(&self.node.expr) - .cloned() - { - return Ok(hit); - } - let values = self - .node - .dict - .values(Arc::clone(&self.values_read), io, state) - .await - .map_err(VortexError::from)?; - let computed = values.apply(&self.node.expr).and_then(|array| { - match array.clone().execute::(local) { - Ok(mask) => { - let DType::Bool(nullability) = array.dtype() else { - return array.execute::(local); - }; - Ok( - BoolArray::new(mask.to_bit_buffer(), Validity::from(nullability)) - .into_array(), - ) - } - Err(_) => array.execute::(local), - } - }); - let value_expr = match computed { - Ok(array) => Some(array), - Err(error) => { - tracing::debug!( - predicate = %self.node.expr, - %error, - "dict value-domain expression read unavailable" - ); - None - } - }; - state - .shared - 
.value_exprs - .lock() - .insert(self.node.expr.clone(), value_expr.clone()); - Ok(value_expr) - } - - async fn sparse_expr( - &self, - codes: ArrayRef, - io: &ReadContext, - local: &mut ExecutionCtx, - ) -> VortexResult> { - let values_len = usize::try_from(self.node.dict.values_len) - .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?; - let Some((compact_codes, value_selection)) = - compact_codes_and_value_selection(codes, values_len, local)? - else { - return Ok(None); - }; - - let values = self - .values_read - .read_scoped( - 0..self.node.dict.values_len, - RowScope::selected(&value_selection), - io, - local, - ) - .await?; - let input = self - .node - .dict - .build_dict(compact_codes, values)? - .optimize()?; - let computed = input - .apply(&self.node.expr) - .and_then(|array| array.execute::(local)); - match computed { - Ok(array) => Ok(Some(array)), - Err(error) => { - tracing::debug!( - predicate = %self.node.expr, - %error, - "sparse dict expression read unavailable" - ); - Ok(None) - } - } - } -} - fn compact_codes_and_value_selection( codes: ArrayRef, values_len: usize, @@ -788,78 +787,372 @@ where }) } -impl PreparedRead for DictExprPreparedRead { - fn read_scoped<'a>( - &'a self, - range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - let sparse_candidate = - sparse_value_expr_candidate(&self.node.expr, self.node.dict.values_len, rows); - let value_candidate = - value_expr_candidate(&self.node.expr, self.node.dict.values_len, rows); - let value_expr = if value_candidate { - self.value_expr(io, &self.state, local).await? - } else { - None - }; - let codes = self - .codes_read - .read_scoped(range.clone(), rows, io, local) - .await?; - if let Some(value_expr) = value_expr { - let all_valid = !codes.dtype().is_nullable() - || codes - .validity()? - .execute_mask(codes.len(), local)? 
- .all_true(); - if all_valid { - return self.node.dict.build_dict(codes, value_expr); - } +enum DictExprReadState { + Start, + Values { + codes: ArrayRef, + values: Option>, + mode: DictExprValueMode, + }, +} + +enum DictExprValueMode { + Full { try_value_expr: bool }, + Sparse { compact_codes: ArrayRef }, +} + +struct DictExprReadTask { + read: Arc, + codes: Box, + value_reads: Vec>, + cx: SegmentPlanCtx, + state: DictExprReadState, +} + +impl ReadTask for DictExprReadTask { + fn into_step(self: Box) -> VortexResult { + let task = *self; + match task.state { + DictExprReadState::Start => { + let DictExprReadTask { + read, + codes, + value_reads, + cx, + state: _, + } = task; + let codes_step = codes.into_step()?; + Ok(ReadStep::new( + codes_step.required_reads, + codes_step.prefetch_reads, + move |io, local, results| match codes_step + .continuation + .run(io, local, results)? + { + ReadTaskOutput::Continue(codes) => { + Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { + read, + codes, + value_reads, + cx, + state: DictExprReadState::Start, + }))) + } + ReadTaskOutput::Ready(codes) => { + let mut task = DictExprReadTask { + read, + codes: Box::new(crate::scan::plan::DeferredReadTask), + value_reads, + cx, + state: DictExprReadState::Start, + }; + let selection = Mask::new_true(codes.len()); + let rows = RowScope::selected(&selection); + let sparse_candidate = sparse_value_expr_candidate( + &task.read.node.expr, + task.read.node.dict.values_len, + rows, + ); + let value_candidate = value_expr_candidate( + &task.read.node.expr, + task.read.node.dict.values_len, + rows, + ); + let all_valid = !codes.dtype().is_nullable() + || codes + .validity() + .and_then(|validity| validity.execute_mask(codes.len(), local))? 
+ .all_true(); + let mut try_value_expr = value_candidate && all_valid; + if try_value_expr { + let cached = task + .read + .state + .shared + .value_exprs + .lock() + .get(&task.read.node.expr) + .cloned(); + match cached { + Some(Some(value_expr)) => { + return Ok(ReadTaskOutput::Ready( + task.read.node.dict.build_dict(codes, value_expr)?, + )); + } + Some(None) => try_value_expr = false, + None => {} + } + } + if try_value_expr { + let values = task.create_full_values_task()?; + task.state = DictExprReadState::Values { + codes, + values: Some(values), + mode: DictExprValueMode::Full { + try_value_expr: true, + }, + }; + return Ok(ReadTaskOutput::Continue(Box::new(task))); + } + if sparse_candidate { + let values_len = usize::try_from(task.read.node.dict.values_len) + .map_err(|_| { + vortex_err!("dictionary values length exceeds usize") + })?; + if let Some((compact_codes, value_selection)) = + compact_codes_and_value_selection( + codes.clone(), + values_len, + local, + )? + { + let values = task + .create_values_task(RowScope::selected(&value_selection))?; + task.state = DictExprReadState::Values { + codes, + values: Some(values), + mode: DictExprValueMode::Sparse { compact_codes }, + }; + return Ok(ReadTaskOutput::Continue(Box::new(task))); + } + } + let values = task.create_full_values_task()?; + task.state = DictExprReadState::Values { + codes, + values: Some(values), + mode: DictExprValueMode::Full { + try_value_expr: false, + }, + }; + Ok(ReadTaskOutput::Continue(Box::new(task))) + } + }, + )) } - if sparse_candidate - && let Some(result) = self.sparse_expr(codes.clone(), io, local).await? 
- { - return Ok(result); + DictExprReadState::Values { + codes, + mut values, + mode, + } => { + let values_task = values.take().ok_or_else(|| { + vortex_err!("dictionary expression values task was not initialized") + })?; + let values_step = values_task.into_step()?; + let read = task.read; + let value_reads = task.value_reads; + let cx = task.cx; + Ok(ReadStep::new( + values_step.required_reads, + values_step.prefetch_reads, + move |io, local, results| match values_step + .continuation + .run(io, local, results)? + { + ReadTaskOutput::Continue(values) => { + Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { + read, + codes: Box::new(crate::scan::plan::DeferredReadTask), + value_reads, + cx, + state: DictExprReadState::Values { + codes, + values: Some(values), + mode, + }, + }))) + } + ReadTaskOutput::Ready(values_array) => finish_dict_expr_values( + read, + value_reads, + cx, + codes, + mode, + values_array, + local, + ), + }, + )) + } + } + } +} + +fn finish_dict_expr_values( + read: Arc, + mut value_reads: Vec>, + mut cx: SegmentPlanCtx, + codes: ArrayRef, + mode: DictExprValueMode, + values_array: ArrayRef, + local: &mut ExecutionCtx, +) -> VortexResult { + match mode { + DictExprValueMode::Full { try_value_expr } => { + if try_value_expr { + let value_expr = { + let mut value_exprs = read.state.shared.value_exprs.lock(); + if let Some(cached) = value_exprs.get(&read.node.expr).cloned() { + cached + } else { + let computed = values_array.clone().apply(&read.node.expr).and_then( + |array| match array.clone().execute::(local) { + Ok(mask) => { + let DType::Bool(nullability) = array.dtype() else { + return array.execute::(local); + }; + Ok(BoolArray::new( + mask.to_bit_buffer(), + Validity::from(nullability), + ) + .into_array()) + } + Err(_) => array.execute::(local), + }, + ); + let value_expr = match computed { + Ok(array) => Some(array), + Err(error) => { + tracing::debug!( + predicate = %read.node.expr, + %error, + "dict value-domain expression read 
unavailable" + ); + None + } + }; + value_exprs.insert(read.node.expr.clone(), value_expr.clone()); + value_expr + } + }; + if let Some(value_expr) = value_expr { + return Ok(ReadTaskOutput::Ready( + read.node.dict.build_dict(codes, value_expr)?, + )); + } } - let values = self + let input = read.node.dict.build_dict(codes, values_array)?.optimize()?; + Ok(ReadTaskOutput::Ready( + input.apply(&read.node.expr)?.execute::(local)?, + )) + } + DictExprValueMode::Sparse { compact_codes } => { + let input = read .node .dict - .values(Arc::clone(&self.values_read), io, &self.state) - .await - .map_err(VortexError::from)?; - let input = self.node.dict.build_dict(codes, values)?.optimize()?; - input.apply(&self.node.expr)?.execute::(local) - }) + .build_dict(compact_codes.clone(), values_array)? + .optimize()?; + let computed = input + .apply(&read.node.expr) + .and_then(|array| array.execute::(local)); + match computed { + Ok(array) => Ok(ReadTaskOutput::Ready(array)), + Err(error) => { + tracing::debug!( + predicate = %read.node.expr, + %error, + "sparse dict expression read unavailable" + ); + let full_values = DictExprReadTask::create_full_values_task_for( + &read, + &mut value_reads, + &mut cx, + )?; + Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { + read, + codes: Box::new(crate::scan::plan::DeferredReadTask), + value_reads, + cx, + state: DictExprReadState::Values { + codes, + values: Some(full_values), + mode: DictExprValueMode::Full { + try_value_expr: false, + }, + }, + }))) + } + } + } + } +} + +impl DictExprReadTask { + fn create_values_task(&mut self, rows: RowScope<'_>) -> VortexResult> { + Self::create_values_task_for(&self.read, &mut self.value_reads, &mut self.cx, rows) + } + + fn create_values_task_for( + read: &Arc, + value_reads: &mut [Option], + cx: &mut SegmentPlanCtx, + rows: RowScope<'_>, + ) -> VortexResult> { + let range = 0..read.node.dict.values_len; + let requests = read.values_read.segment_requests(range.clone(), rows, cx)?; + let 
reads = take_reads_for_requests(value_reads, requests)?; + let owned_rows = OwnedRowScope::try_new(rows.selection.clone(), rows.demand.clone())?; + Arc::clone(&read.values_read).create_task(range, owned_rows, reads, Vec::new(), cx) + } + + fn create_full_values_task(&mut self) -> VortexResult> { + Self::create_full_values_task_for(&self.read, &mut self.value_reads, &mut self.cx) + } + + fn create_full_values_task_for( + read: &Arc, + value_reads: &mut [Option], + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + let values_selection = Mask::new_true( + usize::try_from(read.node.dict.values_len) + .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, + ); + Self::create_values_task_for(read, value_reads, cx, RowScope::selected(&values_selection)) } +} +impl PreparedRead for DictExprPreparedRead { fn segment_requests( &self, range: Range, rows: RowScope<'_>, cx: &mut SegmentPlanCtx, ) -> VortexResult { - if sparse_value_expr_candidate(&self.node.expr, self.node.dict.values_len, rows) { - return self.codes_read.segment_requests(range, rows, cx); - } + self.codes_read.segment_requests(range, rows, cx) + } + fn prefetch_segment_requests( + &self, + _range: Range, + _rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { let values_selection = Mask::new_true( usize::try_from(self.node.dict.values_len) .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, ); - let mut requests = self.values_read.segment_requests( + self.values_read.segment_requests( 0..self.node.dict.values_len, RowScope::selected(&values_selection), cx, - )?; - if requests.is_unknown() { - return Ok(requests); - } - requests.extend(self.codes_read.segment_requests(range, rows, cx)?); - Ok(requests) + ) + } + + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + Ok(Box::new(DictExprReadTask { + codes: Arc::clone(&self.codes_read).create_task(range, rows, 
reads, Vec::new(), cx)?, + read: self, + value_reads: prefetch_reads.into_iter().map(Some).collect(), + cx: cx.clone(), + state: DictExprReadState::Start, + })) } fn release(&self, frontier: u64) -> VortexResult<()> { diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 46e12a58ef8..3199d2a5127 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -13,41 +13,41 @@ use std::fmt; use std::ops::Range; use std::sync::Arc; -use futures::FutureExt; -use futures::future::BoxFuture; use parking_lot::Mutex; use vortex_array::ArrayRef; -use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::SliceArray; use vortex_array::expr::Expression; use vortex_array::optimizer::ArrayOptimizer; use vortex_array::serde::SerializedArray; -use vortex_error::VortexError; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; -use vortex_scan::plan::PrepareCtx; -use vortex_scan::plan::PreparedRead; -use vortex_scan::plan::PreparedReadRef; -use vortex_scan::plan::PreparedStateKey; -use vortex_scan::plan::PushCtx; -use vortex_scan::plan::ReadContext; -use vortex_scan::plan::RowScope; -use vortex_scan::plan::ScanPlan; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::ScanState; -use vortex_scan::plan::ScanStateRef; -use vortex_scan::plan::StateCtx; -use vortex_scan::plan::default_try_push_expr; -use vortex_scan::plan::downcast_state; -use vortex_scan::plan::request::ScanRequest; +use vortex_scan::read::ScanRead; use vortex_session::VortexSession; use crate::layout_v2::Flat; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; -use crate::layouts::SharedArrayFuture; +use crate::scan::plan::OwnedRowScope; +use crate::scan::plan::PrepareCtx; +use crate::scan::plan::PreparedRead; +use crate::scan::plan::PreparedReadRef; +use crate::scan::plan::PreparedStateKey; +use crate::scan::plan::PushCtx; +use 
crate::scan::plan::ReadContext; +use crate::scan::plan::ReadStep; +use crate::scan::plan::ReadTask; +use crate::scan::plan::ReadTaskOutput; +use crate::scan::plan::RowScope; +use crate::scan::plan::ScanPlan; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::ScanState; +use crate::scan::plan::ScanStateRef; +use crate::scan::plan::StateCtx; +use crate::scan::plan::default_try_push_expr; +use crate::scan::plan::downcast_state; +use crate::scan::plan::request::ScanRequest; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; @@ -70,7 +70,7 @@ pub struct FlatScanPlan { /// Per-query cache of the parsed (still lazy) array. #[derive(Default)] pub struct FlatScanState { - array: Mutex>, + array: Mutex>, } struct FlatPreparedRead { @@ -78,24 +78,28 @@ struct FlatPreparedRead { state: Arc, } +struct FlatReadTask { + read: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, +} + impl FlatScanPlan { - fn array(&self, io: &ReadContext, state: &FlatScanState) -> SharedArrayFuture { + fn array(&self, io: &ReadContext, state: &FlatScanState) -> VortexResult { if let Some(hit) = state.array.lock().clone() { - return hit; + return Ok(hit); } let mut guard = state.array.lock(); if let Some(hit) = guard.clone() { - return hit; + return Ok(hit); } - let layout = self.layout.clone(); - let io = io.clone(); - let future = async move { decode_flat(&layout, &io).await.map_err(Arc::new) } - .boxed() - .shared(); - *guard = Some(future.clone()); - future + let array = decode_flat(&self.layout, io)?; + *guard = Some(array.clone()); + Ok(array) } } @@ -138,51 +142,12 @@ impl ScanPlan for FlatScanPlan { } impl PreparedRead for FlatPreparedRead { - fn read_scoped<'a>( - &'a self, - range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - _local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - let array = self - .node - .array(io, &self.state) - .await - .map_err(VortexError::from)?; - let dense 
= slice_to_range(array, &range)?; - if rows.selection.len() != dense.len() { - vortex_bail!( - "selection length {} does not match read range length {}", - rows.selection.len(), - dense.len() - ); - } - if rows.demand.len() != dense.len() { - vortex_bail!( - "demand length {} does not match read range length {}", - rows.demand.len(), - dense.len() - ); - } - if rows.selection.all_true() { - return Ok(dense); - } - dense.filter(rows.selection.clone()) - }) - } - fn segment_requests( &self, _range: Range, _rows: RowScope<'_>, cx: &mut SegmentPlanCtx, ) -> VortexResult { - if self.state.array.lock().is_some() { - return Ok(SegmentRequests::none()); - } - let Some(flat) = self.node.layout.as_opt::() else { vortex_bail!( "expected flat layout, got {}", @@ -194,6 +159,23 @@ impl PreparedRead for FlatPreparedRead { ])) } + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + _cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + Ok(Box::new(FlatReadTask { + read: self, + range, + rows, + reads, + prefetch_reads, + })) + } + fn release(&self, frontier: u64) -> VortexResult<()> { self.node.release(frontier, &self.state) } @@ -203,13 +185,48 @@ impl PreparedRead for FlatPreparedRead { } } -pub(crate) async fn decode_flat(layout: &LayoutRef, io: &ReadContext) -> VortexResult { +impl ReadTask for FlatReadTask { + fn into_step(self: Box) -> VortexResult { + let Self { + read, + range, + rows, + reads, + prefetch_reads, + } = *self; + Ok(ReadStep::new(reads, prefetch_reads, move |io, _, _| { + let array = read.node.array(io, &read.state)?; + let rows = rows.as_scope(); + let dense = slice_to_range(array, &range)?; + if rows.selection.len() != dense.len() { + vortex_bail!( + "selection length {} does not match read range length {}", + rows.selection.len(), + dense.len() + ); + } + if rows.demand.len() != dense.len() { + vortex_bail!( + "demand length {} does not match read range length {}", + rows.demand.len(), + dense.len() + ); 
+ } + if rows.selection.all_true() { + return Ok(ReadTaskOutput::Ready(dense)); + } + Ok(ReadTaskOutput::Ready(dense.filter(rows.selection.clone())?)) + })) + } +} + +pub(crate) fn decode_flat(layout: &LayoutRef, io: &ReadContext) -> VortexResult { let Some(flat) = layout.as_opt::() else { vortex_bail!("expected flat layout, got {}", layout.encoding_id()); }; let row_count = usize::try_from(layout.row_count()) .map_err(|_| vortex_err!("layout row count exceeds usize"))?; - let segment = io.segments().request(flat.data().segment_id()).await?; + let segment = io.segment(flat.data().segment_id())?; let parts = if let Some(tree) = flat.data().array_tree() { SerializedArray::from_flatbuffer_and_segment(tree.clone(), segment)? } else { diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index dd0d0201985..2f1dd3e5773 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -25,23 +25,23 @@ use vortex_array::scalar_fn::fns::root::Root; use vortex_array::scalar_fn::fns::select::Select; use vortex_error::VortexResult; use vortex_error::vortex_bail; -use vortex_scan::plan::ApplyScanPlan; -use vortex_scan::plan::MaskScanPlan; -use vortex_scan::plan::PrepareCtx; -use vortex_scan::plan::PreparedReadRef; -use vortex_scan::plan::PushCtx; -use vortex_scan::plan::ScanPlan; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::ScanStateRef; -use vortex_scan::plan::StateCtx; -use vortex_scan::plan::StructValueScanPlan; -use vortex_scan::plan::literal_scan_plan; -use vortex_scan::plan::request::ScanRequest; use vortex_session::VortexSession; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::layout_v2::Struct; +use crate::scan::plan::ApplyScanPlan; +use crate::scan::plan::MaskScanPlan; +use crate::scan::plan::PrepareCtx; +use crate::scan::plan::PreparedReadRef; +use crate::scan::plan::PushCtx; +use crate::scan::plan::ScanPlan; +use 
crate::scan::plan::ScanPlanRef; +use crate::scan::plan::ScanStateRef; +use crate::scan::plan::StateCtx; +use crate::scan::plan::StructValueScanPlan; +use crate::scan::plan::literal_scan_plan; +use crate::scan::plan::request::ScanRequest; use crate::scan::v2::referenced_fields; use crate::scan::v2::struct_fields; diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index bdf1fbe04fe..3da60ca85e6 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -40,34 +40,12 @@ use vortex_array::expr::root; use vortex_array::expr::stats::Stat; use vortex_array::scalar::Scalar; use vortex_error::VortexResult; +use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::Mask; -use vortex_scan::plan::AggregateAnswer; -use vortex_scan::plan::EvidenceCost; -use vortex_scan::plan::EvidenceScope; -use vortex_scan::plan::PrepareCtx; -use vortex_scan::plan::PreparedAggregate; -use vortex_scan::plan::PreparedAggregateRef; -use vortex_scan::plan::PreparedEvidence; -use vortex_scan::plan::PreparedEvidenceRef; -use vortex_scan::plan::PreparedRead; -use vortex_scan::plan::PreparedReadRef; -use vortex_scan::plan::PreparedStateKey; -use vortex_scan::plan::PushCtx; -use vortex_scan::plan::ReadContext; -use vortex_scan::plan::RowScope; -use vortex_scan::plan::ScanPlan; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::ScanState; -use vortex_scan::plan::ScanStateRef; -use vortex_scan::plan::StateCtx; -use vortex_scan::plan::default_try_push_expr; -use vortex_scan::plan::downcast_state; -use vortex_scan::plan::evidence::EvidenceFragment; -use vortex_scan::plan::evidence::PredicateEvidenceKind; -use vortex_scan::plan::read_dense; -use vortex_scan::plan::request::EvidenceRequest; -use vortex_scan::plan::request::ScanRequest; +use vortex_scan::read::ReadResults; +use vortex_scan::read::ReadStore; +use vortex_scan::read::ScanRead; use vortex_session::VortexSession; use 
crate::layout_v2::Layout; @@ -77,8 +55,38 @@ use crate::layouts::zoned::MAX_IS_TRUNCATED; use crate::layouts::zoned::MIN_IS_TRUNCATED; use crate::layouts::zoned::ZoneMapSchema; use crate::layouts::zoned::zone_map::ZoneMap; +use crate::scan::plan::AggregateAnswer; +use crate::scan::plan::EvidenceCost; +use crate::scan::plan::EvidenceScope; +use crate::scan::plan::OwnedRowScope; +use crate::scan::plan::PrepareCtx; +use crate::scan::plan::PreparedAggregate; +use crate::scan::plan::PreparedAggregateRef; +use crate::scan::plan::PreparedEvidence; +use crate::scan::plan::PreparedEvidenceRef; +use crate::scan::plan::PreparedRead; +use crate::scan::plan::PreparedReadRef; +use crate::scan::plan::PreparedStateKey; +use crate::scan::plan::PushCtx; +use crate::scan::plan::ReadContext; +use crate::scan::plan::ReadTask; +use crate::scan::plan::ReadTaskOutput; +use crate::scan::plan::RowScope; +use crate::scan::plan::ScanPlan; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::ScanState; +use crate::scan::plan::ScanStateRef; +use crate::scan::plan::StateCtx; +use crate::scan::plan::default_try_push_expr; +use crate::scan::plan::downcast_state; +use crate::scan::plan::evidence::EvidenceFragment; +use crate::scan::plan::evidence::PredicateEvidenceKind; +use crate::scan::plan::request::EvidenceRequest; +use crate::scan::plan::request::ScanRequest; +use crate::segments::SegmentFutureCache; use crate::segments::SegmentPlanCtx; use crate::segments::SegmentRequests; +use crate::segments::register_segment_reads_cached; pub(crate) fn new_scan_plan>( layout: Layout, @@ -232,6 +240,60 @@ impl ZonedScanState { } } +fn read_zones_child( + zones_read: &PreparedReadRef, + nzones: u64, + io: &ReadContext, +) -> VortexResult { + let len = usize::try_from(nzones).map_err(|_| vortex_err!("zone count exceeds usize"))?; + let rows = OwnedRowScope::selected(Mask::new_true(len)); + let mut segment_ctx = SegmentPlanCtx::new(Arc::clone(io.segments()), io.session().clone()); + let requests = 
zones_read.segment_requests(0..nzones, rows.as_scope(), &mut segment_ctx)?; + if requests.is_unknown() { + vortex_bail!("zoned stats child produced unknown segment requests") + } + let cache = SegmentFutureCache::new(); + let reads = register_segment_reads_cached(&cache, io.segments().as_ref(), requests); + let prefetch_requests = + zones_read.prefetch_segment_requests(0..nzones, rows.as_scope(), &mut segment_ctx)?; + let prefetch_reads = if prefetch_requests.is_unknown() { + Vec::new() + } else { + register_segment_reads_cached(&cache, io.segments().as_ref(), prefetch_requests) + }; + let mut task = Arc::clone(zones_read).create_task( + 0..nzones, + rows, + reads, + prefetch_reads, + &mut segment_ctx, + )?; + let read_store = Arc::new(ReadStore::new()); + loop { + let step = task.into_step()?; + resolve_zoned_reads(Arc::clone(&read_store), step.required_reads)?; + resolve_zoned_reads(Arc::clone(&read_store), step.prefetch_reads)?; + let mut local = io.session().create_execution_ctx(); + match step + .continuation + .run(io, &mut local, ReadResults::new(Arc::clone(&read_store)))? + { + ReadTaskOutput::Ready(array) => return Ok(array), + ReadTaskOutput::Continue(next) => task = next, + } + } +} + +fn resolve_zoned_reads(read_store: Arc, reads: Vec) -> VortexResult<()> { + for read in reads { + if read_store.get(read.request.key).is_none() { + let buffer = futures::executor::block_on(read.future)?; + read_store.insert(read.request.key, buffer); + } + } + Ok(()) +} + impl ZonedScanPlan { fn shared_zone_state(&self, cx: &mut PrepareCtx) -> VortexResult> { let key = @@ -255,7 +317,7 @@ impl ZonedScanPlan { /// The decoded per-zone stats table, read once per query. Concurrent /// decodes are benign (the segment fetch is shared; last-write-wins). 
- async fn table( + fn table( &self, zones_read: &PreparedReadRef, io: &ReadContext, @@ -264,7 +326,7 @@ impl ZonedScanPlan { if let Some(hit) = state.table.lock().clone() { return Ok(hit); } - let zones = read_dense(zones_read, 0..self.nzones, io).await?; + let zones = read_zones_child(zones_read, self.nzones, io)?; let mut ctx = io.session().create_execution_ctx(); let table = Arc::new(zones.execute::(&mut ctx)?); *state.table.lock() = Some(Arc::clone(&table)); @@ -274,7 +336,7 @@ impl ZonedScanPlan { /// One stat's per-zone column prepared for aggregation, built once /// per query directly over the stats table: the values, their /// validity, and (for min/max) the truncation flags. - async fn stat_column( + fn stat_column( &self, stat: Stat, zones_read: &PreparedReadRef, @@ -284,7 +346,7 @@ impl ZonedScanPlan { if let Some(hit) = state.stat_columns.lock().get(&stat) { return Ok(hit.clone()); } - let table = self.table(zones_read, io, state).await?; + let table = self.table(zones_read, io, state)?; let mut ctx = io.session().create_execution_ctx(); let column = match table.unmasked_field_by_name_opt(stat.name()) { None => None, @@ -317,7 +379,7 @@ impl ZonedScanPlan { /// statistics: a partial for the answerable interior zones, residual /// spans for edge fragments and unanswerable zones. `None`: nothing /// covered, the caller owns the whole range. - async fn aggregate_one( + fn aggregate_one( &self, span: &ZoneSpan, func: &AggregateFnRef, @@ -346,7 +408,7 @@ impl ZonedScanPlan { let Some(partial_dtype) = func.state_dtype(&self.column_dtype) else { return Ok(None); }; - let Some(col) = self.stat_column(stat, zones_read, io, state).await? else { + let Some(col) = self.stat_column(stat, zones_read, io, state)? else { return Ok(None); }; @@ -397,10 +459,7 @@ impl ZonedScanPlan { // to the caller. let null_counts = match stat { Stat::NullCount => Some(Arc::clone(&col)), - _ => { - self.stat_column(Stat::NullCount, zones_read, io, state) - .await? 
- } + _ => self.stat_column(Stat::NullCount, zones_read, io, state)?, }; let zone_nulls = |zone: usize, ctx: &mut ExecutionCtx| -> VortexResult> { match &null_counts { @@ -503,10 +562,7 @@ impl ZonedScanPlan { let mut answers = Vec::with_capacity(funcs.len()); let mut covered_any = false; for func in funcs { - match self - .aggregate_one(&span, func, zones_read, io, state, &mut ctx) - .await? - { + match self.aggregate_one(&span, func, zones_read, io, state, &mut ctx)? { Some(answer) => { covered_any = true; answers.push(answer); @@ -528,30 +584,22 @@ impl ZonedScanPlan { } impl ZonedPreparedEvidence { - async fn table( - &self, - io: &ReadContext, - state: &ZonedScanState, - ) -> VortexResult> { + fn table(&self, io: &ReadContext, state: &ZonedScanState) -> VortexResult> { if let Some(hit) = state.table.lock().clone() { return Ok(hit); } - let zones = read_dense(&self.zones_read, 0..self.nzones, io).await?; + let zones = read_zones_child(&self.zones_read, self.nzones, io)?; let mut ctx = io.session().create_execution_ctx(); let table = Arc::new(zones.execute::(&mut ctx)?); *state.table.lock() = Some(Arc::clone(&table)); Ok(table) } - async fn zone_map( - &self, - io: &ReadContext, - state: &ZonedScanState, - ) -> VortexResult> { + fn zone_map(&self, io: &ReadContext, state: &ZonedScanState) -> VortexResult> { if let Some(hit) = state.zone_map.lock().clone() { return Ok(hit); } - let table = self.table(io, state).await?; + let table = self.table(io, state)?; let zone_map = match &self.zone_map_schema { ZoneMapSchema::AggregateFns(_) => ZoneMap::try_new( self.column_dtype.clone(), @@ -579,7 +627,7 @@ impl ZonedPreparedEvidence { Ok(zone_map) } - async fn predicate_masks( + fn predicate_masks( &self, io: &ReadContext, state: &ZonedScanState, @@ -587,7 +635,7 @@ impl ZonedPreparedEvidence { if let Some(hit) = state.masks.lock().get(&self.predicate) { return Ok(Arc::clone(hit)); } - let zone_map = self.zone_map(io, state).await?; + let zone_map = self.zone_map(io, 
state)?; let session = io.session(); let all_false = self .falsifier @@ -631,41 +679,39 @@ impl PreparedEvidence for ZonedPreparedEvidence { &'a self, req: &'a EvidenceRequest<'a>, io: &'a ReadContext, - ) -> BoxFuture<'a, VortexResult>> { - Box::pin(async move { - let mut fragments = Vec::new(); - if self.zone_len > 0 && (self.falsifier.is_some() || self.satisfier.is_some()) { - let masks = self.predicate_masks(io, &self.state).await?; - let zones = self.zone_range(&req.range); - let mut run: Option<(Range, bool)> = None; - for zone in zones { - let all_false = masks - .all_false - .as_ref() - .is_some_and(|mask| mask.value(zone)); - let all_true = - !all_false && masks.all_true.as_ref().is_some_and(|mask| mask.value(zone)); - let span = self.zone_span(zone); - match (&mut run, all_false || all_true) { - (Some((rows, false_run)), true) if *false_run == all_false => { - rows.end = span.end; + ) -> VortexResult> { + let mut fragments = Vec::new(); + if self.zone_len > 0 && (self.falsifier.is_some() || self.satisfier.is_some()) { + let masks = self.predicate_masks(io, &self.state)?; + let zones = self.zone_range(&req.range); + let mut run: Option<(Range, bool)> = None; + for zone in zones { + let all_false = masks + .all_false + .as_ref() + .is_some_and(|mask| mask.value(zone)); + let all_true = + !all_false && masks.all_true.as_ref().is_some_and(|mask| mask.value(zone)); + let span = self.zone_span(zone); + match (&mut run, all_false || all_true) { + (Some((rows, false_run)), true) if *false_run == all_false => { + rows.end = span.end; + } + (current, proven) => { + if let Some((rows, false_run)) = current.take() { + fragments.push(fragment(rows, false_run)); } - (current, proven) => { - if let Some((rows, false_run)) = current.take() { - fragments.push(fragment(rows, false_run)); - } - if proven { - *current = Some((span, all_false)); - } + if proven { + *current = Some((span, all_false)); } } } - if let Some((rows, false_run)) = run { - 
fragments.push(fragment(rows, false_run)); - } } - Ok(fragments) - }) + if let Some((rows, false_run)) = run { + fragments.push(fragment(rows, false_run)); + } + } + Ok(fragments) } fn segment_requests( @@ -830,23 +876,33 @@ impl ScanPlan for ZonedScanPlan { } impl PreparedRead for ZonedPreparedRead { - fn read_scoped<'a>( - &'a self, + fn segment_requests( + &self, range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - self.data.read_scoped(range, rows, io, local) + rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + self.data.segment_requests(range, rows, cx) } - fn segment_requests( + fn prefetch_segment_requests( &self, range: Range, rows: RowScope<'_>, cx: &mut SegmentPlanCtx, ) -> VortexResult { - self.data.segment_requests(range, rows, cx) + self.data.prefetch_segment_requests(range, rows, cx) + } + + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + Arc::clone(&self.data).create_task(range, rows, reads, prefetch_reads, cx) } fn release(&self, frontier: u64) -> VortexResult<()> { @@ -948,23 +1004,33 @@ impl ScanPlan for ZonedExprScanPlan { } impl PreparedRead for ZonedExprPreparedRead { - fn read_scoped<'a>( - &'a self, + fn segment_requests( + &self, range: Range, - rows: RowScope<'a>, - io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - self.data.read_scoped(range, rows, io, local) + rows: RowScope<'_>, + cx: &mut SegmentPlanCtx, + ) -> VortexResult { + self.data.segment_requests(range, rows, cx) } - fn segment_requests( + fn prefetch_segment_requests( &self, range: Range, rows: RowScope<'_>, cx: &mut SegmentPlanCtx, ) -> VortexResult { - self.data.segment_requests(range, rows, cx) + self.data.prefetch_segment_requests(range, rows, cx) + } + + fn create_task( + self: Arc, + range: Range, + rows: OwnedRowScope, + reads: 
Vec, + prefetch_reads: Vec, + cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + Arc::clone(&self.data).create_task(range, rows, reads, prefetch_reads, cx) } fn release(&self, frontier: u64) -> VortexResult<()> { diff --git a/vortex-layout/src/scan/v2/mod.rs b/vortex-layout/src/scan/v2/mod.rs index 0cbb894be2c..6660ce657b9 100644 --- a/vortex-layout/src/scan/v2/mod.rs +++ b/vortex-layout/src/scan/v2/mod.rs @@ -4,7 +4,7 @@ //! Scan2 layout plan machinery. //! //! This module contains the layout-tree expansion vtables and executable -//! [`ScanPlan`](vortex_scan::plan::ScanPlan) plans used by the alternate scan implementation. +//! [`ScanPlan`](crate::scan::plan::ScanPlan) plans used by the alternate scan implementation. pub mod session; @@ -21,8 +21,9 @@ use vortex_array::scalar_fn::fns::binary::Binary; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; -pub use vortex_scan::plan::evidence; -pub use vortex_scan::plan::request; + +pub use crate::scan::plan::evidence; +pub use crate::scan::plan::request; /// Environment variable selecting the file scan implementation. /// @@ -31,7 +32,7 @@ pub use vortex_scan::plan::request; /// - `v1`, `scan`, `scan_builder`, `scan-builder`, `layout-reader`, or unset: use the /// existing LayoutReader-based scan. /// - `v2` or `scan2`: use the scan2 -/// [`ScanPlan`](vortex_scan::plan::ScanPlan) implementation. +/// [`ScanPlan`](crate::scan::plan::ScanPlan) implementation. pub const SCAN_IMPL_ENV: &str = "VORTEX_SCAN_IMPL"; /// Returns whether the scan2 implementation should be used by scan data sources. 
diff --git a/vortex-layout/src/scan/v2/row_idx.rs b/vortex-layout/src/scan/v2/row_idx.rs index f859d8656d1..7ab8acda574 100644 --- a/vortex-layout/src/scan/v2/row_idx.rs +++ b/vortex-layout/src/scan/v2/row_idx.rs @@ -5,9 +5,7 @@ use std::fmt; use std::ops::Range; use std::sync::Arc; -use futures::future::BoxFuture; use vortex_array::ArrayRef; -use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::dtype::DType; use vortex_array::dtype::FieldName; @@ -23,26 +21,31 @@ use vortex_array::scalar::PValue; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; -use vortex_scan::plan::ApplyScanPlan; -use vortex_scan::plan::PrepareCtx; -use vortex_scan::plan::PreparedRead; -use vortex_scan::plan::PreparedReadRef; -use vortex_scan::plan::PushCtx; -use vortex_scan::plan::ReadContext; -use vortex_scan::plan::RowScope; -use vortex_scan::plan::ScanPlan; -use vortex_scan::plan::ScanPlanRef; -use vortex_scan::plan::ScanStateRef; -use vortex_scan::plan::StateCtx; -use vortex_scan::plan::StructValueScanPlan; -use vortex_scan::plan::default_try_push_expr; -use vortex_scan::segments::SegmentPlanCtx; -use vortex_scan::segments::SegmentRequests; +use vortex_scan::read::ScanRead; use vortex_sequence::Sequence; use vortex_sequence::SequenceArray; use crate::layouts::row_idx::RowIdx; use crate::layouts::row_idx::row_idx; +use crate::scan::plan::ApplyScanPlan; +use crate::scan::plan::OwnedRowScope; +use crate::scan::plan::PrepareCtx; +use crate::scan::plan::PreparedRead; +use crate::scan::plan::PreparedReadRef; +use crate::scan::plan::PushCtx; +use crate::scan::plan::ReadStep; +use crate::scan::plan::ReadTask; +use crate::scan::plan::ReadTaskOutput; +use crate::scan::plan::RowScope; +use crate::scan::plan::ScanPlan; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::ScanState; +use crate::scan::plan::ScanStateRef; +use crate::scan::plan::StateCtx; +use crate::scan::plan::StructValueScanPlan; +use 
crate::scan::plan::default_try_push_expr; +use crate::segments::SegmentPlanCtx; +use crate::segments::SegmentRequests; pub fn with_row_idx(root: ScanPlanRef, dtype: DType, row_offset: u64) -> ScanPlanRef { Arc::new(RowIdxScanPlan { @@ -181,7 +184,7 @@ impl ScanPlan for RowIdxScanPlan { Arc::clone(&self.child).prepare_read(cx) } - fn release(&self, frontier: u64, state: &vortex_scan::plan::ScanState) -> VortexResult<()> { + fn release(&self, frontier: u64, state: &ScanState) -> VortexResult<()> { self.child.release(frontier, state) } @@ -216,6 +219,12 @@ struct RowIdxPreparedRead { plan: Arc, } +struct RowIdxReadTask { + read: Arc, + range: Range, + rows: OwnedRowScope, +} + impl ScanPlan for RowIdxExprScanPlan { fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(())) @@ -242,15 +251,41 @@ impl ScanPlan for RowIdxExprScanPlan { } impl PreparedRead for RowIdxPreparedRead { - fn read_scoped<'a>( - &'a self, + fn segment_requests( + &self, + _range: Range, + _rows: RowScope<'_>, + _cx: &mut SegmentPlanCtx, + ) -> VortexResult { + Ok(SegmentRequests::none()) + } + + fn create_task( + self: Arc, range: Range, - rows: RowScope<'a>, - _io: &'a ReadContext, - local: &'a mut ExecutionCtx, - ) -> BoxFuture<'a, VortexResult> { - Box::pin(async move { - let dense = idx_array(self.plan.row_offset, &range).into_array(); + rows: OwnedRowScope, + _reads: Vec, + _prefetch_reads: Vec, + _cx: &mut SegmentPlanCtx, + ) -> VortexResult> { + Ok(Box::new(RowIdxReadTask { + read: self, + range, + rows, + })) + } + + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "row_idx({}) -> {}", self.plan.expr, self.plan.dtype) + } +} + +impl ReadTask for RowIdxReadTask { + fn into_step(self: Box) -> VortexResult { + let Self { read, range, rows } = *self; + Ok(ReadStep::new(Vec::new(), Vec::new(), move |_, local, _| { + let rows = rows.as_scope(); + let dense = idx_array(read.plan.row_offset, &range).into_array(); if rows.selection.len() != 
dense.len() { vortex_bail!( "selection length {} does not match row_idx range length {}", @@ -270,21 +305,12 @@ impl PreparedRead for RowIdxPreparedRead { } else { dense.filter(rows.selection.clone())? }; - selected.apply(&self.plan.expr)?.execute::(local) - }) - } - - fn segment_requests( - &self, - _range: Range, - _rows: RowScope<'_>, - _cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::none()) - } - - fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "row_idx({}) -> {}", self.plan.expr, self.plan.dtype) + Ok(ReadTaskOutput::Ready( + selected + .apply(&read.plan.expr)? + .execute::(local)?, + )) + })) } } diff --git a/vortex-layout/src/segments/mod.rs b/vortex-layout/src/segments/mod.rs index e3f7047ea40..a4fcd519cfa 100644 --- a/vortex-layout/src/segments/mod.rs +++ b/vortex-layout/src/segments/mod.rs @@ -2,15 +2,59 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors mod cache; +mod scheduled; mod shared; mod sink; +mod source; #[cfg(any(test, feature = "_test-harness"))] mod test; +use std::fmt::Display; +use std::ops::Deref; + pub use cache::*; +pub use scheduled::*; pub use shared::*; pub use sink::*; +pub use source::*; #[cfg(any(test, feature = "_test-harness"))] pub use test::*; -pub use vortex_scan::segments::*; +use vortex_error::VortexError; +pub use vortex_scan::read::CancelGroup; +pub use vortex_scan::read::ScanIoPhase; +pub use vortex_scan::read::ScanPriority; +pub use vortex_scan::read::ScanRead; + +/// The identifier for a single segment. +// TODO(ngates): should this be a `[u8]` instead? Allowing for arbitrary segment identifiers? 
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct SegmentId(u32); + +impl From for SegmentId { + fn from(value: u32) -> Self { + Self(value) + } +} + +impl TryFrom for SegmentId { + type Error = VortexError; + + fn try_from(value: usize) -> Result { + Ok(Self::from(u32::try_from(value)?)) + } +} + +impl Deref for SegmentId { + type Target = u32; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Display for SegmentId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "SegmentId({})", self.0) + } +} diff --git a/vortex-scan/src/segments/scheduled.rs b/vortex-layout/src/segments/scheduled.rs similarity index 85% rename from vortex-scan/src/segments/scheduled.rs rename to vortex-layout/src/segments/scheduled.rs index 83dd728eb94..7021a6e5f84 100644 --- a/vortex-scan/src/segments/scheduled.rs +++ b/vortex-layout/src/segments/scheduled.rs @@ -14,6 +14,13 @@ use vortex_error::SharedVortexResult; use vortex_error::VortexError; use vortex_error::VortexExpect; use vortex_error::VortexResult; +use vortex_scan::read::CancelGroup; +use vortex_scan::read::ReadRequestKey; +use vortex_scan::read::ReadResults; +use vortex_scan::read::ScanIoPhase; +use vortex_scan::read::ScanPriority; +use vortex_scan::read::ScanRead; +use vortex_scan::read::ScanReadRequest; use vortex_session::VortexSession; use vortex_utils::aliases::dash_map::DashMap; use vortex_utils::aliases::dash_map::Entry; @@ -50,65 +57,7 @@ impl SegmentInfo { } } -/// High-level scan phase associated with a segment request. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)] -pub enum ScanIoPhase { - /// Shared evidence setup, such as loading a stats table. - EvidenceSetup, - /// Per-morsel evidence probe. - EvidenceProbe, - /// Residual predicate value read. - PredicateRead, - /// Projected output value read. - #[default] - ProjectionRead, - /// Aggregate input or metadata read. 
- AggregateRead, -} - -/// Scheduler priority for a segment request. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct ScanPriority(i32); - -impl ScanPriority { - /// Normal request priority. - pub const NORMAL: Self = Self(0); - - /// Create a priority from a signed integer value. - pub fn new(value: i32) -> Self { - Self(value) - } - - /// Return the signed integer priority value. - pub fn get(self) -> i32 { - self.0 - } -} - -/// Cancellation scope for a group of related segment requests. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)] -pub struct CancelGroup(u64); - -impl CancelGroup { - /// A request that is not associated with a finer cancellation group. - pub const NONE: Self = Self(0); - - /// Create a cancellation group from an integer id. - pub fn new(value: u64) -> Self { - Self(value) - } - - /// Return the integer cancellation group id. - pub fn get(self) -> u64 { - self.0 - } -} - /// A scheduler-visible request for one logical segment payload. -/// -/// The first scheduler API intentionally only models segment payloads. If a future custom -/// `ScanPlan` needs opaque or non-segment I/O, add that request shape next to this type rather -/// than smuggling physical locations into `SegmentRequest`. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct SegmentRequest { /// Logical segment id within the source. @@ -143,6 +92,24 @@ impl From<&SegmentRequest> for SegmentRequestKey { } } +impl From for ReadRequestKey { + fn from(key: SegmentRequestKey) -> Self { + Self::new(u64::from(*key.segment)) + } +} + +impl From<&SegmentRequest> for ScanReadRequest { + fn from(request: &SegmentRequest) -> Self { + Self::new( + ReadRequestKey::from(SegmentRequestKey::from(request)), + request.bytes, + request.phase, + ) + .with_priority(request.priority) + .with_cancel_group(request.cancel_group) + } +} + impl SegmentRequest { /// Create a segment request from source, segment metadata, and phase. 
pub fn new(segment: SegmentId, info: SegmentInfo, phase: ScanIoPhase) -> Self { @@ -290,21 +257,6 @@ impl SegmentPlanCtx { } } -/// One logical segment read registered for a scan task. -pub struct ScanRead { - /// The logical request this handle resolves. - pub request: SegmentRequest, - /// Future resolving to the requested segment payload. - pub future: SegmentFuture, -} - -impl ScanRead { - /// Create a handle for one logical segment request. - pub fn new(request: SegmentRequest, future: SegmentFuture) -> Self { - Self { request, future } - } -} - type SharedSegmentFuture = BoxFuture<'static, SharedVortexResult>; /// Scan-local cache of in-flight segment futures keyed by logical segment request. @@ -338,8 +290,11 @@ impl SegmentFutureCache { entry.remove(); } Entry::Vacant(entry) => { - let handle = ScanRead::new(request, source.request(request.segment)); - let shared = handle.future.map_err(Arc::new).boxed().shared(); + let shared = source + .request(request.segment) + .map_err(Arc::new) + .boxed() + .shared(); entry.insert( shared .downgrade() @@ -386,20 +341,20 @@ impl SegmentFutureCache { source: &dyn SegmentSource, misses: Vec, ) -> Vec { - self.insert_submitted( - misses - .into_iter() - .map(|request| ScanRead::new(request, source.request(request.segment))) - .collect(), - ) + self.insert_submitted(misses.into_iter().map(|request| { + let future = source.request(request.segment); + (request, future) + })) } - fn insert_submitted(&self, handles: Vec) -> Vec { + fn insert_submitted( + &self, + handles: impl IntoIterator, + ) -> Vec { handles .into_iter() - .map(|handle| { - let request = handle.request; - let shared = handle.future.map_err(Arc::new).boxed().shared(); + .map(|(request, future)| { + let shared = future.map_err(Arc::new).boxed().shared(); self.in_flight.insert( SegmentRequestKey::from(&request), shared @@ -422,6 +377,10 @@ pub fn register_segment_reads_cached( } fn shared_segment_handle(request: SegmentRequest, future: Shared) -> ScanRead { + 
shared_read_handle(ScanReadRequest::from(&request), future) +} + +fn shared_read_handle(request: ScanReadRequest, future: Shared) -> ScanRead { ScanRead::new(request, future.map_err(VortexError::from).boxed()) } @@ -432,6 +391,36 @@ pub struct CachedSegmentSource { phase: ScanIoPhase, } +/// Segment source backed by scheduler-resolved read results. +pub struct ReadResultsSegmentSource { + source: Arc, + results: ReadResults, +} + +impl ReadResultsSegmentSource { + /// Create a segment source over already-resolved scan read results. + pub fn new(source: Arc, results: ReadResults) -> Self { + Self { source, results } + } +} + +impl SegmentSource for ReadResultsSegmentSource { + fn segment_info(&self, id: SegmentId) -> VortexResult { + self.source.segment_info(id) + } + + fn request(&self, id: SegmentId) -> SegmentFuture { + let key = ReadRequestKey::from(SegmentRequestKey::new(id)); + let results = self.results.clone(); + async move { results.get(key) }.boxed() + } + + fn resolved(&self, id: SegmentId) -> VortexResult { + self.results + .get(ReadRequestKey::from(SegmentRequestKey::new(id))) + } +} + impl CachedSegmentSource { /// Create a cached source using projection reads as the default late-request phase. pub fn new(source: Arc, cache: Arc) -> Self { diff --git a/vortex-scan/src/segments/source.rs b/vortex-layout/src/segments/source.rs similarity index 74% rename from vortex-scan/src/segments/source.rs rename to vortex-layout/src/segments/source.rs index df6c5569ba8..27bec1a33a1 100644 --- a/vortex-scan/src/segments/source.rs +++ b/vortex-layout/src/segments/source.rs @@ -4,6 +4,7 @@ use futures::future::BoxFuture; use vortex_array::buffer::BufferHandle; use vortex_error::VortexResult; +use vortex_error::vortex_bail; use crate::segments::SegmentId; use crate::segments::SegmentInfo; @@ -18,4 +19,9 @@ pub trait SegmentSource: 'static + Send + Sync { /// Request a segment, returning a future that will eventually resolve to the segment data. 
fn request(&self, id: SegmentId) -> SegmentFuture; + + /// Return a segment that has already been resolved by the scan scheduler. + fn resolved(&self, id: SegmentId) -> VortexResult { + vortex_bail!("segment {id} has not been resolved by the scan scheduler") + } } diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index 5c517977895..e6bbe2699de 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -22,10 +22,9 @@ //! * We should add a way for the client to negotiate capabilities with the data source, for //! example which encodings it knows about. -pub mod plan; +pub mod read; pub mod row_mask; pub mod scheduler; -pub mod segments; pub mod selection; pub mod task; diff --git a/vortex-scan/src/read.rs b/vortex-scan/src/read.rs new file mode 100644 index 00000000000..4fa245d4fec --- /dev/null +++ b/vortex-scan/src/read.rs @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Scheduler-visible scan read requests. +//! +//! These types intentionally do not model layout segments. Layouts and file readers +//! decide what a read key means; the scan scheduler only needs a stable key, byte +//! estimate, phase, priority, and cancellation scope for admission and deduplication. + +use std::sync::Arc; + +use futures::future::BoxFuture; +use parking_lot::Mutex; +use vortex_array::buffer::BufferHandle; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_utils::aliases::hash_map::HashMap; + +/// Static future resolving to a scan read buffer. +pub type ScanReadFuture = BoxFuture<'static, VortexResult>; + +/// High-level scan phase associated with a read request. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)] +pub enum ScanIoPhase { + /// Shared evidence setup, such as loading a stats table. + EvidenceSetup, + /// Per-morsel evidence probe. + EvidenceProbe, + /// Residual predicate value read. + PredicateRead, + /// Projected output value read. 
+ #[default] + ProjectionRead, + /// Aggregate input or metadata read. + AggregateRead, +} + +/// Scheduler priority for a read request. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ScanPriority(i32); + +impl ScanPriority { + /// Normal request priority. + pub const NORMAL: Self = Self(0); + + /// Create a priority from a signed integer value. + pub fn new(value: i32) -> Self { + Self(value) + } + + /// Return the signed integer priority value. + pub fn get(self) -> i32 { + self.0 + } +} + +/// Cancellation scope for a group of related read requests. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)] +pub struct CancelGroup(u64); + +impl CancelGroup { + /// A request that is not associated with a finer cancellation group. + pub const NONE: Self = Self(0); + + /// Create a cancellation group from an integer id. + pub fn new(value: u64) -> Self { + Self(value) + } + + /// Return the integer cancellation group id. + pub fn get(self) -> u64 { + self.0 + } +} + +/// Opaque dedupe key for a logical scan read. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct ReadRequestKey(u64); + +impl ReadRequestKey { + /// Create an opaque read request key. + pub fn new(value: u64) -> Self { + Self(value) + } + + /// Return the raw key value. + pub fn get(self) -> u64 { + self.0 + } +} + +impl From for ReadRequestKey { + fn from(value: u64) -> Self { + Self::new(value) + } +} + +/// A scheduler-visible request for one logical read payload. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct ScanReadRequest { + /// Opaque read dedupe key. + pub key: ReadRequestKey, + /// Number of logical bytes this read contributes to admission. + pub bytes: u64, + /// High-level scan phase that needs this read. + pub phase: ScanIoPhase, + /// Scheduler priority for this request. + pub priority: ScanPriority, + /// Cancellation scope for this request. 
+ pub cancel_group: CancelGroup, +} + +impl ScanReadRequest { + /// Create a read request with normal priority and no cancellation group. + pub fn new(key: ReadRequestKey, bytes: u64, phase: ScanIoPhase) -> Self { + Self { + key, + bytes, + phase, + priority: ScanPriority::NORMAL, + cancel_group: CancelGroup::NONE, + } + } + + /// Return a copy of this request with the provided priority. + pub fn with_priority(mut self, priority: ScanPriority) -> Self { + self.priority = priority; + self + } + + /// Return a copy of this request with the provided cancellation group. + pub fn with_cancel_group(mut self, cancel_group: CancelGroup) -> Self { + self.cancel_group = cancel_group; + self + } +} + +/// One logical read registered for a scan task. +pub struct ScanRead { + /// The logical request this handle resolves. + pub request: ScanReadRequest, + /// Future resolving to the requested payload. + pub future: ScanReadFuture, +} + +/// Scan-wide store of resolved read buffers. +#[derive(Default)] +pub struct ReadStore { + entries: Mutex>, +} + +impl ReadStore { + /// Create an empty read store. + pub fn new() -> Self { + Self::default() + } + + /// Return a resolved buffer by key, if present. + pub fn get(&self, key: ReadRequestKey) -> Option { + self.entries.lock().get(&key).cloned() + } + + /// Insert a resolved buffer. + pub fn insert(&self, key: ReadRequestKey, buffer: BufferHandle) { + self.entries.lock().insert(key, buffer); + } +} + +/// Shared scan-wide read store. +pub type ReadStoreRef = Arc; + +/// Read-only view over resolved scan reads. +#[derive(Clone)] +pub struct ReadResults { + store: ReadStoreRef, +} + +impl ReadResults { + /// Create a read results view over a shared store. + pub fn new(store: ReadStoreRef) -> Self { + Self { store } + } + + /// Return a resolved buffer by key. 
+ pub fn get(&self, key: ReadRequestKey) -> VortexResult { + self.store + .get(key) + .ok_or_else(|| vortex_err!("scan read {:?} was not resolved before execution", key)) + } + + /// Return whether a read has already been resolved. + pub fn contains(&self, key: ReadRequestKey) -> bool { + self.store.get(key).is_some() + } +} + +impl ScanRead { + /// Create a handle for one logical read request. + pub fn new(request: ScanReadRequest, future: ScanReadFuture) -> Self { + Self { request, future } + } +} diff --git a/vortex-scan/src/scheduler.rs b/vortex-scan/src/scheduler.rs index 1b50250833d..4fbfc6a51f6 100644 --- a/vortex-scan/src/scheduler.rs +++ b/vortex-scan/src/scheduler.rs @@ -25,7 +25,7 @@ use vortex_session::VortexSession; use vortex_utils::parallelism::get_available_parallelism; const DEFAULT_MORSEL_CONCURRENCY_FACTOR: usize = 4; -const DEFAULT_READ_BYTE_BUDGET: u64 = 256 * 1024 * 1024; +const DEFAULT_MORSEL_BYTE_BUDGET: u64 = 256 * 1024 * 1024; /// Configuration for a [`ScanScheduler`]. #[derive(Clone, Debug, PartialEq, Eq)] @@ -34,7 +34,7 @@ pub struct ScanSchedulerConfig { per_scan_slots: Option, morsel_plan_window: Option, morsel_launch_window: Option, - read_byte_budget: Option, + morsel_byte_budget: Option, } impl ScanSchedulerConfig { @@ -45,7 +45,7 @@ impl ScanSchedulerConfig { per_scan_slots: None, morsel_plan_window: None, morsel_launch_window: None, - read_byte_budget: None, + morsel_byte_budget: None, } } @@ -60,7 +60,7 @@ impl ScanSchedulerConfig { per_scan_slots: Some(slots), morsel_plan_window: None, morsel_launch_window: Some(slots), - read_byte_budget: Some(DEFAULT_READ_BYTE_BUDGET), + morsel_byte_budget: Some(DEFAULT_MORSEL_BYTE_BUDGET), } } @@ -78,11 +78,11 @@ impl ScanSchedulerConfig { self } - /// Return a copy with the maximum number of unfetched read bytes allowed in flight per scan. + /// Return a copy with the maximum number of logical segment bytes allowed in flight per scan. 
/// /// `None` means scan task launch is not capped by bytes. - pub fn with_read_byte_budget(mut self, bytes: Option) -> Self { - self.read_byte_budget = bytes.map(|bytes| bytes.max(1)); + pub fn with_morsel_byte_budget(mut self, bytes: Option) -> Self { + self.morsel_byte_budget = bytes.map(|bytes| bytes.max(1)); self } @@ -118,9 +118,9 @@ impl ScanSchedulerConfig { self.morsel_launch_window } - /// Returns the configured per-scan unfetched-read byte budget. - pub fn read_byte_budget(&self) -> Option { - self.read_byte_budget + /// Returns the configured per-scan logical segment byte budget. + pub fn morsel_byte_budget(&self) -> Option { + self.morsel_byte_budget } } diff --git a/vortex-scan/src/segments/mod.rs b/vortex-scan/src/segments/mod.rs deleted file mode 100644 index 939ae412718..00000000000 --- a/vortex-scan/src/segments/mod.rs +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Segment sources and scheduler-visible segment request planning. - -mod scheduled; -mod source; - -use std::fmt::Display; -use std::ops::Deref; - -pub use scheduled::*; -pub use source::*; -use vortex_error::VortexError; - -/// The identifier for a single logical segment. -// TODO(ngates): should this be a `[u8]` instead? Allowing for arbitrary segment identifiers? 
-#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct SegmentId(u32); - -impl From for SegmentId { - fn from(value: u32) -> Self { - Self(value) - } -} - -impl TryFrom for SegmentId { - type Error = VortexError; - - fn try_from(value: usize) -> Result { - Ok(Self::from(u32::try_from(value)?)) - } -} - -impl Deref for SegmentId { - type Target = u32; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl Display for SegmentId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "SegmentId({})", self.0) - } -} diff --git a/vortex-scan/src/task.rs b/vortex-scan/src/task.rs index 1146c42cd9d..14a2b612673 100644 --- a/vortex-scan/src/task.rs +++ b/vortex-scan/src/task.rs @@ -11,15 +11,16 @@ use std::collections::BTreeMap; use std::collections::VecDeque; -use futures::future::BoxFuture; use vortex_error::VortexResult; +use vortex_error::vortex_err; use vortex_utils::aliases::hash_map::Entry; use vortex_utils::aliases::hash_map::HashMap; use vortex_utils::aliases::hash_set::HashSet; -use crate::segments::ScanIoPhase; -use crate::segments::ScanRead; -use crate::segments::SegmentRequestKey; +use crate::read::ReadRequestKey; +use crate::read::ReadResults; +use crate::read::ScanIoPhase; +use crate::read::ScanRead; /// Fine-grained scheduling lane for a scan task. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] @@ -94,7 +95,7 @@ impl ScanTaskGroup { #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct ScanTaskRead { /// Dedupe key for the logical read. - pub key: SegmentRequestKey, + pub key: ReadRequestKey, /// Number of bytes this read contributes if it is not already active. 
pub bytes: u64, } @@ -106,7 +107,7 @@ impl ScanTaskRead { reads .iter() .filter_map(|read| { - let key = SegmentRequestKey::from(&read.request); + let key = read.request.key; seen.insert(key).then_some(Self { key, bytes: read.request.bytes, @@ -116,94 +117,210 @@ impl ScanTaskRead { } } -/// A morsel-level scan task with explicit read dependencies. -pub trait ScanTask: Send { - /// Morsel identifier this task belongs to. - fn morsel_id(&self) -> usize; +/// Reads requested by one scheduler-visible scan step. +#[derive(Default)] +pub struct ScanStepReads { + required: Vec, + prefetch: Vec, +} - /// High-level scan phase for scheduling. - fn phase(&self) -> ScanIoPhase; +impl ScanStepReads { + /// Create an empty read set. + pub fn new() -> Self { + Self::default() + } - /// Fine-grained scheduling lane. - fn lane(&self) -> ScanTaskLane; + /// Add a read that must complete before the task can make progress. + pub fn require(&mut self, read: ScanRead) { + self.required.push(read); + } - /// Logical reads required by this task. - fn reads(&self) -> &[ScanTaskRead]; + /// Add a read that may be fetched speculatively but must not gate task progress. + pub fn prefetch(&mut self, read: ScanRead) { + self.prefetch.push(read); + } - /// Scheduling priority within this task's group. Lower values run first. - fn priority(&self) -> u64; + /// Return required reads. + pub fn required(&self) -> &[ScanRead] { + &self.required + } + + /// Return prefetch reads. + pub fn prefetches(&self) -> &[ScanRead] { + &self.prefetch + } + + /// Consume required reads. + pub fn into_required(self) -> Vec { + self.required + } + + /// Consume prefetch reads. + pub fn into_prefetches(self) -> Vec { + self.prefetch + } - /// Execute this task. - fn into_future(self: Box) -> BoxFuture<'static, VortexResult>; + /// Consume both read classes. + pub fn into_parts(self) -> (Vec, Vec) { + (self.required, self.prefetch) + } + + /// Return true when there are no reads of either class. 
+ pub fn is_empty(&self) -> bool { + self.required.is_empty() && self.prefetch.is_empty() + } + + /// Return true when no progress-gating reads were requested. + pub fn required_is_empty(&self) -> bool { + self.required.is_empty() + } } -/// Boxed scan task. -pub type ScanTaskBox = Box>; +/// Result of executing a scheduler-visible scan step. +pub enum ScanStepResult { + /// The task produced its final output. + Ready(T), + /// The task needs another scheduler-admitted step before it can finish. + Continue(ScanTaskBox), +} + +type ScanStepContinuation = + Box VortexResult> + Send>; -/// A scan task backed by an already-constructed future. -pub struct FutureScanTask { +/// One scheduler-visible step of a morsel-level scan task. +pub struct ScanStep { morsel_id: usize, phase: ScanIoPhase, lane: ScanTaskLane, reads: Vec, + /// Reads that must resolve before the continuation runs. + pub required_reads: Vec, + /// Reads that may be fetched speculatively while this step is queued. + pub prefetch_reads: Vec, priority: u64, - future: BoxFuture<'static, VortexResult>, + continuation: Option>, } -impl FutureScanTask { +impl ScanStep { /// Default scheduling priority for tasks without a more specific estimate. pub const DEFAULT_PRIORITY: u64 = 1_000_000; - /// Create a future-backed scan task. + /// Create a scheduler-visible scan step. pub fn new( morsel_id: usize, phase: ScanIoPhase, + lane: ScanTaskLane, reads: Vec, - future: BoxFuture<'static, VortexResult>, + required_reads: Vec, + prefetch_reads: Vec, + continuation: impl FnOnce(ReadResults) -> VortexResult> + Send + 'static, ) -> Self { - Self::new_in_lane( + Self { morsel_id, phase, - ScanTaskLane::from_phase(phase), + lane, reads, - future, - ) + required_reads, + prefetch_reads, + priority: Self::DEFAULT_PRIORITY, + continuation: Some(Box::new(continuation)), + } } - /// Create a future-backed scan task in a specific scheduling lane. - pub fn new_in_lane( + /// Create a ready step with no required reads. 
+ pub fn ready( morsel_id: usize, phase: ScanIoPhase, lane: ScanTaskLane, reads: Vec, - future: BoxFuture<'static, VortexResult>, - ) -> Self { - Self { + output: VortexResult, + ) -> Self + where + T: Send + 'static, + { + Self::new( morsel_id, phase, lane, reads, - priority: Self::DEFAULT_PRIORITY, - future, - } + Vec::new(), + Vec::new(), + move |_| output.map(ScanStepResult::Ready), + ) } - /// Return this task with an explicit scheduling priority. + /// Return this step with an explicit scheduling priority. pub fn with_priority(mut self, priority: u64) -> Self { self.priority = priority; self } - /// Box this task behind the [`ScanTask`] trait. + /// Box this step behind the [`ScanTask`] trait. pub fn boxed(self) -> ScanTaskBox where T: 'static, { Box::new(self) } + + /// Reads that must resolve before the continuation runs. + pub fn required_reads(&self) -> &[ScanRead] { + &self.required_reads + } + + /// Reads that may be fetched speculatively for this step. + pub fn prefetch_reads(&self) -> &[ScanRead] { + &self.prefetch_reads + } + + /// Consume the step into its required and prefetch reads. + pub fn into_reads(self) -> (Vec, Vec) { + (self.required_reads, self.prefetch_reads) + } + + /// Take the step's required and prefetch reads, leaving empty read lists behind. + pub fn take_reads(&mut self) -> (Vec, Vec) { + ( + std::mem::take(&mut self.required_reads), + std::mem::take(&mut self.prefetch_reads), + ) + } + + /// Execute this step's continuation. + pub fn continue_with(mut self, results: ReadResults) -> VortexResult> { + let continuation = self + .continuation + .take() + .ok_or_else(|| vortex_err!("scan step was continued after completion"))?; + continuation(results) + } } -impl ScanTask for FutureScanTask { +/// A morsel-level scan task with explicit read dependencies. +pub trait ScanTask: Send { + /// Morsel identifier this task belongs to. + fn morsel_id(&self) -> usize; + + /// High-level scan phase for scheduling. 
+ fn phase(&self) -> ScanIoPhase; + + /// Fine-grained scheduling lane. + fn lane(&self) -> ScanTaskLane; + + /// Logical reads required by this task. + fn reads(&self) -> &[ScanTaskRead]; + + /// Scheduling priority within this task's group. Lower values run first. + fn priority(&self) -> u64; + + /// Convert this task into its next scheduler-visible step. + fn into_step(self: Box) -> VortexResult>; +} + +/// Boxed scan task. +pub type ScanTaskBox = Box>; + +impl ScanTask for ScanStep { fn morsel_id(&self) -> usize { self.morsel_id } @@ -224,8 +341,8 @@ impl ScanTask for FutureScanTask { self.priority } - fn into_future(self: Box) -> BoxFuture<'static, VortexResult> { - self.future + fn into_step(self: Box) -> VortexResult> { + Ok(*self) } } @@ -274,22 +391,20 @@ pub struct ScanTaskQueue { evidence_queues: BTreeMap<(u32, u32), VecDeque>>, predicate_queues: BTreeMap>>, projection_queue: VecDeque>, - read_byte_budget: u64, + morsel_byte_budget: u64, active_read_bytes: u64, active_group_read_bytes: [u64; 3], - active_reads: HashMap, + active_reads: HashMap, } -const FRONTIER_SLACK_MORSELS: usize = 4; - impl ScanTaskQueue { - /// Create an empty task queue with an in-flight read byte budget. - pub fn new(read_byte_budget: u64) -> Self { + /// Create an empty task queue with an in-flight morsel byte budget. + pub fn new(morsel_byte_budget: u64) -> Self { Self { evidence_queues: BTreeMap::new(), predicate_queues: BTreeMap::new(), projection_queue: VecDeque::new(), - read_byte_budget, + morsel_byte_budget, active_read_bytes: 0, active_group_read_bytes: [0; 3], active_reads: HashMap::new(), @@ -377,7 +492,27 @@ impl ScanTaskQueue { self.active_read_bytes } - /// Pop the next task admitted by the frontier policy and read byte budget. + /// Number of active logical read dependencies. + pub fn active_read_count(&self) -> usize { + self.active_reads.len() + } + + /// Number of currently active predicate logical read bytes. 
+ pub fn active_predicate_read_bytes(&self) -> u64 { + self.active_group_read_bytes[ScanTaskGroup::Predicate.idx()] + } + + /// Number of currently active projection logical read bytes. + pub fn active_projection_read_bytes(&self) -> u64 { + self.active_group_read_bytes[ScanTaskGroup::Projection.idx()] + } + + /// Number of currently active evidence logical read bytes. + pub fn active_evidence_read_bytes(&self) -> u64 { + self.active_group_read_bytes[ScanTaskGroup::Evidence.idx()] + } + + /// Pop the next task admitted by the active read byte strategy. pub fn pop_next_admissible( &mut self, in_flight_empty: bool, @@ -386,8 +521,8 @@ impl ScanTaskQueue { self.pop_next_admissible_with_projection_gate(in_flight_empty, true, &mut is_live_morsel) } - /// Pop the next task admitted by the frontier policy and read byte budget, optionally - /// suppressing projection/aggregate work. + /// Pop the next task admitted by the active read byte strategy, optionally suppressing + /// projection/aggregate work. /// /// This is useful when a caller wants predicate/evidence run-ahead but must avoid producing /// more output batches until downstream has consumed earlier projection results. 
@@ -398,7 +533,6 @@ impl ScanTaskQueue { mut is_live_morsel: impl FnMut(usize) -> bool, ) -> Option> { self.drop_dead_heads(&mut is_live_morsel); - let frontier = self.frontier_morsel()?; for (group, enforce_target) in [ (ScanTaskGroup::Evidence, true), @@ -411,9 +545,7 @@ impl ScanTaskQueue { if group == ScanTaskGroup::Projection && !projection_admissible { continue; } - if let Some(task) = - self.pop_group_admissible(group, enforce_target, in_flight_empty, frontier) - { + if let Some(task) = self.pop_group_admissible(group, enforce_target, in_flight_empty) { return Some(task); } } @@ -426,7 +558,6 @@ impl ScanTaskQueue { group: ScanTaskGroup, enforce_target: bool, in_flight_empty: bool, - frontier: usize, ) -> Option> { if enforce_target && !self.group_has_budget(group, 0, in_flight_empty) { return None; @@ -434,7 +565,7 @@ impl ScanTaskQueue { let active_reads = &self.active_reads; let active_read_bytes = self.active_read_bytes; - let read_byte_budget = self.read_byte_budget; + let morsel_byte_budget = self.morsel_byte_budget; match group { ScanTaskGroup::Predicate => { @@ -450,12 +581,9 @@ impl ScanTaskQueue { task.morsel_id(), *idx, ); - if !score.within_frontier(frontier) { - continue; - } if !can_admit_task( active_read_bytes, - read_byte_budget, + morsel_byte_budget, in_flight_empty, score.incremental_read_bytes, ) || (enforce_target @@ -480,12 +608,9 @@ impl ScanTaskQueue { task.morsel_id(), 0, ); - if !score.within_frontier(frontier) { - return None; - } if !can_admit_task( active_read_bytes, - read_byte_budget, + morsel_byte_budget, in_flight_empty, score.incremental_read_bytes, ) || (enforce_target @@ -509,12 +634,9 @@ impl ScanTaskQueue { task.morsel_id(), idx.0, ); - if !score.within_frontier(frontier) { - continue; - } if !can_admit_task( active_read_bytes, - read_byte_budget, + morsel_byte_budget, in_flight_empty, score.incremental_read_bytes, ) || (enforce_target @@ -550,29 +672,16 @@ impl ScanTaskQueue { } } - fn frontier_morsel(&self) -> Option { 
- self.evidence_queues - .values() - .filter_map(|queue| queue.front().map(|task| task.morsel_id())) - .chain( - self.predicate_queues - .values() - .filter_map(|queue| queue.front().map(|task| task.morsel_id())), - ) - .chain(self.projection_queue.front().map(|task| task.morsel_id())) - .min() - } - fn group_target_bytes(&self, group: ScanTaskGroup) -> u64 { - if self.read_byte_budget == u64::MAX { + if self.morsel_byte_budget == u64::MAX { return u64::MAX; } - let projection = (self.read_byte_budget / 8).max(1); - let evidence = (self.read_byte_budget / 8).max(1); + let projection = (self.morsel_byte_budget / 8).max(1); + let evidence = (self.morsel_byte_budget / 8).max(1); match group { ScanTaskGroup::Predicate => self - .read_byte_budget + .morsel_byte_budget .saturating_sub(projection) .saturating_sub(evidence) .max(1), @@ -688,12 +797,12 @@ fn drop_dead_heads_from_map( fn can_admit_task( active_read_bytes: u64, - read_byte_budget: u64, + morsel_byte_budget: u64, in_flight_empty: bool, incremental_read_bytes: u64, ) -> bool { incremental_read_bytes == 0 - || active_read_bytes.saturating_add(incremental_read_bytes) <= read_byte_budget + || active_read_bytes.saturating_add(incremental_read_bytes) <= morsel_byte_budget || in_flight_empty } @@ -708,7 +817,7 @@ struct TaskScore { impl TaskScore { fn new( - active_reads: &HashMap, + active_reads: &HashMap, reads: &[ScanTaskRead], priority: u64, morsel_id: usize, @@ -722,14 +831,10 @@ impl TaskScore { lane_idx, } } - - fn within_frontier(&self, frontier: usize) -> bool { - self.morsel_id <= frontier.saturating_add(FRONTIER_SLACK_MORSELS) - } } fn incremental_read_bytes( - active_reads: &HashMap, + active_reads: &HashMap, reads: &[ScanTaskRead], ) -> u64 { let mut seen = HashSet::new(); @@ -752,20 +857,23 @@ pub fn scan_task_read_bytes(reads: &[ScanTaskRead]) -> u64 { #[cfg(test)] mod tests { - use futures::FutureExt; - use super::*; - use crate::segments::SegmentId; - - fn read(segment: u32, bytes: u64) -> 
ScanTaskRead { + fn read(key: u32, bytes: u64) -> ScanTaskRead { ScanTaskRead { - key: SegmentRequestKey::new(SegmentId::from(segment)), + key: ReadRequestKey::new(u64::from(key)), bytes, } } fn task(morsel_id: usize, phase: ScanIoPhase, reads: Vec) -> ScanTaskBox<()> { - FutureScanTask::new(morsel_id, phase, reads, async { Ok(()) }.boxed()).boxed() + ScanStep::ready( + morsel_id, + phase, + ScanTaskLane::from_phase(phase), + reads, + Ok(()), + ) + .boxed() } fn task_in_lane( @@ -774,7 +882,7 @@ mod tests { lane: ScanTaskLane, reads: Vec, ) -> ScanTaskBox<()> { - FutureScanTask::new_in_lane(morsel_id, phase, lane, reads, async { Ok(()) }.boxed()).boxed() + ScanStep::ready(morsel_id, phase, lane, reads, Ok(())).boxed() } fn prioritized_task_in_lane( @@ -784,7 +892,7 @@ mod tests { reads: Vec, priority: u64, ) -> ScanTaskBox<()> { - FutureScanTask::new_in_lane(morsel_id, phase, lane, reads, async { Ok(()) }.boxed()) + ScanStep::ready(morsel_id, phase, lane, reads, Ok(())) .with_priority(priority) .boxed() } @@ -828,17 +936,33 @@ mod tests { } #[test] - fn queue_preserves_frontier_within_lane() { + fn queue_prefers_smaller_incremental_bytes_without_morsel_frontier() { let mut queue = ScanTaskQueue::new(100); - queue.push(task(0, ScanIoPhase::EvidenceProbe, vec![read(1, 90)])); - queue.push(task(1, ScanIoPhase::EvidenceProbe, vec![read(2, 10)])); + queue.push(task_in_lane( + 0, + ScanIoPhase::EvidenceProbe, + ScanTaskLane::Evidence { + predicate_idx: 0, + evidence_idx: 0, + }, + vec![read(1, 90)], + )); + queue.push(task_in_lane( + 1, + ScanIoPhase::EvidenceProbe, + ScanTaskLane::Evidence { + predicate_idx: 0, + evidence_idx: 1, + }, + vec![read(2, 10)], + )); let next = queue .pop_next_admissible(true, |_| true) .expect("one task should be admitted"); let (task, _lane, reads) = next.into_parts(); - assert_eq!(task.morsel_id(), 0); - assert_eq!(reads, vec![read(1, 90)]); + assert_eq!(task.morsel_id(), 1); + assert_eq!(reads, vec![read(2, 10)]); } #[test] From 
662e2b6820ec8802e1f972a0db2eeba303ea1683 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 23 Jun 2026 21:59:16 -0400 Subject: [PATCH 33/48] Fix CI failures after scan merge Signed-off-by: Nicholas Gates --- AGENTS.md | 13 +++++++++++++ vortex-array/src/arrow/executor/byte.rs | 1 + vortex-layout/src/scan/v2/layouts/chunked.rs | 3 ++- vortex-layout/src/scan/v2/layouts/dict.rs | 15 ++++++++------- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index e5c3d0cc13b..3453e7da0dd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -113,6 +113,19 @@ cargo +nightly fmt --all cargo clippy --all-targets --all-features ``` +Before pushing Rust changes, compile the relevant test targets, not only library targets. At +minimum, run `cargo test -p <crate> --all-features --no-run` for every touched Rust crate that +has tests. For cross-crate scan, layout, file, Arrow export, or execution-context changes, include +the crates that can compile hidden or feature-gated tests, for example: + +```bash +cargo test -p vortex-array --all-features --no-run +cargo check -p vortex-layout -p vortex-file -p vortex-duckdb -p vortex-datafusion --all-features +``` + +Do not push after merge conflict resolution until the post-merge test-target build succeeds for the +affected crates.
+ Notes: - For `.github/` changes, follow `.github/AGENTS.md` and run diff --git a/vortex-array/src/arrow/executor/byte.rs b/vortex-array/src/arrow/executor/byte.rs index ced0affbb3b..2db9d3e6494 100644 --- a/vortex-array/src/arrow/executor/byte.rs +++ b/vortex-array/src/arrow/executor/byte.rs @@ -84,6 +84,7 @@ mod tests { use vortex_mask::Mask; use crate::IntoArray; + use crate::LEGACY_SESSION; use crate::VortexSessionExecute; use crate::array_session; use crate::arrow::ArrowArrayExecutor; diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index f60f3fb388d..c70dd385933 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -46,6 +46,7 @@ use crate::layout_v2::Chunked; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::scan::plan::AggregateAnswer; +use crate::scan::plan::DeferredReadTask; use crate::scan::plan::OwnedRowScope; use crate::scan::plan::PrepareCtx; use crate::scan::plan::PreparedAggregate; @@ -195,7 +196,7 @@ impl ReadTask for ChunkedReadTask { continuations.push((step_parts.len(), expected_len, step.continuation)); step_parts.push(ChunkedReadPart::Pending { expected_len, - task: Box::new(crate::scan::plan::DeferredReadTask), + task: Box::new(DeferredReadTask), }); } } diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 1b38ef8fd02..3c90a40a53e 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -43,6 +43,7 @@ use vortex_session::VortexSession; use crate::layout_v2::Dict; use crate::layout_v2::Layout; +use crate::scan::plan::DeferredReadTask; use crate::scan::plan::OwnedRowScope; use crate::scan::plan::PrepareCtx; use crate::scan::plan::PreparedRead; @@ -335,7 +336,7 @@ impl ReadTask for DictReadTask { ReadTaskOutput::Ready(codes) => { let mut task = DictReadTask { read, - codes: 
Box::new(crate::scan::plan::DeferredReadTask), + codes: Box::new(DeferredReadTask), value_reads, cx, state: DictReadState::Start, @@ -393,7 +394,7 @@ impl ReadTask for DictReadTask { ReadTaskOutput::Continue(values) => { Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { read, - codes: Box::new(crate::scan::plan::DeferredReadTask), + codes: Box::new(DeferredReadTask), value_reads, cx, state: DictReadState::SparseValues { @@ -426,7 +427,7 @@ impl ReadTask for DictReadTask { ReadTaskOutput::Continue(values) => { Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { read, - codes: Box::new(crate::scan::plan::DeferredReadTask), + codes: Box::new(DeferredReadTask), value_reads, cx, state: DictReadState::FullValues { @@ -841,7 +842,7 @@ impl ReadTask for DictExprReadTask { ReadTaskOutput::Ready(codes) => { let mut task = DictExprReadTask { read, - codes: Box::new(crate::scan::plan::DeferredReadTask), + codes: Box::new(DeferredReadTask), value_reads, cx, state: DictExprReadState::Start, @@ -951,7 +952,7 @@ impl ReadTask for DictExprReadTask { ReadTaskOutput::Continue(values) => { Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { read, - codes: Box::new(crate::scan::plan::DeferredReadTask), + codes: Box::new(DeferredReadTask), value_reads, cx, state: DictExprReadState::Values { @@ -1039,7 +1040,7 @@ fn finish_dict_expr_values( let input = read .node .dict - .build_dict(compact_codes.clone(), values_array)? + .build_dict(compact_codes, values_array)? 
.optimize()?; let computed = input .apply(&read.node.expr) @@ -1059,7 +1060,7 @@ fn finish_dict_expr_values( )?; Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { read, - codes: Box::new(crate::scan::plan::DeferredReadTask), + codes: Box::new(DeferredReadTask), value_reads, cx, state: DictExprReadState::Values { From 0d1c52bd662fc657cde63f2cdcb12f533e096cbd Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 23 Jun 2026 23:09:04 -0400 Subject: [PATCH 34/48] Remove unused scan2 session switch Signed-off-by: Nicholas Gates --- vortex-layout/src/scan/v2/mod.rs | 2 - vortex-layout/src/scan/v2/session.rs | 67 ---------------------------- 2 files changed, 69 deletions(-) delete mode 100644 vortex-layout/src/scan/v2/session.rs diff --git a/vortex-layout/src/scan/v2/mod.rs b/vortex-layout/src/scan/v2/mod.rs index 6660ce657b9..b1c6d81f053 100644 --- a/vortex-layout/src/scan/v2/mod.rs +++ b/vortex-layout/src/scan/v2/mod.rs @@ -6,8 +6,6 @@ //! This module contains the layout-tree expansion vtables and executable //! [`ScanPlan`](crate::scan::plan::ScanPlan) plans used by the alternate scan implementation. -pub mod session; - pub(crate) mod layouts; mod row_idx; pub use row_idx::with_row_idx; diff --git a/vortex-layout/src/scan/v2/session.rs b/vortex-layout/src/scan/v2/session.rs deleted file mode 100644 index e6fec8b9d1f..00000000000 --- a/vortex-layout/src/scan/v2/session.rs +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Session-level default for which scan implementation `VortexScanExec` expands to. - -use std::any::Any; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering; - -use vortex_session::SessionExt; -use vortex_session::SessionVar; -/// Session variable holding the scan implementation default. 
-#[derive(Debug)] -pub struct ScanV2Session { - /// Whether `VortexScanExec` expands through scan2 when the node does - /// not choose explicitly (see `VortexScanExec::with_scan2`). - default_enabled: AtomicBool, -} - -impl Clone for ScanV2Session { - fn clone(&self) -> Self { - Self { - default_enabled: AtomicBool::new(self.default_enabled()), - } - } -} - -impl ScanV2Session { - /// Whether scans expand through scan2 by default in this session. - pub fn default_enabled(&self) -> bool { - self.default_enabled.load(Ordering::Relaxed) - } - - /// Set the session default: scans that do not choose explicitly - /// expand through scan2 when `enabled`. - pub fn set_default_enabled(&self, enabled: bool) { - self.default_enabled.store(enabled, Ordering::Relaxed); - } -} - -impl Default for ScanV2Session { - fn default() -> Self { - Self { - default_enabled: AtomicBool::new(false), - } - } -} - -impl SessionVar for ScanV2Session { - fn as_any(&self) -> &dyn Any { - self - } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } -} - -/// Session accessor for the scan2 implementation switch. -pub trait ScanV2SessionExt: SessionExt { - /// The scan2 session variable. 
- fn scan_v2(&self) -> &ScanV2Session { - self.get::() - } -} - -impl ScanV2SessionExt for S {} From ff63cb5de72e6063e8af7b70f81575d4219a27ce Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Tue, 23 Jun 2026 23:52:36 -0400 Subject: [PATCH 35/48] Prepare scan v2 APIs for merge Signed-off-by: Nicholas Gates --- Cargo.lock | 2 +- benchmarks/datafusion-bench/src/lib.rs | 46 +-- .../internals/scan-scheduler.md | 94 ++--- vortex-bench/src/random_access/take.rs | 2 +- vortex-cxx/src/lib.rs | 5 +- vortex-cxx/src/read.rs | 5 +- vortex-datafusion/src/persistent/opener.rs | 66 ++- vortex-duckdb/Cargo.toml | 1 + vortex-duckdb/src/convert/table_filter.rs | 16 +- .../src/e2e_test/vortex_scan_test.rs | 26 ++ vortex-ffi/src/scan.rs | 4 +- vortex-file/src/multi/scan_v2.rs | 92 +---- vortex-file/src/scan_v1_v2_differential.rs | 185 ++++++++- vortex-file/src/segments/source.rs | 4 +- vortex-file/src/tests.rs | 9 +- vortex-jni/src/scan.rs | 4 +- vortex-layout/src/scan/plan/mod.rs | 2 +- vortex-layout/src/scan/scan_builder.rs | 6 +- vortex-layout/src/segments/scheduled.rs | 25 +- vortex-layout/src/segments/test.rs | 2 +- vortex-python/src/dataset.rs | 2 +- vortex-python/src/file.rs | 2 +- vortex-scan/Cargo.toml | 1 - vortex-scan/src/lib.rs | 4 - vortex-scan/src/scheduler.rs | 384 ++---------------- vortex-scan/src/selection.rs | 234 +++++++++-- vortex-scan/src/task.rs | 28 +- 27 files changed, 625 insertions(+), 626 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12a602ea325..b378d183700 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9902,6 +9902,7 @@ dependencies = [ "reqwest 0.13.4", "rstest", "static_assertions", + "temp-env", "tempfile", "tracing", "tracing-subscriber", @@ -10382,7 +10383,6 @@ dependencies = [ name = "vortex-scan" version = "0.1.0" dependencies = [ - "async-lock", "async-trait", "futures", "parking_lot", diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index d6026f25238..239c0a1b1ba 100644 --- 
a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -145,44 +145,30 @@ fn vortex_session_from_env() -> anyhow::Result { } fn scan_scheduler_config_from_env() -> anyhow::Result { - let config = std::env::var("VORTEX_SCAN_MAX_MORSEL_SLOTS") - .ok() - .map(|value| { - value - .parse::() - .map(ScanSchedulerConfig::morsel_slots) - .map_err(|e| anyhow::anyhow!("invalid scan scheduler slot count {value}: {e}")) - }) - .transpose()? - .unwrap_or_else(ScanSchedulerConfig::default_morsel_slots); - - let plan_window = std::env::var("VORTEX_SCAN_MORSEL_PLAN_WINDOW") - .ok() - .map(|value| { - value - .parse::() - .map_err(|e| anyhow::anyhow!("invalid scan scheduler plan window {value}: {e}")) - }) - .transpose()?; + if std::env::var_os("VORTEX_SCAN_MAX_MORSEL_SLOTS").is_some() { + anyhow::bail!( + "VORTEX_SCAN_MAX_MORSEL_SLOTS is no longer supported; use VORTEX_SCAN_MAX_READ_BYTES" + ); + } + if std::env::var_os("VORTEX_SCAN_MORSEL_PLAN_WINDOW").is_some() { + anyhow::bail!( + "VORTEX_SCAN_MORSEL_PLAN_WINDOW is no longer supported; V2 only exposes read-byte budgeting" + ); + } - let morsel_byte_budget = std::env::var("VORTEX_SCAN_MAX_MORSEL_BYTES") - .or_else(|_| std::env::var("VORTEX_SCAN_MAX_READ_BYTES")) + let read_byte_budget = std::env::var("VORTEX_SCAN_MAX_READ_BYTES") + .or_else(|_| std::env::var("VORTEX_SCAN_MAX_MORSEL_BYTES")) .ok() .map(|value| { value.parse::().map_err(|e| { - anyhow::anyhow!("invalid scan scheduler morsel byte budget {value}: {e}") + anyhow::anyhow!("invalid scan scheduler read byte budget {value}: {e}") }) }) .transpose()?; - let config = match plan_window { - Some(window) => config.with_morsel_plan_window(Some(window)), - None => config, - }; - - Ok(match morsel_byte_budget { - Some(bytes) => config.with_morsel_byte_budget(Some(bytes)), - None => config, + Ok(match read_byte_budget { + Some(bytes) => ScanSchedulerConfig::default().with_read_byte_budget(Some(bytes)), + None => ScanSchedulerConfig::default(), }) 
} diff --git a/docs/developer-guide/internals/scan-scheduler.md b/docs/developer-guide/internals/scan-scheduler.md index 06126bcaa1c..213d3c1c201 100644 --- a/docs/developer-guide/internals/scan-scheduler.md +++ b/docs/developer-guide/internals/scan-scheduler.md @@ -6,7 +6,7 @@ an implementation guide, not a design sketch. The scheduler is split across three layers: - `vortex-scan::scheduler` owns the process/query-level scheduler object, - scheduler provider, scan tickets, and coarse configuration. + scheduler provider, and read-byte budget configuration. - `vortex-file::multi::scan_v2` owns the per-partition ScanPlan runtime. It plans morsels, queues evidence/predicate/projection work, and decides which queued task is useful next. @@ -36,9 +36,6 @@ DataSourceRef::plan_morsel_partitions or DataSourceRef::scan ScanSchedulerProvider::scheduler_for_scan | v -ScanScheduler::register_scan -> ScanTicket - | - v partition_work_stream | +-- plan morsels into task queues @@ -64,15 +61,9 @@ the output frontier. ## Scheduler Objects -`ScanSchedulerConfig` currently has these fields: +`ScanSchedulerConfig` currently has one enforced field: -- `global_slots`: optional process/query-wide slot limit. -- `per_scan_slots`: optional slot limit for each registered scan. -- `morsel_plan_window`: optional number of morsels a partition stream may plan - ahead. `None` means all pending morsels may be planned. -- `morsel_launch_window`: optional number of morsels intended to run - concurrently. This is configured but not currently consumed by `scan_v2`. -- `morsel_byte_budget`: optional per-partition active logical segment-byte budget. +- `read_byte_budget`: optional per-partition active logical segment-byte budget. `ScanSchedulerProvider` chooses scheduler ownership: @@ -84,11 +75,11 @@ The default `VortexSession` provider is `Unbounded`. DuckDB installs a shared default scheduler in the extension session. 
The DataFusion benchmark only installs a scheduler when `VORTEX_SCAN_SCHEDULER` is set. -The `ScanScheduler::acquire` permit API exists and is tested, but V2 scan tasks -do not currently acquire permits before launching. In the current V2 runtime the -effective controls are the morsel planning window and the task queue morsel-byte -budget. Slot fields are still useful because `morsel_slots(n)` derives default -read-budgeted config, but the slots themselves are not yet an execution gate. +There is no scheduler permit API in the V2 runtime. Task launch is admitted by +the per-partition `ScanTaskQueue` using active logical read bytes. Limited scans +still plan one active morsel at a time internally because limit accounting must +not consume rows far ahead of the output frontier, but that is not a public +tuning knob. ## Planning Morsels @@ -99,7 +90,8 @@ read-budgeted config, but the slots themselves are not yet an execution gate. - `task_queue`: queued evidence, predicate, projection, and aggregate tasks. - `in_flight`: launched task futures. - `completed_morsels`: ordered-output buffer. -- `plan_window`: maximum active planned morsels for this partition stream. +- `plan_window`: internal active planned-morsel cap. This is unbounded for + normal scans and one for limited scans. On each stream poll, the runtime: @@ -133,7 +125,7 @@ Admission is not FIFO across all work. The queue tries groups in this order: 5. Projection ignoring group target. 6. Evidence ignoring group target. -All groups still obey the total morsel-byte budget unless the task contributes no +All groups still obey the total read-byte budget unless the task contributes no new bytes or the runtime has no launched work at all. The empty-in-flight escape hatch prevents deadlock when one task is larger than the configured budget. @@ -154,9 +146,9 @@ launched tasks and one projection is needed to keep an ordered stream moving. Evidence and predicate tasks are still admissible while projection is gated. 
This favors avoiding wasted projection I/O over maximizing object-store request depth. -## Morsel-Byte Budget +## Read-Byte Budget -`morsel_byte_budget` is per partition stream. It counts active logical segment +`read_byte_budget` is per partition stream. It counts active logical segment bytes for admitted tasks, deduped by `SegmentRequestKey`. If two launched tasks await the same segment, only the first contributes bytes; the active entry keeps a reference count until both tasks complete. @@ -176,7 +168,7 @@ unless it is the only way to make progress. The default bounded config uses: ```text -DEFAULT_MORSEL_BYTE_BUDGET = 256 MiB +DEFAULT_READ_BYTE_BUDGET = 256 MiB ``` `ScanSchedulerConfig::unbounded()` leaves this unset, which becomes `u64::MAX` @@ -200,9 +192,10 @@ The cache key is currently the logical `SegmentId`. That is sufficient inside on `ScanExecution` because each execution has one bound file segment source. It is not a cross-file or cross-scan cache key. -`SegmentInfo` includes `bytes` and `cacheable`. The task scheduler currently uses -`bytes` for read-budget admission. The `cacheable` flag is not part of task -admission policy. +`SegmentInfo` contains only logical payload `bytes`, which the task scheduler +uses for read-budget admission. Segment-cache policy is owned by the +`SegmentCacheSourceAdapter`; it is not expressed through scheduler-visible +segment metadata. ## Physical I/O @@ -246,8 +239,7 @@ scheduler preset: - DataFusion remote benchmarks create the `VortexSession` before registering the object store URL, so the Vortex scheduler provider cannot infer S3/GCS from the source URL. -- DuckDB uses a shared bounded scheduler by default, but `morsel_launch_window` - is not yet enforced by V2. +- DuckDB uses a shared scheduler with the default active read-byte budget. - DataFusion uses an unbounded scheduler unless benchmark environment variables opt into a scheduler. 
@@ -256,37 +248,25 @@ is failing to expose enough useful segment futures early enough, or exposing far too many tiny/sparse reads without a workload-specific budget. The important knobs are: -- `morsel_plan_window`: how far ahead segment futures are registered; -- `morsel_byte_budget`: how many active logical segment bytes may be polled; +- `read_byte_budget`: how many active logical segment bytes may be polled; - physical coalescing distance/max size on the object-store reader; - physical object-store request concurrency; - DataFusion output partition count, which controls how many partition streams run at once. -The hardcoded frontier slack of four morsels can also matter for remote storage. -Even with an unbounded plan window, queued task admission does not run arbitrarily -far ahead of the lowest queued morsel. If each morsel produces only a few small -range reads, this can under-fill a high-latency object store after the initial -registration burst. - ## Benchmark Knobs The DataFusion benchmark supports: ```text VORTEX_SCAN_SCHEDULER=unbounded|shared|per-query -VORTEX_SCAN_MAX_MORSEL_SLOTS=... -VORTEX_SCAN_MORSEL_PLAN_WINDOW=... -VORTEX_SCAN_MAX_MORSEL_BYTES=... +VORTEX_SCAN_MAX_READ_BYTES=... ``` -`VORTEX_SCAN_MAX_MORSEL_SLOTS` currently feeds `ScanSchedulerConfig::morsel_slots`. -Because V2 does not enforce launch permits yet, this mainly selects a bounded -config with the default morsel-byte budget unless paired with explicit -morsel-byte budget configuration. - -`VORTEX_SCAN_MAX_READ_BYTES` is accepted as a compatibility fallback for older -benchmark scripts. +`VORTEX_SCAN_MAX_MORSEL_BYTES` is accepted as a compatibility fallback for older +benchmark scripts. `VORTEX_SCAN_MAX_MORSEL_SLOTS` and +`VORTEX_SCAN_MORSEL_PLAN_WINDOW` are rejected because V2 no longer exposes +morsel-count scheduler knobs. Useful S3 sweeps should compare: @@ -296,17 +276,13 @@ VORTEX_SCAN_SCHEDULER=unbounded # Bounded read pressure, one scheduler per query. 
VORTEX_SCAN_SCHEDULER=per-query -VORTEX_SCAN_MAX_MORSEL_BYTES=268435456 +VORTEX_SCAN_MAX_READ_BYTES=268435456 # Larger remote-storage byte window. VORTEX_SCAN_SCHEDULER=per-query -VORTEX_SCAN_MAX_MORSEL_BYTES=1073741824 +VORTEX_SCAN_MAX_READ_BYTES=1073741824 ``` -The most useful remote sweep is fixed read budget with plan windows such as 16, -64, 256, and unset/unbounded. Small plan windows are expected to reduce -coalescing and hurt S3 unless they also avoid substantial over-read. - An active-logical-read target was tested as an I/O-depth proxy and rejected: it improved some FineWeb cases, but regressed local PolarSignals enough that it was too indirect to use as a scheduler knob. @@ -317,8 +293,8 @@ For local NVMe, keep the read budget moderate and rely on local filesystem coalescing. Excessive read-ahead can increase memory pressure without hiding much latency. -For S3/GCS, prefer a larger byte budget and a large or unbounded plan window so -the file segment source can see adjacent registered requests and coalesce them. +For S3/GCS, prefer a larger byte budget so the file segment source can keep more +useful logical reads active and coalesce adjacent registered requests. If a query is highly selective and projection reads are sparse, validate the coalesced-byte metrics before increasing the object-store coalescing max size. If dynamic predicates are active, also compare projection-gated behavior against @@ -333,18 +309,14 @@ Use scan metrics to separate three failure modes: segment bytes. The scheduler today cannot distinguish those automatically. The next practical -tuning step is to expose plan-window control in the benchmark and, separately, -enforce `morsel_launch_window` with scheduler permits so slot configuration -matches runtime behavior. +tuning step is to expose byte-based controls for physical object-store +coalescing/request pressure if logical read-byte budgeting is not enough. 
## Known Gaps -- `morsel_launch_window`, `global_slots`, and `per_scan_slots` are not enforced - by `scan_v2` task launch. -- The benchmark can configure scheduler mode, plan window, and byte budget, but - not physical object-store coalescing or request concurrency. +- The benchmark can configure scheduler mode and read-byte budget, but not + physical object-store coalescing or request concurrency. - There is no automatic object-store scheduler preset. - The scan runtime accounts logical segment bytes, not physical coalesced bytes. -- `SegmentInfo::cacheable` is not used by task admission. - Output Arrow conversion is outside the scan task queue and has separate buffering in the DataFusion adapter. diff --git a/vortex-bench/src/random_access/take.rs b/vortex-bench/src/random_access/take.rs index e86e5d576d6..42d6ebb4924 100644 --- a/vortex-bench/src/random_access/take.rs +++ b/vortex-bench/src/random_access/take.rs @@ -76,7 +76,7 @@ impl RandomAccessor for VortexRandomAccessor { let array = self .file .scan()? - .with_row_indices(indices_buf) + .with_row_indices(indices_buf)? .into_array_stream()? 
.read_all() .await?; diff --git a/vortex-cxx/src/lib.rs b/vortex-cxx/src/lib.rs index b6d002513f6..a7535e66027 100644 --- a/vortex-cxx/src/lib.rs +++ b/vortex-cxx/src/lib.rs @@ -95,7 +95,10 @@ mod ffi { fn with_projection(self: &mut VortexScanBuilder, projection: Box); fn with_projection_ref(self: &mut VortexScanBuilder, projection: &Expr); fn with_row_range(self: &mut VortexScanBuilder, row_range_start: u64, row_range_end: u64); - fn with_include_by_index(self: &mut VortexScanBuilder, include_by_index: &[u64]); + fn with_include_by_index( + self: &mut VortexScanBuilder, + include_by_index: &[u64], + ) -> Result<()>; fn with_limit(self: &mut VortexScanBuilder, limit: usize); unsafe fn with_output_schema( self: &mut VortexScanBuilder, diff --git a/vortex-cxx/src/read.rs b/vortex-cxx/src/read.rs index 4ce229b7a2b..be071958d0b 100644 --- a/vortex-cxx/src/read.rs +++ b/vortex-cxx/src/read.rs @@ -91,9 +91,10 @@ impl VortexScanBuilder { }); } - pub(crate) fn with_include_by_index(&mut self, include_by_index: &[u64]) { - let selection = Selection::IncludeByIndex(Buffer::copy_from(include_by_index)); + pub(crate) fn with_include_by_index(&mut self, include_by_index: &[u64]) -> Result<()> { + let selection = Selection::include_by_index(Buffer::copy_from(include_by_index))?; take_mut::take(&mut self.inner, |inner| inner.with_selection(selection)); + Ok(()) } pub(crate) fn with_limit(&mut self, limit: usize) { diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 0c68f20a848..29232546ee9 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -1472,9 +1472,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::IncludeByIndex( + VortexAccessPlan::default().with_selection(Selection::include_by_index( Buffer::from_iter(vec![1, 3, 5, 7]), - )), + )?), ); let opener 
= make_test_opener( @@ -1516,9 +1516,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::ExcludeByIndex( + VortexAccessPlan::default().with_selection(Selection::exclude_by_index( Buffer::from_iter(vec![0, 2, 4, 6, 8]), - )), + )?), ); let opener = make_test_opener( @@ -1579,6 +1579,64 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_scan_impl_flip_flop_v1_v2() -> anyhow::Result<()> { + let object_store = Arc::new(InMemory::new()) as Arc; + let file_path = "/path/file.vortex"; + + let batch = make_test_batch_with_10_rows(); + let data_size = + write_arrow_to_vortex(Arc::clone(&object_store), file_path, batch.clone()).await?; + + let schema = batch.schema(); + let mut file = PartitionedFile::new(file_path.to_string(), data_size); + file.extensions + .insert( + VortexAccessPlan::default().with_selection(Selection::include_by_index( + Buffer::from_iter(vec![1, 3, 5, 7, 9]), + )?), + ); + + let mut opener_v1 = make_test_opener( + Arc::clone(&object_store), + Arc::clone(&schema), + ProjectionExprs::from_indices(&[0, 1], &schema), + ); + opener_v1.scan_v2 = false; + opener_v1.limit = Some(3); + opener_v1.has_output_ordering = true; + + let mut opener_v2 = opener_v1.clone(); + opener_v2.scan_v2 = true; + + let v1 = opener_v1 + .open(file.clone())? + .await? 
+ .try_collect::>() + .await?; + let v2 = opener_v2.open(file)?.await?.try_collect::>().await?; + + let format_opts = FormatOptions::new().with_types_info(true); + let v1_pretty = pretty_format_batches_with_options(&v1, &format_opts)?.to_string(); + let v2_pretty = pretty_format_batches_with_options(&v2, &format_opts)?.to_string(); + + assert_eq!(v1_pretty, v2_pretty); + assert_eq!( + v1_pretty, + r"+-------+------+ +| a | b | +| Int32 | Utf8 | ++-------+------+ +| 1 | r1 | +| 3 | r3 | +| 5 | r5 | ++-------+------+" + .trim() + ); + + Ok(()) + } + #[tokio::test] // Test that when no extensions are provided, all rows are returned (backward compatibility). async fn test_selection_no_extensions() -> anyhow::Result<()> { diff --git a/vortex-duckdb/Cargo.toml b/vortex-duckdb/Cargo.toml index b6896910a89..9cfa3f118e8 100644 --- a/vortex-duckdb/Cargo.toml +++ b/vortex-duckdb/Cargo.toml @@ -48,6 +48,7 @@ anyhow = { workspace = true } geo-types = { workspace = true } jiff = { workspace = true } rstest = { workspace = true } +temp-env = { workspace = true } tempfile = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } vortex-runend = { workspace = true } diff --git a/vortex-duckdb/src/convert/table_filter.rs b/vortex-duckdb/src/convert/table_filter.rs index fa7ee26e744..2c8be999e04 100644 --- a/vortex-duckdb/src/convert/table_filter.rs +++ b/vortex-duckdb/src/convert/table_filter.rs @@ -166,6 +166,12 @@ fn intersect_sorted(left: &[u64], right: &[u64]) -> Vec { result } +fn normalize_indices(mut indices: Vec) -> Vec { + indices.sort_unstable(); + indices.dedup(); + indices +} + /// For constant comparison on IN filters over file_index or file_row_number /// virtual column, create a selection and a range covering the same range as /// expressions do. 
@@ -178,7 +184,10 @@ pub fn try_from_virtual_column_filter( .iter() .map(nonnegative_number_from_value) .collect::>>()?; - Ok((Selection::IncludeByIndex(Buffer::from_iter(indices)), None)) + Ok(( + Selection::include_by_index(Buffer::from_iter(normalize_indices(indices)))?, + None, + )) } TableFilterClass::ConstantComparison(const_) => { let n = nonnegative_number_from_value(const_.value)?; @@ -206,7 +215,7 @@ pub fn try_from_virtual_column_filter( let (sel, range) = try_from_virtual_column_filter(child)?; if let Selection::IncludeByIndex(buf) = sel { indices = Some(match indices { - None => buf.iter().copied().collect(), + None => buf.as_slice().to_vec(), Some(existing) => intersect_sorted(&existing, buf.as_ref()), }); } @@ -217,7 +226,8 @@ pub fn try_from_virtual_column_filter( } let range = (start < end).then_some(start..end); let sel = indices - .map(|v| Selection::IncludeByIndex(Buffer::from_iter(v))) + .map(|v| Selection::include_by_index(Buffer::from_iter(v))) + .transpose()? .unwrap_or(Selection::All); Ok((sel, range)) } diff --git a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs index 6cc28483571..cecd2271c48 100644 --- a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs +++ b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs @@ -237,6 +237,32 @@ fn test_vortex_scan_integers() { assert_eq!(sum, 138); } +#[test] +fn test_vortex_scan_impl_flip_flop_env() { + let file = RUNTIME.block_on(async { + let numbers = buffer![1i32, 42, 100, -5, 0]; + write_single_column_vortex_file("number", numbers).await + }); + let file_path = file.path().to_string_lossy(); + let query = format!("SELECT SUM(number) FROM '{file_path}' WHERE number >= 0 LIMIT 3"); + + let scan = |scan_impl| { + temp_env::with_var("VORTEX_SCAN_IMPL", Some(scan_impl), || { + let conn = database_connection(); + let result = conn.query(&query).unwrap(); + let mut chunk = result.into_iter().next().unwrap(); + let len = chunk.len().as_(); + let vec = 
chunk.get_vector_mut(0); + i64::from_duckdb_value(&mut unsafe { vec.as_slice_mut::(len) }[0]) + }) + }; + + let v1 = scan("v1"); + let v2 = scan("v2"); + assert_eq!(v1, v2); + assert_eq!(v1, 143); +} + #[test] fn test_vortex_scan_integers_in_list() { let file = RUNTIME.block_on(async { diff --git a/vortex-ffi/src/scan.rs b/vortex-ffi/src/scan.rs index 6452d0e232b..5fb624d97cf 100644 --- a/vortex-ffi/src/scan.rs +++ b/vortex-ffi/src/scan.rs @@ -177,13 +177,13 @@ fn scan_request(opts: *const vx_scan_options) -> VortexResult { vortex_ensure!(!selection.idx.is_null()); let buf = unsafe { slice::from_raw_parts(selection.idx, selection.idx_len) }; let buf = Buffer::copy_from(buf); - Selection::IncludeByIndex(buf) + Selection::include_by_index(buf)? } vx_scan_selection_include::VX_SELECTION_EXCLUDE_RANGE => { vortex_ensure!(!selection.idx.is_null()); let buf = unsafe { slice::from_raw_parts(selection.idx, selection.idx_len) }; let buf = Buffer::copy_from(buf); - Selection::ExcludeByIndex(buf) + Selection::exclude_by_index(buf)? 
} }; diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index f04a5b29581..181d031c307 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -98,7 +98,6 @@ use vortex_scan::ScanMeta; use vortex_scan::ScanRequest as DataSourceScanRequest; use vortex_scan::ScanScheduler; use vortex_scan::ScanSchedulerSessionExt; -use vortex_scan::ScanTicket; use vortex_scan::read::ReadResults; use vortex_scan::read::ReadStore; use vortex_scan::read::ReadStoreRef; @@ -431,7 +430,7 @@ fn scalar_precision_to_value(precision: Precision) -> Precision VortexResult { if builder.glob_sources.is_empty() { @@ -669,7 +668,6 @@ impl DataSource for ScanPlanDataSource { .clone() .unwrap_or_else(|| self.session.scan_scheduler_provider()); let scheduler = provider.scheduler_for_scan(&meta); - let ticket = scheduler.register_scan(meta); let mut planned_files = Vec::new(); let mut total_morsels = 0usize; @@ -683,7 +681,7 @@ impl DataSource for ScanPlanDataSource { .clone() .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; let prepared = Arc::new(PreparedScanPlan::try_new(&file, &request)?); - let execution = Arc::new(ScanExecution::try_new(file, prepared, &ticket, None)?); + let execution = Arc::new(ScanExecution::try_new(file, prepared, None)?); let ranges = execution.splits(&row_range)?; if ranges.is_empty() { continue; @@ -709,17 +707,13 @@ impl DataSource for ScanPlanDataSource { } } - let morsel_plan_window = morsel_plan_window(&scheduler, false); - let morsel_byte_budget = morsel_byte_budget(&scheduler); + let read_byte_budget = read_byte_budget(&scheduler); Ok(Some(Arc::new(PlannedScanPlanScan { dtype, partitions, - scheduler, - ticket, handle: self.session.handle(), - morsel_plan_window, - morsel_byte_budget, + read_byte_budget, }))) } @@ -732,7 +726,6 @@ impl DataSource for ScanPlanDataSource { .clone() .unwrap_or_else(|| self.session.scan_scheduler_provider()); let scheduler = provider.scheduler_for_scan(&meta); 
- let ticket = scheduler.register_scan(meta); let mut ready = VecDeque::new(); let mut deferred = VecDeque::new(); @@ -757,7 +750,6 @@ impl DataSource for ScanPlanDataSource { handle: self.session.handle(), concurrency: self.concurrency, scheduler, - ticket, limit_remaining, })) } @@ -816,7 +808,6 @@ struct ScanPlanDataSourceScan { handle: Handle, concurrency: usize, scheduler: Arc, - ticket: ScanTicket, limit_remaining: Option>, } @@ -843,7 +834,6 @@ impl DataSourceScan for ScanPlanDataSourceScan { handle, concurrency, scheduler, - ticket, limit_remaining, } = *self; @@ -888,12 +878,11 @@ impl DataSourceScan for ScanPlanDataSourceScan { .filter_map(move |file_result| { let request = request.clone(); let scheduler = Arc::clone(&scheduler); - let ticket = ticket.clone(); let limit_remaining = limit_remaining.clone(); async move { match file_result { Ok((index, file)) => { - file_partition(index, file, request, scheduler, ticket, limit_remaining) + file_partition(index, file, request, scheduler, limit_remaining) .transpose() } Err(error) => Some(Err(error)), @@ -909,7 +898,6 @@ fn file_partition( file: VortexFile, request: DataSourceScanRequest, scheduler: Arc, - ticket: ScanTicket, limit_remaining: Option>, ) -> VortexResult> { let Some(request) = file_scan_request(partition_idx, &file, request)? else { @@ -927,7 +915,6 @@ fn file_partition( row_range, index: partition_idx, scheduler, - ticket, limit_remaining, }))) } @@ -945,11 +932,9 @@ pub(crate) fn scan_plan_file_stream( .clone() .unwrap_or_else(|| file.session().scan_scheduler_provider()); let scheduler = provider.scheduler_for_scan(&meta); - let ticket = scheduler.register_scan(meta); let limit_remaining = request.limit.map(AtomicU64::new).map(Arc::new); - let Some(partition) = file_partition(0, file, request, scheduler, ticket, limit_remaining)? - else { + let Some(partition) = file_partition(0, file, request, scheduler, limit_remaining)? 
else { return Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( dtype, stream::empty(), @@ -1510,29 +1495,20 @@ struct PartitionWorkSchedulerState { plan_window: usize, } -fn morsel_plan_window(scheduler: &ScanScheduler, limited: bool) -> usize { - if limited { - return 1; - } - - scheduler - .config() - .morsel_plan_window() - .unwrap_or(usize::MAX) +fn plan_window_for_limit(limited: bool) -> usize { + if limited { 1 } else { usize::MAX } } -fn morsel_byte_budget(scheduler: &ScanScheduler) -> u64 { - scheduler.config().morsel_byte_budget().unwrap_or(u64::MAX) +fn read_byte_budget(scheduler: &ScanScheduler) -> u64 { + scheduler.config().read_byte_budget().unwrap_or(u64::MAX) } fn partition_work_stream( morsels: Vec, - _scheduler: Arc, - _ticket: ScanTicket, handle: Handle, ordered: bool, plan_window: usize, - morsel_byte_budget: u64, + read_byte_budget: u64, ) -> impl futures::Stream> + Send + 'static { let has_dynamic_predicates = morsels .iter() @@ -1542,7 +1518,7 @@ fn partition_work_stream( morsel_count = morsels.len(), ordered, plan_window, - morsel_byte_budget, + read_byte_budget, has_dynamic_predicates, "created scan2 task stream" ); @@ -1554,7 +1530,7 @@ fn partition_work_stream( in_flight_projection_tasks: 0, next_morsel_id: 0, next_emit_morsel_id: 0, - task_queue: ScanTaskQueue::new(morsel_byte_budget), + task_queue: ScanTaskQueue::new(read_byte_budget), in_flight: FuturesUnordered::new(), read_store: Arc::new(ReadStore::new()), completed_morsels: BTreeMap::new(), @@ -2267,7 +2243,6 @@ struct ScanPlanPartition { row_range: Range, index: usize, scheduler: Arc, - ticket: ScanTicket, limit_remaining: Option>, } @@ -2306,22 +2281,16 @@ impl Partition for ScanPlanPartition { row_range, index: _, scheduler, - ticket, limit_remaining, } = *self; - let execution = Arc::new(ScanExecution::try_new( - file, - prepared, - &ticket, - limit_remaining, - )?); + let execution = Arc::new(ScanExecution::try_new(file, prepared, limit_remaining)?); let handle = 
execution.session.handle(); let dtype = execution.plan.dtype().clone(); let ranges = execution.splits(&row_range)?; let ordered = execution.plan.ordered(); - let plan_window = morsel_plan_window(&scheduler, execution.limit_remaining.is_some()); - let morsel_byte_budget = morsel_byte_budget(&scheduler); + let plan_window = plan_window_for_limit(execution.limit_remaining.is_some()); + let read_byte_budget = read_byte_budget(&scheduler); let morsels = ranges .into_iter() .map(|range| PlannedScanPlanMorsel { @@ -2330,15 +2299,7 @@ impl Partition for ScanPlanPartition { }) .collect::>(); - let stream = partition_work_stream( - morsels, - scheduler, - ticket, - handle, - ordered, - plan_window, - morsel_byte_budget, - ); + let stream = partition_work_stream(morsels, handle, ordered, plan_window, read_byte_budget); Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( dtype, stream, @@ -2349,11 +2310,8 @@ impl Partition for ScanPlanPartition { struct PlannedScanPlanScan { dtype: DType, partitions: Vec>, - scheduler: Arc, - ticket: ScanTicket, handle: Handle, - morsel_plan_window: usize, - morsel_byte_budget: u64, + read_byte_budget: u64, } #[derive(Clone)] @@ -2426,18 +2384,9 @@ impl Partition for PlannedScanPlanPartition { let PlannedScanPlanPartition { planned, index } = *self; let morsels = planned.partitions[index].clone(); let dtype = planned.dtype.clone(); - let scheduler = Arc::clone(&planned.scheduler); - let ticket = planned.ticket.clone(); let handle = planned.handle.clone(); - let stream = partition_work_stream( - morsels, - scheduler, - ticket, - handle, - false, - planned.morsel_plan_window, - planned.morsel_byte_budget, - ); + let stream = + partition_work_stream(morsels, handle, false, usize::MAX, planned.read_byte_budget); Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( dtype, stream, @@ -2602,7 +2551,6 @@ impl ScanExecution { fn try_new( file: VortexFile, plan: Arc, - _ticket: &ScanTicket, limit_remaining: Option>, ) -> VortexResult { let session = 
file.session().clone(); diff --git a/vortex-file/src/scan_v1_v2_differential.rs b/vortex-file/src/scan_v1_v2_differential.rs index 3837076f511..6207dc51d5a 100644 --- a/vortex-file/src/scan_v1_v2_differential.rs +++ b/vortex-file/src/scan_v1_v2_differential.rs @@ -15,12 +15,20 @@ // regression tests; single-char names are clearest here. #![allow(clippy::many_single_char_names)] +use std::collections::BTreeMap; +use std::sync::Arc; use std::sync::LazyLock; +use async_trait::async_trait; +use futures::StreamExt; +use futures::TryStreamExt; +use futures::stream; +use futures::stream::BoxStream; use rstest::rstest; use vortex_array::ArrayRef; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ChunkedArray; use vortex_array::arrays::PrimitiveArray; @@ -39,18 +47,29 @@ use vortex_array::expr::pack; use vortex_array::expr::root; use vortex_array::expr::select; use vortex_array::stats::PRUNING_STATS; +use vortex_array::stream::ArrayStreamAdapter; use vortex_array::stream::ArrayStreamExt; use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; use vortex_buffer::ByteBufferMut; use vortex_buffer::buffer; use vortex_error::VortexResult; +use vortex_io::VortexReadAt; +use vortex_io::filesystem::FileListing; +use vortex_io::filesystem::FileSystem; +use vortex_io::filesystem::FileSystemRef; use vortex_layout::layouts::row_idx::row_idx; +use vortex_scan::DataSourceRef; use vortex_scan::ScanRequest; +use vortex_scan::selection::Selection; use vortex_session::VortexSession; use crate::OpenOptionsSessionExt; use crate::VortexFile; use crate::WriteOptionsSessionExt; +use crate::multi::MultiFileDataSource; +use crate::multi::scan_v2::build_scan_plan_data_source; static SESSION: LazyLock = LazyLock::new(crate::tests::new_test_session); @@ -79,10 +98,15 @@ async fn scan_v1(file: &VortexFile, request: &ScanRequest) -> 
VortexResult) -> ScanRequest { } } +async fn write_part(array: ArrayRef) -> VortexResult { + let mut buf = ByteBufferMut::empty(); + SESSION + .write_options() + .write(&mut buf, array.to_array_stream()) + .await?; + Ok(buf.freeze()) +} + +#[derive(Debug)] +struct MemoryFileSystem { + files: BTreeMap, +} + +#[async_trait] +impl FileSystem for MemoryFileSystem { + fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { + let listings = self + .files + .iter() + .filter_map(move |(path, bytes)| { + path.starts_with(prefix).then_some(Ok(FileListing { + path: path.clone(), + size: Some(bytes.len() as u64), + })) + }) + .collect::>(); + stream::iter(listings).boxed() + } + + async fn head(&self, path: &str) -> VortexResult> { + Ok(self.files.get(path).map(|bytes| FileListing { + path: path.to_string(), + size: Some(bytes.len() as u64), + })) + } + + async fn open_read(&self, path: &str) -> VortexResult> { + self.files + .get(path) + .cloned() + .map(|bytes| Arc::new(bytes) as Arc) + .ok_or_else(|| vortex_error::vortex_err!("missing test file {path}")) + } + + async fn delete(&self, _path: &str) -> VortexResult<()> { + Ok(()) + } +} + +async fn scan_data_source(source: DataSourceRef, request: ScanRequest) -> VortexResult { + let scan = source.scan(request).await?; + let dtype = scan.dtype().clone(); + let stream = scan + .partitions() + .then(|partition| async move { partition?.execute() }) + .try_flatten() + .boxed(); + ArrayStreamAdapter::new(dtype, stream).read_all().await +} + +fn sorted_i32_values(array: ArrayRef) -> VortexResult> { + let mut ctx = SESSION.create_execution_ctx(); + let primitive = array.execute::(&mut ctx)?; + let mut values = primitive + .with_iterator(|iter| iter.map(|value| value.copied()).collect::>>()) + .ok_or_else(|| { + vortex_error::vortex_err!("unordered differential values must be non-null") + })?; + values.sort_unstable(); + Ok(values) +} + // ---- Fixtures ---- /// Flat primitive column, both nullable and non-nullable variants. 
@@ -272,6 +369,92 @@ async fn differential_filter_numbers(#[case] array: ArrayRef) -> VortexResult<() assert_v1_eq_v2(&file, request(root(), Some(filter))).await } +#[tokio::test] +async fn differential_row_range() -> VortexResult<()> { + let file = write_file(chunked(), false).await?; + let scan_request = ScanRequest { + row_range: Some(2..8), + ..request(root(), None) + }; + assert_v1_eq_v2(&file, scan_request).await +} + +#[tokio::test] +async fn differential_include_selection() -> VortexResult<()> { + let file = write_file(chunked(), false).await?; + let scan_request = ScanRequest { + selection: Selection::include_by_index(Buffer::from_iter([0, 2, 5, 9]))?, + ..request(root(), None) + }; + assert_v1_eq_v2(&file, scan_request).await +} + +#[tokio::test] +async fn differential_exclude_selection() -> VortexResult<()> { + let file = write_file(chunked(), false).await?; + let scan_request = ScanRequest { + selection: Selection::exclude_by_index(Buffer::from_iter([1, 4, 7]))?, + ..request(root(), None) + }; + assert_v1_eq_v2(&file, scan_request).await +} + +#[tokio::test] +async fn differential_limit() -> VortexResult<()> { + let file = write_file(chunked(), false).await?; + let scan_request = ScanRequest { + limit: Some(5), + ..request(root(), None) + }; + assert_v1_eq_v2(&file, scan_request).await +} + +#[tokio::test] +async fn differential_unordered_multi_file_partition_selection() -> VortexResult<()> { + let request = ScanRequest { + projection: get_item("numbers", root()), + row_range: Some(1..4), + selection: Selection::exclude_by_index(Buffer::from_iter([2]))?, + partition_selection: Selection::include_by_index(Buffer::from_iter([0, 2]))?, + ordered: false, + ..Default::default() + }; + + let parts = [ + ("part-0.vortex", buffer![0i32, 1, 2, 3, 4].into_array()), + ("part-1.vortex", buffer![10i32, 11, 12, 13, 14].into_array()), + ("part-2.vortex", buffer![20i32, 21, 22, 23, 24].into_array()), + ]; + let files = BTreeMap::from_iter( + 
futures::future::try_join_all(parts.into_iter().map(|(path, numbers)| async move { + let array = StructArray::from_fields(&[("numbers", numbers)])?.into_array(); + Ok::<_, vortex_error::VortexError>((path.to_string(), write_part(array).await?)) + })) + .await?, + ); + let fs: FileSystemRef = Arc::new(MemoryFileSystem { files }); + + let v1_source: DataSourceRef = Arc::new( + MultiFileDataSource::new(SESSION.clone()) + .with_glob("part-*.vortex", Some(Arc::clone(&fs))) + .build() + .await?, + ); + let v1 = scan_data_source(v1_source, request.clone()).await?; + + let v2_source: DataSourceRef = Arc::new( + build_scan_plan_data_source( + MultiFileDataSource::new(SESSION.clone()).with_glob("part-*.vortex", Some(fs)), + ) + .await?, + ); + let v2 = scan_data_source(v2_source, request).await?; + + assert_eq!(sorted_i32_values(v1)?, vec![1, 3, 21, 23]); + assert_eq!(sorted_i32_values(v2)?, vec![1, 3, 21, 23]); + Ok(()) +} + #[tokio::test] async fn differential_dict_filter() -> VortexResult<()> { let file = write_file(dict_encoded(), false).await?; diff --git a/vortex-file/src/segments/source.rs b/vortex-file/src/segments/source.rs index bcac295efb8..c048a5cb9b3 100644 --- a/vortex-file/src/segments/source.rs +++ b/vortex-file/src/segments/source.rs @@ -167,7 +167,7 @@ impl SegmentSource for FileSegmentSource { fn segment_info(&self, id: SegmentId) -> VortexResult { self.segments .get(*id as usize) - .map(|spec| SegmentInfo::cacheable(u64::from(spec.length))) + .map(|spec| SegmentInfo::new(u64::from(spec.length))) .ok_or_else(|| vortex_err!("Missing segment: {}", id)) } @@ -315,7 +315,7 @@ impl SegmentSource for BufferSegmentSource { fn segment_info(&self, id: SegmentId) -> VortexResult { self.segments .get(*id as usize) - .map(|spec| SegmentInfo::cacheable(u64::from(spec.length))) + .map(|spec| SegmentInfo::new(u64::from(spec.length))) .ok_or_else(|| vortex_err!("Missing segment: {}", id)) } diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 
f583cd7e254..824b46ca957 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -960,6 +960,7 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices(Buffer::::empty()) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -977,6 +978,7 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices(Buffer::from_iter(kept_indices)) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1002,6 +1004,7 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices((0u64..500).collect::>()) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1047,6 +1050,7 @@ async fn test_with_indices_on_two_columns() { .scan() .unwrap() .with_row_indices(Buffer::from_iter(kept_indices)) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1104,6 +1108,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices(Buffer::empty()) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1122,6 +1127,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices(Buffer::from_iter(kept_indices)) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1150,6 +1156,7 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices((0..500).collect::>()) + .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1452,7 +1459,7 @@ async fn file_take() -> VortexResult<()> { let vxf = chunked_file().await?; let result = vxf .scan()? - .with_row_indices(buffer![0, 1, 8]) + .with_row_indices(buffer![0, 1, 8])? .into_array_stream()? 
.read_all() .await?; diff --git a/vortex-jni/src/scan.rs b/vortex-jni/src/scan.rs index 360d92aa989..ba320add6e3 100644 --- a/vortex-jni/src/scan.rs +++ b/vortex-jni/src/scan.rs @@ -96,8 +96,8 @@ fn build_scan_request( let selection = match selection_include { 0 => Selection::All, - 1 => Selection::IncludeByIndex(Buffer::copy_from(selection_idx)), - 2 => Selection::ExcludeByIndex(Buffer::copy_from(selection_idx)), + 1 => Selection::include_by_index(Buffer::copy_from(selection_idx))?, + 2 => Selection::exclude_by_index(Buffer::copy_from(selection_idx))?, 3 => Selection::IncludeRoaring(deserialize_roaring_selection(selection_roaring_bitmap)?), 4 => Selection::ExcludeRoaring(deserialize_roaring_selection(selection_roaring_bitmap)?), other => vortex_bail!("unknown selection include code: {other}"), diff --git a/vortex-layout/src/scan/plan/mod.rs b/vortex-layout/src/scan/plan/mod.rs index eff61c27c0e..06c0d919a92 100644 --- a/vortex-layout/src/scan/plan/mod.rs +++ b/vortex-layout/src/scan/plan/mod.rs @@ -1844,7 +1844,7 @@ mod tests { &self, _id: crate::segments::SegmentId, ) -> VortexResult { - Ok(crate::segments::SegmentInfo::non_cacheable(0)) + Ok(crate::segments::SegmentInfo::new(0)) } fn request(&self, _id: crate::segments::SegmentId) -> crate::segments::SegmentFuture { diff --git a/vortex-layout/src/scan/scan_builder.rs b/vortex-layout/src/scan/scan_builder.rs index 11fd5c7b882..224a9e8783c 100644 --- a/vortex-layout/src/scan/scan_builder.rs +++ b/vortex-layout/src/scan/scan_builder.rs @@ -173,9 +173,9 @@ impl ScanBuilder { } /// Select rows by absolute indices relative to the scan input. - pub fn with_row_indices(mut self, row_indices: Buffer) -> Self { - self.selection = Selection::IncludeByIndex(row_indices); - self + pub fn with_row_indices(mut self, row_indices: Buffer) -> VortexResult { + self.selection = Selection::include_by_index(row_indices)?; + Ok(self) } /// Set the root row offset used by row-index expressions. 
diff --git a/vortex-layout/src/segments/scheduled.rs b/vortex-layout/src/segments/scheduled.rs index 7021a6e5f84..7fdb975b07c 100644 --- a/vortex-layout/src/segments/scheduled.rs +++ b/vortex-layout/src/segments/scheduled.rs @@ -35,25 +35,12 @@ use crate::segments::SegmentSource; pub struct SegmentInfo { /// Number of bytes in the logical segment payload. pub bytes: u64, - /// Whether this segment is eligible for segment-cache lookup and admission. - pub cacheable: bool, } impl SegmentInfo { - /// Create cacheable metadata for a segment with `bytes` payload bytes. - pub fn cacheable(bytes: u64) -> Self { - Self { - bytes, - cacheable: true, - } - } - - /// Create non-cacheable metadata for a segment with `bytes` payload bytes. - pub fn non_cacheable(bytes: u64) -> Self { - Self { - bytes, - cacheable: false, - } + /// Create metadata for a segment with `bytes` payload bytes. + pub fn new(bytes: u64) -> Self { + Self { bytes } } } @@ -529,7 +516,7 @@ mod tests { #[test] fn register_segment_reads_dedupes_exact_segments() -> VortexResult<()> { - let source = Arc::new(CountingSegmentSource::new(SegmentInfo::cacheable(8))); + let source = Arc::new(CountingSegmentSource::new(SegmentInfo::new(8))); let segment_source: Arc = Arc::::clone(&source); let ctx = SegmentPlanCtx::new(segment_source, VortexSession::empty()); let request = ctx.request_for_segment(SegmentId::from(0))?; @@ -547,7 +534,7 @@ mod tests { #[test] fn register_segment_reads_registers_each_miss() -> VortexResult<()> { - let source = Arc::new(CountingMissSegmentSource::new(SegmentInfo::cacheable(8))); + let source = Arc::new(CountingMissSegmentSource::new(SegmentInfo::new(8))); let segment_source: Arc = Arc::::clone(&source); let ctx = SegmentPlanCtx::new(segment_source, VortexSession::empty()); @@ -566,7 +553,7 @@ mod tests { #[test] fn segment_future_cache_reuses_prefetched_segment() -> VortexResult<()> { - let source = Arc::new(CountingSegmentSource::new(SegmentInfo::cacheable(8))); + let source = 
Arc::new(CountingSegmentSource::new(SegmentInfo::new(8))); let segment_source: Arc = Arc::::clone(&source); let ctx = SegmentPlanCtx::new(Arc::clone(&segment_source), VortexSession::empty()); let request = ctx.request_for_segment(SegmentId::from(0))?; diff --git a/vortex-layout/src/segments/test.rs b/vortex-layout/src/segments/test.rs index 9d49996f704..cbbaf62efe5 100644 --- a/vortex-layout/src/segments/test.rs +++ b/vortex-layout/src/segments/test.rs @@ -31,7 +31,7 @@ impl SegmentSource for TestSegments { self.segments .lock() .get(*id as usize) - .map(|segment| SegmentInfo::non_cacheable(segment.len() as u64)) + .map(|segment| SegmentInfo::new(segment.len() as u64)) .ok_or_else(|| vortex_err!("Segment not found")) } diff --git a/vortex-python/src/dataset.rs b/vortex-python/src/dataset.rs index acb8285e5a0..da2d8f018ad 100644 --- a/vortex-python/src/dataset.rs +++ b/vortex-python/src/dataset.rs @@ -66,7 +66,7 @@ pub fn read_array_from_reader( if let Some(indices) = indices { let primitive = indices.execute::(ctx)?; let indices = primitive.into_buffer(); - scan = scan.with_row_indices(indices); + scan = scan.with_row_indices(indices)?; } if let Some((l, r)) = row_range { diff --git a/vortex-python/src/file.rs b/vortex-python/src/file.rs index f6bd1ed9cda..65327b1e967 100644 --- a/vortex-python/src/file.rs +++ b/vortex-python/src/file.rs @@ -220,7 +220,7 @@ fn scan_builder( if let Some(indices) = indices { let casted = indices.cast(DType::Primitive(PType::U64, NonNullable))?; let indices = casted.execute::(ctx)?.into_buffer::(); - builder = builder.with_row_indices(indices); + builder = builder.with_row_indices(indices)?; } if let Some(batch_size) = batch_size { diff --git a/vortex-scan/Cargo.toml b/vortex-scan/Cargo.toml index 5c0cf465f5e..3f587020606 100644 --- a/vortex-scan/Cargo.toml +++ b/vortex-scan/Cargo.toml @@ -24,7 +24,6 @@ vortex-mask = { workspace = true } vortex-session = { workspace = true } vortex-utils = { workspace = true, features = ["dashmap"] } 
-async-lock = { workspace = true } async-trait = { workspace = true } futures = { workspace = true } parking_lot = { workspace = true } diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index e6bbe2699de..70c24e45ab0 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -40,10 +40,6 @@ pub use scheduler::ScanSchedulerConfig; pub use scheduler::ScanSchedulerProvider; pub use scheduler::ScanSchedulerSession; pub use scheduler::ScanSchedulerSessionExt; -pub use scheduler::ScanTicket; -pub use scheduler::ScanWorkClass; -pub use scheduler::WorkPermit; -pub use scheduler::WorkRequest; use selection::Selection; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; diff --git a/vortex-scan/src/scheduler.rs b/vortex-scan/src/scheduler.rs index 4fbfc6a51f6..82e2e97f37c 100644 --- a/vortex-scan/src/scheduler.rs +++ b/vortex-scan/src/scheduler.rs @@ -1,148 +1,71 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Coarse-grained resource scheduling for scans. +//! Shared scan scheduling configuration. //! -//! The scheduler deliberately starts with one primitive: a slot permit. The ScanPlan runtime -//! uses one slot per in-flight morsel, which is enough to preserve the existing scan concurrency -//! model while giving integrations a shared object they can use to bound concurrent work across -//! scans. +//! The V2 scan runtime currently enforces one scheduler knob: an active logical read-byte budget +//! per partition stream. Scheduler instances still provide shared configuration, +//! but they do not expose a separate morsel-slot permit API. 
use std::any::Any; use std::fmt; use std::sync::Arc; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; -use async_lock::Semaphore; -use async_lock::SemaphoreGuardArc; -use vortex_error::VortexResult; -use vortex_error::vortex_bail; use vortex_session::SessionExt; use vortex_session::SessionVar; use vortex_session::VortexSession; -use vortex_utils::parallelism::get_available_parallelism; -const DEFAULT_MORSEL_CONCURRENCY_FACTOR: usize = 4; -const DEFAULT_MORSEL_BYTE_BUDGET: u64 = 256 * 1024 * 1024; +const DEFAULT_READ_BYTE_BUDGET: u64 = 256 * 1024 * 1024; /// Configuration for a [`ScanScheduler`]. #[derive(Clone, Debug, PartialEq, Eq)] pub struct ScanSchedulerConfig { - global_slots: Option, - per_scan_slots: Option, - morsel_plan_window: Option, - morsel_launch_window: Option, - morsel_byte_budget: Option, + read_byte_budget: Option, } impl ScanSchedulerConfig { /// Create an unbounded scheduler configuration. pub fn unbounded() -> Self { Self { - global_slots: None, - per_scan_slots: None, - morsel_plan_window: None, - morsel_launch_window: None, - morsel_byte_budget: None, + read_byte_budget: None, } } - /// Create a scheduler configuration with the same morsel-slot limit globally and per scan. - /// - /// Morsel execution remains bounded by `slots`, but planning is unbounded by default so - /// segment futures can be registered ahead of execution. - pub fn morsel_slots(slots: usize) -> Self { - let slots = slots.max(1); + /// Create a scheduler configuration with the default active read-byte budget. + pub fn default_read_byte_budget() -> Self { Self { - global_slots: Some(slots), - per_scan_slots: Some(slots), - morsel_plan_window: None, - morsel_launch_window: Some(slots), - morsel_byte_budget: Some(DEFAULT_MORSEL_BYTE_BUDGET), + read_byte_budget: Some(DEFAULT_READ_BYTE_BUDGET), } } - /// Return a copy with the maximum number of morsels allowed to be planned ahead per scan. 
- /// - /// `None` means the scan may plan all morsels ahead of execution. - pub fn with_morsel_plan_window(mut self, window: Option) -> Self { - self.morsel_plan_window = window.map(|window| window.max(1)); - self - } - - /// Return a copy with the maximum number of morsels allowed to run concurrently per scan. - pub fn with_morsel_launch_window(mut self, window: Option) -> Self { - self.morsel_launch_window = window.map(|window| window.max(1)); - self - } - - /// Return a copy with the maximum number of logical segment bytes allowed in flight per scan. + /// Return a copy with the maximum number of logical read bytes allowed in flight per partition. /// /// `None` means scan task launch is not capped by bytes. - pub fn with_morsel_byte_budget(mut self, bytes: Option) -> Self { - self.morsel_byte_budget = bytes.map(|bytes| bytes.max(1)); + pub fn with_read_byte_budget(mut self, bytes: Option) -> Self { + self.read_byte_budget = bytes.map(|bytes| bytes.max(1)); self } - /// Create a scheduler configuration matching the current unordered scan concurrency factor. - pub fn default_morsel_slots() -> Self { - Self::morsel_slots(default_morsel_slots()) - } - /// Configuration used by the DuckDB integration by default. pub fn duckdb_default() -> Self { - Self::default_morsel_slots() - } - - /// Returns the configured global slot limit. - pub fn global_slots(&self) -> Option { - self.global_slots + Self::default_read_byte_budget() } - /// Returns the configured per-scan slot limit. - pub fn per_scan_slots(&self) -> Option { - self.per_scan_slots - } - - /// Returns the configured per-scan morsel planning window. - /// - /// `None` means planning is unbounded. - pub fn morsel_plan_window(&self) -> Option { - self.morsel_plan_window - } - - /// Returns the configured per-scan morsel launch window. - pub fn morsel_launch_window(&self) -> Option { - self.morsel_launch_window - } - - /// Returns the configured per-scan logical segment byte budget. 
- pub fn morsel_byte_budget(&self) -> Option { - self.morsel_byte_budget + /// Returns the configured per-partition active logical read-byte budget. + pub fn read_byte_budget(&self) -> Option { + self.read_byte_budget } } impl Default for ScanSchedulerConfig { fn default() -> Self { - Self::default_morsel_slots() + Self::default_read_byte_budget() } } -/// Returns the default number of morsel slots for unordered scans. -pub fn default_morsel_slots() -> usize { - get_available_parallelism() - .unwrap_or(1) - .saturating_mul(DEFAULT_MORSEL_CONCURRENCY_FACTOR) - .max(1) -} - -/// Shared scheduler that admits scan work using coarse slot permits. +/// Shared scheduler configuration for scan work. pub struct ScanScheduler { config: ScanSchedulerConfig, - global_slots: Option>, - next_scan_id: AtomicU64, } impl fmt::Debug for ScanScheduler { @@ -156,14 +79,7 @@ impl fmt::Debug for ScanScheduler { impl ScanScheduler { /// Create a scheduler from a configuration. pub fn new(config: ScanSchedulerConfig) -> Self { - let global_slots = config - .global_slots - .map(|slots| Arc::new(Semaphore::new(slots))); - Self { - config, - global_slots, - next_scan_id: AtomicU64::new(0), - } + Self { config } } /// Create an unbounded scheduler. @@ -175,76 +91,6 @@ impl ScanScheduler { pub fn config(&self) -> &ScanSchedulerConfig { &self.config } - - /// Register a logical scan and return a ticket used for future permit acquisition. - pub fn register_scan(&self, _meta: ScanMeta) -> ScanTicket { - let id = self.next_scan_id.fetch_add(1, Ordering::Relaxed); - ScanTicket { - id, - cancelled: Arc::new(AtomicBool::new(false)), - per_scan_slots: self - .config - .per_scan_slots - .map(|slots| Arc::new(Semaphore::new(slots))), - per_scan_slot_limit: self.config.per_scan_slots, - } - } - - /// Acquire permits for one unit of scan work. 
- pub async fn acquire( - &self, - ticket: &ScanTicket, - request: WorkRequest, - ) -> VortexResult { - if ticket.is_cancelled() { - vortex_bail!("scan {} was cancelled", ticket.id()); - } - - let slots = usize::try_from(request.slots) - .map_err(|_| vortex_error::vortex_err!("scan work slot count exceeds usize"))?; - if slots == 0 { - vortex_bail!("scan work must request at least one scheduler slot"); - } - if slots != 1 { - vortex_bail!("the MVP scan scheduler only supports one slot per work request"); - } - if let Some(limit) = ticket.per_scan_slot_limit - && slots > limit - { - vortex_bail!( - "scan work requested {} slots, exceeding per-scan limit {}", - slots, - limit - ); - } - if let Some(limit) = self.config.global_slots - && slots > limit - { - vortex_bail!( - "scan work requested {} slots, exceeding global limit {}", - slots, - limit - ); - } - - let mut guards = Vec::with_capacity(slots.saturating_mul(2)); - if let Some(per_scan_slots) = &ticket.per_scan_slots { - for _ in 0..slots { - guards.push(per_scan_slots.acquire_arc().await); - } - } - if let Some(global_slots) = &self.global_slots { - for _ in 0..slots { - guards.push(global_slots.acquire_arc().await); - } - } - - if ticket.is_cancelled() { - vortex_bail!("scan {} was cancelled", ticket.id()); - } - - Ok(WorkPermit { _guards: guards }) - } } /// Scheduler ownership strategy. @@ -254,7 +100,7 @@ pub enum ScanSchedulerProvider { Shared(Arc), /// Create a new scheduler whenever a logical scan starts. PerScan(ScanSchedulerConfig), - /// Do not bound scan work through scheduler permits. + /// Do not bound scan work through scheduler configuration. #[default] Unbounded, } @@ -270,197 +116,13 @@ impl ScanSchedulerProvider { } } -/// Metadata for a logical scan registered with a [`ScanScheduler`]. +/// Metadata for resolving a logical scan scheduler. #[derive(Clone, Debug, Default)] pub struct ScanMeta { /// Optional label used for diagnostics and future metrics. 
pub label: Option, } -/// A logical scan registered with a scheduler. -#[derive(Clone)] -pub struct ScanTicket { - id: u64, - cancelled: Arc, - per_scan_slots: Option>, - per_scan_slot_limit: Option, -} - -impl fmt::Debug for ScanTicket { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("ScanTicket") - .field("id", &self.id) - .field("cancelled", &self.is_cancelled()) - .field("per_scan_slot_limit", &self.per_scan_slot_limit) - .finish_non_exhaustive() - } -} - -impl ScanTicket { - /// Return this ticket's scheduler-local scan id. - pub fn id(&self) -> u64 { - self.id - } - - /// Cancel the scan. - pub fn cancel(&self) { - self.cancelled.store(true, Ordering::Release); - } - - /// Return whether the scan has been cancelled. - pub fn is_cancelled(&self) -> bool { - self.cancelled.load(Ordering::Acquire) - } -} - -/// A request to acquire scheduler slots for one scan work item. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub struct WorkRequest { - /// The class of scan work requesting admission. - pub class: ScanWorkClass, - /// Number of slots requested. - /// - /// The MVP scheduler only accepts `1`. - pub slots: u32, -} - -impl WorkRequest { - /// Create a request for one morsel execution slot. - pub fn morsel() -> Self { - Self { - class: ScanWorkClass::Morsel, - slots: 1, - } - } -} - -/// Coarse scan work classes understood by the MVP scheduler. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ScanWorkClass { - /// File-open work. - FileOpen, - /// Morsel execution work. - Morsel, - /// Output conversion work. - OutputConversion, -} - -/// RAII scheduler permit. -/// -/// Dropping this value releases every scheduler slot acquired for a work item. 
-pub struct WorkPermit { - _guards: Vec, -} - -impl fmt::Debug for WorkPermit { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("WorkPermit") - .field("slot_count", &self._guards.len()) - .finish() - } -} - -#[cfg(test)] -mod tests { - use std::pin::pin; - - use futures::FutureExt; - use futures::executor::block_on; - - use super::*; - - #[test] - fn permit_release_unblocks_waiting_work() -> VortexResult<()> { - block_on(async { - let scheduler = ScanScheduler::new(ScanSchedulerConfig::morsel_slots(1)); - let ticket = scheduler.register_scan(ScanMeta::default()); - let permit = scheduler.acquire(&ticket, WorkRequest::morsel()).await?; - - let waiting = scheduler.acquire(&ticket, WorkRequest::morsel()); - let mut waiting = pin!(waiting); - assert!(waiting.as_mut().now_or_never().is_none()); - - drop(permit); - let _permit = waiting.await?; - Ok(()) - }) - } - - #[test] - fn cancelled_ticket_rejects_new_work() -> VortexResult<()> { - block_on(async { - let scheduler = ScanScheduler::new(ScanSchedulerConfig::morsel_slots(1)); - let ticket = scheduler.register_scan(ScanMeta::default()); - ticket.cancel(); - - assert!( - scheduler - .acquire(&ticket, WorkRequest::morsel()) - .await - .is_err() - ); - Ok(()) - }) - } - - #[test] - fn cancelled_waiter_releases_acquired_permit() -> VortexResult<()> { - block_on(async { - let scheduler = ScanScheduler::new(ScanSchedulerConfig::morsel_slots(1)); - let ticket = scheduler.register_scan(ScanMeta::default()); - let permit = scheduler.acquire(&ticket, WorkRequest::morsel()).await?; - - let waiting = scheduler.acquire(&ticket, WorkRequest::morsel()); - let mut waiting = pin!(waiting); - assert!(waiting.as_mut().now_or_never().is_none()); - - ticket.cancel(); - drop(permit); - assert!(waiting.await.is_err()); - - let next_ticket = scheduler.register_scan(ScanMeta::default()); - let _permit = scheduler - .acquire(&next_ticket, WorkRequest::morsel()) - .await?; - Ok(()) - }) - } - - #[test] - fn 
invalid_slot_counts_are_rejected() -> VortexResult<()> { - block_on(async { - let scheduler = ScanScheduler::new(ScanSchedulerConfig::morsel_slots(1)); - let ticket = scheduler.register_scan(ScanMeta::default()); - - assert!( - scheduler - .acquire( - &ticket, - WorkRequest { - class: ScanWorkClass::Morsel, - slots: 0, - }, - ) - .await - .is_err() - ); - assert!( - scheduler - .acquire( - &ticket, - WorkRequest { - class: ScanWorkClass::Morsel, - slots: 2, - }, - ) - .await - .is_err() - ); - Ok(()) - }) - } -} - /// Session state for scan scheduler configuration. #[derive(Clone, Debug)] pub struct ScanSchedulerSession { @@ -520,7 +182,7 @@ pub trait ScanSchedulerSessionExt: SessionExt { builder.build() } - /// Configure this session to run scans without scheduler limits. + /// Configure this session to run scans without scheduler read-byte limits. fn with_unbounded_scan_scheduler(self) -> VortexSession { let mut builder = self.session().to_builder(); builder.get_mut::().provider = diff --git a/vortex-scan/src/selection.rs b/vortex-scan/src/selection.rs index 79abafc3d4d..d17af0b950d 100644 --- a/vortex-scan/src/selection.rs +++ b/vortex-scan/src/selection.rs @@ -7,11 +7,99 @@ use std::ops::Not; use std::ops::Range; use vortex_buffer::Buffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; use vortex_error::vortex_panic; use vortex_mask::Mask; use crate::row_mask::RowMask; +/// A validated selection of rows to include by absolute row index. +#[derive(Clone, Debug)] +pub struct IncludeByIndex { + indices: Buffer, +} + +impl IncludeByIndex { + /// Create a new include-by-index selection. + pub fn try_new(indices: Buffer) -> VortexResult { + validate_indices(&indices)?; + Ok(Self { indices }) + } + + /// Return the selected row indices. + pub fn as_slice(&self) -> &[u64] { + self.indices.as_slice() + } + + /// Return true if the selection contains no row indices. 
+ pub fn is_empty(&self) -> bool { + self.indices.is_empty() + } + + /// Return the number of selected row indices. + pub fn len(&self) -> usize { + self.indices.len() + } +} + +impl std::ops::Deref for IncludeByIndex { + type Target = [u64]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl AsRef<[u64]> for IncludeByIndex { + fn as_ref(&self) -> &[u64] { + self.as_slice() + } +} + +/// A validated selection of rows to exclude by absolute row index. +#[derive(Clone, Debug)] +pub struct ExcludeByIndex { + indices: Buffer, +} + +impl ExcludeByIndex { + /// Create a new exclude-by-index selection. + pub fn try_new(indices: Buffer) -> VortexResult { + validate_indices(&indices)?; + Ok(Self { indices }) + } + + /// Return the excluded row indices. + pub fn as_slice(&self) -> &[u64] { + self.indices.as_slice() + } + + /// Return true if the selection contains no row indices. + pub fn is_empty(&self) -> bool { + self.indices.is_empty() + } + + /// Return the number of excluded row indices. + pub fn len(&self) -> usize { + self.indices.len() + } +} + +impl std::ops::Deref for ExcludeByIndex { + type Target = [u64]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl AsRef<[u64]> for ExcludeByIndex { + fn as_ref(&self) -> &[u64] { + self.as_slice() + } +} + /// A selection identifies a set of rows to include in the scan (in addition to applying any /// filter predicates). #[derive(Default, Clone, Debug)] @@ -19,10 +107,10 @@ pub enum Selection { /// No selection, all rows are included. #[default] All, - /// A selection of sorted rows to include by index. - IncludeByIndex(Buffer), - /// A selection of sorted rows to exclude by index. - ExcludeByIndex(Buffer), + /// A selection of sorted, unique rows to include by index. + IncludeByIndex(IncludeByIndex), + /// A selection of sorted, unique rows to exclude by index. + ExcludeByIndex(ExcludeByIndex), /// A selection of rows to include using a [`roaring::RoaringTreemap`]. 
IncludeRoaring(roaring::RoaringTreemap), /// A selection of rows to exclude using a [`roaring::RoaringTreemap`]. @@ -30,6 +118,16 @@ pub enum Selection { } impl Selection { + /// Create a selection of rows to include by absolute row index. + pub fn include_by_index(indices: Buffer) -> VortexResult { + Ok(Self::IncludeByIndex(IncludeByIndex::try_new(indices)?)) + } + + /// Create a selection of rows to exclude by absolute row index. + pub fn exclude_by_index(indices: Buffer) -> VortexResult { + Ok(Self::ExcludeByIndex(ExcludeByIndex::try_new(indices)?)) + } + /// Return the row count for this selection. pub fn row_count(&self, total_rows: u64) -> u64 { match self { @@ -62,12 +160,12 @@ impl Selection { match self { Selection::All => RowMask::new(range.start, Mask::new_true(range_len)), Selection::IncludeByIndex(include) => { - let mask = indices_range(range, include) + let indices = include.as_slice(); + let mask = indices_range(range, indices) .map(|idx_range| { Mask::from_indices( range_len, - include - .slice(idx_range) + indices[idx_range] .iter() .map(|idx| { idx.checked_sub(range.start).unwrap_or_else(|| { @@ -89,10 +187,26 @@ impl Selection { RowMask::new(range.start, mask) } Selection::ExcludeByIndex(exclude) => { - let mask = Selection::IncludeByIndex(exclude.clone()) - .row_mask(range) - .mask() - .clone(); + let indices = exclude.as_slice(); + let mask = indices_range(range, indices) + .map(|idx_range| { + Mask::from_indices( + range_len, + indices[idx_range] + .iter() + .map(|idx| { + idx.checked_sub(range.start).unwrap_or_else(|| { + vortex_panic!( + "index underflow, range: {:?}, idx: {:?}", + range, + idx + ) + }) + }) + .filter_map(|idx| usize::try_from(idx).ok()), + ) + }) + .unwrap_or_else(|| Mask::new_false(range_len)); RowMask::new(range.start, mask.not()) } Selection::IncludeRoaring(roaring) => { @@ -156,6 +270,24 @@ impl Selection { } } +fn validate_indices(indices: &[u64]) -> VortexResult<()> { + // Row-mask extraction uses binary search over 
these indices, and row_count treats + // them as set membership. Unsorted or duplicate input can otherwise silently + // mis-select rows or over-report the selected row count. + for (idx, window) in indices.windows(2).enumerate() { + if window[0] >= window[1] { + vortex_bail!( + "row index selection must be strictly increasing at positions {} and {}: {} >= {}", + idx, + idx + 1, + window[0], + window[1] + ); + } + } + Ok(()) +} + /// Find the positional range within row_indices that covers all rows in the given range. fn indices_range(range: &Range, row_indices: &[u64]) -> Option> { if row_indices.first().is_some_and(|&first| first >= range.end) @@ -177,9 +309,39 @@ fn indices_range(range: &Range, row_indices: &[u64]) -> Option mod tests { use vortex_buffer::Buffer; + use super::Selection; + + fn include(indices: impl IntoIterator) -> Selection { + Selection::include_by_index(Buffer::from_iter(indices)) + .expect("test indices should be strictly increasing") + } + + fn exclude(indices: impl IntoIterator) -> Selection { + Selection::exclude_by_index(Buffer::from_iter(indices)) + .expect("test indices should be strictly increasing") + } + + #[test] + fn include_by_index_rejects_unsorted_indices() { + let err = Selection::include_by_index(Buffer::from_iter([3, 1])).unwrap_err(); + assert!(err.to_string().contains("strictly increasing")); + } + + #[test] + fn include_by_index_rejects_duplicate_indices() { + let err = Selection::include_by_index(Buffer::from_iter([1, 1])).unwrap_err(); + assert!(err.to_string().contains("strictly increasing")); + } + + #[test] + fn exclude_by_index_rejects_unsorted_indices() { + let err = Selection::exclude_by_index(Buffer::from_iter([3, 1])).unwrap_err(); + assert!(err.to_string().contains("strictly increasing")); + } + #[test] fn test_row_mask_all() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); + let selection = include([1, 3, 5, 7]); let range = 1..8; let row_mask = 
selection.row_mask(&range); @@ -188,7 +350,7 @@ mod tests { #[test] fn test_row_mask_slice() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); + let selection = include([1, 3, 5, 7]); let range = 3..6; let row_mask = selection.row_mask(&range); @@ -197,7 +359,7 @@ mod tests { #[test] fn test_row_mask_exclusive() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); + let selection = include([1, 3, 5, 7]); let range = 3..5; let row_mask = selection.row_mask(&range); @@ -206,7 +368,7 @@ mod tests { #[test] fn test_row_mask_all_false() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); + let selection = include([1, 3, 5, 7]); let range = 8..10; let row_mask = selection.row_mask(&range); @@ -215,7 +377,7 @@ mod tests { #[test] fn test_row_mask_all_true() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 4, 5, 6])); + let selection = include([1, 3, 4, 5, 6]); let range = 3..7; let row_mask = selection.row_mask(&range); @@ -224,7 +386,7 @@ mod tests { #[test] fn test_row_mask_zero() { - let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![0])); + let selection = include([0]); let range = 0..5; let row_mask = selection.row_mask(&range); @@ -244,7 +406,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 1..8; let row_mask = selection.row_mask(&range); @@ -259,7 +421,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 3..6; let row_mask = selection.row_mask(&range); @@ -274,7 +436,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = 
Selection::IncludeRoaring(roaring); let range = 8..10; let row_mask = selection.row_mask(&range); @@ -289,7 +451,7 @@ mod tests { roaring.insert(i); } - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 1000..2000; let row_mask = selection.row_mask(&range); @@ -304,7 +466,7 @@ mod tests { roaring.insert(3); roaring.insert(5); - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 0..7; let row_mask = selection.row_mask(&range); @@ -320,7 +482,7 @@ mod tests { roaring.insert(i); } - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 10..20; let row_mask = selection.row_mask(&range); @@ -333,7 +495,7 @@ mod tests { roaring.insert(100); roaring.insert(101); - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 0..10; let row_mask = selection.row_mask(&range); @@ -349,7 +511,7 @@ mod tests { roaring.insert(7); roaring.insert(15); // Outside range - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 5..10; let row_mask = selection.row_mask(&range); @@ -360,7 +522,7 @@ mod tests { #[test] fn test_roaring_include_empty() { let roaring = RoaringTreemap::new(); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -370,7 +532,7 @@ mod tests { #[test] fn test_roaring_exclude_empty() { let roaring = RoaringTreemap::new(); - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -383,7 +545,7 @@ mod tests { 
roaring.insert(0); roaring.insert(99); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -397,7 +559,7 @@ mod tests { roaring.insert_range(10..20); roaring.insert_range(30..40); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = 15..35; let row_mask = selection.row_mask(&range); @@ -413,7 +575,7 @@ mod tests { roaring.insert(u64::MAX - 1); roaring.insert(u64::MAX); - let selection = super::super::Selection::IncludeRoaring(roaring); + let selection = Selection::IncludeRoaring(roaring); let range = u64::MAX - 10..u64::MAX; let row_mask = selection.row_mask(&range); @@ -426,7 +588,7 @@ mod tests { let mut roaring = RoaringTreemap::new(); roaring.insert(u64::MAX - 1); - let selection = super::super::Selection::ExcludeRoaring(roaring); + let selection = Selection::ExcludeRoaring(roaring); let range = u64::MAX - 10..u64::MAX; let row_mask = selection.row_mask(&range); @@ -439,14 +601,13 @@ mod tests { // Test that RoaringTreemap and Buffer produce same results let indices = vec![1, 3, 5, 7, 9]; - let buffer_selection = - super::super::Selection::IncludeByIndex(Buffer::from_iter(indices.clone())); + let buffer_selection = include(indices.clone()); let mut roaring = RoaringTreemap::new(); for idx in &indices { roaring.insert(*idx); } - let roaring_selection = super::super::Selection::IncludeRoaring(roaring); + let roaring_selection = Selection::IncludeRoaring(roaring); let range = 0..12; let buffer_mask = buffer_selection.row_mask(&range); @@ -463,14 +624,13 @@ mod tests { // Test that ExcludeRoaring and ExcludeByIndex produce same results let indices = vec![2, 4, 6, 8]; - let buffer_selection = - super::super::Selection::ExcludeByIndex(Buffer::from_iter(indices.clone())); + let buffer_selection = exclude(indices.clone()); let mut roaring = 
RoaringTreemap::new(); for idx in &indices { roaring.insert(*idx); } - let roaring_selection = super::super::Selection::ExcludeRoaring(roaring); + let roaring_selection = Selection::ExcludeRoaring(roaring); let range = 0..10; let buffer_mask = buffer_selection.row_mask(&range); diff --git a/vortex-scan/src/task.rs b/vortex-scan/src/task.rs index 14a2b612673..f6c8f00936a 100644 --- a/vortex-scan/src/task.rs +++ b/vortex-scan/src/task.rs @@ -391,20 +391,20 @@ pub struct ScanTaskQueue { evidence_queues: BTreeMap<(u32, u32), VecDeque>>, predicate_queues: BTreeMap>>, projection_queue: VecDeque>, - morsel_byte_budget: u64, + read_byte_budget: u64, active_read_bytes: u64, active_group_read_bytes: [u64; 3], active_reads: HashMap, } impl ScanTaskQueue { - /// Create an empty task queue with an in-flight morsel byte budget. - pub fn new(morsel_byte_budget: u64) -> Self { + /// Create an empty task queue with an in-flight logical read-byte budget. + pub fn new(read_byte_budget: u64) -> Self { Self { evidence_queues: BTreeMap::new(), predicate_queues: BTreeMap::new(), projection_queue: VecDeque::new(), - morsel_byte_budget, + read_byte_budget, active_read_bytes: 0, active_group_read_bytes: [0; 3], active_reads: HashMap::new(), @@ -565,7 +565,7 @@ impl ScanTaskQueue { let active_reads = &self.active_reads; let active_read_bytes = self.active_read_bytes; - let morsel_byte_budget = self.morsel_byte_budget; + let read_byte_budget = self.read_byte_budget; match group { ScanTaskGroup::Predicate => { @@ -583,7 +583,7 @@ impl ScanTaskQueue { ); if !can_admit_task( active_read_bytes, - morsel_byte_budget, + read_byte_budget, in_flight_empty, score.incremental_read_bytes, ) || (enforce_target @@ -610,7 +610,7 @@ impl ScanTaskQueue { ); if !can_admit_task( active_read_bytes, - morsel_byte_budget, + read_byte_budget, in_flight_empty, score.incremental_read_bytes, ) || (enforce_target @@ -636,7 +636,7 @@ impl ScanTaskQueue { ); if !can_admit_task( active_read_bytes, - morsel_byte_budget, + 
read_byte_budget, in_flight_empty, score.incremental_read_bytes, ) || (enforce_target @@ -673,15 +673,15 @@ impl ScanTaskQueue { } fn group_target_bytes(&self, group: ScanTaskGroup) -> u64 { - if self.morsel_byte_budget == u64::MAX { + if self.read_byte_budget == u64::MAX { return u64::MAX; } - let projection = (self.morsel_byte_budget / 8).max(1); - let evidence = (self.morsel_byte_budget / 8).max(1); + let projection = (self.read_byte_budget / 8).max(1); + let evidence = (self.read_byte_budget / 8).max(1); match group { ScanTaskGroup::Predicate => self - .morsel_byte_budget + .read_byte_budget .saturating_sub(projection) .saturating_sub(evidence) .max(1), @@ -797,12 +797,12 @@ fn drop_dead_heads_from_map( fn can_admit_task( active_read_bytes: u64, - morsel_byte_budget: u64, + read_byte_budget: u64, in_flight_empty: bool, incremental_read_bytes: u64, ) -> bool { incremental_read_bytes == 0 - || active_read_bytes.saturating_add(incremental_read_bytes) <= morsel_byte_budget + || active_read_bytes.saturating_add(incremental_read_bytes) <= read_byte_budget || in_flight_empty } From 0cc14212651183e50807d10286716bb8e8e8bc3d Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 08:14:09 -0400 Subject: [PATCH 36/48] Clean up DataFusion scan config API Signed-off-by: Nicholas Gates --- benchmarks/datafusion-bench/src/lib.rs | 21 +++++-------------- .../internals/scan-scheduler.md | 5 ----- vortex-datafusion/src/persistent/format.rs | 14 ++++--------- 3 files changed, 9 insertions(+), 31 deletions(-) diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs index 239c0a1b1ba..5c38b9f2b1d 100644 --- a/benchmarks/datafusion-bench/src/lib.rs +++ b/benchmarks/datafusion-bench/src/lib.rs @@ -49,10 +49,11 @@ pub fn get_session_context() -> SessionContext { .build_arc() .expect("could not build runtime environment"); - let factory = VortexFormatFactory::new_with_options( - vortex_session_from_env().expect("invalid Vortex benchmark 
scan scheduler env"), - vortex_table_options(), - ); + let factory = VortexFormatFactory::new() + .with_session( + vortex_session_from_env().expect("invalid Vortex benchmark scan scheduler env"), + ) + .with_options(vortex_table_options()); let mut session_state_builder = SessionStateBuilder::new() .with_config(SessionConfig::from_env().expect("shouldn't fail")) @@ -145,19 +146,7 @@ fn vortex_session_from_env() -> anyhow::Result { } fn scan_scheduler_config_from_env() -> anyhow::Result { - if std::env::var_os("VORTEX_SCAN_MAX_MORSEL_SLOTS").is_some() { - anyhow::bail!( - "VORTEX_SCAN_MAX_MORSEL_SLOTS is no longer supported; use VORTEX_SCAN_MAX_READ_BYTES" - ); - } - if std::env::var_os("VORTEX_SCAN_MORSEL_PLAN_WINDOW").is_some() { - anyhow::bail!( - "VORTEX_SCAN_MORSEL_PLAN_WINDOW is no longer supported; V2 only exposes read-byte budgeting" - ); - } - let read_byte_budget = std::env::var("VORTEX_SCAN_MAX_READ_BYTES") - .or_else(|_| std::env::var("VORTEX_SCAN_MAX_MORSEL_BYTES")) .ok() .map(|value| { value.parse::().map_err(|e| { diff --git a/docs/developer-guide/internals/scan-scheduler.md b/docs/developer-guide/internals/scan-scheduler.md index 213d3c1c201..00f7a35906a 100644 --- a/docs/developer-guide/internals/scan-scheduler.md +++ b/docs/developer-guide/internals/scan-scheduler.md @@ -263,11 +263,6 @@ VORTEX_SCAN_SCHEDULER=unbounded|shared|per-query VORTEX_SCAN_MAX_READ_BYTES=... ``` -`VORTEX_SCAN_MAX_MORSEL_BYTES` is accepted as a compatibility fallback for older -benchmark scripts. `VORTEX_SCAN_MAX_MORSEL_SLOTS` and -`VORTEX_SCAN_MORSEL_PLAN_WINDOW` are rejected because V2 no longer exposes -morsel-count scheduler knobs. 
- Useful S3 sweeps should compare: ```text diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index 9b1791b6d41..a181a8c64db 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -251,16 +251,10 @@ impl VortexFormatFactory { } } - /// Creates a factory with an explicit session and default options. - /// - /// The supplied options become the baseline for every [`VortexFormat`] - /// created by this factory. DataFusion may still override them with - /// table-level options passed into [`FileFormatFactory::create`]. - pub fn new_with_options(session: VortexSession, options: VortexTableOptions) -> Self { - Self { - session, - options: Some(options), - } + /// Overrides the [`VortexSession`] used by formats created from this factory. + pub fn with_session(mut self, session: VortexSession) -> Self { + self.session = session; + self } /// Overrides the default options for this factory. From 5376adf41d2c05097414aa977a900cd9b01d8439 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 08:24:08 -0400 Subject: [PATCH 37/48] Split layout v2 vtables into layouts_v2 Signed-off-by: Nicholas Gates --- vortex-layout/src/layout_v2.rs | 498 +------------------ vortex-layout/src/layouts_v2/chunked.rs | 175 +++++++ vortex-layout/src/layouts_v2/dict.rs | 84 ++++ vortex-layout/src/layouts_v2/flat.rs | 86 ++++ vortex-layout/src/layouts_v2/mod.rs | 10 + vortex-layout/src/layouts_v2/struct_.rs | 79 +++ vortex-layout/src/layouts_v2/zoned.rs | 159 ++++++ vortex-layout/src/lib.rs | 1 + vortex-layout/src/scan/v2/layouts/chunked.rs | 2 +- vortex-layout/src/scan/v2/layouts/dict.rs | 2 +- vortex-layout/src/scan/v2/layouts/flat.rs | 2 +- vortex-layout/src/scan/v2/layouts/struct_.rs | 2 +- vortex-layout/src/scan/v2/layouts/zoned.rs | 2 +- vortex-layout/src/session.rs | 32 +- 14 files changed, 627 insertions(+), 507 deletions(-) create mode 100644 
vortex-layout/src/layouts_v2/chunked.rs create mode 100644 vortex-layout/src/layouts_v2/dict.rs create mode 100644 vortex-layout/src/layouts_v2/flat.rs create mode 100644 vortex-layout/src/layouts_v2/mod.rs create mode 100644 vortex-layout/src/layouts_v2/struct_.rs create mode 100644 vortex-layout/src/layouts_v2/zoned.rs diff --git a/vortex-layout/src/layout_v2.rs b/vortex-layout/src/layout_v2.rs index 246120a1e41..680f1b9c8a9 100644 --- a/vortex-layout/src/layout_v2.rs +++ b/vortex-layout/src/layout_v2.rs @@ -13,16 +13,9 @@ use flatbuffers::Follow; use flatbuffers::VerifierOptions; use flatbuffers::root_with_opts; use once_cell::sync::OnceCell; -use vortex_array::DeserializeMetadata; -use vortex_array::EmptyMetadata; -use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::dtype::PType; -use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use vortex_error::vortex_bail; -use vortex_error::vortex_ensure; use vortex_error::vortex_err; use vortex_flatbuffers::FlatBuffer; use vortex_flatbuffers::layout; @@ -32,19 +25,8 @@ use vortex_session::registry::Registry; use crate::LayoutChildType; use crate::LayoutId; -use crate::layouts::zoned::LegacyStatsMetadata; -use crate::layouts::zoned::ZoneMapSchema; -use crate::layouts::zoned::ZonedMetadata; -use crate::layouts::zoned::aggregate_fns_from_specs; -use crate::layouts::zoned::aggregate_stats_table_dtype; -use crate::layouts::zoned::legacy_stats_table_dtype; use crate::scan::plan::ScanPlanRef; use crate::scan::plan::request::ScanRequest; -use crate::scan::v2::layouts::chunked as scan_chunked; -use crate::scan::v2::layouts::dict as scan_dict; -use crate::scan::v2::layouts::flat as scan_flat; -use crate::scan::v2::layouts::struct_ as scan_struct; -use crate::scan::v2::layouts::zoned as scan_zoned; use crate::segments::SegmentId; /// A reference-counted, type-erased v2 layout. 
@@ -646,11 +628,17 @@ fn layout_from_fb_layout( }) } -fn metadata_bool_field(metadata: &[u8], field_number: u64) -> VortexResult> { +pub(crate) fn metadata_bool_field( + metadata: &[u8], + field_number: u64, +) -> VortexResult> { Ok(metadata_varint_field(metadata, field_number)?.map(|value| value != 0)) } -fn metadata_varint_field(metadata: &[u8], field_number: u64) -> VortexResult> { +pub(crate) fn metadata_varint_field( + metadata: &[u8], + field_number: u64, +) -> VortexResult> { let mut offset = 0; while offset < metadata.len() { let key = read_varint(metadata, &mut offset)?; @@ -667,7 +655,10 @@ fn metadata_varint_field(metadata: &[u8], field_number: u64) -> VortexResult VortexResult>> { +pub(crate) fn metadata_bytes_field( + metadata: &[u8], + field_number: u64, +) -> VortexResult>> { let mut offset = 0; while offset < metadata.len() { let key = read_varint(metadata, &mut offset)?; @@ -735,302 +726,10 @@ fn read_varint(metadata: &[u8], offset: &mut usize) -> VortexResult { vortex_bail!("protobuf varint exceeds 64 bits") } -/// V2 flat layout vtable. -#[derive(Clone, Debug)] -pub struct Flat; - -/// V2 flat layout data. -#[derive(Clone, Debug)] -pub struct FlatData { - pub(crate) segment_id: SegmentId, - pub(crate) array_ctx: ReadContext, - pub(crate) array_tree: Option, -} - -impl FlatData { - /// Returns the serialized array segment ID. - pub fn segment_id(&self) -> SegmentId { - self.segment_id - } - - /// Returns the array read context. - pub fn array_ctx(&self) -> &ReadContext { - &self.array_ctx - } - - /// Returns the optional inline array encoding tree. 
- pub fn array_tree(&self) -> Option<&ByteBuffer> { - self.array_tree.as_ref() - } -} - -impl VTable for Flat { - type LayoutData = FlatData; - - fn id(&self) -> LayoutId { - LayoutId::new("vortex.flat") - } - - fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { - vortex_ensure!( - args.segment_ids.len() == 1, - "Flat layout must have exactly one segment ID" - ); - Ok(FlatData { - segment_id: args.segment_ids[0], - array_ctx: args.array_ctx.clone(), - array_tree: metadata_bytes_field(args.metadata, 1)?.map(ByteBuffer::from), - }) - } - - fn child_dtype(_layout: Layout, idx: usize) -> VortexResult { - vortex_bail!("Flat layout has no child {idx}") - } - - fn child_type(_layout: Layout, idx: usize) -> VortexResult { - vortex_bail!("Flat layout has no child {idx}") - } - - fn new_scan_plan( - layout: Layout, - req: &mut ScanRequest, - session: &VortexSession, - ) -> VortexResult { - scan_flat::new_scan_plan(layout, req, session) - } -} - -/// V2 chunked layout vtable. -#[derive(Clone, Debug)] -pub struct Chunked; - -/// V2 chunked layout data. -#[derive(Clone, Debug)] -pub struct ChunkedData { - pub(crate) chunk_offsets: Vec, -} - -impl ChunkedData { - /// Returns the cumulative chunk offsets. - pub fn chunk_offsets(&self) -> &[u64] { - &self.chunk_offsets - } -} - -impl VTable for Chunked { - type LayoutData = ChunkedData; - - fn id(&self) -> LayoutId { - LayoutId::new("vortex.chunked") - } - - fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { - EmptyMetadata::deserialize(args.metadata)?; - let mut chunk_offsets: Vec = Vec::with_capacity(args.children.nchildren() + 1); - chunk_offsets.push(0); - for idx in 0..args.children.nchildren() { - let next = chunk_offsets[idx] - .checked_add(args.children.child_row_count(idx)?) 
- .ok_or_else(|| vortex_err!("Chunked child row counts overflow"))?; - chunk_offsets.push(next); - } - vortex_ensure!( - chunk_offsets.last().copied() == Some(args.row_count), - "Chunked child row counts do not add up to parent row count" - ); - Ok(ChunkedData { chunk_offsets }) - } - - fn child_dtype(layout: Layout, _idx: usize) -> VortexResult { - Ok(layout.dtype().clone()) - } - - fn child_type(layout: Layout, idx: usize) -> VortexResult { - if idx >= layout.nchildren() { - vortex_bail!("Chunked child index out of bounds: {idx}"); - } - let offset = *layout - .data() - .chunk_offsets - .get(idx) - .ok_or_else(|| vortex_err!("Chunked child index out of bounds: {idx}"))?; - Ok(LayoutChildType::Chunk((idx, offset))) - } - - fn new_scan_plan( - layout: Layout, - req: &mut ScanRequest, - session: &VortexSession, - ) -> VortexResult { - scan_chunked::new_scan_plan(layout, req, session) - } -} - -/// V2 struct layout vtable. -#[derive(Clone, Debug)] -pub struct Struct; - -impl VTable for Struct { - type LayoutData = (); - - fn id(&self) -> LayoutId { - LayoutId::new("vortex.struct") - } - - fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { - EmptyMetadata::deserialize(args.metadata)?; - Ok(()) - } - - fn child_dtype(layout: Layout, idx: usize) -> VortexResult { - let schema_index = if layout.dtype().is_nullable() { - idx.saturating_sub(1) - } else { - idx - }; - if idx == 0 && layout.dtype().is_nullable() { - Ok(DType::Bool(Nullability::NonNullable)) - } else { - layout - .dtype() - .as_struct_fields_opt() - .and_then(|fields| fields.field_by_index(schema_index)) - .ok_or_else(|| vortex_err!("Missing struct field {schema_index}")) - } - } - - fn child_type(layout: Layout, idx: usize) -> VortexResult { - let schema_index = if layout.dtype().is_nullable() { - idx.saturating_sub(1) - } else { - idx - }; - if idx == 0 && layout.dtype().is_nullable() { - Ok(LayoutChildType::Auxiliary("validity".into())) - } else { - let name = layout - .dtype() - 
.as_struct_fields_opt() - .and_then(|fields| fields.field_name(schema_index)) - .ok_or_else(|| vortex_err!("Missing struct field {schema_index}"))?; - Ok(LayoutChildType::Field(name.clone())) - } - } - - fn new_scan_plan( - layout: Layout, - req: &mut ScanRequest, - session: &VortexSession, - ) -> VortexResult { - scan_struct::new_scan_plan(layout, req, session) - } -} - -/// V2 dictionary layout vtable. -#[derive(Clone, Debug)] -pub struct Dict; - -/// V2 dictionary layout data. -#[derive(Clone, Debug)] -pub struct DictData { - pub(crate) codes_dtype: DType, - pub(crate) all_values_referenced: bool, -} - -impl DictData { - /// Returns whether all dictionary values are definitely referenced. - pub fn has_all_values_referenced(&self) -> bool { - self.all_values_referenced - } -} - -impl VTable for Dict { - type LayoutData = DictData; - - fn id(&self) -> LayoutId { - LayoutId::new("vortex.dict") - } - - fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { - let codes_ptype = metadata_varint_field(args.metadata, 1)? - .ok_or_else(|| vortex_err!("Dict metadata missing codes ptype"))?; - let codes_ptype = PType::try_from(i32::try_from(codes_ptype)?)?; - let codes_nullable = metadata_bool_field(args.metadata, 2)? 
- .map(Nullability::from) - .unwrap_or_else(|| args.dtype.nullability()); - Ok(DictData { - codes_dtype: DType::Primitive(codes_ptype, codes_nullable), - all_values_referenced: metadata_bool_field(args.metadata, 3)?.unwrap_or(false), - }) - } - - fn child_dtype(layout: Layout, idx: usize) -> VortexResult { - match idx { - 0 => Ok(layout.dtype().clone()), - 1 => Ok(layout.data().codes_dtype.clone()), - _ => vortex_bail!("Dict child index out of bounds: {idx}"), - } - } - - fn child_type(_layout: Layout, idx: usize) -> VortexResult { - match idx { - 0 => Ok(LayoutChildType::Auxiliary("values".into())), - 1 => Ok(LayoutChildType::Transparent("codes".into())), - _ => vortex_bail!("Dict child index out of bounds: {idx}"), - } - } - - fn new_scan_plan( - layout: Layout, - req: &mut ScanRequest, - session: &VortexSession, - ) -> VortexResult { - scan_dict::new_scan_plan(layout, req, session) - } -} - #[cfg(test)] mod tests { - use std::sync::Arc; - - use vortex_array::dtype::DType; - use vortex_array::dtype::Nullability; - use vortex_array::dtype::PType; - use vortex_session::VortexSession; - use vortex_session::registry::ReadContext; - use super::*; - #[derive(Debug)] - struct TestChildren { - row_counts: Vec, - } - - impl LayoutChildren for TestChildren { - fn child(&self, idx: usize, _dtype: &DType) -> VortexResult { - vortex_bail!("test child {idx} is not materialized") - } - - fn child_row_count(&self, idx: usize) -> VortexResult { - self.row_counts - .get(idx) - .copied() - .ok_or_else(|| vortex_err!("test child index out of bounds: {idx}")) - } - - fn nchildren(&self) -> usize { - self.row_counts.len() - } - } - - fn primitive_dtype() -> DType { - DType::Primitive(PType::I32, Nullability::NonNullable) - } - - fn read_context() -> ReadContext { - ReadContext::new([]) - } - #[test] fn metadata_bytes_field_rejects_length_overflow() { let mut metadata = vec![0x0a]; @@ -1051,177 +750,4 @@ mod tests { assert!(metadata_varint_field(&metadata, 1).is_err()); } - - #[test] - 
fn chunked_deserialize_rejects_row_count_overflow() { - let dtype = primitive_dtype(); - let read_context = read_context(); - let session = VortexSession::empty(); - let args = LayoutDeserializeArgs { - dtype: &dtype, - row_count: 0, - metadata: &[], - segment_ids: Vec::new(), - children: Arc::new(TestChildren { - row_counts: vec![u64::MAX, 1], - }), - array_ctx: &read_context, - session: &session, - }; - - assert!(VTable::deserialize(&Chunked, &args).is_err()); - } - - #[test] - fn chunked_child_type_rejects_terminal_offset_index() { - let dtype = primitive_dtype(); - let layout = LayoutParts::new( - Chunked, - dtype, - 1, - Vec::new(), - Arc::new(TestChildren { - row_counts: vec![1], - }), - ChunkedData { - chunk_offsets: vec![0, 1], - }, - ) - .into_typed(); - - assert!(layout.child_type(1).is_err()); - } -} - -/// V2 zoned layout vtable. -#[derive(Clone, Debug)] -pub struct Zoned; - -/// V2 legacy stats layout vtable. -#[derive(Clone, Debug)] -pub struct LegacyStats; - -/// V2 zoned layout data. -#[derive(Clone, Debug)] -pub struct ZonedData { - pub(crate) zone_len: usize, - pub(crate) zone_map_schema: ZoneMapSchema, - pub(crate) aggregate_fns: Arc<[AggregateFnRef]>, -} - -impl ZonedData { - /// Returns the configured zone length. - pub fn zone_len(&self) -> usize { - self.zone_len - } - - /// Returns the aggregate functions stored in the zone table. - pub fn aggregate_fns(&self) -> &Arc<[AggregateFnRef]> { - &self.aggregate_fns - } - - /// Returns the zone-map schema used by the zone table. 
- pub(crate) fn zone_map_schema(&self) -> &ZoneMapSchema { - &self.zone_map_schema - } - - fn stats_table_dtype(&self, dtype: &DType) -> DType { - match &self.zone_map_schema { - ZoneMapSchema::LegacyStats(stats) => legacy_stats_table_dtype(dtype, stats), - ZoneMapSchema::AggregateFns(aggregate_fns) => { - aggregate_stats_table_dtype(dtype, aggregate_fns) - } - } - } -} - -impl VTable for Zoned { - type LayoutData = ZonedData; - - fn id(&self) -> LayoutId { - LayoutId::new("vortex.zoned") - } - - fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { - let metadata = ZonedMetadata::deserialize(args.metadata)?; - let aggregate_fns = aggregate_fns_from_specs(&metadata.aggregate_specs, args.session)?; - Ok(ZonedData { - zone_len: metadata.zone_len as usize, - zone_map_schema: ZoneMapSchema::AggregateFns(Arc::clone(&aggregate_fns)), - aggregate_fns, - }) - } - - fn child_dtype(layout: Layout, idx: usize) -> VortexResult { - match idx { - 0 => Ok(layout.dtype().clone()), - 1 => Ok(layout.data().stats_table_dtype(layout.dtype())), - _ => vortex_bail!("Zoned child index out of bounds: {idx}"), - } - } - - fn child_type(_layout: Layout, idx: usize) -> VortexResult { - match idx { - 0 => Ok(LayoutChildType::Transparent("data".into())), - 1 => Ok(LayoutChildType::Auxiliary("zones".into())), - _ => vortex_bail!("Zoned child index out of bounds: {idx}"), - } - } - - fn new_scan_plan( - layout: Layout, - req: &mut ScanRequest, - session: &VortexSession, - ) -> VortexResult { - scan_zoned::new_scan_plan(layout, req, session) - } -} - -impl VTable for LegacyStats { - type LayoutData = ZonedData; - - fn id(&self) -> LayoutId { - LayoutId::new("vortex.stats") - } - - fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { - let metadata = LegacyStatsMetadata::deserialize(args.metadata)?; - let aggregate_fns = match &metadata.zone_map_schema { - ZoneMapSchema::LegacyStats(stats) => stats - .iter() - .filter_map(|stat| stat.aggregate_fn()) - 
.collect::>() - .into(), - ZoneMapSchema::AggregateFns(aggregate_fns) => Arc::clone(aggregate_fns), - }; - Ok(ZonedData { - zone_len: metadata.zone_len as usize, - zone_map_schema: metadata.zone_map_schema, - aggregate_fns, - }) - } - - fn child_dtype(layout: Layout, idx: usize) -> VortexResult { - match idx { - 0 => Ok(layout.dtype().clone()), - 1 => Ok(layout.data().stats_table_dtype(layout.dtype())), - _ => vortex_bail!("Legacy stats child index out of bounds: {idx}"), - } - } - - fn child_type(_layout: Layout, idx: usize) -> VortexResult { - match idx { - 0 => Ok(LayoutChildType::Transparent("data".into())), - 1 => Ok(LayoutChildType::Auxiliary("zones".into())), - _ => vortex_bail!("Legacy stats child index out of bounds: {idx}"), - } - } - - fn new_scan_plan( - layout: Layout, - req: &mut ScanRequest, - session: &VortexSession, - ) -> VortexResult { - scan_zoned::new_scan_plan(layout, req, session) - } } diff --git a/vortex-layout/src/layouts_v2/chunked.rs b/vortex-layout/src/layouts_v2/chunked.rs new file mode 100644 index 00000000000..3ed0e361e4c --- /dev/null +++ b/vortex-layout/src/layouts_v2/chunked.rs @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::DeserializeMetadata; +use vortex_array::EmptyMetadata; +use vortex_array::dtype::DType; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_session::VortexSession; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::VTable; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::request::ScanRequest; +use crate::scan::v2::layouts::chunked as scan_chunked; + +/// V2 chunked layout vtable. +#[derive(Clone, Debug)] +pub struct Chunked; + +/// V2 chunked layout data. 
+#[derive(Clone, Debug)] +pub struct ChunkedData { + pub(crate) chunk_offsets: Vec, +} + +impl ChunkedData { + /// Returns the cumulative chunk offsets. + pub fn chunk_offsets(&self) -> &[u64] { + &self.chunk_offsets + } +} + +impl VTable for Chunked { + type LayoutData = ChunkedData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.chunked") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + EmptyMetadata::deserialize(args.metadata)?; + let mut chunk_offsets: Vec = Vec::with_capacity(args.children.nchildren() + 1); + chunk_offsets.push(0); + for idx in 0..args.children.nchildren() { + let next = chunk_offsets[idx] + .checked_add(args.children.child_row_count(idx)?) + .ok_or_else(|| vortex_err!("Chunked child row counts overflow"))?; + chunk_offsets.push(next); + } + vortex_ensure!( + chunk_offsets.last().copied() == Some(args.row_count), + "Chunked child row counts do not add up to parent row count" + ); + Ok(ChunkedData { chunk_offsets }) + } + + fn child_dtype(layout: Layout, _idx: usize) -> VortexResult { + Ok(layout.dtype().clone()) + } + + fn child_type(layout: Layout, idx: usize) -> VortexResult { + if idx >= layout.nchildren() { + vortex_bail!("Chunked child index out of bounds: {idx}"); + } + let offset = *layout + .data() + .chunk_offsets + .get(idx) + .ok_or_else(|| vortex_err!("Chunked child index out of bounds: {idx}"))?; + Ok(LayoutChildType::Chunk((idx, offset))) + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_chunked::new_scan_plan(layout, req, session) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; + use vortex_error::VortexResult; + use vortex_error::vortex_bail; + use vortex_error::vortex_err; + use vortex_session::VortexSession; + use vortex_session::registry::ReadContext; + + use super::*; + use 
crate::layout_v2::LayoutChildren; + use crate::layout_v2::LayoutParts; + use crate::layout_v2::LayoutRef; + + #[derive(Debug)] + struct TestChildren { + row_counts: Vec, + } + + impl LayoutChildren for TestChildren { + fn child(&self, idx: usize, _dtype: &DType) -> VortexResult { + vortex_bail!("test child {idx} is not materialized") + } + + fn child_row_count(&self, idx: usize) -> VortexResult { + self.row_counts + .get(idx) + .copied() + .ok_or_else(|| vortex_err!("test child index out of bounds: {idx}")) + } + + fn nchildren(&self) -> usize { + self.row_counts.len() + } + } + + fn primitive_dtype() -> DType { + DType::Primitive(PType::I32, Nullability::NonNullable) + } + + fn read_context() -> ReadContext { + ReadContext::new([]) + } + + #[test] + fn chunked_deserialize_rejects_row_count_overflow() { + let dtype = primitive_dtype(); + let read_context = read_context(); + let session = VortexSession::empty(); + let args = LayoutDeserializeArgs { + dtype: &dtype, + row_count: 0, + metadata: &[], + segment_ids: Vec::new(), + children: Arc::new(TestChildren { + row_counts: vec![u64::MAX, 1], + }), + array_ctx: &read_context, + session: &session, + }; + + assert!(VTable::deserialize(&Chunked, &args).is_err()); + } + + #[test] + fn chunked_child_type_rejects_terminal_offset_index() { + let dtype = primitive_dtype(); + let layout = LayoutParts::new( + Chunked, + dtype, + 1, + Vec::new(), + Arc::new(TestChildren { + row_counts: vec![1], + }), + ChunkedData { + chunk_offsets: vec![0, 1], + }, + ) + .into_typed(); + + assert!(layout.child_type(1).is_err()); + } +} diff --git a/vortex-layout/src/layouts_v2/dict.rs b/vortex-layout/src/layouts_v2/dict.rs new file mode 100644 index 00000000000..8167ebf7573 --- /dev/null +++ b/vortex-layout/src/layouts_v2/dict.rs @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use 
vortex_array::dtype::PType; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_session::VortexSession; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::VTable; +use crate::layout_v2::metadata_bool_field; +use crate::layout_v2::metadata_varint_field; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::request::ScanRequest; +use crate::scan::v2::layouts::dict as scan_dict; + +/// V2 dictionary layout vtable. +#[derive(Clone, Debug)] +pub struct Dict; + +/// V2 dictionary layout data. +#[derive(Clone, Debug)] +pub struct DictData { + pub(crate) codes_dtype: DType, + pub(crate) all_values_referenced: bool, +} + +impl DictData { + /// Returns whether all dictionary values are definitely referenced. + pub fn has_all_values_referenced(&self) -> bool { + self.all_values_referenced + } +} + +impl VTable for Dict { + type LayoutData = DictData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.dict") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + let codes_ptype = metadata_varint_field(args.metadata, 1)? + .ok_or_else(|| vortex_err!("Dict metadata missing codes ptype"))?; + let codes_ptype = PType::try_from(i32::try_from(codes_ptype)?)?; + let codes_nullable = metadata_bool_field(args.metadata, 2)? 
+ .map(Nullability::from) + .unwrap_or_else(|| args.dtype.nullability()); + Ok(DictData { + codes_dtype: DType::Primitive(codes_ptype, codes_nullable), + all_values_referenced: metadata_bool_field(args.metadata, 3)?.unwrap_or(false), + }) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(layout.dtype().clone()), + 1 => Ok(layout.data().codes_dtype.clone()), + _ => vortex_bail!("Dict child index out of bounds: {idx}"), + } + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(LayoutChildType::Auxiliary("values".into())), + 1 => Ok(LayoutChildType::Transparent("codes".into())), + _ => vortex_bail!("Dict child index out of bounds: {idx}"), + } + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_dict::new_scan_plan(layout, req, session) + } +} diff --git a/vortex-layout/src/layouts_v2/flat.rs b/vortex-layout/src/layouts_v2/flat.rs new file mode 100644 index 00000000000..dd5d69a276a --- /dev/null +++ b/vortex-layout/src/layouts_v2/flat.rs @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::dtype::DType; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_session::VortexSession; +use vortex_session::registry::ReadContext; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::VTable; +use crate::layout_v2::metadata_bytes_field; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::request::ScanRequest; +use crate::scan::v2::layouts::flat as scan_flat; +use crate::segments::SegmentId; + +/// V2 flat layout vtable. +#[derive(Clone, Debug)] +pub struct Flat; + +/// V2 flat layout data. 
+#[derive(Clone, Debug)] +pub struct FlatData { + pub(crate) segment_id: SegmentId, + pub(crate) array_ctx: ReadContext, + pub(crate) array_tree: Option, +} + +impl FlatData { + /// Returns the serialized array segment ID. + pub fn segment_id(&self) -> SegmentId { + self.segment_id + } + + /// Returns the array read context. + pub fn array_ctx(&self) -> &ReadContext { + &self.array_ctx + } + + /// Returns the optional inline array encoding tree. + pub fn array_tree(&self) -> Option<&ByteBuffer> { + self.array_tree.as_ref() + } +} + +impl VTable for Flat { + type LayoutData = FlatData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.flat") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + vortex_ensure!( + args.segment_ids.len() == 1, + "Flat layout must have exactly one segment ID" + ); + Ok(FlatData { + segment_id: args.segment_ids[0], + array_ctx: args.array_ctx.clone(), + array_tree: metadata_bytes_field(args.metadata, 1)?.map(ByteBuffer::from), + }) + } + + fn child_dtype(_layout: Layout, idx: usize) -> VortexResult { + vortex_bail!("Flat layout has no child {idx}") + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + vortex_bail!("Flat layout has no child {idx}") + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_flat::new_scan_plan(layout, req, session) + } +} diff --git a/vortex-layout/src/layouts_v2/mod.rs b/vortex-layout/src/layouts_v2/mod.rs new file mode 100644 index 00000000000..22d2fd76fdb --- /dev/null +++ b/vortex-layout/src/layouts_v2/mod.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Built-in v2 layout vtables. 
+ +pub mod chunked; +pub mod dict; +pub mod flat; +pub mod struct_; +pub mod zoned; diff --git a/vortex-layout/src/layouts_v2/struct_.rs b/vortex-layout/src/layouts_v2/struct_.rs new file mode 100644 index 00000000000..1ab35db88bc --- /dev/null +++ b/vortex-layout/src/layouts_v2/struct_.rs @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::DeserializeMetadata; +use vortex_array::EmptyMetadata; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_session::VortexSession; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::VTable; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::request::ScanRequest; +use crate::scan::v2::layouts::struct_ as scan_struct; + +/// V2 struct layout vtable. +#[derive(Clone, Debug)] +pub struct Struct; + +impl VTable for Struct { + type LayoutData = (); + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.struct") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + EmptyMetadata::deserialize(args.metadata)?; + Ok(()) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + let schema_index = if layout.dtype().is_nullable() { + idx.saturating_sub(1) + } else { + idx + }; + if idx == 0 && layout.dtype().is_nullable() { + Ok(DType::Bool(Nullability::NonNullable)) + } else { + layout + .dtype() + .as_struct_fields_opt() + .and_then(|fields| fields.field_by_index(schema_index)) + .ok_or_else(|| vortex_err!("Missing struct field {schema_index}")) + } + } + + fn child_type(layout: Layout, idx: usize) -> VortexResult { + let schema_index = if layout.dtype().is_nullable() { + idx.saturating_sub(1) + } else { + idx + }; + if idx == 0 && layout.dtype().is_nullable() { + 
Ok(LayoutChildType::Auxiliary("validity".into())) + } else { + let name = layout + .dtype() + .as_struct_fields_opt() + .and_then(|fields| fields.field_name(schema_index)) + .ok_or_else(|| vortex_err!("Missing struct field {schema_index}"))?; + Ok(LayoutChildType::Field(name.clone())) + } + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_struct::new_scan_plan(layout, req, session) + } +} diff --git a/vortex-layout/src/layouts_v2/zoned.rs b/vortex-layout/src/layouts_v2/zoned.rs new file mode 100644 index 00000000000..d9aa4d21d3e --- /dev/null +++ b/vortex-layout/src/layouts_v2/zoned.rs @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::Arc; + +use vortex_array::DeserializeMetadata; +use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::dtype::DType; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_session::VortexSession; + +use crate::LayoutChildType; +use crate::LayoutId; +use crate::layout_v2::Layout; +use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::VTable; +use crate::layouts::zoned::LegacyStatsMetadata; +use crate::layouts::zoned::ZoneMapSchema; +use crate::layouts::zoned::ZonedMetadata; +use crate::layouts::zoned::aggregate_fns_from_specs; +use crate::layouts::zoned::aggregate_stats_table_dtype; +use crate::layouts::zoned::legacy_stats_table_dtype; +use crate::scan::plan::ScanPlanRef; +use crate::scan::plan::request::ScanRequest; +use crate::scan::v2::layouts::zoned as scan_zoned; + +/// V2 zoned layout vtable. +#[derive(Clone, Debug)] +pub struct Zoned; + +/// V2 legacy stats layout vtable. +#[derive(Clone, Debug)] +pub struct LegacyStats; + +/// V2 zoned layout data. 
+#[derive(Clone, Debug)] +pub struct ZonedData { + pub(crate) zone_len: usize, + pub(crate) zone_map_schema: ZoneMapSchema, + pub(crate) aggregate_fns: Arc<[AggregateFnRef]>, +} + +impl ZonedData { + /// Returns the configured zone length. + pub fn zone_len(&self) -> usize { + self.zone_len + } + + /// Returns the aggregate functions stored in the zone table. + pub fn aggregate_fns(&self) -> &Arc<[AggregateFnRef]> { + &self.aggregate_fns + } + + /// Returns the zone-map schema used by the zone table. + pub(crate) fn zone_map_schema(&self) -> &ZoneMapSchema { + &self.zone_map_schema + } + + fn stats_table_dtype(&self, dtype: &DType) -> DType { + match &self.zone_map_schema { + ZoneMapSchema::LegacyStats(stats) => legacy_stats_table_dtype(dtype, stats), + ZoneMapSchema::AggregateFns(aggregate_fns) => { + aggregate_stats_table_dtype(dtype, aggregate_fns) + } + } + } +} + +impl VTable for Zoned { + type LayoutData = ZonedData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.zoned") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + let metadata = ZonedMetadata::deserialize(args.metadata)?; + let aggregate_fns = aggregate_fns_from_specs(&metadata.aggregate_specs, args.session)?; + Ok(ZonedData { + zone_len: metadata.zone_len as usize, + zone_map_schema: ZoneMapSchema::AggregateFns(Arc::clone(&aggregate_fns)), + aggregate_fns, + }) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(layout.dtype().clone()), + 1 => Ok(layout.data().stats_table_dtype(layout.dtype())), + _ => vortex_bail!("Zoned child index out of bounds: {idx}"), + } + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(LayoutChildType::Transparent("data".into())), + 1 => Ok(LayoutChildType::Auxiliary("zones".into())), + _ => vortex_bail!("Zoned child index out of bounds: {idx}"), + } + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + session: &VortexSession, + ) -> 
VortexResult { + scan_zoned::new_scan_plan(layout, req, session) + } +} + +impl VTable for LegacyStats { + type LayoutData = ZonedData; + + fn id(&self) -> LayoutId { + LayoutId::new("vortex.stats") + } + + fn deserialize(&self, args: &LayoutDeserializeArgs<'_>) -> VortexResult { + let metadata = LegacyStatsMetadata::deserialize(args.metadata)?; + let aggregate_fns = match &metadata.zone_map_schema { + ZoneMapSchema::LegacyStats(stats) => stats + .iter() + .filter_map(|stat| stat.aggregate_fn()) + .collect::>() + .into(), + ZoneMapSchema::AggregateFns(aggregate_fns) => Arc::clone(aggregate_fns), + }; + Ok(ZonedData { + zone_len: metadata.zone_len as usize, + zone_map_schema: metadata.zone_map_schema, + aggregate_fns, + }) + } + + fn child_dtype(layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(layout.dtype().clone()), + 1 => Ok(layout.data().stats_table_dtype(layout.dtype())), + _ => vortex_bail!("Legacy stats child index out of bounds: {idx}"), + } + } + + fn child_type(_layout: Layout, idx: usize) -> VortexResult { + match idx { + 0 => Ok(LayoutChildType::Transparent("data".into())), + 1 => Ok(LayoutChildType::Auxiliary("zones".into())), + _ => vortex_bail!("Legacy stats child index out of bounds: {idx}"), + } + } + + fn new_scan_plan( + layout: Layout, + req: &mut ScanRequest, + session: &VortexSession, + ) -> VortexResult { + scan_zoned::new_scan_plan(layout, req, session) + } +} diff --git a/vortex-layout/src/lib.rs b/vortex-layout/src/lib.rs index 70a7a4f7f97..e5e9eaae8b2 100644 --- a/vortex-layout/src/lib.rs +++ b/vortex-layout/src/lib.rs @@ -17,6 +17,7 @@ pub mod layout_v2; pub mod layouts; +pub mod layouts_v2; pub use children::*; pub use encoding::*; diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index c70dd385933..ab96af436d1 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -42,9 +42,9 @@ use 
vortex_error::vortex_err; use vortex_scan::read::ScanRead; use vortex_session::VortexSession; -use crate::layout_v2::Chunked; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; +use crate::layouts_v2::chunked::Chunked; use crate::scan::plan::AggregateAnswer; use crate::scan::plan::DeferredReadTask; use crate::scan::plan::OwnedRowScope; diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 3c90a40a53e..70b1bfa7937 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -41,8 +41,8 @@ use vortex_mask::Mask; use vortex_scan::read::ScanRead; use vortex_session::VortexSession; -use crate::layout_v2::Dict; use crate::layout_v2::Layout; +use crate::layouts_v2::dict::Dict; use crate::scan::plan::DeferredReadTask; use crate::scan::plan::OwnedRowScope; use crate::scan::plan::PrepareCtx; diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 3199d2a5127..659270989ed 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -26,9 +26,9 @@ use vortex_error::vortex_err; use vortex_scan::read::ScanRead; use vortex_session::VortexSession; -use crate::layout_v2::Flat; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; +use crate::layouts_v2::flat::Flat; use crate::scan::plan::OwnedRowScope; use crate::scan::plan::PrepareCtx; use crate::scan::plan::PreparedRead; diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index 2f1dd3e5773..b785bcdda65 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -29,7 +29,7 @@ use vortex_session::VortexSession; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; -use crate::layout_v2::Struct; +use crate::layouts_v2::struct_::Struct; use crate::scan::plan::ApplyScanPlan; use crate::scan::plan::MaskScanPlan; use 
crate::scan::plan::PrepareCtx; diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 3da60ca85e6..ecd3cbd1f26 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -50,11 +50,11 @@ use vortex_session::VortexSession; use crate::layout_v2::Layout; use crate::layout_v2::VTable; -use crate::layout_v2::ZonedData; use crate::layouts::zoned::MAX_IS_TRUNCATED; use crate::layouts::zoned::MIN_IS_TRUNCATED; use crate::layouts::zoned::ZoneMapSchema; use crate::layouts::zoned::zone_map::ZoneMap; +use crate::layouts_v2::zoned::ZonedData; use crate::scan::plan::AggregateAnswer; use crate::scan::plan::EvidenceCost; use crate::scan::plan::EvidenceScope; diff --git a/vortex-layout/src/session.rs b/vortex-layout/src/session.rs index ca2bbd4d625..4a8549983ed 100644 --- a/vortex-layout/src/session.rs +++ b/vortex-layout/src/session.rs @@ -17,6 +17,12 @@ use crate::layouts::flat::FlatLayoutEncoding; use crate::layouts::struct_::StructLayoutEncoding; use crate::layouts::zoned::LegacyStatsLayoutEncoding; use crate::layouts::zoned::ZonedLayoutEncoding; +use crate::layouts_v2::chunked::Chunked as ChunkedV2; +use crate::layouts_v2::dict::Dict as DictV2; +use crate::layouts_v2::flat::Flat as FlatV2; +use crate::layouts_v2::struct_::Struct as StructV2; +use crate::layouts_v2::zoned::LegacyStats as LegacyStatsV2; +use crate::layouts_v2::zoned::Zoned as ZonedV2; pub type LayoutRegistry = Registry; @@ -75,29 +81,23 @@ impl Default for LayoutSession { // Register the built-in v2 layout vtables. 
v2_layouts.register( - layout_v2::Chunked.id(), - Arc::new(layout_v2::Chunked) as layout_v2::LayoutVTableRef, + ChunkedV2.id(), + Arc::new(ChunkedV2) as layout_v2::LayoutVTableRef, ); + v2_layouts.register(FlatV2.id(), Arc::new(FlatV2) as layout_v2::LayoutVTableRef); v2_layouts.register( - layout_v2::Flat.id(), - Arc::new(layout_v2::Flat) as layout_v2::LayoutVTableRef, + StructV2.id(), + Arc::new(StructV2) as layout_v2::LayoutVTableRef, ); v2_layouts.register( - layout_v2::Struct.id(), - Arc::new(layout_v2::Struct) as layout_v2::LayoutVTableRef, + ZonedV2.id(), + Arc::new(ZonedV2) as layout_v2::LayoutVTableRef, ); v2_layouts.register( - layout_v2::Zoned.id(), - Arc::new(layout_v2::Zoned) as layout_v2::LayoutVTableRef, - ); - v2_layouts.register( - layout_v2::LegacyStats.id(), - Arc::new(layout_v2::LegacyStats) as layout_v2::LayoutVTableRef, - ); - v2_layouts.register( - layout_v2::Dict.id(), - Arc::new(layout_v2::Dict) as layout_v2::LayoutVTableRef, + LegacyStatsV2.id(), + Arc::new(LegacyStatsV2) as layout_v2::LayoutVTableRef, ); + v2_layouts.register(DictV2.id(), Arc::new(DictV2) as layout_v2::LayoutVTableRef); Self { registry: layouts, From e96e21d69ef230eed7d993c46bdbba8b5bfe0c60 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 10:35:05 -0400 Subject: [PATCH 38/48] Refactor ScanPlan read scheduling Signed-off-by: "Nicholas Gates" --- .../internals/scan-scheduler.md | 41 +- vortex-file/src/multi/scan_v2.rs | 291 ++++----- vortex-layout/src/layout_v2.rs | 55 +- vortex-layout/src/layouts_v2/chunked.rs | 6 +- vortex-layout/src/layouts_v2/dict.rs | 6 +- vortex-layout/src/layouts_v2/flat.rs | 6 +- vortex-layout/src/layouts_v2/struct_.rs | 6 +- vortex-layout/src/layouts_v2/zoned.rs | 10 +- vortex-layout/src/scan/plan/mod.rs | 382 +++--------- vortex-layout/src/scan/v2/layouts/chunked.rs | 553 ++++-------------- vortex-layout/src/scan/v2/layouts/dict.rs | 217 +++---- vortex-layout/src/scan/v2/layouts/flat.rs | 147 +++-- 
vortex-layout/src/scan/v2/layouts/struct_.rs | 14 +- vortex-layout/src/scan/v2/layouts/zoned.rs | 209 +++---- vortex-layout/src/scan/v2/row_idx.rs | 17 +- vortex-layout/src/segments/scheduled.rs | 266 +-------- vortex-scan/src/read.rs | 5 + 17 files changed, 665 insertions(+), 1566 deletions(-) diff --git a/docs/developer-guide/internals/scan-scheduler.md b/docs/developer-guide/internals/scan-scheduler.md index 00f7a35906a..9b8ce751ce8 100644 --- a/docs/developer-guide/internals/scan-scheduler.md +++ b/docs/developer-guide/internals/scan-scheduler.md @@ -1,15 +1,18 @@ # Scan Scheduler -This document describes the current ScanPlan V2 scheduler and I/O pipeline. It is -an implementation guide, not a design sketch. +This document describes the current ScanPlan-backed scheduler and I/O pipeline. +It is an implementation guide, not a design sketch. The scheduler is split across three layers: - `vortex-scan::scheduler` owns the process/query-level scheduler object, scheduler provider, and read-byte budget configuration. -- `vortex-file::multi::scan_v2` owns the per-partition ScanPlan runtime. It - plans morsels, queues evidence/predicate/projection work, and decides which - queued task is useful next. +- `vortex-layout::scan::plan` owns the ScanPlan runtime interfaces and + layout-backed implementations. Deserialized layouts construct concrete plans + with the file-provided segment source. +- `vortex-file::multi::scan_v2` wires files into that runtime. It builds the + root plan for a file, plans morsels, queues evidence/predicate/projection + work, and decides which queued task is useful next. - `vortex-file::segments` and `vortex-file::read` own segment future registration, logical read deduplication, physical range coalescing, and backend request concurrency. @@ -21,7 +24,7 @@ control how much work is launched. 
## Execution Shape -The normal DataFusion V2 path is: +The normal DataFusion ScanPlan path is: ```text DataFusion DataSource::open(partition) @@ -39,7 +42,7 @@ ScanSchedulerProvider::scheduler_for_scan partition_work_stream | +-- plan morsels into task queues - +-- register segment futures synchronously + +-- create task steps and register segment reads synchronously +-- admit tasks by lane/frontier/read bytes +-- poll task futures on the Vortex runtime +-- emit arrays in ordered or unordered mode @@ -75,7 +78,7 @@ The default `VortexSession` provider is `Unbounded`. DuckDB installs a shared default scheduler in the extension session. The DataFusion benchmark only installs a scheduler when `VORTEX_SCAN_SCHEDULER` is set. -There is no scheduler permit API in the V2 runtime. Task launch is admitted by +There is no scheduler permit API in the ScanPlan runtime. Task launch is admitted by the per-partition `ScanTaskQueue` using active logical read bytes. Limited scans still plan one active morsel at a time internally because limit accounting must not consume rows far ahead of the output frontier, but that is not a public @@ -174,23 +177,25 @@ DEFAULT_READ_BYTE_BUDGET = 256 MiB `ScanSchedulerConfig::unbounded()` leaves this unset, which becomes `u64::MAX` inside `partition_work_stream`. -## Segment Requests +## Segment Reads -Prepared reads and evidence providers expose segment requests before task launch. -The runtime turns those requests into `ScanRead` values with: +Prepared reads and evidence providers create tasks. When a task is converted +into a scheduler-visible step, the concrete `ScanPlan` implementation turns any +needed layout segments into `ScanRead` values through its scan-local +`SegmentFutureCache`: ```rust -register_segment_reads_cached(cache, source, requests) +cache.register(source, requests) ``` This call is synchronous. 
For cache misses, it calls the underlying `SegmentSource::request(segment)` immediately and stores a shared future in the -scan-local `SegmentFutureCache`. That means simply planning work registers the -logical reads with the file segment source before the task future is polled. +scan-local cache. That means creating a read step registers the logical reads +with the file segment source before the task continuation is run. -The cache key is currently the logical `SegmentId`. That is sufficient inside one -`ScanExecution` because each execution has one bound file segment source. It is -not a cross-file or cross-scan cache key. +The cache key is currently the logical `SegmentId`. That is sufficient inside a +file-bound `SegmentFutureCache` because the concrete plans using that cache are +bound to the same file segment source. It is not a cross-file cache key. `SegmentInfo` contains only logical payload `bytes`, which the task scheduler uses for read-budget admission. Segment-cache policy is owned by the @@ -266,7 +271,7 @@ VORTEX_SCAN_MAX_READ_BYTES=... Useful S3 sweeps should compare: ```text -# Current compatibility behavior. +# Default unbounded behavior. VORTEX_SCAN_SCHEDULER=unbounded # Bounded read pressure, one scheduler per query. 
diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 181d031c307..c351b129ae4 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -48,6 +48,7 @@ use vortex_io::filesystem::FileListing; use vortex_io::filesystem::FileSystemRef; use vortex_io::runtime::Handle; use vortex_io::session::RuntimeSessionExt; +use vortex_layout::layout_v2::LayoutScanPlanCtx; use vortex_layout::scan::plan::EvidenceScope; use vortex_layout::scan::plan::OwnedRowScope; use vortex_layout::scan::plan::PrepareCtx; @@ -58,6 +59,7 @@ use vortex_layout::scan::plan::PreparedStats; use vortex_layout::scan::plan::PreparedStatsRef; use vortex_layout::scan::plan::PushCtx; use vortex_layout::scan::plan::ReadContext; +use vortex_layout::scan::plan::ReadStep; use vortex_layout::scan::plan::ReadTask; use vortex_layout::scan::plan::ReadTaskOutput; use vortex_layout::scan::plan::ScanPlan; @@ -76,14 +78,8 @@ use vortex_layout::scan::plan::request::OwnedEvidenceRequest; use vortex_layout::scan::plan::request::ScanRequest; use vortex_layout::scan::v2::validate_temporal_comparisons; use vortex_layout::scan::v2::with_row_idx; -use vortex_layout::segments::ReadResultsSegmentSource; use vortex_layout::segments::ScanIoPhase; use vortex_layout::segments::ScanRead; -use vortex_layout::segments::SegmentFutureCache; -use vortex_layout::segments::SegmentPlanCtx; -use vortex_layout::segments::SegmentRequests; -use vortex_layout::segments::SegmentSource; -use vortex_layout::segments::register_segment_reads_cached; use vortex_mask::Mask; use vortex_metrics::MetricsRegistry; use vortex_scan::DataSource; @@ -125,7 +121,7 @@ const IDEAL_SPLIT_SIZE: u64 = 100_000; const MAX_SELECTION_RANGE_SIZE: u64 = IDEAL_SPLIT_SIZE / 25; const MIN_SELECTION_GAP_BETWEEN_RANGES: u64 = IDEAL_SPLIT_SIZE / 2; /// Below this demanded-row density, evaluate a residual predicate over only the demanded rows -/// (filter-first) rather than the whole morsel. 
Mirrors the V1 flat-reader threshold. +/// (filter-first) rather than the whole morsel. const EXPR_EVAL_THRESHOLD: f64 = 0.2; const INLINE_ZERO_READ_EVIDENCE_MAX_PRIORITY: u64 = 100_150; const SCAN_SCOPE_MIN_PREDICATE_COST: u64 = 100; @@ -959,7 +955,7 @@ pub(crate) async fn scan_plan_file_statistics_many( ) -> VortexResult>>> { let session = file.session().clone(); let root = file.scan_plan_root()?; - let reader = ReadContext::new(file.segment_source(), session); + let reader = ReadContext::new(session); let mut result = Vec::with_capacity(exprs.len()); for expr in exprs { let plan = if let Some(field_path) = root_field_path(expr) { @@ -1004,7 +1000,7 @@ pub(crate) async fn scan_plan_file_plan_splits( let Some(plan) = pushed.prepare_splits(&mut PrepareCtx::new(session.clone()))? else { return Ok(std::iter::once(0..file.row_count()).collect()); }; - let reader = ReadContext::new(file.segment_source(), session.clone()); + let reader = ReadContext::new(session.clone()); let state = plan.init_state(&session)?; plan.splits(0..file.row_count(), &reader, state.as_ref()) .await @@ -1025,7 +1021,12 @@ pub(crate) fn build_file_scan_plan_root(file: &VortexFile) -> VortexResult FileStatsScanPlan::try_new( @@ -1196,10 +1197,8 @@ impl ScanTask for ScanEvidenceWaitTask { } } -struct PredicateReadWorkTask { +struct PredicateReadWorkState { execution: Arc, - task: Box, - reads: Vec, morsel_id: usize, predicate_idx: usize, version: PredicateVersion, @@ -1211,9 +1210,23 @@ struct PredicateReadWorkTask { lane: ScanTaskLane, } +struct PredicateReadWorkTask { + state: PredicateReadWorkState, + step: ReadStep, + reads: Vec, +} + +impl PredicateReadWorkTask { + fn try_new(state: PredicateReadWorkState, task: Box) -> VortexResult { + let step = task.into_step()?; + let reads = ScanTaskRead::from_scan_reads(&step.required_reads); + Ok(Self { state, step, reads }) + } +} + impl ScanTask for PredicateReadWorkTask { fn morsel_id(&self) -> usize { - self.morsel_id + self.state.morsel_id } fn 
phase(&self) -> ScanIoPhase { @@ -1221,7 +1234,7 @@ impl ScanTask for PredicateReadWorkTask { } fn lane(&self) -> ScanTaskLane { - self.lane + self.state.lane } fn reads(&self) -> &[ScanTaskRead] { @@ -1229,16 +1242,17 @@ impl ScanTask for PredicateReadWorkTask { } fn priority(&self) -> u64 { - self.priority + self.state.priority } fn into_step(self: Box) -> VortexResult> { let task = *self; - let read_step = task.task.into_step()?; - let morsel_id = task.morsel_id; - let lane = task.lane; + let state = task.state; + let morsel_id = state.morsel_id; + let lane = state.lane; let reads = task.reads.clone(); - let priority = task.priority; + let priority = state.priority; + let read_step = task.step; Ok(ScanStep::new( morsel_id, ScanIoPhase::PredicateRead, @@ -1247,62 +1261,51 @@ impl ScanTask for PredicateReadWorkTask { read_step.required_reads, read_step.prefetch_reads, move |results| { - let reader = task.execution.resolved_reader(results.clone()); - let mut ctx = task.execution.session.create_execution_ctx(); + let reader = state.execution.read_context(); + let mut ctx = state.execution.session.create_execution_ctx(); let array = match read_step.continuation.run(&reader, &mut ctx, results)? 
{ ReadTaskOutput::Ready(array) => array, ReadTaskOutput::Continue(read_task) => { - return Ok(ScanStepResult::Continue(Box::new(PredicateReadWorkTask { - execution: task.execution, - task: read_task, - reads: task.reads, - morsel_id: task.morsel_id, - predicate_idx: task.predicate_idx, - version: task.version, - range: task.range, - need: task.need, - compact: task.compact, - len: task.len, - priority: task.priority, - lane: task.lane, - }))); + return Ok(ScanStepResult::Continue(Box::new( + PredicateReadWorkTask::try_new(state, read_task)?, + ))); } }; - let result = if task.compact { + let result = if state.compact { let compact = array.null_as_false().execute(&mut ctx)?; - if compact.len() != task.need.true_count() { + if compact.len() != state.need.true_count() { vortex_bail!( "compacted residual result length {} does not match demanded row count {}", compact.len(), - task.need.true_count() + state.need.true_count() ); } - task.need.intersect_by_rank(&compact) + state.need.intersect_by_rank(&compact) } else { array.null_as_false().execute(&mut ctx)? 
}; - if result.len() != task.len { + if result.len() != state.len { vortex_bail!( "residual result length {} does not match morsel length {}", result.len(), - task.len + state.len ); } - let pass = &result & &task.need; - let input_rows = task.need.true_count(); + let pass = &result & &state.need; + let input_rows = state.need.true_count(); let pass_rows = pass.true_count(); - let exact = !&task.need | &pass; + let exact = !&state.need | &pass; Ok(ScanStepResult::Ready(WorkOutput::Evidence( EvidenceWorkOutput { - morsel_id: task.morsel_id, - predicate_idx: task.predicate_idx, - version: task.version, + morsel_id: state.morsel_id, + predicate_idx: state.predicate_idx, + version: state.version, source: EvidenceWorkSource::Predicate { input_rows, pass_rows, }, fragments: vec![EvidenceFragment::new( - task.range.clone(), + state.range.clone(), PredicateEvidenceKind::ExactMask(exact), )], }, @@ -1315,11 +1318,28 @@ impl ScanTask for PredicateReadWorkTask { struct ProjectionReadWorkTask { execution: Arc, - task: Box, + step: ReadStep, reads: Vec, morsel_id: usize, } +impl ProjectionReadWorkTask { + fn try_new( + execution: Arc, + task: Box, + morsel_id: usize, + ) -> VortexResult { + let step = task.into_step()?; + let reads = ScanTaskRead::from_scan_reads(&step.required_reads); + Ok(Self { + execution, + step, + reads, + morsel_id, + }) + } +} + impl ScanTask for ProjectionReadWorkTask { fn morsel_id(&self) -> usize { self.morsel_id @@ -1343,8 +1363,8 @@ impl ScanTask for ProjectionReadWorkTask { fn into_step(self: Box) -> VortexResult> { let task = *self; - let read_step = task.task.into_step()?; let reads = task.reads.clone(); + let read_step = task.step; Ok(ScanStep::new( task.morsel_id, ScanIoPhase::ProjectionRead, @@ -1353,7 +1373,7 @@ impl ScanTask for ProjectionReadWorkTask { read_step.required_reads, read_step.prefetch_reads, move |results| { - let reader = task.execution.resolved_reader(results.clone()); + let reader = task.execution.read_context(); let mut ctx 
= task.execution.session.create_execution_ctx(); match read_step.continuation.run(&reader, &mut ctx, results)? { ReadTaskOutput::Ready(array) => Ok(ScanStepResult::Ready( @@ -1362,21 +1382,16 @@ impl ScanTask for ProjectionReadWorkTask { array, }), )), - ReadTaskOutput::Continue(read_task) => { - Ok(ScanStepResult::Continue(Box::new(ProjectionReadWorkTask { - execution: task.execution, - task: read_task, - reads: task.reads, - morsel_id: task.morsel_id, - }))) - } + ReadTaskOutput::Continue(read_task) => Ok(ScanStepResult::Continue(Box::new( + ProjectionReadWorkTask::try_new(task.execution, read_task, task.morsel_id)?, + ))), } }, )) } } -async fn resolve_scan_reads(read_store: ReadStoreRef, reads: Vec) -> VortexResult<()> { +async fn resolve_step_reads(read_store: ReadStoreRef, reads: Vec) -> VortexResult<()> { let mut pending_reads = FuturesUnordered::new(); for read in reads { let key = read.request.key; @@ -1391,13 +1406,13 @@ async fn resolve_scan_reads(read_store: ReadStoreRef, reads: Vec) -> V Ok(()) } -fn prefetch_scan_reads(handle: &Handle, read_store: ReadStoreRef, reads: Vec) { +fn prefetch_step_reads(handle: &Handle, read_store: ReadStoreRef, reads: Vec) { if reads.is_empty() { return; } handle .spawn(async move { - if let Err(error) = resolve_scan_reads(read_store, reads).await { + if let Err(error) = resolve_step_reads(read_store, reads).await { tracing::debug!( target: "vortex_file::scan_v2", ?error, @@ -1415,8 +1430,8 @@ async fn run_scan_task_step( ) -> VortexResult { let mut step = work.into_step()?; let (required_reads, prefetch_reads) = step.take_reads(); - prefetch_scan_reads(&handle, Arc::clone(&read_store), prefetch_reads); - resolve_scan_reads(Arc::clone(&read_store), required_reads).await?; + prefetch_step_reads(&handle, Arc::clone(&read_store), prefetch_reads); + resolve_step_reads(Arc::clone(&read_store), required_reads).await?; match step.continue_with(ReadResults::new(Arc::clone(&read_store)))? 
{ ScanStepResult::Ready(output) => Ok(WorkPoll::Ready(output)), ScanStepResult::Continue(work) => Ok(WorkPoll::Pending(work)), @@ -2417,8 +2432,6 @@ struct ScanExecution { session: VortexSession, plan: Arc, limit_remaining: Option>, - segment_source: Arc, - segment_future_cache: Arc, projection: PreparedReadRef, predicates: Vec, predicate_stats: Mutex>, @@ -2554,8 +2567,6 @@ impl ScanExecution { limit_remaining: Option>, ) -> VortexResult { let session = file.session().clone(); - let segment_source = file.segment_source(); - let segment_future_cache = file.scan_plan_segment_future_cache(); let mut prepare_ctx = PrepareCtx::with_state_cache(session.clone(), file.scan_plan_state_cache()); let projection = Arc::clone(plan.projection()) @@ -2603,8 +2614,6 @@ impl ScanExecution { session, plan, limit_remaining, - segment_source, - segment_future_cache, projection, predicates, predicate_stats: Mutex::new(predicate_stats), @@ -2612,41 +2621,8 @@ impl ScanExecution { }) } - fn segment_plan_ctx(&self, phase: ScanIoPhase) -> SegmentPlanCtx { - SegmentPlanCtx::new(Arc::clone(&self.segment_source), self.session.clone()) - .with_phase(phase) - } - - fn register_segment_reads(&self, requests: SegmentRequests) -> VortexResult> { - if requests.is_unknown() { - vortex_bail!("scan2 task produced unknown segment requests") - } - Ok(register_segment_reads_cached( - self.segment_future_cache.as_ref(), - self.segment_source.as_ref(), - requests, - )) - } - - fn register_prefetch_segment_reads(&self, requests: SegmentRequests) -> Vec { - if requests.is_unknown() { - return Vec::new(); - } - register_segment_reads_cached( - self.segment_future_cache.as_ref(), - self.segment_source.as_ref(), - requests, - ) - } - - fn resolved_reader(&self, results: ReadResults) -> ReadContext { - ReadContext::new( - Arc::new(ReadResultsSegmentSource::new( - Arc::clone(&self.segment_source), - results, - )), - self.session.clone(), - ) + fn read_context(&self) -> ReadContext { + 
ReadContext::new(self.session.clone()) } fn predicate_priority(&self, predicate_idx: usize, demand_rows: usize) -> u64 { @@ -2921,16 +2897,11 @@ impl ScanExecution { range: 0..self.plan.row_count, mode, }; - let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); let result = (|| { - let requests = - plan.segment_requests(&req.as_request(), &mut segment_ctx)?; - let reads = self.register_segment_reads(requests)?; - let prefetch_requests = - plan.prefetch_segment_requests(&req.as_request(), &mut segment_ctx)?; - let prefetch_reads = - self.register_prefetch_segment_reads(prefetch_requests); - let work_reads = ScanTaskRead::from_scan_reads(&reads); + let task = Arc::clone(plan) + .create_task(req.clone(), ScanIoPhase::EvidenceProbe)?; + let step = task.into_step()?; + let work_reads = ScanTaskRead::from_scan_reads(&step.required_reads); let priority = plan .cost(&req.as_request()) .priority( @@ -2938,7 +2909,6 @@ impl ScanExecution { mode == EvidenceMode::RecheckBeforeProjection, ) .saturating_add(predicate.static_cost); - let task = Arc::clone(plan).create_task(req, Vec::new())?; let execution = Arc::clone(self); Ok(ScanStep::new( morsel_id, @@ -2948,11 +2918,11 @@ impl ScanExecution { evidence_idx: evidence_idx_u32, }, work_reads, - reads, - prefetch_reads, + step.required_reads, + step.prefetch_reads, move |results| { - let reader = execution.resolved_reader(results); - let fragments = task.evidence(&reader)?; + let reader = execution.read_context(); + let fragments = step.continuation.run(&reader, results)?; Ok(ScanStepResult::Ready(WorkOutput::ScanEvidence( ScanEvidenceWorkOutput { execution, @@ -3011,13 +2981,9 @@ impl ScanExecution { } let evidence_idx_u32 = u32::try_from(evidence_idx).map_err(|_| vortex_err!("too many evidence plans"))?; - let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::EvidenceProbe); - let requests = plan.segment_requests(&req.as_request(), &mut segment_ctx)?; - let reads = self.register_segment_reads(requests)?; - 
let prefetch_requests = - plan.prefetch_segment_requests(&req.as_request(), &mut segment_ctx)?; - let prefetch_reads = self.register_prefetch_segment_reads(prefetch_requests); - let work_reads = ScanTaskRead::from_scan_reads(&reads); + let task = Arc::clone(plan).create_task(req.clone(), ScanIoPhase::EvidenceProbe)?; + let step = task.into_step()?; + let work_reads = ScanTaskRead::from_scan_reads(&step.required_reads); let priority = plan .cost(&req.as_request()) .priority( @@ -3025,7 +2991,6 @@ impl ScanExecution { mode == EvidenceMode::RecheckBeforeProjection, ) .saturating_add(predicate.static_cost); - let task = Arc::clone(plan).create_task(req.clone(), Vec::new())?; let execution = Arc::clone(self); work.push( ScanStep::new( @@ -3036,11 +3001,11 @@ impl ScanExecution { evidence_idx: evidence_idx_u32, }, work_reads, - reads, - prefetch_reads, + step.required_reads, + step.prefetch_reads, move |results| { - let reader = execution.resolved_reader(results); - let fragments = task.evidence(&reader)?; + let reader = execution.read_context(); + let fragments = step.continuation.run(&reader, results)?; Ok(ScanStepResult::Ready(WorkOutput::Evidence( EvidenceWorkOutput { morsel_id, @@ -3076,34 +3041,13 @@ impl ScanExecution { } else { OwnedRowScope::try_new(Mask::new_true(len), need.clone())? 
}; - let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::PredicateRead); - let requests = - predicate - .read - .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; - let reads = self.register_segment_reads(requests)?; - let prefetch_requests = predicate.read.prefetch_segment_requests( - range.clone(), - rows.as_scope(), - &mut segment_ctx, - )?; - let prefetch_reads = self.register_prefetch_segment_reads(prefetch_requests); - let work_reads = ScanTaskRead::from_scan_reads(&reads); - let task = Arc::clone(&predicate.read).create_task( - range.clone(), - rows, - reads, - prefetch_reads, - &mut segment_ctx, - )?; + let phase = ScanIoPhase::PredicateRead; + let task = Arc::clone(&predicate.read).create_task(range.clone(), rows, phase)?; - let execution = Arc::clone(self); let predicate_idx_u32 = u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; - Ok(Box::new(PredicateReadWorkTask { - execution, - task, - reads: work_reads, + let state = PredicateReadWorkState { + execution: Arc::clone(self), morsel_id, predicate_idx, version, @@ -3115,7 +3059,8 @@ impl ScanExecution { lane: ScanTaskLane::Predicate { predicate_idx: predicate_idx_u32, }, - })) + }; + Ok(Box::new(PredicateReadWorkTask::try_new(state, task)?)) } fn plan_projection_work( @@ -3144,33 +3089,13 @@ impl ScanExecution { } let rows = OwnedRowScope::selected(selected); - let mut segment_ctx = self.segment_plan_ctx(ScanIoPhase::ProjectionRead); - let requests = - self.projection - .segment_requests(range.clone(), rows.as_scope(), &mut segment_ctx)?; - let reads = self.register_segment_reads(requests)?; - let prefetch_requests = self.projection.prefetch_segment_requests( - range.clone(), - rows.as_scope(), - &mut segment_ctx, - )?; - let prefetch_reads = self.register_prefetch_segment_reads(prefetch_requests); - let work_reads = ScanTaskRead::from_scan_reads(&reads); - let task = Arc::clone(&self.projection).create_task( - range, - rows, - reads, - prefetch_reads, 
- &mut segment_ctx, - )?; + let phase = ScanIoPhase::ProjectionRead; + let task = Arc::clone(&self.projection).create_task(range, rows, phase)?; let execution = Arc::clone(self); - Ok(Some(Box::new(ProjectionReadWorkTask { - execution, - task, - reads: work_reads, - morsel_id, - }))) + Ok(Some(Box::new(ProjectionReadWorkTask::try_new( + execution, task, morsel_id, + )?))) } fn splits(&self, row_range: &Range) -> VortexResult>> { diff --git a/vortex-layout/src/layout_v2.rs b/vortex-layout/src/layout_v2.rs index 680f1b9c8a9..006f5563d78 100644 --- a/vortex-layout/src/layout_v2.rs +++ b/vortex-layout/src/layout_v2.rs @@ -27,7 +27,9 @@ use crate::LayoutChildType; use crate::LayoutId; use crate::scan::plan::ScanPlanRef; use crate::scan::plan::request::ScanRequest; +use crate::segments::SegmentFutureCache; use crate::segments::SegmentId; +use crate::segments::SegmentSource; /// A reference-counted, type-erased v2 layout. #[derive(Clone)] @@ -85,10 +87,51 @@ pub trait VTable: 'static + Clone + Send + Sync + Debug { fn new_scan_plan( layout: Layout, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult; } +/// Context captured while expanding a serialized layout into a physical scan plan. +/// +/// Layouts are serialization metadata; concrete scan plans are bound to the segment source +/// they will read from when the layout is expanded. +#[derive(Clone)] +pub struct LayoutScanPlanCtx { + session: VortexSession, + segment_source: Arc, + segment_future_cache: Arc, +} + +impl LayoutScanPlanCtx { + /// Create a layout scan-plan expansion context. + pub fn new( + session: VortexSession, + segment_source: Arc, + segment_future_cache: Arc, + ) -> Self { + Self { + session, + segment_source, + segment_future_cache, + } + } + + /// Return the session used while constructing scan plans. + pub fn session(&self) -> &VortexSession { + &self.session + } + + /// Return the segment source concrete scan plans should capture. 
+ pub fn segment_source(&self) -> &Arc { + &self.segment_source + } + + /// Return the file-level cache used for scheduled segment futures. + pub fn segment_future_cache(&self) -> &Arc { + &self.segment_future_cache + } +} + /// Object-safe plugin for deserializing v2 layouts by ID. pub trait LayoutVTablePlugin: 'static + Send + Sync { /// Returns the ID of this layout encoding. @@ -329,7 +372,7 @@ trait DynLayout: 'static + Send + Sync + Debug { fn dyn_new_scan_plan( &self, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult; } @@ -388,9 +431,9 @@ impl LayoutRef { pub fn new_scan_plan( &self, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { - self.0.dyn_new_scan_plan(req, session) + self.0.dyn_new_scan_plan(req, ctx) } /// Returns an iterator over child row offsets. @@ -445,9 +488,9 @@ impl DynLayout for Layout { fn dyn_new_scan_plan( &self, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { - V::new_scan_plan(self.clone(), req, session) + V::new_scan_plan(self.clone(), req, ctx) } } diff --git a/vortex-layout/src/layouts_v2/chunked.rs b/vortex-layout/src/layouts_v2/chunked.rs index 3ed0e361e4c..373f99911c8 100644 --- a/vortex-layout/src/layouts_v2/chunked.rs +++ b/vortex-layout/src/layouts_v2/chunked.rs @@ -8,12 +8,12 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_error::vortex_err; -use vortex_session::VortexSession; use crate::LayoutChildType; use crate::LayoutId; use crate::layout_v2::Layout; use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; use crate::scan::plan::ScanPlanRef; use crate::scan::plan::request::ScanRequest; @@ -79,9 +79,9 @@ impl VTable for Chunked { fn new_scan_plan( layout: Layout, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { - 
scan_chunked::new_scan_plan(layout, req, session) + scan_chunked::new_scan_plan(layout, req, ctx) } } diff --git a/vortex-layout/src/layouts_v2/dict.rs b/vortex-layout/src/layouts_v2/dict.rs index 8167ebf7573..a7b14fdb11f 100644 --- a/vortex-layout/src/layouts_v2/dict.rs +++ b/vortex-layout/src/layouts_v2/dict.rs @@ -7,12 +7,12 @@ use vortex_array::dtype::PType; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; -use vortex_session::VortexSession; use crate::LayoutChildType; use crate::LayoutId; use crate::layout_v2::Layout; use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; use crate::layout_v2::metadata_bool_field; use crate::layout_v2::metadata_varint_field; @@ -77,8 +77,8 @@ impl VTable for Dict { fn new_scan_plan( layout: Layout, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { - scan_dict::new_scan_plan(layout, req, session) + scan_dict::new_scan_plan(layout, req, ctx) } } diff --git a/vortex-layout/src/layouts_v2/flat.rs b/vortex-layout/src/layouts_v2/flat.rs index dd5d69a276a..061c256063c 100644 --- a/vortex-layout/src/layouts_v2/flat.rs +++ b/vortex-layout/src/layouts_v2/flat.rs @@ -6,13 +6,13 @@ use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; -use vortex_session::VortexSession; use vortex_session::registry::ReadContext; use crate::LayoutChildType; use crate::LayoutId; use crate::layout_v2::Layout; use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; use crate::layout_v2::metadata_bytes_field; use crate::scan::plan::ScanPlanRef; @@ -79,8 +79,8 @@ impl VTable for Flat { fn new_scan_plan( layout: Layout, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { - scan_flat::new_scan_plan(layout, req, session) + 
scan_flat::new_scan_plan(layout, req, ctx) } } diff --git a/vortex-layout/src/layouts_v2/struct_.rs b/vortex-layout/src/layouts_v2/struct_.rs index 1ab35db88bc..c512856e03b 100644 --- a/vortex-layout/src/layouts_v2/struct_.rs +++ b/vortex-layout/src/layouts_v2/struct_.rs @@ -7,12 +7,12 @@ use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_error::VortexResult; use vortex_error::vortex_err; -use vortex_session::VortexSession; use crate::LayoutChildType; use crate::LayoutId; use crate::layout_v2::Layout; use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; use crate::scan::plan::ScanPlanRef; use crate::scan::plan::request::ScanRequest; @@ -72,8 +72,8 @@ impl VTable for Struct { fn new_scan_plan( layout: Layout, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { - scan_struct::new_scan_plan(layout, req, session) + scan_struct::new_scan_plan(layout, req, ctx) } } diff --git a/vortex-layout/src/layouts_v2/zoned.rs b/vortex-layout/src/layouts_v2/zoned.rs index d9aa4d21d3e..f8f6e6da51b 100644 --- a/vortex-layout/src/layouts_v2/zoned.rs +++ b/vortex-layout/src/layouts_v2/zoned.rs @@ -8,12 +8,12 @@ use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; use vortex_error::VortexResult; use vortex_error::vortex_bail; -use vortex_session::VortexSession; use crate::LayoutChildType; use crate::LayoutId; use crate::layout_v2::Layout; use crate::layout_v2::LayoutDeserializeArgs; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; use crate::layouts::zoned::LegacyStatsMetadata; use crate::layouts::zoned::ZoneMapSchema; @@ -103,9 +103,9 @@ impl VTable for Zoned { fn new_scan_plan( layout: Layout, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { - scan_zoned::new_scan_plan(layout, req, session) + scan_zoned::new_scan_plan(layout, req, ctx) } } @@ -152,8 
+152,8 @@ impl VTable for LegacyStats { fn new_scan_plan( layout: Layout, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { - scan_zoned::new_scan_plan(layout, req, session) + scan_zoned::new_scan_plan(layout, req, ctx) } } diff --git a/vortex-layout/src/scan/plan/mod.rs b/vortex-layout/src/scan/plan/mod.rs index 06c0d919a92..989957aed1f 100644 --- a/vortex-layout/src/scan/plan/mod.rs +++ b/vortex-layout/src/scan/plan/mod.rs @@ -33,7 +33,6 @@ use vortex_array::VortexSessionExecute; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::StructArray; -use vortex_array::buffer::BufferHandle; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::Field; use vortex_array::dtype::FieldNames; @@ -52,41 +51,25 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::Mask; -use vortex_scan::read::ReadRequestKey; use vortex_scan::read::ReadResults; +use vortex_scan::read::ScanIoPhase; use vortex_scan::read::ScanRead; use vortex_session::VortexSession; -use vortex_utils::aliases::hash_set::HashSet; use self::evidence::EvidenceFragment; use self::request::EvidenceRequest; use self::request::OwnedEvidenceRequest; -use crate::segments::SegmentPlanCtx; -use crate::segments::SegmentRequestKey; -use crate::segments::SegmentRequests; -use crate::segments::SegmentSource; /// Execution context for prepared scan tasks. #[derive(Clone)] pub struct ReadContext { - segments: Arc, session: VortexSession, } impl ReadContext { - /// Create a read context from a segment source and session. - pub fn new(segments: Arc, session: VortexSession) -> Self { - Self { segments, session } - } - - /// Segment source for layout data. - pub fn segments(&self) -> &Arc { - &self.segments - } - - /// Return a segment that was resolved by the scan scheduler before execution. 
- pub fn segment(&self, id: crate::segments::SegmentId) -> VortexResult { - self.segments.resolved(id) + /// Create a read context from a session. + pub fn new(session: VortexSession) -> Self { + Self { session } } /// Session used to decode arrays and execute expressions. @@ -563,22 +546,11 @@ impl ScanPlan for LiteralScanPlan { } impl PreparedRead for LiteralPreparedRead { - fn segment_requests( - &self, - _range: Range, - _rows: RowScope<'_>, - _cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::none()) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - _reads: Vec, - _prefetch_reads: Vec, - _cx: &mut SegmentPlanCtx, + _phase: ScanIoPhase, ) -> VortexResult> { check_scan_range(&range, self.row_count)?; Ok(Box::new(LiteralReadTask { @@ -690,34 +662,12 @@ fn check_scan_range(range: &Range, row_count: u64) -> VortexResult<()> { /// may hold child prepared reads and initializes route-scoped state once per /// prepared file scan; each morsel execution is represented as a [`ReadTask`]. pub trait PreparedRead: 'static + Send + Sync { - /// Return scheduler-visible segment requests needed for this read, when known exactly. - fn segment_requests( - &self, - _range: Range, - _rows: RowScope<'_>, - _cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::unknown()) - } - - /// Return scheduler-visible segment requests that may be fetched speculatively. - fn prefetch_segment_requests( - &self, - _range: Range, - _rows: RowScope<'_>, - _cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::none()) - } - /// Create a morsel-level read task for this prepared read. fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult>; /// Release state behind the completed-row frontier. 
@@ -801,32 +751,6 @@ pub trait ReadTask: Send { fn into_step(self: Box) -> VortexResult; } -pub(crate) fn take_reads_for_requests( - registered: &mut [Option], - requests: SegmentRequests, -) -> VortexResult> { - let Some(requests) = requests.into_exact() else { - vortex_bail!("scan2 child task produced unknown segment requests") - }; - let keys = requests - .iter() - .map(|request| ReadRequestKey::from(SegmentRequestKey::from(request))) - .collect::>(); - Ok(registered - .iter_mut() - .filter_map(|read| { - if read - .as_ref() - .is_some_and(|read| keys.contains(&read.request.key)) - { - read.take() - } else { - None - } - }) - .collect()) -} - enum StructReadPart { Ready(ArrayRef), Pending(Box), @@ -1289,92 +1213,26 @@ impl ScanPlan for StructValueScanPlan { } impl PreparedRead for StructValuePreparedRead { - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let mut requests = SegmentRequests::none(); - for field in &self.fields { - requests.extend(field.segment_requests(range.clone(), rows, cx)?); - if requests.is_unknown() { - return Ok(requests); - } - } - if let Some(validity) = &self.validity { - requests.extend(validity.segment_requests(range, rows, cx)?); - } - Ok(requests) - } - - fn prefetch_segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let mut requests = SegmentRequests::none(); - for field in &self.fields { - requests.extend(field.prefetch_segment_requests(range.clone(), rows, cx)?); - if requests.is_unknown() { - return Ok(requests); - } - } - if let Some(validity) = &self.validity { - requests.extend(validity.prefetch_segment_requests(range, rows, cx)?); - } - Ok(requests) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { - let mut reads = reads.into_iter().map(Some).collect::>(); - let mut 
prefetch_reads = prefetch_reads.into_iter().map(Some).collect::>(); let mut fields = Vec::with_capacity(self.fields.len()); for field in &self.fields { - let field_reads = take_reads_for_requests( - &mut reads, - field.segment_requests(range.clone(), rows.as_scope(), cx)?, - )?; - let field_prefetch_reads = take_reads_for_requests( - &mut prefetch_reads, - field.prefetch_segment_requests(range.clone(), rows.as_scope(), cx)?, - )?; fields.push(StructReadPart::Pending(Arc::clone(field).create_task( range.clone(), rows.clone(), - field_reads, - field_prefetch_reads, - cx, + phase, )?)); } let validity = self .validity .as_ref() .map(|validity| { - let validity_reads = take_reads_for_requests( - &mut reads, - validity.segment_requests(range.clone(), rows.as_scope(), cx)?, - )?; - let validity_prefetch_reads = take_reads_for_requests( - &mut prefetch_reads, - validity.prefetch_segment_requests(range.clone(), rows.as_scope(), cx)?, - )?; Arc::clone(validity) - .create_task( - range.clone(), - rows.clone(), - validity_reads, - validity_prefetch_reads, - cx, - ) + .create_task(range.clone(), rows.clone(), phase) .map(StructReadPart::Pending) }) .transpose()?; @@ -1455,33 +1313,13 @@ impl ScanPlan for ApplyScanPlan { } impl PreparedRead for ApplyPreparedRead { - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - self.input.segment_requests(range, rows, cx) - } - - fn prefetch_segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - self.input.prefetch_segment_requests(range, rows, cx) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { - let input = Arc::clone(&self.input).create_task(range, rows, reads, prefetch_reads, cx)?; + let input = Arc::clone(&self.input).create_task(range, rows, phase)?; Ok(Box::new(ApplyReadTask { 
expr: self.plan.expr.clone(), input, @@ -1578,80 +1416,14 @@ impl ScanPlan for MaskScanPlan { } impl PreparedRead for MaskPreparedRead { - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let mut requests = self.input.segment_requests(range.clone(), rows, cx)?; - if requests.is_unknown() { - return Ok(requests); - } - requests.extend(self.validity.segment_requests(range, rows, cx)?); - Ok(requests) - } - - fn prefetch_segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let mut requests = self - .input - .prefetch_segment_requests(range.clone(), rows, cx)?; - if requests.is_unknown() { - return Ok(requests); - } - requests.extend(self.validity.prefetch_segment_requests(range, rows, cx)?); - Ok(requests) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { - let mut reads = reads.into_iter().map(Some).collect::>(); - let mut prefetch_reads = prefetch_reads.into_iter().map(Some).collect::>(); - let input_reads = take_reads_for_requests( - &mut reads, - self.input - .segment_requests(range.clone(), rows.as_scope(), cx)?, - )?; - let input_prefetch_reads = take_reads_for_requests( - &mut prefetch_reads, - self.input - .prefetch_segment_requests(range.clone(), rows.as_scope(), cx)?, - )?; - let validity_reads = take_reads_for_requests( - &mut reads, - self.validity - .segment_requests(range.clone(), rows.as_scope(), cx)?, - )?; - let validity_prefetch_reads = take_reads_for_requests( - &mut prefetch_reads, - self.validity - .prefetch_segment_requests(range.clone(), rows.as_scope(), cx)?, - )?; - let input = Arc::clone(&self.input).create_task( - range.clone(), - rows.clone(), - input_reads, - input_prefetch_reads, - cx, - )?; - let validity = Arc::clone(&self.validity).create_task( - range, - rows, - validity_reads, - 
validity_prefetch_reads, - cx, - )?; + let input = Arc::clone(&self.input).create_task(range.clone(), rows.clone(), phase)?; + let validity = Arc::clone(&self.validity).create_task(range, rows, phase)?; Ok(Box::new(MaskReadTask { input, validity })) } @@ -1717,29 +1489,9 @@ pub trait PreparedEvidence: 'static + Send + Sync { &'a self, req: &'a EvidenceRequest<'a>, io: &'a ReadContext, + results: ReadResults, ) -> VortexResult>; - /// Return scheduler-visible segment requests needed for this evidence, when known exactly. - fn segment_requests( - &self, - _req: &EvidenceRequest<'_>, - _cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::unknown()) - } - - /// Return scheduler-visible segment requests that may be fetched speculatively. - /// - /// Prefetch requests must not be required for the immediate [`PreparedEvidence::evidence`] - /// execution path, because the scan scheduler may launch them without waiting for completion. - fn prefetch_segment_requests( - &self, - _req: &EvidenceRequest<'_>, - _cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::none()) - } - /// Whether this handle is cheap enough to re-run immediately before a /// projection read when a dynamic predicate boundary changes while /// the morsel is in flight. @@ -1761,52 +1513,68 @@ pub trait PreparedEvidence: 'static + Send + Sync { fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "evidence") } -} -impl dyn PreparedEvidence { /// Create a morsel-level evidence task for this prepared evidence handle. - pub fn create_task( + fn create_task( self: Arc, req: OwnedEvidenceRequest, - reads: Vec, - ) -> VortexResult> { - Ok(Box::new(DefaultEvidenceTask { - evidence: self, - req, - reads, - })) - } + phase: ScanIoPhase, + ) -> VortexResult>; } /// A morsel-level evidence task. pub trait EvidenceTask: Send { - /// Registered reads needed by this task. - fn reads(&self) -> &[ScanRead]; - - /// Execute the evidence task. 
- fn evidence(self: Box, io: &ReadContext) -> VortexResult>; + /// Convert this task into its scheduler-visible step. + fn into_step(self: Box) -> VortexResult; } -struct DefaultEvidenceTask { - evidence: PreparedEvidenceRef, - req: OwnedEvidenceRequest, - reads: Vec, +/// Continuation called after an evidence step's required reads have resolved. +pub trait EvidenceContinuation: Send { + /// Execute the continuation. + fn run( + self: Box, + io: &ReadContext, + results: ReadResults, + ) -> VortexResult>; } -impl EvidenceTask for DefaultEvidenceTask { - fn reads(&self) -> &[ScanRead] { - &self.reads +impl EvidenceContinuation for F +where + F: FnOnce(&ReadContext, ReadResults) -> VortexResult> + Send, +{ + fn run( + self: Box, + io: &ReadContext, + results: ReadResults, + ) -> VortexResult> { + self(io, results) } +} - fn evidence(self: Box, io: &ReadContext) -> VortexResult> { - let Self { - evidence, - req, - reads, - } = *self; - let result = evidence.evidence(&req.as_request(), io); - drop(reads); - result +/// One scheduler-visible step of an evidence task. +pub struct EvidenceStep { + /// Reads that must resolve before the continuation runs. + pub required_reads: Vec, + /// Reads that may be fetched speculatively while this step is queued. + pub prefetch_reads: Vec, + /// Continuation to execute after required reads resolve. + pub continuation: Box, +} + +impl EvidenceStep { + /// Create an evidence step. 
+ pub fn new( + required_reads: Vec, + prefetch_reads: Vec, + continuation: impl FnOnce(&ReadContext, ReadResults) -> VortexResult> + + Send + + 'static, + ) -> Self { + Self { + required_reads, + prefetch_reads, + continuation: Box::new(continuation), + } } } @@ -1829,29 +1597,12 @@ mod tests { use vortex_array::aggregate_fn::fns::max::Max; use vortex_array::aggregate_fn::fns::min::Min; use vortex_array::arrays::Constant; - use vortex_array::buffer::BufferHandle; use vortex_array::dtype::Nullability; use vortex_array::expr::lit; - use vortex_buffer::ByteBuffer; use vortex_scan::read::ReadStore; use super::*; - struct TestSegments; - - impl SegmentSource for TestSegments { - fn segment_info( - &self, - _id: crate::segments::SegmentId, - ) -> VortexResult { - Ok(crate::segments::SegmentInfo::new(0)) - } - - fn request(&self, _id: crate::segments::SegmentId) -> crate::segments::SegmentFuture { - Box::pin(async { Ok(BufferHandle::new_host(ByteBuffer::from(Vec::::new()))) }) - } - } - struct TestStatsNode; impl ScanPlan for TestStatsNode { @@ -1940,7 +1691,7 @@ mod tests { )? .ok_or_else(|| vortex_err!("test scan plan did not return a stats plan"))?; let state = plan.init_state(&session)?; - let io = ReadContext::new(Arc::new(TestSegments), session); + let io = ReadContext::new(session); let stats = futures::executor::block_on(plan.stats(10..20, &io, state.as_ref()))?; assert_eq!(stats.len(), funcs.len()); @@ -1962,10 +1713,9 @@ mod tests { let read = plan .prepare_read(&mut PrepareCtx::new(session.clone()))? 
.ok_or_else(|| vortex_err!("literal scan plan did not return a prepared read"))?; - let io = ReadContext::new(Arc::new(TestSegments), session); + let io = ReadContext::new(session); let rows = OwnedRowScope::selected(Mask::new_true(5)); - let mut segment_ctx = SegmentPlanCtx::new(Arc::clone(io.segments()), io.session().clone()); - let task = read.create_task(10..15, rows, Vec::new(), Vec::new(), &mut segment_ctx)?; + let task = read.create_task(10..15, rows, ScanIoPhase::ProjectionRead)?; let results = ReadResults::new(Arc::new(ReadStore::new())); let step = task.into_step()?; if !step.required_reads.is_empty() || !step.prefetch_reads.is_empty() { diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index ab96af436d1..b9c0bc8e5f8 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -39,14 +39,18 @@ use vortex_array::scalar::Scalar; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; -use vortex_scan::read::ScanRead; +use vortex_scan::read::ReadResults; +use vortex_scan::read::ScanIoPhase; use vortex_session::VortexSession; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layouts_v2::chunked::Chunked; use crate::scan::plan::AggregateAnswer; use crate::scan::plan::DeferredReadTask; +use crate::scan::plan::EvidenceStep; +use crate::scan::plan::EvidenceTask; use crate::scan::plan::OwnedRowScope; use crate::scan::plan::PrepareCtx; use crate::scan::plan::PreparedAggregate; @@ -62,7 +66,6 @@ use crate::scan::plan::ReadContext; use crate::scan::plan::ReadStep; use crate::scan::plan::ReadTask; use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::RowScope; use crate::scan::plan::ScanPlan; use crate::scan::plan::ScanPlanRef; use crate::scan::plan::ScanState; @@ -73,20 +76,18 @@ use crate::scan::plan::downcast_state; use 
crate::scan::plan::evidence::EvidenceFragment; use crate::scan::plan::request::EvidenceMode; use crate::scan::plan::request::EvidenceRequest; +use crate::scan::plan::request::OwnedEvidenceRequest; use crate::scan::plan::request::ScanRequest; -use crate::scan::plan::take_reads_for_requests; -use crate::segments::SegmentPlanCtx; -use crate::segments::SegmentRequests; pub(crate) fn new_scan_plan( layout: Layout, _req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { Ok(Arc::new(ChunkedScanPlan { layout: layout.to_layout(), offsets: layout.data().chunk_offsets().to_vec(), - session: session.clone(), + ctx: ctx.clone(), children: Mutex::new(FxHashMap::default()), })) } @@ -97,7 +98,7 @@ pub(crate) fn new_scan_plan( pub struct ChunkedScanPlan { layout: LayoutRef, offsets: Vec, - session: VortexSession, + ctx: LayoutScanPlanCtx, /// Lazily expanded chunk nodes, shared across queries. children: Mutex>, } @@ -142,6 +143,13 @@ pub struct ChunkedExprScanState { struct ChunkedPreparedEvidence { node: Arc, state: Arc, + session: VortexSession, +} + +struct ChunkedEvidenceTask { + evidence: Arc, + req: OwnedEvidenceRequest, + phase: ScanIoPhase, } enum ChunkedAggregateNode { @@ -313,10 +321,7 @@ impl ChunkedScanPlan { return Ok(Arc::clone(hit)); } let mut req = ScanRequest::empty(); - let plan = self - .layout - .child(idx)? 
- .new_scan_plan(&mut req, &self.session)?; + let plan = self.layout.child(idx)?.new_scan_plan(&mut req, &self.ctx)?; self.children.lock().insert(idx, Arc::clone(&plan)); Ok(plan) } @@ -585,7 +590,11 @@ impl ScanPlan for ChunkedScanPlan { let key = PreparedStateKey::new::(Arc::as_ptr(&self) as *const () as usize); let state = cx.shared_state(key, || Ok(ChunkedEvidenceState::new(chunked_state)))?; - Ok(vec![Arc::new(ChunkedPreparedEvidence { node, state })]) + Ok(vec![Arc::new(ChunkedPreparedEvidence { + node, + state, + session: cx.session().clone(), + })]) } fn prepare_aggregate_partial( @@ -645,9 +654,7 @@ impl PreparedRead for ChunkedPreparedRead { self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { if range.start >= range.end { vortex_bail!("empty chunked scoped read range"); @@ -686,8 +693,6 @@ impl PreparedRead for ChunkedPreparedRead { } let dtype = self.node.layout.dtype().clone(); - let mut reads = reads.into_iter().map(Some).collect::>(); - let mut prefetch_reads = prefetch_reads.into_iter().map(Some).collect::>(); let dense_scope = row_scope.selection.all_true() && row_scope.demand.all_true(); let selected_scope = !dense_scope && row_scope.demands_all_selected(); let mut parts = Vec::new(); @@ -715,31 +720,18 @@ impl PreparedRead for ChunkedPreparedRead { continue; } let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; + let read = self + .node + .child_read(chunk_idx, &self.state, self.node.ctx.session())?; let chunk_rows = if dense_scope || selected_scope { OwnedRowScope::selected(chunk_selection.clone()) } else { OwnedRowScope::try_new(chunk_selection.clone(), chunk_demand)? 
}; - let chunk_scope = chunk_rows.as_scope(); - let chunk_reads = take_reads_for_requests( - &mut reads, - read.segment_requests(local.clone(), chunk_scope, cx)?, - )?; - let chunk_prefetch_reads = take_reads_for_requests( - &mut prefetch_reads, - read.prefetch_segment_requests(local.clone(), chunk_scope, cx)?, - )?; let expected_len = chunk_selection.true_count(); parts.push(ChunkedReadPart::Pending { expected_len, - task: Arc::clone(&read).create_task( - local, - chunk_rows, - chunk_reads, - chunk_prefetch_reads, - cx, - )?, + task: Arc::clone(&read).create_task(local, chunk_rows, phase)?, }); } match parts.len() { @@ -748,154 +740,6 @@ impl PreparedRead for ChunkedPreparedRead { } } - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - if range.start >= range.end { - vortex_bail!("empty chunked scoped read range"); - } - #[cfg(debug_assertions)] - { - let released = self.state.released.load(Ordering::Relaxed); - debug_assert!( - range.start >= released, - "chunked request planning {range:?} below the released frontier {released}" - ); - } - let range_len = usize::try_from(range.end - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - if rows.selection.len() != range_len { - vortex_bail!( - "selection length {} does not match range length {range_len}", - rows.selection.len() - ); - } - if rows.demand.len() != range_len { - vortex_bail!( - "demand length {} does not match range length {range_len}", - rows.demand.len() - ); - } - if rows.selection.all_false() { - return Ok(SegmentRequests::none()); - } - - let dense_scope = rows.selection.all_true() && rows.demand.all_true(); - let selected_scope = !dense_scope && rows.demands_all_selected(); - let mut requests = SegmentRequests::none(); - let mut saw_overlap = false; - let mut idx = self.node.first_chunk(range.start); - while idx + 1 < self.node.offsets.len() && self.node.offsets[idx] < range.end { - saw_overlap = true; - let 
chunk_start = self.node.offsets[idx]; - let chunk_end = self.node.offsets[idx + 1]; - let local = - range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); - let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let chunk_selection = rows.selection.slice(sel_start..sel_end); - idx += 1; - if chunk_selection.all_false() { - continue; - } - let chunk_demand = rows.demand.slice(sel_start..sel_end); - if chunk_demand.all_false() { - continue; - } - let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; - let chunk_requests = if dense_scope || selected_scope { - read.segment_requests(local, RowScope::selected(&chunk_selection), cx)? - } else { - let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.segment_requests(local, chunk_rows, cx)? 
- }; - requests.extend(chunk_requests); - if requests.is_unknown() { - return Ok(requests); - } - } - if !saw_overlap { - vortex_bail!("chunked scoped read range {range:?} out of bounds"); - } - Ok(requests) - } - - fn prefetch_segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - if range.start >= range.end { - vortex_bail!("empty chunked scoped read range"); - } - let range_len = usize::try_from(range.end - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - if rows.selection.len() != range_len { - vortex_bail!( - "selection length {} does not match range length {range_len}", - rows.selection.len() - ); - } - if rows.demand.len() != range_len { - vortex_bail!( - "demand length {} does not match range length {range_len}", - rows.demand.len() - ); - } - if rows.selection.all_false() { - return Ok(SegmentRequests::none()); - } - - let dense_scope = rows.selection.all_true() && rows.demand.all_true(); - let selected_scope = !dense_scope && rows.demands_all_selected(); - let mut requests = SegmentRequests::none(); - let mut saw_overlap = false; - let mut idx = self.node.first_chunk(range.start); - while idx + 1 < self.node.offsets.len() && self.node.offsets[idx] < range.end { - saw_overlap = true; - let chunk_start = self.node.offsets[idx]; - let chunk_end = self.node.offsets[idx + 1]; - let local = - range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); - let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let chunk_selection = rows.selection.slice(sel_start..sel_end); - idx += 1; - if chunk_selection.all_false() { - continue; - } - let chunk_demand = rows.demand.slice(sel_start..sel_end); - if chunk_demand.all_false() { - continue; - } - let 
chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; - let chunk_requests = if dense_scope || selected_scope { - read.prefetch_segment_requests(local, RowScope::selected(&chunk_selection), cx)? - } else { - let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.prefetch_segment_requests(local, chunk_rows, cx)? - }; - requests.extend(chunk_requests); - if requests.is_unknown() { - return Ok(requests); - } - } - if !saw_overlap { - vortex_bail!("chunked scoped read range {range:?} out of bounds"); - } - Ok(requests) - } - fn release(&self, frontier: u64) -> VortexResult<()> { self.node.release(frontier, &self.state) } @@ -953,6 +797,7 @@ impl ScanPlan for ChunkedExprScanPlan { Ok(vec![Arc::new(ChunkedPreparedEvidence { node: self, state, + session: cx.session().clone(), })]) } @@ -1005,9 +850,7 @@ impl PreparedRead for ChunkedExprPreparedRead { self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { if range.start >= range.end { vortex_bail!("empty chunked scoped read range"); @@ -1044,8 +887,6 @@ impl PreparedRead for ChunkedExprPreparedRead { })); } - let mut reads = reads.into_iter().map(Some).collect::>(); - let mut prefetch_reads = prefetch_reads.into_iter().map(Some).collect::>(); let dense_scope = row_scope.selection.all_true() && row_scope.demand.all_true(); let selected_scope = !dense_scope && row_scope.demands_all_selected(); let mut parts = Vec::new(); @@ -1078,31 +919,18 @@ impl PreparedRead for ChunkedExprPreparedRead { continue; } let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; + let read = + self.node + .child_read(chunk_idx, &self.state, self.node.chunked.ctx.session())?; let chunk_rows = if dense_scope || selected_scope { OwnedRowScope::selected(chunk_selection.clone()) } else { OwnedRowScope::try_new(chunk_selection.clone(), chunk_demand)? 
}; - let chunk_scope = chunk_rows.as_scope(); - let chunk_reads = take_reads_for_requests( - &mut reads, - read.segment_requests(local.clone(), chunk_scope, cx)?, - )?; - let chunk_prefetch_reads = take_reads_for_requests( - &mut prefetch_reads, - read.prefetch_segment_requests(local.clone(), chunk_scope, cx)?, - )?; let expected_len = chunk_selection.true_count(); parts.push(ChunkedReadPart::Pending { expected_len, - task: Arc::clone(&read).create_task( - local, - chunk_rows, - chunk_reads, - chunk_prefetch_reads, - cx, - )?, + task: Arc::clone(&read).create_task(local, chunk_rows, phase)?, }); } match parts.len() { @@ -1114,158 +942,6 @@ impl PreparedRead for ChunkedExprPreparedRead { } } - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - if range.start >= range.end { - vortex_bail!("empty chunked scoped read range"); - } - #[cfg(debug_assertions)] - { - let released = self.state.released.load(Ordering::Relaxed); - debug_assert!( - range.start >= released, - "chunked expression request planning {range:?} below the released frontier {released}" - ); - } - let range_len = usize::try_from(range.end - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - if rows.selection.len() != range_len { - vortex_bail!( - "selection length {} does not match range length {range_len}", - rows.selection.len() - ); - } - if rows.demand.len() != range_len { - vortex_bail!( - "demand length {} does not match range length {range_len}", - rows.demand.len() - ); - } - if rows.selection.all_false() { - return Ok(SegmentRequests::none()); - } - - let dense_scope = rows.selection.all_true() && rows.demand.all_true(); - let selected_scope = !dense_scope && rows.demands_all_selected(); - let mut requests = SegmentRequests::none(); - let mut saw_overlap = false; - let mut idx = self.node.chunked.first_chunk(range.start); - while idx + 1 < self.node.chunked.offsets.len() - && self.node.chunked.offsets[idx] 
< range.end - { - saw_overlap = true; - let chunk_start = self.node.chunked.offsets[idx]; - let chunk_end = self.node.chunked.offsets[idx + 1]; - let local = - range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); - let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let chunk_selection = rows.selection.slice(sel_start..sel_end); - idx += 1; - if chunk_selection.all_false() { - continue; - } - let chunk_demand = rows.demand.slice(sel_start..sel_end); - if chunk_demand.all_false() { - continue; - } - let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; - let chunk_requests = if dense_scope || selected_scope { - read.segment_requests(local, RowScope::selected(&chunk_selection), cx)? - } else { - let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.segment_requests(local, chunk_rows, cx)? 
- }; - requests.extend(chunk_requests); - if requests.is_unknown() { - return Ok(requests); - } - } - if !saw_overlap { - vortex_bail!("chunked scoped read range {range:?} out of bounds"); - } - Ok(requests) - } - - fn prefetch_segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - if range.start >= range.end { - vortex_bail!("empty chunked scoped read range"); - } - let range_len = usize::try_from(range.end - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - if rows.selection.len() != range_len { - vortex_bail!( - "selection length {} does not match range length {range_len}", - rows.selection.len() - ); - } - if rows.demand.len() != range_len { - vortex_bail!( - "demand length {} does not match range length {range_len}", - rows.demand.len() - ); - } - if rows.selection.all_false() { - return Ok(SegmentRequests::none()); - } - - let dense_scope = rows.selection.all_true() && rows.demand.all_true(); - let selected_scope = !dense_scope && rows.demands_all_selected(); - let mut requests = SegmentRequests::none(); - let mut saw_overlap = false; - let mut idx = self.node.chunked.first_chunk(range.start); - while idx + 1 < self.node.chunked.offsets.len() - && self.node.chunked.offsets[idx] < range.end - { - saw_overlap = true; - let chunk_start = self.node.chunked.offsets[idx]; - let chunk_end = self.node.chunked.offsets[idx + 1]; - let local = - range.start.saturating_sub(chunk_start)..(range.end.min(chunk_end) - chunk_start); - let sel_start = usize::try_from(chunk_start.max(range.start) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let sel_end = usize::try_from(chunk_end.min(range.end) - range.start) - .map_err(|_| vortex_err!("read range exceeds usize"))?; - let chunk_selection = rows.selection.slice(sel_start..sel_end); - idx += 1; - if chunk_selection.all_false() { - continue; - } - let chunk_demand = rows.demand.slice(sel_start..sel_end); - if 
chunk_demand.all_false() { - continue; - } - let chunk_idx = idx - 1; - let read = self.node.child_read(chunk_idx, &self.state, cx.session())?; - let chunk_requests = if dense_scope || selected_scope { - read.prefetch_segment_requests(local, RowScope::selected(&chunk_selection), cx)? - } else { - let chunk_rows = RowScope::try_new(&chunk_selection, &chunk_demand)?; - read.prefetch_segment_requests(local, chunk_rows, cx)? - }; - requests.extend(chunk_requests); - if requests.is_unknown() { - return Ok(requests); - } - } - if !saw_overlap { - vortex_bail!("chunked scoped read range {range:?} out of bounds"); - } - Ok(requests) - } - fn release(&self, frontier: u64) -> VortexResult<()> { self.node.release(frontier, &self.state) } @@ -1280,6 +956,7 @@ impl PreparedEvidence for ChunkedPreparedEvidence { &'a self, req: &'a EvidenceRequest<'a>, io: &'a ReadContext, + results: ReadResults, ) -> VortexResult> { if req.range.start >= req.range.end { return Ok(Vec::new()); @@ -1329,7 +1006,7 @@ impl PreparedEvidence for ChunkedPreparedEvidence { if recheck && !plan.recheck_before_projection() { continue; } - for fragment in plan.evidence(&child_req, io)? { + for fragment in plan.evidence(&child_req, io, results.clone())? 
{ fragments.push(translate_fragment(fragment, chunk_start)); } } @@ -1339,118 +1016,87 @@ impl PreparedEvidence for ChunkedPreparedEvidence { Ok(fragments) } - fn segment_requests( - &self, - req: &EvidenceRequest<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - if req.range.start >= req.range.end { - return Ok(SegmentRequests::none()); - } + fn recheck_before_projection(&self) -> bool { + true + } - let mut requests = SegmentRequests::none(); - let mut idx = self.node.chunked.first_chunk(req.range.start); - while idx + 1 < self.node.chunked.offsets.len() - && self.node.chunked.offsets[idx] < req.range.end - { - let chunk_start = self.node.chunked.offsets[idx]; - let chunk_end = self.node.chunked.offsets[idx + 1]; - let local = req.range.start.saturating_sub(chunk_start) - ..(req.range.end.min(chunk_end) - chunk_start); - let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; - let child_plans = if let Some(hit) = self.state.children.lock().get(&idx) { - hit.clone() - } else if recheck { - if let Some(hit) = self.state.recheck_children.lock().get(&idx) { - hit.clone() - } else { - let node = self.node.child(idx, cx.session())?; - let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, cx.session()); - let plans = node.prepare_evidence(&mut plan_ctx)?; - let planned = plans - .into_iter() - .filter(|plan| plan.recheck_before_projection()) - .collect::>(); - let mut children = self.state.recheck_children.lock(); - children.entry(idx).or_insert(planned).clone() - } - } else { - let node = self.node.child(idx, cx.session())?; - let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, cx.session()); - let planned = node.prepare_evidence(&mut plan_ctx)?; - let mut children = self.state.children.lock(); - children.entry(idx).or_insert(planned).clone() - }; - if !child_plans.is_empty() { - let child_req = EvidenceRequest { - id: req.id, - version: req.version, - predicate: req.predicate, - range: local, - mode: req.mode, - }; - for plan in 
child_plans { - if recheck && !plan.recheck_before_projection() { - continue; - } - requests.extend(plan.segment_requests(&child_req, cx)?); - if requests.is_unknown() { - return Ok(requests); - } - } - } - idx += 1; - } - Ok(requests) + fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "chunked") } - fn prefetch_segment_requests( - &self, - req: &EvidenceRequest<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { + fn create_task( + self: Arc, + req: OwnedEvidenceRequest, + phase: ScanIoPhase, + ) -> VortexResult> { + Ok(Box::new(ChunkedEvidenceTask { + evidence: self, + req, + phase, + })) + } +} + +impl EvidenceTask for ChunkedEvidenceTask { + fn into_step(self: Box) -> VortexResult { + let Self { + evidence, + req, + phase, + } = *self; if req.range.start >= req.range.end { - return Ok(SegmentRequests::none()); + return Ok(EvidenceStep::new( + Vec::new(), + Vec::new(), + move |io, results| evidence.evidence(&req.as_request(), io, results), + )); } - let mut requests = SegmentRequests::none(); - let mut idx = self.node.chunked.first_chunk(req.range.start); - while idx + 1 < self.node.chunked.offsets.len() - && self.node.chunked.offsets[idx] < req.range.end + let mut required_reads = Vec::new(); + let mut prefetch_reads = Vec::new(); + let mut idx = evidence.node.chunked.first_chunk(req.range.start); + while idx + 1 < evidence.node.chunked.offsets.len() + && evidence.node.chunked.offsets[idx] < req.range.end { - let chunk_start = self.node.chunked.offsets[idx]; - let chunk_end = self.node.chunked.offsets[idx + 1]; + let chunk_start = evidence.node.chunked.offsets[idx]; + let chunk_end = evidence.node.chunked.offsets[idx + 1]; let local = req.range.start.saturating_sub(chunk_start) ..(req.range.end.min(chunk_end) - chunk_start); let recheck = req.mode == EvidenceMode::RecheckBeforeProjection; - let child_plans = if let Some(hit) = self.state.children.lock().get(&idx) { + let child_plans = if let Some(hit) = 
evidence.state.children.lock().get(&idx) { hit.clone() } else if recheck { - if let Some(hit) = self.state.recheck_children.lock().get(&idx) { + if let Some(hit) = evidence.state.recheck_children.lock().get(&idx) { hit.clone() } else { - let node = self.node.child(idx, cx.session())?; - let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, cx.session()); + let node = evidence.node.child(idx, &evidence.session)?; + let mut plan_ctx = evidence + .state + .chunked + .child_prepare_ctx(idx, &evidence.session); let plans = node.prepare_evidence(&mut plan_ctx)?; let planned = plans .into_iter() .filter(|plan| plan.recheck_before_projection()) .collect::>(); - let mut children = self.state.recheck_children.lock(); + let mut children = evidence.state.recheck_children.lock(); children.entry(idx).or_insert(planned).clone() } } else { - let node = self.node.child(idx, cx.session())?; - let mut plan_ctx = self.state.chunked.child_prepare_ctx(idx, cx.session()); + let node = evidence.node.child(idx, &evidence.session)?; + let mut plan_ctx = evidence + .state + .chunked + .child_prepare_ctx(idx, &evidence.session); let planned = node.prepare_evidence(&mut plan_ctx)?; - let mut children = self.state.children.lock(); + let mut children = evidence.state.children.lock(); children.entry(idx).or_insert(planned).clone() }; if !child_plans.is_empty() { - let child_req = EvidenceRequest { + let child_req = OwnedEvidenceRequest { id: req.id, version: req.version, - predicate: req.predicate, + predicate: req.predicate.clone(), range: local, mode: req.mode, }; @@ -1458,23 +1104,20 @@ impl PreparedEvidence for ChunkedPreparedEvidence { if recheck && !plan.recheck_before_projection() { continue; } - requests.extend(plan.prefetch_segment_requests(&child_req, cx)?); - if requests.is_unknown() { - return Ok(requests); - } + let step = Arc::clone(&plan) + .create_task(child_req.clone(), phase)? 
+ .into_step()?; + required_reads.extend(step.required_reads); + prefetch_reads.extend(step.prefetch_reads); } } idx += 1; } - Ok(requests) - } - - fn recheck_before_projection(&self) -> bool { - true - } - - fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "chunked") + Ok(EvidenceStep::new( + required_reads, + prefetch_reads, + move |io, results| evidence.evidence(&req.as_request(), io, results), + )) } } diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 70b1bfa7937..4420f125a69 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -3,8 +3,8 @@ //! Scan2 vtable support for dictionary layouts. //! -//! Value reads keep the v1 shape: values read once per query and cached, -//! codes read per range (selection-aware), the pair rebuilt as a lazy +//! Value reads use the dictionary value domain: values read once per query and +//! cached, codes read per range (selection-aware), the pair rebuilt as a lazy //! `DictArray`. Pushed dictionary expressions also try to evaluate the //! expression over the dictionary values once per query, then reuse the //! resulting value-domain array with per-range codes. 
@@ -38,10 +38,10 @@ use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::AllOr; use vortex_mask::Mask; -use vortex_scan::read::ScanRead; -use vortex_session::VortexSession; +use vortex_scan::read::ScanIoPhase; use crate::layout_v2::Layout; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layouts_v2::dict::Dict; use crate::scan::plan::DeferredReadTask; use crate::scan::plan::OwnedRowScope; @@ -61,9 +61,6 @@ use crate::scan::plan::ScanStateRef; use crate::scan::plan::StateCtx; use crate::scan::plan::default_try_push_expr; use crate::scan::plan::request::ScanRequest; -use crate::scan::plan::take_reads_for_requests; -use crate::segments::SegmentPlanCtx; -use crate::segments::SegmentRequests; const DENSE_REMAP_MAX_VALUES: usize = 1 << 20; const DENSE_REMAP_VALUES_PER_CODE: usize = 4; @@ -72,15 +69,15 @@ const UNREFERENCED_VALUE: usize = usize::MAX; pub(crate) fn new_scan_plan( layout: Layout, _req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { let values = layout.child(0)?; let codes = layout.child(1)?; Ok(Arc::new(DictScanPlan { values_len: values.row_count(), // Values and codes live in other row domains. 
- values: values.new_scan_plan(&mut ScanRequest::empty(), session)?, - codes: codes.new_scan_plan(&mut ScanRequest::empty(), session)?, + values: values.new_scan_plan(&mut ScanRequest::empty(), ctx)?, + codes: codes.new_scan_plan(&mut ScanRequest::empty(), ctx)?, })) } @@ -299,8 +296,7 @@ enum DictReadState { struct DictReadTask { read: Arc, codes: Box, - value_reads: Vec>, - cx: SegmentPlanCtx, + phase: ScanIoPhase, state: DictReadState, } @@ -312,14 +308,18 @@ impl ReadTask for DictReadTask { let DictReadTask { read, codes, - value_reads, - cx, + phase, state: _, } = task; let codes_step = codes.into_step()?; + let values_prefetch_step = + DictReadTask::create_full_values_task_for(&read, phase)?.into_step()?; + let mut prefetch_reads = codes_step.prefetch_reads; + prefetch_reads.extend(values_prefetch_step.required_reads); + prefetch_reads.extend(values_prefetch_step.prefetch_reads); Ok(ReadStep::new( codes_step.required_reads, - codes_step.prefetch_reads, + prefetch_reads, move |io, local, results| match codes_step .continuation .run(io, local, results)? 
@@ -328,8 +328,7 @@ impl ReadTask for DictReadTask { Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { read, codes, - value_reads, - cx, + phase, state: DictReadState::Start, }))) } @@ -337,8 +336,7 @@ impl ReadTask for DictReadTask { let mut task = DictReadTask { read, codes: Box::new(DeferredReadTask), - value_reads, - cx, + phase, state: DictReadState::Start, }; let rows = OwnedRowScope::selected(Mask::new_true(codes.len())); @@ -382,8 +380,7 @@ impl ReadTask for DictReadTask { })?; let values_step = values_task.into_step()?; let read = task.read; - let value_reads = task.value_reads; - let cx = task.cx; + let phase = task.phase; Ok(ReadStep::new( values_step.required_reads, values_step.prefetch_reads, @@ -395,8 +392,7 @@ impl ReadTask for DictReadTask { Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { read, codes: Box::new(DeferredReadTask), - value_reads, - cx, + phase, state: DictReadState::SparseValues { compact_codes, values: Some(values), @@ -415,8 +411,7 @@ impl ReadTask for DictReadTask { })?; let values_step = values_task.into_step()?; let read = task.read; - let value_reads = task.value_reads; - let cx = task.cx; + let phase = task.phase; Ok(ReadStep::new( values_step.required_reads, values_step.prefetch_reads, @@ -428,8 +423,7 @@ impl ReadTask for DictReadTask { Ok(ReadTaskOutput::Continue(Box::new(DictReadTask { read, codes: Box::new(DeferredReadTask), - value_reads, - cx, + phase, state: DictReadState::FullValues { codes, values: Some(values), @@ -447,71 +441,46 @@ impl ReadTask for DictReadTask { } impl DictReadTask { fn create_values_task(&mut self, rows: RowScope<'_>) -> VortexResult> { - let range = 0..self.read.node.values_len; - let requests = self - .read - .values_read - .segment_requests(range.clone(), rows, &mut self.cx)?; - let reads = take_reads_for_requests(&mut self.value_reads, requests)?; - let owned_rows = OwnedRowScope::try_new(rows.selection.clone(), rows.demand.clone())?; - Arc::clone(&self.read.values_read).create_task( - 
range, - owned_rows, - reads, - Vec::new(), - &mut self.cx, - ) + Self::create_values_task_for(&self.read, self.phase, rows) } - fn create_full_values_task(&mut self) -> VortexResult> { - let values_selection = Mask::new_true( - usize::try_from(self.read.node.values_len) - .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, - ); - self.create_values_task(RowScope::selected(&values_selection)) + fn create_values_task_for( + read: &Arc, + phase: ScanIoPhase, + rows: RowScope<'_>, + ) -> VortexResult> { + let range = 0..read.node.values_len; + let owned_rows = OwnedRowScope::try_new(rows.selection.clone(), rows.demand.clone())?; + Arc::clone(&read.values_read).create_task(range, owned_rows, phase) } -} -impl PreparedRead for DictPreparedRead { - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - self.codes_read.segment_requests(range, rows, cx) + fn create_full_values_task(&mut self) -> VortexResult> { + Self::create_full_values_task_for(&self.read, self.phase) } - fn prefetch_segment_requests( - &self, - _range: Range, - _rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { + fn create_full_values_task_for( + read: &Arc, + phase: ScanIoPhase, + ) -> VortexResult> { let values_selection = Mask::new_true( - usize::try_from(self.node.values_len) + usize::try_from(read.node.values_len) .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, ); - self.values_read.segment_requests( - 0..self.node.values_len, - RowScope::selected(&values_selection), - cx, - ) + Self::create_values_task_for(read, phase, RowScope::selected(&values_selection)) } +} +impl PreparedRead for DictPreparedRead { fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { Ok(Box::new(DictReadTask { - codes: Arc::clone(&self.codes_read).create_task(range, rows, reads, Vec::new(), 
cx)?, + codes: Arc::clone(&self.codes_read).create_task(range, rows, phase)?, read: self, - value_reads: prefetch_reads.into_iter().map(Some).collect(), - cx: cx.clone(), + phase, state: DictReadState::Start, })) } @@ -805,8 +774,7 @@ enum DictExprValueMode { struct DictExprReadTask { read: Arc, codes: Box, - value_reads: Vec>, - cx: SegmentPlanCtx, + phase: ScanIoPhase, state: DictExprReadState, } @@ -818,14 +786,18 @@ impl ReadTask for DictExprReadTask { let DictExprReadTask { read, codes, - value_reads, - cx, + phase, state: _, } = task; let codes_step = codes.into_step()?; + let values_prefetch_step = + DictExprReadTask::create_full_values_task_for(&read, phase)?.into_step()?; + let mut prefetch_reads = codes_step.prefetch_reads; + prefetch_reads.extend(values_prefetch_step.required_reads); + prefetch_reads.extend(values_prefetch_step.prefetch_reads); Ok(ReadStep::new( codes_step.required_reads, - codes_step.prefetch_reads, + prefetch_reads, move |io, local, results| match codes_step .continuation .run(io, local, results)? 
@@ -834,8 +806,7 @@ impl ReadTask for DictExprReadTask { Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { read, codes, - value_reads, - cx, + phase, state: DictExprReadState::Start, }))) } @@ -843,8 +814,7 @@ impl ReadTask for DictExprReadTask { let mut task = DictExprReadTask { read, codes: Box::new(DeferredReadTask), - value_reads, - cx, + phase, state: DictExprReadState::Start, }; let selection = Mask::new_true(codes.len()); @@ -940,8 +910,7 @@ impl ReadTask for DictExprReadTask { })?; let values_step = values_task.into_step()?; let read = task.read; - let value_reads = task.value_reads; - let cx = task.cx; + let phase = task.phase; Ok(ReadStep::new( values_step.required_reads, values_step.prefetch_reads, @@ -953,8 +922,7 @@ impl ReadTask for DictExprReadTask { Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { read, codes: Box::new(DeferredReadTask), - value_reads, - cx, + phase, state: DictExprReadState::Values { codes, values: Some(values), @@ -962,15 +930,9 @@ impl ReadTask for DictExprReadTask { }, }))) } - ReadTaskOutput::Ready(values_array) => finish_dict_expr_values( - read, - value_reads, - cx, - codes, - mode, - values_array, - local, - ), + ReadTaskOutput::Ready(values_array) => { + finish_dict_expr_values(read, phase, codes, mode, values_array, local) + } }, )) } @@ -980,8 +942,7 @@ impl ReadTask for DictExprReadTask { fn finish_dict_expr_values( read: Arc, - mut value_reads: Vec>, - mut cx: SegmentPlanCtx, + phase: ScanIoPhase, codes: ArrayRef, mode: DictExprValueMode, values_array: ArrayRef, @@ -1053,16 +1014,11 @@ fn finish_dict_expr_values( %error, "sparse dict expression read unavailable" ); - let full_values = DictExprReadTask::create_full_values_task_for( - &read, - &mut value_reads, - &mut cx, - )?; + let full_values = DictExprReadTask::create_full_values_task_for(&read, phase)?; Ok(ReadTaskOutput::Continue(Box::new(DictExprReadTask { read, codes: Box::new(DeferredReadTask), - value_reads, - cx, + phase, state: 
DictExprReadState::Values { codes, values: Some(full_values), @@ -1079,79 +1035,46 @@ fn finish_dict_expr_values( impl DictExprReadTask { fn create_values_task(&mut self, rows: RowScope<'_>) -> VortexResult> { - Self::create_values_task_for(&self.read, &mut self.value_reads, &mut self.cx, rows) + Self::create_values_task_for(&self.read, self.phase, rows) } fn create_values_task_for( read: &Arc, - value_reads: &mut [Option], - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, rows: RowScope<'_>, ) -> VortexResult> { let range = 0..read.node.dict.values_len; - let requests = read.values_read.segment_requests(range.clone(), rows, cx)?; - let reads = take_reads_for_requests(value_reads, requests)?; let owned_rows = OwnedRowScope::try_new(rows.selection.clone(), rows.demand.clone())?; - Arc::clone(&read.values_read).create_task(range, owned_rows, reads, Vec::new(), cx) + Arc::clone(&read.values_read).create_task(range, owned_rows, phase) } fn create_full_values_task(&mut self) -> VortexResult> { - Self::create_full_values_task_for(&self.read, &mut self.value_reads, &mut self.cx) + Self::create_full_values_task_for(&self.read, self.phase) } fn create_full_values_task_for( read: &Arc, - value_reads: &mut [Option], - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { let values_selection = Mask::new_true( usize::try_from(read.node.dict.values_len) .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, ); - Self::create_values_task_for(read, value_reads, cx, RowScope::selected(&values_selection)) + Self::create_values_task_for(read, phase, RowScope::selected(&values_selection)) } } impl PreparedRead for DictExprPreparedRead { - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - self.codes_read.segment_requests(range, rows, cx) - } - - fn prefetch_segment_requests( - &self, - _range: Range, - _rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let 
values_selection = Mask::new_true( - usize::try_from(self.node.dict.values_len) - .map_err(|_| vortex_err!("dictionary values length exceeds usize"))?, - ); - self.values_read.segment_requests( - 0..self.node.dict.values_len, - RowScope::selected(&values_selection), - cx, - ) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { Ok(Box::new(DictExprReadTask { - codes: Arc::clone(&self.codes_read).create_task(range, rows, reads, Vec::new(), cx)?, + codes: Arc::clone(&self.codes_read).create_task(range, rows, phase)?, read: self, - value_reads: prefetch_reads.into_iter().map(Some).collect(), - cx: cx.clone(), + phase, state: DictExprReadState::Start, })) } diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 659270989ed..9e0f8c2405e 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -23,11 +23,15 @@ use vortex_array::serde::SerializedArray; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; +use vortex_scan::read::ReadRequestKey; +use vortex_scan::read::ReadResults; +use vortex_scan::read::ScanIoPhase; use vortex_scan::read::ScanRead; use vortex_session::VortexSession; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layouts_v2::flat::Flat; use crate::scan::plan::OwnedRowScope; use crate::scan::plan::PrepareCtx; @@ -35,11 +39,9 @@ use crate::scan::plan::PreparedRead; use crate::scan::plan::PreparedReadRef; use crate::scan::plan::PreparedStateKey; use crate::scan::plan::PushCtx; -use crate::scan::plan::ReadContext; use crate::scan::plan::ReadStep; use crate::scan::plan::ReadTask; use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::RowScope; use crate::scan::plan::ScanPlan; use crate::scan::plan::ScanPlanRef; use 
crate::scan::plan::ScanState; @@ -48,16 +50,21 @@ use crate::scan::plan::StateCtx; use crate::scan::plan::default_try_push_expr; use crate::scan::plan::downcast_state; use crate::scan::plan::request::ScanRequest; -use crate::segments::SegmentPlanCtx; -use crate::segments::SegmentRequests; +use crate::segments::SegmentFutureCache; +use crate::segments::SegmentRequest; +use crate::segments::SegmentRequestKey; +use crate::segments::SegmentSource; pub(crate) fn new_scan_plan( layout: Layout, _req: &mut ScanRequest, - _session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { Ok(Arc::new(FlatScanPlan { layout: layout.to_layout(), + session: ctx.session().clone(), + segment_source: Arc::clone(ctx.segment_source()), + segment_future_cache: Arc::clone(ctx.segment_future_cache()), })) } @@ -65,6 +72,9 @@ pub(crate) fn new_scan_plan( /// into a (lazy) array, and slices per request. pub struct FlatScanPlan { layout: LayoutRef, + session: VortexSession, + segment_source: Arc, + segment_future_cache: Arc, } /// Per-query cache of the parsed (still lazy) array. 
@@ -82,12 +92,11 @@ struct FlatReadTask { read: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, + phase: ScanIoPhase, } impl FlatScanPlan { - fn array(&self, io: &ReadContext, state: &FlatScanState) -> VortexResult { + fn array(&self, results: &ReadResults, state: &FlatScanState) -> VortexResult { if let Some(hit) = state.array.lock().clone() { return Ok(hit); } @@ -97,7 +106,7 @@ impl FlatScanPlan { return Ok(hit); } - let array = decode_flat(&self.layout, io)?; + let array = decode_flat(&self.layout, results, &self.session)?; *guard = Some(array.clone()); Ok(array) } @@ -142,37 +151,17 @@ impl ScanPlan for FlatScanPlan { } impl PreparedRead for FlatPreparedRead { - fn segment_requests( - &self, - _range: Range, - _rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - let Some(flat) = self.node.layout.as_opt::() else { - vortex_bail!( - "expected flat layout, got {}", - self.node.layout.encoding_id() - ); - }; - Ok(SegmentRequests::exact(vec![ - cx.request_for_segment(flat.data().segment_id())?, - ])) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - _cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { Ok(Box::new(FlatReadTask { read: self, range, rows, - reads, - prefetch_reads, + phase, })) } @@ -185,59 +174,89 @@ impl PreparedRead for FlatPreparedRead { } } +impl FlatPreparedRead { + fn segment_read(&self, phase: ScanIoPhase) -> VortexResult { + let Some(flat) = self.node.layout.as_opt::() else { + vortex_bail!( + "expected flat layout, got {}", + self.node.layout.encoding_id() + ); + }; + self.node + .segment_future_cache + .register( + self.node.segment_source.as_ref(), + vec![SegmentRequest::new( + flat.data().segment_id(), + self.node + .segment_source + .segment_info(flat.data().segment_id())?, + phase, + )], + ) + .into_iter() + .next() + .ok_or_else(|| vortex_err!("flat segment read registration returned no reads")) + } +} + impl 
ReadTask for FlatReadTask { fn into_step(self: Box) -> VortexResult { let Self { read, range, rows, - reads, - prefetch_reads, + phase, } = *self; - Ok(ReadStep::new(reads, prefetch_reads, move |io, _, _| { - let array = read.node.array(io, &read.state)?; - let rows = rows.as_scope(); - let dense = slice_to_range(array, &range)?; - if rows.selection.len() != dense.len() { - vortex_bail!( - "selection length {} does not match read range length {}", - rows.selection.len(), - dense.len() - ); - } - if rows.demand.len() != dense.len() { - vortex_bail!( - "demand length {} does not match read range length {}", - rows.demand.len(), - dense.len() - ); - } - if rows.selection.all_true() { - return Ok(ReadTaskOutput::Ready(dense)); - } - Ok(ReadTaskOutput::Ready(dense.filter(rows.selection.clone())?)) - })) + let segment_read = read.segment_read(phase)?; + Ok(ReadStep::new( + vec![segment_read], + Vec::new(), + move |_, _, results| { + let array = read.node.array(&results, &read.state)?; + let rows = rows.as_scope(); + let dense = slice_to_range(array, &range)?; + if rows.selection.len() != dense.len() { + vortex_bail!( + "selection length {} does not match read range length {}", + rows.selection.len(), + dense.len() + ); + } + if rows.demand.len() != dense.len() { + vortex_bail!( + "demand length {} does not match read range length {}", + rows.demand.len(), + dense.len() + ); + } + if rows.selection.all_true() { + return Ok(ReadTaskOutput::Ready(dense)); + } + Ok(ReadTaskOutput::Ready(dense.filter(rows.selection.clone())?)) + }, + )) } } -pub(crate) fn decode_flat(layout: &LayoutRef, io: &ReadContext) -> VortexResult { +pub(crate) fn decode_flat( + layout: &LayoutRef, + results: &ReadResults, + session: &VortexSession, +) -> VortexResult { let Some(flat) = layout.as_opt::() else { vortex_bail!("expected flat layout, got {}", layout.encoding_id()); }; let row_count = usize::try_from(layout.row_count()) .map_err(|_| vortex_err!("layout row count exceeds usize"))?; - let 
segment = io.segment(flat.data().segment_id())?; + let key = ReadRequestKey::from(SegmentRequestKey::new(flat.data().segment_id())); + let segment = results.get(key)?; let parts = if let Some(tree) = flat.data().array_tree() { SerializedArray::from_flatbuffer_and_segment(tree.clone(), segment)? } else { SerializedArray::try_from(segment)? }; - parts.decode( - layout.dtype(), - row_count, - flat.data().array_ctx(), - io.session(), - ) + parts.decode(layout.dtype(), row_count, flat.data().array_ctx(), session) } pub(crate) fn slice_to_range(array: ArrayRef, range: &Range) -> VortexResult { diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index b785bcdda65..b36dee457df 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -25,10 +25,10 @@ use vortex_array::scalar_fn::fns::root::Root; use vortex_array::scalar_fn::fns::select::Select; use vortex_error::VortexResult; use vortex_error::vortex_bail; -use vortex_session::VortexSession; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layouts_v2::struct_::Struct; use crate::scan::plan::ApplyScanPlan; use crate::scan::plan::MaskScanPlan; @@ -48,7 +48,7 @@ use crate::scan::v2::struct_fields; pub(crate) fn new_scan_plan( layout: Layout, _req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { let validity = layout .dtype() @@ -56,7 +56,7 @@ pub(crate) fn new_scan_plan( .then(|| { layout .child(0)? 
- .new_scan_plan(&mut ScanRequest::empty(), session) + .new_scan_plan(&mut ScanRequest::empty(), ctx) }) .transpose()?; let fields = struct_fields(layout.dtype())?; @@ -64,7 +64,7 @@ pub(crate) fn new_scan_plan( let field_child_offset = usize::from(layout.dtype().is_nullable()); Ok(Arc::new(StructScanPlan { layout: layout.to_layout(), - session: session.clone(), + ctx: ctx.clone(), fields, children, field_child_offset, @@ -75,7 +75,7 @@ pub(crate) fn new_scan_plan( /// Plans struct field expressions through child scan plans. pub struct StructScanPlan { layout: LayoutRef, - session: VortexSession, + ctx: LayoutScanPlanCtx, fields: StructFields, children: Mutex>>, field_child_offset: usize, @@ -144,7 +144,7 @@ impl StructScanPlan { /// The single-field fast paths route straight to a child node, bypassing /// the parent struct's validity. When the struct is nullable we wrap the /// child in a [`MaskScanPlan`] so the parent's null mask is applied to the - /// child result, mirroring the v1 struct reader's `array.mask(validity)`. + /// child result. fn apply_validity(&self, pushed: Option) -> Option { match (pushed, &self.validity) { (Some(node), Some(validity)) => { @@ -178,7 +178,7 @@ impl StructScanPlan { let child = self .layout .child(child_idx)? - .new_scan_plan(&mut ScanRequest::empty(), &self.session)?; + .new_scan_plan(&mut ScanRequest::empty(), &self.ctx)?; *slot = Some(Arc::clone(&child)); Ok(child) } diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index ecd3cbd1f26..9ca432ae41f 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -9,11 +9,11 @@ //! [`Expression::satisfy`]) are evaluated over the zone map once per //! query, and evidence walks the per-zone masks. //! -//! Coverage is partial (plan 017 SP3): every zone proves its own span, -//! so a morsel misaligned with zone boundaries still gets evidence for -//! 
its interior zones — the v1 scan's whole-morsel verdict is just the -//! case where every overlapping zone agrees. Edge rows the statistics -//! cannot prove stay unknown and fall through to residual evaluation. +//! Coverage is partial: every zone proves its own span, so a morsel +//! misaligned with zone boundaries still gets evidence for its interior zones. +//! A whole-morsel verdict is the case where every overlapping zone agrees. +//! Edge rows the statistics cannot prove stay unknown and fall through to +//! residual evaluation. use std::fmt; use std::ops::Range; @@ -40,15 +40,17 @@ use vortex_array::expr::root; use vortex_array::expr::stats::Stat; use vortex_array::scalar::Scalar; use vortex_error::VortexResult; -use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::Mask; use vortex_scan::read::ReadResults; use vortex_scan::read::ReadStore; +use vortex_scan::read::ReadStoreRef; +use vortex_scan::read::ScanIoPhase; use vortex_scan::read::ScanRead; use vortex_session::VortexSession; use crate::layout_v2::Layout; +use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; use crate::layouts::zoned::MAX_IS_TRUNCATED; use crate::layouts::zoned::MIN_IS_TRUNCATED; @@ -58,6 +60,8 @@ use crate::layouts_v2::zoned::ZonedData; use crate::scan::plan::AggregateAnswer; use crate::scan::plan::EvidenceCost; use crate::scan::plan::EvidenceScope; +use crate::scan::plan::EvidenceStep; +use crate::scan::plan::EvidenceTask; use crate::scan::plan::OwnedRowScope; use crate::scan::plan::PrepareCtx; use crate::scan::plan::PreparedAggregate; @@ -71,7 +75,6 @@ use crate::scan::plan::PushCtx; use crate::scan::plan::ReadContext; use crate::scan::plan::ReadTask; use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::RowScope; use crate::scan::plan::ScanPlan; use crate::scan::plan::ScanPlanRef; use crate::scan::plan::ScanState; @@ -82,24 +85,21 @@ use crate::scan::plan::downcast_state; use crate::scan::plan::evidence::EvidenceFragment; use 
crate::scan::plan::evidence::PredicateEvidenceKind; use crate::scan::plan::request::EvidenceRequest; +use crate::scan::plan::request::OwnedEvidenceRequest; use crate::scan::plan::request::ScanRequest; -use crate::segments::SegmentFutureCache; -use crate::segments::SegmentPlanCtx; -use crate::segments::SegmentRequests; -use crate::segments::register_segment_reads_cached; pub(crate) fn new_scan_plan>( layout: Layout, req: &mut ScanRequest, - session: &VortexSession, + ctx: &LayoutScanPlanCtx, ) -> VortexResult { let zones = layout.child(1)?; Ok(Arc::new(ZonedScanPlan { // The data child preserves this node's rows: pass the // expansion request through. - data: layout.child(0)?.new_scan_plan(req, session)?, + data: layout.child(0)?.new_scan_plan(req, ctx)?, nzones: zones.row_count(), - zones: zones.new_scan_plan(&mut ScanRequest::empty(), session)?, + zones: zones.new_scan_plan(&mut ScanRequest::empty(), ctx)?, column_dtype: layout.dtype().clone(), zone_len: layout.data().zone_len() as u64, row_count: layout.row_count(), @@ -157,6 +157,12 @@ struct ZonedPreparedEvidence { satisfier: Option, } +struct ZonedEvidenceTask { + evidence: Arc, + req: OwnedEvidenceRequest, + phase: ScanIoPhase, +} + /// Planned ungrouped aggregate over a zoned node's root value. 
struct ZonedPreparedAggregate { node: Arc, @@ -244,31 +250,29 @@ fn read_zones_child( zones_read: &PreparedReadRef, nzones: u64, io: &ReadContext, +) -> VortexResult { + read_zones_child_with_store(zones_read, nzones, io, Arc::new(ReadStore::new())) +} + +fn read_zones_child_with_results( + zones_read: &PreparedReadRef, + nzones: u64, + io: &ReadContext, + results: ReadResults, +) -> VortexResult { + read_zones_child_with_store(zones_read, nzones, io, results.store()) +} + +fn read_zones_child_with_store( + zones_read: &PreparedReadRef, + nzones: u64, + io: &ReadContext, + read_store: ReadStoreRef, ) -> VortexResult { let len = usize::try_from(nzones).map_err(|_| vortex_err!("zone count exceeds usize"))?; let rows = OwnedRowScope::selected(Mask::new_true(len)); - let mut segment_ctx = SegmentPlanCtx::new(Arc::clone(io.segments()), io.session().clone()); - let requests = zones_read.segment_requests(0..nzones, rows.as_scope(), &mut segment_ctx)?; - if requests.is_unknown() { - vortex_bail!("zoned stats child produced unknown segment requests") - } - let cache = SegmentFutureCache::new(); - let reads = register_segment_reads_cached(&cache, io.segments().as_ref(), requests); - let prefetch_requests = - zones_read.prefetch_segment_requests(0..nzones, rows.as_scope(), &mut segment_ctx)?; - let prefetch_reads = if prefetch_requests.is_unknown() { - Vec::new() - } else { - register_segment_reads_cached(&cache, io.segments().as_ref(), prefetch_requests) - }; - let mut task = Arc::clone(zones_read).create_task( - 0..nzones, - rows, - reads, - prefetch_reads, - &mut segment_ctx, - )?; - let read_store = Arc::new(ReadStore::new()); + let phase = ScanIoPhase::EvidenceSetup; + let mut task = Arc::clone(zones_read).create_task(0..nzones, rows, phase)?; loop { let step = task.into_step()?; resolve_zoned_reads(Arc::clone(&read_store), step.required_reads)?; @@ -284,7 +288,7 @@ fn read_zones_child( } } -fn resolve_zoned_reads(read_store: Arc, reads: Vec) -> VortexResult<()> { +fn 
resolve_zoned_reads(read_store: ReadStoreRef, reads: Vec) -> VortexResult<()> { for read in reads { if read_store.get(read.request.key).is_none() { let buffer = futures::executor::block_on(read.future)?; @@ -584,22 +588,32 @@ impl ZonedScanPlan { } impl ZonedPreparedEvidence { - fn table(&self, io: &ReadContext, state: &ZonedScanState) -> VortexResult> { + fn table( + &self, + io: &ReadContext, + state: &ZonedScanState, + results: ReadResults, + ) -> VortexResult> { if let Some(hit) = state.table.lock().clone() { return Ok(hit); } - let zones = read_zones_child(&self.zones_read, self.nzones, io)?; + let zones = read_zones_child_with_results(&self.zones_read, self.nzones, io, results)?; let mut ctx = io.session().create_execution_ctx(); let table = Arc::new(zones.execute::(&mut ctx)?); *state.table.lock() = Some(Arc::clone(&table)); Ok(table) } - fn zone_map(&self, io: &ReadContext, state: &ZonedScanState) -> VortexResult> { + fn zone_map( + &self, + io: &ReadContext, + state: &ZonedScanState, + results: ReadResults, + ) -> VortexResult> { if let Some(hit) = state.zone_map.lock().clone() { return Ok(hit); } - let table = self.table(io, state)?; + let table = self.table(io, state, results)?; let zone_map = match &self.zone_map_schema { ZoneMapSchema::AggregateFns(_) => ZoneMap::try_new( self.column_dtype.clone(), @@ -631,11 +645,12 @@ impl ZonedPreparedEvidence { &self, io: &ReadContext, state: &ZonedScanState, + results: ReadResults, ) -> VortexResult> { if let Some(hit) = state.masks.lock().get(&self.predicate) { return Ok(Arc::clone(hit)); } - let zone_map = self.zone_map(io, state)?; + let zone_map = self.zone_map(io, state, results)?; let session = io.session(); let all_false = self .falsifier @@ -679,10 +694,11 @@ impl PreparedEvidence for ZonedPreparedEvidence { &'a self, req: &'a EvidenceRequest<'a>, io: &'a ReadContext, + results: ReadResults, ) -> VortexResult> { let mut fragments = Vec::new(); if self.zone_len > 0 && (self.falsifier.is_some() || 
self.satisfier.is_some()) { - let masks = self.predicate_masks(io, &self.state)?; + let masks = self.predicate_masks(io, &self.state, results)?; let zones = self.zone_range(&req.range); let mut run: Option<(Range, bool)> = None; for zone in zones { @@ -714,22 +730,6 @@ impl PreparedEvidence for ZonedPreparedEvidence { Ok(fragments) } - fn segment_requests( - &self, - _req: &EvidenceRequest<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - if self.zone_len == 0 || (self.falsifier.is_none() && self.satisfier.is_none()) { - return Ok(SegmentRequests::none()); - } - let selection = Mask::new_true( - usize::try_from(self.nzones) - .map_err(|_| vortex_err!("zoned stats length exceeds usize"))?, - ); - self.zones_read - .segment_requests(0..self.nzones, RowScope::selected(&selection), cx) - } - fn recheck_before_projection(&self) -> bool { true } @@ -745,6 +745,49 @@ impl PreparedEvidence for ZonedPreparedEvidence { fn fmt_prepared(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "zoned") } + + fn create_task( + self: Arc, + req: OwnedEvidenceRequest, + phase: ScanIoPhase, + ) -> VortexResult> { + Ok(Box::new(ZonedEvidenceTask { + evidence: self, + req, + phase, + })) + } +} + +impl EvidenceTask for ZonedEvidenceTask { + fn into_step(self: Box) -> VortexResult { + let Self { + evidence, + req, + phase, + } = *self; + if evidence.zone_len == 0 || (evidence.falsifier.is_none() && evidence.satisfier.is_none()) + { + return Ok(EvidenceStep::new( + Vec::new(), + Vec::new(), + move |io, results| evidence.evidence(&req.as_request(), io, results), + )); + } + let selection = Mask::new_true( + usize::try_from(evidence.nzones) + .map_err(|_| vortex_err!("zoned stats length exceeds usize"))?, + ); + let rows = OwnedRowScope::selected(selection); + let step = Arc::clone(&evidence.zones_read) + .create_task(0..evidence.nzones, rows, phase)? 
+ .into_step()?; + Ok(EvidenceStep::new( + step.required_reads, + step.prefetch_reads, + move |io, results| evidence.evidence(&req.as_request(), io, results), + )) + } } impl ScanPlan for ZonedScanPlan { @@ -876,33 +919,13 @@ impl ScanPlan for ZonedScanPlan { } impl PreparedRead for ZonedPreparedRead { - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - self.data.segment_requests(range, rows, cx) - } - - fn prefetch_segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - self.data.prefetch_segment_requests(range, rows, cx) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { - Arc::clone(&self.data).create_task(range, rows, reads, prefetch_reads, cx) + Arc::clone(&self.data).create_task(range, rows, phase) } fn release(&self, frontier: u64) -> VortexResult<()> { @@ -1004,33 +1027,13 @@ impl ScanPlan for ZonedExprScanPlan { } impl PreparedRead for ZonedExprPreparedRead { - fn segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - self.data.segment_requests(range, rows, cx) - } - - fn prefetch_segment_requests( - &self, - range: Range, - rows: RowScope<'_>, - cx: &mut SegmentPlanCtx, - ) -> VortexResult { - self.data.prefetch_segment_requests(range, rows, cx) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - reads: Vec, - prefetch_reads: Vec, - cx: &mut SegmentPlanCtx, + phase: ScanIoPhase, ) -> VortexResult> { - Arc::clone(&self.data).create_task(range, rows, reads, prefetch_reads, cx) + Arc::clone(&self.data).create_task(range, rows, phase) } fn release(&self, frontier: u64) -> VortexResult<()> { diff --git a/vortex-layout/src/scan/v2/row_idx.rs b/vortex-layout/src/scan/v2/row_idx.rs index 7ab8acda574..3ebeff51fb6 100644 --- 
a/vortex-layout/src/scan/v2/row_idx.rs +++ b/vortex-layout/src/scan/v2/row_idx.rs @@ -21,7 +21,6 @@ use vortex_array::scalar::PValue; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; -use vortex_scan::read::ScanRead; use vortex_sequence::Sequence; use vortex_sequence::SequenceArray; @@ -36,7 +35,6 @@ use crate::scan::plan::PushCtx; use crate::scan::plan::ReadStep; use crate::scan::plan::ReadTask; use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::RowScope; use crate::scan::plan::ScanPlan; use crate::scan::plan::ScanPlanRef; use crate::scan::plan::ScanState; @@ -44,8 +42,6 @@ use crate::scan::plan::ScanStateRef; use crate::scan::plan::StateCtx; use crate::scan::plan::StructValueScanPlan; use crate::scan::plan::default_try_push_expr; -use crate::segments::SegmentPlanCtx; -use crate::segments::SegmentRequests; pub fn with_row_idx(root: ScanPlanRef, dtype: DType, row_offset: u64) -> ScanPlanRef { Arc::new(RowIdxScanPlan { @@ -251,22 +247,11 @@ impl ScanPlan for RowIdxExprScanPlan { } impl PreparedRead for RowIdxPreparedRead { - fn segment_requests( - &self, - _range: Range, - _rows: RowScope<'_>, - _cx: &mut SegmentPlanCtx, - ) -> VortexResult { - Ok(SegmentRequests::none()) - } - fn create_task( self: Arc, range: Range, rows: OwnedRowScope, - _reads: Vec, - _prefetch_reads: Vec, - _cx: &mut SegmentPlanCtx, + _phase: vortex_scan::read::ScanIoPhase, ) -> VortexResult> { Ok(Box::new(RowIdxReadTask { read: self, diff --git a/vortex-layout/src/segments/scheduled.rs b/vortex-layout/src/segments/scheduled.rs index 7fdb975b07c..ce9cbec1a3d 100644 --- a/vortex-layout/src/segments/scheduled.rs +++ b/vortex-layout/src/segments/scheduled.rs @@ -1,7 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::fmt; use std::sync::Arc; use futures::FutureExt; @@ -13,15 +12,12 @@ use vortex_array::buffer::BufferHandle; use vortex_error::SharedVortexResult; use 
vortex_error::VortexError; use vortex_error::VortexExpect; -use vortex_error::VortexResult; use vortex_scan::read::CancelGroup; use vortex_scan::read::ReadRequestKey; -use vortex_scan::read::ReadResults; use vortex_scan::read::ScanIoPhase; use vortex_scan::read::ScanPriority; use vortex_scan::read::ScanRead; use vortex_scan::read::ScanReadRequest; -use vortex_session::VortexSession; use vortex_utils::aliases::dash_map::DashMap; use vortex_utils::aliases::dash_map::Entry; use vortex_utils::aliases::hash_set::HashSet; @@ -122,128 +118,6 @@ impl SegmentRequest { } } -/// Planning result for segment request introspection. -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct SegmentRequests { - exact: Option>, -} - -impl SegmentRequests { - /// Return an unknown segment request set. - pub fn unknown() -> Self { - Self { exact: None } - } - - /// Return an exact segment request set. - pub fn exact(requests: Vec) -> Self { - Self { - exact: Some(requests), - } - } - - /// Return an exact empty segment request set. - pub fn none() -> Self { - Self::exact(Vec::new()) - } - - /// Return whether this plan could not describe its segment requests. - pub fn is_unknown(&self) -> bool { - self.exact.is_none() - } - - /// Borrow the exact request set, if known. - pub fn as_exact(&self) -> Option<&[SegmentRequest]> { - self.exact.as_deref() - } - - /// Consume this value and return the exact request set, if known. - pub fn into_exact(self) -> Option> { - self.exact - } - - /// Append another request set, preserving `unknown` if either side is unknown. - pub fn extend(&mut self, other: SegmentRequests) { - match (&mut self.exact, other.exact) { - (Some(requests), Some(mut other)) => requests.append(&mut other), - _ => self.exact = None, - } - } -} - -/// Context used by plans when producing scheduler-visible segment requests. 
-#[derive(Clone)] -pub struct SegmentPlanCtx { - source: Arc, - session: VortexSession, - phase: ScanIoPhase, - priority: ScanPriority, - cancel_group: CancelGroup, -} - -impl fmt::Debug for SegmentPlanCtx { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SegmentPlanCtx") - .field("phase", &self.phase) - .field("priority", &self.priority) - .field("cancel_group", &self.cancel_group) - .finish_non_exhaustive() - } -} - -impl SegmentPlanCtx { - /// Create a request-planning context for a registered segment source. - pub fn new(source: Arc, session: VortexSession) -> Self { - Self { - source, - session, - phase: ScanIoPhase::default(), - priority: ScanPriority::NORMAL, - cancel_group: CancelGroup::NONE, - } - } - - /// Return the source used to resolve segment metadata. - pub fn source(&self) -> &Arc { - &self.source - } - - /// Return the scan session used by lazy plans that must instantiate child plans. - pub fn session(&self) -> &VortexSession { - &self.session - } - - /// Return a copy of this context using the provided scan phase. - pub fn with_phase(mut self, phase: ScanIoPhase) -> Self { - self.phase = phase; - self - } - - /// Return a copy of this context using the provided priority. - pub fn with_priority(mut self, priority: ScanPriority) -> Self { - self.priority = priority; - self - } - - /// Return a copy of this context using the provided cancellation group. - pub fn with_cancel_group(mut self, cancel_group: CancelGroup) -> Self { - self.cancel_group = cancel_group; - self - } - - /// Create a segment request with this context's source and scheduling metadata. - pub fn request(&self, segment: SegmentId, info: SegmentInfo) -> SegmentRequest { - SegmentRequest::new(segment, info, self.phase) - .with_priority(self.priority) - .with_cancel_group(self.cancel_group) - } - - /// Create a segment request after resolving metadata from the registered source. 
- pub fn request_for_segment(&self, segment: SegmentId) -> VortexResult { - let info = self.source.segment_info(segment)?; - Ok(self.request(segment, info)) - } -} - type SharedSegmentFuture = BoxFuture<'static, SharedVortexResult>; /// Scan-local cache of in-flight segment futures keyed by logical segment request. @@ -293,12 +167,12 @@ impl SegmentFutureCache { } } - /// Register exact segment reads with a source, returning handles that keep the futures alive. - pub fn register(&self, source: &dyn SegmentSource, requests: SegmentRequests) -> Vec { - let Some(requests) = requests.into_exact() else { - return Vec::new(); - }; - + /// Register segment reads with a source, returning handles that keep the futures alive. + pub fn register( + &self, + source: &dyn SegmentSource, + requests: impl IntoIterator, + ) -> Vec { let mut seen: HashSet = HashSet::default(); let mut handles = Vec::new(); let mut misses = Vec::new(); @@ -354,15 +228,6 @@ impl SegmentFutureCache { } } -/// Register exact segment reads through a shared in-flight future cache. -pub fn register_segment_reads_cached( - cache: &SegmentFutureCache, - source: &dyn SegmentSource, - requests: SegmentRequests, -) -> Vec { - cache.register(source, requests) -} - fn shared_segment_handle(request: SegmentRequest, future: Shared) -> ScanRead { shared_read_handle(ScanReadRequest::from(&request), future) } @@ -371,79 +236,6 @@ fn shared_read_handle(request: ScanReadRequest, future: Shared, - cache: Arc, - phase: ScanIoPhase, -} - -/// Segment source backed by scheduler-resolved read results. -pub struct ReadResultsSegmentSource { - source: Arc, - results: ReadResults, -} - -impl ReadResultsSegmentSource { - /// Create a segment source over already-resolved scan read results. 
- pub fn new(source: Arc, results: ReadResults) -> Self { - Self { source, results } - } -} - -impl SegmentSource for ReadResultsSegmentSource { - fn segment_info(&self, id: SegmentId) -> VortexResult { - self.source.segment_info(id) - } - - fn request(&self, id: SegmentId) -> SegmentFuture { - let key = ReadRequestKey::from(SegmentRequestKey::new(id)); - let results = self.results.clone(); - async move { results.get(key) }.boxed() - } - - fn resolved(&self, id: SegmentId) -> VortexResult { - self.results - .get(ReadRequestKey::from(SegmentRequestKey::new(id))) - } -} - -impl CachedSegmentSource { - /// Create a cached source using projection reads as the default late-request phase. - pub fn new(source: Arc, cache: Arc) -> Self { - Self { - source, - cache, - phase: ScanIoPhase::ProjectionRead, - } - } - - /// Return a copy of this source with a different phase for late segment requests. - pub fn with_phase(mut self, phase: ScanIoPhase) -> Self { - self.phase = phase; - self - } -} - -impl SegmentSource for CachedSegmentSource { - fn segment_info(&self, id: SegmentId) -> VortexResult { - self.source.segment_info(id) - } - - fn request(&self, id: SegmentId) -> SegmentFuture { - let info = match self.source.segment_info(id) { - Ok(info) => info, - Err(error) => return async move { Err(error) }.boxed(), - }; - self.cache - .request_segment( - self.source.as_ref(), - SegmentRequest::new(id, info, self.phase), - ) - .future - } -} - #[cfg(test)] mod tests { use std::sync::atomic::AtomicUsize; @@ -454,6 +246,7 @@ mod tests { use parking_lot::Mutex; use vortex_array::buffer::BufferHandle; use vortex_buffer::ByteBuffer; + use vortex_error::VortexResult; use super::*; struct CountingSegmentSource { @@ -517,15 +310,15 @@ mod tests { #[test] fn register_segment_reads_dedupes_exact_segments() -> VortexResult<()> { let source = Arc::new(CountingSegmentSource::new(SegmentInfo::new(8))); - let segment_source: Arc = Arc::::clone(&source); - let ctx = 
SegmentPlanCtx::new(segment_source, VortexSession::empty()); - let request = ctx.request_for_segment(SegmentId::from(0))?; - - let reads = SegmentFutureCache::new().register( - source.as_ref(), - SegmentRequests::exact(vec![request, request]), + let segment = SegmentId::from(0); + let request = SegmentRequest::new( + segment, + source.segment_info(segment)?, + ScanIoPhase::ProjectionRead, ); + let reads = SegmentFutureCache::new().register(source.as_ref(), vec![request, request]); + assert_eq!(reads.len(), 1); assert_eq!(source.submit_count(), 1); @@ -535,15 +328,18 @@ mod tests { #[test] fn register_segment_reads_registers_each_miss() -> VortexResult<()> { let source = Arc::new(CountingMissSegmentSource::new(SegmentInfo::new(8))); - let segment_source: Arc = - Arc::::clone(&source); - let ctx = SegmentPlanCtx::new(segment_source, VortexSession::empty()); let requests = (0..5) - .map(|segment| ctx.request_for_segment(SegmentId::from(segment))) + .map(|segment| { + let segment = SegmentId::from(segment); + Ok(SegmentRequest::new( + segment, + source.segment_info(segment)?, + ScanIoPhase::ProjectionRead, + )) + }) .collect::>>()?; - let reads = - SegmentFutureCache::new().register(source.as_ref(), SegmentRequests::exact(requests)); + let reads = SegmentFutureCache::new().register(source.as_ref(), requests); assert_eq!(reads.len(), 5); assert_eq!(source.batches(), vec![1, 1, 1, 1, 1]); @@ -554,18 +350,20 @@ mod tests { #[test] fn segment_future_cache_reuses_prefetched_segment() -> VortexResult<()> { let source = Arc::new(CountingSegmentSource::new(SegmentInfo::new(8))); - let segment_source: Arc = Arc::::clone(&source); - let ctx = SegmentPlanCtx::new(Arc::clone(&segment_source), VortexSession::empty()); - let request = ctx.request_for_segment(SegmentId::from(0))?; + let segment = SegmentId::from(0); + let request = SegmentRequest::new( + segment, + source.segment_info(segment)?, + ScanIoPhase::ProjectionRead, + ); let cache = Arc::new(SegmentFutureCache::new()); - 
let reads = cache.register(source.as_ref(), SegmentRequests::exact(vec![request])); - let reader = CachedSegmentSource::new(segment_source, Arc::clone(&cache)); - let read = reader.request(SegmentId::from(0)); + let reads = cache.register(source.as_ref(), vec![request]); + let read = cache.request_segment(source.as_ref(), request); assert_eq!(reads.len(), 1); assert_eq!(source.submit_count(), 1); - assert_eq!(block_on(read)?.as_host().len(), 1); + assert_eq!(block_on(read.future)?.as_host().len(), 1); assert_eq!(source.submit_count(), 1); Ok(()) diff --git a/vortex-scan/src/read.rs b/vortex-scan/src/read.rs index 4fa245d4fec..46a40fcfbe5 100644 --- a/vortex-scan/src/read.rs +++ b/vortex-scan/src/read.rs @@ -192,6 +192,11 @@ impl ReadResults { pub fn contains(&self, key: ReadRequestKey) -> bool { self.store.get(key).is_some() } + + /// Return the backing read store. + pub fn store(&self) -> ReadStoreRef { + Arc::clone(&self.store) + } } impl ScanRead { From aa6096ea27d737557b55d8e16fe7c29dc142761f Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 11:37:30 -0400 Subject: [PATCH 39/48] Remove split encoding changes from layout27 Signed-off-by: Nicholas Gates --- .../experimental/onpair/src/compute/like.rs | 520 ------------------ .../experimental/onpair/src/compute/mod.rs | 1 - encodings/experimental/onpair/src/kernel.rs | 3 - encodings/fsst/src/compute/like.rs | 24 +- encodings/fsst/src/dfa/mod.rs | 4 +- encodings/fsst/src/kernel.rs | 43 -- vortex-array/src/arrays/dict/compute/rules.rs | 107 ---- vortex-array/src/arrays/dict/vtable/kernel.rs | 2 - vortex-array/src/arrays/filter/kernel.rs | 9 +- vortex-array/src/arrays/shared/vtable.rs | 8 - vortex-array/src/executor.rs | 31 -- vortex-array/src/scalar_fn/fns/like/kernel.rs | 45 -- vortex-file/src/strategy.rs | 13 +- 13 files changed, 7 insertions(+), 803 deletions(-) delete mode 100644 encodings/experimental/onpair/src/compute/like.rs diff --git a/encodings/experimental/onpair/src/compute/like.rs 
b/encodings/experimental/onpair/src/compute/like.rs deleted file mode 100644 index 84925e18d69..00000000000 --- a/encodings/experimental/onpair/src/compute/like.rs +++ /dev/null @@ -1,520 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use memchr::memmem::Finder; -use vortex_array::ArrayRef; -use vortex_array::ArrayView; -use vortex_array::ExecutionCtx; -use vortex_array::IntoArray; -use vortex_array::arrays::BoolArray; -use vortex_array::scalar_fn::fns::like::LikeKernel; -use vortex_array::scalar_fn::fns::like::LikeOptions; -use vortex_buffer::BitBuffer; -use vortex_error::VortexResult; -use vortex_error::vortex_ensure; -use vortex_error::vortex_err; - -use crate::OnPair; -use crate::OnPairArrayExt; -use crate::OnPairArraySlotsExt; -use crate::decode::code_boundary_at; -use crate::decode::collect_widened; - -#[derive(Clone, Copy)] -enum SimpleLike<'a> { - All, - Exact(&'a [u8]), - Prefix(&'a [u8]), - Suffix(&'a [u8]), - Contains(&'a [u8]), -} - -impl LikeKernel for OnPair { - fn like( - array: ArrayView<'_, Self>, - pattern: &ArrayRef, - options: LikeOptions, - ctx: &mut ExecutionCtx, - ) -> VortexResult> { - let Some(pattern_scalar) = pattern.as_constant() else { - return Ok(None); - }; - if options.case_insensitive { - return Ok(None); - } - - let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() { - let Some(v) = s.value() else { - return Ok(None); - }; - v.as_ref() - } else if let Some(b) = pattern_scalar.as_binary_opt() { - let Some(v) = b.value() else { - return Ok(None); - }; - v - } else { - return Ok(None); - }; - let Some(parsed) = parse_simple_like(pattern_bytes) else { - return Ok(None); - }; - - let codes_offsets = array.codes_offsets(); - let code_start = code_boundary_at(codes_offsets, 0, ctx)?; - let code_end = code_boundary_at(codes_offsets, array.len(), ctx)?; - vortex_ensure!( - code_start <= code_end, - "OnPair codes_offsets must be nondecreasing" - ); - 
vortex_ensure!( - code_end <= array.codes().len(), - "OnPair codes_offsets end {} exceeds codes len {}", - code_end, - array.codes().len() - ); - - let codes = collect_widened::(&array.codes().slice(code_start..code_end)?, ctx)?; - let code_offsets = normalize_code_offsets( - collect_widened::(codes_offsets, ctx)?.as_slice(), - code_start, - code_end, - )?; - let dict_offsets = collect_widened::(array.dict_offsets(), ctx)?; - let dict_bytes = array.dict_bytes(); - let dict_bytes = dict_bytes.as_slice(); - let mut tail = Vec::new(); - let mut scratch = Vec::new(); - - let bits = match parsed { - SimpleLike::All => BitBuffer::collect_bool(array.len(), |_| true ^ options.negated), - SimpleLike::Exact(needle) => BitBuffer::collect_bool(array.len(), |row| { - row_matches_exact( - row_codes(&code_offsets, &codes, row), - dict_bytes, - dict_offsets.as_slice(), - needle, - ) ^ options.negated - }), - SimpleLike::Prefix(needle) => BitBuffer::collect_bool(array.len(), |row| { - row_matches_prefix( - row_codes(&code_offsets, &codes, row), - dict_bytes, - dict_offsets.as_slice(), - needle, - ) ^ options.negated - }), - SimpleLike::Suffix(needle) => BitBuffer::collect_bool(array.len(), |row| { - row_matches_suffix( - row_codes(&code_offsets, &codes, row), - dict_bytes, - dict_offsets.as_slice(), - needle, - &mut tail, - ) ^ options.negated - }), - SimpleLike::Contains(needle) => { - let finder = Finder::new(needle); - BitBuffer::collect_bool(array.len(), |row| { - row_matches_contains( - row_codes(&code_offsets, &codes, row), - dict_bytes, - dict_offsets.as_slice(), - needle, - &finder, - &mut tail, - &mut scratch, - ) ^ options.negated - }) - } - }; - - let validity = array - .array_validity() - .union_nullability(pattern_scalar.dtype().nullability()); - Ok(Some(BoolArray::new(bits, validity).into_array())) - } -} - -fn normalize_code_offsets( - code_offsets: &[u64], - code_start: usize, - code_end: usize, -) -> VortexResult> { - let offsets = code_offsets - .iter() - 
.map(|&offset| { - usize::try_from(offset) - .map_err(|_| vortex_err!("OnPair code offset {} exceeds usize", offset)) - }) - .collect::>>()?; - - for &offset in &offsets { - vortex_ensure!( - offset >= code_start && offset <= code_end, - "OnPair codes offset {} outside row window {}..{}", - offset, - code_start, - code_end - ); - } - for window in offsets.windows(2) { - vortex_ensure!( - window[0] <= window[1], - "OnPair codes_offsets must be nondecreasing" - ); - } - - Ok(offsets - .into_iter() - .map(|offset| offset - code_start) - .collect()) -} - -fn parse_simple_like(pattern: &[u8]) -> Option> { - if pattern.is_empty() { - return Some(SimpleLike::Exact(b"")); - } - if pattern.iter().any(|&b| matches!(b, b'_' | b'\\')) { - return None; - } - - let Some(first_literal) = pattern.iter().position(|&b| b != b'%') else { - return Some(SimpleLike::All); - }; - let last_literal = pattern.iter().rposition(|&b| b != b'%')? + 1; - let literal = &pattern[first_literal..last_literal]; - if literal.contains(&b'%') { - return None; - } - - match (first_literal == 0, last_literal == pattern.len()) { - (true, true) => Some(SimpleLike::Exact(literal)), - (true, false) => Some(SimpleLike::Prefix(literal)), - (false, true) => Some(SimpleLike::Suffix(literal)), - (false, false) => Some(SimpleLike::Contains(literal)), - } -} - -fn row_codes<'a>(code_offsets: &[usize], codes: &'a [u16], row: usize) -> &'a [u16] { - let start = code_offsets[row]; - let end = code_offsets[row + 1]; - &codes[start..end] -} - -fn token_bytes<'a>(dict_bytes: &'a [u8], dict_offsets: &[u32], code: u16) -> &'a [u8] { - let code = usize::from(code); - let start = dict_offsets[code] as usize; - let end = dict_offsets[code + 1] as usize; - &dict_bytes[start..end] -} - -fn row_matches_exact( - codes: &[u16], - dict_bytes: &[u8], - dict_offsets: &[u32], - needle: &[u8], -) -> bool { - let mut matched = 0; - for &code in codes { - let token = token_bytes(dict_bytes, dict_offsets, code); - if matched + token.len() 
> needle.len() { - return false; - } - if token != &needle[matched..matched + token.len()] { - return false; - } - matched += token.len(); - } - matched == needle.len() -} - -fn row_matches_prefix( - codes: &[u16], - dict_bytes: &[u8], - dict_offsets: &[u32], - needle: &[u8], -) -> bool { - if needle.is_empty() { - return true; - } - - let mut matched = 0; - for &code in codes { - let token = token_bytes(dict_bytes, dict_offsets, code); - let take = (needle.len() - matched).min(token.len()); - if token[..take] != needle[matched..matched + take] { - return false; - } - matched += take; - if matched == needle.len() { - return true; - } - } - false -} - -fn row_matches_suffix( - codes: &[u16], - dict_bytes: &[u8], - dict_offsets: &[u32], - needle: &[u8], - tail: &mut Vec, -) -> bool { - if needle.is_empty() { - return true; - } - - let mut total_len = 0; - tail.clear(); - for &code in codes { - let token = token_bytes(dict_bytes, dict_offsets, code); - total_len += token.len(); - append_tail(tail, token, needle.len()); - } - total_len >= needle.len() && tail.as_slice() == needle -} - -fn row_matches_contains( - codes: &[u16], - dict_bytes: &[u8], - dict_offsets: &[u32], - needle: &[u8], - finder: &Finder<'_>, - tail: &mut Vec, - scratch: &mut Vec, -) -> bool { - if needle.is_empty() { - return true; - } - - tail.clear(); - for &code in codes { - let token = token_bytes(dict_bytes, dict_offsets, code); - if finder.find(token).is_some() { - return true; - } - if !tail.is_empty() { - scratch.clear(); - scratch.extend_from_slice(tail); - scratch.extend_from_slice(token); - if finder.find(scratch).is_some() { - return true; - } - } - append_tail(tail, token, needle.len() - 1); - } - false -} - -fn append_tail(tail: &mut Vec, bytes: &[u8], max_len: usize) { - if max_len == 0 { - return; - } - if bytes.len() >= max_len { - tail.clear(); - tail.extend_from_slice(&bytes[bytes.len() - max_len..]); - return; - } - let overflow = tail.len() + bytes.len(); - if overflow > max_len 
{ - tail.drain(..overflow - max_len); - } - tail.extend_from_slice(bytes); -} - -#[cfg(test)] -mod tests { - use std::sync::LazyLock; - - use vortex_array::ArrayRef; - use vortex_array::Canonical; - use vortex_array::IntoArray; - use vortex_array::VortexSessionExecute; - use vortex_array::arrays::BoolArray; - use vortex_array::arrays::ConstantArray; - use vortex_array::arrays::ScalarFn; - use vortex_array::arrays::SharedArray; - use vortex_array::arrays::VarBinArray; - use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; - use vortex_array::assert_arrays_eq; - use vortex_array::dtype::DType; - use vortex_array::dtype::Nullability; - use vortex_array::scalar_fn::fns::like::Like; - use vortex_array::scalar_fn::fns::like::LikeOptions; - use vortex_error::VortexResult; - use vortex_mask::Mask; - use vortex_session::VortexSession; - - use crate::OnPair; - use crate::compress::DEFAULT_DICT12_CONFIG; - use crate::compress::onpair_compress; - - static SESSION: LazyLock = LazyLock::new(|| { - let session = vortex_array::array_session(); - crate::initialize(&session); - session - }); - - fn run_like( - values: &[Option<&str>], - pattern: &str, - options: LikeOptions, - ) -> VortexResult { - let input = - VarBinArray::from_iter(values.iter().copied(), DType::Utf8(Nullability::Nullable)); - let len = input.len(); - let dtype = input.dtype().clone(); - let array = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG)?.into_array(); - let pattern = ConstantArray::new(pattern, len).into_array(); - let result = Like - .try_new_array(len, options, [array, pattern])? - .into_array() - .execute::(&mut SESSION.create_execution_ctx())? 
- .into_bool(); - Ok(result) - } - - #[test] - fn like_contains() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); - let result = run_like( - &[ - Some("https://google.example"), - Some("no match"), - Some("prefix Google suffix"), - None, - ], - "%Google%", - LikeOptions::default(), - )?; - assert_arrays_eq!( - &result, - &BoolArray::from_iter([Some(false), Some(false), Some(true), None]), - &mut ctx - ); - Ok(()) - } - - #[test] - fn like_prefix_suffix_exact_and_negated() -> VortexResult<()> { - let mut ctx = SESSION.create_execution_ctx(); - let values = [ - Some("2020-10-01"), - Some("2020-11-01"), - Some("x-2020-10-01"), - Some(""), - ]; - assert_arrays_eq!( - &run_like(&values, "2020-10-%", LikeOptions::default())?, - &BoolArray::from_iter([Some(true), Some(false), Some(false), Some(false)]), - &mut ctx - ); - assert_arrays_eq!( - &run_like(&values, "%-01", LikeOptions::default())?, - &BoolArray::from_iter([Some(true), Some(true), Some(true), Some(false)]), - &mut ctx - ); - assert_arrays_eq!( - &run_like(&values, "2020-10-01", LikeOptions::default())?, - &BoolArray::from_iter([Some(true), Some(false), Some(false), Some(false)]), - &mut ctx - ); - assert_arrays_eq!( - &run_like( - &values, - "%2020%", - LikeOptions { - negated: true, - case_insensitive: false, - }, - )?, - &BoolArray::from_iter([Some(false), Some(false), Some(false), Some(true)]), - &mut ctx - ); - Ok(()) - } - - #[test] - fn like_filtered_onpair_stays_lazy_after_one_step() -> VortexResult<()> { - let input = VarBinArray::from_iter( - [ - Some("Google alpha"), - Some("irrelevant"), - Some("Google beta"), - Some("other"), - ], - DType::Utf8(Nullability::Nullable), - ); - let len = input.len(); - let dtype = input.dtype().clone(); - let array = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG)?.into_array(); - let filtered = array.filter(Mask::from_iter([true, false, true, false]))?; - let pattern = ConstantArray::new("%Google%", filtered.len()).into_array(); - let 
like = Like - .try_new_array(filtered.len(), LikeOptions::default(), [filtered, pattern])? - .into_array(); - - let stepped = like.execute::(&mut SESSION.create_execution_ctx())?; - assert!(stepped.is::()); - assert!(stepped.children()[0].is::()); - - let result = stepped - .execute::(&mut SESSION.create_execution_ctx())? - .into_bool(); - let mut ctx = SESSION.create_execution_ctx(); - assert_arrays_eq!( - &result, - &BoolArray::from_iter([Some(true), Some(true)]), - &mut ctx - ); - Ok(()) - } - - #[test] - fn filter_shared_onpair_stays_encoded_after_one_step() -> VortexResult<()> { - let input = VarBinArray::from_iter( - [ - Some("Google alpha"), - Some("irrelevant"), - Some("Google beta"), - Some("other"), - ], - DType::Utf8(Nullability::Nullable), - ); - let len = input.len(); - let dtype = input.dtype().clone(); - let array = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG)?.into_array(); - let shared = SharedArray::new(array).into_array(); - let filtered = shared.filter(Mask::from_iter([true, false, true, false]))?; - - let stepped = filtered.execute::(&mut SESSION.create_execution_ctx())?; - assert!(stepped.is::()); - assert_eq!(stepped.len(), 2); - Ok(()) - } - - #[test] - fn filter_sliced_onpair_stays_encoded_after_one_step() -> VortexResult<()> { - let input = VarBinArray::from_iter( - [ - Some("prefix"), - Some("Google alpha"), - Some("irrelevant"), - Some("Google beta"), - Some("suffix"), - ], - DType::Utf8(Nullability::Nullable), - ); - let len = input.len(); - let dtype = input.dtype().clone(); - let array = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG)?.into_array(); - let sliced = array.slice(1..4)?; - let filtered = sliced.filter(Mask::from_iter([true, false, true]))?; - - let stepped = filtered.execute::(&mut SESSION.create_execution_ctx())?; - assert!(stepped.is::()); - assert_eq!(stepped.len(), 2); - Ok(()) - } -} diff --git a/encodings/experimental/onpair/src/compute/mod.rs 
b/encodings/experimental/onpair/src/compute/mod.rs index 46cf8bf8bab..4ad5f48f578 100644 --- a/encodings/experimental/onpair/src/compute/mod.rs +++ b/encodings/experimental/onpair/src/compute/mod.rs @@ -5,5 +5,4 @@ mod byte_length; mod cast; mod compare; mod filter; -mod like; mod slice; diff --git a/encodings/experimental/onpair/src/kernel.rs b/encodings/experimental/onpair/src/kernel.rs index ed216bfa904..8863d750a72 100644 --- a/encodings/experimental/onpair/src/kernel.rs +++ b/encodings/experimental/onpair/src/kernel.rs @@ -10,8 +10,6 @@ use vortex_array::scalar_fn::fns::binary::Binary; use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; use vortex_array::scalar_fn::fns::byte_length::ByteLength; use vortex_array::scalar_fn::fns::byte_length::ByteLengthExecuteAdaptor; -use vortex_array::scalar_fn::fns::like::Like; -use vortex_array::scalar_fn::fns::like::LikeExecuteAdaptor; use vortex_session::VortexSession; use crate::OnPair; @@ -26,5 +24,4 @@ pub(super) fn initialize(session: &VortexSession) { OnPair, ByteLengthExecuteAdaptor(OnPair), ); - kernels.register_execute_parent_kernel(Like.id(), OnPair, LikeExecuteAdaptor(OnPair)); } diff --git a/encodings/fsst/src/compute/like.rs b/encodings/fsst/src/compute/like.rs index 1d621294b45..c922fba088e 100644 --- a/encodings/fsst/src/compute/like.rs +++ b/encodings/fsst/src/compute/like.rs @@ -3,28 +3,21 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; -use vortex_array::Canonical; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; use vortex_array::arrays::varbin::VarBinArrayExt; use vortex_array::match_each_integer_ptype; -use vortex_array::scalar_fn::fns::like::Like; use vortex_array::scalar_fn::fns::like::LikeKernel; use vortex_array::scalar_fn::fns::like::LikeOptions; use vortex_error::VortexResult; use crate::FSST; use crate::FSSTArrayExt; -use 
crate::canonical::canonicalize_fsst; use crate::dfa::FsstMatcher; -use crate::dfa::LikeKind; use crate::dfa::dfa_scan_to_bitbuf; -const DECODE_CONTAINS_MAX_NEEDLE_LEN: usize = 16; - impl LikeKernel for FSST { fn like( array: ArrayView<'_, Self>, @@ -54,24 +47,9 @@ impl LikeKernel for FSST { return Ok(None); }; - let like_kind = LikeKind::parse(pattern_bytes); - if let Some(LikeKind::Contains(needle)) = like_kind - && !needle.is_empty() - && needle.len() <= DECODE_CONTAINS_MAX_NEEDLE_LEN - { - // For short substring patterns, bulk FSST decode plus Arrow's memmem-backed LIKE is - // faster than walking the compressed stream through the byte-at-a-time DFA. - let decoded = canonicalize_fsst(array, ctx)?; - let result = Like - .try_new_array(array.len(), options, [decoded, pattern.clone()])? - .into_array() - .execute::(ctx)? - .into_bool(); - return Ok(Some(result.into_array())); - } - let symbols = array.symbols(); let symbol_lengths = array.symbol_lengths(); + let Some(matcher) = FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)? else { diff --git a/encodings/fsst/src/dfa/mod.rs b/encodings/fsst/src/dfa/mod.rs index 2fcafb19cfc..5f67f92997e 100644 --- a/encodings/fsst/src/dfa/mod.rs +++ b/encodings/fsst/src/dfa/mod.rs @@ -211,7 +211,7 @@ impl FsstMatcher { } /// The subset of LIKE patterns we can handle without decompression. 
-pub(crate) enum LikeKind<'a> { +enum LikeKind<'a> { /// `prefix%` Prefix(Cow<'a, [u8]>), /// `%needle%` @@ -219,7 +219,7 @@ pub(crate) enum LikeKind<'a> { } impl<'a> LikeKind<'a> { - pub(crate) fn parse(pattern: &'a [u8]) -> Option { + fn parse(pattern: &'a [u8]) -> Option { Self::parse_prefix(pattern).or_else(|| Self::parse_contains(pattern)) } diff --git a/encodings/fsst/src/kernel.rs b/encodings/fsst/src/kernel.rs index 95d25080af7..30f25006195 100644 --- a/encodings/fsst/src/kernel.rs +++ b/encodings/fsst/src/kernel.rs @@ -38,20 +38,14 @@ mod tests { use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; - use vortex_array::arrays::BoolArray; - use vortex_array::arrays::ConstantArray; use vortex_array::arrays::FilterArray; use vortex_array::arrays::PrimitiveArray; - use vortex_array::arrays::SharedArray; - use vortex_array::arrays::VarBinArray; use vortex_array::arrays::varbin::builder::VarBinBuilder; use vortex_array::assert_arrays_eq; - use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::expr::byte_length; use vortex_array::expr::root; - use vortex_array::scalar_fn::fns::operators::Operator; use vortex_error::VortexResult; use vortex_mask::Mask; use vortex_session::VortexSession; @@ -236,41 +230,4 @@ mod tests { assert_arrays_eq!(result, expected, &mut ctx); Ok(()) } - - #[test] - fn test_shared_fsst_parent_kernels() -> VortexResult<()> { - let session = vortex_array::array_session(); - crate::initialize(&session); - let mut ctx = session.create_execution_ctx(); - - let varbin = VarBinArray::from_iter( - ["hello", "", "world!!"].map(Some), - DType::Utf8(Nullability::NonNullable), - ) - .into_array(); - let compressor = fsst_train_compressor(&varbin, &mut ctx)?; - let fsst = fsst_compress(&varbin, &compressor, &mut ctx)?.into_array(); - let shared = SharedArray::new(fsst).into_array(); - - let lengths = 
shared.clone().apply(&byte_length(root()))?; - assert_arrays_eq!( - lengths, - PrimitiveArray::from_iter(vec![5u64, 0, 7]), - &mut ctx - ); - - let not_empty = shared - .binary( - ConstantArray::new("", shared.len()).into_array(), - Operator::NotEq, - )? - .execute::(&mut ctx)?; - assert_arrays_eq!( - not_empty, - BoolArray::from_iter([true, false, true]), - &mut ctx - ); - - Ok(()) - } } diff --git a/vortex-array/src/arrays/dict/compute/rules.rs b/vortex-array/src/arrays/dict/compute/rules.rs index e4d102ee861..b4804218e37 100644 --- a/vortex-array/src/arrays/dict/compute/rules.rs +++ b/vortex-array/src/arrays/dict/compute/rules.rs @@ -9,15 +9,12 @@ use crate::EqMode; use crate::IntoArray; use crate::array::ArrayView; use crate::array::VTable; -use crate::arrays::Chunked; -use crate::arrays::ChunkedArray; use crate::arrays::Constant; use crate::arrays::ConstantArray; use crate::arrays::Dict; use crate::arrays::DictArray; use crate::arrays::ScalarFn; use crate::arrays::ScalarFnArray; -use crate::arrays::chunked::ChunkedArrayExt; use crate::arrays::dict::DictArrayExt; use crate::arrays::dict::DictArraySlotsExt; use crate::arrays::filter::FilterReduceAdaptor; @@ -40,59 +37,11 @@ pub(crate) const PARENT_RULES: ParentRuleSet = ParentRuleSet::new(&[ ParentRuleSet::lift(&CastReduceAdaptor(Dict)), ParentRuleSet::lift(&MaskReduceAdaptor(Dict)), ParentRuleSet::lift(&LikeReduceAdaptor(Dict)), - ParentRuleSet::lift(&DictionaryChunkedValuesPullUpRule), ParentRuleSet::lift(&DictionaryScalarFnValuesPushDownRule), ParentRuleSet::lift(&DictionaryScalarFnCodesPullUpRule), ParentRuleSet::lift(&SliceReduceAdaptor(Dict)), ]); -/// Pull a common dictionary values array above chunked dictionary codes. -/// -/// Rewrites `Chunked>` into `Dict, values>` only when -/// every child dictionary shares the exact same values array allocation. 
-#[derive(Debug)] -struct DictionaryChunkedValuesPullUpRule; - -impl ArrayParentReduceRule for DictionaryChunkedValuesPullUpRule { - type Parent = Chunked; - - fn reduce_parent( - &self, - array: ArrayView<'_, Dict>, - parent: ArrayView<'_, Chunked>, - _child_idx: usize, - ) -> VortexResult> { - let values = array.values(); - let codes_dtype = array.codes().dtype().clone(); - let mut code_chunks = Vec::with_capacity(parent.nchunks()); - let mut all_values_referenced = array.has_all_values_referenced(); - - for chunk in parent.iter_chunks() { - let Some(dict) = chunk.as_opt::() else { - return Ok(None); - }; - if dict.codes().dtype() != &codes_dtype { - return Ok(None); - } - if !ArrayRef::ptr_eq(dict.values(), values) { - return Ok(None); - } - all_values_referenced |= dict.has_all_values_referenced(); - code_chunks.push(dict.codes().clone()); - } - - let codes = ChunkedArray::try_new(code_chunks, codes_dtype)?.into_array(); - let dict = DictArray::try_new(codes, values.clone())?; - let dict = if all_values_referenced { - unsafe { dict.set_all_values_referenced(true) } - } else { - dict - }; - - Ok(Some(dict.into_array())) - } -} - /// Push down a scalar function to run only over the values of a dictionary array. 
#[derive(Debug)] struct DictionaryScalarFnValuesPushDownRule; @@ -265,72 +214,16 @@ mod tests { use vortex_buffer::buffer; use vortex_error::VortexResult; - use crate::ArrayRef; use crate::IntoArray; use crate::arrays::BoolArray; - use crate::arrays::Chunked; - use crate::arrays::ChunkedArray; use crate::arrays::Dict; use crate::arrays::DictArray; - use crate::arrays::PrimitiveArray; - use crate::arrays::chunked::ChunkedArrayExt; use crate::arrays::dict::DictArrayExt; - use crate::arrays::dict::DictArraySlotsExt; use crate::arrays::scalar_fn::ScalarFnFactoryExt; - use crate::assert_arrays_eq; - use crate::executor::VortexSessionExecute; use crate::optimizer::ArrayOptimizer; use crate::scalar_fn::EmptyOptions; use crate::scalar_fn::fns::not::Not; - #[test] - fn chunked_dict_with_shared_values_pulls_values_up() -> VortexResult<()> { - let values = buffer![10u32, 20, 30].into_array(); - let chunk0 = DictArray::try_new(buffer![0u8, 1].into_array(), values.clone())?.into_array(); - let chunk1 = - DictArray::try_new(buffer![2u8, 0, 1].into_array(), values.clone())?.into_array(); - let array = - ChunkedArray::try_new(vec![chunk0, chunk1], values.dtype().clone())?.into_array(); - - let optimized = array.optimize()?; - let dict = optimized.as_::(); - let codes = dict.codes().as_::(); - - assert!(ArrayRef::ptr_eq(dict.values(), &values)); - assert_eq!(codes.nchunks(), 2); - let mut ctx = crate::LEGACY_SESSION.create_execution_ctx(); - assert_arrays_eq!( - optimized, - PrimitiveArray::from_iter([10u32, 20, 30, 10, 20]), - &mut ctx - ); - - Ok(()) - } - - #[test] - fn chunked_dict_with_distinct_values_stays_chunked() -> VortexResult<()> { - let values0 = buffer![10u32, 20, 30].into_array(); - let values1 = buffer![10u32, 20, 30].into_array(); - let chunk0 = - DictArray::try_new(buffer![0u8, 1].into_array(), values0.clone())?.into_array(); - let chunk1 = DictArray::try_new(buffer![2u8, 0, 1].into_array(), values1)?.into_array(); - let array = - 
ChunkedArray::try_new(vec![chunk0, chunk1], values0.dtype().clone())?.into_array(); - - let optimized = array.optimize()?; - - assert!(optimized.is::()); - let mut ctx = crate::LEGACY_SESSION.create_execution_ctx(); - assert_arrays_eq!( - optimized, - PrimitiveArray::from_iter([10u32, 20, 30, 10, 20]), - &mut ctx - ); - - Ok(()) - } - #[test] fn scalar_fn_values_pushdown_preserves_all_values_referenced() -> VortexResult<()> { let dict = unsafe { diff --git a/vortex-array/src/arrays/dict/vtable/kernel.rs b/vortex-array/src/arrays/dict/vtable/kernel.rs index 79659af18dd..ab750f7d663 100644 --- a/vortex-array/src/arrays/dict/vtable/kernel.rs +++ b/vortex-array/src/arrays/dict/vtable/kernel.rs @@ -4,7 +4,6 @@ use vortex_session::VortexSession; use crate::ArrayVTable; -use crate::arrays::Chunked; use crate::arrays::Dict; use crate::arrays::dict::TakeExecuteAdaptor; use crate::optimizer::kernels::ArrayKernelsExt; @@ -17,7 +16,6 @@ use crate::scalar_fn::fns::fill_null::FillNullExecuteAdaptor; pub(crate) fn initialize(session: &VortexSession) { let kernels = session.kernels(); kernels.register_execute_parent_kernel(Binary.id(), Dict, CompareExecuteAdaptor(Dict)); - kernels.register_execute_parent_kernel(Dict.id(), Chunked, TakeExecuteAdaptor(Chunked)); kernels.register_execute_parent_kernel(Dict.id(), Dict, TakeExecuteAdaptor(Dict)); kernels.register_execute_parent_kernel(FillNull.id(), Dict, FillNullExecuteAdaptor(Dict)); } diff --git a/vortex-array/src/arrays/filter/kernel.rs b/vortex-array/src/arrays/filter/kernel.rs index 4213d692509..21bd225bf55 100644 --- a/vortex-array/src/arrays/filter/kernel.rs +++ b/vortex-array/src/arrays/filter/kernel.rs @@ -26,14 +26,11 @@ use crate::kernel::ExecuteParentKernel; use crate::matcher::Matcher; use crate::optimizer::kernels::ArrayKernelsExt; use crate::optimizer::rules::ArrayParentReduceRule; -use crate::scalar_fn::ScalarFnVTable; -use crate::scalar_fn::fns::like::Like; -use crate::scalar_fn::fns::like::LikeFilterExecuteAdaptor; 
pub(crate) fn initialize(session: &VortexSession) { - let kernels = session.kernels(); - kernels.register_execute_parent_kernel(Dict.id(), Filter, TakeExecuteAdaptor(Filter)); - kernels.register_execute_parent_kernel(Like.id(), Filter, LikeFilterExecuteAdaptor); + session + .kernels() + .register_execute_parent_kernel(Dict.id(), Filter, TakeExecuteAdaptor(Filter)); } pub trait FilterReduce: VTable { diff --git a/vortex-array/src/arrays/shared/vtable.rs b/vortex-array/src/arrays/shared/vtable.rs index fc03255d785..3c3a09216d2 100644 --- a/vortex-array/src/arrays/shared/vtable.rs +++ b/vortex-array/src/arrays/shared/vtable.rs @@ -113,14 +113,6 @@ impl VTable for Shared { .get_or_compute(|source| source.clone().execute::(ctx)) .map(ExecutionResult::done) } - - fn reduce_parent( - array: ArrayView<'_, Self>, - parent: &ArrayRef, - child_idx: usize, - ) -> VortexResult> { - array.current_array_ref().reduce_parent(parent, child_idx) - } } impl OperationsVTable for Shared { fn scalar_at( diff --git a/vortex-array/src/executor.rs b/vortex-array/src/executor.rs index b09c2d048b9..0c083f18ee5 100644 --- a/vortex-array/src/executor.rs +++ b/vortex-array/src/executor.rs @@ -34,8 +34,6 @@ use crate::ArrayRef; use crate::Canonical; use crate::IntoArray; use crate::array::ArrayId; -use crate::arrays::Shared; -use crate::arrays::shared::SharedArrayExt; use crate::builders::ArrayBuilder; use crate::builders::builder_with_capacity_in; use crate::dtype::DType; @@ -570,35 +568,6 @@ fn execute_parent_for_child( slot_idx: usize, kernels: &ParentExecutionKernels, ctx: &mut ExecutionCtx, -) -> VortexResult> { - if let Some(result) = execute_parent_for_exact_child(parent, child, slot_idx, kernels, ctx)? { - return Ok(Some(result)); - } - - // Shared is a transparent cache wrapper. Try kernels against the wrapped source/current array - // before forcing Shared to canonicalize and populate its cache. 
- let mut current = child.clone(); - while let Some(source) = current - .as_opt::() - .map(|shared| shared.current_array_ref().clone()) - { - if let Some(result) = - execute_parent_for_exact_child(parent, &source, slot_idx, kernels, ctx)? - { - return Ok(Some(result)); - } - current = source; - } - - Ok(None) -} - -fn execute_parent_for_exact_child( - parent: &ArrayRef, - child: &ArrayRef, - slot_idx: usize, - kernels: &ParentExecutionKernels, - ctx: &mut ExecutionCtx, ) -> VortexResult> { let key = execute_parent_key(parent.encoding_id(), child.encoding_id()); if let Some(plugins) = kernels.get(&key) { diff --git a/vortex-array/src/scalar_fn/fns/like/kernel.rs b/vortex-array/src/scalar_fn/fns/like/kernel.rs index e62f41d9f92..b3b683212ff 100644 --- a/vortex-array/src/scalar_fn/fns/like/kernel.rs +++ b/vortex-array/src/scalar_fn/fns/like/kernel.rs @@ -6,12 +6,9 @@ use vortex_error::VortexResult; use crate::ArrayRef; use crate::ExecutionCtx; -use crate::IntoArray; use crate::array::ArrayView; use crate::array::VTable; -use crate::arrays::Filter; use crate::arrays::ScalarFn; -use crate::arrays::ScalarFnArray; use crate::arrays::scalar_fn::ExactScalarFn; use crate::arrays::scalar_fn::ScalarFnArrayExt; use crate::arrays::scalar_fn::ScalarFnArrayView; @@ -108,45 +105,3 @@ where ::like(array, pattern, options, ctx) } } - -/// Adaptor that executes a filtered input before evaluating LIKE. -/// -/// This preserves sparse row demand for `LIKE(Filter(child), constant)`: the filter executes into a -/// filtered child first, then the regular child-specific LIKE execute-parent kernel can run over -/// only the selected rows. 
-#[derive(Default, Debug)] -pub struct LikeFilterExecuteAdaptor; - -impl ExecuteParentKernel for LikeFilterExecuteAdaptor { - type Parent = ExactScalarFn; - - fn execute_parent( - &self, - array: ArrayView<'_, Filter>, - parent: ScalarFnArrayView<'_, LikeExpr>, - child_idx: usize, - ctx: &mut ExecutionCtx, - ) -> VortexResult> { - if child_idx != 0 { - return Ok(None); - } - let scalar_fn_array = parent - .as_opt::() - .vortex_expect("ExactScalarFn matcher confirmed ScalarFnArray"); - let filtered = array.array().clone().execute::(ctx)?; - let children = scalar_fn_array - .iter_children() - .enumerate() - .map(|(idx, child)| { - if idx == child_idx { - filtered.clone() - } else { - child.clone() - } - }) - .collect(); - Ok(Some( - ScalarFnArray::try_new(scalar_fn_array.scalar_fn().clone(), children)?.into_array(), - )) - } -} diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 3cb9fb1bab9..804218779c8 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -32,8 +32,6 @@ use vortex_array::dtype::FieldPath; use vortex_btrblocks::BtrBlocksCompressorBuilder; use vortex_btrblocks::SchemeExt; use vortex_btrblocks::schemes::integer::IntDictScheme; -#[cfg(feature = "unstable_encodings")] -use vortex_btrblocks::schemes::string::OnPairScheme; use vortex_bytebool::ByteBool; use vortex_datetime_parts::DateTimeParts; use vortex_decimal_byte_parts::DecimalByteParts; @@ -162,17 +160,8 @@ impl Default for WriteStrategyBuilder { /// Create a new empty builder. It can be further configured, /// and then finally built yielding the [`LayoutStrategy`]. fn default() -> Self { - #[cfg_attr(not(feature = "unstable_encodings"), allow(unused_mut))] - let mut compressor = BtrBlocksCompressorBuilder::default(); - #[cfg(feature = "unstable_encodings")] - { - // OnPair currently optimizes for compressed size, but its string predicate kernels are - // not yet competitive with FSST for the scan-heavy default file format. 
- compressor = compressor.exclude_schemes([OnPairScheme.id()]); - } - Self { - compressor: CompressorConfig::BtrBlocks(compressor), + compressor: CompressorConfig::BtrBlocks(BtrBlocksCompressorBuilder::default()), row_block_size: 8192, field_writers: HashMap::new(), allow_encodings: Some(ALLOWED_ENCODINGS.clone()), From 29264207bccdaa90c0ea8ce6388d191d31c65767 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 11:45:41 -0400 Subject: [PATCH 40/48] Remove row-index sortedness validation from layout27 Signed-off-by: "Nicholas Gates" --- vortex-bench/src/random_access/take.rs | 2 +- vortex-cxx/src/lib.rs | 5 +- vortex-cxx/src/read.rs | 5 +- vortex-datafusion/src/persistent/opener.rs | 12 +- vortex-duckdb/src/convert/table_filter.rs | 16 +- vortex-ffi/src/scan.rs | 4 +- vortex-file/src/scan_v1_v2_differential.rs | 8 +- vortex-file/src/tests.rs | 9 +- vortex-jni/src/scan.rs | 4 +- vortex-layout/src/scan/scan_builder.rs | 6 +- vortex-python/src/dataset.rs | 2 +- vortex-python/src/file.rs | 2 +- vortex-scan/src/selection.rs | 234 ++++----------------- 13 files changed, 64 insertions(+), 245 deletions(-) diff --git a/vortex-bench/src/random_access/take.rs b/vortex-bench/src/random_access/take.rs index 42d6ebb4924..e86e5d576d6 100644 --- a/vortex-bench/src/random_access/take.rs +++ b/vortex-bench/src/random_access/take.rs @@ -76,7 +76,7 @@ impl RandomAccessor for VortexRandomAccessor { let array = self .file .scan()? - .with_row_indices(indices_buf)? + .with_row_indices(indices_buf) .into_array_stream()? 
.read_all() .await?; diff --git a/vortex-cxx/src/lib.rs b/vortex-cxx/src/lib.rs index a7535e66027..b6d002513f6 100644 --- a/vortex-cxx/src/lib.rs +++ b/vortex-cxx/src/lib.rs @@ -95,10 +95,7 @@ mod ffi { fn with_projection(self: &mut VortexScanBuilder, projection: Box); fn with_projection_ref(self: &mut VortexScanBuilder, projection: &Expr); fn with_row_range(self: &mut VortexScanBuilder, row_range_start: u64, row_range_end: u64); - fn with_include_by_index( - self: &mut VortexScanBuilder, - include_by_index: &[u64], - ) -> Result<()>; + fn with_include_by_index(self: &mut VortexScanBuilder, include_by_index: &[u64]); fn with_limit(self: &mut VortexScanBuilder, limit: usize); unsafe fn with_output_schema( self: &mut VortexScanBuilder, diff --git a/vortex-cxx/src/read.rs b/vortex-cxx/src/read.rs index be071958d0b..4ce229b7a2b 100644 --- a/vortex-cxx/src/read.rs +++ b/vortex-cxx/src/read.rs @@ -91,10 +91,9 @@ impl VortexScanBuilder { }); } - pub(crate) fn with_include_by_index(&mut self, include_by_index: &[u64]) -> Result<()> { - let selection = Selection::include_by_index(Buffer::copy_from(include_by_index))?; + pub(crate) fn with_include_by_index(&mut self, include_by_index: &[u64]) { + let selection = Selection::IncludeByIndex(Buffer::copy_from(include_by_index)); take_mut::take(&mut self.inner, |inner| inner.with_selection(selection)); - Ok(()) } pub(crate) fn with_limit(&mut self, limit: usize) { diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 29232546ee9..d0dd158fa41 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -1472,9 +1472,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::include_by_index( + VortexAccessPlan::default().with_selection(Selection::IncludeByIndex( Buffer::from_iter(vec![1, 3, 5, 7]), - )?), + )), ); let opener 
= make_test_opener( @@ -1516,9 +1516,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::exclude_by_index( + VortexAccessPlan::default().with_selection(Selection::ExcludeByIndex( Buffer::from_iter(vec![0, 2, 4, 6, 8]), - )?), + )), ); let opener = make_test_opener( @@ -1592,9 +1592,9 @@ mod tests { let mut file = PartitionedFile::new(file_path.to_string(), data_size); file.extensions .insert( - VortexAccessPlan::default().with_selection(Selection::include_by_index( + VortexAccessPlan::default().with_selection(Selection::IncludeByIndex( Buffer::from_iter(vec![1, 3, 5, 7, 9]), - )?), + )), ); let mut opener_v1 = make_test_opener( diff --git a/vortex-duckdb/src/convert/table_filter.rs b/vortex-duckdb/src/convert/table_filter.rs index 2c8be999e04..fa7ee26e744 100644 --- a/vortex-duckdb/src/convert/table_filter.rs +++ b/vortex-duckdb/src/convert/table_filter.rs @@ -166,12 +166,6 @@ fn intersect_sorted(left: &[u64], right: &[u64]) -> Vec { result } -fn normalize_indices(mut indices: Vec) -> Vec { - indices.sort_unstable(); - indices.dedup(); - indices -} - /// For constant comparison on IN filters over file_index or file_row_number /// virtual column, create a selection and a range covering the same range as /// expressions do. 
@@ -184,10 +178,7 @@ pub fn try_from_virtual_column_filter( .iter() .map(nonnegative_number_from_value) .collect::>>()?; - Ok(( - Selection::include_by_index(Buffer::from_iter(normalize_indices(indices)))?, - None, - )) + Ok((Selection::IncludeByIndex(Buffer::from_iter(indices)), None)) } TableFilterClass::ConstantComparison(const_) => { let n = nonnegative_number_from_value(const_.value)?; @@ -215,7 +206,7 @@ pub fn try_from_virtual_column_filter( let (sel, range) = try_from_virtual_column_filter(child)?; if let Selection::IncludeByIndex(buf) = sel { indices = Some(match indices { - None => buf.as_slice().to_vec(), + None => buf.iter().copied().collect(), Some(existing) => intersect_sorted(&existing, buf.as_ref()), }); } @@ -226,8 +217,7 @@ pub fn try_from_virtual_column_filter( } let range = (start < end).then_some(start..end); let sel = indices - .map(|v| Selection::include_by_index(Buffer::from_iter(v))) - .transpose()? + .map(|v| Selection::IncludeByIndex(Buffer::from_iter(v))) .unwrap_or(Selection::All); Ok((sel, range)) } diff --git a/vortex-ffi/src/scan.rs b/vortex-ffi/src/scan.rs index 5fb624d97cf..6452d0e232b 100644 --- a/vortex-ffi/src/scan.rs +++ b/vortex-ffi/src/scan.rs @@ -177,13 +177,13 @@ fn scan_request(opts: *const vx_scan_options) -> VortexResult { vortex_ensure!(!selection.idx.is_null()); let buf = unsafe { slice::from_raw_parts(selection.idx, selection.idx_len) }; let buf = Buffer::copy_from(buf); - Selection::include_by_index(buf)? + Selection::IncludeByIndex(buf) } vx_scan_selection_include::VX_SELECTION_EXCLUDE_RANGE => { vortex_ensure!(!selection.idx.is_null()); let buf = unsafe { slice::from_raw_parts(selection.idx, selection.idx_len) }; let buf = Buffer::copy_from(buf); - Selection::exclude_by_index(buf)? 
+ Selection::ExcludeByIndex(buf) } }; diff --git a/vortex-file/src/scan_v1_v2_differential.rs b/vortex-file/src/scan_v1_v2_differential.rs index 6207dc51d5a..27b3b9c61c2 100644 --- a/vortex-file/src/scan_v1_v2_differential.rs +++ b/vortex-file/src/scan_v1_v2_differential.rs @@ -383,7 +383,7 @@ async fn differential_row_range() -> VortexResult<()> { async fn differential_include_selection() -> VortexResult<()> { let file = write_file(chunked(), false).await?; let scan_request = ScanRequest { - selection: Selection::include_by_index(Buffer::from_iter([0, 2, 5, 9]))?, + selection: Selection::IncludeByIndex(Buffer::from_iter([0, 2, 5, 9])), ..request(root(), None) }; assert_v1_eq_v2(&file, scan_request).await @@ -393,7 +393,7 @@ async fn differential_include_selection() -> VortexResult<()> { async fn differential_exclude_selection() -> VortexResult<()> { let file = write_file(chunked(), false).await?; let scan_request = ScanRequest { - selection: Selection::exclude_by_index(Buffer::from_iter([1, 4, 7]))?, + selection: Selection::ExcludeByIndex(Buffer::from_iter([1, 4, 7])), ..request(root(), None) }; assert_v1_eq_v2(&file, scan_request).await @@ -414,8 +414,8 @@ async fn differential_unordered_multi_file_partition_selection() -> VortexResult let request = ScanRequest { projection: get_item("numbers", root()), row_range: Some(1..4), - selection: Selection::exclude_by_index(Buffer::from_iter([2]))?, - partition_selection: Selection::include_by_index(Buffer::from_iter([0, 2]))?, + selection: Selection::ExcludeByIndex(Buffer::from_iter([2])), + partition_selection: Selection::IncludeByIndex(Buffer::from_iter([0, 2])), ordered: false, ..Default::default() }; diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 824b46ca957..f583cd7e254 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -960,7 +960,6 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices(Buffer::::empty()) - .unwrap() .into_array_stream() .unwrap() 
.read_all() @@ -978,7 +977,6 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices(Buffer::from_iter(kept_indices)) - .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1004,7 +1002,6 @@ async fn test_with_indices_simple() { .scan() .unwrap() .with_row_indices((0u64..500).collect::>()) - .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1050,7 +1047,6 @@ async fn test_with_indices_on_two_columns() { .scan() .unwrap() .with_row_indices(Buffer::from_iter(kept_indices)) - .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1108,7 +1104,6 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices(Buffer::empty()) - .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1127,7 +1122,6 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices(Buffer::from_iter(kept_indices)) - .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1156,7 +1150,6 @@ async fn test_with_indices_and_with_row_filter_simple() { .unwrap() .with_filter(gt(get_item("numbers", root()), lit(50_i16))) .with_row_indices((0..500).collect::>()) - .unwrap() .into_array_stream() .unwrap() .read_all() @@ -1459,7 +1452,7 @@ async fn file_take() -> VortexResult<()> { let vxf = chunked_file().await?; let result = vxf .scan()? - .with_row_indices(buffer![0, 1, 8])? + .with_row_indices(buffer![0, 1, 8]) .into_array_stream()? 
.read_all() .await?; diff --git a/vortex-jni/src/scan.rs b/vortex-jni/src/scan.rs index ba320add6e3..360d92aa989 100644 --- a/vortex-jni/src/scan.rs +++ b/vortex-jni/src/scan.rs @@ -96,8 +96,8 @@ fn build_scan_request( let selection = match selection_include { 0 => Selection::All, - 1 => Selection::include_by_index(Buffer::copy_from(selection_idx))?, - 2 => Selection::exclude_by_index(Buffer::copy_from(selection_idx))?, + 1 => Selection::IncludeByIndex(Buffer::copy_from(selection_idx)), + 2 => Selection::ExcludeByIndex(Buffer::copy_from(selection_idx)), 3 => Selection::IncludeRoaring(deserialize_roaring_selection(selection_roaring_bitmap)?), 4 => Selection::ExcludeRoaring(deserialize_roaring_selection(selection_roaring_bitmap)?), other => vortex_bail!("unknown selection include code: {other}"), diff --git a/vortex-layout/src/scan/scan_builder.rs b/vortex-layout/src/scan/scan_builder.rs index 224a9e8783c..11fd5c7b882 100644 --- a/vortex-layout/src/scan/scan_builder.rs +++ b/vortex-layout/src/scan/scan_builder.rs @@ -173,9 +173,9 @@ impl ScanBuilder { } /// Select rows by absolute indices relative to the scan input. - pub fn with_row_indices(mut self, row_indices: Buffer) -> VortexResult { - self.selection = Selection::include_by_index(row_indices)?; - Ok(self) + pub fn with_row_indices(mut self, row_indices: Buffer) -> Self { + self.selection = Selection::IncludeByIndex(row_indices); + self } /// Set the root row offset used by row-index expressions. 
diff --git a/vortex-python/src/dataset.rs b/vortex-python/src/dataset.rs index da2d8f018ad..acb8285e5a0 100644 --- a/vortex-python/src/dataset.rs +++ b/vortex-python/src/dataset.rs @@ -66,7 +66,7 @@ pub fn read_array_from_reader( if let Some(indices) = indices { let primitive = indices.execute::(ctx)?; let indices = primitive.into_buffer(); - scan = scan.with_row_indices(indices)?; + scan = scan.with_row_indices(indices); } if let Some((l, r)) = row_range { diff --git a/vortex-python/src/file.rs b/vortex-python/src/file.rs index 65327b1e967..f6bd1ed9cda 100644 --- a/vortex-python/src/file.rs +++ b/vortex-python/src/file.rs @@ -220,7 +220,7 @@ fn scan_builder( if let Some(indices) = indices { let casted = indices.cast(DType::Primitive(PType::U64, NonNullable))?; let indices = casted.execute::(ctx)?.into_buffer::(); - builder = builder.with_row_indices(indices)?; + builder = builder.with_row_indices(indices); } if let Some(batch_size) = batch_size { diff --git a/vortex-scan/src/selection.rs b/vortex-scan/src/selection.rs index d17af0b950d..79abafc3d4d 100644 --- a/vortex-scan/src/selection.rs +++ b/vortex-scan/src/selection.rs @@ -7,99 +7,11 @@ use std::ops::Not; use std::ops::Range; use vortex_buffer::Buffer; -use vortex_error::VortexResult; -use vortex_error::vortex_bail; use vortex_error::vortex_panic; use vortex_mask::Mask; use crate::row_mask::RowMask; -/// A validated selection of rows to include by absolute row index. -#[derive(Clone, Debug)] -pub struct IncludeByIndex { - indices: Buffer, -} - -impl IncludeByIndex { - /// Create a new include-by-index selection. - pub fn try_new(indices: Buffer) -> VortexResult { - validate_indices(&indices)?; - Ok(Self { indices }) - } - - /// Return the selected row indices. - pub fn as_slice(&self) -> &[u64] { - self.indices.as_slice() - } - - /// Return true if the selection contains no row indices. - pub fn is_empty(&self) -> bool { - self.indices.is_empty() - } - - /// Return the number of selected row indices. 
- pub fn len(&self) -> usize { - self.indices.len() - } -} - -impl std::ops::Deref for IncludeByIndex { - type Target = [u64]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl AsRef<[u64]> for IncludeByIndex { - fn as_ref(&self) -> &[u64] { - self.as_slice() - } -} - -/// A validated selection of rows to exclude by absolute row index. -#[derive(Clone, Debug)] -pub struct ExcludeByIndex { - indices: Buffer, -} - -impl ExcludeByIndex { - /// Create a new exclude-by-index selection. - pub fn try_new(indices: Buffer) -> VortexResult { - validate_indices(&indices)?; - Ok(Self { indices }) - } - - /// Return the excluded row indices. - pub fn as_slice(&self) -> &[u64] { - self.indices.as_slice() - } - - /// Return true if the selection contains no row indices. - pub fn is_empty(&self) -> bool { - self.indices.is_empty() - } - - /// Return the number of excluded row indices. - pub fn len(&self) -> usize { - self.indices.len() - } -} - -impl std::ops::Deref for ExcludeByIndex { - type Target = [u64]; - - fn deref(&self) -> &Self::Target { - self.as_slice() - } -} - -impl AsRef<[u64]> for ExcludeByIndex { - fn as_ref(&self) -> &[u64] { - self.as_slice() - } -} - /// A selection identifies a set of rows to include in the scan (in addition to applying any /// filter predicates). #[derive(Default, Clone, Debug)] @@ -107,10 +19,10 @@ pub enum Selection { /// No selection, all rows are included. #[default] All, - /// A selection of sorted, unique rows to include by index. - IncludeByIndex(IncludeByIndex), - /// A selection of sorted, unique rows to exclude by index. - ExcludeByIndex(ExcludeByIndex), + /// A selection of sorted rows to include by index. + IncludeByIndex(Buffer), + /// A selection of sorted rows to exclude by index. + ExcludeByIndex(Buffer), /// A selection of rows to include using a [`roaring::RoaringTreemap`]. IncludeRoaring(roaring::RoaringTreemap), /// A selection of rows to exclude using a [`roaring::RoaringTreemap`]. 
@@ -118,16 +30,6 @@ pub enum Selection { } impl Selection { - /// Create a selection of rows to include by absolute row index. - pub fn include_by_index(indices: Buffer) -> VortexResult { - Ok(Self::IncludeByIndex(IncludeByIndex::try_new(indices)?)) - } - - /// Create a selection of rows to exclude by absolute row index. - pub fn exclude_by_index(indices: Buffer) -> VortexResult { - Ok(Self::ExcludeByIndex(ExcludeByIndex::try_new(indices)?)) - } - /// Return the row count for this selection. pub fn row_count(&self, total_rows: u64) -> u64 { match self { @@ -160,12 +62,12 @@ impl Selection { match self { Selection::All => RowMask::new(range.start, Mask::new_true(range_len)), Selection::IncludeByIndex(include) => { - let indices = include.as_slice(); - let mask = indices_range(range, indices) + let mask = indices_range(range, include) .map(|idx_range| { Mask::from_indices( range_len, - indices[idx_range] + include + .slice(idx_range) .iter() .map(|idx| { idx.checked_sub(range.start).unwrap_or_else(|| { @@ -187,26 +89,10 @@ impl Selection { RowMask::new(range.start, mask) } Selection::ExcludeByIndex(exclude) => { - let indices = exclude.as_slice(); - let mask = indices_range(range, indices) - .map(|idx_range| { - Mask::from_indices( - range_len, - indices[idx_range] - .iter() - .map(|idx| { - idx.checked_sub(range.start).unwrap_or_else(|| { - vortex_panic!( - "index underflow, range: {:?}, idx: {:?}", - range, - idx - ) - }) - }) - .filter_map(|idx| usize::try_from(idx).ok()), - ) - }) - .unwrap_or_else(|| Mask::new_false(range_len)); + let mask = Selection::IncludeByIndex(exclude.clone()) + .row_mask(range) + .mask() + .clone(); RowMask::new(range.start, mask.not()) } Selection::IncludeRoaring(roaring) => { @@ -270,24 +156,6 @@ impl Selection { } } -fn validate_indices(indices: &[u64]) -> VortexResult<()> { - // Row-mask extraction uses binary search over these indices, and row_count treats - // them as set membership. 
Unsorted or duplicate input can otherwise silently - // mis-select rows or over-report the selected row count. - for (idx, window) in indices.windows(2).enumerate() { - if window[0] >= window[1] { - vortex_bail!( - "row index selection must be strictly increasing at positions {} and {}: {} >= {}", - idx, - idx + 1, - window[0], - window[1] - ); - } - } - Ok(()) -} - /// Find the positional range within row_indices that covers all rows in the given range. fn indices_range(range: &Range, row_indices: &[u64]) -> Option> { if row_indices.first().is_some_and(|&first| first >= range.end) @@ -309,39 +177,9 @@ fn indices_range(range: &Range, row_indices: &[u64]) -> Option mod tests { use vortex_buffer::Buffer; - use super::Selection; - - fn include(indices: impl IntoIterator) -> Selection { - Selection::include_by_index(Buffer::from_iter(indices)) - .expect("test indices should be strictly increasing") - } - - fn exclude(indices: impl IntoIterator) -> Selection { - Selection::exclude_by_index(Buffer::from_iter(indices)) - .expect("test indices should be strictly increasing") - } - - #[test] - fn include_by_index_rejects_unsorted_indices() { - let err = Selection::include_by_index(Buffer::from_iter([3, 1])).unwrap_err(); - assert!(err.to_string().contains("strictly increasing")); - } - - #[test] - fn include_by_index_rejects_duplicate_indices() { - let err = Selection::include_by_index(Buffer::from_iter([1, 1])).unwrap_err(); - assert!(err.to_string().contains("strictly increasing")); - } - - #[test] - fn exclude_by_index_rejects_unsorted_indices() { - let err = Selection::exclude_by_index(Buffer::from_iter([3, 1])).unwrap_err(); - assert!(err.to_string().contains("strictly increasing")); - } - #[test] fn test_row_mask_all() { - let selection = include([1, 3, 5, 7]); + let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); let range = 1..8; let row_mask = selection.row_mask(&range); @@ -350,7 +188,7 @@ mod tests { #[test] fn 
test_row_mask_slice() { - let selection = include([1, 3, 5, 7]); + let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); let range = 3..6; let row_mask = selection.row_mask(&range); @@ -359,7 +197,7 @@ mod tests { #[test] fn test_row_mask_exclusive() { - let selection = include([1, 3, 5, 7]); + let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); let range = 3..5; let row_mask = selection.row_mask(&range); @@ -368,7 +206,7 @@ mod tests { #[test] fn test_row_mask_all_false() { - let selection = include([1, 3, 5, 7]); + let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 5, 7])); let range = 8..10; let row_mask = selection.row_mask(&range); @@ -377,7 +215,7 @@ mod tests { #[test] fn test_row_mask_all_true() { - let selection = include([1, 3, 4, 5, 6]); + let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![1, 3, 4, 5, 6])); let range = 3..7; let row_mask = selection.row_mask(&range); @@ -386,7 +224,7 @@ mod tests { #[test] fn test_row_mask_zero() { - let selection = include([0]); + let selection = super::Selection::IncludeByIndex(Buffer::from_iter(vec![0])); let range = 0..5; let row_mask = selection.row_mask(&range); @@ -406,7 +244,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = Selection::IncludeRoaring(roaring); + let selection = super::super::Selection::IncludeRoaring(roaring); let range = 1..8; let row_mask = selection.row_mask(&range); @@ -421,7 +259,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = Selection::IncludeRoaring(roaring); + let selection = super::super::Selection::IncludeRoaring(roaring); let range = 3..6; let row_mask = selection.row_mask(&range); @@ -436,7 +274,7 @@ mod tests { roaring.insert(5); roaring.insert(7); - let selection = Selection::IncludeRoaring(roaring); + let selection = super::super::Selection::IncludeRoaring(roaring); let range = 8..10; let row_mask = 
selection.row_mask(&range); @@ -451,7 +289,7 @@ mod tests { roaring.insert(i); } - let selection = Selection::IncludeRoaring(roaring); + let selection = super::super::Selection::IncludeRoaring(roaring); let range = 1000..2000; let row_mask = selection.row_mask(&range); @@ -466,7 +304,7 @@ mod tests { roaring.insert(3); roaring.insert(5); - let selection = Selection::ExcludeRoaring(roaring); + let selection = super::super::Selection::ExcludeRoaring(roaring); let range = 0..7; let row_mask = selection.row_mask(&range); @@ -482,7 +320,7 @@ mod tests { roaring.insert(i); } - let selection = Selection::ExcludeRoaring(roaring); + let selection = super::super::Selection::ExcludeRoaring(roaring); let range = 10..20; let row_mask = selection.row_mask(&range); @@ -495,7 +333,7 @@ mod tests { roaring.insert(100); roaring.insert(101); - let selection = Selection::ExcludeRoaring(roaring); + let selection = super::super::Selection::ExcludeRoaring(roaring); let range = 0..10; let row_mask = selection.row_mask(&range); @@ -511,7 +349,7 @@ mod tests { roaring.insert(7); roaring.insert(15); // Outside range - let selection = Selection::ExcludeRoaring(roaring); + let selection = super::super::Selection::ExcludeRoaring(roaring); let range = 5..10; let row_mask = selection.row_mask(&range); @@ -522,7 +360,7 @@ mod tests { #[test] fn test_roaring_include_empty() { let roaring = RoaringTreemap::new(); - let selection = Selection::IncludeRoaring(roaring); + let selection = super::super::Selection::IncludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -532,7 +370,7 @@ mod tests { #[test] fn test_roaring_exclude_empty() { let roaring = RoaringTreemap::new(); - let selection = Selection::ExcludeRoaring(roaring); + let selection = super::super::Selection::ExcludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -545,7 +383,7 @@ mod tests { roaring.insert(0); roaring.insert(99); - let selection = 
Selection::IncludeRoaring(roaring); + let selection = super::super::Selection::IncludeRoaring(roaring); let range = 0..100; let row_mask = selection.row_mask(&range); @@ -559,7 +397,7 @@ mod tests { roaring.insert_range(10..20); roaring.insert_range(30..40); - let selection = Selection::IncludeRoaring(roaring); + let selection = super::super::Selection::IncludeRoaring(roaring); let range = 15..35; let row_mask = selection.row_mask(&range); @@ -575,7 +413,7 @@ mod tests { roaring.insert(u64::MAX - 1); roaring.insert(u64::MAX); - let selection = Selection::IncludeRoaring(roaring); + let selection = super::super::Selection::IncludeRoaring(roaring); let range = u64::MAX - 10..u64::MAX; let row_mask = selection.row_mask(&range); @@ -588,7 +426,7 @@ mod tests { let mut roaring = RoaringTreemap::new(); roaring.insert(u64::MAX - 1); - let selection = Selection::ExcludeRoaring(roaring); + let selection = super::super::Selection::ExcludeRoaring(roaring); let range = u64::MAX - 10..u64::MAX; let row_mask = selection.row_mask(&range); @@ -601,13 +439,14 @@ mod tests { // Test that RoaringTreemap and Buffer produce same results let indices = vec![1, 3, 5, 7, 9]; - let buffer_selection = include(indices.clone()); + let buffer_selection = + super::super::Selection::IncludeByIndex(Buffer::from_iter(indices.clone())); let mut roaring = RoaringTreemap::new(); for idx in &indices { roaring.insert(*idx); } - let roaring_selection = Selection::IncludeRoaring(roaring); + let roaring_selection = super::super::Selection::IncludeRoaring(roaring); let range = 0..12; let buffer_mask = buffer_selection.row_mask(&range); @@ -624,13 +463,14 @@ mod tests { // Test that ExcludeRoaring and ExcludeByIndex produce same results let indices = vec![2, 4, 6, 8]; - let buffer_selection = exclude(indices.clone()); + let buffer_selection = + super::super::Selection::ExcludeByIndex(Buffer::from_iter(indices.clone())); let mut roaring = RoaringTreemap::new(); for idx in &indices { roaring.insert(*idx); } - 
let roaring_selection = Selection::ExcludeRoaring(roaring); + let roaring_selection = super::super::Selection::ExcludeRoaring(roaring); let range = 0..10; let buffer_mask = buffer_selection.row_mask(&range); From 97042b645ffd5836892741acd3fb8c9ac80b32fd Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 12:04:42 -0400 Subject: [PATCH 41/48] Use session default scan scheduler Signed-off-by: "Nicholas Gates" --- vortex-datafusion/src/v2/source.rs | 28 ---------------------------- vortex-datafusion/src/v2/table.rs | 26 -------------------------- vortex-duckdb/src/table_function.rs | 1 - vortex-ffi/src/scan.rs | 1 - vortex-file/src/multi/scan_v2.rs | 15 +++------------ vortex-jni/src/scan.rs | 1 - vortex-scan/src/lib.rs | 5 ----- 7 files changed, 3 insertions(+), 74 deletions(-) diff --git a/vortex-datafusion/src/v2/source.rs b/vortex-datafusion/src/v2/source.rs index de624d54376..b7ecd0aafb9 100644 --- a/vortex-datafusion/src/v2/source.rs +++ b/vortex-datafusion/src/v2/source.rs @@ -120,9 +120,6 @@ use vortex::metrics::MetricsRegistry; use vortex::scan::DataSourceRef; use vortex::scan::PlannedMorselScanRef; use vortex::scan::ScanRequest; -use vortex::scan::ScanScheduler; -use vortex::scan::ScanSchedulerConfig; -use vortex::scan::ScanSchedulerProvider; use vortex::session::VortexSession; use vortex_utils::parallelism::get_available_parallelism; @@ -179,7 +176,6 @@ pub struct VortexDataSourceBuilder { arrow_schema: Option, projection: Option>, metrics_registry: Option>, - scheduler_provider: Option>, } impl VortexDataSourceBuilder { @@ -219,24 +215,6 @@ impl VortexDataSourceBuilder { self } - /// Configures a shared scan scheduler for scans from this DataFusion source. - pub fn with_scan_scheduler(mut self, scheduler: Arc) -> Self { - self.scheduler_provider = Some(Arc::new(ScanSchedulerProvider::Shared(scheduler))); - self - } - - /// Configures the scheduler ownership strategy for scans from this DataFusion source. 
- pub fn with_scan_scheduler_provider(mut self, provider: Arc) -> Self { - self.scheduler_provider = Some(provider); - self - } - - /// Configures this source to create a new scan scheduler for each Vortex scan. - pub fn with_new_scan_scheduler_per_query(mut self, config: ScanSchedulerConfig) -> Self { - self.scheduler_provider = Some(Arc::new(ScanSchedulerProvider::PerScan(config))); - self - } - /// Builds the [`VortexDataSource`]. /// /// The builder eagerly resolves statistics for the initial projection @@ -315,7 +293,6 @@ impl VortexDataSourceBuilder { ordered: false, num_partitions: get_available_parallelism().unwrap_or(1), metrics_registry: self.metrics_registry, - scheduler_provider: self.scheduler_provider, morsel_plan: Arc::new(OnceCell::new()), }) } @@ -330,7 +307,6 @@ impl VortexDataSource { arrow_schema: None, projection: None, metrics_registry: None, - scheduler_provider: None, } } @@ -427,9 +403,6 @@ pub struct VortexDataSource { /// Optional Vortex metrics registry populated by the wrapped source. metrics_registry: Option>, - /// Optional scheduler provider passed through the Vortex [`ScanRequest`]. - scheduler_provider: Option>, - /// Shared planned scan for DataFusion morsel repartitioning. 
morsel_plan: Arc>>, } @@ -495,7 +468,6 @@ impl DataSource for VortexDataSource { filter: self.filter.clone(), limit: self.limit.map(|l| u64::try_from(l).unwrap_or(u64::MAX)), ordered: self.ordered, - scheduler_provider: self.scheduler_provider.clone(), ..Default::default() }; diff --git a/vortex-datafusion/src/v2/table.rs b/vortex-datafusion/src/v2/table.rs index 64beee9cbf3..b46e995afe1 100644 --- a/vortex-datafusion/src/v2/table.rs +++ b/vortex-datafusion/src/v2/table.rs @@ -26,9 +26,6 @@ use datafusion_physical_plan::ExecutionPlan; use vortex::expr::stats::Precision as VortexPrecision; use vortex::metrics::MetricsRegistry; use vortex::scan::DataSourceRef; -use vortex::scan::ScanScheduler; -use vortex::scan::ScanSchedulerConfig; -use vortex::scan::ScanSchedulerProvider; use vortex::session::VortexSession; use crate::v2::source::VortexDataSource; @@ -81,7 +78,6 @@ pub struct VortexTable { session: VortexSession, arrow_schema: SchemaRef, metrics_registry: Option>, - scheduler_provider: Option>, } impl fmt::Debug for VortexTable { @@ -107,7 +103,6 @@ impl VortexTable { session, arrow_schema, metrics_registry: None, - scheduler_provider: None, } } @@ -119,24 +114,6 @@ impl VortexTable { self.metrics_registry = Some(metrics_registry); self } - - /// Configures a shared scan scheduler for scans from this table. - pub fn with_scan_scheduler(mut self, scheduler: Arc) -> Self { - self.scheduler_provider = Some(Arc::new(ScanSchedulerProvider::Shared(scheduler))); - self - } - - /// Configures the scheduler ownership strategy for scans from this table. - pub fn with_scan_scheduler_provider(mut self, provider: Arc) -> Self { - self.scheduler_provider = Some(provider); - self - } - - /// Configures this table to create a new scan scheduler for each Vortex scan. 
- pub fn with_new_scan_scheduler_per_query(mut self, config: ScanSchedulerConfig) -> Self { - self.scheduler_provider = Some(Arc::new(ScanSchedulerProvider::PerScan(config))); - self - } } #[async_trait] @@ -162,9 +139,6 @@ impl TableProvider for VortexTable { if let Some(metrics_registry) = &self.metrics_registry { builder = builder.with_metrics_registry(Arc::clone(metrics_registry)); } - if let Some(provider) = &self.scheduler_provider { - builder = builder.with_scan_scheduler_provider(Arc::clone(provider)); - } let data_source = builder .with_arrow_schema(Arc::clone(&self.arrow_schema)) // We push down the projection now since it can make building the physical plan a lot diff --git a/vortex-duckdb/src/table_function.rs b/vortex-duckdb/src/table_function.rs index 328f2ae7a8b..ebffd004678 100644 --- a/vortex-duckdb/src/table_function.rs +++ b/vortex-duckdb/src/table_function.rs @@ -247,7 +247,6 @@ pub fn init_global(init_input: &TableInitInput) -> VortexResult VortexResult { limit, partition_selection: Selection::All, partition_range: None, - scheduler_provider: None, }) } diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index c351b129ae4..4c7dc3e675b 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -659,10 +659,7 @@ impl DataSource for ScanPlanDataSource { let meta = ScanMeta { label: Some("scan2".to_string()), }; - let provider = scan_request - .scheduler_provider - .clone() - .unwrap_or_else(|| self.session.scan_scheduler_provider()); + let provider = self.session.scan_scheduler_provider(); let scheduler = provider.scheduler_for_scan(&meta); let mut planned_files = Vec::new(); @@ -717,10 +714,7 @@ impl DataSource for ScanPlanDataSource { let meta = ScanMeta { label: Some("scan2".to_string()), }; - let provider = scan_request - .scheduler_provider - .clone() - .unwrap_or_else(|| self.session.scan_scheduler_provider()); + let provider = self.session.scan_scheduler_provider(); let scheduler = 
provider.scheduler_for_scan(&meta); let mut ready = VecDeque::new(); @@ -923,10 +917,7 @@ pub(crate) fn scan_plan_file_stream( let meta = ScanMeta { label: Some("scan2".to_string()), }; - let provider = request - .scheduler_provider - .clone() - .unwrap_or_else(|| file.session().scan_scheduler_provider()); + let provider = file.session().scan_scheduler_provider(); let scheduler = provider.scheduler_for_scan(&meta); let limit_remaining = request.limit.map(AtomicU64::new).map(Arc::new); diff --git a/vortex-jni/src/scan.rs b/vortex-jni/src/scan.rs index 360d92aa989..606ec8cd040 100644 --- a/vortex-jni/src/scan.rs +++ b/vortex-jni/src/scan.rs @@ -119,7 +119,6 @@ fn build_scan_request( limit, partition_selection: Selection::All, partition_range: None, - scheduler_provider: None, }) } diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index 70c24e45ab0..9b50bdb7674 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -227,10 +227,6 @@ pub struct ScanRequest { /// Optional limit on the number of rows returned by scan. Limits are applied after all /// filtering and row selection. pub limit: Option, - /// Optional scheduler provider override for this scan. - /// - /// When absent, a data source should use the provider configured on its [`VortexSession`]. 
- pub scheduler_provider: Option>, } impl Default for ScanRequest { @@ -244,7 +240,6 @@ impl Default for ScanRequest { ordered: false, limit: None, partition_range: None, - scheduler_provider: None, } } } From 7d285c15264cdb3cda8ab0ee44e08f3130045439 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 12:07:41 -0400 Subject: [PATCH 42/48] slqlogictest Signed-off-by: Nicholas Gates --- vortex-sqllogictest/slt/datafusion/sink.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-sqllogictest/slt/datafusion/sink.slt b/vortex-sqllogictest/slt/datafusion/sink.slt index b0b1b8c31a7..7ce71590919 100644 --- a/vortex-sqllogictest/slt/datafusion/sink.slt +++ b/vortex-sqllogictest/slt/datafusion/sink.slt @@ -5,7 +5,7 @@ include ../setup.slt.no query TT EXPLAIN -COPY (SELECT * FROM (VALUES (1), (42), (100), (-5), (0)) AS t(num)) +COPY (SELECT * FROM (VALUES (1), (42), (100), (-5), (0)) AS t(num)) TO '$__TEST_DIR__/datafusion/sink/data1.vortex' STORED AS VORTEX; ---- From 2f37a1b80c43be0a03886519e5d57a86ccad3563 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 12:26:30 -0400 Subject: [PATCH 43/48] Remove DuckDB max cardinality from layout27 Signed-off-by: "Nicholas Gates" --- .../cpp/include/duckdb_vx/table_function.h | 2 -- vortex-duckdb/cpp/table_function.cpp | 3 +-- vortex-duckdb/src/ffi.rs | 6 ------ vortex-duckdb/src/table_function.rs | 18 +++++------------- 4 files changed, 6 insertions(+), 23 deletions(-) diff --git a/vortex-duckdb/cpp/include/duckdb_vx/table_function.h b/vortex-duckdb/cpp/include/duckdb_vx/table_function.h index 0f87ceb3d88..550e1cf3635 100644 --- a/vortex-duckdb/cpp/include/duckdb_vx/table_function.h +++ b/vortex-duckdb/cpp/include/duckdb_vx/table_function.h @@ -86,8 +86,6 @@ typedef struct { typedef struct { idx_t estimated_cardinality; bool has_estimated_cardinality; - idx_t max_cardinality; - bool has_max_cardinality; } duckdb_vx_node_statistics; typedef struct { diff --git 
a/vortex-duckdb/cpp/table_function.cpp b/vortex-duckdb/cpp/table_function.cpp index c8aaa5cc5af..f18557a2d11 100644 --- a/vortex-duckdb/cpp/table_function.cpp +++ b/vortex-duckdb/cpp/table_function.cpp @@ -298,8 +298,7 @@ unique_ptr c_cardinality(ClientContext &, const FunctionData *bi auto out = make_uniq(); out->has_estimated_cardinality = stats.has_estimated_cardinality; out->estimated_cardinality = stats.estimated_cardinality; - out->has_max_cardinality = stats.has_max_cardinality; - out->max_cardinality = stats.max_cardinality; + out->has_max_cardinality = false; return out; } diff --git a/vortex-duckdb/src/ffi.rs b/vortex-duckdb/src/ffi.rs index 880c3f7f01c..a07ae3b17a4 100644 --- a/vortex-duckdb/src/ffi.rs +++ b/vortex-duckdb/src/ffi.rs @@ -172,12 +172,6 @@ pub unsafe extern "C-unwind" fn duckdb_table_function_cardinality( match cardinality(bind_data) { Cardinality::Unknown => {} - Cardinality::Exact(c) => { - node_stats.has_estimated_cardinality = true; - node_stats.estimated_cardinality = c as _; - node_stats.has_max_cardinality = true; - node_stats.max_cardinality = c as _; - } Cardinality::Estimate(c) => { node_stats.has_estimated_cardinality = true; node_stats.estimated_cardinality = c as _; diff --git a/vortex-duckdb/src/table_function.rs b/vortex-duckdb/src/table_function.rs index ebffd004678..3b60da0856f 100644 --- a/vortex-duckdb/src/table_function.rs +++ b/vortex-duckdb/src/table_function.rs @@ -152,8 +152,6 @@ pub struct PartitionData { pub enum Cardinality { /// Unknown number of rows Unknown, - /// The exact number of rows. - Exact(u64), /// An estimate of the number of rows. Estimate(u64), } @@ -575,18 +573,12 @@ fn column_statistics_aggregate( /// here. 
const DEFAULT_SELECTIVITY: f64 = 0.2; pub fn cardinality(bind_data: &TableFunctionBind) -> Cardinality { - let has_non_optional_filter = bind_data.has_non_optional_filter.load(Ordering::Relaxed); match bind_data.data_source.row_count() { - Precision::Exact(v) => { - if !has_non_optional_filter { - return Cardinality::Exact(v); - } - let post_cardinality = v as f64 * DEFAULT_SELECTIVITY; - let post_cardinality: u64 = post_cardinality.as_(); - Cardinality::Estimate(max(1, post_cardinality)) - } - Precision::Inexact(v) => { - if !has_non_optional_filter { + Precision::Exact(v) | Precision::Inexact(v) => { + if !bind_data.has_non_optional_filter.load(Ordering::Relaxed) { + // Although we may have an exact upper bound here, reporting + // it as exact has a negative performance impact on tpcds as + // it's not a real post-filter calculation. return Cardinality::Estimate(v); } let post_cardinality = v as f64 * DEFAULT_SELECTIVITY; From fb0d070d4b508ef3b3d5297c6b061c9fb5e49ad9 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 12:31:05 -0400 Subject: [PATCH 44/48] Remove varbinview compaction change from layout27 Signed-off-by: Nicholas Gates --- vortex-array/src/arrays/varbinview/compact.rs | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/vortex-array/src/arrays/varbinview/compact.rs b/vortex-array/src/arrays/varbinview/compact.rs index fc3f3477ffc..7dd29c4edeb 100644 --- a/vortex-array/src/arrays/varbinview/compact.rs +++ b/vortex-array/src/arrays/varbinview/compact.rs @@ -18,9 +18,6 @@ use crate::arrays::varbinview::Ref; use crate::builders::ArrayBuilder; use crate::builders::VarBinViewBuilder; -const DEFAULT_COMPACTION_THRESHOLD: f64 = 0.5; -const MIN_RETAINED_BYTES_PER_ROW_TO_CHECK_COMPACTION: u64 = 128; - impl VarBinViewArray { /// Returns a compacted copy of the input array, where all wasted space has been cleaned up. 
This /// operation can be very expensive, in the worst case copying all existing string data into @@ -36,7 +33,8 @@ impl VarBinViewArray { return Ok(self.clone()); } - self.compact_with_threshold(DEFAULT_COMPACTION_THRESHOLD) + // Use selective compaction with threshold of 1.0 (compact any buffer with any waste) + self.compact_with_threshold(1.0) } fn should_compact(&self) -> VortexResult { @@ -52,18 +50,12 @@ impl VarBinViewArray { return Ok(true); } + let bytes_referenced: u64 = self.count_referenced_bytes()?; let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum(); - if buffer_total_bytes == 0 { - return Ok(true); - } - let len = u64::try_from(self.len()).unwrap_or(u64::MAX); - if len > 0 && buffer_total_bytes / len <= MIN_RETAINED_BYTES_PER_ROW_TO_CHECK_COMPACTION { - return Ok(false); - } - - let bytes_referenced: u64 = self.count_referenced_bytes()?; - Ok((bytes_referenced as f64 / buffer_total_bytes as f64) < DEFAULT_COMPACTION_THRESHOLD) + // If there is any wasted space, we want to repack. + // This is very aggressive. 
+ Ok(bytes_referenced < buffer_total_bytes || buffer_total_bytes == 0) } /// Iterates over all valid, non-inlined views, calling the provided @@ -272,7 +264,8 @@ mod tests { .execute::(&mut array_session().create_execution_ctx()) .unwrap(); - let optimized_array = taken_array.compact_with_threshold(1.0).unwrap(); + // Optimize the taken array + let optimized_array = taken_array.compact_buffers().unwrap(); // The optimized array should have exactly 1 buffer (consolidated) assert_eq!(optimized_array.data_buffers().len(), 1); From f4d8fa0feb21ef06d8a34ad42bd7e6c87431c9c4 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 12:41:28 -0400 Subject: [PATCH 45/48] Remove Arrow export compaction change from layout27 Signed-off-by: Nicholas Gates --- vortex-array/src/arrow/executor/byte.rs | 26 -------------------- vortex-array/src/arrow/executor/byte_view.rs | 19 +++++++++++--- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/vortex-array/src/arrow/executor/byte.rs b/vortex-array/src/arrow/executor/byte.rs index 2db9d3e6494..2be7afc29c3 100644 --- a/vortex-array/src/arrow/executor/byte.rs +++ b/vortex-array/src/arrow/executor/byte.rs @@ -80,11 +80,8 @@ mod tests { use arrow_array::cast::AsArray; use arrow_schema::DataType; use rstest::rstest; - use vortex_error::VortexResult; - use vortex_mask::Mask; use crate::IntoArray; - use crate::LEGACY_SESSION; use crate::VortexSessionExecute; use crate::array_session; use crate::arrow::ArrowArrayExecutor; @@ -182,27 +179,4 @@ mod tests { assert!(arrow.is_null(1)); assert!(!arrow.is_null(2)); } - - #[test] - fn filtered_utf8_view_export_does_not_retain_unselected_buffers() -> VortexResult<()> { - let unselected = "x".repeat(1 << 20); - let array = - VarBinViewArray::from_iter_str(["selected", unselected.as_str(), unselected.as_str()]); - let filtered = array - .into_array() - .filter(Mask::from_iter([true, false, false]))?; - - let arrow = filtered.execute_arrow( - Some(&DataType::Utf8View), - &mut 
LEGACY_SESSION.create_execution_ctx(), - )?; - - assert_eq!(arrow.as_string_view().value(0), "selected"); - assert!( - arrow.get_array_memory_size() < unselected.len(), - "filtered export retained unselected payload: {} bytes", - arrow.get_array_memory_size() - ); - Ok(()) - } } diff --git a/vortex-array/src/arrow/executor/byte_view.rs b/vortex-array/src/arrow/executor/byte_view.rs index 89f37681220..b88b1895d53 100644 --- a/vortex-array/src/arrow/executor/byte_view.rs +++ b/vortex-array/src/arrow/executor/byte_view.rs @@ -12,6 +12,7 @@ use vortex_error::VortexResult; use crate::ArrayRef; use crate::ExecutionCtx; use crate::arrays::VarBinViewArray; +use crate::arrow::executor::validity::to_arrow_null_buffer; use crate::arrow::null_buffer::to_null_buffer; use crate::builtins::ArrayBuiltins; use crate::dtype::DType; @@ -47,8 +48,19 @@ pub fn execute_varbinview_to_arrow( array: &VarBinViewArray, ctx: &mut ExecutionCtx, ) -> VortexResult { - let compacted = array.compact_buffers()?; - canonical_varbinview_to_arrow::(&compacted, ctx) + let views = + ScalarBuffer::::from(array.views_handle().as_host().clone().into_arrow_buffer()); + let buffers: Vec<_> = array + .data_buffers() + .iter() + .map(|buffer| buffer.as_host().clone().into_arrow_buffer()) + .collect(); + let nulls = to_arrow_null_buffer(array.validity()?, array.len(), ctx)?; + + // SAFETY: our own VarBinView array is considered safe. + Ok(Arc::new(unsafe { + GenericByteViewArray::::new_unchecked(views, buffers, nulls) + })) } pub(super) fn to_arrow_byte_view( @@ -61,7 +73,6 @@ pub(super) fn to_arrow_byte_view( // flexible since there's no prescribed nullability in Arrow types. 
let array = array.cast(DType::from_arrow((&T::DATA_TYPE, Nullability::Nullable)))?; - let array = array.execute::(ctx)?; let varbinview = array.execute::(ctx)?; - execute_varbinview_to_arrow::(&varbinview, ctx) + canonical_varbinview_to_arrow::(&varbinview, ctx) } From 62d88dc6f5903467e4715be55415881dc46719a8 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 14:51:05 -0400 Subject: [PATCH 46/48] Move scan plan API into vortex-scan Signed-off-by: Nicholas Gates --- .../extending/writing-a-layout.md | 4 +- vortex-file/src/file.rs | 6 +- vortex-file/src/multi/scan_v2.rs | 58 ++++++++--------- vortex-layout/src/layout_v2.rs | 4 +- vortex-layout/src/layouts_v2/chunked.rs | 4 +- vortex-layout/src/layouts_v2/dict.rs | 4 +- vortex-layout/src/layouts_v2/flat.rs | 4 +- vortex-layout/src/layouts_v2/struct_.rs | 4 +- vortex-layout/src/layouts_v2/zoned.rs | 4 +- vortex-layout/src/scan/mod.rs | 1 - vortex-layout/src/scan/v2/layouts/chunked.rs | 62 +++++++++---------- vortex-layout/src/scan/v2/layouts/dict.rs | 36 +++++------ vortex-layout/src/scan/v2/layouts/flat.rs | 34 +++++----- vortex-layout/src/scan/v2/layouts/struct_.rs | 24 +++---- vortex-layout/src/scan/v2/layouts/zoned.rs | 60 +++++++++--------- vortex-layout/src/scan/v2/mod.rs | 8 +-- vortex-layout/src/scan/v2/row_idx.rs | 32 +++++----- vortex-scan/src/lib.rs | 1 + .../scan => vortex-scan/src}/plan/evidence.rs | 0 .../src/scan => vortex-scan/src}/plan/mod.rs | 14 +++-- .../scan => vortex-scan/src}/plan/request.rs | 0 21 files changed, 182 insertions(+), 182 deletions(-) rename {vortex-layout/src/scan => vortex-scan/src}/plan/evidence.rs (100%) rename {vortex-layout/src/scan => vortex-scan/src}/plan/mod.rs (99%) rename {vortex-layout/src/scan => vortex-scan/src}/plan/request.rs (100%) diff --git a/docs/developer-guide/extending/writing-a-layout.md b/docs/developer-guide/extending/writing-a-layout.md index 1221fa22ee6..738061a7e8c 100644 --- a/docs/developer-guide/extending/writing-a-layout.md +++ 
b/docs/developer-guide/extending/writing-a-layout.md @@ -23,8 +23,8 @@ layout-specific `LayoutData`. ```rust use vortex_layout::layout_v2; use vortex_layout::{LayoutChildType, LayoutId}; -use vortex_layout::scan::plan::ScanPlanRef; -use vortex_layout::scan::plan::request::ScanRequest; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; use vortex_session::VortexSession; #[derive(Clone, Debug)] diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index 0ea069a24e8..fa4afdaa167 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -22,15 +22,15 @@ use vortex_array::stream::SendableArrayStream; use vortex_error::VortexResult; use vortex_layout::LayoutReader; use vortex_layout::scan::layout::LayoutReaderDataSource; -use vortex_layout::scan::plan::PreparedStateCache; -use vortex_layout::scan::plan::PreparedStateCacheRef; -use vortex_layout::scan::plan::ScanPlanRef; use vortex_layout::scan::scan_builder::ScanBuilder; use vortex_layout::scan::split_by::SplitBy; use vortex_layout::segments::SegmentFutureCache; use vortex_layout::segments::SegmentSource; use vortex_scan::DataSourceRef; use vortex_scan::ScanRequest; +use vortex_scan::plan::PreparedStateCache; +use vortex_scan::plan::PreparedStateCacheRef; +use vortex_scan::plan::ScanPlanRef; use vortex_session::VortexSession; use crate::FileStatistics; diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 4c7dc3e675b..383bba52ba0 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -49,37 +49,8 @@ use vortex_io::filesystem::FileSystemRef; use vortex_io::runtime::Handle; use vortex_io::session::RuntimeSessionExt; use vortex_layout::layout_v2::LayoutScanPlanCtx; -use vortex_layout::scan::plan::EvidenceScope; -use vortex_layout::scan::plan::OwnedRowScope; -use vortex_layout::scan::plan::PrepareCtx; -use vortex_layout::scan::plan::PreparedAggregateRef; -use vortex_layout::scan::plan::PreparedEvidenceRef; 
-use vortex_layout::scan::plan::PreparedReadRef; -use vortex_layout::scan::plan::PreparedStats; -use vortex_layout::scan::plan::PreparedStatsRef; -use vortex_layout::scan::plan::PushCtx; -use vortex_layout::scan::plan::ReadContext; -use vortex_layout::scan::plan::ReadStep; -use vortex_layout::scan::plan::ReadTask; -use vortex_layout::scan::plan::ReadTaskOutput; -use vortex_layout::scan::plan::ScanPlan; -use vortex_layout::scan::plan::ScanPlanRef; -use vortex_layout::scan::plan::ScanState; -use vortex_layout::scan::plan::ScanStateRef; -use vortex_layout::scan::plan::StateCtx; -use vortex_layout::scan::plan::downcast_state; -use vortex_layout::scan::plan::evidence::EvidenceFragment; -use vortex_layout::scan::plan::evidence::PredicateEvidence; -use vortex_layout::scan::plan::evidence::PredicateEvidenceKind; -use vortex_layout::scan::plan::evidence::PredicateId; -use vortex_layout::scan::plan::evidence::PredicateVersion; -use vortex_layout::scan::plan::request::EvidenceMode; -use vortex_layout::scan::plan::request::OwnedEvidenceRequest; -use vortex_layout::scan::plan::request::ScanRequest; use vortex_layout::scan::v2::validate_temporal_comparisons; use vortex_layout::scan::v2::with_row_idx; -use vortex_layout::segments::ScanIoPhase; -use vortex_layout::segments::ScanRead; use vortex_mask::Mask; use vortex_metrics::MetricsRegistry; use vortex_scan::DataSource; @@ -94,9 +65,38 @@ use vortex_scan::ScanMeta; use vortex_scan::ScanRequest as DataSourceScanRequest; use vortex_scan::ScanScheduler; use vortex_scan::ScanSchedulerSessionExt; +use vortex_scan::plan::EvidenceScope; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedAggregateRef; +use vortex_scan::plan::PreparedEvidenceRef; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStats; +use vortex_scan::plan::PreparedStatsRef; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; +use vortex_scan::plan::ReadStep; +use 
vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::downcast_state; +use vortex_scan::plan::evidence::EvidenceFragment; +use vortex_scan::plan::evidence::PredicateEvidence; +use vortex_scan::plan::evidence::PredicateEvidenceKind; +use vortex_scan::plan::evidence::PredicateId; +use vortex_scan::plan::evidence::PredicateVersion; +use vortex_scan::plan::request::EvidenceMode; +use vortex_scan::plan::request::OwnedEvidenceRequest; +use vortex_scan::plan::request::ScanRequest; use vortex_scan::read::ReadResults; use vortex_scan::read::ReadStore; use vortex_scan::read::ReadStoreRef; +use vortex_scan::read::ScanIoPhase; +use vortex_scan::read::ScanRead; use vortex_scan::selection::Selection; use vortex_scan::task::ScanStep; use vortex_scan::task::ScanStepResult; diff --git a/vortex-layout/src/layout_v2.rs b/vortex-layout/src/layout_v2.rs index 006f5563d78..994767cca7e 100644 --- a/vortex-layout/src/layout_v2.rs +++ b/vortex-layout/src/layout_v2.rs @@ -19,14 +19,14 @@ use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_flatbuffers::FlatBuffer; use vortex_flatbuffers::layout; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; use vortex_session::VortexSession; use vortex_session::registry::ReadContext; use vortex_session::registry::Registry; use crate::LayoutChildType; use crate::LayoutId; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::request::ScanRequest; use crate::segments::SegmentFutureCache; use crate::segments::SegmentId; use crate::segments::SegmentSource; diff --git a/vortex-layout/src/layouts_v2/chunked.rs b/vortex-layout/src/layouts_v2/chunked.rs index 373f99911c8..e4af2652364 100644 --- a/vortex-layout/src/layouts_v2/chunked.rs +++ b/vortex-layout/src/layouts_v2/chunked.rs @@ -8,6 
+8,8 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_error::vortex_err; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; use crate::LayoutChildType; use crate::LayoutId; @@ -15,8 +17,6 @@ use crate::layout_v2::Layout; use crate::layout_v2::LayoutDeserializeArgs; use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::request::ScanRequest; use crate::scan::v2::layouts::chunked as scan_chunked; /// V2 chunked layout vtable. diff --git a/vortex-layout/src/layouts_v2/dict.rs b/vortex-layout/src/layouts_v2/dict.rs index a7b14fdb11f..a3cb2234a07 100644 --- a/vortex-layout/src/layouts_v2/dict.rs +++ b/vortex-layout/src/layouts_v2/dict.rs @@ -7,6 +7,8 @@ use vortex_array::dtype::PType; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; use crate::LayoutChildType; use crate::LayoutId; @@ -16,8 +18,6 @@ use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; use crate::layout_v2::metadata_bool_field; use crate::layout_v2::metadata_varint_field; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::request::ScanRequest; use crate::scan::v2::layouts::dict as scan_dict; /// V2 dictionary layout vtable. 
diff --git a/vortex-layout/src/layouts_v2/flat.rs b/vortex-layout/src/layouts_v2/flat.rs index 061c256063c..12f02850f2c 100644 --- a/vortex-layout/src/layouts_v2/flat.rs +++ b/vortex-layout/src/layouts_v2/flat.rs @@ -6,6 +6,8 @@ use vortex_buffer::ByteBuffer; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_ensure; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; use vortex_session::registry::ReadContext; use crate::LayoutChildType; @@ -15,8 +17,6 @@ use crate::layout_v2::LayoutDeserializeArgs; use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; use crate::layout_v2::metadata_bytes_field; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::request::ScanRequest; use crate::scan::v2::layouts::flat as scan_flat; use crate::segments::SegmentId; diff --git a/vortex-layout/src/layouts_v2/struct_.rs b/vortex-layout/src/layouts_v2/struct_.rs index c512856e03b..077d4c330b5 100644 --- a/vortex-layout/src/layouts_v2/struct_.rs +++ b/vortex-layout/src/layouts_v2/struct_.rs @@ -7,6 +7,8 @@ use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_error::VortexResult; use vortex_error::vortex_err; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; use crate::LayoutChildType; use crate::LayoutId; @@ -14,8 +16,6 @@ use crate::layout_v2::Layout; use crate::layout_v2::LayoutDeserializeArgs; use crate::layout_v2::LayoutScanPlanCtx; use crate::layout_v2::VTable; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::request::ScanRequest; use crate::scan::v2::layouts::struct_ as scan_struct; /// V2 struct layout vtable. 
diff --git a/vortex-layout/src/layouts_v2/zoned.rs b/vortex-layout/src/layouts_v2/zoned.rs index f8f6e6da51b..596b488404b 100644 --- a/vortex-layout/src/layouts_v2/zoned.rs +++ b/vortex-layout/src/layouts_v2/zoned.rs @@ -8,6 +8,8 @@ use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::request::ScanRequest; use crate::LayoutChildType; use crate::LayoutId; @@ -21,8 +23,6 @@ use crate::layouts::zoned::ZonedMetadata; use crate::layouts::zoned::aggregate_fns_from_specs; use crate::layouts::zoned::aggregate_stats_table_dtype; use crate::layouts::zoned::legacy_stats_table_dtype; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::request::ScanRequest; use crate::scan::v2::layouts::zoned as scan_zoned; /// V2 zoned layout vtable. diff --git a/vortex-layout/src/scan/mod.rs b/vortex-layout/src/scan/mod.rs index 55c57d51089..ab003641eb5 100644 --- a/vortex-layout/src/scan/mod.rs +++ b/vortex-layout/src/scan/mod.rs @@ -5,7 +5,6 @@ pub mod arrow; mod filter; pub mod layout; pub mod multi; -pub mod plan; pub mod repeated_scan; pub mod scan_builder; pub mod split_by; diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index b9c0bc8e5f8..4092ba19ad3 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -39,6 +39,37 @@ use vortex_array::scalar::Scalar; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; +use vortex_scan::plan::AggregateAnswer; +use vortex_scan::plan::DeferredReadTask; +use vortex_scan::plan::EvidenceStep; +use vortex_scan::plan::EvidenceTask; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedAggregate; +use vortex_scan::plan::PreparedAggregateRef; +use vortex_scan::plan::PreparedEvidence; +use 
vortex_scan::plan::PreparedEvidenceRef; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateCacheRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; +use vortex_scan::plan::ReadStep; +use vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::default_try_push_expr; +use vortex_scan::plan::downcast_state; +use vortex_scan::plan::evidence::EvidenceFragment; +use vortex_scan::plan::request::EvidenceMode; +use vortex_scan::plan::request::EvidenceRequest; +use vortex_scan::plan::request::OwnedEvidenceRequest; +use vortex_scan::plan::request::ScanRequest; use vortex_scan::read::ReadResults; use vortex_scan::read::ScanIoPhase; use vortex_session::VortexSession; @@ -47,37 +78,6 @@ use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::layout_v2::LayoutScanPlanCtx; use crate::layouts_v2::chunked::Chunked; -use crate::scan::plan::AggregateAnswer; -use crate::scan::plan::DeferredReadTask; -use crate::scan::plan::EvidenceStep; -use crate::scan::plan::EvidenceTask; -use crate::scan::plan::OwnedRowScope; -use crate::scan::plan::PrepareCtx; -use crate::scan::plan::PreparedAggregate; -use crate::scan::plan::PreparedAggregateRef; -use crate::scan::plan::PreparedEvidence; -use crate::scan::plan::PreparedEvidenceRef; -use crate::scan::plan::PreparedRead; -use crate::scan::plan::PreparedReadRef; -use crate::scan::plan::PreparedStateCacheRef; -use crate::scan::plan::PreparedStateKey; -use crate::scan::plan::PushCtx; -use crate::scan::plan::ReadContext; -use crate::scan::plan::ReadStep; -use crate::scan::plan::ReadTask; -use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::ScanPlan; -use crate::scan::plan::ScanPlanRef; 
-use crate::scan::plan::ScanState; -use crate::scan::plan::ScanStateRef; -use crate::scan::plan::StateCtx; -use crate::scan::plan::default_try_push_expr; -use crate::scan::plan::downcast_state; -use crate::scan::plan::evidence::EvidenceFragment; -use crate::scan::plan::request::EvidenceMode; -use crate::scan::plan::request::EvidenceRequest; -use crate::scan::plan::request::OwnedEvidenceRequest; -use crate::scan::plan::request::ScanRequest; pub(crate) fn new_scan_plan( layout: Layout, diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index 4420f125a69..a84c5bfcb09 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -38,29 +38,29 @@ use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::AllOr; use vortex_mask::Mask; +use vortex_scan::plan::DeferredReadTask; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadStep; +use vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::RowScope; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::default_try_push_expr; +use vortex_scan::plan::request::ScanRequest; use vortex_scan::read::ScanIoPhase; use crate::layout_v2::Layout; use crate::layout_v2::LayoutScanPlanCtx; use crate::layouts_v2::dict::Dict; -use crate::scan::plan::DeferredReadTask; -use crate::scan::plan::OwnedRowScope; -use crate::scan::plan::PrepareCtx; -use crate::scan::plan::PreparedRead; -use crate::scan::plan::PreparedReadRef; -use crate::scan::plan::PreparedStateKey; -use crate::scan::plan::PushCtx; -use crate::scan::plan::ReadStep; -use 
crate::scan::plan::ReadTask; -use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::RowScope; -use crate::scan::plan::ScanPlan; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::ScanState; -use crate::scan::plan::ScanStateRef; -use crate::scan::plan::StateCtx; -use crate::scan::plan::default_try_push_expr; -use crate::scan::plan::request::ScanRequest; const DENSE_REMAP_MAX_VALUES: usize = 1 << 20; const DENSE_REMAP_VALUES_PER_CODE: usize = 4; diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 9e0f8c2405e..1974581b00f 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -23,6 +23,23 @@ use vortex_array::serde::SerializedArray; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadStep; +use vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::default_try_push_expr; +use vortex_scan::plan::downcast_state; +use vortex_scan::plan::request::ScanRequest; use vortex_scan::read::ReadRequestKey; use vortex_scan::read::ReadResults; use vortex_scan::read::ScanIoPhase; @@ -33,23 +50,6 @@ use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::layout_v2::LayoutScanPlanCtx; use crate::layouts_v2::flat::Flat; -use crate::scan::plan::OwnedRowScope; -use crate::scan::plan::PrepareCtx; -use crate::scan::plan::PreparedRead; -use crate::scan::plan::PreparedReadRef; -use crate::scan::plan::PreparedStateKey; -use crate::scan::plan::PushCtx; 
-use crate::scan::plan::ReadStep; -use crate::scan::plan::ReadTask; -use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::ScanPlan; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::ScanState; -use crate::scan::plan::ScanStateRef; -use crate::scan::plan::StateCtx; -use crate::scan::plan::default_try_push_expr; -use crate::scan::plan::downcast_state; -use crate::scan::plan::request::ScanRequest; use crate::segments::SegmentFutureCache; use crate::segments::SegmentRequest; use crate::segments::SegmentRequestKey; diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index b36dee457df..108722bcbc1 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -25,23 +25,23 @@ use vortex_array::scalar_fn::fns::root::Root; use vortex_array::scalar_fn::fns::select::Select; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_scan::plan::ApplyScanPlan; +use vortex_scan::plan::MaskScanPlan; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::StructValueScanPlan; +use vortex_scan::plan::literal_scan_plan; +use vortex_scan::plan::request::ScanRequest; use crate::layout_v2::Layout; use crate::layout_v2::LayoutRef; use crate::layout_v2::LayoutScanPlanCtx; use crate::layouts_v2::struct_::Struct; -use crate::scan::plan::ApplyScanPlan; -use crate::scan::plan::MaskScanPlan; -use crate::scan::plan::PrepareCtx; -use crate::scan::plan::PreparedReadRef; -use crate::scan::plan::PushCtx; -use crate::scan::plan::ScanPlan; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::ScanStateRef; -use crate::scan::plan::StateCtx; -use crate::scan::plan::StructValueScanPlan; -use crate::scan::plan::literal_scan_plan; -use 
crate::scan::plan::request::ScanRequest; use crate::scan::v2::referenced_fields; use crate::scan::v2::struct_fields; diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 9ca432ae41f..9d0ba8b13c5 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -42,6 +42,36 @@ use vortex_array::scalar::Scalar; use vortex_error::VortexResult; use vortex_error::vortex_err; use vortex_mask::Mask; +use vortex_scan::plan::AggregateAnswer; +use vortex_scan::plan::EvidenceCost; +use vortex_scan::plan::EvidenceScope; +use vortex_scan::plan::EvidenceStep; +use vortex_scan::plan::EvidenceTask; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedAggregate; +use vortex_scan::plan::PreparedAggregateRef; +use vortex_scan::plan::PreparedEvidence; +use vortex_scan::plan::PreparedEvidenceRef; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PreparedStateKey; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadContext; +use vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::default_try_push_expr; +use vortex_scan::plan::downcast_state; +use vortex_scan::plan::evidence::EvidenceFragment; +use vortex_scan::plan::evidence::PredicateEvidenceKind; +use vortex_scan::plan::request::EvidenceRequest; +use vortex_scan::plan::request::OwnedEvidenceRequest; +use vortex_scan::plan::request::ScanRequest; use vortex_scan::read::ReadResults; use vortex_scan::read::ReadStore; use vortex_scan::read::ReadStoreRef; @@ -57,36 +87,6 @@ use crate::layouts::zoned::MIN_IS_TRUNCATED; use crate::layouts::zoned::ZoneMapSchema; use crate::layouts::zoned::zone_map::ZoneMap; use 
crate::layouts_v2::zoned::ZonedData; -use crate::scan::plan::AggregateAnswer; -use crate::scan::plan::EvidenceCost; -use crate::scan::plan::EvidenceScope; -use crate::scan::plan::EvidenceStep; -use crate::scan::plan::EvidenceTask; -use crate::scan::plan::OwnedRowScope; -use crate::scan::plan::PrepareCtx; -use crate::scan::plan::PreparedAggregate; -use crate::scan::plan::PreparedAggregateRef; -use crate::scan::plan::PreparedEvidence; -use crate::scan::plan::PreparedEvidenceRef; -use crate::scan::plan::PreparedRead; -use crate::scan::plan::PreparedReadRef; -use crate::scan::plan::PreparedStateKey; -use crate::scan::plan::PushCtx; -use crate::scan::plan::ReadContext; -use crate::scan::plan::ReadTask; -use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::ScanPlan; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::ScanState; -use crate::scan::plan::ScanStateRef; -use crate::scan::plan::StateCtx; -use crate::scan::plan::default_try_push_expr; -use crate::scan::plan::downcast_state; -use crate::scan::plan::evidence::EvidenceFragment; -use crate::scan::plan::evidence::PredicateEvidenceKind; -use crate::scan::plan::request::EvidenceRequest; -use crate::scan::plan::request::OwnedEvidenceRequest; -use crate::scan::plan::request::ScanRequest; pub(crate) fn new_scan_plan>( layout: Layout, diff --git a/vortex-layout/src/scan/v2/mod.rs b/vortex-layout/src/scan/v2/mod.rs index b1c6d81f053..bf3e36d53f3 100644 --- a/vortex-layout/src/scan/v2/mod.rs +++ b/vortex-layout/src/scan/v2/mod.rs @@ -4,7 +4,7 @@ //! Scan2 layout plan machinery. //! //! This module contains the layout-tree expansion vtables and executable -//! [`ScanPlan`](crate::scan::plan::ScanPlan) plans used by the alternate scan implementation. +//! [`ScanPlan`](vortex_scan::plan::ScanPlan) plans used by the alternate scan implementation. 
pub(crate) mod layouts; mod row_idx; @@ -19,10 +19,6 @@ use vortex_array::scalar_fn::fns::binary::Binary; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; - -pub use crate::scan::plan::evidence; -pub use crate::scan::plan::request; - /// Environment variable selecting the file scan implementation. /// /// Accepted values: @@ -30,7 +26,7 @@ pub use crate::scan::plan::request; /// - `v1`, `scan`, `scan_builder`, `scan-builder`, `layout-reader`, or unset: use the /// existing LayoutReader-based scan. /// - `v2` or `scan2`: use the scan2 -/// [`ScanPlan`](crate::scan::plan::ScanPlan) implementation. +/// [`ScanPlan`](vortex_scan::plan::ScanPlan) implementation. pub const SCAN_IMPL_ENV: &str = "VORTEX_SCAN_IMPL"; /// Returns whether the scan2 implementation should be used by scan data sources. diff --git a/vortex-layout/src/scan/v2/row_idx.rs b/vortex-layout/src/scan/v2/row_idx.rs index 3ebeff51fb6..e3213a6f26e 100644 --- a/vortex-layout/src/scan/v2/row_idx.rs +++ b/vortex-layout/src/scan/v2/row_idx.rs @@ -21,27 +21,27 @@ use vortex_array::scalar::PValue; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use vortex_scan::plan::ApplyScanPlan; +use vortex_scan::plan::OwnedRowScope; +use vortex_scan::plan::PrepareCtx; +use vortex_scan::plan::PreparedRead; +use vortex_scan::plan::PreparedReadRef; +use vortex_scan::plan::PushCtx; +use vortex_scan::plan::ReadStep; +use vortex_scan::plan::ReadTask; +use vortex_scan::plan::ReadTaskOutput; +use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanRef; +use vortex_scan::plan::ScanState; +use vortex_scan::plan::ScanStateRef; +use vortex_scan::plan::StateCtx; +use vortex_scan::plan::StructValueScanPlan; +use vortex_scan::plan::default_try_push_expr; use vortex_sequence::Sequence; use vortex_sequence::SequenceArray; use crate::layouts::row_idx::RowIdx; use crate::layouts::row_idx::row_idx; -use crate::scan::plan::ApplyScanPlan; -use 
crate::scan::plan::OwnedRowScope; -use crate::scan::plan::PrepareCtx; -use crate::scan::plan::PreparedRead; -use crate::scan::plan::PreparedReadRef; -use crate::scan::plan::PushCtx; -use crate::scan::plan::ReadStep; -use crate::scan::plan::ReadTask; -use crate::scan::plan::ReadTaskOutput; -use crate::scan::plan::ScanPlan; -use crate::scan::plan::ScanPlanRef; -use crate::scan::plan::ScanState; -use crate::scan::plan::ScanStateRef; -use crate::scan::plan::StateCtx; -use crate::scan::plan::StructValueScanPlan; -use crate::scan::plan::default_try_push_expr; pub fn with_row_idx(root: ScanPlanRef, dtype: DType, row_offset: u64) -> ScanPlanRef { Arc::new(RowIdxScanPlan { diff --git a/vortex-scan/src/lib.rs b/vortex-scan/src/lib.rs index 9b50bdb7674..0ea5a3994ca 100644 --- a/vortex-scan/src/lib.rs +++ b/vortex-scan/src/lib.rs @@ -22,6 +22,7 @@ //! * We should add a way for the client to negotiate capabilities with the data source, for //! example which encodings it knows about. +pub mod plan; pub mod read; pub mod row_mask; pub mod scheduler; diff --git a/vortex-layout/src/scan/plan/evidence.rs b/vortex-scan/src/plan/evidence.rs similarity index 100% rename from vortex-layout/src/scan/plan/evidence.rs rename to vortex-scan/src/plan/evidence.rs diff --git a/vortex-layout/src/scan/plan/mod.rs b/vortex-scan/src/plan/mod.rs similarity index 99% rename from vortex-layout/src/scan/plan/mod.rs rename to vortex-scan/src/plan/mod.rs index 989957aed1f..408f993009e 100644 --- a/vortex-layout/src/scan/plan/mod.rs +++ b/vortex-scan/src/plan/mod.rs @@ -51,14 +51,14 @@ use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_mask::Mask; -use vortex_scan::read::ReadResults; -use vortex_scan::read::ScanIoPhase; -use vortex_scan::read::ScanRead; use vortex_session::VortexSession; use self::evidence::EvidenceFragment; use self::request::EvidenceRequest; use self::request::OwnedEvidenceRequest; +use crate::read::ReadResults; +use 
crate::read::ScanIoPhase; +use crate::read::ScanRead; /// Execution context for prepared scan tasks. #[derive(Clone)] @@ -869,7 +869,11 @@ impl ReadTask for StructReadTask { } } -pub(crate) struct DeferredReadTask; +/// Placeholder read task for recursive plan continuations that replace child tasks later. +/// +/// This task is never valid to execute directly. It exists so composite plan implementations can +/// build a self-referential continuation state before the next child task is available. +pub struct DeferredReadTask; impl ReadTask for DeferredReadTask { fn into_step(self: Box) -> VortexResult { @@ -1599,9 +1603,9 @@ mod tests { use vortex_array::arrays::Constant; use vortex_array::dtype::Nullability; use vortex_array::expr::lit; - use vortex_scan::read::ReadStore; use super::*; + use crate::read::ReadStore; struct TestStatsNode; diff --git a/vortex-layout/src/scan/plan/request.rs b/vortex-scan/src/plan/request.rs similarity index 100% rename from vortex-layout/src/scan/plan/request.rs rename to vortex-scan/src/plan/request.rs From 150a52c622798d5162aa76aff21463aba70674f0 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 17:27:06 -0400 Subject: [PATCH 47/48] Move scan plan runtime into vortex-scan Signed-off-by: Nicholas Gates --- Cargo.lock | 1 + vortex-file/src/file.rs | 10 - vortex-file/src/multi/scan_v2.rs | 3055 +---------------- vortex-layout/src/scan/v2/layouts/chunked.rs | 16 + vortex-layout/src/scan/v2/layouts/dict.rs | 23 + vortex-layout/src/scan/v2/layouts/flat.rs | 9 + vortex-layout/src/scan/v2/layouts/struct_.rs | 16 +- vortex-layout/src/scan/v2/layouts/zoned.rs | 21 +- vortex-layout/src/scan/v2/row_idx.rs | 42 +- vortex-scan/Cargo.toml | 1 + vortex-scan/src/plan/data_source.rs | 3187 ++++++++++++++++++ vortex-scan/src/plan/mod.rs | 140 +- 12 files changed, 3471 insertions(+), 3050 deletions(-) create mode 100644 vortex-scan/src/plan/data_source.rs diff --git a/Cargo.lock b/Cargo.lock index b378d183700..316597039fb 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -10392,6 +10392,7 @@ dependencies = [ "vortex-array", "vortex-buffer", "vortex-error", + "vortex-io", "vortex-mask", "vortex-session", "vortex-utils", diff --git a/vortex-file/src/file.rs b/vortex-file/src/file.rs index fa4afdaa167..e142e0a2edd 100644 --- a/vortex-file/src/file.rs +++ b/vortex-file/src/file.rs @@ -28,8 +28,6 @@ use vortex_layout::segments::SegmentFutureCache; use vortex_layout::segments::SegmentSource; use vortex_scan::DataSourceRef; use vortex_scan::ScanRequest; -use vortex_scan::plan::PreparedStateCache; -use vortex_scan::plan::PreparedStateCacheRef; use vortex_scan::plan::ScanPlanRef; use vortex_session::VortexSession; @@ -56,8 +54,6 @@ pub struct VortexFile { layout_reader_cache: Option>>, /// Shared cache for the v2 physical scan plan root. scan_plan_root_cache: Arc>, - /// Shared cache for v2 prepared state across row-range scans of this file. - scan_plan_state_cache: PreparedStateCacheRef, /// Shared cache for v2 in-flight segment futures across row-range scans of this file. 
scan_plan_segment_future_cache: Arc, } @@ -96,7 +92,6 @@ impl VortexFile { session, layout_reader_cache: None, scan_plan_root_cache: Arc::new(OnceLock::new()), - scan_plan_state_cache: Arc::new(PreparedStateCache::default()), scan_plan_segment_future_cache: Arc::new(SegmentFutureCache::new()), } } @@ -112,7 +107,6 @@ impl VortexFile { session: self.session, layout_reader_cache: Some(OnceLock::new()), scan_plan_root_cache: self.scan_plan_root_cache, - scan_plan_state_cache: self.scan_plan_state_cache, scan_plan_segment_future_cache: self.scan_plan_segment_future_cache, } } @@ -196,10 +190,6 @@ impl VortexFile { Ok(root) } - pub(crate) fn scan_plan_state_cache(&self) -> PreparedStateCacheRef { - Arc::clone(&self.scan_plan_state_cache) - } - pub(crate) fn scan_plan_segment_future_cache(&self) -> Arc { Arc::clone(&self.scan_plan_segment_future_cache) } diff --git a/vortex-file/src/multi/scan_v2.rs b/vortex-file/src/multi/scan_v2.rs index 383bba52ba0..0ce65cb212b 100644 --- a/vortex-file/src/multi/scan_v2.rs +++ b/vortex-file/src/multi/scan_v2.rs @@ -1,72 +1,37 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! ScanPlan-backed multi-file data source. +//! File adapters for ScanPlan-backed scans. 
-use std::any::Any; -use std::collections::BTreeMap; -use std::collections::VecDeque; use std::fmt; use std::ops::Range; use std::sync::Arc; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; use async_trait::async_trait; -use futures::FutureExt; -use futures::StreamExt; use futures::TryStreamExt; use futures::future::BoxFuture; -use futures::stream; -use futures::stream::FuturesUnordered; -use parking_lot::Mutex; -use tracing::Instrument; -use vortex_array::ArrayRef; -use vortex_array::VortexSessionExecute; use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::dtype::DType; use vortex_array::dtype::FieldName; use vortex_array::dtype::FieldPath; use vortex_array::dtype::StructFields; use vortex_array::expr::Expression; -use vortex_array::expr::forms::conjuncts; use vortex_array::expr::stats::Precision; use vortex_array::expr::stats::Stat; use vortex_array::scalar::Scalar; -use vortex_array::scalar::ScalarValue; -use vortex_array::scalar_fn::fns::dynamic::DynamicExprUpdates; use vortex_array::scalar_fn::fns::get_item::GetItem; use vortex_array::scalar_fn::fns::root::Root; use vortex_array::stats::StatsSet; -use vortex_array::stream::ArrayStreamAdapter; -use vortex_array::stream::ArrayStreamExt; use vortex_array::stream::SendableArrayStream; use vortex_error::VortexResult; use vortex_error::vortex_bail; use vortex_error::vortex_err; use vortex_io::filesystem::FileListing; use vortex_io::filesystem::FileSystemRef; -use vortex_io::runtime::Handle; -use vortex_io::session::RuntimeSessionExt; use vortex_layout::layout_v2::LayoutScanPlanCtx; -use vortex_layout::scan::v2::validate_temporal_comparisons; use vortex_layout::scan::v2::with_row_idx; -use vortex_mask::Mask; use vortex_metrics::MetricsRegistry; -use vortex_scan::DataSource; -use vortex_scan::DataSourceScan; -use vortex_scan::DataSourceScanRef; -use vortex_scan::Partition; -use vortex_scan::PartitionRef; -use vortex_scan::PartitionStream; -use vortex_scan::PlannedMorselScan; -use 
vortex_scan::PlannedMorselScanRef; -use vortex_scan::ScanMeta; use vortex_scan::ScanRequest as DataSourceScanRequest; -use vortex_scan::ScanScheduler; -use vortex_scan::ScanSchedulerSessionExt; -use vortex_scan::plan::EvidenceScope; -use vortex_scan::plan::OwnedRowScope; use vortex_scan::plan::PrepareCtx; use vortex_scan::plan::PreparedAggregateRef; use vortex_scan::plan::PreparedEvidenceRef; @@ -75,39 +40,21 @@ use vortex_scan::plan::PreparedStats; use vortex_scan::plan::PreparedStatsRef; use vortex_scan::plan::PushCtx; use vortex_scan::plan::ReadContext; -use vortex_scan::plan::ReadStep; -use vortex_scan::plan::ReadTask; -use vortex_scan::plan::ReadTaskOutput; use vortex_scan::plan::ScanPlan; +use vortex_scan::plan::ScanPlanDataSource; +use vortex_scan::plan::ScanPlanFactory; use vortex_scan::plan::ScanPlanRef; use vortex_scan::plan::ScanState; use vortex_scan::plan::ScanStateRef; use vortex_scan::plan::StateCtx; use vortex_scan::plan::downcast_state; -use vortex_scan::plan::evidence::EvidenceFragment; -use vortex_scan::plan::evidence::PredicateEvidence; -use vortex_scan::plan::evidence::PredicateEvidenceKind; -use vortex_scan::plan::evidence::PredicateId; -use vortex_scan::plan::evidence::PredicateVersion; -use vortex_scan::plan::request::EvidenceMode; -use vortex_scan::plan::request::OwnedEvidenceRequest; use vortex_scan::plan::request::ScanRequest; -use vortex_scan::read::ReadResults; -use vortex_scan::read::ReadStore; -use vortex_scan::read::ReadStoreRef; -use vortex_scan::read::ScanIoPhase; -use vortex_scan::read::ScanRead; -use vortex_scan::selection::Selection; -use vortex_scan::task::ScanStep; -use vortex_scan::task::ScanStepResult; -use vortex_scan::task::ScanTask; -use vortex_scan::task::ScanTaskBox; -use vortex_scan::task::ScanTaskLane; -use vortex_scan::task::ScanTaskQueue; -use vortex_scan::task::ScanTaskRead; -use vortex_scan::task::scan_task_read_bytes; +use vortex_scan::plan::scan_plan_projected_splits; +use 
vortex_scan::plan::scan_plan_split_ranges; +use vortex_scan::plan::scan_plan_statistics; +use vortex_scan::plan::scan_plan_statistics_many; +use vortex_scan::plan::scan_plan_stream; use vortex_session::VortexSession; -use vortex_utils::parallelism::get_available_parallelism; use super::MultiFileDataSource; use super::create_local_filesystem; @@ -116,16 +63,6 @@ use crate::FileStatistics; use crate::VortexFile; use crate::VortexOpenOptions; -const DEFAULT_CONCURRENCY: usize = 8; -const IDEAL_SPLIT_SIZE: u64 = 100_000; -const MAX_SELECTION_RANGE_SIZE: u64 = IDEAL_SPLIT_SIZE / 25; -const MIN_SELECTION_GAP_BETWEEN_RANGES: u64 = IDEAL_SPLIT_SIZE / 2; -/// Below this demanded-row density, evaluate a residual predicate over only the demanded rows -/// (filter-first) rather than the whole morsel. -const EXPR_EVAL_THRESHOLD: f64 = 0.2; -const INLINE_ZERO_READ_EVIDENCE_MAX_PRIORITY: u64 = 100_150; -const SCAN_SCOPE_MIN_PREDICATE_COST: u64 = 100; - struct FileStatsScanPlan { data: ScanPlanRef, stats: Arc, @@ -173,6 +110,14 @@ impl FileStatsScanPlan { } impl ScanPlan for FileStatsScanPlan { + fn dtype(&self) -> &DType { + self.data.dtype() + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { cx.init_plan(&self.data) } @@ -259,6 +204,14 @@ impl ScanPlan for FileStatsScanPlan { } impl ScanPlan for FileStatsExprScanPlan { + fn dtype(&self) -> &DType { + &self.field_dtype + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { cx.init_plan(&self.data) } @@ -370,62 +323,11 @@ fn root_field(expr: &Expression) -> Option<&FieldName> { expr.child(0).is::().then_some(name) } -fn root_field_path(expr: &Expression) -> Option { - if expr.is::() { - return Some(FieldPath::root()); - } - root_field(expr).cloned().map(FieldPath::from_name) -} - -/// Static cost estimate for a filter conjunct, used to order predicate evaluation cheapest-first. 
-/// -/// We sum a per-node cost over the whole expression tree. Primitive comparisons, null checks and -/// data access (`vortex.binary`, `vortex.between`, `vortex.is_null`, `vortex.get_item`, ...) are -/// cheap; per-row string/byte work (`vortex.like`, `vortex.byte_length`, `vortex.list.contains`) -/// and opaque/dynamic functions are expensive. Unrecognized functions get a moderate cost so they -/// sort after primitives but ahead of known-expensive matchers. -fn predicate_cost(expr: &Expression) -> u64 { - fn node_cost(expr: &Expression) -> u64 { - match expr.id().as_str() { - // Free or near-free structural / access nodes. - "vortex.root" | "vortex.literal" | "vortex.get_item" => 0, - // Cheap primitive predicates. - "vortex.binary" | "vortex.between" | "vortex.is_null" | "vortex.is_not_null" - | "vortex.not" | "vortex.fill_null" | "vortex.cast" => 1, - // Expensive per-row string / byte / matching work, and fallible UDFs. - "vortex.like" | "vortex.byte_length" | "vortex.list.contains" => 100, - "vortex.dynamic" | "vortex.variant_get" | "vortex.parquet.variant" => 100, - // Unknown functions: more expensive than primitives, cheaper than known matchers. - _ => 10, - } - } - - let mut cost = node_cost(expr); - for child in expr.children().iter() { - cost = cost.saturating_add(predicate_cost(child)); - } - cost -} - fn absent_statistics(funcs: &[AggregateFnRef]) -> Vec> { funcs.iter().map(|_| Precision::Absent).collect() } -fn scalar_precision_to_value(precision: Precision) -> Precision { - match precision { - Precision::Exact(scalar) => scalar - .into_value() - .map(Precision::Exact) - .unwrap_or(Precision::Absent), - Precision::Inexact(scalar) => scalar - .into_value() - .map(Precision::Inexact) - .unwrap_or(Precision::Absent), - Precision::Absent => Precision::Absent, - } -} - -/// Build a scan2 [`DataSource`] from a multi-file builder. +/// Build a scan2 [`DataSource`](vortex_scan::DataSource) from a multi-file builder. 
pub(crate) async fn build_scan_plan_data_source( builder: MultiFileDataSource, ) -> VortexResult { @@ -471,8 +373,9 @@ pub(crate) async fn build_scan_plan_data_source( builder.open_options_fn.as_ref(), ) .await?; + let first_root = first_file.scan_plan_root()?; - let factories: Vec> = all_files[1..] + let factories: Vec> = all_files[1..] .iter() .map(|(file, fs)| { Arc::new(ScanPlanFileFactory { @@ -481,22 +384,17 @@ pub(crate) async fn build_scan_plan_data_source( session: builder.session.clone(), open_options_fn: Arc::clone(&builder.open_options_fn), metrics_registry: builder.metrics_registry.clone(), - }) as Arc + }) as Arc }) .collect(); Ok(ScanPlanDataSource::new_with_first( - first_file, + first_root, factories, &builder.session, )) } -#[async_trait] -trait VortexFileFactory: 'static + Send + Sync { - async fn open(&self) -> VortexResult>; -} - struct ScanPlanFileFactory { fs: FileSystemRef, file: FileListing, @@ -506,8 +404,8 @@ struct ScanPlanFileFactory { } #[async_trait] -impl VortexFileFactory for ScanPlanFileFactory { - async fn open(&self) -> VortexResult> { +impl ScanPlanFactory for ScanPlanFileFactory { + async fn open(&self) -> VortexResult> { let file = open_file( &self.fs, &self.file, @@ -516,418 +414,16 @@ impl VortexFileFactory for ScanPlanFileFactory { self.open_options_fn.as_ref(), ) .await?; - Ok(Some(file)) - } -} - -enum ScanPlanChild { - Opened(VortexFile), - Deferred(Arc), -} - -/// Multi-file data source backed by scan2 ScanPlan plans. 
-pub struct ScanPlanDataSource { - dtype: DType, - session: VortexSession, - children: Vec, - concurrency: usize, -} - -impl ScanPlanDataSource { - fn new_with_first( - first: VortexFile, - remaining: Vec>, - session: &VortexSession, - ) -> Self { - let dtype = first.dtype().clone(); - let concurrency = get_available_parallelism().unwrap_or(DEFAULT_CONCURRENCY); - - let mut children = Vec::with_capacity(1 + remaining.len()); - children.push(ScanPlanChild::Opened(first)); - children.extend(remaining.into_iter().map(ScanPlanChild::Deferred)); - - Self { - dtype, - session: session.clone(), - children, - concurrency, - } - } - - async fn open_files(&self, ordered: bool) -> VortexResult> { - let jobs = self - .children - .iter() - .enumerate() - .map(|(idx, child)| match child { - ScanPlanChild::Opened(file) => { - let file = file.clone(); - async move { Ok(Some((idx, file))) }.boxed() - } - ScanPlanChild::Deferred(factory) => { - let factory = Arc::clone(factory); - async move { - factory - .open() - .instrument(tracing::info_span!("VortexFileFactory::open")) - .await - .map(|file| file.map(|file| (idx, file))) - } - .boxed() - } - }) - .collect::>>>>(); - - let files = if ordered { - stream::iter(jobs) - .buffered(self.concurrency) - .try_filter_map(|file| async move { Ok(file) }) - .try_collect::>() - .await? - } else { - stream::iter(jobs) - .buffer_unordered(self.concurrency) - .try_filter_map(|file| async move { Ok(file) }) - .try_collect::>() - .await? 
- }; - - let mut files = files; - files.sort_unstable_by_key(|(idx, _)| *idx); - Ok(files) - } -} - -#[async_trait] -impl DataSource for ScanPlanDataSource { - fn dtype(&self) -> &DType { - &self.dtype - } - - fn row_count(&self) -> Precision { - let mut sum: u64 = 0; - let mut opened_count: u64 = 0; - let mut deferred_count: u64 = 0; - - for child in &self.children { - match child { - ScanPlanChild::Opened(file) => { - opened_count += 1; - sum = sum.saturating_add(file.row_count()); - } - ScanPlanChild::Deferred(_) => { - deferred_count += 1; - } - } - } - - let total_count = opened_count + deferred_count; - if total_count == 0 { - return Precision::exact(0u64); - } - - if deferred_count == 0 { - Precision::exact(sum) - } else if opened_count > 0 { - let avg = sum / opened_count; - Precision::inexact(avg.saturating_mul(total_count)) - } else { - Precision::Absent - } - } - - fn deserialize_partition( - &self, - _data: &[u8], - _session: &VortexSession, - ) -> VortexResult { - vortex_bail!("ScanPlanDataSource partitions are not yet serializable") - } - - async fn plan_morsel_partitions( - &self, - scan_request: DataSourceScanRequest, - target_partitions: usize, - ) -> VortexResult> { - if scan_request.ordered || scan_request.limit.is_some() { - return Ok(None); - } - - let target_partitions = target_partitions.max(1); - let dtype = scan_request.projection.return_dtype(&self.dtype)?; - - let meta = ScanMeta { - label: Some("scan2".to_string()), - }; - let provider = self.session.scan_scheduler_provider(); - let scheduler = provider.scheduler_for_scan(&meta); - - let mut planned_files = Vec::new(); - let mut total_morsels = 0usize; - for (partition_idx, file) in self.open_files(false).await? { - let Some(request) = file_scan_request(partition_idx, &file, scan_request.clone())? 
- else { - continue; - }; - let row_range = request - .row_range - .clone() - .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; - let prepared = Arc::new(PreparedScanPlan::try_new(&file, &request)?); - let execution = Arc::new(ScanExecution::try_new(file, prepared, None)?); - let ranges = execution.splits(&row_range)?; - if ranges.is_empty() { - continue; - } - total_morsels = total_morsels.saturating_add(ranges.len()); - planned_files.push((execution, ranges)); - } - - // The physical plan may expose more engine partitions than we can fill with morsels. - // Keep only non-empty planned partitions; engine adapters can return empty streams for - // any surplus advertised partitions. - let partition_count = total_morsels.min(target_partitions); - let mut partitions = vec![Vec::new(); partition_count]; - let mut morsel_idx = 0usize; - for (execution, ranges) in planned_files { - for range in ranges { - let partition = morsel_idx % partition_count; - partitions[partition].push(PlannedScanPlanMorsel { - execution: Arc::clone(&execution), - range, - }); - morsel_idx = morsel_idx.saturating_add(1); - } - } - - let read_byte_budget = read_byte_budget(&scheduler); - - Ok(Some(Arc::new(PlannedScanPlanScan { - dtype, - partitions, - handle: self.session.handle(), - read_byte_budget, - }))) - } - - async fn scan(&self, scan_request: DataSourceScanRequest) -> VortexResult { - let meta = ScanMeta { - label: Some("scan2".to_string()), - }; - let provider = self.session.scan_scheduler_provider(); - let scheduler = provider.scheduler_for_scan(&meta); - - let mut ready = VecDeque::new(); - let mut deferred = VecDeque::new(); - - for (index, child) in self.children.iter().enumerate() { - match child { - ScanPlanChild::Opened(file) => ready.push_back((index, file.clone())), - ScanPlanChild::Deferred(factory) => { - deferred.push_back((index, Arc::clone(factory))); - } - } - } - - let dtype = scan_request.projection.return_dtype(&self.dtype)?; - let limit_remaining = 
scan_request.limit.map(AtomicU64::new).map(Arc::new); - - Ok(Box::new(ScanPlanDataSourceScan { - dtype, - request: scan_request, - ready, - deferred, - handle: self.session.handle(), - concurrency: self.concurrency, - scheduler, - limit_remaining, - })) - } - - async fn statistics( - &self, - expr: &Expression, - funcs: &[AggregateFnRef], - ) -> VortexResult>> { - if self.children.len() != 1 { - return Ok(absent_statistics(funcs)); - } - let ScanPlanChild::Opened(file) = &self.children[0] else { - return Ok(absent_statistics(funcs)); - }; - scan_plan_file_statistics(file.clone(), expr, funcs).await - } - - async fn field_statistics(&self, field_path: &FieldPath) -> VortexResult { - if field_path.parts().len() != 1 { - return Ok(StatsSet::default()); - } - let Some(field_name) = field_path.parts()[0].as_name() else { - return Ok(StatsSet::default()); - }; - let funcs = Stat::all() - .filter_map(|stat| stat.aggregate_fn().map(|func| (stat, func))) - .collect::>(); - let aggregate_funcs = funcs - .iter() - .map(|(_, func)| func.clone()) - .collect::>(); - let stats = self - .statistics( - &vortex_array::expr::get_item(field_name, vortex_array::expr::root()), - &aggregate_funcs, - ) - .await?; - let mut stats_set = StatsSet::default(); - for ((stat, _), value) in funcs.into_iter().zip(stats) { - stats_set.set(stat, scalar_precision_to_value(value)); - } - Ok(stats_set) - } - - fn supports_morsel_partitioning(&self) -> bool { - true - } -} - -struct ScanPlanDataSourceScan { - dtype: DType, - request: DataSourceScanRequest, - ready: VecDeque<(usize, VortexFile)>, - deferred: VecDeque<(usize, Arc)>, - handle: Handle, - concurrency: usize, - scheduler: Arc, - limit_remaining: Option>, -} - -impl DataSourceScan for ScanPlanDataSourceScan { - fn dtype(&self) -> &DType { - &self.dtype - } - - fn partition_count(&self) -> Precision { - let count = self.ready.len() + self.deferred.len(); - if self.deferred.is_empty() { - Precision::exact(count) - } else { - 
Precision::inexact(count) - } - } - - fn partitions(self: Box) -> PartitionStream { - let Self { - dtype: _, - request, - ready, - deferred, - handle, - concurrency, - scheduler, - limit_remaining, - } = *self; - - let ordered = request.ordered; - let ready_stream = stream::iter(ready).map(Ok); - let spawned = stream::iter(deferred).map(move |(index, factory)| { - handle.spawn(async move { - factory - .open() - .instrument(tracing::info_span!("VortexFileFactory::open")) - .await - .map(|file| file.map(|file| (index, file))) - }) - }); - - let deferred_stream = if ordered { - spawned - .buffered(concurrency) - .filter_map(|result| async move { - match result { - Ok(Some(file)) => Some(Ok(file)), - Ok(None) => None, - Err(error) => Some(Err(error)), - } - }) - .boxed() - } else { - spawned - .buffer_unordered(concurrency) - .filter_map(|result| async move { - match result { - Ok(Some(file)) => Some(Ok(file)), - Ok(None) => None, - Err(error) => Some(Err(error)), - } - }) - .boxed() - }; - - ready_stream - .chain(deferred_stream) - .filter_map(move |file_result| { - let request = request.clone(); - let scheduler = Arc::clone(&scheduler); - let limit_remaining = limit_remaining.clone(); - async move { - match file_result { - Ok((index, file)) => { - file_partition(index, file, request, scheduler, limit_remaining) - .transpose() - } - Err(error) => Some(Err(error)), - } - } - }) - .boxed() + Ok(Some(file.scan_plan_root()?)) } } -fn file_partition( - partition_idx: usize, - file: VortexFile, - request: DataSourceScanRequest, - scheduler: Arc, - limit_remaining: Option>, -) -> VortexResult> { - let Some(request) = file_scan_request(partition_idx, &file, request)? 
else { - return Ok(None); - }; - let row_range = request - .row_range - .clone() - .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; - let prepared = Arc::new(PreparedScanPlan::try_new(&file, &request)?); - - Ok(Some(Box::new(ScanPlanPartition { - file, - prepared, - row_range, - index: partition_idx, - scheduler, - limit_remaining, - }))) -} - pub(crate) fn scan_plan_file_stream( file: VortexFile, request: DataSourceScanRequest, ) -> VortexResult { - let dtype = request.projection.return_dtype(file.dtype())?; - let meta = ScanMeta { - label: Some("scan2".to_string()), - }; - let provider = file.session().scan_scheduler_provider(); - let scheduler = provider.scheduler_for_scan(&meta); - - let limit_remaining = request.limit.map(AtomicU64::new).map(Arc::new); - let Some(partition) = file_partition(0, file, request, scheduler, limit_remaining)? else { - return Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( - dtype, - stream::empty(), - ))); - }; - partition.execute() + let root = file.scan_plan_root()?; + scan_plan_stream(root, file.session().clone(), request) } pub(crate) async fn scan_plan_file_statistics( @@ -935,8 +431,8 @@ pub(crate) async fn scan_plan_file_statistics( expr: &Expression, funcs: &[AggregateFnRef], ) -> VortexResult>> { - let mut stats = scan_plan_file_statistics_many(file, std::slice::from_ref(expr), funcs).await?; - Ok(stats.pop().unwrap_or_else(|| absent_statistics(funcs))) + let root = file.scan_plan_root()?; + scan_plan_statistics(root, file.session().clone(), expr, funcs).await } pub(crate) async fn scan_plan_file_statistics_many( @@ -944,66 +440,21 @@ pub(crate) async fn scan_plan_file_statistics_many( exprs: &[Expression], funcs: &[AggregateFnRef], ) -> VortexResult>>> { - let session = file.session().clone(); let root = file.scan_plan_root()?; - let reader = ReadContext::new(session); - let mut result = Vec::with_capacity(exprs.len()); - for expr in exprs { - let plan = if let Some(field_path) = root_field_path(expr) { 
- Arc::clone(&root).prepare_field_stats( - &field_path, - funcs, - &mut PrepareCtx::new(reader.session().clone()), - )? - } else { - let pushed = push_expr(&root, expr, file.dtype(), reader.session())?; - pushed.prepare_field_stats( - &FieldPath::root(), - funcs, - &mut PrepareCtx::new(reader.session().clone()), - )? - }; - let Some(plan) = plan else { - result.push(absent_statistics(funcs)); - continue; - }; - let state = plan.init_state(reader.session())?; - result.push( - plan.stats(0..file.row_count(), &reader, state.as_ref()) - .await?, - ); - } - Ok(result) + scan_plan_statistics_many(root, file.session().clone(), exprs, funcs).await } pub(crate) fn scan_plan_file_splits(file: &VortexFile) -> VortexResult>> { let root = file.scan_plan_root()?; - Ok(split_ranges_from_node(&root, file.row_count())) + Ok(scan_plan_split_ranges(&root)) } pub(crate) async fn scan_plan_file_plan_splits( file: VortexFile, projection: &Expression, ) -> VortexResult>> { - let session = file.session().clone(); let root = file.scan_plan_root()?; - let pushed = push_expr(&root, projection, file.dtype(), &session)?; - let Some(plan) = pushed.prepare_splits(&mut PrepareCtx::new(session.clone()))? 
else { - return Ok(std::iter::once(0..file.row_count()).collect()); - }; - let reader = ReadContext::new(session.clone()); - let state = plan.init_state(&session)?; - plan.splits(0..file.row_count(), &reader, state.as_ref()) - .await -} - -fn split_ranges_from_node(node: &ScanPlanRef, row_count: u64) -> Vec> { - let mut points = Vec::new(); - if let Some(hints) = node.split_hints() { - points.extend_from_slice(hints); - } - let points = normalize_split_points(row_count, points); - natural_split_ranges(&points, None) + scan_plan_projected_splits(root, file.session().clone(), projection).await } pub(crate) fn build_file_scan_plan_root(file: &VortexFile) -> VortexResult { @@ -1018,7 +469,7 @@ pub(crate) fn build_file_scan_plan_root(file: &VortexFile) -> VortexResult FileStatsScanPlan::try_new( Arc::clone(&root), @@ -1031,2429 +482,3 @@ pub(crate) fn build_file_scan_plan_root(file: &VortexFile) -> VortexResult root, }) } - -fn file_scan_request( - partition_idx: usize, - file: &VortexFile, - request: DataSourceScanRequest, -) -> VortexResult> { - let partition_idx_u64 = partition_idx as u64; - if let Some(range) = &request.partition_range - && !range.contains(&partition_idx_u64) - { - return Ok(None); - } - match &request.partition_selection { - Selection::IncludeByIndex(buffer) => { - if buffer.as_slice().binary_search(&partition_idx_u64).is_err() { - return Ok(None); - } - } - Selection::ExcludeByIndex(buffer) => { - if buffer.as_slice().binary_search(&partition_idx_u64).is_ok() { - return Ok(None); - } - } - _ => {} - }; - - let row_count = file.row_count(); - let row_range = request.row_range.clone().unwrap_or(0..row_count); - check_range(&row_range, row_count)?; - - if let Some(filter) = &request.filter - && file.can_prune(filter)? 
- { - return Ok(None); - } - - Ok(Some(DataSourceScanRequest { - row_range: Some(row_range), - ..request - })) -} - -type QueuedWork = ScanTaskBox; - -struct LaunchedWorkOutput { - lane: ScanTaskLane, - reads: Vec, - output: VortexResult, -} - -struct EvidenceWorkOutput { - morsel_id: usize, - predicate_idx: usize, - version: PredicateVersion, - source: EvidenceWorkSource, - fragments: Vec, -} - -struct ScanEvidenceWorkOutput { - execution: Arc, - morsel_id: usize, - predicate_idx: usize, - evidence_idx: usize, - version: PredicateVersion, - fragments: Option>, -} - -enum EvidenceWorkSource { - Provider, - Predicate { input_rows: usize, pass_rows: usize }, -} - -struct ProjectionWorkOutput { - morsel_id: usize, - array: ArrayRef, -} - -enum WorkOutput { - Evidence(EvidenceWorkOutput), - ScanEvidence(ScanEvidenceWorkOutput), - Projection(ProjectionWorkOutput), -} - -enum WorkPoll { - Ready(WorkOutput), - Pending(QueuedWork), -} - -struct ScanEvidenceWaitTask { - execution: Arc, - morsel_id: usize, - predicate_idx: usize, - evidence_idx: usize, - version: PredicateVersion, - lane: ScanTaskLane, - priority: u64, -} - -impl ScanTask for ScanEvidenceWaitTask { - fn morsel_id(&self) -> usize { - self.morsel_id - } - - fn phase(&self) -> ScanIoPhase { - ScanIoPhase::EvidenceProbe - } - - fn lane(&self) -> ScanTaskLane { - self.lane - } - - fn reads(&self) -> &[ScanTaskRead] { - &[] - } - - fn priority(&self) -> u64 { - self.priority - } - - fn into_step(self: Box) -> VortexResult> { - let task = *self; - let morsel_id = task.morsel_id; - let lane = task.lane; - let priority = task.priority; - Ok(ScanStep::new( - morsel_id, - ScanIoPhase::EvidenceProbe, - lane, - Vec::new(), - Vec::new(), - Vec::new(), - move |_| { - if !task.execution.scan_evidence_provider_ready( - task.predicate_idx, - task.evidence_idx, - task.version, - ) && task.execution.predicates[task.predicate_idx].version() == task.version - { - return Ok(ScanStepResult::Continue(Box::new(task))); - } - - 
Ok(ScanStepResult::Ready(WorkOutput::ScanEvidence( - ScanEvidenceWorkOutput { - execution: Arc::clone(&task.execution), - morsel_id: task.morsel_id, - predicate_idx: task.predicate_idx, - evidence_idx: task.evidence_idx, - version: task.version, - fragments: None, - }, - ))) - }, - ) - .with_priority(priority)) - } -} - -struct PredicateReadWorkState { - execution: Arc, - morsel_id: usize, - predicate_idx: usize, - version: PredicateVersion, - range: Range, - need: Mask, - compact: bool, - len: usize, - priority: u64, - lane: ScanTaskLane, -} - -struct PredicateReadWorkTask { - state: PredicateReadWorkState, - step: ReadStep, - reads: Vec, -} - -impl PredicateReadWorkTask { - fn try_new(state: PredicateReadWorkState, task: Box) -> VortexResult { - let step = task.into_step()?; - let reads = ScanTaskRead::from_scan_reads(&step.required_reads); - Ok(Self { state, step, reads }) - } -} - -impl ScanTask for PredicateReadWorkTask { - fn morsel_id(&self) -> usize { - self.state.morsel_id - } - - fn phase(&self) -> ScanIoPhase { - ScanIoPhase::PredicateRead - } - - fn lane(&self) -> ScanTaskLane { - self.state.lane - } - - fn reads(&self) -> &[ScanTaskRead] { - &self.reads - } - - fn priority(&self) -> u64 { - self.state.priority - } - - fn into_step(self: Box) -> VortexResult> { - let task = *self; - let state = task.state; - let morsel_id = state.morsel_id; - let lane = state.lane; - let reads = task.reads.clone(); - let priority = state.priority; - let read_step = task.step; - Ok(ScanStep::new( - morsel_id, - ScanIoPhase::PredicateRead, - lane, - reads, - read_step.required_reads, - read_step.prefetch_reads, - move |results| { - let reader = state.execution.read_context(); - let mut ctx = state.execution.session.create_execution_ctx(); - let array = match read_step.continuation.run(&reader, &mut ctx, results)? 
{ - ReadTaskOutput::Ready(array) => array, - ReadTaskOutput::Continue(read_task) => { - return Ok(ScanStepResult::Continue(Box::new( - PredicateReadWorkTask::try_new(state, read_task)?, - ))); - } - }; - let result = if state.compact { - let compact = array.null_as_false().execute(&mut ctx)?; - if compact.len() != state.need.true_count() { - vortex_bail!( - "compacted residual result length {} does not match demanded row count {}", - compact.len(), - state.need.true_count() - ); - } - state.need.intersect_by_rank(&compact) - } else { - array.null_as_false().execute(&mut ctx)? - }; - if result.len() != state.len { - vortex_bail!( - "residual result length {} does not match morsel length {}", - result.len(), - state.len - ); - } - let pass = &result & &state.need; - let input_rows = state.need.true_count(); - let pass_rows = pass.true_count(); - let exact = !&state.need | &pass; - Ok(ScanStepResult::Ready(WorkOutput::Evidence( - EvidenceWorkOutput { - morsel_id: state.morsel_id, - predicate_idx: state.predicate_idx, - version: state.version, - source: EvidenceWorkSource::Predicate { - input_rows, - pass_rows, - }, - fragments: vec![EvidenceFragment::new( - state.range.clone(), - PredicateEvidenceKind::ExactMask(exact), - )], - }, - ))) - }, - ) - .with_priority(priority)) - } -} - -struct ProjectionReadWorkTask { - execution: Arc, - step: ReadStep, - reads: Vec, - morsel_id: usize, -} - -impl ProjectionReadWorkTask { - fn try_new( - execution: Arc, - task: Box, - morsel_id: usize, - ) -> VortexResult { - let step = task.into_step()?; - let reads = ScanTaskRead::from_scan_reads(&step.required_reads); - Ok(Self { - execution, - step, - reads, - morsel_id, - }) - } -} - -impl ScanTask for ProjectionReadWorkTask { - fn morsel_id(&self) -> usize { - self.morsel_id - } - - fn phase(&self) -> ScanIoPhase { - ScanIoPhase::ProjectionRead - } - - fn lane(&self) -> ScanTaskLane { - ScanTaskLane::Projection - } - - fn reads(&self) -> &[ScanTaskRead] { - &self.reads - } - - fn 
priority(&self) -> u64 { - ScanStep::::DEFAULT_PRIORITY - } - - fn into_step(self: Box) -> VortexResult> { - let task = *self; - let reads = task.reads.clone(); - let read_step = task.step; - Ok(ScanStep::new( - task.morsel_id, - ScanIoPhase::ProjectionRead, - ScanTaskLane::Projection, - reads, - read_step.required_reads, - read_step.prefetch_reads, - move |results| { - let reader = task.execution.read_context(); - let mut ctx = task.execution.session.create_execution_ctx(); - match read_step.continuation.run(&reader, &mut ctx, results)? { - ReadTaskOutput::Ready(array) => Ok(ScanStepResult::Ready( - WorkOutput::Projection(ProjectionWorkOutput { - morsel_id: task.morsel_id, - array, - }), - )), - ReadTaskOutput::Continue(read_task) => Ok(ScanStepResult::Continue(Box::new( - ProjectionReadWorkTask::try_new(task.execution, read_task, task.morsel_id)?, - ))), - } - }, - )) - } -} - -async fn resolve_step_reads(read_store: ReadStoreRef, reads: Vec) -> VortexResult<()> { - let mut pending_reads = FuturesUnordered::new(); - for read in reads { - let key = read.request.key; - if read_store.get(key).is_none() { - pending_reads.push(async move { read.future.await.map(|buffer| (key, buffer)) }); - } - } - while let Some(result) = pending_reads.next().await { - let (key, buffer) = result?; - read_store.insert(key, buffer); - } - Ok(()) -} - -fn prefetch_step_reads(handle: &Handle, read_store: ReadStoreRef, reads: Vec) { - if reads.is_empty() { - return; - } - handle - .spawn(async move { - if let Err(error) = resolve_step_reads(read_store, reads).await { - tracing::debug!( - target: "vortex_file::scan_v2", - ?error, - "scan2 prefetch read failed" - ); - } - }) - .detach(); -} - -async fn run_scan_task_step( - work: QueuedWork, - read_store: ReadStoreRef, - handle: Handle, -) -> VortexResult { - let mut step = work.into_step()?; - let (required_reads, prefetch_reads) = step.take_reads(); - prefetch_step_reads(&handle, Arc::clone(&read_store), prefetch_reads); - 
resolve_step_reads(Arc::clone(&read_store), required_reads).await?; - match step.continue_with(ReadResults::new(Arc::clone(&read_store)))? { - ScanStepResult::Ready(output) => Ok(WorkPoll::Ready(output)), - ScanStepResult::Continue(work) => Ok(WorkPoll::Pending(work)), - } -} - -enum CompletedMorsel { - Empty, - Output(ArrayRef), -} - -struct PlannedMorselWork { - state: MorselState, - evidence: Vec, -} - -struct MorselState { - execution: Arc, - range: Range, - selected: Mask, - evidence: Vec>, - pending_evidence: Vec, - pending_scan_evidence: Vec, - scan_evidence_generation: Vec, - predicate_queued: Vec, - predicate_done: Vec, - next_recheck_predicate: usize, - projection_queued: bool, -} - -#[derive(Default)] -struct ScanEvidenceStore { - predicates: Vec, -} - -#[derive(Default)] -struct PredicateScanEvidenceStore { - generation: u64, - providers: Vec, -} - -#[derive(Default)] -struct ScanEvidenceSlot { - version: Option, - pending: Option, - fragments: Vec, -} - -enum ScanEvidenceAction { - Ready, - Pending, - Prepare, - Wait, -} - -#[derive(Default)] -struct PredicateRuntimeStats { - input_rows: u64, - rejected_rows: u64, -} - -struct PartitionWorkSchedulerState { - pending: VecDeque, - morsels: Vec>, - active_morsels: usize, - has_dynamic_predicates: bool, - in_flight_projection_tasks: usize, - next_morsel_id: usize, - next_emit_morsel_id: usize, - task_queue: ScanTaskQueue, - in_flight: FuturesUnordered>, - read_store: ReadStoreRef, - completed_morsels: BTreeMap, - handle: Handle, - ordered: bool, - plan_window: usize, -} - -fn plan_window_for_limit(limited: bool) -> usize { - if limited { 1 } else { usize::MAX } -} - -fn read_byte_budget(scheduler: &ScanScheduler) -> u64 { - scheduler.config().read_byte_budget().unwrap_or(u64::MAX) -} - -fn partition_work_stream( - morsels: Vec, - handle: Handle, - ordered: bool, - plan_window: usize, - read_byte_budget: u64, -) -> impl futures::Stream> + Send + 'static { - let has_dynamic_predicates = morsels - .iter() - 
.any(|morsel| morsel.execution.has_dynamic_predicates()); - tracing::debug!( - target: "vortex_file::scan_v2", - morsel_count = morsels.len(), - ordered, - plan_window, - read_byte_budget, - has_dynamic_predicates, - "created scan2 task stream" - ); - let state = PartitionWorkSchedulerState { - pending: VecDeque::from(morsels), - morsels: Vec::new(), - active_morsels: 0, - has_dynamic_predicates, - in_flight_projection_tasks: 0, - next_morsel_id: 0, - next_emit_morsel_id: 0, - task_queue: ScanTaskQueue::new(read_byte_budget), - in_flight: FuturesUnordered::new(), - read_store: Arc::new(ReadStore::new()), - completed_morsels: BTreeMap::new(), - handle, - ordered, - plan_window, - }; - - stream::unfold(state, |mut state| async move { - loop { - if let Some(array) = state.pop_ready_output() { - return Some((Ok(array), state)); - } - - while state.active_morsels < state.plan_window && !state.pending.is_empty() { - if let Err(error) = state.plan_next_morsel() { - state.clear(); - return Some((Err(error), state)); - } - } - - while state.launch_next_admissible_work() {} - - if state.in_flight.is_empty() { - if state.is_done() { - return None; - } - let error = vortex_err!( - "scan2 work scheduler stalled: {} active morsels, {} pending morsels, {} evidence work items, {} predicate work items, {} projection work items, {} active read bytes", - state.active_morsels, - state.pending.len(), - state.task_queue.evidence_len(), - state.task_queue.predicate_len(), - state.task_queue.projection_len(), - state.task_queue.active_read_bytes() - ); - state.clear(); - return Some((Err(error), state)); - } - - match state.in_flight.next().await { - Some(output) => { - state.release_reads(output.lane, &output.reads); - match output.output { - Ok(WorkPoll::Ready(output)) => match state.complete_work(output) { - Ok(Some(array)) => return Some((Ok(array), state)), - Ok(None) => continue, - Err(error) => return Some((Err(error), state)), - }, - Ok(WorkPoll::Pending(work)) => { - 
state.task_queue.push(work); - continue; - } - Err(error) => return Some((Err(error), state)), - } - } - None if state.is_done() => return None, - None => continue, - } - } - }) -} - -impl PartitionWorkSchedulerState { - fn clear(&mut self) { - self.pending.clear(); - self.morsels.clear(); - self.active_morsels = 0; - self.in_flight_projection_tasks = 0; - self.next_emit_morsel_id = 0; - self.task_queue.clear(); - self.in_flight = FuturesUnordered::new(); - self.read_store = Arc::new(ReadStore::new()); - self.completed_morsels.clear(); - } - - fn is_done(&self) -> bool { - self.pending.is_empty() - && self.active_morsels == 0 - && self.task_queue.is_empty() - && self.in_flight.is_empty() - && self.completed_morsels.is_empty() - } - - fn plan_next_morsel(&mut self) -> VortexResult<()> { - let Some(morsel) = self.pending.pop_front() else { - return Ok(()); - }; - let morsel_id = self.next_morsel_id; - let range = morsel.range.clone(); - let Some(planned) = morsel.execution.plan_morsel(morsel_id, morsel.range)? 
else { - tracing::trace!( - target: "vortex_file::scan_v2", - morsel_id, - range_start = range.start, - range_end = range.end, - pending_morsels = self.pending.len(), - active_morsels = self.active_morsels, - "scan2 skipped empty morsel" - ); - return Ok(()); - }; - self.next_morsel_id = self.next_morsel_id.saturating_add(1); - self.active_morsels = self.active_morsels.saturating_add(1); - if self.morsels.len() <= morsel_id { - self.morsels.resize_with(morsel_id + 1, || None); - } - self.morsels[morsel_id] = Some(planned.state); - let evidence_len = planned.evidence.len(); - self.task_queue.extend(planned.evidence); - self.enqueue_ready_work(morsel_id)?; - tracing::trace!( - target: "vortex_file::scan_v2", - morsel_id, - range_start = range.start, - range_end = range.end, - pending_morsels = self.pending.len(), - active_morsels = self.active_morsels, - queued_evidence = evidence_len, - evidence_queue_len = self.task_queue.evidence_len(), - predicate_queue_len = self.task_queue.predicate_len(), - projection_queue_len = self.task_queue.projection_len(), - "scan2 planned morsel" - ); - Ok(()) - } - - fn launch_next_admissible_work(&mut self) -> bool { - let in_flight_empty = self.in_flight.is_empty(); - // Backlogged output should stop speculative projection for dynamic scans, but not the - // single projection needed to unblock an otherwise idle ordered stream. 
- let projection_admissible = !self.has_dynamic_predicates - || (self.in_flight_projection_tasks == 0 && !self.has_completed_output_backlog()) - || in_flight_empty; - let morsels = &self.morsels; - let Some(task) = self.task_queue.pop_next_admissible_with_projection_gate( - in_flight_empty, - projection_admissible, - |morsel_id| morsels.get(morsel_id).and_then(Option::as_ref).is_some(), - ) else { - return false; - }; - let (task, lane, reads) = task.into_parts(); - self.launch_admitted(task, lane, reads); - true - } - - fn launch_admitted(&mut self, work: QueuedWork, lane: ScanTaskLane, reads: Vec) { - let morsel_id = work.morsel_id(); - let phase = work.phase(); - let priority = work.priority(); - let bytes = scan_task_read_bytes(&reads); - let read_count = reads.len(); - tracing::trace!( - target: "vortex_file::scan_v2", - morsel_id, - ?phase, - ?lane, - read_count, - read_bytes = bytes, - priority, - in_flight = self.in_flight.len(), - in_flight_projection_tasks = self.in_flight_projection_tasks, - active_morsels = self.active_morsels, - pending_morsels = self.pending.len(), - evidence_queue_len = self.task_queue.evidence_len(), - predicate_queue_len = self.task_queue.predicate_len(), - projection_queue_len = self.task_queue.projection_len(), - active_read_count = self.task_queue.active_read_count(), - active_read_bytes = self.task_queue.active_read_bytes(), - active_evidence_read_bytes = self.task_queue.active_evidence_read_bytes(), - active_predicate_read_bytes = self.task_queue.active_predicate_read_bytes(), - active_projection_read_bytes = self.task_queue.active_projection_read_bytes(), - "scan2 launching work" - ); - let read_store = Arc::clone(&self.read_store); - let handle = self.handle.clone(); - let future = async move { - let output = run_scan_task_step(work, read_store, handle).await; - LaunchedWorkOutput { - lane, - reads, - output, - } - } - .instrument(tracing::trace_span!( - "scan2_work", - morsel_id, - phase = ?phase, - lane = ?lane, - 
read_count, - read_bytes = bytes, - )); - let inline_zero_read = bytes == 0 - && match phase { - ScanIoPhase::EvidenceProbe | ScanIoPhase::EvidenceSetup => { - priority <= INLINE_ZERO_READ_EVIDENCE_MAX_PRIORITY - } - ScanIoPhase::PredicateRead - | ScanIoPhase::ProjectionRead - | ScanIoPhase::AggregateRead => false, - }; - if inline_zero_read { - self.in_flight.push(future.boxed()); - } else { - self.in_flight.push(self.handle.spawn(future).boxed()); - } - if matches!(lane, ScanTaskLane::Projection) { - self.in_flight_projection_tasks = self.in_flight_projection_tasks.saturating_add(1); - } - } - - fn release_reads(&mut self, lane: ScanTaskLane, reads: &[ScanTaskRead]) { - self.task_queue.release_reads(lane, reads); - if matches!(lane, ScanTaskLane::Projection) { - self.in_flight_projection_tasks = self.in_flight_projection_tasks.saturating_sub(1); - } - } - - fn complete_work(&mut self, output: WorkOutput) -> VortexResult> { - match output { - WorkOutput::Evidence(output) => self.complete_evidence(output), - WorkOutput::ScanEvidence(output) => self.complete_scan_evidence(output), - WorkOutput::Projection(output) => { - Ok(self.finish_output_morsel(output.morsel_id, output.array)) - } - } - } - - fn complete_scan_evidence( - &mut self, - output: ScanEvidenceWorkOutput, - ) -> VortexResult> { - if let Some(morsel) = self - .morsels - .get_mut(output.morsel_id) - .and_then(Option::as_mut) - && let Some(pending) = morsel.pending_scan_evidence.get_mut(output.predicate_idx) - { - *pending = pending.saturating_sub(1); - } - - if let Some(fragments) = output.fragments { - output.execution.record_scan_evidence( - output.predicate_idx, - output.evidence_idx, - output.version, - fragments, - )?; - } - - let affected = self - .morsels - .iter() - .enumerate() - .filter_map(|(morsel_id, morsel)| { - morsel - .as_ref() - .filter(|morsel| Arc::ptr_eq(&morsel.execution, &output.execution)) - .map(|_| morsel_id) - }) - .collect::>(); - - for morsel_id in affected { - if self - 
.morsels - .get(morsel_id) - .and_then(Option::as_ref) - .is_none() - { - continue; - } - if self.refresh_morsel_scan_evidence(morsel_id, output.predicate_idx)? { - if let Some(array) = self.finish_empty_morsel(morsel_id) { - return Ok(Some(array)); - } - } else { - self.enqueue_ready_work(morsel_id)?; - } - } - Ok(None) - } - - fn refresh_all_scan_evidence(&mut self, morsel_id: usize) -> VortexResult { - let Some(predicate_count) = self - .morsels - .get(morsel_id) - .and_then(Option::as_ref) - .map(|morsel| morsel.execution.predicates.len()) - else { - return Ok(false); - }; - - for predicate_idx in 0..predicate_count { - if self.refresh_morsel_scan_evidence(morsel_id, predicate_idx)? { - return Ok(true); - } - } - Ok(false) - } - - fn refresh_morsel_scan_evidence( - &mut self, - morsel_id: usize, - predicate_idx: usize, - ) -> VortexResult { - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { - return Ok(false); - }; - let predicate = &morsel.execution.predicates[predicate_idx]; - let version = predicate.version(); - let (generation, fragments) = - morsel - .execution - .scan_evidence_fragments(predicate_idx, version, &morsel.range)?; - let Some(seen_generation) = morsel.scan_evidence_generation.get_mut(predicate_idx) else { - vortex_bail!("missing scan evidence generation slot {predicate_idx}"); - }; - if generation <= *seen_generation { - return Ok(false); - } - *seen_generation = generation; - - let Some(slot) = morsel.evidence.get_mut(predicate_idx) else { - vortex_bail!("missing predicate evidence slot {predicate_idx}"); - }; - if slot - .as_ref() - .is_none_or(|evidence| evidence.version() != version) - { - *slot = Some(PredicateEvidence::new( - predicate.id, - version, - morsel.range.clone(), - )?); - } - let evidence = slot - .as_mut() - .ok_or_else(|| vortex_err!("missing predicate evidence after initialization"))?; - for fragment in fragments { - evidence.absorb(fragment)?; - } - let maybe = evidence.maybe().clone(); - 
let all_false = evidence.all_false(); - morsel.selected = &morsel.selected & &maybe; - Ok(morsel.selected.all_false() || all_false) - } - - fn complete_evidence(&mut self, output: EvidenceWorkOutput) -> VortexResult> { - let mut record_predicate = None; - let finish_empty = { - let Some(morsel) = self - .morsels - .get_mut(output.morsel_id) - .and_then(Option::as_mut) - else { - return Ok(None); - }; - match output.source { - EvidenceWorkSource::Provider => { - let Some(pending) = morsel.pending_evidence.get_mut(output.predicate_idx) - else { - vortex_bail!("missing predicate evidence count {}", output.predicate_idx); - }; - *pending = pending.saturating_sub(1); - } - EvidenceWorkSource::Predicate { - input_rows, - pass_rows, - } => { - let Some(queued) = morsel.predicate_queued.get_mut(output.predicate_idx) else { - vortex_bail!("missing predicate queued slot {}", output.predicate_idx); - }; - *queued = false; - let Some(done) = morsel.predicate_done.get_mut(output.predicate_idx) else { - vortex_bail!("missing predicate done slot {}", output.predicate_idx); - }; - *done = true; - record_predicate = Some(( - Arc::clone(&morsel.execution), - output.predicate_idx, - input_rows, - pass_rows, - )); - } - } - let predicate = &morsel.execution.predicates[output.predicate_idx]; - let Some(slot) = morsel.evidence.get_mut(output.predicate_idx) else { - vortex_bail!("missing predicate evidence slot {}", output.predicate_idx); - }; - if slot - .as_ref() - .is_none_or(|evidence| evidence.version() != output.version) - { - *slot = Some(PredicateEvidence::new( - predicate.id, - output.version, - morsel.range.clone(), - )?); - } - let evidence = slot - .as_mut() - .ok_or_else(|| vortex_err!("missing predicate evidence after initialization"))?; - for fragment in output.fragments { - evidence.absorb(fragment)?; - } - let maybe = evidence.maybe().clone(); - let all_false = evidence.all_false(); - morsel.selected = &morsel.selected & &maybe; - morsel.selected.all_false() || all_false 
- }; - - if let Some((execution, predicate_idx, input_rows, pass_rows)) = record_predicate - && !execution.has_dynamic_predicates() - { - execution.record_predicate_result(predicate_idx, input_rows, pass_rows); - } - - if finish_empty { - return Ok(self.finish_empty_morsel(output.morsel_id)); - } - - self.enqueue_ready_work(output.morsel_id)?; - Ok(None) - } - - fn enqueue_ready_work(&mut self, morsel_id: usize) -> VortexResult<()> { - if self.refresh_all_scan_evidence(morsel_id)? { - self.finish_empty_morsel(morsel_id); - return Ok(()); - } - - if let Some((predicate_idx, need, priority)) = self.choose_ready_predicate(morsel_id)? { - let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { - return Ok(()); - }; - let work = morsel.execution.plan_predicate_work( - morsel_id, - predicate_idx, - morsel.range.clone(), - need, - morsel.execution.predicates[predicate_idx].version(), - priority, - )?; - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { - return Ok(()); - }; - morsel.predicate_queued[predicate_idx] = true; - self.task_queue.push(work); - return Ok(()); - } - - let ready_to_project = self - .morsels - .get(morsel_id) - .and_then(Option::as_ref) - .is_some_and(|morsel| { - !morsel.projection_queued - && morsel.pending_evidence.iter().all(|pending| *pending == 0) - && morsel - .pending_scan_evidence - .iter() - .all(|pending| *pending == 0) - && morsel.predicate_queued.iter().all(|queued| !*queued) - && morsel.predicate_done.iter().all(|done| *done) - }); - if !ready_to_project { - return Ok(()); - } - - if self.enqueue_recheck_evidence(morsel_id)? 
{ - return Ok(()); - } - - let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { - return Ok(()); - }; - let projection = morsel.execution.plan_projection_work( - morsel_id, - morsel.range.clone(), - morsel.selected.clone(), - )?; - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { - return Ok(()); - }; - morsel.projection_queued = true; - match projection { - Some(work) => self.task_queue.push(work), - None => { - self.finish_empty_morsel(morsel_id); - } - } - Ok(()) - } - - fn choose_ready_predicate( - &mut self, - morsel_id: usize, - ) -> VortexResult> { - loop { - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { - return Ok(None); - }; - if morsel.predicate_queued.iter().any(|queued| *queued) { - return Ok(None); - } - let dynamic_scan = morsel.execution.has_dynamic_predicates(); - if dynamic_scan - && (morsel.pending_evidence.iter().any(|pending| *pending != 0) - || morsel - .pending_scan_evidence - .iter() - .any(|pending| *pending != 0)) - { - return Ok(None); - } - - let mut best: Option<(u64, usize, Mask)> = None; - let mut advanced = false; - for predicate_idx in 0..morsel.execution.predicates.len() { - if morsel.predicate_done[predicate_idx] - || morsel.predicate_queued[predicate_idx] - || morsel.pending_evidence[predicate_idx] != 0 - || morsel.pending_scan_evidence[predicate_idx] != 0 - { - continue; - } - if morsel.evidence[predicate_idx].is_none() { - let predicate = &morsel.execution.predicates[predicate_idx]; - morsel.evidence[predicate_idx] = Some(PredicateEvidence::new( - predicate.id, - predicate.version(), - morsel.range.clone(), - )?); - } - let evidence = morsel.evidence[predicate_idx].as_ref().ok_or_else(|| { - vortex_err!( - "missing evidence for predicate {predicate_idx} before residual read" - ) - })?; - let need = &morsel.selected & &evidence.unproven(); - if need.all_false() { - morsel.predicate_done[predicate_idx] = true; - advanced = true; - 
continue; - } - let priority = if dynamic_scan { - u64::try_from(predicate_idx).unwrap_or(u64::MAX) - } else { - morsel - .execution - .predicate_priority(predicate_idx, need.true_count()) - }; - if best.as_ref().is_none_or(|(best_priority, best_idx, _)| { - (priority, predicate_idx) < (*best_priority, *best_idx) - }) { - best = Some((priority, predicate_idx, need)); - } - } - if advanced { - continue; - } - return Ok(best.map(|(priority, predicate_idx, need)| (predicate_idx, need, priority))); - } - } - - fn enqueue_recheck_evidence(&mut self, morsel_id: usize) -> VortexResult { - loop { - let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { - return Ok(false); - }; - if morsel.next_recheck_predicate >= morsel.execution.predicates.len() { - return Ok(false); - } - - let predicate_idx = morsel.next_recheck_predicate; - let predicate = &morsel.execution.predicates[predicate_idx]; - let current_version = predicate.version(); - let evidence_version = morsel.evidence[predicate_idx] - .as_ref() - .map(PredicateEvidence::version) - .unwrap_or(PredicateVersion::STATIC); - let has_dynamic = predicate.dynamic_updates.is_some(); - let has_scan_recheck_evidence = predicate.has_scan_recheck_evidence(); - let has_morsel_recheck_evidence = predicate.has_morsel_recheck_evidence(); - - if has_dynamic && has_scan_recheck_evidence && current_version != evidence_version { - let work = morsel.execution.plan_scan_evidence_work( - morsel_id, - predicate_idx, - current_version, - EvidenceMode::RecheckBeforeProjection, - )?; - if !work.is_empty() { - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) - else { - return Ok(false); - }; - morsel.pending_scan_evidence[predicate_idx] = - morsel.pending_scan_evidence[predicate_idx].saturating_add(work.len()); - self.task_queue.extend(work); - return Ok(true); - } - if self.refresh_morsel_scan_evidence(morsel_id, predicate_idx)? 
{ - self.finish_empty_morsel(morsel_id); - return Ok(true); - } - } - - let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { - return Ok(false); - }; - let evidence_version = morsel.evidence[predicate_idx] - .as_ref() - .map(PredicateEvidence::version) - .unwrap_or(PredicateVersion::STATIC); - - if has_dynamic && has_morsel_recheck_evidence && current_version != evidence_version { - let work = morsel.execution.plan_evidence_work( - morsel_id, - predicate_idx, - morsel.range.clone(), - current_version, - EvidenceMode::RecheckBeforeProjection, - )?; - if work.is_empty() { - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) - else { - return Ok(false); - }; - morsel.next_recheck_predicate = morsel.next_recheck_predicate.saturating_add(1); - continue; - } - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { - return Ok(false); - }; - morsel.pending_evidence[predicate_idx] = - morsel.pending_evidence[predicate_idx].saturating_add(work.len()); - self.task_queue.extend(work); - return Ok(true); - } - - let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { - return Ok(false); - }; - morsel.next_recheck_predicate = morsel.next_recheck_predicate.saturating_add(1); - } - } - - fn finish_empty_morsel(&mut self, morsel_id: usize) -> Option { - if self.finish_morsel(morsel_id) && self.ordered { - self.completed_morsels - .insert(morsel_id, CompletedMorsel::Empty); - return self.pop_ready_output(); - } - None - } - - fn finish_output_morsel(&mut self, morsel_id: usize, array: ArrayRef) -> Option { - if !self.finish_morsel(morsel_id) { - return None; - } - if self.ordered { - self.completed_morsels - .insert(morsel_id, CompletedMorsel::Output(array)); - self.pop_ready_output() - } else { - Some(array) - } - } - - fn finish_morsel(&mut self, morsel_id: usize) -> bool { - if let Some(slot) = self.morsels.get_mut(morsel_id) - && slot.take().is_some() - { - 
self.active_morsels = self.active_morsels.saturating_sub(1); - return true; - } - false - } - - fn pop_ready_output(&mut self) -> Option { - if !self.ordered { - return None; - } - loop { - match self.completed_morsels.remove(&self.next_emit_morsel_id) { - Some(CompletedMorsel::Empty) => { - self.next_emit_morsel_id = self.next_emit_morsel_id.saturating_add(1); - } - Some(CompletedMorsel::Output(array)) => { - self.next_emit_morsel_id = self.next_emit_morsel_id.saturating_add(1); - return Some(array); - } - None => return None, - } - } - } - - fn has_completed_output_backlog(&self) -> bool { - self.completed_morsels - .values() - .any(|morsel| matches!(morsel, CompletedMorsel::Output(_))) - } -} - -struct ScanPlanPartition { - file: VortexFile, - prepared: Arc, - row_range: Range, - index: usize, - scheduler: Arc, - limit_remaining: Option>, -} - -impl Partition for ScanPlanPartition { - fn as_any(&self) -> &dyn Any { - self - } - - fn index(&self) -> usize { - self.index - } - - fn row_count(&self) -> Precision { - let row_count = self.row_range.end - self.row_range.start; - let row_count = self.prepared.selection().row_count(row_count); - let row_count = self - .prepared - .limit() - .map_or(row_count, |limit| row_count.min(limit)); - - if self.prepared.has_filter() { - Precision::inexact(row_count) - } else { - Precision::exact(row_count) - } - } - - fn byte_size(&self) -> Precision { - Precision::Absent - } - - fn execute(self: Box) -> VortexResult { - let ScanPlanPartition { - file, - prepared, - row_range, - index: _, - scheduler, - limit_remaining, - } = *self; - - let execution = Arc::new(ScanExecution::try_new(file, prepared, limit_remaining)?); - let handle = execution.session.handle(); - let dtype = execution.plan.dtype().clone(); - let ranges = execution.splits(&row_range)?; - let ordered = execution.plan.ordered(); - let plan_window = plan_window_for_limit(execution.limit_remaining.is_some()); - let read_byte_budget = read_byte_budget(&scheduler); - 
let morsels = ranges - .into_iter() - .map(|range| PlannedScanPlanMorsel { - execution: Arc::clone(&execution), - range, - }) - .collect::>(); - - let stream = partition_work_stream(morsels, handle, ordered, plan_window, read_byte_budget); - - Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( - dtype, stream, - ))) - } -} - -struct PlannedScanPlanScan { - dtype: DType, - partitions: Vec>, - handle: Handle, - read_byte_budget: u64, -} - -#[derive(Clone)] -struct PlannedScanPlanMorsel { - execution: Arc, - range: Range, -} - -impl PlannedMorselScan for PlannedScanPlanScan { - fn dtype(&self) -> &DType { - &self.dtype - } - - fn partition_count(&self) -> usize { - self.partitions.len() - } - - fn partition(self: Arc, partition: usize) -> VortexResult { - if partition >= self.partitions.len() { - vortex_bail!( - "planned scan partition {partition} is outside 0..{}", - self.partitions.len() - ); - } - - Ok(Box::new(PlannedScanPlanPartition { - planned: self, - index: partition, - })) - } -} - -struct PlannedScanPlanPartition { - planned: Arc, - index: usize, -} - -impl Partition for PlannedScanPlanPartition { - fn as_any(&self) -> &dyn Any { - self - } - - fn index(&self) -> usize { - self.index - } - - fn row_count(&self) -> Precision { - let mut row_count = 0u64; - let mut has_filter = false; - - for morsel in &self.planned.partitions[self.index] { - let range_len = morsel.range.end - morsel.range.start; - row_count = - row_count.saturating_add(morsel.execution.plan.selection().row_count(range_len)); - has_filter |= morsel.execution.plan.has_filter(); - } - - if has_filter { - Precision::inexact(row_count) - } else { - Precision::exact(row_count) - } - } - - fn byte_size(&self) -> Precision { - Precision::Absent - } - - fn execute(self: Box) -> VortexResult { - let PlannedScanPlanPartition { planned, index } = *self; - let morsels = planned.partitions[index].clone(); - let dtype = planned.dtype.clone(); - let handle = planned.handle.clone(); - let stream = - 
partition_work_stream(morsels, handle, false, usize::MAX, planned.read_byte_budget); - - Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( - dtype, stream, - ))) - } -} - -struct PreparedScanPlan { - // Request-level physical plan after pushdown. This must stay free of per-scan IO state. - dtype: DType, - selection: Selection, - ordered: bool, - limit: Option, - row_count: u64, - split_hints: Vec, - projection: ScanPlanRef, - predicates: Vec, -} - -struct PreparedPredicatePlan { - id: PredicateId, - expr: Expression, - plan: ScanPlanRef, -} - -struct ScanExecution { - // Runtime instantiation of a prepared plan: source binding, prepared handles, and scan state. - session: VortexSession, - plan: Arc, - limit_remaining: Option>, - projection: PreparedReadRef, - predicates: Vec, - predicate_stats: Mutex>, - scan_evidence: Mutex, -} - -struct ExecutionPredicate { - id: PredicateId, - expr: Expression, - static_cost: u64, - dynamic_updates: Option, - read: PreparedReadRef, - evidence: Vec, -} - -impl ExecutionPredicate { - fn version(&self) -> PredicateVersion { - self.dynamic_updates - .as_ref() - .map(|updates| PredicateVersion::new(updates.version())) - .unwrap_or(PredicateVersion::STATIC) - } - - fn has_morsel_recheck_evidence(&self) -> bool { - self.evidence - .iter() - .any(|plan| plan.scope() == EvidenceScope::Morsel && plan.recheck_before_projection()) - } - - fn has_scan_recheck_evidence(&self) -> bool { - self.evidence - .iter() - .any(|plan| plan.scope() == EvidenceScope::Scan && plan.recheck_before_projection()) - } -} - -impl PreparedScanPlan { - fn try_new(file: &VortexFile, request: &DataSourceScanRequest) -> VortexResult { - let session = file.session().clone(); - let dtype = request.projection.return_dtype(file.dtype())?; - let projection = request.projection.optimize_recursive(file.dtype())?; - let filter = request - .filter - .clone() - .map(|filter| filter.optimize_recursive(file.dtype())) - .transpose()?; - - let root = file.scan_plan_root()?; - let 
projection_pushed = push_expr(&root, &projection, file.dtype(), &session)?; - let mut split_hints = Vec::new(); - extend_split_hints(&projection_pushed, &mut split_hints); - - // Run cheap, likely-selective conjuncts first so an expensive residual (e.g. an FSST `LIKE`) - // only evaluates over the rows that survive the cheaper predicates. AND is commutative, so - // reordering is semantically safe; `PredicateId`s are assigned by final slot below (after the - // sort) so each predicate's evidence/read stay self-consistent with its id. - let mut ordered_conjuncts = filter.as_ref().map(conjuncts).unwrap_or_default(); - ordered_conjuncts.sort_by_cached_key(predicate_cost); - let predicates = ordered_conjuncts - .into_iter() - .enumerate() - .map(|(idx, expr)| { - let id = PredicateId::new( - u32::try_from(idx).map_err(|_| vortex_err!("too many predicates"))?, - ); - let pushed = push_expr(&root, &expr, file.dtype(), &session)?; - extend_split_hints(&pushed, &mut split_hints); - Ok(PreparedPredicatePlan { - id, - expr, - plan: pushed, - }) - }) - .collect::>>()?; - - Ok(Self { - dtype, - selection: request.selection.clone(), - ordered: request.ordered, - limit: request.limit, - row_count: file.row_count(), - split_hints, - projection: projection_pushed, - predicates, - }) - } - - fn dtype(&self) -> &DType { - &self.dtype - } - - fn selection(&self) -> &Selection { - &self.selection - } - - fn ordered(&self) -> bool { - self.ordered - } - - fn limit(&self) -> Option { - self.limit - } - - fn predicates(&self) -> &[PreparedPredicatePlan] { - &self.predicates - } - - fn has_filter(&self) -> bool { - !self.predicates.is_empty() - } - - fn projection(&self) -> &ScanPlanRef { - &self.projection - } - - fn splits(&self, row_range: &Range) -> VortexResult>> { - check_range(row_range, self.row_count)?; - let (splits, split_kind) = prepare_split_ranges( - self.row_count, - row_range, - &self.selection, - self.split_hints.clone(), - ); - trace_prepared_splits(row_range, &splits, 
split_kind, self.has_filter()); - Ok(splits) - } -} - -impl ScanExecution { - fn try_new( - file: VortexFile, - plan: Arc, - limit_remaining: Option>, - ) -> VortexResult { - let session = file.session().clone(); - let mut prepare_ctx = - PrepareCtx::with_state_cache(session.clone(), file.scan_plan_state_cache()); - let projection = Arc::clone(plan.projection()) - .prepare_read(&mut prepare_ctx)? - .ok_or_else(|| vortex_err!("scan2 could not plan read for pushed projection"))?; - let predicates = plan - .predicates() - .iter() - .map(|predicate| { - let read = Arc::clone(&predicate.plan) - .prepare_read(&mut prepare_ctx)? - .ok_or_else(|| { - vortex_err!("scan2 could not plan predicate read {}", predicate.expr) - })?; - let evidence = Arc::clone(&predicate.plan).prepare_evidence(&mut prepare_ctx)?; - let dynamic_updates = DynamicExprUpdates::new(&predicate.expr); - Ok(ExecutionPredicate { - id: predicate.id, - expr: predicate.expr.clone(), - static_cost: predicate_cost(&predicate.expr), - dynamic_updates, - read, - evidence, - }) - }) - .collect::>>()?; - let predicate_stats = (0..predicates.len()) - .map(|_| PredicateRuntimeStats::default()) - .collect(); - let scan_evidence = ScanEvidenceStore { - predicates: predicates - .iter() - .map(|predicate| PredicateScanEvidenceStore { - generation: 0, - providers: predicate - .evidence - .iter() - .map(|_| ScanEvidenceSlot::default()) - .collect(), - }) - .collect(), - }; - - Ok(Self { - session, - plan, - limit_remaining, - projection, - predicates, - predicate_stats: Mutex::new(predicate_stats), - scan_evidence: Mutex::new(scan_evidence), - }) - } - - fn read_context(&self) -> ReadContext { - ReadContext::new(self.session.clone()) - } - - fn predicate_priority(&self, predicate_idx: usize, demand_rows: usize) -> u64 { - let predicate = &self.predicates[predicate_idx]; - let static_cost = predicate.static_cost.max(1); - let demand_rows = u64::try_from(demand_rows).unwrap_or(u64::MAX).max(1); - let stats = 
self.predicate_stats.lock(); - let stats = &stats[predicate_idx]; - let rejection_per_mille = if stats.input_rows >= 1024 { - stats.rejected_rows.saturating_mul(1000) / stats.input_rows.max(1) - } else { - // Before feedback exists, preserve the existing static cheap-first ordering while still - // giving every predicate a nonzero expected benefit. - 500 - } - .max(1); - let expected_rejected = demand_rows.saturating_mul(rejection_per_mille) / 1000; - static_cost.saturating_mul(1_000_000) / expected_rejected.max(1) - } - - fn has_dynamic_predicates(&self) -> bool { - self.predicates - .iter() - .any(|predicate| predicate.dynamic_updates.is_some()) - } - - fn record_predicate_result(&self, predicate_idx: usize, input_rows: usize, pass_rows: usize) { - let input_rows = u64::try_from(input_rows).unwrap_or(u64::MAX); - let pass_rows = u64::try_from(pass_rows).unwrap_or(u64::MAX); - let rejected_rows = input_rows.saturating_sub(pass_rows); - let mut stats = self.predicate_stats.lock(); - let stats = &mut stats[predicate_idx]; - stats.input_rows = stats.input_rows.saturating_add(input_rows); - stats.rejected_rows = stats.rejected_rows.saturating_add(rejected_rows); - } - - fn use_scan_scope_evidence(&self, predicate_idx: usize, mode: EvidenceMode) -> bool { - mode == EvidenceMode::RecheckBeforeProjection - || self.predicates[predicate_idx].static_cost >= SCAN_SCOPE_MIN_PREDICATE_COST - } - - fn plan_morsel( - self: &Arc, - morsel_id: usize, - range: Range, - ) -> VortexResult> { - let selected = self.plan.selection().row_mask(&range).mask().clone(); - if selected.all_false() { - return Ok(None); - } - - let mut evidence = Vec::new(); - let mut pending_evidence = Vec::with_capacity(self.predicates.len()); - let mut pending_scan_evidence = Vec::with_capacity(self.predicates.len()); - for predicate_idx in 0..self.predicates.len() { - let version = self.predicates[predicate_idx].version(); - let scan_work = self.plan_scan_evidence_work( - morsel_id, - predicate_idx, - 
version, - EvidenceMode::Normal, - )?; - pending_scan_evidence.push(scan_work.len()); - evidence.extend(scan_work); - - let morsel_work = self.plan_evidence_work( - morsel_id, - predicate_idx, - range.clone(), - version, - EvidenceMode::Normal, - )?; - pending_evidence.push(morsel_work.len()); - evidence.extend(morsel_work); - } - - let state = MorselState { - execution: Arc::clone(self), - range, - selected, - evidence: (0..self.predicates.len()).map(|_| None).collect(), - pending_evidence, - pending_scan_evidence, - scan_evidence_generation: vec![0; self.predicates.len()], - predicate_queued: vec![false; self.predicates.len()], - predicate_done: vec![false; self.predicates.len()], - next_recheck_predicate: 0, - projection_queued: false, - }; - - Ok(Some(PlannedMorselWork { state, evidence })) - } - - fn reserve_scan_evidence( - &self, - predicate_idx: usize, - evidence_idx: usize, - version: PredicateVersion, - create_waiter: bool, - ) -> VortexResult { - let mut store = self.scan_evidence.lock(); - let slot = store - .predicates - .get_mut(predicate_idx) - .and_then(|predicate| predicate.providers.get_mut(evidence_idx)) - .ok_or_else(|| { - vortex_err!( - "missing scan evidence slot for predicate {predicate_idx} provider {evidence_idx}" - ) - })?; - if slot.version == Some(version) { - return Ok(ScanEvidenceAction::Ready); - } - if slot.pending == Some(version) { - if !create_waiter { - return Ok(ScanEvidenceAction::Pending); - } - return Ok(ScanEvidenceAction::Wait); - } - - // Any older version is superseded. Polling waiters observe the version change and - // re-enter planning for the current dynamic boundary. 
- slot.pending = Some(version); - Ok(ScanEvidenceAction::Prepare) - } - - fn clear_scan_evidence_pending( - &self, - predicate_idx: usize, - evidence_idx: usize, - version: PredicateVersion, - ) { - let mut store = self.scan_evidence.lock(); - let Some(slot) = store - .predicates - .get_mut(predicate_idx) - .and_then(|predicate| predicate.providers.get_mut(evidence_idx)) - else { - return; - }; - if slot.pending == Some(version) { - slot.pending = None; - } - } - - fn scan_evidence_provider_ready( - &self, - predicate_idx: usize, - evidence_idx: usize, - version: PredicateVersion, - ) -> bool { - self.scan_evidence - .lock() - .predicates - .get(predicate_idx) - .and_then(|predicate| predicate.providers.get(evidence_idx)) - .is_some_and(|slot| slot.version == Some(version)) - } - - fn record_scan_evidence( - &self, - predicate_idx: usize, - evidence_idx: usize, - version: PredicateVersion, - mut fragments: Vec, - ) -> VortexResult { - fragments.sort_by_key(|fragment| (fragment.rows.start, fragment.rows.end)); - let mut store = self.scan_evidence.lock(); - let predicate = store - .predicates - .get_mut(predicate_idx) - .ok_or_else(|| vortex_err!("missing scan evidence predicate slot {predicate_idx}"))?; - let slot = predicate.providers.get_mut(evidence_idx).ok_or_else(|| { - vortex_err!( - "missing scan evidence provider slot {evidence_idx} for predicate {predicate_idx}" - ) - })?; - - if slot.pending != Some(version) && slot.version != Some(version) { - return Ok(false); - } - - slot.version = Some(version); - slot.pending = None; - slot.fragments = fragments; - predicate.generation = predicate.generation.saturating_add(1); - Ok(true) - } - - fn scan_evidence_fragments( - &self, - predicate_idx: usize, - version: PredicateVersion, - range: &Range, - ) -> VortexResult<(u64, Vec)> { - let store = self.scan_evidence.lock(); - let Some(predicate) = store.predicates.get(predicate_idx) else { - vortex_bail!("missing scan evidence predicate slot {predicate_idx}"); - }; - 
let generation = predicate.generation; - let mut fragments = Vec::new(); - for slot in &predicate.providers { - if slot.version == Some(version) { - push_overlapping_fragments(&slot.fragments, range, &mut fragments)?; - } - } - Ok((generation, fragments)) - } - - fn plan_scan_evidence_work( - self: &Arc, - morsel_id: usize, - predicate_idx: usize, - version: PredicateVersion, - mode: EvidenceMode, - ) -> VortexResult> { - if !self.use_scan_scope_evidence(predicate_idx, mode) { - return Ok(Vec::new()); - } - - let predicate = &self.predicates[predicate_idx]; - let predicate_idx_u32 = - u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; - let mut work = Vec::new(); - for (evidence_idx, plan) in predicate.evidence.iter().enumerate() { - if plan.scope() != EvidenceScope::Scan { - continue; - } - if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { - continue; - } - - let evidence_idx_u32 = - u32::try_from(evidence_idx).map_err(|_| vortex_err!("too many evidence plans"))?; - let priority = plan - .cost( - &OwnedEvidenceRequest { - id: predicate.id, - version, - predicate: predicate.expr.clone(), - range: 0..self.plan.row_count, - mode, - } - .as_request(), - ) - .priority(0, mode == EvidenceMode::RecheckBeforeProjection) - .saturating_add(predicate.static_cost); - - let create_waiter = mode == EvidenceMode::RecheckBeforeProjection; - match self.reserve_scan_evidence(predicate_idx, evidence_idx, version, create_waiter)? 
{ - ScanEvidenceAction::Ready => {} - ScanEvidenceAction::Pending => {} - ScanEvidenceAction::Wait => { - work.push(Box::new(ScanEvidenceWaitTask { - execution: Arc::clone(self), - morsel_id, - predicate_idx, - evidence_idx, - version, - lane: ScanTaskLane::ScanEvidence { - predicate_idx: predicate_idx_u32, - evidence_idx: evidence_idx_u32, - }, - priority, - }) as QueuedWork); - } - ScanEvidenceAction::Prepare => { - let req = OwnedEvidenceRequest { - id: predicate.id, - version, - predicate: predicate.expr.clone(), - range: 0..self.plan.row_count, - mode, - }; - let result = (|| { - let task = Arc::clone(plan) - .create_task(req.clone(), ScanIoPhase::EvidenceProbe)?; - let step = task.into_step()?; - let work_reads = ScanTaskRead::from_scan_reads(&step.required_reads); - let priority = plan - .cost(&req.as_request()) - .priority( - scan_task_read_bytes(&work_reads), - mode == EvidenceMode::RecheckBeforeProjection, - ) - .saturating_add(predicate.static_cost); - let execution = Arc::clone(self); - Ok(ScanStep::new( - morsel_id, - ScanIoPhase::EvidenceProbe, - ScanTaskLane::ScanEvidence { - predicate_idx: predicate_idx_u32, - evidence_idx: evidence_idx_u32, - }, - work_reads, - step.required_reads, - step.prefetch_reads, - move |results| { - let reader = execution.read_context(); - let fragments = step.continuation.run(&reader, results)?; - Ok(ScanStepResult::Ready(WorkOutput::ScanEvidence( - ScanEvidenceWorkOutput { - execution, - morsel_id, - predicate_idx, - evidence_idx, - version, - fragments: Some(fragments), - }, - ))) - }, - ) - .with_priority(priority) - .boxed()) - })(); - match result { - Ok(task) => work.push(task), - Err(error) => { - self.clear_scan_evidence_pending(predicate_idx, evidence_idx, version); - return Err(error); - } - } - } - } - } - Ok(work) - } - - fn plan_evidence_work( - self: &Arc, - morsel_id: usize, - predicate_idx: usize, - range: Range, - version: PredicateVersion, - mode: EvidenceMode, - ) -> VortexResult> { - let predicate = 
&self.predicates[predicate_idx]; - let req = OwnedEvidenceRequest { - id: predicate.id, - version, - predicate: predicate.expr.clone(), - range, - mode, - }; - let predicate_idx_u32 = - u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; - let mut work = Vec::with_capacity(predicate.evidence.len()); - for (evidence_idx, plan) in predicate.evidence.iter().enumerate() { - if plan.scope() == EvidenceScope::Scan - && self.use_scan_scope_evidence(predicate_idx, mode) - { - continue; - } - if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { - continue; - } - let evidence_idx_u32 = - u32::try_from(evidence_idx).map_err(|_| vortex_err!("too many evidence plans"))?; - let task = Arc::clone(plan).create_task(req.clone(), ScanIoPhase::EvidenceProbe)?; - let step = task.into_step()?; - let work_reads = ScanTaskRead::from_scan_reads(&step.required_reads); - let priority = plan - .cost(&req.as_request()) - .priority( - scan_task_read_bytes(&work_reads), - mode == EvidenceMode::RecheckBeforeProjection, - ) - .saturating_add(predicate.static_cost); - let execution = Arc::clone(self); - work.push( - ScanStep::new( - morsel_id, - ScanIoPhase::EvidenceProbe, - ScanTaskLane::Evidence { - predicate_idx: predicate_idx_u32, - evidence_idx: evidence_idx_u32, - }, - work_reads, - step.required_reads, - step.prefetch_reads, - move |results| { - let reader = execution.read_context(); - let fragments = step.continuation.run(&reader, results)?; - Ok(ScanStepResult::Ready(WorkOutput::Evidence( - EvidenceWorkOutput { - morsel_id, - predicate_idx, - version, - source: EvidenceWorkSource::Provider, - fragments, - }, - ))) - }, - ) - .with_priority(priority) - .boxed(), - ); - } - Ok(work) - } - - fn plan_predicate_work( - self: &Arc, - morsel_id: usize, - predicate_idx: usize, - range: Range, - need: Mask, - version: PredicateVersion, - priority: u64, - ) -> VortexResult { - let len = range_len(&range)?; - let predicate = 
&self.predicates[predicate_idx]; - let compact = need.density() < EXPR_EVAL_THRESHOLD; - let rows = if compact { - OwnedRowScope::selected(need.clone()) - } else { - OwnedRowScope::try_new(Mask::new_true(len), need.clone())? - }; - let phase = ScanIoPhase::PredicateRead; - let task = Arc::clone(&predicate.read).create_task(range.clone(), rows, phase)?; - - let predicate_idx_u32 = - u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; - let state = PredicateReadWorkState { - execution: Arc::clone(self), - morsel_id, - predicate_idx, - version, - range, - need, - compact, - len, - priority, - lane: ScanTaskLane::Predicate { - predicate_idx: predicate_idx_u32, - }, - }; - Ok(Box::new(PredicateReadWorkTask::try_new(state, task)?)) - } - - fn plan_projection_work( - self: &Arc, - morsel_id: usize, - range: Range, - selected: Mask, - ) -> VortexResult> { - // Projection consumes the final selected rows after every predicate plan has contributed - // metadata evidence and, if needed, exact residual evidence. There is no separate - // predicate-demand mask at this point. - let len = range_len(&range)?; - let selected = if let Some(limit_remaining) = &self.limit_remaining { - limit_mask(selected, limit_remaining)? 
- } else { - selected - }; - if selected.all_false() { - return Ok(None); - } - if selected.len() != len { - vortex_bail!( - "scan2 projection selection length {} does not match range length {len}", - selected.len() - ); - } - - let rows = OwnedRowScope::selected(selected); - let phase = ScanIoPhase::ProjectionRead; - let task = Arc::clone(&self.projection).create_task(range, rows, phase)?; - - let execution = Arc::clone(self); - Ok(Some(Box::new(ProjectionReadWorkTask::try_new( - execution, task, morsel_id, - )?))) - } - - fn splits(&self, row_range: &Range) -> VortexResult>> { - self.plan.splits(row_range) - } -} - -fn push_overlapping_fragments( - fragments: &[EvidenceFragment], - range: &Range, - output: &mut Vec, -) -> VortexResult<()> { - let start = fragments - .partition_point(|fragment| fragment.rows.start < range.start) - .saturating_sub(1); - for fragment in &fragments[start..] { - if fragment.rows.start >= range.end { - break; - } - if let Some(fragment) = slice_evidence_fragment(fragment, range)? { - output.push(fragment); - } - } - Ok(()) -} - -fn slice_evidence_fragment( - fragment: &EvidenceFragment, - range: &Range, -) -> VortexResult> { - let rows = fragment.rows.start.max(range.start)..fragment.rows.end.min(range.end); - if rows.start >= rows.end { - return Ok(None); - } - if rows == fragment.rows { - return Ok(Some(fragment.clone())); - } - - let local = usize::try_from(rows.start - fragment.rows.start) - .map_err(|_| vortex_err!("evidence fragment exceeds usize"))? 
- ..usize::try_from(rows.end - fragment.rows.start) - .map_err(|_| vortex_err!("evidence fragment exceeds usize"))?; - let kind = match &fragment.kind { - PredicateEvidenceKind::AllFalse => PredicateEvidenceKind::AllFalse, - PredicateEvidenceKind::AllTrue => PredicateEvidenceKind::AllTrue, - PredicateEvidenceKind::Unknown => PredicateEvidenceKind::Unknown, - PredicateEvidenceKind::ExactMask(mask) => { - PredicateEvidenceKind::ExactMask(mask.slice(local)) - } - PredicateEvidenceKind::CandidateMask(mask) => { - PredicateEvidenceKind::CandidateMask(mask.slice(local)) - } - }; - Ok(Some(EvidenceFragment::new(rows, kind))) -} - -fn push_expr( - root: &ScanPlanRef, - expr: &Expression, - dtype: &DType, - session: &VortexSession, -) -> VortexResult { - validate_temporal_comparisons(expr, dtype)?; - Arc::clone(root) - .try_push_expr(expr, &mut PushCtx::new(session.clone()))? - .ok_or_else(|| vortex_err!("scan2 could not push expression {expr}")) -} - -fn extend_split_hints(plan: &ScanPlanRef, points: &mut Vec) { - if let Some(hints) = plan.split_hints() { - points.extend_from_slice(hints); - } -} - -#[derive(Clone, Copy, Debug)] -enum PreparedSplitKind { - SelectionRanges, - Natural, -} - -fn prepare_split_ranges( - row_count: u64, - row_range: &Range, - selection: &Selection, - split_hints: Vec, -) -> (Vec>, PreparedSplitKind) { - let explicit_row_range = explicit_row_range(row_count, row_range); - if let Some(ranges) = selection_split_ranges(selection, explicit_row_range) { - return (ranges, PreparedSplitKind::SelectionRanges); - } - - let file_range = 0..row_count; - let selection_range = intersect_ranges(Some(&file_range), selection_bounding_range(selection)); - let bounded_range = intersect_ranges(explicit_row_range, selection_range); - let points = normalize_split_points(row_count, split_hints); - ( - natural_split_ranges(&points, bounded_range.as_ref()), - PreparedSplitKind::Natural, - ) -} - -fn explicit_row_range(row_count: u64, row_range: &Range) -> 
Option<&Range> { - (row_range.start != 0 || row_range.end != row_count).then_some(row_range) -} - -fn selection_split_ranges( - selection: &Selection, - row_range: Option<&Range>, -) -> Option>> { - let Selection::IncludeByIndex(buffer) = selection else { - return None; - }; - if row_range.is_some() { - return None; - } - - let indices = buffer.as_slice(); - if indices.is_empty() { - return Some(Vec::new()); - } - debug_assert!(indices.is_sorted()); - - let mut ranges = Vec::with_capacity((indices.len() as u64 / MAX_SELECTION_RANGE_SIZE) as usize); - let mut curr_start = indices[0]; - let mut curr_end = indices[0].saturating_add(1); - for &idx in &indices[1..] { - let idx_end = idx.saturating_add(1); - let new_range_size = idx_end.saturating_sub(curr_start); - let gap = idx_end.saturating_sub(curr_end); - if new_range_size >= MAX_SELECTION_RANGE_SIZE { - if gap >= MIN_SELECTION_GAP_BETWEEN_RANGES { - ranges.push(curr_start..curr_end); - curr_start = idx; - curr_end = idx_end; - } else { - return None; - } - } else { - curr_end = idx_end; - } - } - ranges.push(curr_start..curr_end); - Some(ranges) -} - -fn selection_bounding_range(selection: &Selection) -> Option> { - match selection { - Selection::IncludeByIndex(buffer) => { - let indices = buffer.as_slice(); - indices - .first() - .zip(indices.last()) - .map(|(&first, &last)| first..last.saturating_add(1)) - } - Selection::IncludeRoaring(roaring) if !roaring.is_empty() => { - Some(roaring.min()?..roaring.max()?.saturating_add(1)) - } - _ => None, - } -} - -fn intersect_ranges(left: Option<&Range>, right: Option>) -> Option> { - match (left, right) { - (Some(left), Some(right)) => Some(left.start.max(right.start)..left.end.min(right.end)), - (Some(left), None) => Some(left.clone()), - (None, Some(right)) => Some(right), - (None, None) => None, - } -} - -fn normalize_split_points(row_count: u64, mut hints: Vec) -> Vec { - hints.push(0); - hints.push(row_count); - hints.retain(|&hint| hint <= row_count); - 
hints.sort_unstable(); - hints.dedup(); - hints -} - -fn natural_split_ranges(split_points: &[u64], row_range: Option<&Range>) -> Vec> { - let points = if let Some(row_range) = row_range { - if row_range.start >= row_range.end { - return Vec::new(); - } - let mut points = Vec::new(); - points.push(row_range.start); - points.extend( - split_points - .iter() - .copied() - .filter(|&point| row_range.start < point && point < row_range.end), - ); - points.push(row_range.end); - points.sort_unstable(); - points.dedup(); - points - } else { - split_points.to_vec() - }; - - points - .windows(2) - .filter_map(|window| { - let range = window[0]..window[1]; - (range.start < range.end).then_some(range) - }) - .collect() -} - -fn trace_prepared_splits( - row_range: &Range, - splits: &[Range], - split_kind: PreparedSplitKind, - has_filter: bool, -) { - tracing::debug!( - target: "vortex_file::scan_v2", - ?split_kind, - split_count = splits.len(), - row_start = row_range.start, - row_end = row_range.end, - first_split = ?splits.first(), - last_split = ?splits.last(), - has_filter, - "prepared scan2 splits" - ); - tracing::trace!( - target: "vortex_file::scan_v2", - ?splits, - "prepared scan2 split ranges" - ); -} - -fn check_range(range: &Range, row_count: u64) -> VortexResult<()> { - if range.start > range.end || range.end > row_count { - vortex_bail!( - "scan2 row range {:?} is out of bounds for row count {}", - range, - row_count - ); - } - range_len(range).map(|_| ()) -} - -fn range_len(range: &Range) -> VortexResult { - let len = range - .end - .checked_sub(range.start) - .ok_or_else(|| vortex_err!("scan2 row range end is before start: {range:?}"))?; - usize::try_from(len).map_err(|_| vortex_err!("scan2 row range exceeds usize")) -} - -fn limit_mask(mask: Mask, remaining: &AtomicU64) -> VortexResult { - let true_count = mask.true_count(); - let true_count = - u64::try_from(true_count).map_err(|_| vortex_err!("mask count exceeds u64"))?; - - loop { - let available = 
remaining.load(Ordering::Acquire); - if available == 0 { - return Ok(Mask::new_false(mask.len())); - } - - let take = true_count.min(available); - if remaining - .compare_exchange_weak( - available, - available - take, - Ordering::AcqRel, - Ordering::Acquire, - ) - .is_err() - { - continue; - } - - if take == true_count { - return Ok(mask); - } - - let take = usize::try_from(take).unwrap_or(usize::MAX); - return Ok(Mask::from_indices( - mask.len(), - (0..mask.len()).filter(|idx| mask.value(*idx)).take(take), - )); - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - use std::sync::atomic::AtomicU64; - use std::sync::atomic::Ordering; - - use vortex_array::expr::get_item; - use vortex_array::expr::like; - use vortex_array::expr::lit; - use vortex_array::expr::not_eq; - use vortex_array::expr::root; - use vortex_error::VortexResult; - use vortex_error::vortex_err; - use vortex_mask::Mask; - - use super::limit_mask; - use super::predicate_cost; - - #[test] - fn predicate_cost_orders_cheap_before_expensive() { - let cheap = not_eq(get_item("search", root()), lit("")); - let expensive = like(get_item("url", root()), lit("%google%")); - assert!( - predicate_cost(&cheap) < predicate_cost(&expensive), - "primitive comparison must be cheaper than LIKE: cheap={}, expensive={}", - predicate_cost(&cheap), - predicate_cost(&expensive), - ); - } - - #[test] - fn limit_mask_consumes_full_mask_when_limit_allows() -> VortexResult<()> { - let remaining = AtomicU64::new(4); - - let selected = limit_mask(Mask::from_indices(6, [1, 2, 4]), &remaining)?; - - assert_eq!(selected.true_count(), 3); - assert!(selected.value(1)); - assert!(selected.value(2)); - assert!(selected.value(4)); - assert_eq!(remaining.load(Ordering::Acquire), 1); - Ok(()) - } - - #[test] - fn limit_mask_trims_mask_to_remaining_rows() -> VortexResult<()> { - let remaining = AtomicU64::new(2); - - let selected = limit_mask(Mask::from_indices(6, [1, 2, 4]), &remaining)?; - - assert_eq!(selected.true_count(), 2); 
- assert!(selected.value(1)); - assert!(selected.value(2)); - assert!(!selected.value(4)); - assert_eq!(remaining.load(Ordering::Acquire), 0); - Ok(()) - } - - #[test] - fn limit_mask_shared_counter_never_overselects() -> VortexResult<()> { - let remaining = Arc::new(AtomicU64::new(10)); - - let handles = (0..16) - .map(|_| { - let remaining = Arc::clone(&remaining); - std::thread::spawn(move || limit_mask(Mask::new_true(8), &remaining)) - }) - .collect::>(); - - let mut selected_rows = 0; - for handle in handles { - let selected = handle - .join() - .map_err(|_| vortex_err!("limit mask worker thread panicked"))??; - selected_rows += selected.true_count(); - } - - assert_eq!(selected_rows, 10); - assert_eq!(remaining.load(Ordering::Acquire), 0); - Ok(()) - } -} diff --git a/vortex-layout/src/scan/v2/layouts/chunked.rs b/vortex-layout/src/scan/v2/layouts/chunked.rs index 4092ba19ad3..125947de9ef 100644 --- a/vortex-layout/src/scan/v2/layouts/chunked.rs +++ b/vortex-layout/src/scan/v2/layouts/chunked.rs @@ -552,6 +552,14 @@ impl PreparedAggregate for ChunkedPreparedAggregate { } impl ScanPlan for ChunkedScanPlan { + fn dtype(&self) -> &DType { + self.layout.dtype() + } + + fn row_count(&self) -> u64 { + self.layout.row_count() + } + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(ChunkedScanState::default())) } @@ -750,6 +758,14 @@ impl PreparedRead for ChunkedPreparedRead { } impl ScanPlan for ChunkedExprScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.chunked.layout.row_count() + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { let _ = cx; Ok(Arc::new(ChunkedExprScanState { diff --git a/vortex-layout/src/scan/v2/layouts/dict.rs b/vortex-layout/src/scan/v2/layouts/dict.rs index a84c5bfcb09..e8273e1b226 100644 --- a/vortex-layout/src/scan/v2/layouts/dict.rs +++ b/vortex-layout/src/scan/v2/layouts/dict.rs @@ -75,6 +75,8 @@ pub(crate) fn new_scan_plan( let codes = 
layout.child(1)?; Ok(Arc::new(DictScanPlan { values_len: values.row_count(), + dtype: layout.dtype().clone(), + row_count: layout.row_count(), // Values and codes live in other row domains. values: values.new_scan_plan(&mut ScanRequest::empty(), ctx)?, codes: codes.new_scan_plan(&mut ScanRequest::empty(), ctx)?, @@ -87,6 +89,8 @@ pub struct DictScanPlan { values: ScanPlanRef, values_len: u64, codes: ScanPlanRef, + dtype: DType, + row_count: u64, } /// Per-query dictionary caches for value-domain expression results. @@ -120,6 +124,7 @@ impl Default for DictSharedState { struct DictExprScanPlan { dict: Arc, expr: Expression, + dtype: DType, } struct DictPreparedRead { @@ -191,6 +196,14 @@ impl DictScanPlan { } impl ScanPlan for DictScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(DictScanState::new())) } @@ -203,9 +216,11 @@ impl ScanPlan for DictScanPlan { if is_root(expr) { Ok(Some(self)) } else { + let dtype = expr.return_dtype(&self.dtype)?; Ok(Some(Arc::new(DictExprScanPlan { dict: self, expr: expr.clone(), + dtype, }))) } } @@ -242,6 +257,14 @@ impl ScanPlan for DictScanPlan { } impl ScanPlan for DictExprScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.dict.row_count + } + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(DictScanState::new())) } diff --git a/vortex-layout/src/scan/v2/layouts/flat.rs b/vortex-layout/src/scan/v2/layouts/flat.rs index 1974581b00f..c888f7f02dc 100644 --- a/vortex-layout/src/scan/v2/layouts/flat.rs +++ b/vortex-layout/src/scan/v2/layouts/flat.rs @@ -17,6 +17,7 @@ use parking_lot::Mutex; use vortex_array::ArrayRef; use vortex_array::IntoArray; use vortex_array::arrays::SliceArray; +use vortex_array::dtype::DType; use vortex_array::expr::Expression; use vortex_array::optimizer::ArrayOptimizer; use 
vortex_array::serde::SerializedArray; @@ -113,6 +114,14 @@ impl FlatScanPlan { } impl ScanPlan for FlatScanPlan { + fn dtype(&self) -> &DType { + self.layout.dtype() + } + + fn row_count(&self) -> u64 { + self.layout.row_count() + } + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(FlatScanState::default())) } diff --git a/vortex-layout/src/scan/v2/layouts/struct_.rs b/vortex-layout/src/scan/v2/layouts/struct_.rs index 108722bcbc1..01516ecfa23 100644 --- a/vortex-layout/src/scan/v2/layouts/struct_.rs +++ b/vortex-layout/src/scan/v2/layouts/struct_.rs @@ -11,6 +11,7 @@ use std::fmt; use std::sync::Arc; use parking_lot::Mutex; +use vortex_array::dtype::DType; use vortex_array::dtype::FieldName; use vortex_array::dtype::FieldNames; use vortex_array::dtype::StructFields; @@ -83,6 +84,14 @@ pub struct StructScanPlan { } impl ScanPlan for StructScanPlan { + fn dtype(&self) -> &DType { + self.layout.dtype() + } + + fn row_count(&self) -> u64 { + self.layout.row_count() + } + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(())) } @@ -123,7 +132,7 @@ impl ScanPlan for StructScanPlan { return Ok(self.apply_validity(child.try_push_expr(&scoped, cx)?)); } let input = self.push_struct(fields.clone().into(), cx)?; - Ok(Some(Arc::new(ApplyScanPlan::new(input, expr.clone())))) + Ok(Some(Arc::new(ApplyScanPlan::try_new(input, expr.clone())?))) } fn prepare_read( @@ -206,11 +215,12 @@ impl StructScanPlan { .ok_or_else(|| vortex_error::vortex_err!("field {name} did not push root")) }) .collect::>>()?; - Ok(Arc::new(StructValueScanPlan::new( + Ok(Arc::new(StructValueScanPlan::try_new( names, fields, self.validity.clone(), - ))) + self.layout.row_count(), + )?)) } } diff --git a/vortex-layout/src/scan/v2/layouts/zoned.rs b/vortex-layout/src/scan/v2/layouts/zoned.rs index 9d0ba8b13c5..5c46d49a6f6 100644 --- a/vortex-layout/src/scan/v2/layouts/zoned.rs +++ b/vortex-layout/src/scan/v2/layouts/zoned.rs @@ -184,6 +184,7 @@ struct 
ZonedExprScanPlan { zones: ScanPlanRef, nzones: u64, column_dtype: DType, + dtype: DType, zone_len: u64, row_count: u64, zone_map_schema: ZoneMapSchema, @@ -791,6 +792,14 @@ impl EvidenceTask for ZonedEvidenceTask { } impl ScanPlan for ZonedScanPlan { + fn dtype(&self) -> &DType { + &self.column_dtype + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(Self::empty_state_with_data( cx.init_plan(&self.data)?, @@ -815,7 +824,8 @@ impl ScanPlan for ZonedScanPlan { let Some(data) = Arc::clone(&self.data).try_push_expr(expr, cx)? else { return Ok(None); }; - let is_predicate = matches!(expr.return_dtype(&self.column_dtype)?, DType::Bool(_)); + let dtype = expr.return_dtype(&self.column_dtype)?; + let is_predicate = matches!(&dtype, DType::Bool(_)); let (falsifier, satisfier) = if self.zone_len > 0 && is_predicate { ( expr.falsify(&self.column_dtype, cx.session())?, @@ -829,6 +839,7 @@ impl ScanPlan for ZonedScanPlan { zones: Arc::clone(&self.zones), nzones: self.nzones, column_dtype: self.column_dtype.clone(), + dtype, zone_len: self.zone_len, row_count: self.row_count, zone_map_schema: self.zone_map_schema.clone(), @@ -958,6 +969,14 @@ impl PreparedAggregate for ZonedPreparedAggregate { } impl ScanPlan for ZonedExprScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(ZonedScanPlan::empty_state_with_data( cx.init_plan(&self.data)?, diff --git a/vortex-layout/src/scan/v2/row_idx.rs b/vortex-layout/src/scan/v2/row_idx.rs index e3213a6f26e..9a3ee09c297 100644 --- a/vortex-layout/src/scan/v2/row_idx.rs +++ b/vortex-layout/src/scan/v2/row_idx.rs @@ -43,7 +43,8 @@ use vortex_sequence::SequenceArray; use crate::layouts::row_idx::RowIdx; use crate::layouts::row_idx::row_idx; -pub fn with_row_idx(root: ScanPlanRef, dtype: DType, row_offset: u64) -> ScanPlanRef { 
+pub fn with_row_idx(root: ScanPlanRef, row_offset: u64) -> ScanPlanRef { + let dtype = root.dtype().clone(); Arc::new(RowIdxScanPlan { child: root, dtype, @@ -126,6 +127,14 @@ impl RowIdxScanPlan { } impl ScanPlan for RowIdxScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.child.row_count() + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { cx.init_plan(&self.child) } @@ -138,6 +147,7 @@ impl ScanPlan for RowIdxScanPlan { match self.partition_expr(expr)? { Partitioning::RowIdx(expr) => Ok(Some(Arc::new(RowIdxExprScanPlan::try_new( self.row_offset, + self.child.row_count(), expr, )?))), Partitioning::Child(expr) => Arc::clone(&self.child).try_push_expr(&expr, cx), @@ -149,10 +159,11 @@ impl ScanPlan for RowIdxScanPlan { .zip(partitioned.partition_annotations.iter()) { let field = match annotation { - Partition::RowIdx => { - Arc::new(RowIdxExprScanPlan::try_new(self.row_offset, expr.clone())?) - as ScanPlanRef - } + Partition::RowIdx => Arc::new(RowIdxExprScanPlan::try_new( + self.row_offset, + self.child.row_count(), + expr.clone(), + )?) as ScanPlanRef, Partition::Child => Arc::clone(&self.child) .try_push_expr(expr, cx)? 
.ok_or_else(|| { @@ -163,15 +174,16 @@ impl ScanPlan for RowIdxScanPlan { }; fields.push(field); } - let input = Arc::new(StructValueScanPlan::new( + let input: ScanPlanRef = Arc::new(StructValueScanPlan::try_new( partitioned.partition_names.clone(), fields, None, - )); - Ok(Some(Arc::new(ApplyScanPlan::new( + self.child.row_count(), + )?); + Ok(Some(Arc::new(ApplyScanPlan::try_new( input, partitioned.root.clone(), - )))) + )?))) } } } @@ -196,15 +208,17 @@ impl ScanPlan for RowIdxScanPlan { struct RowIdxExprScanPlan { row_offset: u64, + row_count: u64, expr: Expression, dtype: DType, } impl RowIdxExprScanPlan { - fn try_new(row_offset: u64, expr: Expression) -> VortexResult { + fn try_new(row_offset: u64, row_count: u64, expr: Expression) -> VortexResult { let dtype = expr.return_dtype(&row_idx_dtype())?; Ok(Self { row_offset, + row_count, expr, dtype, }) @@ -222,6 +236,14 @@ struct RowIdxReadTask { } impl ScanPlan for RowIdxExprScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(())) } diff --git a/vortex-scan/Cargo.toml b/vortex-scan/Cargo.toml index 3f587020606..5d05c873d0b 100644 --- a/vortex-scan/Cargo.toml +++ b/vortex-scan/Cargo.toml @@ -20,6 +20,7 @@ all-features = true vortex-array = { workspace = true } vortex-buffer = { workspace = true } vortex-error = { workspace = true } +vortex-io = { workspace = true } vortex-mask = { workspace = true } vortex-session = { workspace = true } vortex-utils = { workspace = true, features = ["dashmap"] } diff --git a/vortex-scan/src/plan/data_source.rs b/vortex-scan/src/plan/data_source.rs new file mode 100644 index 00000000000..0fa5437337c --- /dev/null +++ b/vortex-scan/src/plan/data_source.rs @@ -0,0 +1,3187 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! ScanPlan-backed multi-file data source. 
+ +use std::any::Any; +use std::collections::BTreeMap; +use std::collections::VecDeque; +use std::ops::Range; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering; + +use async_trait::async_trait; +use futures::FutureExt; +use futures::StreamExt; +use futures::TryStreamExt; +use futures::future::BoxFuture; +use futures::stream; +use futures::stream::FuturesUnordered; +use parking_lot::Mutex; +use tracing::Instrument; +use vortex_array::ArrayRef; +use vortex_array::VortexSessionExecute; +use vortex_array::aggregate_fn::AggregateFnRef; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldPath; +use vortex_array::expr::Expression; +use vortex_array::expr::forms::conjuncts; +use vortex_array::expr::stats::Precision; +use vortex_array::expr::stats::Stat; +use vortex_array::extension::datetime::AnyTemporal; +use vortex_array::scalar::Scalar; +use vortex_array::scalar::ScalarValue; +use vortex_array::scalar_fn::fns::binary::Binary; +use vortex_array::scalar_fn::fns::dynamic::DynamicExprUpdates; +use vortex_array::scalar_fn::fns::get_item::GetItem; +use vortex_array::scalar_fn::fns::root::Root; +use vortex_array::stats::StatsSet; +use vortex_array::stream::ArrayStreamAdapter; +use vortex_array::stream::ArrayStreamExt; +use vortex_array::stream::SendableArrayStream; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_io::runtime::Handle; +use vortex_io::session::RuntimeSessionExt; +use vortex_mask::Mask; +use vortex_session::VortexSession; +use vortex_utils::parallelism::get_available_parallelism; + +use super::EvidenceScope; +use super::OwnedRowScope; +use super::PrepareCtx; +use super::PreparedEvidenceRef; +use super::PreparedReadRef; +use super::PreparedStateCache; +use super::PreparedStateCacheRef; +use super::PushCtx; +use super::ReadContext; +use super::ReadStep; +use super::ReadTask; +use super::ReadTaskOutput; +use super::ScanPlanRef; +use 
super::evidence::EvidenceFragment; +use super::evidence::PredicateEvidence; +use super::evidence::PredicateEvidenceKind; +use super::evidence::PredicateId; +use super::evidence::PredicateVersion; +use super::request::EvidenceMode; +use super::request::OwnedEvidenceRequest; +use crate::DataSource; +use crate::DataSourceScan; +use crate::DataSourceScanRef; +use crate::Partition; +use crate::PartitionRef; +use crate::PartitionStream; +use crate::PlannedMorselScan; +use crate::PlannedMorselScanRef; +use crate::ScanMeta; +use crate::ScanRequest as DataSourceScanRequest; +use crate::ScanScheduler; +use crate::ScanSchedulerSessionExt; +use crate::read::ReadResults; +use crate::read::ReadStore; +use crate::read::ReadStoreRef; +use crate::read::ScanIoPhase; +use crate::read::ScanRead; +use crate::selection::Selection; +use crate::task::ScanStep; +use crate::task::ScanStepResult; +use crate::task::ScanTask; +use crate::task::ScanTaskBox; +use crate::task::ScanTaskLane; +use crate::task::ScanTaskQueue; +use crate::task::ScanTaskRead; +use crate::task::scan_task_read_bytes; + +const DEFAULT_CONCURRENCY: usize = 8; +const IDEAL_SPLIT_SIZE: u64 = 100_000; +const MAX_SELECTION_RANGE_SIZE: u64 = IDEAL_SPLIT_SIZE / 25; +const MIN_SELECTION_GAP_BETWEEN_RANGES: u64 = IDEAL_SPLIT_SIZE / 2; +/// Below this demanded-row density, evaluate a residual predicate over only the demanded rows +/// (filter-first) rather than the whole morsel. +const EXPR_EVAL_THRESHOLD: f64 = 0.2; +const INLINE_ZERO_READ_EVIDENCE_MAX_PRIORITY: u64 = 100_150; +const SCAN_SCOPE_MIN_PREDICATE_COST: u64 = 100; + +fn root_field_path(expr: &Expression) -> Option { + if expr.is::() { + return Some(FieldPath::root()); + } + let field = expr.as_opt::()?; + expr.child(0) + .is::() + .then(|| FieldPath::from_name(field.clone())) +} + +/// Static cost estimate for a filter conjunct, used to order predicate evaluation cheapest-first. +/// +/// We sum a per-node cost over the whole expression tree. 
Primitive comparisons, null checks and +/// data access (`vortex.binary`, `vortex.between`, `vortex.is_null`, `vortex.get_item`, ...) are +/// cheap; per-row string/byte work (`vortex.like`, `vortex.byte_length`, `vortex.list.contains`) +/// and opaque/dynamic functions are expensive. Unrecognized functions get a moderate cost so they +/// sort after primitives but ahead of known-expensive matchers. +fn predicate_cost(expr: &Expression) -> u64 { + fn node_cost(expr: &Expression) -> u64 { + match expr.id().as_str() { + // Free or near-free structural / access nodes. + "vortex.root" | "vortex.literal" | "vortex.get_item" => 0, + // Cheap primitive predicates. + "vortex.binary" | "vortex.between" | "vortex.is_null" | "vortex.is_not_null" + | "vortex.not" | "vortex.fill_null" | "vortex.cast" => 1, + // Expensive per-row string / byte / matching work, and fallible UDFs. + "vortex.like" | "vortex.byte_length" | "vortex.list.contains" => 100, + "vortex.dynamic" | "vortex.variant_get" | "vortex.parquet.variant" => 100, + // Unknown functions: more expensive than primitives, cheaper than known matchers. 
+ _ => 10, + } + } + + let mut cost = node_cost(expr); + for child in expr.children().iter() { + cost = cost.saturating_add(predicate_cost(child)); + } + cost +} + +fn absent_statistics(funcs: &[AggregateFnRef]) -> Vec> { + funcs.iter().map(|_| Precision::Absent).collect() +} + +fn scalar_precision_to_value(precision: Precision) -> Precision { + match precision { + Precision::Exact(scalar) => scalar + .into_value() + .map(Precision::Exact) + .unwrap_or(Precision::Absent), + Precision::Inexact(scalar) => scalar + .into_value() + .map(Precision::Inexact) + .unwrap_or(Precision::Absent), + Precision::Absent => Precision::Absent, + } +} + +#[derive(Clone)] +struct ScanPlanBinding { + root: ScanPlanRef, + state_cache: PreparedStateCacheRef, +} + +impl ScanPlanBinding { + fn new(root: ScanPlanRef) -> Self { + Self { + root, + state_cache: Arc::new(PreparedStateCache::default()), + } + } + + fn root(&self) -> &ScanPlanRef { + &self.root + } + + fn row_count(&self) -> u64 { + self.root.row_count() + } + + fn state_cache(&self) -> PreparedStateCacheRef { + Arc::clone(&self.state_cache) + } +} + +/// Execute a scan over one root scan plan. +pub fn scan_plan_stream( + root: ScanPlanRef, + session: VortexSession, + request: DataSourceScanRequest, +) -> VortexResult { + let binding = ScanPlanBinding::new(root); + scan_plan_binding_stream(binding, session, request) +} + +/// Return aggregate-function statistics for one expression over one root scan plan. +pub async fn scan_plan_statistics( + root: ScanPlanRef, + session: VortexSession, + expr: &Expression, + funcs: &[AggregateFnRef], +) -> VortexResult>> { + let mut stats = + scan_plan_statistics_many(root, session, std::slice::from_ref(expr), funcs).await?; + Ok(stats.pop().unwrap_or_else(|| absent_statistics(funcs))) +} + +/// Return aggregate-function statistics for several expressions over one root scan plan. 
+pub async fn scan_plan_statistics_many( + root: ScanPlanRef, + session: VortexSession, + exprs: &[Expression], + funcs: &[AggregateFnRef], +) -> VortexResult>>> { + let reader = ReadContext::new(session); + let dtype = root.dtype(); + let row_count = root.row_count(); + let mut result = Vec::with_capacity(exprs.len()); + for expr in exprs { + let plan = if let Some(field_path) = root_field_path(expr) { + Arc::clone(&root).prepare_field_stats( + &field_path, + funcs, + &mut PrepareCtx::new(reader.session().clone()), + )? + } else { + let pushed = push_expr(&root, expr, dtype, reader.session())?; + pushed.prepare_field_stats( + &FieldPath::root(), + funcs, + &mut PrepareCtx::new(reader.session().clone()), + )? + }; + let Some(plan) = plan else { + result.push(absent_statistics(funcs)); + continue; + }; + let state = plan.init_state(reader.session())?; + result.push(plan.stats(0..row_count, &reader, state.as_ref()).await?); + } + Ok(result) +} + +/// Return natural row split ranges from one root scan plan. +pub fn scan_plan_split_ranges(root: &ScanPlanRef) -> Vec> { + split_ranges_from_node(root) +} + +/// Plan projected row split ranges for one root scan plan. +pub async fn scan_plan_projected_splits( + root: ScanPlanRef, + session: VortexSession, + projection: &Expression, +) -> VortexResult>> { + let dtype = root.dtype(); + let row_count = root.row_count(); + let pushed = push_expr(&root, projection, dtype, &session)?; + let Some(plan) = pushed.prepare_splits(&mut PrepareCtx::new(session.clone()))? else { + return Ok(std::iter::once(0..row_count).collect()); + }; + let reader = ReadContext::new(session.clone()); + let state = plan.init_state(&session)?; + plan.splits(0..row_count, &reader, state.as_ref()).await +} + +/// Opens a root scan plan lazily for a multi-source scan. +#[async_trait] +pub trait ScanPlanFactory: 'static + Send + Sync { + /// Open the root scan plan, returning `None` when it should be skipped. 
+ async fn open(&self) -> VortexResult>; +} + +enum ScanPlanChild { + Opened(ScanPlanBinding), + Deferred(Arc), +} + +/// Multi-file data source backed by scan2 ScanPlan plans. +pub struct ScanPlanDataSource { + dtype: DType, + session: VortexSession, + children: Vec, + concurrency: usize, +} + +impl ScanPlanDataSource { + /// Create a scan-plan data source with one already opened source. + pub fn new_with_first( + first: ScanPlanRef, + remaining: Vec>, + session: &VortexSession, + ) -> Self { + let concurrency = get_available_parallelism().unwrap_or(DEFAULT_CONCURRENCY); + + let mut children = Vec::with_capacity(1 + remaining.len()); + let dtype = first.dtype().clone(); + children.push(ScanPlanChild::Opened(ScanPlanBinding::new(first))); + children.extend(remaining.into_iter().map(ScanPlanChild::Deferred)); + + Self { + dtype, + session: session.clone(), + children, + concurrency, + } + } + + async fn open_sources(&self, ordered: bool) -> VortexResult> { + let jobs = self + .children + .iter() + .enumerate() + .map(|(idx, child)| match child { + ScanPlanChild::Opened(binding) => { + let binding = binding.clone(); + async move { Ok(Some((idx, binding))) }.boxed() + } + ScanPlanChild::Deferred(factory) => { + let factory = Arc::clone(factory); + async move { + factory + .open() + .instrument(tracing::info_span!("ScanPlanFactory::open")) + .await + .map(|opened| opened.map(|root| (idx, ScanPlanBinding::new(root)))) + } + .boxed() + } + }) + .collect::>>>>(); + + let bindings = if ordered { + stream::iter(jobs) + .buffered(self.concurrency) + .try_filter_map(|binding| async move { Ok(binding) }) + .try_collect::>() + .await? + } else { + stream::iter(jobs) + .buffer_unordered(self.concurrency) + .try_filter_map(|binding| async move { Ok(binding) }) + .try_collect::>() + .await? 
+ }; + + let mut bindings = bindings; + bindings.sort_unstable_by_key(|(idx, _)| *idx); + Ok(bindings) + } +} + +#[async_trait] +impl DataSource for ScanPlanDataSource { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> Precision { + let mut sum: u64 = 0; + let mut opened_count: u64 = 0; + let mut deferred_count: u64 = 0; + + for child in &self.children { + match child { + ScanPlanChild::Opened(binding) => { + opened_count += 1; + sum = sum.saturating_add(binding.row_count()); + } + ScanPlanChild::Deferred(_) => { + deferred_count += 1; + } + } + } + + let total_count = opened_count + deferred_count; + if total_count == 0 { + return Precision::exact(0u64); + } + + if deferred_count == 0 { + Precision::exact(sum) + } else if opened_count > 0 { + let avg = sum / opened_count; + Precision::inexact(avg.saturating_mul(total_count)) + } else { + Precision::Absent + } + } + + fn deserialize_partition( + &self, + _data: &[u8], + _session: &VortexSession, + ) -> VortexResult { + vortex_bail!("ScanPlanDataSource partitions are not yet serializable") + } + + async fn plan_morsel_partitions( + &self, + scan_request: DataSourceScanRequest, + target_partitions: usize, + ) -> VortexResult> { + if scan_request.ordered || scan_request.limit.is_some() { + return Ok(None); + } + + let target_partitions = target_partitions.max(1); + let dtype = scan_request.projection.return_dtype(&self.dtype)?; + + let meta = ScanMeta { + label: Some("scan2".to_string()), + }; + let provider = self.session.scan_scheduler_provider(); + let scheduler = provider.scheduler_for_scan(&meta); + + let mut planned_bindings = Vec::new(); + let mut total_morsels = 0usize; + for (partition_idx, binding) in self.open_sources(false).await? { + let Some(request) = + binding_scan_request(partition_idx, &binding, scan_request.clone())? 
+ else { + continue; + }; + let row_range = request + .row_range + .clone() + .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; + let prepared = Arc::new(PreparedScanPlan::try_new( + &binding, + &self.session, + &request, + )?); + let execution = Arc::new(ScanExecution::try_new( + binding, + self.session.clone(), + prepared, + None, + )?); + let ranges = execution.splits(&row_range)?; + if ranges.is_empty() { + continue; + } + total_morsels = total_morsels.saturating_add(ranges.len()); + planned_bindings.push((execution, ranges)); + } + + // The physical plan may expose more engine partitions than we can fill with morsels. + // Keep only non-empty planned partitions; engine adapters can return empty streams for + // any surplus advertised partitions. + let partition_count = total_morsels.min(target_partitions); + let mut partitions = vec![Vec::new(); partition_count]; + let mut morsel_idx = 0usize; + for (execution, ranges) in planned_bindings { + for range in ranges { + let partition = morsel_idx % partition_count; + partitions[partition].push(PlannedScanPlanMorsel { + execution: Arc::clone(&execution), + range, + }); + morsel_idx = morsel_idx.saturating_add(1); + } + } + + let read_byte_budget = read_byte_budget(&scheduler); + + Ok(Some(Arc::new(PlannedScanPlanScan { + dtype, + partitions, + handle: self.session.handle(), + read_byte_budget, + }))) + } + + async fn scan(&self, scan_request: DataSourceScanRequest) -> VortexResult { + let meta = ScanMeta { + label: Some("scan2".to_string()), + }; + let provider = self.session.scan_scheduler_provider(); + let scheduler = provider.scheduler_for_scan(&meta); + + let mut ready = VecDeque::new(); + let mut deferred = VecDeque::new(); + + for (index, child) in self.children.iter().enumerate() { + match child { + ScanPlanChild::Opened(binding) => ready.push_back((index, binding.clone())), + ScanPlanChild::Deferred(factory) => { + deferred.push_back((index, Arc::clone(factory))); + } + } + } + + let dtype 
= scan_request.projection.return_dtype(&self.dtype)?; + let limit_remaining = scan_request.limit.map(AtomicU64::new).map(Arc::new); + + Ok(Box::new(ScanPlanDataSourceScan { + dtype, + request: scan_request, + ready, + deferred, + handle: self.session.handle(), + session: self.session.clone(), + concurrency: self.concurrency, + scheduler, + limit_remaining, + })) + } + + async fn statistics( + &self, + expr: &Expression, + funcs: &[AggregateFnRef], + ) -> VortexResult>> { + if self.children.len() != 1 { + return Ok(absent_statistics(funcs)); + } + let ScanPlanChild::Opened(binding) = &self.children[0] else { + return Ok(absent_statistics(funcs)); + }; + scan_plan_statistics( + Arc::clone(binding.root()), + self.session.clone(), + expr, + funcs, + ) + .await + } + + async fn field_statistics(&self, field_path: &FieldPath) -> VortexResult { + if field_path.parts().len() != 1 { + return Ok(StatsSet::default()); + } + let Some(field_name) = field_path.parts()[0].as_name() else { + return Ok(StatsSet::default()); + }; + let funcs = Stat::all() + .filter_map(|stat| stat.aggregate_fn().map(|func| (stat, func))) + .collect::>(); + let aggregate_funcs = funcs + .iter() + .map(|(_, func)| func.clone()) + .collect::>(); + let stats = self + .statistics( + &vortex_array::expr::get_item(field_name, vortex_array::expr::root()), + &aggregate_funcs, + ) + .await?; + let mut stats_set = StatsSet::default(); + for ((stat, _), value) in funcs.into_iter().zip(stats) { + stats_set.set(stat, scalar_precision_to_value(value)); + } + Ok(stats_set) + } + + fn supports_morsel_partitioning(&self) -> bool { + true + } +} + +struct ScanPlanDataSourceScan { + dtype: DType, + request: DataSourceScanRequest, + ready: VecDeque<(usize, ScanPlanBinding)>, + deferred: VecDeque<(usize, Arc)>, + handle: Handle, + session: VortexSession, + concurrency: usize, + scheduler: Arc, + limit_remaining: Option>, +} + +impl DataSourceScan for ScanPlanDataSourceScan { + fn dtype(&self) -> &DType { + &self.dtype + 
} + + fn partition_count(&self) -> Precision { + let count = self.ready.len() + self.deferred.len(); + if self.deferred.is_empty() { + Precision::exact(count) + } else { + Precision::inexact(count) + } + } + + fn partitions(self: Box) -> PartitionStream { + let Self { + dtype: _, + request, + ready, + deferred, + handle, + session, + concurrency, + scheduler, + limit_remaining, + } = *self; + + let ordered = request.ordered; + let ready_stream = stream::iter(ready).map(Ok); + let spawned = stream::iter(deferred).map(move |(index, factory)| { + handle.spawn(async move { + factory + .open() + .instrument(tracing::info_span!("ScanPlanFactory::open")) + .await + .map(|opened| opened.map(|root| (index, ScanPlanBinding::new(root)))) + }) + }); + + let deferred_stream = if ordered { + spawned + .buffered(concurrency) + .filter_map(|result| async move { + match result { + Ok(Some(binding)) => Some(Ok(binding)), + Ok(None) => None, + Err(error) => Some(Err(error)), + } + }) + .boxed() + } else { + spawned + .buffer_unordered(concurrency) + .filter_map(|result| async move { + match result { + Ok(Some(binding)) => Some(Ok(binding)), + Ok(None) => None, + Err(error) => Some(Err(error)), + } + }) + .boxed() + }; + + ready_stream + .chain(deferred_stream) + .filter_map(move |binding_result| { + let request = request.clone(); + let scheduler = Arc::clone(&scheduler); + let limit_remaining = limit_remaining.clone(); + let session = session.clone(); + async move { + match binding_result { + Ok((index, binding)) => binding_partition( + index, + binding, + session, + request, + scheduler, + limit_remaining, + ) + .transpose(), + Err(error) => Some(Err(error)), + } + } + }) + .boxed() + } +} + +fn binding_partition( + partition_idx: usize, + binding: ScanPlanBinding, + session: VortexSession, + request: DataSourceScanRequest, + scheduler: Arc, + limit_remaining: Option>, +) -> VortexResult> { + let Some(request) = binding_scan_request(partition_idx, &binding, request)? 
else { + return Ok(None); + }; + let row_range = request + .row_range + .clone() + .ok_or_else(|| vortex_err!("scan2 partition row range missing"))?; + let prepared = Arc::new(PreparedScanPlan::try_new(&binding, &session, &request)?); + + Ok(Some(Box::new(ScanPlanPartition { + binding, + session, + prepared, + row_range, + index: partition_idx, + scheduler, + limit_remaining, + }))) +} + +fn scan_plan_binding_stream( + binding: ScanPlanBinding, + session: VortexSession, + request: DataSourceScanRequest, +) -> VortexResult { + let output_dtype = request.projection.return_dtype(binding.root().dtype())?; + let meta = ScanMeta { + label: Some("scan2".to_string()), + }; + let provider = session.scan_scheduler_provider(); + let scheduler = provider.scheduler_for_scan(&meta); + + let limit_remaining = request.limit.map(AtomicU64::new).map(Arc::new); + let Some(partition) = + binding_partition(0, binding, session, request, scheduler, limit_remaining)? + else { + return Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( + output_dtype, + stream::empty(), + ))); + }; + partition.execute() +} + +fn split_ranges_from_node(node: &ScanPlanRef) -> Vec> { + let mut points = Vec::new(); + if let Some(hints) = node.split_hints() { + points.extend_from_slice(hints); + } + let points = normalize_split_points(node.row_count(), points); + natural_split_ranges(&points, None) +} + +fn binding_scan_request( + partition_idx: usize, + binding: &ScanPlanBinding, + request: DataSourceScanRequest, +) -> VortexResult> { + let partition_idx_u64 = partition_idx as u64; + if let Some(range) = &request.partition_range + && !range.contains(&partition_idx_u64) + { + return Ok(None); + } + match &request.partition_selection { + Selection::IncludeByIndex(buffer) => { + if buffer.as_slice().binary_search(&partition_idx_u64).is_err() { + return Ok(None); + } + } + Selection::ExcludeByIndex(buffer) => { + if buffer.as_slice().binary_search(&partition_idx_u64).is_ok() { + return Ok(None); + } + } + _ => {} + 
}; + + let row_count = binding.row_count(); + let row_range = request.row_range.clone().unwrap_or(0..row_count); + check_range(&row_range, row_count)?; + + Ok(Some(DataSourceScanRequest { + row_range: Some(row_range), + ..request + })) +} + +type QueuedWork = ScanTaskBox; + +struct LaunchedWorkOutput { + lane: ScanTaskLane, + reads: Vec, + output: VortexResult, +} + +struct EvidenceWorkOutput { + morsel_id: usize, + predicate_idx: usize, + version: PredicateVersion, + source: EvidenceWorkSource, + fragments: Vec, +} + +struct ScanEvidenceWorkOutput { + execution: Arc, + morsel_id: usize, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + fragments: Option>, +} + +enum EvidenceWorkSource { + Provider, + Predicate { input_rows: usize, pass_rows: usize }, +} + +struct ProjectionWorkOutput { + morsel_id: usize, + array: ArrayRef, +} + +enum WorkOutput { + Evidence(EvidenceWorkOutput), + ScanEvidence(ScanEvidenceWorkOutput), + Projection(ProjectionWorkOutput), +} + +enum WorkPoll { + Ready(WorkOutput), + Pending(QueuedWork), +} + +struct ScanEvidenceWaitTask { + execution: Arc, + morsel_id: usize, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + lane: ScanTaskLane, + priority: u64, +} + +impl ScanTask for ScanEvidenceWaitTask { + fn morsel_id(&self) -> usize { + self.morsel_id + } + + fn phase(&self) -> ScanIoPhase { + ScanIoPhase::EvidenceProbe + } + + fn lane(&self) -> ScanTaskLane { + self.lane + } + + fn reads(&self) -> &[ScanTaskRead] { + &[] + } + + fn priority(&self) -> u64 { + self.priority + } + + fn into_step(self: Box) -> VortexResult> { + let task = *self; + let morsel_id = task.morsel_id; + let lane = task.lane; + let priority = task.priority; + Ok(ScanStep::new( + morsel_id, + ScanIoPhase::EvidenceProbe, + lane, + Vec::new(), + Vec::new(), + Vec::new(), + move |_| { + if !task.execution.scan_evidence_provider_ready( + task.predicate_idx, + task.evidence_idx, + task.version, + ) && 
task.execution.predicates[task.predicate_idx].version() == task.version + { + return Ok(ScanStepResult::Continue(Box::new(task))); + } + + Ok(ScanStepResult::Ready(WorkOutput::ScanEvidence( + ScanEvidenceWorkOutput { + execution: Arc::clone(&task.execution), + morsel_id: task.morsel_id, + predicate_idx: task.predicate_idx, + evidence_idx: task.evidence_idx, + version: task.version, + fragments: None, + }, + ))) + }, + ) + .with_priority(priority)) + } +} + +struct PredicateReadWorkState { + execution: Arc, + morsel_id: usize, + predicate_idx: usize, + version: PredicateVersion, + range: Range, + need: Mask, + compact: bool, + len: usize, + priority: u64, + lane: ScanTaskLane, +} + +struct PredicateReadWorkTask { + state: PredicateReadWorkState, + step: ReadStep, + reads: Vec, +} + +impl PredicateReadWorkTask { + fn try_new(state: PredicateReadWorkState, task: Box) -> VortexResult { + let step = task.into_step()?; + let reads = ScanTaskRead::from_scan_reads(&step.required_reads); + Ok(Self { state, step, reads }) + } +} + +impl ScanTask for PredicateReadWorkTask { + fn morsel_id(&self) -> usize { + self.state.morsel_id + } + + fn phase(&self) -> ScanIoPhase { + ScanIoPhase::PredicateRead + } + + fn lane(&self) -> ScanTaskLane { + self.state.lane + } + + fn reads(&self) -> &[ScanTaskRead] { + &self.reads + } + + fn priority(&self) -> u64 { + self.state.priority + } + + fn into_step(self: Box) -> VortexResult> { + let task = *self; + let state = task.state; + let morsel_id = state.morsel_id; + let lane = state.lane; + let reads = task.reads.clone(); + let priority = state.priority; + let read_step = task.step; + Ok(ScanStep::new( + morsel_id, + ScanIoPhase::PredicateRead, + lane, + reads, + read_step.required_reads, + read_step.prefetch_reads, + move |results| { + let reader = state.execution.read_context(); + let mut ctx = state.execution.session.create_execution_ctx(); + let array = match read_step.continuation.run(&reader, &mut ctx, results)? 
{ + ReadTaskOutput::Ready(array) => array, + ReadTaskOutput::Continue(read_task) => { + return Ok(ScanStepResult::Continue(Box::new( + PredicateReadWorkTask::try_new(state, read_task)?, + ))); + } + }; + let result = if state.compact { + let compact = array.null_as_false().execute(&mut ctx)?; + if compact.len() != state.need.true_count() { + vortex_bail!( + "compacted residual result length {} does not match demanded row count {}", + compact.len(), + state.need.true_count() + ); + } + state.need.intersect_by_rank(&compact) + } else { + array.null_as_false().execute(&mut ctx)? + }; + if result.len() != state.len { + vortex_bail!( + "residual result length {} does not match morsel length {}", + result.len(), + state.len + ); + } + let pass = &result & &state.need; + let input_rows = state.need.true_count(); + let pass_rows = pass.true_count(); + let exact = !&state.need | &pass; + Ok(ScanStepResult::Ready(WorkOutput::Evidence( + EvidenceWorkOutput { + morsel_id: state.morsel_id, + predicate_idx: state.predicate_idx, + version: state.version, + source: EvidenceWorkSource::Predicate { + input_rows, + pass_rows, + }, + fragments: vec![EvidenceFragment::new( + state.range.clone(), + PredicateEvidenceKind::ExactMask(exact), + )], + }, + ))) + }, + ) + .with_priority(priority)) + } +} + +struct ProjectionReadWorkTask { + execution: Arc, + step: ReadStep, + reads: Vec, + morsel_id: usize, +} + +impl ProjectionReadWorkTask { + fn try_new( + execution: Arc, + task: Box, + morsel_id: usize, + ) -> VortexResult { + let step = task.into_step()?; + let reads = ScanTaskRead::from_scan_reads(&step.required_reads); + Ok(Self { + execution, + step, + reads, + morsel_id, + }) + } +} + +impl ScanTask for ProjectionReadWorkTask { + fn morsel_id(&self) -> usize { + self.morsel_id + } + + fn phase(&self) -> ScanIoPhase { + ScanIoPhase::ProjectionRead + } + + fn lane(&self) -> ScanTaskLane { + ScanTaskLane::Projection + } + + fn reads(&self) -> &[ScanTaskRead] { + &self.reads + } + + fn 
priority(&self) -> u64 { + ScanStep::::DEFAULT_PRIORITY + } + + fn into_step(self: Box) -> VortexResult> { + let task = *self; + let reads = task.reads.clone(); + let read_step = task.step; + Ok(ScanStep::new( + task.morsel_id, + ScanIoPhase::ProjectionRead, + ScanTaskLane::Projection, + reads, + read_step.required_reads, + read_step.prefetch_reads, + move |results| { + let reader = task.execution.read_context(); + let mut ctx = task.execution.session.create_execution_ctx(); + match read_step.continuation.run(&reader, &mut ctx, results)? { + ReadTaskOutput::Ready(array) => Ok(ScanStepResult::Ready( + WorkOutput::Projection(ProjectionWorkOutput { + morsel_id: task.morsel_id, + array, + }), + )), + ReadTaskOutput::Continue(read_task) => Ok(ScanStepResult::Continue(Box::new( + ProjectionReadWorkTask::try_new(task.execution, read_task, task.morsel_id)?, + ))), + } + }, + )) + } +} + +async fn resolve_step_reads(read_store: ReadStoreRef, reads: Vec) -> VortexResult<()> { + let mut pending_reads = FuturesUnordered::new(); + for read in reads { + let key = read.request.key; + if read_store.get(key).is_none() { + pending_reads.push(async move { read.future.await.map(|buffer| (key, buffer)) }); + } + } + while let Some(result) = pending_reads.next().await { + let (key, buffer) = result?; + read_store.insert(key, buffer); + } + Ok(()) +} + +fn prefetch_step_reads(handle: &Handle, read_store: ReadStoreRef, reads: Vec) { + if reads.is_empty() { + return; + } + handle + .spawn(async move { + if let Err(error) = resolve_step_reads(read_store, reads).await { + tracing::debug!( + target: "vortex_scan::plan::data_source", + ?error, + "scan2 prefetch read failed" + ); + } + }) + .detach(); +} + +async fn run_scan_task_step( + work: QueuedWork, + read_store: ReadStoreRef, + handle: Handle, +) -> VortexResult { + let mut step = work.into_step()?; + let (required_reads, prefetch_reads) = step.take_reads(); + prefetch_step_reads(&handle, Arc::clone(&read_store), prefetch_reads); + 
resolve_step_reads(Arc::clone(&read_store), required_reads).await?; + match step.continue_with(ReadResults::new(Arc::clone(&read_store)))? { + ScanStepResult::Ready(output) => Ok(WorkPoll::Ready(output)), + ScanStepResult::Continue(work) => Ok(WorkPoll::Pending(work)), + } +} + +enum CompletedMorsel { + Empty, + Output(ArrayRef), +} + +struct PlannedMorselWork { + state: MorselState, + evidence: Vec, +} + +struct MorselState { + execution: Arc, + range: Range, + selected: Mask, + evidence: Vec>, + pending_evidence: Vec, + pending_scan_evidence: Vec, + scan_evidence_generation: Vec, + predicate_queued: Vec, + predicate_done: Vec, + next_recheck_predicate: usize, + projection_queued: bool, +} + +#[derive(Default)] +struct ScanEvidenceStore { + predicates: Vec, +} + +#[derive(Default)] +struct PredicateScanEvidenceStore { + generation: u64, + providers: Vec, +} + +#[derive(Default)] +struct ScanEvidenceSlot { + version: Option, + pending: Option, + fragments: Vec, +} + +enum ScanEvidenceAction { + Ready, + Pending, + Prepare, + Wait, +} + +#[derive(Default)] +struct PredicateRuntimeStats { + input_rows: u64, + rejected_rows: u64, +} + +struct PartitionWorkSchedulerState { + pending: VecDeque, + morsels: Vec>, + active_morsels: usize, + has_dynamic_predicates: bool, + in_flight_projection_tasks: usize, + next_morsel_id: usize, + next_emit_morsel_id: usize, + task_queue: ScanTaskQueue, + in_flight: FuturesUnordered>, + read_store: ReadStoreRef, + completed_morsels: BTreeMap, + handle: Handle, + ordered: bool, + plan_window: usize, +} + +fn plan_window_for_limit(limited: bool) -> usize { + if limited { 1 } else { usize::MAX } +} + +fn read_byte_budget(scheduler: &ScanScheduler) -> u64 { + scheduler.config().read_byte_budget().unwrap_or(u64::MAX) +} + +fn partition_work_stream( + morsels: Vec, + handle: Handle, + ordered: bool, + plan_window: usize, + read_byte_budget: u64, +) -> impl futures::Stream> + Send + 'static { + let has_dynamic_predicates = morsels + .iter() + 
.any(|morsel| morsel.execution.has_dynamic_predicates()); + tracing::debug!( + target: "vortex_scan::plan::data_source", + morsel_count = morsels.len(), + ordered, + plan_window, + read_byte_budget, + has_dynamic_predicates, + "created scan2 task stream" + ); + let state = PartitionWorkSchedulerState { + pending: VecDeque::from(morsels), + morsels: Vec::new(), + active_morsels: 0, + has_dynamic_predicates, + in_flight_projection_tasks: 0, + next_morsel_id: 0, + next_emit_morsel_id: 0, + task_queue: ScanTaskQueue::new(read_byte_budget), + in_flight: FuturesUnordered::new(), + read_store: Arc::new(ReadStore::new()), + completed_morsels: BTreeMap::new(), + handle, + ordered, + plan_window, + }; + + stream::unfold(state, |mut state| async move { + loop { + if let Some(array) = state.pop_ready_output() { + return Some((Ok(array), state)); + } + + while state.active_morsels < state.plan_window && !state.pending.is_empty() { + if let Err(error) = state.plan_next_morsel() { + state.clear(); + return Some((Err(error), state)); + } + } + + while state.launch_next_admissible_work() {} + + if state.in_flight.is_empty() { + if state.is_done() { + return None; + } + let error = vortex_err!( + "scan2 work scheduler stalled: {} active morsels, {} pending morsels, {} evidence work items, {} predicate work items, {} projection work items, {} active read bytes", + state.active_morsels, + state.pending.len(), + state.task_queue.evidence_len(), + state.task_queue.predicate_len(), + state.task_queue.projection_len(), + state.task_queue.active_read_bytes() + ); + state.clear(); + return Some((Err(error), state)); + } + + match state.in_flight.next().await { + Some(output) => { + state.release_reads(output.lane, &output.reads); + match output.output { + Ok(WorkPoll::Ready(output)) => match state.complete_work(output) { + Ok(Some(array)) => return Some((Ok(array), state)), + Ok(None) => continue, + Err(error) => return Some((Err(error), state)), + }, + Ok(WorkPoll::Pending(work)) => { + 
state.task_queue.push(work); + continue; + } + Err(error) => return Some((Err(error), state)), + } + } + None if state.is_done() => return None, + None => continue, + } + } + }) +} + +impl PartitionWorkSchedulerState { + fn clear(&mut self) { + self.pending.clear(); + self.morsels.clear(); + self.active_morsels = 0; + self.in_flight_projection_tasks = 0; + self.next_emit_morsel_id = 0; + self.task_queue.clear(); + self.in_flight = FuturesUnordered::new(); + self.read_store = Arc::new(ReadStore::new()); + self.completed_morsels.clear(); + } + + fn is_done(&self) -> bool { + self.pending.is_empty() + && self.active_morsels == 0 + && self.task_queue.is_empty() + && self.in_flight.is_empty() + && self.completed_morsels.is_empty() + } + + fn plan_next_morsel(&mut self) -> VortexResult<()> { + let Some(morsel) = self.pending.pop_front() else { + return Ok(()); + }; + let morsel_id = self.next_morsel_id; + let range = morsel.range.clone(); + let Some(planned) = morsel.execution.plan_morsel(morsel_id, morsel.range)? 
else { + tracing::trace!( + target: "vortex_scan::plan::data_source", + morsel_id, + range_start = range.start, + range_end = range.end, + pending_morsels = self.pending.len(), + active_morsels = self.active_morsels, + "scan2 skipped empty morsel" + ); + return Ok(()); + }; + self.next_morsel_id = self.next_morsel_id.saturating_add(1); + self.active_morsels = self.active_morsels.saturating_add(1); + if self.morsels.len() <= morsel_id { + self.morsels.resize_with(morsel_id + 1, || None); + } + self.morsels[morsel_id] = Some(planned.state); + let evidence_len = planned.evidence.len(); + self.task_queue.extend(planned.evidence); + self.enqueue_ready_work(morsel_id)?; + tracing::trace!( + target: "vortex_scan::plan::data_source", + morsel_id, + range_start = range.start, + range_end = range.end, + pending_morsels = self.pending.len(), + active_morsels = self.active_morsels, + queued_evidence = evidence_len, + evidence_queue_len = self.task_queue.evidence_len(), + predicate_queue_len = self.task_queue.predicate_len(), + projection_queue_len = self.task_queue.projection_len(), + "scan2 planned morsel" + ); + Ok(()) + } + + fn launch_next_admissible_work(&mut self) -> bool { + let in_flight_empty = self.in_flight.is_empty(); + // Backlogged output should stop speculative projection for dynamic scans, but not the + // single projection needed to unblock an otherwise idle ordered stream. 
+ let projection_admissible = !self.has_dynamic_predicates + || (self.in_flight_projection_tasks == 0 && !self.has_completed_output_backlog()) + || in_flight_empty; + let morsels = &self.morsels; + let Some(task) = self.task_queue.pop_next_admissible_with_projection_gate( + in_flight_empty, + projection_admissible, + |morsel_id| morsels.get(morsel_id).and_then(Option::as_ref).is_some(), + ) else { + return false; + }; + let (task, lane, reads) = task.into_parts(); + self.launch_admitted(task, lane, reads); + true + } + + fn launch_admitted(&mut self, work: QueuedWork, lane: ScanTaskLane, reads: Vec) { + let morsel_id = work.morsel_id(); + let phase = work.phase(); + let priority = work.priority(); + let bytes = scan_task_read_bytes(&reads); + let read_count = reads.len(); + tracing::trace!( + target: "vortex_scan::plan::data_source", + morsel_id, + ?phase, + ?lane, + read_count, + read_bytes = bytes, + priority, + in_flight = self.in_flight.len(), + in_flight_projection_tasks = self.in_flight_projection_tasks, + active_morsels = self.active_morsels, + pending_morsels = self.pending.len(), + evidence_queue_len = self.task_queue.evidence_len(), + predicate_queue_len = self.task_queue.predicate_len(), + projection_queue_len = self.task_queue.projection_len(), + active_read_count = self.task_queue.active_read_count(), + active_read_bytes = self.task_queue.active_read_bytes(), + active_evidence_read_bytes = self.task_queue.active_evidence_read_bytes(), + active_predicate_read_bytes = self.task_queue.active_predicate_read_bytes(), + active_projection_read_bytes = self.task_queue.active_projection_read_bytes(), + "scan2 launching work" + ); + let read_store = Arc::clone(&self.read_store); + let handle = self.handle.clone(); + let future = async move { + let output = run_scan_task_step(work, read_store, handle).await; + LaunchedWorkOutput { + lane, + reads, + output, + } + } + .instrument(tracing::trace_span!( + "scan2_work", + morsel_id, + phase = ?phase, + lane = ?lane, 
+ read_count, + read_bytes = bytes, + )); + let inline_zero_read = bytes == 0 + && match phase { + ScanIoPhase::EvidenceProbe | ScanIoPhase::EvidenceSetup => { + priority <= INLINE_ZERO_READ_EVIDENCE_MAX_PRIORITY + } + ScanIoPhase::PredicateRead + | ScanIoPhase::ProjectionRead + | ScanIoPhase::AggregateRead => false, + }; + if inline_zero_read { + self.in_flight.push(future.boxed()); + } else { + self.in_flight.push(self.handle.spawn(future).boxed()); + } + if matches!(lane, ScanTaskLane::Projection) { + self.in_flight_projection_tasks = self.in_flight_projection_tasks.saturating_add(1); + } + } + + fn release_reads(&mut self, lane: ScanTaskLane, reads: &[ScanTaskRead]) { + self.task_queue.release_reads(lane, reads); + if matches!(lane, ScanTaskLane::Projection) { + self.in_flight_projection_tasks = self.in_flight_projection_tasks.saturating_sub(1); + } + } + + fn complete_work(&mut self, output: WorkOutput) -> VortexResult> { + match output { + WorkOutput::Evidence(output) => self.complete_evidence(output), + WorkOutput::ScanEvidence(output) => self.complete_scan_evidence(output), + WorkOutput::Projection(output) => { + Ok(self.finish_output_morsel(output.morsel_id, output.array)) + } + } + } + + fn complete_scan_evidence( + &mut self, + output: ScanEvidenceWorkOutput, + ) -> VortexResult> { + if let Some(morsel) = self + .morsels + .get_mut(output.morsel_id) + .and_then(Option::as_mut) + && let Some(pending) = morsel.pending_scan_evidence.get_mut(output.predicate_idx) + { + *pending = pending.saturating_sub(1); + } + + if let Some(fragments) = output.fragments { + output.execution.record_scan_evidence( + output.predicate_idx, + output.evidence_idx, + output.version, + fragments, + )?; + } + + let affected = self + .morsels + .iter() + .enumerate() + .filter_map(|(morsel_id, morsel)| { + morsel + .as_ref() + .filter(|morsel| Arc::ptr_eq(&morsel.execution, &output.execution)) + .map(|_| morsel_id) + }) + .collect::>(); + + for morsel_id in affected { + if self + 
.morsels + .get(morsel_id) + .and_then(Option::as_ref) + .is_none() + { + continue; + } + if self.refresh_morsel_scan_evidence(morsel_id, output.predicate_idx)? { + if let Some(array) = self.finish_empty_morsel(morsel_id) { + return Ok(Some(array)); + } + } else { + self.enqueue_ready_work(morsel_id)?; + } + } + Ok(None) + } + + fn refresh_all_scan_evidence(&mut self, morsel_id: usize) -> VortexResult { + let Some(predicate_count) = self + .morsels + .get(morsel_id) + .and_then(Option::as_ref) + .map(|morsel| morsel.execution.predicates.len()) + else { + return Ok(false); + }; + + for predicate_idx in 0..predicate_count { + if self.refresh_morsel_scan_evidence(morsel_id, predicate_idx)? { + return Ok(true); + } + } + Ok(false) + } + + fn refresh_morsel_scan_evidence( + &mut self, + morsel_id: usize, + predicate_idx: usize, + ) -> VortexResult { + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(false); + }; + let predicate = &morsel.execution.predicates[predicate_idx]; + let version = predicate.version(); + let (generation, fragments) = + morsel + .execution + .scan_evidence_fragments(predicate_idx, version, &morsel.range)?; + let Some(seen_generation) = morsel.scan_evidence_generation.get_mut(predicate_idx) else { + vortex_bail!("missing scan evidence generation slot {predicate_idx}"); + }; + if generation <= *seen_generation { + return Ok(false); + } + *seen_generation = generation; + + let Some(slot) = morsel.evidence.get_mut(predicate_idx) else { + vortex_bail!("missing predicate evidence slot {predicate_idx}"); + }; + if slot + .as_ref() + .is_none_or(|evidence| evidence.version() != version) + { + *slot = Some(PredicateEvidence::new( + predicate.id, + version, + morsel.range.clone(), + )?); + } + let evidence = slot + .as_mut() + .ok_or_else(|| vortex_err!("missing predicate evidence after initialization"))?; + for fragment in fragments { + evidence.absorb(fragment)?; + } + let maybe = evidence.maybe().clone(); + 
let all_false = evidence.all_false(); + morsel.selected = &morsel.selected & &maybe; + Ok(morsel.selected.all_false() || all_false) + } + + fn complete_evidence(&mut self, output: EvidenceWorkOutput) -> VortexResult> { + let mut record_predicate = None; + let finish_empty = { + let Some(morsel) = self + .morsels + .get_mut(output.morsel_id) + .and_then(Option::as_mut) + else { + return Ok(None); + }; + match output.source { + EvidenceWorkSource::Provider => { + let Some(pending) = morsel.pending_evidence.get_mut(output.predicate_idx) + else { + vortex_bail!("missing predicate evidence count {}", output.predicate_idx); + }; + *pending = pending.saturating_sub(1); + } + EvidenceWorkSource::Predicate { + input_rows, + pass_rows, + } => { + let Some(queued) = morsel.predicate_queued.get_mut(output.predicate_idx) else { + vortex_bail!("missing predicate queued slot {}", output.predicate_idx); + }; + *queued = false; + let Some(done) = morsel.predicate_done.get_mut(output.predicate_idx) else { + vortex_bail!("missing predicate done slot {}", output.predicate_idx); + }; + *done = true; + record_predicate = Some(( + Arc::clone(&morsel.execution), + output.predicate_idx, + input_rows, + pass_rows, + )); + } + } + let predicate = &morsel.execution.predicates[output.predicate_idx]; + let Some(slot) = morsel.evidence.get_mut(output.predicate_idx) else { + vortex_bail!("missing predicate evidence slot {}", output.predicate_idx); + }; + if slot + .as_ref() + .is_none_or(|evidence| evidence.version() != output.version) + { + *slot = Some(PredicateEvidence::new( + predicate.id, + output.version, + morsel.range.clone(), + )?); + } + let evidence = slot + .as_mut() + .ok_or_else(|| vortex_err!("missing predicate evidence after initialization"))?; + for fragment in output.fragments { + evidence.absorb(fragment)?; + } + let maybe = evidence.maybe().clone(); + let all_false = evidence.all_false(); + morsel.selected = &morsel.selected & &maybe; + morsel.selected.all_false() || all_false 
+ }; + + if let Some((execution, predicate_idx, input_rows, pass_rows)) = record_predicate + && !execution.has_dynamic_predicates() + { + execution.record_predicate_result(predicate_idx, input_rows, pass_rows); + } + + if finish_empty { + return Ok(self.finish_empty_morsel(output.morsel_id)); + } + + self.enqueue_ready_work(output.morsel_id)?; + Ok(None) + } + + fn enqueue_ready_work(&mut self, morsel_id: usize) -> VortexResult<()> { + if self.refresh_all_scan_evidence(morsel_id)? { + self.finish_empty_morsel(morsel_id); + return Ok(()); + } + + if let Some((predicate_idx, need, priority)) = self.choose_ready_predicate(morsel_id)? { + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(()); + }; + let work = morsel.execution.plan_predicate_work( + morsel_id, + predicate_idx, + morsel.range.clone(), + need, + morsel.execution.predicates[predicate_idx].version(), + priority, + )?; + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(()); + }; + morsel.predicate_queued[predicate_idx] = true; + self.task_queue.push(work); + return Ok(()); + } + + let ready_to_project = self + .morsels + .get(morsel_id) + .and_then(Option::as_ref) + .is_some_and(|morsel| { + !morsel.projection_queued + && morsel.pending_evidence.iter().all(|pending| *pending == 0) + && morsel + .pending_scan_evidence + .iter() + .all(|pending| *pending == 0) + && morsel.predicate_queued.iter().all(|queued| !*queued) + && morsel.predicate_done.iter().all(|done| *done) + }); + if !ready_to_project { + return Ok(()); + } + + if self.enqueue_recheck_evidence(morsel_id)? 
{ + return Ok(()); + } + + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(()); + }; + let projection = morsel.execution.plan_projection_work( + morsel_id, + morsel.range.clone(), + morsel.selected.clone(), + )?; + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(()); + }; + morsel.projection_queued = true; + match projection { + Some(work) => self.task_queue.push(work), + None => { + self.finish_empty_morsel(morsel_id); + } + } + Ok(()) + } + + fn choose_ready_predicate( + &mut self, + morsel_id: usize, + ) -> VortexResult> { + loop { + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(None); + }; + if morsel.predicate_queued.iter().any(|queued| *queued) { + return Ok(None); + } + let dynamic_scan = morsel.execution.has_dynamic_predicates(); + if dynamic_scan + && (morsel.pending_evidence.iter().any(|pending| *pending != 0) + || morsel + .pending_scan_evidence + .iter() + .any(|pending| *pending != 0)) + { + return Ok(None); + } + + let mut best: Option<(u64, usize, Mask)> = None; + let mut advanced = false; + for predicate_idx in 0..morsel.execution.predicates.len() { + if morsel.predicate_done[predicate_idx] + || morsel.predicate_queued[predicate_idx] + || morsel.pending_evidence[predicate_idx] != 0 + || morsel.pending_scan_evidence[predicate_idx] != 0 + { + continue; + } + if morsel.evidence[predicate_idx].is_none() { + let predicate = &morsel.execution.predicates[predicate_idx]; + morsel.evidence[predicate_idx] = Some(PredicateEvidence::new( + predicate.id, + predicate.version(), + morsel.range.clone(), + )?); + } + let evidence = morsel.evidence[predicate_idx].as_ref().ok_or_else(|| { + vortex_err!( + "missing evidence for predicate {predicate_idx} before residual read" + ) + })?; + let need = &morsel.selected & &evidence.unproven(); + if need.all_false() { + morsel.predicate_done[predicate_idx] = true; + advanced = true; + 
continue; + } + let priority = if dynamic_scan { + u64::try_from(predicate_idx).unwrap_or(u64::MAX) + } else { + morsel + .execution + .predicate_priority(predicate_idx, need.true_count()) + }; + if best.as_ref().is_none_or(|(best_priority, best_idx, _)| { + (priority, predicate_idx) < (*best_priority, *best_idx) + }) { + best = Some((priority, predicate_idx, need)); + } + } + if advanced { + continue; + } + return Ok(best.map(|(priority, predicate_idx, need)| (predicate_idx, need, priority))); + } + } + + fn enqueue_recheck_evidence(&mut self, morsel_id: usize) -> VortexResult { + loop { + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(false); + }; + if morsel.next_recheck_predicate >= morsel.execution.predicates.len() { + return Ok(false); + } + + let predicate_idx = morsel.next_recheck_predicate; + let predicate = &morsel.execution.predicates[predicate_idx]; + let current_version = predicate.version(); + let evidence_version = morsel.evidence[predicate_idx] + .as_ref() + .map(PredicateEvidence::version) + .unwrap_or(PredicateVersion::STATIC); + let has_dynamic = predicate.dynamic_updates.is_some(); + let has_scan_recheck_evidence = predicate.has_scan_recheck_evidence(); + let has_morsel_recheck_evidence = predicate.has_morsel_recheck_evidence(); + + if has_dynamic && has_scan_recheck_evidence && current_version != evidence_version { + let work = morsel.execution.plan_scan_evidence_work( + morsel_id, + predicate_idx, + current_version, + EvidenceMode::RecheckBeforeProjection, + )?; + if !work.is_empty() { + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) + else { + return Ok(false); + }; + morsel.pending_scan_evidence[predicate_idx] = + morsel.pending_scan_evidence[predicate_idx].saturating_add(work.len()); + self.task_queue.extend(work); + return Ok(true); + } + if self.refresh_morsel_scan_evidence(morsel_id, predicate_idx)? 
{ + self.finish_empty_morsel(morsel_id); + return Ok(true); + } + } + + let Some(morsel) = self.morsels.get(morsel_id).and_then(Option::as_ref) else { + return Ok(false); + }; + let evidence_version = morsel.evidence[predicate_idx] + .as_ref() + .map(PredicateEvidence::version) + .unwrap_or(PredicateVersion::STATIC); + + if has_dynamic && has_morsel_recheck_evidence && current_version != evidence_version { + let work = morsel.execution.plan_evidence_work( + morsel_id, + predicate_idx, + morsel.range.clone(), + current_version, + EvidenceMode::RecheckBeforeProjection, + )?; + if work.is_empty() { + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) + else { + return Ok(false); + }; + morsel.next_recheck_predicate = morsel.next_recheck_predicate.saturating_add(1); + continue; + } + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(false); + }; + morsel.pending_evidence[predicate_idx] = + morsel.pending_evidence[predicate_idx].saturating_add(work.len()); + self.task_queue.extend(work); + return Ok(true); + } + + let Some(morsel) = self.morsels.get_mut(morsel_id).and_then(Option::as_mut) else { + return Ok(false); + }; + morsel.next_recheck_predicate = morsel.next_recheck_predicate.saturating_add(1); + } + } + + fn finish_empty_morsel(&mut self, morsel_id: usize) -> Option { + if self.finish_morsel(morsel_id) && self.ordered { + self.completed_morsels + .insert(morsel_id, CompletedMorsel::Empty); + return self.pop_ready_output(); + } + None + } + + fn finish_output_morsel(&mut self, morsel_id: usize, array: ArrayRef) -> Option { + if !self.finish_morsel(morsel_id) { + return None; + } + if self.ordered { + self.completed_morsels + .insert(morsel_id, CompletedMorsel::Output(array)); + self.pop_ready_output() + } else { + Some(array) + } + } + + fn finish_morsel(&mut self, morsel_id: usize) -> bool { + if let Some(slot) = self.morsels.get_mut(morsel_id) + && slot.take().is_some() + { + 
self.active_morsels = self.active_morsels.saturating_sub(1); + return true; + } + false + } + + fn pop_ready_output(&mut self) -> Option { + if !self.ordered { + return None; + } + loop { + match self.completed_morsels.remove(&self.next_emit_morsel_id) { + Some(CompletedMorsel::Empty) => { + self.next_emit_morsel_id = self.next_emit_morsel_id.saturating_add(1); + } + Some(CompletedMorsel::Output(array)) => { + self.next_emit_morsel_id = self.next_emit_morsel_id.saturating_add(1); + return Some(array); + } + None => return None, + } + } + } + + fn has_completed_output_backlog(&self) -> bool { + self.completed_morsels + .values() + .any(|morsel| matches!(morsel, CompletedMorsel::Output(_))) + } +} + +struct ScanPlanPartition { + binding: ScanPlanBinding, + session: VortexSession, + prepared: Arc, + row_range: Range, + index: usize, + scheduler: Arc, + limit_remaining: Option>, +} + +impl Partition for ScanPlanPartition { + fn as_any(&self) -> &dyn Any { + self + } + + fn index(&self) -> usize { + self.index + } + + fn row_count(&self) -> Precision { + let row_count = self.row_range.end - self.row_range.start; + let row_count = self.prepared.selection().row_count(row_count); + let row_count = self + .prepared + .limit() + .map_or(row_count, |limit| row_count.min(limit)); + + if self.prepared.has_filter() { + Precision::inexact(row_count) + } else { + Precision::exact(row_count) + } + } + + fn byte_size(&self) -> Precision { + Precision::Absent + } + + fn execute(self: Box) -> VortexResult { + let ScanPlanPartition { + binding, + session, + prepared, + row_range, + index: _, + scheduler, + limit_remaining, + } = *self; + + let execution = Arc::new(ScanExecution::try_new( + binding, + session, + prepared, + limit_remaining, + )?); + let handle = execution.session.handle(); + let dtype = execution.plan.dtype().clone(); + let ranges = execution.splits(&row_range)?; + let ordered = execution.plan.ordered(); + let plan_window = 
plan_window_for_limit(execution.limit_remaining.is_some()); + let read_byte_budget = read_byte_budget(&scheduler); + let morsels = ranges + .into_iter() + .map(|range| PlannedScanPlanMorsel { + execution: Arc::clone(&execution), + range, + }) + .collect::>(); + + let stream = partition_work_stream(morsels, handle, ordered, plan_window, read_byte_budget); + + Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( + dtype, stream, + ))) + } +} + +struct PlannedScanPlanScan { + dtype: DType, + partitions: Vec>, + handle: Handle, + read_byte_budget: u64, +} + +#[derive(Clone)] +struct PlannedScanPlanMorsel { + execution: Arc, + range: Range, +} + +impl PlannedMorselScan for PlannedScanPlanScan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn partition_count(&self) -> usize { + self.partitions.len() + } + + fn partition(self: Arc, partition: usize) -> VortexResult { + if partition >= self.partitions.len() { + vortex_bail!( + "planned scan partition {partition} is outside 0..{}", + self.partitions.len() + ); + } + + Ok(Box::new(PlannedScanPlanPartition { + planned: self, + index: partition, + })) + } +} + +struct PlannedScanPlanPartition { + planned: Arc, + index: usize, +} + +impl Partition for PlannedScanPlanPartition { + fn as_any(&self) -> &dyn Any { + self + } + + fn index(&self) -> usize { + self.index + } + + fn row_count(&self) -> Precision { + let mut row_count = 0u64; + let mut has_filter = false; + + for morsel in &self.planned.partitions[self.index] { + let range_len = morsel.range.end - morsel.range.start; + row_count = + row_count.saturating_add(morsel.execution.plan.selection().row_count(range_len)); + has_filter |= morsel.execution.plan.has_filter(); + } + + if has_filter { + Precision::inexact(row_count) + } else { + Precision::exact(row_count) + } + } + + fn byte_size(&self) -> Precision { + Precision::Absent + } + + fn execute(self: Box) -> VortexResult { + let PlannedScanPlanPartition { planned, index } = *self; + let morsels = 
planned.partitions[index].clone(); + let dtype = planned.dtype.clone(); + let handle = planned.handle.clone(); + let stream = + partition_work_stream(morsels, handle, false, usize::MAX, planned.read_byte_budget); + + Ok(ArrayStreamExt::boxed(ArrayStreamAdapter::new( + dtype, stream, + ))) + } +} + +struct PreparedScanPlan { + // Request-level physical plan after pushdown. This must stay free of per-scan IO state. + dtype: DType, + selection: Selection, + ordered: bool, + limit: Option, + row_count: u64, + split_hints: Vec, + projection: ScanPlanRef, + predicates: Vec, +} + +struct PreparedPredicatePlan { + id: PredicateId, + expr: Expression, + plan: ScanPlanRef, +} + +struct ScanExecution { + // Runtime instantiation of a prepared plan: source binding, prepared handles, and scan state. + session: VortexSession, + plan: Arc, + limit_remaining: Option>, + projection: PreparedReadRef, + predicates: Vec, + predicate_stats: Mutex>, + scan_evidence: Mutex, +} + +struct ExecutionPredicate { + id: PredicateId, + expr: Expression, + static_cost: u64, + dynamic_updates: Option, + read: PreparedReadRef, + evidence: Vec, +} + +impl ExecutionPredicate { + fn version(&self) -> PredicateVersion { + self.dynamic_updates + .as_ref() + .map(|updates| PredicateVersion::new(updates.version())) + .unwrap_or(PredicateVersion::STATIC) + } + + fn has_morsel_recheck_evidence(&self) -> bool { + self.evidence + .iter() + .any(|plan| plan.scope() == EvidenceScope::Morsel && plan.recheck_before_projection()) + } + + fn has_scan_recheck_evidence(&self) -> bool { + self.evidence + .iter() + .any(|plan| plan.scope() == EvidenceScope::Scan && plan.recheck_before_projection()) + } +} + +impl PreparedScanPlan { + fn try_new( + binding: &ScanPlanBinding, + session: &VortexSession, + request: &DataSourceScanRequest, + ) -> VortexResult { + let dtype = binding.root().dtype(); + let return_dtype = request.projection.return_dtype(dtype)?; + let projection = request.projection.optimize_recursive(dtype)?; 
+ let filter = request + .filter + .clone() + .map(|filter| filter.optimize_recursive(dtype)) + .transpose()?; + + let root = binding.root(); + let projection_pushed = push_expr(root, &projection, dtype, session)?; + let mut split_hints = Vec::new(); + extend_split_hints(&projection_pushed, &mut split_hints); + + // Run cheap, likely-selective conjuncts first so an expensive residual (e.g. an FSST `LIKE`) + // only evaluates over the rows that survive the cheaper predicates. AND is commutative, so + // reordering is semantically safe; `PredicateId`s are assigned by final slot below (after the + // sort) so each predicate's evidence/read stay self-consistent with its id. + let mut ordered_conjuncts = filter.as_ref().map(conjuncts).unwrap_or_default(); + ordered_conjuncts.sort_by_cached_key(predicate_cost); + let predicates = ordered_conjuncts + .into_iter() + .enumerate() + .map(|(idx, expr)| { + let id = PredicateId::new( + u32::try_from(idx).map_err(|_| vortex_err!("too many predicates"))?, + ); + let pushed = push_expr(root, &expr, dtype, session)?; + extend_split_hints(&pushed, &mut split_hints); + Ok(PreparedPredicatePlan { + id, + expr, + plan: pushed, + }) + }) + .collect::>>()?; + + Ok(Self { + dtype: return_dtype, + selection: request.selection.clone(), + ordered: request.ordered, + limit: request.limit, + row_count: binding.row_count(), + split_hints, + projection: projection_pushed, + predicates, + }) + } + + fn dtype(&self) -> &DType { + &self.dtype + } + + fn selection(&self) -> &Selection { + &self.selection + } + + fn ordered(&self) -> bool { + self.ordered + } + + fn limit(&self) -> Option { + self.limit + } + + fn predicates(&self) -> &[PreparedPredicatePlan] { + &self.predicates + } + + fn has_filter(&self) -> bool { + !self.predicates.is_empty() + } + + fn projection(&self) -> &ScanPlanRef { + &self.projection + } + + fn splits(&self, row_range: &Range) -> VortexResult>> { + check_range(row_range, self.row_count)?; + let (splits, split_kind) = 
prepare_split_ranges( + self.row_count, + row_range, + &self.selection, + self.split_hints.clone(), + ); + trace_prepared_splits(row_range, &splits, split_kind, self.has_filter()); + Ok(splits) + } +} + +impl ScanExecution { + fn try_new( + binding: ScanPlanBinding, + session: VortexSession, + plan: Arc, + limit_remaining: Option>, + ) -> VortexResult { + let mut prepare_ctx = PrepareCtx::with_state_cache(session.clone(), binding.state_cache()); + let projection = Arc::clone(plan.projection()) + .prepare_read(&mut prepare_ctx)? + .ok_or_else(|| vortex_err!("scan2 could not plan read for pushed projection"))?; + let predicates = plan + .predicates() + .iter() + .map(|predicate| { + let read = Arc::clone(&predicate.plan) + .prepare_read(&mut prepare_ctx)? + .ok_or_else(|| { + vortex_err!("scan2 could not plan predicate read {}", predicate.expr) + })?; + let evidence = Arc::clone(&predicate.plan).prepare_evidence(&mut prepare_ctx)?; + let dynamic_updates = DynamicExprUpdates::new(&predicate.expr); + Ok(ExecutionPredicate { + id: predicate.id, + expr: predicate.expr.clone(), + static_cost: predicate_cost(&predicate.expr), + dynamic_updates, + read, + evidence, + }) + }) + .collect::>>()?; + let predicate_stats = (0..predicates.len()) + .map(|_| PredicateRuntimeStats::default()) + .collect(); + let scan_evidence = ScanEvidenceStore { + predicates: predicates + .iter() + .map(|predicate| PredicateScanEvidenceStore { + generation: 0, + providers: predicate + .evidence + .iter() + .map(|_| ScanEvidenceSlot::default()) + .collect(), + }) + .collect(), + }; + + Ok(Self { + session, + plan, + limit_remaining, + projection, + predicates, + predicate_stats: Mutex::new(predicate_stats), + scan_evidence: Mutex::new(scan_evidence), + }) + } + + fn read_context(&self) -> ReadContext { + ReadContext::new(self.session.clone()) + } + + fn predicate_priority(&self, predicate_idx: usize, demand_rows: usize) -> u64 { + let predicate = &self.predicates[predicate_idx]; + let static_cost = 
predicate.static_cost.max(1); + let demand_rows = u64::try_from(demand_rows).unwrap_or(u64::MAX).max(1); + let stats = self.predicate_stats.lock(); + let stats = &stats[predicate_idx]; + let rejection_per_mille = if stats.input_rows >= 1024 { + stats.rejected_rows.saturating_mul(1000) / stats.input_rows.max(1) + } else { + // Before feedback exists, preserve the existing static cheap-first ordering while still + // giving every predicate a nonzero expected benefit. + 500 + } + .max(1); + let expected_rejected = demand_rows.saturating_mul(rejection_per_mille) / 1000; + static_cost.saturating_mul(1_000_000) / expected_rejected.max(1) + } + + fn has_dynamic_predicates(&self) -> bool { + self.predicates + .iter() + .any(|predicate| predicate.dynamic_updates.is_some()) + } + + fn record_predicate_result(&self, predicate_idx: usize, input_rows: usize, pass_rows: usize) { + let input_rows = u64::try_from(input_rows).unwrap_or(u64::MAX); + let pass_rows = u64::try_from(pass_rows).unwrap_or(u64::MAX); + let rejected_rows = input_rows.saturating_sub(pass_rows); + let mut stats = self.predicate_stats.lock(); + let stats = &mut stats[predicate_idx]; + stats.input_rows = stats.input_rows.saturating_add(input_rows); + stats.rejected_rows = stats.rejected_rows.saturating_add(rejected_rows); + } + + fn use_scan_scope_evidence(&self, predicate_idx: usize, mode: EvidenceMode) -> bool { + mode == EvidenceMode::RecheckBeforeProjection + || self.predicates[predicate_idx].static_cost >= SCAN_SCOPE_MIN_PREDICATE_COST + } + + fn plan_morsel( + self: &Arc, + morsel_id: usize, + range: Range, + ) -> VortexResult> { + let selected = self.plan.selection().row_mask(&range).mask().clone(); + if selected.all_false() { + return Ok(None); + } + + let mut evidence = Vec::new(); + let mut pending_evidence = Vec::with_capacity(self.predicates.len()); + let mut pending_scan_evidence = Vec::with_capacity(self.predicates.len()); + for predicate_idx in 0..self.predicates.len() { + let version = 
self.predicates[predicate_idx].version(); + let scan_work = self.plan_scan_evidence_work( + morsel_id, + predicate_idx, + version, + EvidenceMode::Normal, + )?; + pending_scan_evidence.push(scan_work.len()); + evidence.extend(scan_work); + + let morsel_work = self.plan_evidence_work( + morsel_id, + predicate_idx, + range.clone(), + version, + EvidenceMode::Normal, + )?; + pending_evidence.push(morsel_work.len()); + evidence.extend(morsel_work); + } + + let state = MorselState { + execution: Arc::clone(self), + range, + selected, + evidence: (0..self.predicates.len()).map(|_| None).collect(), + pending_evidence, + pending_scan_evidence, + scan_evidence_generation: vec![0; self.predicates.len()], + predicate_queued: vec![false; self.predicates.len()], + predicate_done: vec![false; self.predicates.len()], + next_recheck_predicate: 0, + projection_queued: false, + }; + + Ok(Some(PlannedMorselWork { state, evidence })) + } + + fn reserve_scan_evidence( + &self, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + create_waiter: bool, + ) -> VortexResult { + let mut store = self.scan_evidence.lock(); + let slot = store + .predicates + .get_mut(predicate_idx) + .and_then(|predicate| predicate.providers.get_mut(evidence_idx)) + .ok_or_else(|| { + vortex_err!( + "missing scan evidence slot for predicate {predicate_idx} provider {evidence_idx}" + ) + })?; + if slot.version == Some(version) { + return Ok(ScanEvidenceAction::Ready); + } + if slot.pending == Some(version) { + if !create_waiter { + return Ok(ScanEvidenceAction::Pending); + } + return Ok(ScanEvidenceAction::Wait); + } + + // Any older version is superseded. Polling waiters observe the version change and + // re-enter planning for the current dynamic boundary. 
+ slot.pending = Some(version); + Ok(ScanEvidenceAction::Prepare) + } + + fn clear_scan_evidence_pending( + &self, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + ) { + let mut store = self.scan_evidence.lock(); + let Some(slot) = store + .predicates + .get_mut(predicate_idx) + .and_then(|predicate| predicate.providers.get_mut(evidence_idx)) + else { + return; + }; + if slot.pending == Some(version) { + slot.pending = None; + } + } + + fn scan_evidence_provider_ready( + &self, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + ) -> bool { + self.scan_evidence + .lock() + .predicates + .get(predicate_idx) + .and_then(|predicate| predicate.providers.get(evidence_idx)) + .is_some_and(|slot| slot.version == Some(version)) + } + + fn record_scan_evidence( + &self, + predicate_idx: usize, + evidence_idx: usize, + version: PredicateVersion, + mut fragments: Vec, + ) -> VortexResult { + fragments.sort_by_key(|fragment| (fragment.rows.start, fragment.rows.end)); + let mut store = self.scan_evidence.lock(); + let predicate = store + .predicates + .get_mut(predicate_idx) + .ok_or_else(|| vortex_err!("missing scan evidence predicate slot {predicate_idx}"))?; + let slot = predicate.providers.get_mut(evidence_idx).ok_or_else(|| { + vortex_err!( + "missing scan evidence provider slot {evidence_idx} for predicate {predicate_idx}" + ) + })?; + + if slot.pending != Some(version) && slot.version != Some(version) { + return Ok(false); + } + + slot.version = Some(version); + slot.pending = None; + slot.fragments = fragments; + predicate.generation = predicate.generation.saturating_add(1); + Ok(true) + } + + fn scan_evidence_fragments( + &self, + predicate_idx: usize, + version: PredicateVersion, + range: &Range, + ) -> VortexResult<(u64, Vec)> { + let store = self.scan_evidence.lock(); + let Some(predicate) = store.predicates.get(predicate_idx) else { + vortex_bail!("missing scan evidence predicate slot {predicate_idx}"); + }; + 
let generation = predicate.generation; + let mut fragments = Vec::new(); + for slot in &predicate.providers { + if slot.version == Some(version) { + push_overlapping_fragments(&slot.fragments, range, &mut fragments)?; + } + } + Ok((generation, fragments)) + } + + fn plan_scan_evidence_work( + self: &Arc, + morsel_id: usize, + predicate_idx: usize, + version: PredicateVersion, + mode: EvidenceMode, + ) -> VortexResult> { + if !self.use_scan_scope_evidence(predicate_idx, mode) { + return Ok(Vec::new()); + } + + let predicate = &self.predicates[predicate_idx]; + let predicate_idx_u32 = + u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; + let mut work = Vec::new(); + for (evidence_idx, plan) in predicate.evidence.iter().enumerate() { + if plan.scope() != EvidenceScope::Scan { + continue; + } + if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { + continue; + } + + let evidence_idx_u32 = + u32::try_from(evidence_idx).map_err(|_| vortex_err!("too many evidence plans"))?; + let priority = plan + .cost( + &OwnedEvidenceRequest { + id: predicate.id, + version, + predicate: predicate.expr.clone(), + range: 0..self.plan.row_count, + mode, + } + .as_request(), + ) + .priority(0, mode == EvidenceMode::RecheckBeforeProjection) + .saturating_add(predicate.static_cost); + + let create_waiter = mode == EvidenceMode::RecheckBeforeProjection; + match self.reserve_scan_evidence(predicate_idx, evidence_idx, version, create_waiter)? 
{ + ScanEvidenceAction::Ready => {} + ScanEvidenceAction::Pending => {} + ScanEvidenceAction::Wait => { + work.push(Box::new(ScanEvidenceWaitTask { + execution: Arc::clone(self), + morsel_id, + predicate_idx, + evidence_idx, + version, + lane: ScanTaskLane::ScanEvidence { + predicate_idx: predicate_idx_u32, + evidence_idx: evidence_idx_u32, + }, + priority, + }) as QueuedWork); + } + ScanEvidenceAction::Prepare => { + let req = OwnedEvidenceRequest { + id: predicate.id, + version, + predicate: predicate.expr.clone(), + range: 0..self.plan.row_count, + mode, + }; + let result = (|| { + let task = Arc::clone(plan) + .create_task(req.clone(), ScanIoPhase::EvidenceProbe)?; + let step = task.into_step()?; + let work_reads = ScanTaskRead::from_scan_reads(&step.required_reads); + let priority = plan + .cost(&req.as_request()) + .priority( + scan_task_read_bytes(&work_reads), + mode == EvidenceMode::RecheckBeforeProjection, + ) + .saturating_add(predicate.static_cost); + let execution = Arc::clone(self); + Ok(ScanStep::new( + morsel_id, + ScanIoPhase::EvidenceProbe, + ScanTaskLane::ScanEvidence { + predicate_idx: predicate_idx_u32, + evidence_idx: evidence_idx_u32, + }, + work_reads, + step.required_reads, + step.prefetch_reads, + move |results| { + let reader = execution.read_context(); + let fragments = step.continuation.run(&reader, results)?; + Ok(ScanStepResult::Ready(WorkOutput::ScanEvidence( + ScanEvidenceWorkOutput { + execution, + morsel_id, + predicate_idx, + evidence_idx, + version, + fragments: Some(fragments), + }, + ))) + }, + ) + .with_priority(priority) + .boxed()) + })(); + match result { + Ok(task) => work.push(task), + Err(error) => { + self.clear_scan_evidence_pending(predicate_idx, evidence_idx, version); + return Err(error); + } + } + } + } + } + Ok(work) + } + + fn plan_evidence_work( + self: &Arc, + morsel_id: usize, + predicate_idx: usize, + range: Range, + version: PredicateVersion, + mode: EvidenceMode, + ) -> VortexResult> { + let predicate = 
&self.predicates[predicate_idx]; + let req = OwnedEvidenceRequest { + id: predicate.id, + version, + predicate: predicate.expr.clone(), + range, + mode, + }; + let predicate_idx_u32 = + u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; + let mut work = Vec::with_capacity(predicate.evidence.len()); + for (evidence_idx, plan) in predicate.evidence.iter().enumerate() { + if plan.scope() == EvidenceScope::Scan + && self.use_scan_scope_evidence(predicate_idx, mode) + { + continue; + } + if mode == EvidenceMode::RecheckBeforeProjection && !plan.recheck_before_projection() { + continue; + } + let evidence_idx_u32 = + u32::try_from(evidence_idx).map_err(|_| vortex_err!("too many evidence plans"))?; + let task = Arc::clone(plan).create_task(req.clone(), ScanIoPhase::EvidenceProbe)?; + let step = task.into_step()?; + let work_reads = ScanTaskRead::from_scan_reads(&step.required_reads); + let priority = plan + .cost(&req.as_request()) + .priority( + scan_task_read_bytes(&work_reads), + mode == EvidenceMode::RecheckBeforeProjection, + ) + .saturating_add(predicate.static_cost); + let execution = Arc::clone(self); + work.push( + ScanStep::new( + morsel_id, + ScanIoPhase::EvidenceProbe, + ScanTaskLane::Evidence { + predicate_idx: predicate_idx_u32, + evidence_idx: evidence_idx_u32, + }, + work_reads, + step.required_reads, + step.prefetch_reads, + move |results| { + let reader = execution.read_context(); + let fragments = step.continuation.run(&reader, results)?; + Ok(ScanStepResult::Ready(WorkOutput::Evidence( + EvidenceWorkOutput { + morsel_id, + predicate_idx, + version, + source: EvidenceWorkSource::Provider, + fragments, + }, + ))) + }, + ) + .with_priority(priority) + .boxed(), + ); + } + Ok(work) + } + + fn plan_predicate_work( + self: &Arc, + morsel_id: usize, + predicate_idx: usize, + range: Range, + need: Mask, + version: PredicateVersion, + priority: u64, + ) -> VortexResult { + let len = range_len(&range)?; + let predicate = 
&self.predicates[predicate_idx]; + let compact = need.density() < EXPR_EVAL_THRESHOLD; + let rows = if compact { + OwnedRowScope::selected(need.clone()) + } else { + OwnedRowScope::try_new(Mask::new_true(len), need.clone())? + }; + let phase = ScanIoPhase::PredicateRead; + let task = Arc::clone(&predicate.read).create_task(range.clone(), rows, phase)?; + + let predicate_idx_u32 = + u32::try_from(predicate_idx).map_err(|_| vortex_err!("too many predicates"))?; + let state = PredicateReadWorkState { + execution: Arc::clone(self), + morsel_id, + predicate_idx, + version, + range, + need, + compact, + len, + priority, + lane: ScanTaskLane::Predicate { + predicate_idx: predicate_idx_u32, + }, + }; + Ok(Box::new(PredicateReadWorkTask::try_new(state, task)?)) + } + + fn plan_projection_work( + self: &Arc, + morsel_id: usize, + range: Range, + selected: Mask, + ) -> VortexResult> { + // Projection consumes the final selected rows after every predicate plan has contributed + // metadata evidence and, if needed, exact residual evidence. There is no separate + // predicate-demand mask at this point. + let len = range_len(&range)?; + let selected = if let Some(limit_remaining) = &self.limit_remaining { + limit_mask(selected, limit_remaining)? 
+ } else { + selected + }; + if selected.all_false() { + return Ok(None); + } + if selected.len() != len { + vortex_bail!( + "scan2 projection selection length {} does not match range length {len}", + selected.len() + ); + } + + let rows = OwnedRowScope::selected(selected); + let phase = ScanIoPhase::ProjectionRead; + let task = Arc::clone(&self.projection).create_task(range, rows, phase)?; + + let execution = Arc::clone(self); + Ok(Some(Box::new(ProjectionReadWorkTask::try_new( + execution, task, morsel_id, + )?))) + } + + fn splits(&self, row_range: &Range) -> VortexResult>> { + self.plan.splits(row_range) + } +} + +fn push_overlapping_fragments( + fragments: &[EvidenceFragment], + range: &Range, + output: &mut Vec, +) -> VortexResult<()> { + let start = fragments + .partition_point(|fragment| fragment.rows.start < range.start) + .saturating_sub(1); + for fragment in &fragments[start..] { + if fragment.rows.start >= range.end { + break; + } + if let Some(fragment) = slice_evidence_fragment(fragment, range)? { + output.push(fragment); + } + } + Ok(()) +} + +fn slice_evidence_fragment( + fragment: &EvidenceFragment, + range: &Range, +) -> VortexResult> { + let rows = fragment.rows.start.max(range.start)..fragment.rows.end.min(range.end); + if rows.start >= rows.end { + return Ok(None); + } + if rows == fragment.rows { + return Ok(Some(fragment.clone())); + } + + let local = usize::try_from(rows.start - fragment.rows.start) + .map_err(|_| vortex_err!("evidence fragment exceeds usize"))? 
+ ..usize::try_from(rows.end - fragment.rows.start) + .map_err(|_| vortex_err!("evidence fragment exceeds usize"))?; + let kind = match &fragment.kind { + PredicateEvidenceKind::AllFalse => PredicateEvidenceKind::AllFalse, + PredicateEvidenceKind::AllTrue => PredicateEvidenceKind::AllTrue, + PredicateEvidenceKind::Unknown => PredicateEvidenceKind::Unknown, + PredicateEvidenceKind::ExactMask(mask) => { + PredicateEvidenceKind::ExactMask(mask.slice(local)) + } + PredicateEvidenceKind::CandidateMask(mask) => { + PredicateEvidenceKind::CandidateMask(mask.slice(local)) + } + }; + Ok(Some(EvidenceFragment::new(rows, kind))) +} + +fn push_expr( + root: &ScanPlanRef, + expr: &Expression, + dtype: &DType, + session: &VortexSession, +) -> VortexResult { + validate_temporal_comparisons(expr, dtype)?; + Arc::clone(root) + .try_push_expr(expr, &mut PushCtx::new(session.clone()))? + .ok_or_else(|| vortex_err!("scan2 could not push expression {expr}")) +} + +fn validate_temporal_comparisons(expr: &Expression, scope: &DType) -> VortexResult<()> { + for child in expr.children().iter() { + validate_temporal_comparisons(child, scope)?; + } + + let Some(operator) = expr.as_opt::() else { + return Ok(()); + }; + if !operator.is_comparison() { + return Ok(()); + } + + let lhs = expr.child(0).return_dtype(scope)?; + let rhs = expr.child(1).return_dtype(scope)?; + if is_temporal(&lhs) && is_temporal(&rhs) && !lhs.eq_ignore_nullability(&rhs) { + vortex_bail!("Cannot compare temporal DTypes with different metadata: {lhs} and {rhs}"); + } + + Ok(()) +} + +fn is_temporal(dtype: &DType) -> bool { + match dtype { + DType::Extension(ext) => ext.metadata_opt::().is_some(), + _ => false, + } +} + +fn extend_split_hints(plan: &ScanPlanRef, points: &mut Vec) { + if let Some(hints) = plan.split_hints() { + points.extend_from_slice(hints); + } +} + +#[derive(Clone, Copy, Debug)] +enum PreparedSplitKind { + SelectionRanges, + Natural, +} + +fn prepare_split_ranges( + row_count: u64, + row_range: 
&Range, + selection: &Selection, + split_hints: Vec, +) -> (Vec>, PreparedSplitKind) { + let explicit_row_range = explicit_row_range(row_count, row_range); + if let Some(ranges) = selection_split_ranges(selection, explicit_row_range) { + return (ranges, PreparedSplitKind::SelectionRanges); + } + + let file_range = 0..row_count; + let selection_range = intersect_ranges(Some(&file_range), selection_bounding_range(selection)); + let bounded_range = intersect_ranges(explicit_row_range, selection_range); + let points = normalize_split_points(row_count, split_hints); + ( + natural_split_ranges(&points, bounded_range.as_ref()), + PreparedSplitKind::Natural, + ) +} + +fn explicit_row_range(row_count: u64, row_range: &Range) -> Option<&Range> { + (row_range.start != 0 || row_range.end != row_count).then_some(row_range) +} + +fn selection_split_ranges( + selection: &Selection, + row_range: Option<&Range>, +) -> Option>> { + let Selection::IncludeByIndex(buffer) = selection else { + return None; + }; + if row_range.is_some() { + return None; + } + + let indices = buffer.as_slice(); + if indices.is_empty() { + return Some(Vec::new()); + } + debug_assert!(indices.is_sorted()); + + let mut ranges = Vec::with_capacity((indices.len() as u64 / MAX_SELECTION_RANGE_SIZE) as usize); + let mut curr_start = indices[0]; + let mut curr_end = indices[0].saturating_add(1); + for &idx in &indices[1..] 
{ + let idx_end = idx.saturating_add(1); + let new_range_size = idx_end.saturating_sub(curr_start); + let gap = idx_end.saturating_sub(curr_end); + if new_range_size >= MAX_SELECTION_RANGE_SIZE { + if gap >= MIN_SELECTION_GAP_BETWEEN_RANGES { + ranges.push(curr_start..curr_end); + curr_start = idx; + curr_end = idx_end; + } else { + return None; + } + } else { + curr_end = idx_end; + } + } + ranges.push(curr_start..curr_end); + Some(ranges) +} + +fn selection_bounding_range(selection: &Selection) -> Option> { + match selection { + Selection::IncludeByIndex(buffer) => { + let indices = buffer.as_slice(); + indices + .first() + .zip(indices.last()) + .map(|(&first, &last)| first..last.saturating_add(1)) + } + Selection::IncludeRoaring(roaring) if !roaring.is_empty() => { + Some(roaring.min()?..roaring.max()?.saturating_add(1)) + } + _ => None, + } +} + +fn intersect_ranges(left: Option<&Range>, right: Option>) -> Option> { + match (left, right) { + (Some(left), Some(right)) => Some(left.start.max(right.start)..left.end.min(right.end)), + (Some(left), None) => Some(left.clone()), + (None, Some(right)) => Some(right), + (None, None) => None, + } +} + +fn normalize_split_points(row_count: u64, mut hints: Vec) -> Vec { + hints.push(0); + hints.push(row_count); + hints.retain(|&hint| hint <= row_count); + hints.sort_unstable(); + hints.dedup(); + hints +} + +fn natural_split_ranges(split_points: &[u64], row_range: Option<&Range>) -> Vec> { + let points = if let Some(row_range) = row_range { + if row_range.start >= row_range.end { + return Vec::new(); + } + let mut points = Vec::new(); + points.push(row_range.start); + points.extend( + split_points + .iter() + .copied() + .filter(|&point| row_range.start < point && point < row_range.end), + ); + points.push(row_range.end); + points.sort_unstable(); + points.dedup(); + points + } else { + split_points.to_vec() + }; + + points + .windows(2) + .filter_map(|window| { + let range = window[0]..window[1]; + (range.start < 
range.end).then_some(range) + }) + .collect() +} + +fn trace_prepared_splits( + row_range: &Range, + splits: &[Range], + split_kind: PreparedSplitKind, + has_filter: bool, +) { + tracing::debug!( + target: "vortex_scan::plan::data_source", + ?split_kind, + split_count = splits.len(), + row_start = row_range.start, + row_end = row_range.end, + first_split = ?splits.first(), + last_split = ?splits.last(), + has_filter, + "prepared scan2 splits" + ); + tracing::trace!( + target: "vortex_scan::plan::data_source", + ?splits, + "prepared scan2 split ranges" + ); +} + +fn check_range(range: &Range, row_count: u64) -> VortexResult<()> { + if range.start > range.end || range.end > row_count { + vortex_bail!( + "scan2 row range {:?} is out of bounds for row count {}", + range, + row_count + ); + } + range_len(range).map(|_| ()) +} + +fn range_len(range: &Range) -> VortexResult { + let len = range + .end + .checked_sub(range.start) + .ok_or_else(|| vortex_err!("scan2 row range end is before start: {range:?}"))?; + usize::try_from(len).map_err(|_| vortex_err!("scan2 row range exceeds usize")) +} + +fn limit_mask(mask: Mask, remaining: &AtomicU64) -> VortexResult { + let true_count = mask.true_count(); + let true_count = + u64::try_from(true_count).map_err(|_| vortex_err!("mask count exceeds u64"))?; + + loop { + let available = remaining.load(Ordering::Acquire); + if available == 0 { + return Ok(Mask::new_false(mask.len())); + } + + let take = true_count.min(available); + if remaining + .compare_exchange_weak( + available, + available - take, + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_err() + { + continue; + } + + if take == true_count { + return Ok(mask); + } + + let take = usize::try_from(take).unwrap_or(usize::MAX); + return Ok(Mask::from_indices( + mask.len(), + (0..mask.len()).filter(|idx| mask.value(*idx)).take(take), + )); + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::sync::atomic::AtomicU64; + use std::sync::atomic::Ordering; + + use 
vortex_array::expr::get_item; + use vortex_array::expr::like; + use vortex_array::expr::lit; + use vortex_array::expr::not_eq; + use vortex_array::expr::root; + use vortex_error::VortexResult; + use vortex_error::vortex_err; + use vortex_mask::Mask; + + use super::limit_mask; + use super::predicate_cost; + + #[test] + fn predicate_cost_orders_cheap_before_expensive() { + let cheap = not_eq(get_item("search", root()), lit("")); + let expensive = like(get_item("url", root()), lit("%google%")); + assert!( + predicate_cost(&cheap) < predicate_cost(&expensive), + "primitive comparison must be cheaper than LIKE: cheap={}, expensive={}", + predicate_cost(&cheap), + predicate_cost(&expensive), + ); + } + + #[test] + fn limit_mask_consumes_full_mask_when_limit_allows() -> VortexResult<()> { + let remaining = AtomicU64::new(4); + + let selected = limit_mask(Mask::from_indices(6, [1, 2, 4]), &remaining)?; + + assert_eq!(selected.true_count(), 3); + assert!(selected.value(1)); + assert!(selected.value(2)); + assert!(selected.value(4)); + assert_eq!(remaining.load(Ordering::Acquire), 1); + Ok(()) + } + + #[test] + fn limit_mask_trims_mask_to_remaining_rows() -> VortexResult<()> { + let remaining = AtomicU64::new(2); + + let selected = limit_mask(Mask::from_indices(6, [1, 2, 4]), &remaining)?; + + assert_eq!(selected.true_count(), 2); + assert!(selected.value(1)); + assert!(selected.value(2)); + assert!(!selected.value(4)); + assert_eq!(remaining.load(Ordering::Acquire), 0); + Ok(()) + } + + #[test] + fn limit_mask_shared_counter_never_overselects() -> VortexResult<()> { + let remaining = Arc::new(AtomicU64::new(10)); + + let handles = (0..16) + .map(|_| { + let remaining = Arc::clone(&remaining); + std::thread::spawn(move || limit_mask(Mask::new_true(8), &remaining)) + }) + .collect::>(); + + let mut selected_rows = 0; + for handle in handles { + let selected = handle + .join() + .map_err(|_| vortex_err!("limit mask worker thread panicked"))??; + selected_rows += 
selected.true_count(); + } + + assert_eq!(selected_rows, 10); + assert_eq!(remaining.load(Ordering::Acquire), 0); + Ok(()) + } +} diff --git a/vortex-scan/src/plan/mod.rs b/vortex-scan/src/plan/mod.rs index 408f993009e..be56a0d13ed 100644 --- a/vortex-scan/src/plan/mod.rs +++ b/vortex-scan/src/plan/mod.rs @@ -14,6 +14,7 @@ //! controls output cardinality, and demand controls which selected rows //! must contain meaningful values. +pub mod data_source; pub mod evidence; pub mod request; @@ -23,6 +24,13 @@ use std::ops::Range; use std::sync::Arc; use std::sync::OnceLock; +pub use data_source::ScanPlanDataSource; +pub use data_source::ScanPlanFactory; +pub use data_source::scan_plan_projected_splits; +pub use data_source::scan_plan_split_ranges; +pub use data_source::scan_plan_statistics; +pub use data_source::scan_plan_statistics_many; +pub use data_source::scan_plan_stream; use futures::future::BoxFuture; use parking_lot::Mutex; use rustc_hash::FxHashMap; @@ -34,10 +42,12 @@ use vortex_array::aggregate_fn::AggregateFnRef; use vortex_array::arrays::ConstantArray; use vortex_array::arrays::StructArray; use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; use vortex_array::dtype::Field; use vortex_array::dtype::FieldNames; use vortex_array::dtype::FieldPath; use vortex_array::dtype::Nullability; +use vortex_array::dtype::StructFields; use vortex_array::expr::Expression; use vortex_array::expr::get_item; use vortex_array::expr::is_root; @@ -339,6 +349,12 @@ pub struct AggregateAnswer { /// objects created while preparing reads, evidence, statistics, and aggregates for /// a file scan. pub trait ScanPlan: 'static + Send + Sync { + /// Logical dtype produced by this plan's root value. + fn dtype(&self) -> &DType; + + /// Number of rows in this plan's row domain. + fn row_count(&self) -> u64; + /// Create this plan's per-file/query state. 
fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult; @@ -446,7 +462,7 @@ pub fn default_try_push_expr( if is_root(expr) { Ok(Some(plan)) } else { - Ok(Some(Arc::new(ApplyScanPlan::new(plan, expr.clone())))) + Ok(Some(Arc::new(ApplyScanPlan::try_new(plan, expr.clone())?))) } } @@ -499,6 +515,14 @@ struct LiteralReadTask { } impl ScanPlan for LiteralScanPlan { + fn dtype(&self) -> &DType { + self.scalar.dtype() + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(())) } @@ -1100,18 +1124,62 @@ pub struct StructValueScanPlan { names: FieldNames, fields: Vec, validity: Option, + dtype: DType, + row_count: u64, split_hints: OnceLock>>, } impl StructValueScanPlan { /// Create a virtual plan that assembles a struct from child field plans. - pub fn new(names: FieldNames, fields: Vec, validity: Option) -> Self { - Self { + pub fn try_new( + names: FieldNames, + fields: Vec, + validity: Option, + row_count: u64, + ) -> VortexResult { + if names.len() != fields.len() { + vortex_bail!( + "struct scan plan has {} names for {} fields", + names.len(), + fields.len() + ); + } + for field in &fields { + if field.row_count() != row_count { + vortex_bail!( + "struct field row count {} does not match row domain {}", + field.row_count(), + row_count + ); + } + } + if let Some(validity) = &validity + && validity.row_count() != row_count + { + vortex_bail!( + "struct validity row count {} does not match row domain {}", + validity.row_count(), + row_count + ); + } + let nullability = if validity.is_some() { + Nullability::Nullable + } else { + Nullability::NonNullable + }; + let dtypes = fields + .iter() + .map(|field| field.dtype().clone()) + .collect::>(); + let dtype = DType::Struct(StructFields::new(names.clone(), dtypes), nullability); + Ok(Self { names, fields, validity, + dtype, + row_count, split_hints: OnceLock::new(), - } + }) } fn compute_split_hints(&self) -> Option> { @@ -1146,6 +1214,14 
@@ struct StructValuePreparedRead { } impl ScanPlan for StructValueScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { let fields = self .fields @@ -1268,12 +1344,14 @@ impl PreparedRead for StructValuePreparedRead { pub struct ApplyScanPlan { input: ScanPlanRef, expr: Expression, + dtype: DType, } impl ApplyScanPlan { /// Create a virtual plan that applies `expr` to `input`. - pub fn new(input: ScanPlanRef, expr: Expression) -> Self { - Self { input, expr } + pub fn try_new(input: ScanPlanRef, expr: Expression) -> VortexResult { + let dtype = expr.return_dtype(input.dtype())?; + Ok(Self { input, expr, dtype }) } } @@ -1283,6 +1361,14 @@ struct ApplyPreparedRead { } impl ScanPlan for ApplyScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.input.row_count() + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { cx.init_plan(&self.input) } @@ -1349,6 +1435,7 @@ impl PreparedRead for ApplyPreparedRead { pub struct MaskScanPlan { input: ScanPlanRef, validity: ScanPlanRef, + dtype: DType, } impl MaskScanPlan { @@ -1357,7 +1444,12 @@ impl MaskScanPlan { /// `validity` must read a non-nullable boolean array in the same row domain /// as `input` (the struct layout's validity child). 
pub fn new(input: ScanPlanRef, validity: ScanPlanRef) -> Self { - Self { input, validity } + let dtype = input.dtype().as_nullable(); + Self { + input, + validity, + dtype, + } } } @@ -1374,6 +1466,14 @@ struct MaskPreparedRead { } impl ScanPlan for MaskScanPlan { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.input.row_count() + } + fn init_state(&self, cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(MaskState { input: cx.init_plan(&self.input)?, @@ -1602,14 +1702,26 @@ mod tests { use vortex_array::aggregate_fn::fns::min::Min; use vortex_array::arrays::Constant; use vortex_array::dtype::Nullability; + use vortex_array::dtype::PType; use vortex_array::expr::lit; use super::*; use crate::read::ReadStore; - struct TestStatsNode; + struct TestStatsNode { + dtype: DType, + row_count: u64, + } impl ScanPlan for TestStatsNode { + fn dtype(&self) -> &DType { + &self.dtype + } + + fn row_count(&self) -> u64 { + self.row_count + } + fn init_state(&self, _cx: &mut StateCtx<'_>) -> VortexResult { Ok(Arc::new(())) } @@ -1619,7 +1731,7 @@ mod tests { expr: &Expression, _cx: &mut PushCtx, ) -> VortexResult> { - if let Some(literal) = literal_scan_plan(expr, 20) { + if let Some(literal) = literal_scan_plan(expr, self.row_count) { return Ok(Some(literal)); } default_try_push_expr(self, expr) @@ -1681,7 +1793,10 @@ mod tests { #[test] fn stats_plan_erasure_preserves_positional_results() -> VortexResult<()> { let session = VortexSession::empty(); - let plan_root: ScanPlanRef = Arc::new(TestStatsNode); + let plan_root: ScanPlanRef = Arc::new(TestStatsNode { + dtype: DType::Primitive(PType::I32, Nullability::NonNullable), + row_count: 20, + }); let funcs = vec![ Min.bind(NumericalAggregateOpts::default()), Max.bind(NumericalAggregateOpts::default()), @@ -1708,7 +1823,10 @@ mod tests { #[test] fn literal_pushdown_prepares_without_input_read() -> VortexResult<()> { let session = VortexSession::empty(); - let plan_root: ScanPlanRef = 
Arc::new(TestStatsNode); + let plan_root: ScanPlanRef = Arc::new(TestStatsNode { + dtype: DType::Primitive(PType::I32, Nullability::NonNullable), + row_count: 20, + }); let literal = lit(42i32); let plan = plan_root From 04ded805f1cbfd01afeed30f0da4aa8f165832d1 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 24 Jun 2026 22:00:02 -0400 Subject: [PATCH 48/48] Default to ScanV2 Signed-off-by: Nicholas Gates --- .github/workflows/bench-pr.yml | 2 - .github/workflows/sql-benchmarks.yml | 2 +- .github/workflows/sql-pr.yml | 1 - Cargo.lock | 1 - vortex-file/Cargo.toml | 1 - vortex-file/src/multi/mod.rs | 4 +- vortex-file/src/tests.rs | 335 +++++++++++++-------------- vortex-layout/src/scan/v2/mod.rs | 64 ++++- 8 files changed, 224 insertions(+), 186 deletions(-) diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index f048c4fbb4f..8223ea80a56 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -87,7 +87,6 @@ jobs: shell: bash env: RUST_BACKTRACE: full - VORTEX_SCAN_IMPL: "v2" VORTEX_EXPERIMENTAL_PATCHED_ARRAY: "1" FLAT_LAYOUT_INLINE_ARRAY_NODE: "1" run: | @@ -141,4 +140,3 @@ jobs: secrets: inherit with: mode: "pr" - vortex_scan_impl: "v2" diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 903e688d42e..a141d2f11e2 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -14,7 +14,7 @@ on: required: false type: string default: "" - description: "Optional VORTEX_SCAN_IMPL value for Vortex file scans" + description: "Optional VORTEX_SCAN_IMPL override for Vortex file scans, e.g. 
v1 for legacy scans" benchmark_matrix: required: false type: string diff --git a/.github/workflows/sql-pr.yml b/.github/workflows/sql-pr.yml index b89615c3a24..45b0ed1d675 100644 --- a/.github/workflows/sql-pr.yml +++ b/.github/workflows/sql-pr.yml @@ -39,5 +39,4 @@ jobs: secrets: inherit with: mode: "pr" - vortex_scan_impl: "v2" benchmark_profile: ${{ inputs.benchmark_profile || 'base' }} diff --git a/Cargo.lock b/Cargo.lock index 8bc5c371e72..535af96b257 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9992,7 +9992,6 @@ dependencies = [ "parking_lot", "pin-project-lite", "rstest", - "temp-env", "tokio", "tracing", "vortex-alp", diff --git a/vortex-file/Cargo.toml b/vortex-file/Cargo.toml index fb8fec2be33..752ccbc753d 100644 --- a/vortex-file/Cargo.toml +++ b/vortex-file/Cargo.toml @@ -60,7 +60,6 @@ vortex-zstd = { workspace = true, optional = true } [dev-dependencies] rstest = { workspace = true } -temp-env = { workspace = true } tokio = { workspace = true, features = ["full"] } vortex-array = { workspace = true, features = ["_test-harness"] } vortex-io = { workspace = true, features = ["tokio"] } diff --git a/vortex-file/src/multi/mod.rs b/vortex-file/src/multi/mod.rs index 744bcbfc738..fbf12960a87 100644 --- a/vortex-file/src/multi/mod.rs +++ b/vortex-file/src/multi/mod.rs @@ -229,8 +229,8 @@ impl MultiFileDataSource { /// Build the [`DataSource`] selected by `VORTEX_SCAN_IMPL`. /// - /// The default is the existing LayoutReader-backed scan. Setting - /// `VORTEX_SCAN_IMPL=v2` (or `scan2`) builds the ScanPlan-backed V2 scan. + /// The default is the ScanPlan-backed V2 scan. Setting + /// `VORTEX_SCAN_IMPL=v1` (or `legacy`) falls back to the existing LayoutReader-backed scan. pub async fn build_data_source(self) -> VortexResult { if scan2_enabled()? 
{ Ok(Arc::new(scan_v2::build_scan_plan_data_source(self).await?)) diff --git a/vortex-file/src/tests.rs b/vortex-file/src/tests.rs index 97ecd934fca..3bdb5a864b0 100644 --- a/vortex-file/src/tests.rs +++ b/vortex-file/src/tests.rs @@ -122,189 +122,182 @@ fn multi_file_scan_plan_data_source_filters_and_projects() -> VortexResult<()> { let runtime = SingleThreadRuntime::default(); let session = new_test_session().with_handle(runtime.handle()); - temp_env::with_var("VORTEX_SCAN_IMPL", Some("v2"), || { - runtime.block_on(async { - use async_trait::async_trait; - use futures::stream; - use futures::stream::BoxStream; - use vortex_array::aggregate_fn::AggregateFnVTableExt; - use vortex_array::aggregate_fn::EmptyOptions; - use vortex_array::aggregate_fn::NumericalAggregateOpts; - use vortex_array::aggregate_fn::fns::max::Max; - use vortex_array::aggregate_fn::fns::min::Min; - use vortex_array::aggregate_fn::fns::null_count::NullCount; - use vortex_io::VortexReadAt; - use vortex_io::filesystem::FileListing; - use vortex_io::filesystem::FileSystem; - use vortex_io::filesystem::FileSystemRef; - - #[derive(Debug)] - struct MemoryFileSystem { - files: std::collections::BTreeMap, + runtime.block_on(async { + use async_trait::async_trait; + use futures::stream; + use futures::stream::BoxStream; + use vortex_array::aggregate_fn::AggregateFnVTableExt; + use vortex_array::aggregate_fn::EmptyOptions; + use vortex_array::aggregate_fn::NumericalAggregateOpts; + use vortex_array::aggregate_fn::fns::max::Max; + use vortex_array::aggregate_fn::fns::min::Min; + use vortex_array::aggregate_fn::fns::null_count::NullCount; + use vortex_io::VortexReadAt; + use vortex_io::filesystem::FileListing; + use vortex_io::filesystem::FileSystem; + use vortex_io::filesystem::FileSystemRef; + + #[derive(Debug)] + struct MemoryFileSystem { + files: std::collections::BTreeMap, + } + + #[async_trait] + impl FileSystem for MemoryFileSystem { + fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { + 
let listings = self + .files + .iter() + .filter_map(move |(path, bytes)| { + path.starts_with(prefix).then_some(Ok(FileListing { + path: path.clone(), + size: Some(bytes.len() as u64), + })) + }) + .collect::>(); + stream::iter(listings).boxed() } - #[async_trait] - impl FileSystem for MemoryFileSystem { - fn list(&self, prefix: &str) -> BoxStream<'_, VortexResult> { - let listings = self - .files - .iter() - .filter_map(move |(path, bytes)| { - path.starts_with(prefix).then_some(Ok(FileListing { - path: path.clone(), - size: Some(bytes.len() as u64), - })) - }) - .collect::>(); - stream::iter(listings).boxed() - } - - async fn head(&self, path: &str) -> VortexResult> { - Ok(self.files.get(path).map(|bytes| FileListing { - path: path.to_string(), - size: Some(bytes.len() as u64), - })) - } - - async fn open_read(&self, path: &str) -> VortexResult> { - self.files - .get(path) - .cloned() - .map(|bytes| Arc::new(bytes) as Arc) - .ok_or_else(|| vortex_error::vortex_err!("missing test file {path}")) - } - - async fn delete(&self, _path: &str) -> VortexResult<()> { - Ok(()) - } + async fn head(&self, path: &str) -> VortexResult> { + Ok(self.files.get(path).map(|bytes| FileListing { + path: path.to_string(), + size: Some(bytes.len() as u64), + })) } - async fn write_part( - session: &VortexSession, - values: ArrayRef, - ) -> VortexResult { - let mut buf = ByteBufferMut::empty(); - session - .write_options() - .write(&mut buf, values.to_array_stream()) - .await?; - Ok(buf.freeze()) + async fn open_read(&self, path: &str) -> VortexResult> { + self.files + .get(path) + .cloned() + .map(|bytes| Arc::new(bytes) as Arc) + .ok_or_else(|| vortex_error::vortex_err!("missing test file {path}")) } - async fn write_part_with_stats( - session: &VortexSession, - values: ArrayRef, - ) -> VortexResult { - let mut buf = ByteBufferMut::empty(); - let mut writer = session - .write_options() - .with_file_statistics(PRUNING_STATS.to_vec()) - .writer(&mut buf, values.dtype().clone()); - 
writer.push(values).await?; - writer.finish().await?; - Ok(buf.freeze()) + async fn delete(&self, _path: &str) -> VortexResult<()> { + Ok(()) } + } - let single = - StructArray::from_fields(&[("numbers", buffer![10u32, 20, 30].into_array())])? - .into_array(); - let single_fs: FileSystemRef = Arc::new(MemoryFileSystem { - files: std::collections::BTreeMap::from_iter([( - "single.vortex".to_string(), - write_part_with_stats(&session, single).await?, - )]), - }); - let single_source = MultiFileDataSource::new(session.clone()) - .with_glob("single.vortex", Some(single_fs)) - .build_data_source() - .await?; - let stats = single_source - .statistics( - &col("numbers"), - &[ - Min.bind(NumericalAggregateOpts::default()), - Max.bind(NumericalAggregateOpts::default()), - NullCount.bind(EmptyOptions), - ], - ) - .await?; - assert_eq!(exact_u32_stat(&stats[0]), Some(10)); - assert_eq!(exact_u32_stat(&stats[1]), Some(30)); - assert_eq!(exact_u64_stat(&stats[2]), Some(0)); - - let first = StructArray::from_fields(&[("numbers", buffer![1u32, 2, 3].into_array())])? - .into_array(); - let second = - StructArray::from_fields(&[("numbers", buffer![4u32, 5, 6].into_array())])? 
- .into_array(); - - let fs: FileSystemRef = Arc::new(MemoryFileSystem { - files: std::collections::BTreeMap::from_iter([ - ( - "part-0.vortex".to_string(), - write_part(&session, first).await?, - ), - ( - "part-1.vortex".to_string(), - write_part(&session, second).await?, - ), - ]), - }); - - let data_source = MultiFileDataSource::new(session.clone()) - .with_glob("part-*.vortex", Some(fs)) - .build_data_source() + async fn write_part(session: &VortexSession, values: ArrayRef) -> VortexResult { + let mut buf = ByteBufferMut::empty(); + session + .write_options() + .write(&mut buf, values.to_array_stream()) .await?; - let scan = data_source - .scan(vortex_scan::ScanRequest { + Ok(buf.freeze()) + } + + async fn write_part_with_stats( + session: &VortexSession, + values: ArrayRef, + ) -> VortexResult { + let mut buf = ByteBufferMut::empty(); + let mut writer = session + .write_options() + .with_file_statistics(PRUNING_STATS.to_vec()) + .writer(&mut buf, values.dtype().clone()); + writer.push(values).await?; + writer.finish().await?; + Ok(buf.freeze()) + } + + let single = StructArray::from_fields(&[("numbers", buffer![10u32, 20, 30].into_array())])? + .into_array(); + let single_fs: FileSystemRef = Arc::new(MemoryFileSystem { + files: std::collections::BTreeMap::from_iter([( + "single.vortex".to_string(), + write_part_with_stats(&session, single).await?, + )]), + }); + let single_source = MultiFileDataSource::new(session.clone()) + .with_glob("single.vortex", Some(single_fs)) + .build_data_source() + .await?; + let stats = single_source + .statistics( + &col("numbers"), + &[ + Min.bind(NumericalAggregateOpts::default()), + Max.bind(NumericalAggregateOpts::default()), + NullCount.bind(EmptyOptions), + ], + ) + .await?; + assert_eq!(exact_u32_stat(&stats[0]), Some(10)); + assert_eq!(exact_u32_stat(&stats[1]), Some(30)); + assert_eq!(exact_u64_stat(&stats[2]), Some(0)); + + let first = StructArray::from_fields(&[("numbers", buffer![1u32, 2, 3].into_array())])? 
+ .into_array(); + let second = StructArray::from_fields(&[("numbers", buffer![4u32, 5, 6].into_array())])? + .into_array(); + + let fs: FileSystemRef = Arc::new(MemoryFileSystem { + files: std::collections::BTreeMap::from_iter([ + ( + "part-0.vortex".to_string(), + write_part(&session, first).await?, + ), + ( + "part-1.vortex".to_string(), + write_part(&session, second).await?, + ), + ]), + }); + + let data_source = MultiFileDataSource::new(session.clone()) + .with_glob("part-*.vortex", Some(fs)) + .build_data_source() + .await?; + let scan = data_source + .scan(vortex_scan::ScanRequest { + projection: col("numbers"), + filter: Some(gt(col("numbers"), lit(2u32))), + ordered: true, + ..Default::default() + }) + .await?; + + let dtype = scan.dtype().clone(); + let stream = scan + .partitions() + .then(|partition| async move { partition?.execute() }) + .try_flatten() + .boxed(); + let actual = ArrayStreamAdapter::new(dtype, stream).read_all().await?; + + let mut ctx = session.create_execution_ctx(); + assert_arrays_eq!(actual, buffer![3u32, 4, 5, 6].into_array(), &mut ctx); + + let planned = data_source + .plan_morsel_partitions( + vortex_scan::ScanRequest { projection: col("numbers"), filter: Some(gt(col("numbers"), lit(2u32))), - ordered: true, ..Default::default() - }) - .await?; - - let dtype = scan.dtype().clone(); - let stream = scan - .partitions() - .then(|partition| async move { partition?.execute() }) - .try_flatten() - .boxed(); - let actual = ArrayStreamAdapter::new(dtype, stream).read_all().await?; - - let mut ctx = session.create_execution_ctx(); - assert_arrays_eq!(actual, buffer![3u32, 4, 5, 6].into_array(), &mut ctx); - - let planned = data_source - .plan_morsel_partitions( - vortex_scan::ScanRequest { - projection: col("numbers"), - filter: Some(gt(col("numbers"), lit(2u32))), - ..Default::default() - }, - 128, - ) - .await? 
- .ok_or_else(|| { - vortex_error::vortex_err!("scan plan data source must plan morsel partitions") - })?; - - assert_eq!(planned.partition_count(), 2); - - let dtype = planned.dtype().clone(); - let stream = stream::iter(0..planned.partition_count()) - .then(|partition| { - let planned = Arc::clone(&planned); - async move { planned.partition(partition)?.execute() } - }) - .try_flatten() - .boxed(); - let actual = ArrayStreamAdapter::new(dtype, stream).read_all().await?; - - let mut ctx = session.create_execution_ctx(); - assert_arrays_eq!(actual, buffer![3u32, 4, 5, 6].into_array(), &mut ctx); - Ok(()) - }) + }, + 128, + ) + .await? + .ok_or_else(|| { + vortex_error::vortex_err!("scan plan data source must plan morsel partitions") + })?; + + assert_eq!(planned.partition_count(), 2); + + let dtype = planned.dtype().clone(); + let stream = stream::iter(0..planned.partition_count()) + .then(|partition| { + let planned = Arc::clone(&planned); + async move { planned.partition(partition)?.execute() } + }) + .try_flatten() + .boxed(); + let actual = ArrayStreamAdapter::new(dtype, stream).read_all().await?; + + let mut ctx = session.create_execution_ctx(); + assert_arrays_eq!(actual, buffer![3u32, 4, 5, 6].into_array(), &mut ctx); + Ok(()) }) } diff --git a/vortex-layout/src/scan/v2/mod.rs b/vortex-layout/src/scan/v2/mod.rs index bf3e36d53f3..54ec4082bc9 100644 --- a/vortex-layout/src/scan/v2/mod.rs +++ b/vortex-layout/src/scan/v2/mod.rs @@ -23,18 +23,18 @@ use vortex_error::vortex_err; /// /// Accepted values: /// -/// - `v1`, `scan`, `scan_builder`, `scan-builder`, `layout-reader`, or unset: use the -/// existing LayoutReader-based scan. -/// - `v2` or `scan2`: use the scan2 +/// - unset, empty, `v2`, or `scan2`: use the scan2 /// [`ScanPlan`](vortex_scan::plan::ScanPlan) implementation. +/// - `v1`, `scan`, `scan_builder`, `scan-builder`, `layout-reader`, or `legacy`: use the +/// existing LayoutReader-based scan. 
pub const SCAN_IMPL_ENV: &str = "VORTEX_SCAN_IMPL"; /// Returns whether the scan2 implementation should be used by scan data sources. pub fn scan2_enabled() -> VortexResult { match std::env::var(SCAN_IMPL_ENV) { - Ok(value) if value.is_empty() => Ok(false), + Ok(value) if value.is_empty() => Ok(true), Ok(value) => parse_scan_impl(&value), - Err(std::env::VarError::NotPresent) => Ok(false), + Err(std::env::VarError::NotPresent) => Ok(true), Err(std::env::VarError::NotUnicode(value)) => { vortex_bail!("{SCAN_IMPL_ENV} must be valid unicode, got {value:?}") } @@ -43,10 +43,10 @@ pub fn scan2_enabled() -> VortexResult { fn parse_scan_impl(value: &str) -> VortexResult { match value { - "v1" | "scan" | "scan_builder" | "scan-builder" | "layout-reader" => Ok(false), + "v1" | "scan" | "scan_builder" | "scan-builder" | "layout-reader" | "legacy" => Ok(false), "v2" | "scan2" => Ok(true), other => vortex_bail!( - "{SCAN_IMPL_ENV} must be one of v1, scan, scan_builder, scan-builder, layout-reader, v2, or scan2, got {other:?}" + "{SCAN_IMPL_ENV} must be one of v1, scan, scan_builder, scan-builder, layout-reader, legacy, v2, or scan2, got {other:?}" ), } } @@ -92,3 +92,53 @@ fn is_temporal(dtype: &DType) -> bool { _ => false, } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scan_impl_env_defaults_to_scan2() -> VortexResult<()> { + temp_env::with_var(SCAN_IMPL_ENV, None::<&str>, || { + assert!(scan2_enabled()?); + Ok(()) + }) + } + + #[test] + fn scan_impl_env_empty_uses_scan2() -> VortexResult<()> { + temp_env::with_var(SCAN_IMPL_ENV, Some(""), || { + assert!(scan2_enabled()?); + Ok(()) + }) + } + + #[test] + fn scan_impl_env_legacy_values_disable_scan2() -> VortexResult<()> { + for value in [ + "v1", + "scan", + "scan_builder", + "scan-builder", + "layout-reader", + "legacy", + ] { + temp_env::with_var(SCAN_IMPL_ENV, Some(value), || -> VortexResult<()> { + assert!(!scan2_enabled()?); + Ok(()) + })?; + } + Ok(()) + } + + #[test] + fn 
scan_impl_env_scan2_values_enable_scan2() -> VortexResult<()> { + for value in ["v2", "scan2"] { + temp_env::with_var(SCAN_IMPL_ENV, Some(value), || -> VortexResult<()> { + assert!(scan2_enabled()?); + Ok(()) + })?; + } + Ok(()) + } +}