diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 1fa6ceb7258..9057016a62f 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -574,6 +574,7 @@ jobs: env: RUSTFLAGS: "-C target-cpu=native" run: | + # vx-bench prepare-data shells out to target/release_debug/data-gen. packages=(--bin data-gen --bin datafusion-bench --bin duckdb-bench) if [ "${{ inputs.mode }}" != "pr" ]; then packages+=(--bin lance-bench) diff --git a/benchmarks/datafusion-bench/src/main.rs b/benchmarks/datafusion-bench/src/main.rs index b8f9ac42df6..10415a10e45 100644 --- a/benchmarks/datafusion-bench/src/main.rs +++ b/benchmarks/datafusion-bench/src/main.rs @@ -31,16 +31,15 @@ use vortex::io::filesystem::FileSystemRef; use vortex::scan::DataSourceRef; use vortex_bench::Benchmark; use vortex_bench::BenchmarkArg; -use vortex_bench::CompactionStrategy; use vortex_bench::Engine; use vortex_bench::Format; use vortex_bench::Opt; use vortex_bench::Opts; use vortex_bench::SESSION; -use vortex_bench::conversions::convert_parquet_directory_to_vortex; use vortex_bench::create_benchmark; use vortex_bench::create_output_writer; use vortex_bench::display::DisplayFormat; +use vortex_bench::require_prepared_data; use vortex_bench::runner::BenchmarkMode; use vortex_bench::runner::BenchmarkQueryResult; use vortex_bench::runner::SqlBenchmarkRunner; @@ -131,29 +130,7 @@ async fn main() -> anyhow::Result<()> { args.exclude_queries.as_ref(), ); - // Generate Vortex files from Parquet for any Vortex formats requested - if benchmark.data_url().scheme() == "file" { - benchmark.generate_base_data().await?; - - let base_path = benchmark - .data_url() - .to_file_path() - .map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?; - - for format in args.formats.iter() { - match format { - Format::OnDiskVortex => { - convert_parquet_directory_to_vortex(&base_path, CompactionStrategy::Default) - .await?; - } - Format::VortexCompact => { - convert_parquet_directory_to_vortex(&base_path, CompactionStrategy::Compact) - .await?; - } - _ => {} - } - } - } + require_prepared_data(&*benchmark, &args.formats)?; let benchmark_name = benchmark.dataset().to_string(); diff --git a/benchmarks/duckdb-bench/src/lib.rs b/benchmarks/duckdb-bench/src/lib.rs index 69e4da63853..a4f69b24141 100644 --- a/benchmarks/duckdb-bench/src/lib.rs +++ b/benchmarks/duckdb-bench/src/lib.rs @@ -51,13 +51,38 @@ impl DuckClient { format!("{name}/{format}/", name = benchmark.dataset_name()).to_data_path() }; let dir = base_path.join(format.name()); - std::fs::create_dir_all(&dir)?; let db_path = dir.join("duckdb.db"); + if format == Format::OnDiskDuckDB && data_url.scheme() != "file" { + anyhow::bail!("DuckDB format requires local data prepared by data-gen"); + } + + if format == Format::OnDiskDuckDB { + if !db_path.exists() { + anyhow::bail!( + "prepared DuckDB database is missing at {}. Generate it with \ + `vx-bench prepare-data {} --formats-json '[\"duckdb\"]'` or \ + `cargo run --bin data-gen -- {} --formats duckdb` using the same --opt values.", + db_path.display(), + benchmark.dataset_name(), + benchmark.dataset_name(), + ); + } + } else { + std::fs::create_dir_all(&dir)?; + } + tracing::info!(db_path = %db_path.display(), "Opening DuckDB"); if delete_database && db_path.exists() { - std::fs::remove_file(&db_path)?; + if format == Format::OnDiskDuckDB { + tracing::info!( + db_path = %db_path.display(), + "Keeping prepared DuckDB format database" + ); + } else { + std::fs::remove_file(&db_path)?; + } } let (db, connection) = Self::open_and_setup_database(Some(db_path.clone()), threads)?; @@ -147,9 +172,14 @@ impl DuckClient { benchmark: &B, file_format: Format, ) -> Result<()> { + if file_format == Format::OnDiskDuckDB { + // Native DuckDB data is materialized by data-gen. The opened database already + // contains benchmark tables, so there is nothing to register here. + return Ok(()); + } + let object_type = match file_format { Format::Parquet | Format::OnDiskVortex | Format::VortexCompact => "VIEW", - Format::OnDiskDuckDB => "TABLE", Format::Lance => { anyhow::bail!( "Lance format is not supported for DuckDB engine. \ @@ -159,11 +189,7 @@ impl DuckClient { format => anyhow::bail!("Format {format} isn't supported for DuckDB"), }; - // DuckDB loads from parquet for OnDiskDuckDB format - let load_format = match file_format { - Format::Parquet | Format::OnDiskDuckDB => Format::Parquet, - f => f, - }; + let load_format = file_format; // Get the base URL for the format's data directory let format_url = benchmark.format_path(load_format, benchmark.data_url())?; diff --git a/benchmarks/duckdb-bench/src/main.rs b/benchmarks/duckdb-bench/src/main.rs index 8ba4937f566..d020fb93a95 100644 --- a/benchmarks/duckdb-bench/src/main.rs +++ b/benchmarks/duckdb-bench/src/main.rs @@ -8,18 +8,16 @@ use std::path::PathBuf; use clap::Parser; use clap::value_parser; use duckdb_bench::DuckClient; -use tokio::runtime::Runtime; use vortex::metrics::tracing::set_global_labels; use vortex_bench::BenchmarkArg; -use vortex_bench::CompactionStrategy; use vortex_bench::Engine; use vortex_bench::Format; use vortex_bench::Opt; use vortex_bench::Opts; -use vortex_bench::conversions::convert_parquet_directory_to_vortex; use vortex_bench::create_benchmark; use vortex_bench::create_output_writer; use vortex_bench::display::DisplayFormat; +use vortex_bench::require_prepared_data; use vortex_bench::runner::BenchmarkMode; use vortex_bench::runner::SqlBenchmarkRunner; use vortex_bench::runner::filter_queries; @@ -110,43 +108,7 @@ fn main() -> anyhow::Result<()> { anyhow::bail!("provide a format with --formats"); } - // Generate Vortex files from Parquet for any Vortex formats requested - if benchmark.data_url().scheme() == "file" { - // This is ugly, but otherwise some complicated async interaction might result in a deadlock - let runtime = Runtime::new()?; - - runtime.block_on(async { - benchmark.generate_base_data().await?; - - let base_path = benchmark - .data_url() - .to_file_path() - .map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?; - - for format in args.formats.iter().copied() { - match format { - Format::OnDiskVortex => { - convert_parquet_directory_to_vortex( - &base_path, - CompactionStrategy::Default, - ) - .await?; - } - Format::VortexCompact => { - convert_parquet_directory_to_vortex( - &base_path, - CompactionStrategy::Compact, - ) - .await?; - } - // OnDiskDuckDB tables are created during register_tables by loading from Parquet - _ => {} - } - } - - anyhow::Ok(()) - })?; - } + require_prepared_data(&*benchmark, &args.formats)?; let mut runner = SqlBenchmarkRunner::new( &*benchmark, diff --git a/vortex-bench/src/benchmark.rs b/vortex-bench/src/benchmark.rs index 2872a02aa64..7d9e845279b 100644 --- a/vortex-bench/src/benchmark.rs +++ b/vortex-bench/src/benchmark.rs @@ -36,8 +36,8 @@ pub trait Benchmark: Send + Sync { /// This is the canonical source data that can be converted to other formats. /// This should be idempotent - safe to call multiple times. /// - /// Format-specific benchmark binaries (like lance-bench, datafusion-bench, duckdb-bench) should - /// call this method to ensure base data exists, then perform their own format conversion. + /// The `data-gen` binary calls this method before creating requested derived formats. + /// Engine-specific benchmark binaries should only run against prepared data. async fn generate_base_data(&self) -> anyhow::Result<()>; /// Get expected row counts for validation (optional) diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 8981b4859cf..2f2ece62f7b 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -245,6 +245,66 @@ impl CompactionStrategy { } } +/// Verify that local data has already been prepared for the requested benchmark formats. +/// +/// Engine-specific benchmark binaries call this before running queries. Data generation itself +/// belongs to the `data-gen` binary. +pub fn require_prepared_data(benchmark: &B, formats: &[Format]) -> anyhow::Result<()> +where + B: Benchmark + ?Sized, +{ + if benchmark.data_url().scheme() != "file" { + return Ok(()); + } + + let base_path = benchmark + .data_url() + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?; + + let mut missing = Vec::new(); + for format in formats.iter().copied().unique() { + let required_path = match format { + Format::Arrow | Format::Parquet => base_path.join(Format::Parquet.name()), + Format::OnDiskVortex => base_path.join(Format::OnDiskVortex.name()), + Format::VortexCompact => base_path.join(Format::VortexCompact.name()), + Format::OnDiskDuckDB => base_path + .join(Format::OnDiskDuckDB.name()) + .join("duckdb.db"), + format => base_path.join(format.name()), + }; + + if !required_path.exists() { + missing.push((format, required_path)); + } + } + + if missing.is_empty() { + return Ok(()); + } + + let missing_data = missing + .iter() + .map(|(format, path)| format!("{format} ({})", path.display())) + .join(", "); + let requested_formats = formats + .iter() + .copied() + .unique() + .map(|format| format!("\"{format}\"")) + .join(","); + + anyhow::bail!( + "prepared data is missing for {}: {missing_data}. Generate it first with \ + `vx-bench prepare-data {} --formats-json '[{requested_formats}]'` or \ + `cargo run --bin data-gen -- {} --formats {}` using the same --opt values.", + benchmark.dataset_display(), + benchmark.dataset_name(), + benchmark.dataset_name(), + formats.iter().copied().unique().join(","), + ); +} + /// CLI argument for selecting which benchmark to run. #[derive(clap::ValueEnum, Clone, Copy)] pub enum BenchmarkArg {