Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,7 @@ jobs:
env:
RUSTFLAGS: "-C target-cpu=native"
run: |
# vx-bench prepare-data shells out to target/release_debug/data-gen.
packages=(--bin data-gen --bin datafusion-bench --bin duckdb-bench)
if [ "${{ inputs.mode }}" != "pr" ]; then
packages+=(--bin lance-bench)
Expand Down
27 changes: 2 additions & 25 deletions benchmarks/datafusion-bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,15 @@ use vortex::io::filesystem::FileSystemRef;
use vortex::scan::DataSourceRef;
use vortex_bench::Benchmark;
use vortex_bench::BenchmarkArg;
use vortex_bench::CompactionStrategy;
use vortex_bench::Engine;
use vortex_bench::Format;
use vortex_bench::Opt;
use vortex_bench::Opts;
use vortex_bench::SESSION;
use vortex_bench::conversions::convert_parquet_directory_to_vortex;
use vortex_bench::create_benchmark;
use vortex_bench::create_output_writer;
use vortex_bench::display::DisplayFormat;
use vortex_bench::require_prepared_data;
use vortex_bench::runner::BenchmarkMode;
use vortex_bench::runner::BenchmarkQueryResult;
use vortex_bench::runner::SqlBenchmarkRunner;
Expand Down Expand Up @@ -131,29 +130,7 @@ async fn main() -> anyhow::Result<()> {
args.exclude_queries.as_ref(),
);

// Generate Vortex files from Parquet for any Vortex formats requested
if benchmark.data_url().scheme() == "file" {
benchmark.generate_base_data().await?;

let base_path = benchmark
.data_url()
.to_file_path()
.map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?;

for format in args.formats.iter() {
match format {
Format::OnDiskVortex => {
convert_parquet_directory_to_vortex(&base_path, CompactionStrategy::Default)
.await?;
}
Format::VortexCompact => {
convert_parquet_directory_to_vortex(&base_path, CompactionStrategy::Compact)
.await?;
}
_ => {}
}
}
}
require_prepared_data(&*benchmark, &args.formats)?;

let benchmark_name = benchmark.dataset().to_string();

Expand Down
42 changes: 34 additions & 8 deletions benchmarks/duckdb-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,38 @@ impl DuckClient {
format!("{name}/{format}/", name = benchmark.dataset_name()).to_data_path()
};
let dir = base_path.join(format.name());
std::fs::create_dir_all(&dir)?;
let db_path = dir.join("duckdb.db");

if format == Format::OnDiskDuckDB && data_url.scheme() != "file" {
anyhow::bail!("DuckDB format requires local data prepared by data-gen");
}

if format == Format::OnDiskDuckDB {
if !db_path.exists() {
anyhow::bail!(
"prepared DuckDB database is missing at {}. Generate it with \
`vx-bench prepare-data {} --formats-json '[\"duckdb\"]'` or \
`cargo run --bin data-gen -- {} --formats duckdb` using the same --opt values.",
db_path.display(),
benchmark.dataset_name(),
benchmark.dataset_name(),
);
}
} else {
std::fs::create_dir_all(&dir)?;
}

tracing::info!(db_path = %db_path.display(), "Opening DuckDB");

if delete_database && db_path.exists() {
std::fs::remove_file(&db_path)?;
if format == Format::OnDiskDuckDB {
tracing::info!(
db_path = %db_path.display(),
"Keeping prepared DuckDB format database"
);
} else {
std::fs::remove_file(&db_path)?;
}
}

let (db, connection) = Self::open_and_setup_database(Some(db_path.clone()), threads)?;
Expand Down Expand Up @@ -147,9 +172,14 @@ impl DuckClient {
benchmark: &B,
file_format: Format,
) -> Result<()> {
if file_format == Format::OnDiskDuckDB {
// Native DuckDB data is materialized by data-gen. The opened database already
// contains benchmark tables, so there is nothing to register here.
return Ok(());
}

let object_type = match file_format {
Format::Parquet | Format::OnDiskVortex | Format::VortexCompact => "VIEW",
Format::OnDiskDuckDB => "TABLE",
Format::Lance => {
anyhow::bail!(
"Lance format is not supported for DuckDB engine. \
Expand All @@ -159,11 +189,7 @@ impl DuckClient {
format => anyhow::bail!("Format {format} isn't supported for DuckDB"),
};

// DuckDB loads from parquet for OnDiskDuckDB format
let load_format = match file_format {
Format::Parquet | Format::OnDiskDuckDB => Format::Parquet,
f => f,
};
let load_format = file_format;

// Get the base URL for the format's data directory
let format_url = benchmark.format_path(load_format, benchmark.data_url())?;
Expand Down
42 changes: 2 additions & 40 deletions benchmarks/duckdb-bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,16 @@ use std::path::PathBuf;
use clap::Parser;
use clap::value_parser;
use duckdb_bench::DuckClient;
use tokio::runtime::Runtime;
use vortex::metrics::tracing::set_global_labels;
use vortex_bench::BenchmarkArg;
use vortex_bench::CompactionStrategy;
use vortex_bench::Engine;
use vortex_bench::Format;
use vortex_bench::Opt;
use vortex_bench::Opts;
use vortex_bench::conversions::convert_parquet_directory_to_vortex;
use vortex_bench::create_benchmark;
use vortex_bench::create_output_writer;
use vortex_bench::display::DisplayFormat;
use vortex_bench::require_prepared_data;
use vortex_bench::runner::BenchmarkMode;
use vortex_bench::runner::SqlBenchmarkRunner;
use vortex_bench::runner::filter_queries;
Expand Down Expand Up @@ -110,43 +108,7 @@ fn main() -> anyhow::Result<()> {
anyhow::bail!("provide a format with --formats");
}

// Generate Vortex files from Parquet for any Vortex formats requested
if benchmark.data_url().scheme() == "file" {
// This is ugly, but otherwise some complicated async interaction might result in a deadlock
let runtime = Runtime::new()?;

runtime.block_on(async {
benchmark.generate_base_data().await?;

let base_path = benchmark
.data_url()
.to_file_path()
.map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?;

for format in args.formats.iter().copied() {
match format {
Format::OnDiskVortex => {
convert_parquet_directory_to_vortex(
&base_path,
CompactionStrategy::Default,
)
.await?;
}
Format::VortexCompact => {
convert_parquet_directory_to_vortex(
&base_path,
CompactionStrategy::Compact,
)
.await?;
}
// OnDiskDuckDB tables are created during register_tables by loading from Parquet
_ => {}
}
}

anyhow::Ok(())
})?;
}
require_prepared_data(&*benchmark, &args.formats)?;

let mut runner = SqlBenchmarkRunner::new(
&*benchmark,
Expand Down
4 changes: 2 additions & 2 deletions vortex-bench/src/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ pub trait Benchmark: Send + Sync {
/// This is the canonical source data that can be converted to other formats.
/// This should be idempotent - safe to call multiple times.
///
/// Format-specific benchmark binaries (like lance-bench, datafusion-bench, duckdb-bench) should
/// call this method to ensure base data exists, then perform their own format conversion.
/// The `data-gen` binary calls this method before creating requested derived formats.
/// Engine-specific benchmark binaries should only run against prepared data.
async fn generate_base_data(&self) -> anyhow::Result<()>;

/// Get expected row counts for validation (optional)
Expand Down
60 changes: 60 additions & 0 deletions vortex-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,66 @@ impl CompactionStrategy {
}
}

/// Verify that local data has already been prepared for the requested benchmark formats.
///
/// Engine-specific benchmark binaries call this before running queries. Data generation itself
/// belongs to the `data-gen` binary.
pub fn require_prepared_data<B>(benchmark: &B, formats: &[Format]) -> anyhow::Result<()>
where
B: Benchmark + ?Sized,
{
if benchmark.data_url().scheme() != "file" {
return Ok(());
}

let base_path = benchmark
.data_url()
.to_file_path()
.map_err(|_| anyhow::anyhow!("Invalid file URL: {}", benchmark.data_url()))?;

let mut missing = Vec::new();
for format in formats.iter().copied().unique() {
let required_path = match format {
Format::Arrow | Format::Parquet => base_path.join(Format::Parquet.name()),
Format::OnDiskVortex => base_path.join(Format::OnDiskVortex.name()),
Format::VortexCompact => base_path.join(Format::VortexCompact.name()),
Format::OnDiskDuckDB => base_path
.join(Format::OnDiskDuckDB.name())
.join("duckdb.db"),
format => base_path.join(format.name()),
};

if !required_path.exists() {
missing.push((format, required_path));
}
}

if missing.is_empty() {
return Ok(());
}

let missing_data = missing
.iter()
.map(|(format, path)| format!("{format} ({})", path.display()))
.join(", ");
let requested_formats = formats
.iter()
.copied()
.unique()
.map(|format| format!("\"{format}\""))
.join(",");

anyhow::bail!(
"prepared data is missing for {}: {missing_data}. Generate it first with \
`vx-bench prepare-data {} --formats-json '[{requested_formats}]'` or \

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In bail we print, --formats-json '["vortex-file-compressed"]'. But we don't accept --formats-json '["vortex-file-compressed"]' on the CLI?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do! this is the python tool, it has its own CLI args.

`cargo run --bin data-gen -- {} --formats {}` using the same --opt values.",
benchmark.dataset_display(),
benchmark.dataset_name(),
benchmark.dataset_name(),
formats.iter().copied().unique().join(","),
);
}

/// CLI argument for selecting which benchmark to run.
#[derive(clap::ValueEnum, Clone, Copy)]
pub enum BenchmarkArg {
Expand Down
Loading