# Detect whether any target uses the ClickHouse engine.
#
# `targets` is one comma-separated line (e.g. "datafusion:parquet,clickhouse:parquet"),
# so a line-anchored `grep '^clickhouse:'` on the raw string only matches when
# ClickHouse is the FIRST entry. Prefixing the list with a comma lets every entry,
# including the first, be matched as ",clickhouse:" without spawning a pipeline.
has_clickhouse=$([[ ",$targets" == *",clickhouse:"* ]] && echo "true" || echo "false")
# Download a pinned ClickHouse release and export its location for later steps.
# The clickhouse-bench build script reads CLICKHOUSE_BINARY at compile time, so this
# step has to run before the "Build binaries" step.
- name: Install ClickHouse
  if: contains(matrix.targets, 'clickhouse:')
  # An explicit `shell: bash` makes GitHub Actions run the script with
  # `-eo pipefail`, so a failed download is reported by wget's own exit status
  # instead of depending on how `tar` reacts to truncated input.
  shell: bash
  env:
    CLICKHOUSE_VERSION: "25.8.18.1"
  run: |
    wget -qO- "https://github.com/ClickHouse/ClickHouse/releases/download/v${CLICKHOUSE_VERSION}-lts/clickhouse-common-static-${CLICKHOUSE_VERSION}-amd64.tgz" | tar xz
    cp "clickhouse-common-static-${CLICKHOUSE_VERSION}/usr/bin/clickhouse" .
    chmod +x clickhouse
    echo "CLICKHOUSE_BINARY=$PWD/clickhouse" >> "$GITHUB_ENV"
"datafusion-substrait", + "futures", + "half", + "indicatif", + "itertools 0.14.0", + "log", + "object_store", + "sqllogictest", + "sqlparser", + "tempfile", + "thiserror 2.0.18", + "tokio", +] + +[[package]] +name = "datafusion-substrait" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6042adacd0bd64e56c22f6a7f9ce0ce1793dd367c899d868179d029f110d9215" +dependencies = [ + "async-recursion", + "async-trait", + "chrono", + "datafusion 52.1.0", + "half", + "itertools 0.14.0", + "object_store", + "pbjson-types", + "prost 0.14.3", + "substrait", + "tokio", + "url", + "uuid", +] + [[package]] name = "deepsize" version = "0.2.0" @@ -3276,7 +3334,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -3468,7 +3526,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -3846,7 +3904,7 @@ dependencies = [ "libc", "log", "rustversion", - "windows-link 0.2.1", + "windows-link 0.1.3", "windows-result 0.4.1", ] @@ -4659,7 +4717,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -4740,7 +4798,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -6047,7 +6105,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -10760,7 +10818,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 0da5ee805ba..2ca595b517e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ members = [ "encodings/zstd", "encodings/bytebool", # Benchmarks + "benchmarks/clickhouse-bench", "benchmarks/lance-bench", "benchmarks/compress-bench", "benchmarks/datafusion-bench", diff --git a/benchmarks/clickhouse-bench/Cargo.toml b/benchmarks/clickhouse-bench/Cargo.toml new file mode 100644 index 00000000000..7b26ae12053 --- /dev/null +++ b/benchmarks/clickhouse-bench/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "clickhouse-bench" +description = "ClickHouse (clickhouse-local) benchmark runner for Vortex" +authors.workspace = true +edition.workspace = true +homepage.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true +publish = false + +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true, features = ["derive"] } +tokio = { workspace = true, features = ["full"] } +tracing = { workspace = true } +vortex-bench = { workspace = true } + +[lints] +workspace = true diff --git a/benchmarks/clickhouse-bench/build.rs b/benchmarks/clickhouse-bench/build.rs new file mode 100644 index 00000000000..7ef98c8e48d --- /dev/null +++ b/benchmarks/clickhouse-bench/build.rs @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Build script that exports the ClickHouse binary path. +//! +//! Resolution order: +//! 1. `CLICKHOUSE_BINARY` env var — use as-is. +//! 2. Falls back to `"clickhouse"` (i.e., resolve from `$PATH` at runtime). +//! +//! Users must install ClickHouse themselves for local runs. +//! In CI, it is installed via the workflow before the benchmark step. 
+ +fn main() { + println!("cargo:rerun-if-env-changed=CLICKHOUSE_BINARY"); + + let binary = std::env::var("CLICKHOUSE_BINARY").unwrap_or_else(|_| "clickhouse".to_string()); + println!("cargo:rustc-env=CLICKHOUSE_BINARY={binary}"); +} diff --git a/benchmarks/clickhouse-bench/src/lib.rs b/benchmarks/clickhouse-bench/src/lib.rs new file mode 100644 index 00000000000..9327776578b --- /dev/null +++ b/benchmarks/clickhouse-bench/src/lib.rs @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! ClickHouse Local context for benchmarks. +//! +//! Uses `clickhouse-local` via `std::process::Command` to execute SQL queries +//! against Parquet files on disk. +//! +//! The ClickHouse binary is resolved at build time via `build.rs`: +//! 1. `CLICKHOUSE_BINARY` env var — use the specified path. +//! 2. Falls back to `"clickhouse"` — resolved from `$PATH` at runtime. +//! +//! For local runs, install ClickHouse manually (e.g., `brew install clickhouse` +//! or download from ). +//! In CI, it is installed by the workflow before the benchmark step. + +use std::io::Write; +use std::path::PathBuf; +use std::process::Command; +use std::process::Stdio; +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use tracing::trace; +use vortex_bench::Benchmark; +use vortex_bench::Format; + +/// Path to the ClickHouse binary, set by build.rs at compile time. +/// +/// This is either the value of the `CLICKHOUSE_BINARY` env var at build time, +/// or `"clickhouse"` (resolved from `$PATH` at runtime). +const CLICKHOUSE_BINARY: &str = env!("CLICKHOUSE_BINARY"); + +/// A client that wraps `clickhouse-local` for running SQL benchmarks. +pub struct ClickHouseClient { + /// The path to the `clickhouse` binary. + binary: PathBuf, + /// SQL statements to run before each query (CREATE VIEW statements). + setup_sql: Vec, +} + +impl ClickHouseClient { + /// Create a new client. 
Only Parquet format is supported. + /// + /// The ClickHouse binary is resolved from (in order): + /// 1. `CLICKHOUSE_BINARY` env var at build time + /// 2. `"clickhouse"` on `$PATH` + pub fn new(benchmark: &dyn Benchmark, format: Format) -> Result { + if format != Format::Parquet { + anyhow::bail!("clickhouse-bench only supports Parquet format, got {format}"); + } + + let binary = PathBuf::from(CLICKHOUSE_BINARY); + + // Verify the binary is usable (either absolute path exists, or resolvable via PATH). + Self::verify_binary(&binary)?; + + tracing::info!(binary = %binary.display(), "Using clickhouse-local"); + + let mut client = Self { + binary, + setup_sql: Vec::new(), + }; + client.register_tables(benchmark, format)?; + Ok(client) + } + + /// Check that the ClickHouse binary is available. + /// + /// For absolute paths, checks that the file exists on disk. + /// For bare names (e.g., `"clickhouse"`), tries to invoke it to verify it's resolvable. + fn verify_binary(binary: &PathBuf) -> Result<()> { + if binary.is_absolute() { + anyhow::ensure!( + binary.exists(), + "ClickHouse binary not found at '{path}'. \ + Set CLICKHOUSE_BINARY env var to the correct path, or install ClickHouse \ + and ensure it is on $PATH.", + path = binary.display() + ); + } + + // Verify the binary is actually usable by running `clickhouse local --version`. + let output = Command::new(binary.as_os_str()) + .args(["local", "--version"]) + .output() + .with_context(|| { + format!( + "ClickHouse binary '{name}' not found on $PATH. 
\ + Install ClickHouse (https://clickhouse.com/docs/en/install) or set \ + CLICKHOUSE_BINARY env var to an absolute path before building.", + name = binary.display() + ) + })?; + + anyhow::ensure!( + output.status.success(), + "ClickHouse binary at '{name}' failed to run: {stderr}", + name = binary.display(), + stderr = String::from_utf8_lossy(&output.stderr) + ); + + let version = String::from_utf8_lossy(&output.stdout); + tracing::debug!(version = version.trim(), "Verified clickhouse binary"); + + Ok(()) + } + + /// Generate `CREATE VIEW ... AS SELECT * FROM file(...)` statements. + /// + /// We use a VIEW over the `file()` table function rather than `CREATE TABLE ... ENGINE = File()` + /// because the `file()` function handles glob patterns (e.g., `*.parquet`) more reliably across + /// ClickHouse versions. + fn register_tables(&mut self, benchmark: &dyn Benchmark, format: Format) -> Result<()> { + let data_url = benchmark.data_url(); + let base_dir = if data_url.scheme() == "file" { + data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {data_url}"))? + } else { + anyhow::bail!("clickhouse-bench only supports local file:// data URLs"); + }; + + let format_dir = base_dir.join(format.name()); + if !format_dir.exists() { + anyhow::bail!( + "Data directory does not exist: {}. 
Run data generation first.", + format_dir.display() + ); + } + + for table_spec in benchmark.table_specs() { + let name = table_spec.name; + let pattern = benchmark + .pattern(name, format) + .map(|p| p.to_string()) + .unwrap_or_else(|| format!("*.{}", format.ext())); + + let data_path = format!("{}/{}", format_dir.display(), pattern); + + tracing::info!( + table = name, + path = %data_path, + "Registering ClickHouse table" + ); + + let create_sql = format!( + "CREATE VIEW IF NOT EXISTS {name} AS \ + SELECT * FROM file('{data_path}', Parquet);" + ); + self.setup_sql.push(create_sql); + } + + Ok(()) + } + + /// Execute a SQL query via `clickhouse-local`, returning `(row_count, timing)`. + /// + /// The approach: + /// 1. Prepend all CREATE VIEW statements + /// 2. Append the benchmark query + /// 3. Pipe the combined SQL into `clickhouse local` via stdin + /// 4. Parse stdout to count result rows + pub fn execute_query(&self, query: &str) -> Result<(usize, Option)> { + trace!("execute clickhouse query: {query}"); + + // Build the full SQL: setup views + the actual query + let mut full_sql = String::new(); + for stmt in &self.setup_sql { + full_sql.push_str(stmt); + full_sql.push('\n'); + } + full_sql.push_str(query); + // Ensure we have a trailing semicolon + if !query.trim_end().ends_with(';') { + full_sql.push(';'); + } + + let time_instant = Instant::now(); + + // The `clickhouse` binary is a multi-tool; invoke it as `clickhouse local`. 
+ let mut child = Command::new(&self.binary) + .args(["local", "--format", "TabSeparated"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn clickhouse-local")?; + + // Write SQL to stdin + { + let stdin = child + .stdin + .as_mut() + .context("Failed to open clickhouse-local stdin")?; + stdin + .write_all(full_sql.as_bytes()) + .context("Failed to write SQL to clickhouse-local stdin")?; + } + + let output = child + .wait_with_output() + .context("Failed to wait for clickhouse-local")?; + + let query_time = time_instant.elapsed(); + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!( + "clickhouse-local failed (exit {}): {stderr}", + output.status.code().unwrap_or(-1) + ); + } + + // Count non-empty lines in stdout as row count + let stdout = String::from_utf8_lossy(&output.stdout); + let row_count = stdout.lines().filter(|line| !line.is_empty()).count(); + + Ok((row_count, Some(query_time))) + } +} diff --git a/benchmarks/clickhouse-bench/src/main.rs b/benchmarks/clickhouse-bench/src/main.rs new file mode 100644 index 00000000000..d705b23eb22 --- /dev/null +++ b/benchmarks/clickhouse-bench/src/main.rs @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::path::PathBuf; + +use clap::Parser; +use clickhouse_bench::ClickHouseClient; +use tokio::runtime::Runtime; +use vortex_bench::BenchmarkArg; +use vortex_bench::Engine; +use vortex_bench::Format; +use vortex_bench::Opt; +use vortex_bench::Opts; +use vortex_bench::create_benchmark; +use vortex_bench::create_output_writer; +use vortex_bench::display::DisplayFormat; +use vortex_bench::runner::BenchmarkMode; +use vortex_bench::runner::BenchmarkQueryResult; +use vortex_bench::runner::SqlBenchmarkRunner; +use vortex_bench::runner::filter_queries; +use vortex_bench::setup_logging_and_tracing; + +/// ClickHouse (clickhouse-local) 
benchmark runner. +/// +/// Runs queries against Parquet data using clickhouse-local as a performance baseline. +/// This allows comparing ClickHouse's native Parquet reading performance against other engines +/// (DuckDB, DataFusion) on the same hardware and dataset. +#[derive(Parser)] +struct Args { + #[arg(value_enum)] + benchmark: BenchmarkArg, + + #[arg(short, long, default_value_t = 5)] + iterations: usize, + + #[arg(short, long)] + verbose: bool, + + #[arg(long)] + tracing: bool, + + #[arg(short, long, default_value_t, value_enum)] + display_format: DisplayFormat, + + #[arg(short, long, value_delimiter = ',')] + queries: Option>, + + #[arg(short, long, value_delimiter = ',')] + exclude_queries: Option>, + + #[arg(short)] + output_path: Option, + + #[arg(long, default_value_t = false)] + track_memory: bool, + + #[arg(long, default_value_t = false)] + hide_progress_bar: bool, + + #[arg(long = "opt", value_delimiter = ',', value_parser = clap::value_parser!(Opt))] + options: Vec, +} + +struct ClickHouseQueryResult { + row_count: usize, +} + +impl BenchmarkQueryResult for ClickHouseQueryResult { + fn row_count(&self) -> usize { + self.row_count + } + + fn display(self) -> String { + format!("{} rows", self.row_count) + } +} + +fn main() -> anyhow::Result<()> { + let args = Args::parse(); + let opts = Opts::from(args.options); + + setup_logging_and_tracing(args.verbose, args.tracing)?; + + let benchmark = create_benchmark(args.benchmark, &opts)?; + + let filtered_queries = filter_queries( + benchmark.queries()?, + args.queries.as_ref(), + args.exclude_queries.as_ref(), + ); + + // Generate base Parquet data if needed. 
+ if benchmark.data_url().scheme() == "file" { + let runtime = Runtime::new()?; + runtime.block_on(async { benchmark.generate_base_data().await })?; + } + + let formats = vec![Format::Parquet]; + + let mut runner = SqlBenchmarkRunner::new( + benchmark.as_ref(), + Engine::ClickHouse, + formats, + args.track_memory, + args.hide_progress_bar, + )?; + + runner.run_all( + &filtered_queries, + BenchmarkMode::Run { + iterations: args.iterations, + }, + |format| ClickHouseClient::new(benchmark.as_ref(), format), + |ctx, _query_idx, _format, query| { + let (row_count, duration) = ctx.execute_query(query)?; + Ok((duration, ClickHouseQueryResult { row_count })) + }, + )?; + + let benchmark_id = format!("clickhouse-{}", benchmark.dataset_name()); + let writer = create_output_writer(&args.display_format, args.output_path, &benchmark_id)?; + runner.export_to(&args.display_format, writer)?; + + Ok(()) +} diff --git a/vortex-bench/src/clickbench/benchmark.rs b/vortex-bench/src/clickbench/benchmark.rs index 5e14cbcf40e..a0dcb4ea44f 100644 --- a/vortex-bench/src/clickbench/benchmark.rs +++ b/vortex-bench/src/clickbench/benchmark.rs @@ -1,14 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::env; use std::fs; -use std::path::Path; +use std::path::PathBuf; use anyhow::Result; use reqwest::Client; use url::Url; -use vortex::error::VortexExpect; use crate::Benchmark; use crate::BenchmarkDataset; @@ -37,14 +35,21 @@ impl ClickBenchBenchmark { }) } + /// Returns the path to the queries file. 
+ fn queries_file_path(&self) -> PathBuf { + if let Some(file) = &self.queries_file { + return file.into(); + } + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest_dir.join("clickbench_queries.sql") + } + fn create_data_url(remote_data_dir: &Option, flavor: Flavor) -> Result { match remote_data_dir { None => { let basepath = format!("clickbench_{flavor}").to_data_path(); - Ok(Url::parse(&format!( - "file:{}/", - basepath.to_str().vortex_expect("path should be utf8") - ))?) + Url::from_directory_path(basepath) + .map_err(|_| anyhow::anyhow!("Failed to convert ClickBench data path to URL")) } Some(remote_data_dir) => { if !remote_data_dir.ends_with("/") { @@ -69,10 +74,7 @@ impl ClickBenchBenchmark { #[async_trait::async_trait] impl Benchmark for ClickBenchBenchmark { fn queries(&self) -> Result> { - let queries_filepath = match &self.queries_file { - Some(file) => file.into(), - None => Path::new(env!("CARGO_MANIFEST_DIR")).join("clickbench_queries.sql"), - }; + let queries_filepath = self.queries_file_path(); Ok(fs::read_to_string(queries_filepath)? .split(';') diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 6dad0f0f6a1..8be4c6bcea8 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -206,6 +206,9 @@ pub enum Engine { #[clap(name = "duckdb")] #[serde(rename = "duckdb")] DuckDB, + #[clap(name = "clickhouse")] + #[serde(rename = "clickhouse")] + ClickHouse, } impl Display for Engine { @@ -213,6 +216,7 @@ impl Display for Engine { match self { Engine::DataFusion => write!(f, "datafusion"), Engine::DuckDB => write!(f, "duckdb"), + Engine::ClickHouse => write!(f, "clickhouse"), Engine::Vortex => write!(f, "vortex"), Engine::Arrow => write!(f, "arrow"), }