diff --git a/datafusion-vortex-partitioned/README.md b/datafusion-vortex-partitioned/README.md new file mode 100644 index 000000000..767eed848 --- /dev/null +++ b/datafusion-vortex-partitioned/README.md @@ -0,0 +1,43 @@ +# DataFusion + Vortex + +Partitioned Vortex dataset, converted one-for-one from the 100 ClickBench Parquet files and queried with [`vortex-datafusion-cli`]. + +[`vortex-datafusion-cli`]: https://github.com/vortex-data/vortex-datafusion-cli + +## Cookbook: Generate benchmark results + +Follow the same EC2 setup used by [datafusion-partitioned](../datafusion-partitioned/README.md), then run: + +```bash +cd ClickBench/datafusion-vortex-partitioned +bash benchmark.sh +``` + +The benchmark script builds `vortex-datafusion-cli`, downloads the partitioned Parquet files, converts each `partitioned/hits_N.parquet` file into exactly one `vortex/hits_N.vortex` file, and runs the query set. + +`benchmark.sh` checks out `vortex-datafusion-cli` tag `0.70.0-53.1.0`. CLI tags use the format `<vortex-datafusion version>-<datafusion version>`, where the first component is the `vortex-datafusion` crate version and the second is the DataFusion/DataFusion CLI version. + +You can update/preview the results by running: + +```bash +./make-json.sh <machine> # Example. ./make-json.sh c6a.xlarge +``` + +## Parquet to Vortex conversion + +Each input file is converted independently through `vortex-datafusion-cli`: + +```sql +CREATE EXTERNAL TABLE hits_parquet +STORED AS PARQUET +LOCATION 'partitioned/hits_0.parquet' +OPTIONS ('binary_as_string' 'true'); + +COPY ( + SELECT * EXCEPT ("EventDate"), + CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" + FROM hits_parquet +) TO 'vortex/hits_0.vortex' STORED AS VORTEX; +``` + +`binary_as_string=true` handles the incorrect Parquet logical annotation before Vortex is written. The produced Vortex files store those fields as strings, so benchmark reads use only the Vortex table registration. 
diff --git a/datafusion-vortex-partitioned/benchmark.sh b/datafusion-vortex-partitioned/benchmark.sh new file mode 100755 index 000000000..c1995eac4 --- /dev/null +++ b/datafusion-vortex-partitioned/benchmark.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +set -Eeuo pipefail + +export HOME=${HOME:=~} +# - +CLI_TAG=0.70.0-53.1.0 +WITH_SWAP=false + +if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then + echo "LOW MEMORY MODE" + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + echo "Enabling 8G swap" + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + WITH_SWAP=true + fi +fi + +echo "Install Rust" +if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y +fi +source "$HOME/.cargo/env" + +echo "Install dependencies" +sudo apt-get update -y +sudo apt-get install -y build-essential cmake pkg-config time + +echo "Install vortex-datafusion-cli" +rm -rf vortex-datafusion-cli +git clone https://github.com/vortex-data/vortex-datafusion-cli.git +cd vortex-datafusion-cli +git checkout "$CLI_TAG" +CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --bin vortex-datafusion-cli +export PATH="$(pwd)/target/release:$PATH" +cd .. + +echo "Download benchmark target data, partitioned" +mkdir -p partitioned +seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' + +echo "Convert Parquet to Vortex" +rm -rf vortex +mkdir -p vortex +if ! 
/usr/bin/time -f '%e' -o load-time.txt bash -c ' + set -Eeuo pipefail + seq 0 99 | xargs -P"$(nproc)" -I{} ./convert.sh "partitioned/hits_{}.parquet" "vortex/hits_{}.vortex" +' > convert.log 2>&1; then + cat convert.log + exit 1 +fi +VORTEX_FILES=$(find vortex -maxdepth 1 -name 'hits_*.vortex' | wc -l) +if [ "$VORTEX_FILES" -ne 100 ]; then + echo "Expected 100 Vortex files, found $VORTEX_FILES" >&2 + exit 1 +fi +echo "Load time: $(cat load-time.txt)" + +echo "Run benchmarks for partitioned" +./run.sh + +echo "Data size: $(du -bcs vortex/*.vortex | grep total)" + +if [ "$WITH_SWAP" = true ]; then + echo "Disable swap" + sudo swapoff /swapfile + sudo rm /swapfile +fi diff --git a/datafusion-vortex-partitioned/convert.sh b/datafusion-vortex-partitioned/convert.sh new file mode 100755 index 000000000..6bf5c19b5 --- /dev/null +++ b/datafusion-vortex-partitioned/convert.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -Eeuo pipefail + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <input.parquet> <output.vortex>" >&2 + exit 2 +fi + +INPUT=$1 +OUTPUT=$2 + +mkdir -p "$(dirname "$OUTPUT")" +rm -f "$OUTPUT" + +vortex-datafusion-cli -q \ + -c "SET datafusion.execution.target_partitions = 1;" \ + -c "CREATE EXTERNAL TABLE hits_parquet STORED AS PARQUET LOCATION '$INPUT' OPTIONS ('binary_as_string' 'true');" \ + -c "COPY (SELECT * EXCEPT (\"EventDate\"), CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" FROM hits_parquet) TO '$OUTPUT' STORED AS VORTEX;" + +test -f "$OUTPUT" diff --git a/datafusion-vortex-partitioned/create.sql b/datafusion-vortex-partitioned/create.sql new file mode 100644 index 000000000..e54d401ac --- /dev/null +++ b/datafusion-vortex-partitioned/create.sql @@ -0,0 +1,3 @@ +CREATE EXTERNAL TABLE hits +STORED AS VORTEX +LOCATION 'vortex'; diff --git a/datafusion-vortex-partitioned/make-json.sh b/datafusion-vortex-partitioned/make-json.sh new file mode 100755 index 000000000..8f6135f90 --- /dev/null +++ b/datafusion-vortex-partitioned/make-json.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# This 
script converts the raw `result.csv` data from `benchmark.sh` into the +# final json format used by the benchmark dashboard. +# +# usage : ./make-json.sh <machine> +# +# example (save results/c6a.4xlarge.json) +# ./make-json.sh c6a.4xlarge + +MACHINE=$1 +OUTPUT_FILE="results/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Vortex, partitioned)" +DATE=$(date +%Y-%m-%d) +LOAD_TIME=${LOAD_TIME:-$(cat load-time.txt 2>/dev/null || echo null)} +DATA_SIZE=${DATA_SIZE:-$(du -bcs vortex/*.vortex 2>/dev/null | awk '/total/ { print $1 }')} +DATA_SIZE=${DATA_SIZE:-null} + +mkdir -p results + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i < length(arr)) printf ",\n"}}' result.csv) + +cat << EOF > $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": $LOAD_TIME, + "data_size": $DATA_SIZE, + "result": [ + $RESULT_ARRAY + ] +} +EOF diff --git a/datafusion-vortex-partitioned/queries.sql b/datafusion-vortex-partitioned/queries.sql new file mode 100644 index 000000000..9a183cd6e --- /dev/null +++ b/datafusion-vortex-partitioned/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0; +SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; +SELECT AVG("UserID") FROM hits; +SELECT COUNT(DISTINCT "UserID") FROM hits; +SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; +SELECT MIN("EventDate"), MAX("EventDate") FROM hits; +SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; +SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; +SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" 
ORDER BY c DESC LIMIT 10; +SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10; +SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; +SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10; +SELECT "SearchPhrase" FROM 
hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10; +SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), 
SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; +SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; +SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; +SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY 
"ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; +SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS 
PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; diff --git a/datafusion-vortex-partitioned/results/c6a.2xlarge.json b/datafusion-vortex-partitioned/results/c6a.2xlarge.json new file mode 100644 index 000000000..77659d69c --- /dev/null +++ b/datafusion-vortex-partitioned/results/c6a.2xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-05", + "machine": "c6a.2xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 108.58, + "data_size": 15328662856, + "result": [ + [0.078,0.002,0.002], + [0.170,0.027,0.028], + [0.214,0.072,0.069], + [0.650,0.071,0.070], + [1.367,0.840,0.837], + [1.355,0.785,0.786], + [0.075,0.002,0.002], + [0.179,0.031,0.033], + [1.269,1.045,1.044], + [1.670,1.251,1.237], + [0.770,0.162,0.156], + [1.099,0.192,0.191], + [1.603,0.694,0.682], + [3.280,1.199,1.198], + [1.395,0.667,0.670], + [1.108,0.948,0.942], + [3.158,1.772,1.779], + [3.122,1.778,1.785], + [5.120,3.531,3.507], + [0.313,0.043,0.048], + [15.675,0.906,0.901], + [17.831,0.929,0.926], + [22.767,1.105,1.103], + [22.582,1.521,1.574], + [0.319,0.079,0.075], + [1.603,0.146,0.147], + [0.609,0.081,0.082], + [16.343,1.328,1.365], + [15.637,15.200,15.211], + [0.814,0.656,0.668], + [2.796,0.592,0.595], + [5.885,0.641,0.629], + [3.929,3.007,3.019], + [16.025,3.545,3.512], + [15.999,3.567,3.503], + [1.455,1.293,1.298], + [0.254,0.074,0.073], + [0.203,0.034,0.034], + [0.243,0.024,0.022], + [0.386,0.130,0.129], + [0.247,0.019,0.016], + [0.249,0.015,0.015], + [0.242,0.014,0.015] + ] +} \ No newline at end of file diff --git a/datafusion-vortex-partitioned/results/c6a.4xlarge.json 
b/datafusion-vortex-partitioned/results/c6a.4xlarge.json new file mode 100644 index 000000000..cb9a10682 --- /dev/null +++ b/datafusion-vortex-partitioned/results/c6a.4xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-05", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 99.94, + "data_size": 15328662856, + "result": [ + [0.082, 0.002, 0.002], + [0.152, 0.030, 0.027], + [0.175, 0.059, 0.060], + [0.633, 0.087, 0.089], + [1.306, 0.628, 0.626], + [1.316, 0.603, 0.592], + [0.090, 0.002, 0.002], + [0.166, 0.031, 0.030], + [1.223, 0.786, 0.772], + [1.664, 0.861, 0.871], + [0.729, 0.131, 0.133], + [1.116, 0.148, 0.147], + [1.605, 0.581, 0.578], + [3.174, 1.070, 1.068], + [1.527, 0.610, 0.597], + [0.887, 0.727, 0.715], + [3.174, 1.509, 1.532], + [3.153, 1.510, 1.506], + [4.788, 2.907, 2.827], + [0.313, 0.048, 0.049], + [15.848, 0.537, 0.528], + [17.859, 0.781, 0.772], + [22.900, 0.894, 0.878], + [21.132, 0.858, 0.785], + [0.287, 0.084, 0.091], + [1.607, 0.153, 0.146], + [0.757, 0.088, 0.088], + [16.230, 0.974, 1.011], + [13.690, 8.122, 8.151], + [0.512, 0.369, 0.364], + [2.788, 0.487, 0.487], + [5.882, 0.592, 0.589], + [3.929, 2.653, 2.674], + [15.934, 3.107, 2.890], + [15.951, 2.902, 2.918], + [1.060, 0.930, 0.917], + [0.261, 0.082, 0.082], + [0.208, 0.035, 0.036], + [0.197, 0.024, 0.023], + [0.385, 0.146, 0.146], + [0.251, 0.017, 0.016], + [0.248, 0.019, 0.014], + [0.244, 0.015, 0.015] + ] +} diff --git a/datafusion-vortex-partitioned/results/c6a.xlarge.json b/datafusion-vortex-partitioned/results/c6a.xlarge.json new file mode 100644 index 000000000..162555df7 --- /dev/null +++ b/datafusion-vortex-partitioned/results/c6a.xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-05", + "machine": "c6a.xlarge", + "cluster_size": 1, + "proprietary": 
"no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 203.96, + "data_size": 15328662856, + "result": [ + [0.094, 0.002, 0.002], + [0.255, 0.048, 0.045], + [0.372, 0.126, 0.126], + [0.646, 0.104, 0.102], + [1.758, 1.538, 1.542], + [1.531, 1.347, 1.361], + [0.076, 0.002, 0.002], + [0.269, 0.059, 0.055], + [2.173, 1.919, 1.895], + [2.624, 2.323, 2.232], + [0.781, 0.277, 0.277], + [0.881, 0.347, 0.351], + [1.571, 1.092, 1.096], + [3.396, 1.602, 1.609], + [1.666, 1.054, 1.063], + [1.921, 1.705, 1.688], + [3.797, 3.020, 3.037], + [3.719, 3.013, 3.018], + [12.789, 13.580, 8.736], + [0.363, 0.068, 0.067], + [15.387, 1.708, 1.698], + [17.891, 1.673, 1.663], + [22.657, 1.888, 1.923], + [18.103, 2.159, 2.178], + [0.339, 0.105, 0.109], + [1.313, 0.246, 0.249], + [0.351, 0.117, 0.119], + [16.247, 2.395, 2.436], + [29.179, 28.872, 28.767], + [1.477, 1.302, 1.294], + [2.846, 0.978, 0.975], + [5.908, 0.972, 0.956], + [29.987, 6.008, 8.928], + [18.597, 23.579, 21.585], + [24.180, 18.393, 21.401], + [2.550, 2.433, 2.394], + [0.309, 0.081, 0.079], + [0.255, 0.036, 0.036], + [0.255, 0.032, 0.029], + [0.412, 0.140, 0.138], + [0.243, 0.017, 0.017], + [0.237, 0.017, 0.018], + [0.235, 0.017, 0.016] + ] +} diff --git a/datafusion-vortex-partitioned/results/c8g.4xlarge.json b/datafusion-vortex-partitioned/results/c8g.4xlarge.json new file mode 100644 index 000000000..2838bd357 --- /dev/null +++ b/datafusion-vortex-partitioned/results/c8g.4xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + "date": "2026-05-05", + "machine": "c8g.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 107.21, + "data_size": 15329147920, + "result": [ + [0.047, 0.001, 0.001], + [0.092, 0.015, 0.015], + [0.139, 0.038, 0.036], + [0.614, 0.032, 0.032], + [1.347, 0.233, 0.233], + [1.323, 0.236, 0.237], + 
[0.045, 0.001, 0.001], + [0.100, 0.018, 0.017], + [1.034, 0.311, 0.315], + [1.591, 0.577, 0.555], + [0.664, 0.060, 0.061], + [1.381, 0.074, 0.075], + [1.700, 0.214, 0.223], + [2.986, 0.342, 0.355], + [1.295, 0.222, 0.219], + [0.939, 0.257, 0.252], + [2.847, 0.512, 0.510], + [2.693, 0.508, 0.506], + [3.899, 0.980, 0.966], + [0.279, 0.020, 0.022], + [15.898, 0.469, 0.469], + [17.882, 0.303, 0.302], + [23.013, 0.611, 0.373], + [19.679, 0.519, 0.519], + [0.285, 0.037, 0.036], + [1.842, 0.055, 0.057], + [1.036, 0.033, 0.041], + [16.514, 0.451, 0.451], + [13.751, 6.440, 6.440], + [0.434, 0.345, 0.344], + [2.739, 0.191, 0.192], + [6.081, 0.198, 0.190], + [4.114, 0.772, 0.776], + [15.880, 1.048, 1.045], + [15.887, 1.044, 1.054], + [0.561, 0.449, 0.446], + [0.166, 0.054, 0.053], + [0.139, 0.031, 0.031], + [0.145, 0.016, 0.014], + [0.263, 0.106, 0.105], + [0.147, 0.014, 0.013], + [0.138, 0.012, 0.012], + [0.128, 0.014, 0.014] + ] +} diff --git a/datafusion-vortex-partitioned/run.sh b/datafusion-vortex-partitioned/run.sh new file mode 100755 index 000000000..32c93fc81 --- /dev/null +++ b/datafusion-vortex-partitioned/run.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +TRIES=3 +QUERY_NUM=1 +: > result.csv + +cat queries.sql | while read -r query; do + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + + echo "$query" > /tmp/query.sql + + echo -n "[" + for i in $(seq 1 $TRIES); do + RES=$(vortex-datafusion-cli -f create.sql -f /tmp/query.sql 2>&1 | grep "Elapsed" | tail -1 | awk '{ print $2 }') + [[ $RES != "" ]] && \ + echo -n "$RES" || \ + echo -n "null" + [[ "$i" != $TRIES ]] && echo -n ", " + echo "${QUERY_NUM},${i},${RES}" >> result.csv + done + echo "]," + + QUERY_NUM=$((QUERY_NUM + 1)) +done diff --git a/datafusion-vortex-partitioned/template.json b/datafusion-vortex-partitioned/template.json new file mode 100644 index 000000000..cb9f36f87 --- /dev/null +++ b/datafusion-vortex-partitioned/template.json @@ -0,0 +1,12 @@ +{ + "system": "DataFusion (Vortex, partitioned)", + 
"proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "Rust", + "column-oriented", + "embedded", + "stateless" + ] +} diff --git a/datafusion-vortex/README.md b/datafusion-vortex/README.md new file mode 100644 index 000000000..b20e590b3 --- /dev/null +++ b/datafusion-vortex/README.md @@ -0,0 +1,43 @@ +# DataFusion + Vortex + +Single-file Vortex dataset, converted from the ClickBench Parquet file and queried with [`vortex-datafusion-cli`]. + +[`vortex-datafusion-cli`]: https://github.com/vortex-data/vortex-datafusion-cli + +## Cookbook: Generate benchmark results + +Follow the same EC2 setup used by [datafusion](../datafusion/README.md), then run: + +```bash +cd ClickBench/datafusion-vortex +bash benchmark.sh +``` + +The benchmark script builds `vortex-datafusion-cli`, downloads `hits.parquet`, converts it to `vortex/hits.vortex`, and runs the query set. + +`benchmark.sh` checks out `vortex-datafusion-cli` tag `0.70.0-53.1.0`. CLI tags use the format `<vortex-datafusion version>-<datafusion version>`, where the first component is the `vortex-datafusion` crate version and the second is the DataFusion/DataFusion CLI version. + +You can update/preview the results by running: + +```bash +./make-json.sh <machine> # Example. ./make-json.sh c6a.xlarge +``` + +## Parquet to Vortex conversion + +The conversion intentionally goes through the DataFusion CLI path: + +```sql +CREATE EXTERNAL TABLE hits_parquet +STORED AS PARQUET +LOCATION 'hits.parquet' +OPTIONS ('binary_as_string' 'true'); + +COPY ( + SELECT * EXCEPT ("EventDate"), + CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" + FROM hits_parquet +) TO 'vortex/hits.vortex' STORED AS VORTEX; +``` + +`binary_as_string=true` handles the ClickBench Parquet byte/string mismatch before Vortex is written. The resulting Vortex file stores those columns as strings, so Vortex reads do not need the Parquet-only option. 
diff --git a/datafusion-vortex/benchmark.sh b/datafusion-vortex/benchmark.sh new file mode 100755 index 000000000..697766d58 --- /dev/null +++ b/datafusion-vortex/benchmark.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +set -Eeuo pipefail + +export HOME=${HOME:=~} +# - +CLI_TAG=0.70.0-53.1.0 +WITH_SWAP=false + +if [ "$(free -g | awk '/^Mem:/{print $2}')" -lt 12 ]; then + echo "LOW MEMORY MODE" + if [ "$(swapon --noheadings --show | wc -l)" -eq 0 ]; then + echo "Enabling 8G swap" + sudo fallocate -l 8G /swapfile + sudo chmod 600 /swapfile + sudo mkswap /swapfile + sudo swapon /swapfile + WITH_SWAP=true + fi +fi + +echo "Install Rust" +if ! command -v cargo >/dev/null 2>&1; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh + bash rust-init.sh -y +fi +source "$HOME/.cargo/env" + +echo "Install dependencies" +sudo apt-get update -y +sudo apt-get install -y build-essential cmake pkg-config time + +echo "Install vortex-datafusion-cli" +rm -rf vortex-datafusion-cli +git clone https://github.com/vortex-data/vortex-datafusion-cli.git +cd vortex-datafusion-cli +git checkout "$CLI_TAG" +CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" cargo build --release --bin vortex-datafusion-cli +export PATH="$(pwd)/target/release:$PATH" +cd .. + +echo "Download benchmark target data, single file" +wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/hits.parquet + +echo "Convert Parquet to Vortex" +rm -rf vortex +mkdir -p vortex +if ! 
/usr/bin/time -f '%e' -o load-time.txt ./convert.sh hits.parquet vortex/hits.vortex > convert.log 2>&1; then + cat convert.log + exit 1 +fi +echo "Load time: $(cat load-time.txt)" + +echo "Run benchmarks" +./run.sh + +echo "Data size: $(du -bcs vortex/*.vortex | grep total)" + +if [ "$WITH_SWAP" = true ]; then + echo "Disable swap" + sudo swapoff /swapfile + sudo rm /swapfile +fi diff --git a/datafusion-vortex/convert.sh b/datafusion-vortex/convert.sh new file mode 100755 index 000000000..6bf5c19b5 --- /dev/null +++ b/datafusion-vortex/convert.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -Eeuo pipefail + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <input.parquet> <output.vortex>" >&2 + exit 2 +fi + +INPUT=$1 +OUTPUT=$2 + +mkdir -p "$(dirname "$OUTPUT")" +rm -f "$OUTPUT" + +vortex-datafusion-cli -q \ + -c "SET datafusion.execution.target_partitions = 1;" \ + -c "CREATE EXTERNAL TABLE hits_parquet STORED AS PARQUET LOCATION '$INPUT' OPTIONS ('binary_as_string' 'true');" \ + -c "COPY (SELECT * EXCEPT (\"EventDate\"), CAST(CAST(\"EventDate\" AS INTEGER) AS DATE) AS \"EventDate\" FROM hits_parquet) TO '$OUTPUT' STORED AS VORTEX;" + +test -f "$OUTPUT" diff --git a/datafusion-vortex/create.sql b/datafusion-vortex/create.sql new file mode 100644 index 000000000..a5cfc6c05 --- /dev/null +++ b/datafusion-vortex/create.sql @@ -0,0 +1,3 @@ +CREATE EXTERNAL TABLE hits +STORED AS VORTEX +LOCATION 'vortex/hits.vortex'; diff --git a/datafusion-vortex/make-json.sh b/datafusion-vortex/make-json.sh new file mode 100755 index 000000000..8c2e6d259 --- /dev/null +++ b/datafusion-vortex/make-json.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# This script converts the raw `result.csv` data from `benchmark.sh` into the +# final json format used by the benchmark dashboard. 
+# +# usage : ./make-json.sh <machine> +# +# example ./make-json.sh c6a.4xlarge # saves results/c6a.4xlarge.json +# + +MACHINE=$1 +OUTPUT_FILE="results/${MACHINE}.json" +SYSTEM_NAME="DataFusion (Vortex, single)" +DATE=$(date +%Y-%m-%d) +LOAD_TIME=${LOAD_TIME:-$(cat load-time.txt 2>/dev/null || echo null)} +DATA_SIZE=${DATA_SIZE:-$(du -bcs vortex/*.vortex 2>/dev/null | awk '/total/ { print $1 }')} +DATA_SIZE=${DATA_SIZE:-null} + +mkdir -p results + +# Read the CSV and build the result array using awk +RESULT_ARRAY=$(awk -F, '{arr[$1]=arr[$1]","$3} END {for (i=1;i<=length(arr);i++) {gsub(/^,/, "", arr[i]); printf " ["arr[i]"]"; if (i < length(arr)) printf ",\n"}}' result.csv) + +cat << EOF > $OUTPUT_FILE +{ + "system": "$SYSTEM_NAME", + "date": "$DATE", + "machine": "$MACHINE", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": $LOAD_TIME, + "data_size": $DATA_SIZE, + "result": [ + $RESULT_ARRAY + ] +} +EOF diff --git a/datafusion-vortex/queries.sql b/datafusion-vortex/queries.sql new file mode 100644 index 000000000..9a183cd6e --- /dev/null +++ b/datafusion-vortex/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0; +SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; +SELECT AVG("UserID") FROM hits; +SELECT COUNT(DISTINCT "UserID") FROM hits; +SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; +SELECT MIN("EventDate"), MAX("EventDate") FROM hits; +SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; +SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; +SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10; +SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u 
DESC LIMIT 10; +SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10; +SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; +SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" 
LIMIT 10; +SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), 
SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; +SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; +SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; +SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= 
'2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; +SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY 
DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; diff --git a/datafusion-vortex/results/c6a.2xlarge.json b/datafusion-vortex/results/c6a.2xlarge.json new file mode 100644 index 000000000..1b2f6ae65 --- /dev/null +++ b/datafusion-vortex/results/c6a.2xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, single)", + "date": "2026-05-05", + "machine": "c6a.2xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 146.68, + "data_size": 15269997296, + "result": [ + [0.099,0.001,0.001], + [0.298,0.114,0.116], + [0.331,0.161,0.160], + [0.589,0.187,0.190], + [1.180,0.942,0.942], + [1.442,0.953,0.911], + [0.075,0.001,0.001], + [0.275,0.125,0.119], + [1.401,1.153,1.151], + [1.698,1.331,1.357], + [0.707,0.257,0.248], + [0.774,0.272,0.285], + [1.316,0.786,0.809], + [3.030,1.094,1.300], + [1.613,0.761,0.777], + [1.313,1.055,1.061], + [3.449,1.902,1.863], + [3.430,1.894,1.892], + [5.018,3.543,3.527], + [0.382,0.168,0.170], + [15.322,1.071,1.049], + [18.207,1.296,1.316], + [21.967,7.898,6.495], + [0.111,0.118,0.117], + [3.054,0.329,0.334], + [1.231,0.297,0.274], + [2.308,0.318,0.319], + [16.213,1.628,1.613], + [17.861,17.254,17.424], + [0.997,0.832,0.772], + [2.710,0.720,0.738], + [5.799,0.783,0.792], + [4.209,3.275,3.257], + [16.457,3.560,3.559], + [16.425,3.573,3.531], + [1.514,1.348,1.355], + [0.352,0.183,0.186], + [0.291,0.136,0.130], + [0.299,0.139,0.138], + [0.444,0.267,0.271], + [0.284,0.115,0.116], + [0.331,0.116,0.116], + [0.331,0.115,0.111] + ] +} \ No newline at end of file diff --git a/datafusion-vortex/results/c6a.4xlarge.json b/datafusion-vortex/results/c6a.4xlarge.json new file mode 100644 index 000000000..6aa75ba3d --- /dev/null +++ b/datafusion-vortex/results/c6a.4xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, single)", + "date": "2026-05-05", + "machine": 
"c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": null, + "data_size": 15269997296, + "result": [ + [0.100, 0.001, 0.001], + [0.318, 0.142, 0.142], + [0.305, 0.173, 0.170], + [0.622, 0.223, 0.222], + [0.966, 0.743, 0.740], + [1.415, 0.805, 0.806], + [0.078, 0.001, 0.001], + [0.300, 0.146, 0.147], + [1.390, 0.907, 0.903], + [1.811, 0.964, 1.022], + [0.740, 0.244, 0.240], + [0.860, 0.251, 0.254], + [1.506, 0.763, 0.755], + [3.031, 1.076, 1.073], + [1.519, 0.785, 0.791], + [1.056, 0.839, 0.841], + [3.343, 1.647, 1.646], + [3.329, 1.633, 1.635], + [4.732, 2.774, 2.966], + [0.380, 0.193, 0.195], + [15.839, 0.827, 0.833], + [18.245, 1.367, 1.365], + [22.015, 1.433, 1.427], + [57.863, 55.932, 56.962], + [2.468, 0.360, 0.355], + [1.275, 0.316, 0.312], + [2.342, 0.353, 0.360], + [16.237, 1.416, 1.438], + [14.892, 8.576, 9.159], + [0.579, 0.445, 0.479], + [2.761, 0.621, 0.630], + [5.838, 0.722, 0.714], + [5.407, 2.822, 2.790], + [16.400, 3.074, 3.087], + [16.436, 3.055, 3.110], + [1.150, 0.974, 0.969], + [0.383, 0.198, 0.199], + [0.332, 0.159, 0.151], + [0.336, 0.166, 0.161], + [0.482, 0.300, 0.303], + [0.323, 0.151, 0.148], + [0.313, 0.140, 0.145], + [0.301, 0.142, 0.138] + ] +} diff --git a/datafusion-vortex/results/c6a.xlarge.json b/datafusion-vortex/results/c6a.xlarge.json new file mode 100644 index 000000000..96212fe5c --- /dev/null +++ b/datafusion-vortex/results/c6a.xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, single)", + "date": "2026-05-05", + "machine": "c6a.xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "hardware": "cpu", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 214.48, + "data_size": 15269997296, + "result": [ + [0.095,0.001,0.001], + [0.294,0.118,0.118], + [0.368,0.207,0.204], + [0.606,0.202,0.207], + [1.783,1.602,1.609], + [1.597,1.440,1.435], + 
[0.076,0.001,0.001], + [0.266,0.127,0.127], + [2.126,1.966,1.952], + [2.343,2.128,2.112], + [0.715,0.351,0.343], + [0.767,0.405,0.403], + [1.625,1.201,1.196], + [3.496,1.662,1.918], + [1.746,1.138,1.141], + [1.917,1.756,1.766], + [3.860,3.092,3.051], + [3.811,3.068,3.039], + [14.188,14.738,15.730], + [0.407,0.167,0.185], + [15.198,2.291,2.160], + [30.110,36.377,35.010], + [42.205,40.162,38.771], + [0.144,0.144,0.143], + [2.343,0.441,0.430], + [1.271,0.416,0.378], + [2.310,0.463,0.474], + [28.792,28.452,28.469], + [39.856,38.212,39.387], + [1.448,1.290,1.289], + [2.854,1.035,1.040], + [5.820,1.052,1.035], + [21.409,24.075,8.592], + [30.597,33.675,38.120], + [26.918,21.302,19.043], + [2.485,2.311,2.292], + [0.410,0.194,0.182], + [0.346,0.126,0.123], + [0.352,0.142,0.141], + [0.543,0.303,0.280], + [0.331,0.110,0.110], + [0.310,0.108,0.114], + [0.316,0.101,0.107] + ] +} \ No newline at end of file diff --git a/datafusion-vortex/results/c8g.4xlarge.json b/datafusion-vortex/results/c8g.4xlarge.json new file mode 100644 index 000000000..ac55cb45f --- /dev/null +++ b/datafusion-vortex/results/c8g.4xlarge.json @@ -0,0 +1,57 @@ +{ + "system": "DataFusion (Vortex, single)", + "date": "2026-05-05", + "machine": "c8g.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": ["Rust","column-oriented","embedded","stateless"], + "load_time": 70.42, + "data_size": 15269901360, + "result": [ + [0.058, 0.001, 0.001], + [0.221, 0.121, 0.123], + [0.224, 0.125, 0.133], + [0.567, 0.128, 0.131], + [1.030, 0.314, 0.311], + [1.231, 0.402, 0.407], + [0.052, 0.001, 0.001], + [0.224, 0.127, 0.122], + [1.063, 0.396, 0.404], + [1.639, 0.566, 0.609], + [0.655, 0.151, 0.167], + [0.992, 0.161, 0.160], + [1.503, 0.348, 0.354], + [2.757, 0.413, 0.412], + [1.434, 0.355, 0.379], + [0.686, 0.339, 0.341], + [2.707, 0.636, 0.597], + [2.697, 0.620, 0.626], + [3.912, 0.998, 1.044], + [0.339, 0.140, 0.149], + [15.985, 0.618, 0.614], + [18.192, 0.523, 0.528], + 
[21.971, 0.617, 0.619], + [57.869, 57.269, 56.779], + [2.600, 0.193, 0.198], + [1.413, 0.176, 0.173], + [2.498, 0.196, 0.200], + [16.312, 0.682, 0.701], + [15.384, 6.855, 7.000], + [0.484, 0.403, 0.383], + [2.618, 0.305, 0.300], + [5.743, 0.305, 0.311], + [3.820, 0.837, 0.832], + [16.154, 1.232, 1.247], + [16.173, 1.246, 1.170], + [0.666, 0.468, 0.468], + [0.282, 0.158, 0.154], + [0.255, 0.135, 0.140], + [0.261, 0.151, 0.147], + [0.405, 0.274, 0.281], + [0.240, 0.123, 0.128], + [0.237, 0.125, 0.122], + [0.231, 0.127, 0.125] + ] +} diff --git a/datafusion-vortex/run.sh b/datafusion-vortex/run.sh new file mode 100755 index 000000000..32c93fc81 --- /dev/null +++ b/datafusion-vortex/run.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +TRIES=3 +QUERY_NUM=1 +: > result.csv + +cat queries.sql | while read -r query; do + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + + echo "$query" > /tmp/query.sql + + echo -n "[" + for i in $(seq 1 $TRIES); do + RES=$(vortex-datafusion-cli -f create.sql -f /tmp/query.sql 2>&1 | grep "Elapsed" | tail -1 | awk '{ print $2 }') + [[ $RES != "" ]] && \ + echo -n "$RES" || \ + echo -n "null" + [[ "$i" != $TRIES ]] && echo -n ", " + echo "${QUERY_NUM},${i},${RES}" >> result.csv + done + echo "]," + + QUERY_NUM=$((QUERY_NUM + 1)) +done diff --git a/datafusion-vortex/template.json b/datafusion-vortex/template.json new file mode 100644 index 000000000..22a86d6fe --- /dev/null +++ b/datafusion-vortex/template.json @@ -0,0 +1,12 @@ +{ + "system": "DataFusion (Vortex, single)", + "proprietary": "no", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "Rust", + "column-oriented", + "embedded", + "stateless" + ] +}