86 changes: 43 additions & 43 deletions .github/workflows/benchmark.yml
@@ -108,24 +108,24 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Cache benchmark data
id: cache-data
uses: actions/cache@v4
with:
path: benchmark-data-sf${{ env.SCALE_FACTOR }}
key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}

- name: Setup Python
if: steps.cache-data.outputs.cache-hit != 'true'
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install huggingface-hub
if: steps.cache-data.outputs.cache-hit != 'true'
run: pip install huggingface-hub

- name: Download benchmark data from Hugging Face
if: steps.cache-data.outputs.cache-hit != 'true'
run: |
@@ -136,36 +136,36 @@ jobs:
else
HF_SF="sf${SF}"
fi

echo "Downloading data from HF: ${{ env.HF_DATASET }}/${{ env.HF_DATA_VERSION }}/${HF_SF}"

python -c "
from huggingface_hub import snapshot_download
import os

sf = os.environ['SCALE_FACTOR']
hf_sf = 'sf0.1' if sf == '0.1' else f'sf{sf}'

snapshot_download(
repo_id='${{ env.HF_DATASET }}',
repo_type='dataset',
local_dir='hf-data',
allow_patterns=[f'${{ env.HF_DATA_VERSION }}/{hf_sf}/**'],
)
"

# Move data to expected location
mkdir -p benchmark-data-sf${{ env.SCALE_FACTOR }}

SF="${{ env.SCALE_FACTOR }}"
if [ "$SF" = "0.1" ]; then
HF_SF="sf0.1"
else
HF_SF="sf${SF}"
fi

cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/* benchmark-data-sf${{ env.SCALE_FACTOR }}/

echo "Downloaded data structure:"
find benchmark-data-sf${{ env.SCALE_FACTOR }} -type f -name "*.parquet" | head -20
echo ""
@@ -174,7 +174,7 @@ jobs:
echo ""
echo "Total size:"
du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/

- name: Show cached data info
if: steps.cache-data.outputs.cache-hit == 'true'
run: |
@@ -192,19 +192,19 @@ jobs:
if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'duckdb')
steps:
- uses: actions/checkout@v4

- name: Restore benchmark data from cache
uses: actions/cache/restore@v4
with:
path: benchmark-data-sf${{ env.SCALE_FACTOR }}
key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
fail-on-cache-miss: true

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install dependencies
run: |
echo "=== DuckDB Installation Parameters ==="
@@ -221,12 +221,12 @@ jobs:
pip install duckdb pyarrow pandas
fi
echo "Installed DuckDB version: $(python -c 'import duckdb; print(duckdb.__version__)')"

- name: Pre-install DuckDB spatial extension
run: |
# Dev builds don't have spatial extension in core_nightly, so always use default repo
python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial'); print('DuckDB spatial extension installed')"

- name: Run DuckDB benchmark
run: |
python benchmark/run_benchmark.py \
@@ -236,7 +236,7 @@ jobs:
--runs ${{ env.BENCHMARK_RUNS }} \
--scale-factor ${{ env.SCALE_FACTOR }} \
--output duckdb_results.json

- name: Upload results
uses: actions/upload-artifact@v4
with:
@@ -251,19 +251,19 @@ jobs:
if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'geopandas')
steps:
- uses: actions/checkout@v4

- name: Restore benchmark data from cache
uses: actions/cache/restore@v4
with:
path: benchmark-data-sf${{ env.SCALE_FACTOR }}
key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
fail-on-cache-miss: true

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install dependencies
run: |
if [ -n "${{ env.GEOPANDAS_VERSION }}" ]; then
@@ -272,7 +272,7 @@ jobs:
pip install geopandas pandas pyarrow shapely
fi
echo "Installed GeoPandas version: $(python -c 'from importlib.metadata import version; print(version("geopandas"))')"

- name: Run GeoPandas benchmark
run: |
python benchmark/run_benchmark.py \
@@ -282,7 +282,7 @@ jobs:
--runs ${{ env.BENCHMARK_RUNS }} \
--scale-factor ${{ env.SCALE_FACTOR }} \
--output geopandas_results.json

- name: Upload results
uses: actions/upload-artifact@v4
with:
@@ -297,19 +297,19 @@ jobs:
if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'sedonadb')
steps:
- uses: actions/checkout@v4

- name: Restore benchmark data from cache
uses: actions/cache/restore@v4
with:
path: benchmark-data-sf${{ env.SCALE_FACTOR }}
key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
fail-on-cache-miss: true

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install dependencies
run: |
echo "=== SedonaDB Installation Parameters ==="
@@ -328,7 +328,7 @@ jobs:
pip install "sedonadb[geopandas]" pandas pyarrow pyproj
fi
echo "Installed SedonaDB version: $(python -c 'from importlib.metadata import version; print(version("sedonadb"))')"

- name: Run SedonaDB benchmark
run: |
python benchmark/run_benchmark.py \
@@ -338,7 +338,7 @@ jobs:
--runs ${{ env.BENCHMARK_RUNS }} \
--scale-factor ${{ env.SCALE_FACTOR }} \
--output sedonadb_results.json

- name: Upload results
uses: actions/upload-artifact@v4
with:
@@ -353,19 +353,19 @@ jobs:
if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'spatial_polars')
steps:
- uses: actions/checkout@v4

- name: Restore benchmark data from cache
uses: actions/cache/restore@v4
with:
path: benchmark-data-sf${{ env.SCALE_FACTOR }}
key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
fail-on-cache-miss: true

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install dependencies
run: |
if [ -n "${{ env.SPATIAL_POLARS_VERSION }}" ]; then
@@ -374,7 +374,7 @@ jobs:
pip install "spatial-polars[knn]" pyarrow
fi
echo "Installed Spatial Polars version: $(python -c 'from importlib.metadata import version; print(version("spatial-polars"))')"

- name: Run Spatial Polars benchmark
run: |
python benchmark/run_benchmark.py \
@@ -384,7 +384,7 @@ jobs:
--runs ${{ env.BENCHMARK_RUNS }} \
--scale-factor ${{ env.SCALE_FACTOR }} \
--output spatial_polars_results.json

- name: Upload results
uses: actions/upload-artifact@v4
with:
@@ -399,58 +399,58 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Download DuckDB results
if: needs.benchmark-duckdb.result == 'success'
uses: actions/download-artifact@v4
with:
name: duckdb-results-sf${{ env.SCALE_FACTOR }}
path: results
continue-on-error: true

- name: Download GeoPandas results
if: needs.benchmark-geopandas.result == 'success'
uses: actions/download-artifact@v4
with:
name: geopandas-results-sf${{ env.SCALE_FACTOR }}
path: results
continue-on-error: true

- name: Download SedonaDB results
if: needs.benchmark-sedonadb.result == 'success'
uses: actions/download-artifact@v4
with:
name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
path: results
continue-on-error: true

- name: Download Spatial Polars results
if: needs.benchmark-spatial-polars.result == 'success'
uses: actions/download-artifact@v4
with:
name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
path: results
continue-on-error: true

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Generate summary
run: |
python benchmark/summarize_results.py \
--results-dir results \
--timeout ${{ env.QUERY_TIMEOUT }} \
--runs ${{ env.BENCHMARK_RUNS }} \
--output benchmark_summary.md

- name: Display summary
run: cat benchmark_summary.md

- name: Add summary to job output
run: cat benchmark_summary.md >> $GITHUB_STEP_SUMMARY

- name: Upload combined results
uses: actions/upload-artifact@v4
with:
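For reference, the "Download benchmark data from Hugging Face" step above boils down to the short standalone script below. This is a sketch only, not part of the diff: the dataset id, data version, and scale factor are placeholders for the workflow's `HF_DATASET`, `HF_DATA_VERSION`, and `SCALE_FACTOR` environment values, which are defined outside the hunks shown here.

```python
# Sketch of the workflow's download step, runnable outside CI.
# HF_DATASET / HF_DATA_VERSION / SCALE_FACTOR are placeholders; the real values
# come from the workflow's env block, which is not part of this diff.
import os
import shutil

from huggingface_hub import snapshot_download

hf_dataset = os.environ.get("HF_DATASET", "<org>/<dataset>")
data_version = os.environ.get("HF_DATA_VERSION", "<version>")
sf = os.environ.get("SCALE_FACTOR", "1")

# SF 0.1 is stored as "sf0.1"; every other scale factor as "sf<N>".
hf_sf = "sf0.1" if sf == "0.1" else f"sf{sf}"

# Download only the requested version/scale-factor subtree of the dataset repo.
snapshot_download(
    repo_id=hf_dataset,
    repo_type="dataset",
    local_dir="hf-data",
    allow_patterns=[f"{data_version}/{hf_sf}/**"],
)

# Copy into the layout the cache and benchmark jobs expect.
shutil.copytree(f"hf-data/{data_version}/{hf_sf}",
                f"benchmark-data-sf{sf}", dirs_exist_ok=True)
```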
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
@@ -89,3 +89,7 @@ repos:
- id: forbid-submodules
name: run forbid-submodules
description: forbids any submodules in the repository
- id: trailing-whitespace
name: run trailing-whitespace
description: trims trailing whitespace
args: [--markdown-linebreak-ext=md]
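The new `trailing-whitespace` hook appears to be the source of the whitespace-only changes in the other files in this diff: on commit it strips trailing spaces and tabs, while `--markdown-linebreak-ext=md` keeps Markdown's two-space hard line breaks in `.md` files. A rough Python equivalent of the basic behaviour (not the hook's actual implementation, and without the Markdown special case) looks like this:

```python
# Rough illustration of what the trailing-whitespace hook does; the real hook
# is pre-commit's built-in fixer, so this is illustrative only.
from pathlib import Path

def trim_trailing_whitespace(path: Path) -> bool:
    """Strip trailing spaces/tabs from each line; return True if the file changed."""
    text = path.read_text()
    trimmed = "".join(line.rstrip() + "\n" for line in text.splitlines())
    if trimmed != text:
        path.write_text(trimmed)
        return True
    return False

if __name__ == "__main__":
    changed = [p for p in Path(".").rglob("*.yml") if trim_trailing_whitespace(p)]
    print(f"trimmed {len(changed)} file(s)")
```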
2 changes: 1 addition & 1 deletion ARCHITECTURE.md
@@ -21,7 +21,7 @@ has many more dependencies.

## Performance

Speed is a very important aspect of this project, and care has been taken to keep
the code as fast as possible, using some of the following techniques:
1. Avoiding heap allocations during data generation
2. Integer arithmetic and display instead of floating point arithmetic and display
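The ARCHITECTURE.md excerpt above lists integer arithmetic and display as one of the generator's performance techniques. As a rough illustration of the general idea (not the crate's actual Rust code), a decimal value can be kept as a scaled integer and formatted without any floating-point work:

```python
# Fixed-point formatting sketch: store money-like values as integer hundredths
# and render them with integer math only. Illustrative; not the crate's code.
def format_decimal(scaled: int, scale: int = 2) -> str:
    sign = "-" if scaled < 0 else ""
    whole, frac = divmod(abs(scaled), 10 ** scale)
    return f"{sign}{whole}.{frac:0{scale}d}"

assert format_decimal(1234) == "12.34"
assert format_decimal(-5) == "-0.05"
```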
2 changes: 1 addition & 1 deletion README.md
@@ -29,7 +29,7 @@ writing "idiomatic" pandas code. We would be interested in hearing feedback on t
hand-optimized" version of the queries.

[Spatial Polars](https://atl2001.github.io/spatial_polars), like Geopandas, is not SQL-based. It uses shapely to extend
polars, enabling it to work with geospatial data similar to how Geopandas extends pandas. It is much newer and nowhere
near as popular/tested as Geopandas, but is capable of computing all of the spatial bench queries, and has been included.

We welcome contributions and civil discussions on how to improve the queries and their implementations.
2 changes: 1 addition & 1 deletion dev/release/README.md
@@ -91,7 +91,7 @@ git push upstream sedona-spatialbench-0.1.0-rc1

### Signing Commands

Now the assets need to be signed with signatures.

**GPG Signing:**

2 changes: 1 addition & 1 deletion docs/contributors-guide.md
@@ -67,7 +67,7 @@ Your first step is to create a personal copy of the repository and connect it to
origin https://github.com/YourUsername/sedona-spatialbench.git (push)
upstream https://github.com/apache/sedona-spatialbench.git (fetch)
upstream https://github.com/apache/sedona-spatialbench.git (push)
```

## Development Setup

8 changes: 4 additions & 4 deletions spatialbench-arrow/README.md
@@ -24,18 +24,18 @@ This crate generates Spatial Bench data directly into [Apache Arrow] format usin
[Apache Arrow]: https://arrow.apache.org/
[arrow]: https://crates.io/crates/arrow

# Example usage:

See [docs.rs page](https://docs.rs/tpchgen-arrow/latest/tpchgen_arrow/)

# Testing:
This crate ensures correct results using two methods.

1. Basic functional tests are in Rust doc tests in the source code (`cargo test --doc`)
2. The `reparse` integration test ensures that the Arrow generators
produce the same results as parsing the original `tbl` format (`cargo test --test reparse`)

# Contributing:

Please see [CONTRIBUTING.md] for more information on how to contribute to this project.

2 changes: 1 addition & 1 deletion spatialbench-queries/spatial_polars.py
@@ -21,7 +21,7 @@

import spatial_polars # NOQA:F401 needed to add spatial namespace to polars dataframes

# for Q12 Spatial polars uses scipy's KDtree for KNN joins.
# Scipy must be installed for this to work.
# `pip install spatial-polars[knn]`
# which is essentially the same as
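The comment above notes that Spatial Polars relies on scipy's KDTree for the Q12 KNN join. A minimal sketch of that pattern, using scipy directly rather than spatial_polars' own API, looks like this (coordinates and sizes are made up for illustration):

```python
# KNN lookup with a k-d tree: for each left-side point, find the k nearest
# right-side points. Illustrative only; this is not spatial_polars' code.
import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(42)
left_xy = rng.uniform(0.0, 100.0, size=(1_000, 2))    # e.g. query points
right_xy = rng.uniform(0.0, 100.0, size=(10_000, 2))  # e.g. candidate points

tree = cKDTree(right_xy)
k = 5
dist, idx = tree.query(left_xy, k=k)  # shapes: (1000, k) each

# idx[i] holds the row positions in right_xy of the k nearest neighbours of
# left_xy[i]; a KNN join then pairs left row i with those k right rows.
print(dist.shape, idx.shape)
```

Note that a k-d tree computes planar (Euclidean) distances, so longitude/latitude data generally needs projecting to a planar CRS before this kind of lookup.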