[CI] Add pre-commit hook to trim trailing whitespace (#79)

jbampton · web-flow · commit e681aa79f8a9 · 2026-01-23T22:10:10.000-08:00
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -108,24 +108,24 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      
+
       - name: Cache benchmark data
         id: cache-data
         uses: actions/cache@v4
         with:
           path: benchmark-data-sf${{ env.SCALE_FACTOR }}
           key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
-      
+
       - name: Setup Python
         if: steps.cache-data.outputs.cache-hit != 'true'
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
-      
+
       - name: Install huggingface-hub
         if: steps.cache-data.outputs.cache-hit != 'true'
         run: pip install huggingface-hub
-      
+
       - name: Download benchmark data from Hugging Face
         if: steps.cache-data.outputs.cache-hit != 'true'
         run: |
@@ -136,36 +136,36 @@ jobs:
           else
             HF_SF="sf${SF}"
           fi
-          
+
           echo "Downloading data from HF: ${{ env.HF_DATASET }}/${{ env.HF_DATA_VERSION }}/${HF_SF}"
-          
+
           python -c "
           from huggingface_hub import snapshot_download
           import os
-          
+
           sf = os.environ['SCALE_FACTOR']
           hf_sf = 'sf0.1' if sf == '0.1' else f'sf{sf}'
-          
+
           snapshot_download(
               repo_id='${{ env.HF_DATASET }}',
               repo_type='dataset',
               local_dir='hf-data',
               allow_patterns=[f'${{ env.HF_DATA_VERSION }}/{hf_sf}/**'],
           )
           "
-          
+
           # Move data to expected location
           mkdir -p benchmark-data-sf${{ env.SCALE_FACTOR }}
-          
+
           SF="${{ env.SCALE_FACTOR }}"
           if [ "$SF" = "0.1" ]; then
             HF_SF="sf0.1"
           else
             HF_SF="sf${SF}"
           fi
-          
+
           cp -r hf-data/${{ env.HF_DATA_VERSION }}/${HF_SF}/* benchmark-data-sf${{ env.SCALE_FACTOR }}/
-          
+
           echo "Downloaded data structure:"
           find benchmark-data-sf${{ env.SCALE_FACTOR }} -type f -name "*.parquet" | head -20
           echo ""
@@ -174,7 +174,7 @@ jobs:
           echo ""
           echo "Total size:"
           du -sh benchmark-data-sf${{ env.SCALE_FACTOR }}/
-      
+
       - name: Show cached data info
         if: steps.cache-data.outputs.cache-hit == 'true'
         run: |
@@ -192,19 +192,19 @@ jobs:
     if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'duckdb')
     steps:
       - uses: actions/checkout@v4
-      
+
       - name: Restore benchmark data from cache
         uses: actions/cache/restore@v4
         with:
           path: benchmark-data-sf${{ env.SCALE_FACTOR }}
           key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
           fail-on-cache-miss: true
-      
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
-      
+
       - name: Install dependencies
         run: |
           echo "=== DuckDB Installation Parameters ==="
@@ -221,12 +221,12 @@ jobs:
             pip install duckdb pyarrow pandas
           fi
           echo "Installed DuckDB version: $(python -c 'import duckdb; print(duckdb.__version__)')"
-      
+
       - name: Pre-install DuckDB spatial extension
         run: |
           # Dev builds don't have spatial extension in core_nightly, so always use default repo
           python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial'); print('DuckDB spatial extension installed')"
-      
+
       - name: Run DuckDB benchmark
         run: |
           python benchmark/run_benchmark.py \
@@ -236,7 +236,7 @@ jobs:
             --runs ${{ env.BENCHMARK_RUNS }} \
             --scale-factor ${{ env.SCALE_FACTOR }} \
             --output duckdb_results.json
-      
+
       - name: Upload results
         uses: actions/upload-artifact@v4
         with:
@@ -251,19 +251,19 @@ jobs:
     if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'geopandas')
     steps:
       - uses: actions/checkout@v4
-      
+
       - name: Restore benchmark data from cache
         uses: actions/cache/restore@v4
         with:
           path: benchmark-data-sf${{ env.SCALE_FACTOR }}
           key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
           fail-on-cache-miss: true
-      
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
-      
+
       - name: Install dependencies
         run: |
           if [ -n "${{ env.GEOPANDAS_VERSION }}" ]; then
@@ -272,7 +272,7 @@ jobs:
             pip install geopandas pandas pyarrow shapely
           fi
           echo "Installed GeoPandas version: $(python -c 'from importlib.metadata import version; print(version("geopandas"))')"
-      
+
       - name: Run GeoPandas benchmark
         run: |
           python benchmark/run_benchmark.py \
@@ -282,7 +282,7 @@ jobs:
             --runs ${{ env.BENCHMARK_RUNS }} \
             --scale-factor ${{ env.SCALE_FACTOR }} \
             --output geopandas_results.json
-      
+
       - name: Upload results
         uses: actions/upload-artifact@v4
         with:
@@ -297,19 +297,19 @@ jobs:
     if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'sedonadb')
     steps:
       - uses: actions/checkout@v4
-      
+
       - name: Restore benchmark data from cache
         uses: actions/cache/restore@v4
         with:
           path: benchmark-data-sf${{ env.SCALE_FACTOR }}
           key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
           fail-on-cache-miss: true
-      
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
-      
+
       - name: Install dependencies
         run: |
           echo "=== SedonaDB Installation Parameters ==="
@@ -328,7 +328,7 @@ jobs:
             pip install "sedonadb[geopandas]" pandas pyarrow pyproj
           fi
           echo "Installed SedonaDB version: $(python -c 'from importlib.metadata import version; print(version("sedonadb"))')"
-      
+
       - name: Run SedonaDB benchmark
         run: |
           python benchmark/run_benchmark.py \
@@ -338,7 +338,7 @@ jobs:
             --runs ${{ env.BENCHMARK_RUNS }} \
             --scale-factor ${{ env.SCALE_FACTOR }} \
             --output sedonadb_results.json
-      
+
       - name: Upload results
         uses: actions/upload-artifact@v4
         with:
@@ -353,19 +353,19 @@ jobs:
     if: contains(github.event.inputs.engines || 'duckdb,geopandas,sedonadb,spatial_polars', 'spatial_polars')
     steps:
       - uses: actions/checkout@v4
-      
+
       - name: Restore benchmark data from cache
         uses: actions/cache/restore@v4
         with:
           path: benchmark-data-sf${{ env.SCALE_FACTOR }}
           key: benchmark-data-${{ env.HF_DATA_VERSION }}-sf${{ env.SCALE_FACTOR }}
           fail-on-cache-miss: true
-      
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
-      
+
       - name: Install dependencies
         run: |
           if [ -n "${{ env.SPATIAL_POLARS_VERSION }}" ]; then
@@ -374,7 +374,7 @@ jobs:
             pip install "spatial-polars[knn]" pyarrow
           fi
           echo "Installed Spatial Polars version: $(python -c 'from importlib.metadata import version; print(version("spatial-polars"))')"
-      
+
       - name: Run Spatial Polars benchmark
         run: |
           python benchmark/run_benchmark.py \
@@ -384,7 +384,7 @@ jobs:
             --runs ${{ env.BENCHMARK_RUNS }} \
             --scale-factor ${{ env.SCALE_FACTOR }} \
             --output spatial_polars_results.json
-      
+
       - name: Upload results
         uses: actions/upload-artifact@v4
         with:
@@ -399,58 +399,58 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      
+
       - name: Download DuckDB results
         if: needs.benchmark-duckdb.result == 'success'
         uses: actions/download-artifact@v4
         with:
           name: duckdb-results-sf${{ env.SCALE_FACTOR }}
           path: results
         continue-on-error: true
-      
+
       - name: Download GeoPandas results
         if: needs.benchmark-geopandas.result == 'success'
         uses: actions/download-artifact@v4
         with:
           name: geopandas-results-sf${{ env.SCALE_FACTOR }}
           path: results
         continue-on-error: true
-      
+
       - name: Download SedonaDB results
         if: needs.benchmark-sedonadb.result == 'success'
         uses: actions/download-artifact@v4
         with:
           name: sedonadb-results-sf${{ env.SCALE_FACTOR }}
           path: results
         continue-on-error: true
-      
+
       - name: Download Spatial Polars results
         if: needs.benchmark-spatial-polars.result == 'success'
         uses: actions/download-artifact@v4
         with:
           name: spatial_polars-results-sf${{ env.SCALE_FACTOR }}
           path: results
         continue-on-error: true
-      
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
-      
+
       - name: Generate summary
         run: |
           python benchmark/summarize_results.py \
             --results-dir results \
             --timeout ${{ env.QUERY_TIMEOUT }} \
             --runs ${{ env.BENCHMARK_RUNS }} \
             --output benchmark_summary.md
-      
+
       - name: Display summary
         run: cat benchmark_summary.md
-      
+
       - name: Add summary to job output
         run: cat benchmark_summary.md >> $GITHUB_STEP_SUMMARY
-      
+
       - name: Upload combined results
         uses: actions/upload-artifact@v4
         with:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -89,3 +89,7 @@ repos:
       - id: forbid-submodules
         name: run forbid-submodules
         description: forbids any submodules in the repository
+      - id: trailing-whitespace
+        name: run trailing-whitespace
+        description: trims trailing whitespace
+        args: [--markdown-linebreak-ext=md]
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
@@ -21,7 +21,7 @@ has many more dependencies.
 
 ## Performance
 
-Speed is a very important aspect of this project, and care has been taken to keep 
+Speed is a very important aspect of this project, and care has been taken to keep
 the code as fast as possible, using some of the following techniques:
 1. Avoiding heap allocations during data generation
 2. Integer arithmetic and display instead of floating point arithmetic and display
diff --git a/README.md b/README.md
@@ -29,7 +29,7 @@ writing "idiomatic" pandas code. We would be interested in hearing feedback on t
 hand-optimized" version of the queries.
 
 [Spatial Polars](https://atl2001.github.io/spatial_polars), like Geopandas, is not SQL-based.  It uses shapely to extend
-polars, enabling it to work with geospatial data similar to how Geopandas extends pandas.  It is much newer and nowhere 
+polars, enabling it to work with geospatial data similar to how Geopandas extends pandas.  It is much newer and nowhere
 near as popular/tested as Geopandas, but is capable of computing all of the spatial bench queries, and has been included.
 
 We welcome contributions and civil discussions on how to improve the queries and their implementations.
diff --git a/dev/release/README.md b/dev/release/README.md
@@ -91,7 +91,7 @@ git push upstream sedona-spatialbench-0.1.0-rc1
 
 ### Signing Commands
 
-Now the assets need to be signed with signatures. 
+Now the assets need to be signed with signatures.
 
 **GPG Signing:**
 
diff --git a/docs/contributors-guide.md b/docs/contributors-guide.md
@@ -67,7 +67,7 @@ Your first step is to create a personal copy of the repository and connect it to
     origin    https://github.com/YourUsername/sedona-spatialbench.git (push)
     upstream  https://github.com/apache/sedona-spatialbench.git (fetch)
     upstream  https://github.com/apache/sedona-spatialbench.git (push)
-    ``` 
+    ```
 
 ## Development Setup
 
diff --git a/spatialbench-arrow/README.md b/spatialbench-arrow/README.md
@@ -24,18 +24,18 @@ This crate generates Spatial Bench data directly into [Apache Arrow] format usin
 [Apache Arrow]: https://arrow.apache.org/
 [arrow]: https://crates.io/crates/arrow
 
-# Example usage: 
+# Example usage:
 
 See [docs.rs page](https://docs.rs/tpchgen-arrow/latest/tpchgen_arrow/)
 
 # Testing:
 This crate ensures correct results using two methods.
 
 1. Basic functional tests are in Rust doc tests in the source code (`cargo test --doc`)
-2. The `reparse` integration test ensures that the Arrow generators 
-   produce the same results as parsing the original `tbl` format (`cargo test --test reparse`) 
+2. The `reparse` integration test ensures that the Arrow generators
+   produce the same results as parsing the original `tbl` format (`cargo test --test reparse`)
 
-# Contributing: 
+# Contributing:
 
 Please see [CONTRIBUTING.md] for more information on how to contribute to this project.
 
diff --git a/spatialbench-queries/spatial_polars.py b/spatialbench-queries/spatial_polars.py
@@ -21,7 +21,7 @@
 
 import spatial_polars  # NOQA:F401 needed to add spatial namespace to polars dataframes
 
-# for Q12 Spatial polars uses scipy's KDtree for KNN joins. 
+# for Q12 Spatial polars uses scipy's KDtree for KNN joins.
 # Scipy must be installed for this to work.
 # `pip install spatial-polars[knn]`
 # which is essentially the same as
diff --git a/spatialbench/data/README.md b/spatialbench/data/README.md
diff --git a/spatialbench/src/q_and_a/answers_sf1.rs b/spatialbench/src/q_and_a/answers_sf1.rs