From 8879b80ef64ac954731f7dd4effa0dcefc350dc0 Mon Sep 17 00:00:00 2001
From: Jeisson Leal <jeisson-javier.leal-rojas@ufz.de>
Date: Sun, 26 Apr 2026 18:00:23 +0200
Subject: [PATCH 1/3] Add ASV backend benchmarks

---
 .github/workflows/asv-benchmarks.yml |  48 ++++++++
 .gitignore                           |   3 +
 asv.conf.json                        |  23 ++++
 benchmarks/README.md                 |  64 ++++++++++
 benchmarks/__init__.py               |   1 +
 benchmarks/benchmark_backends.py     | 147 +++++++++++++++++++++++
 tools/asv_speedup_summary.py         | 173 +++++++++++++++++++++++++++
 7 files changed, 459 insertions(+)
 create mode 100644 .github/workflows/asv-benchmarks.yml
 create mode 100644 asv.conf.json
 create mode 100644 benchmarks/README.md
 create mode 100644 benchmarks/__init__.py
 create mode 100644 benchmarks/benchmark_backends.py
 create mode 100644 tools/asv_speedup_summary.py

diff --git a/.github/workflows/asv-benchmarks.yml b/.github/workflows/asv-benchmarks.yml
new file mode 100644
index 000000000..294dbff42
--- /dev/null
+++ b/.github/workflows/asv-benchmarks.yml
@@ -0,0 +1,48 @@
+name: ASV Benchmarks
+
+on:
+  workflow_dispatch:
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - uses: conda-incubator/setup-miniconda@v3
+        with:
+          activate-environment: asv
+          python-version: "3.12"
+          channels: conda-forge
+          auto-activate-base: false
+
+      - name: Install ASV
+        shell: bash -l {0}
+        run: |
+          conda install -y -c conda-forge asv
+
+      - name: Configure ASV machine
+        shell: bash -l {0}
+        run: |
+          asv machine --yes
+
+      - name: Run ASV benchmarks
+        shell: bash -l {0}
+        run: |
+          asv run
+
+      - name: Publish ASV report
+        shell: bash -l {0}
+        run: |
+          asv publish
+
+      - name: Upload ASV results
+        uses: actions/upload-artifact@v4
+        with:
+          name: asv-results
+          path: |
+            .asv/results/
+            .asv/html/
diff --git a/.gitignore b/.gitignore
index bcdc980be..aa7b7f9fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,6 +41,9 @@ htmlcov/
 .coverage
 .coverage.*
 .cache
+.asv/env/
+.asv/results/
+.asv/html/
 nosetests.xml
 coverage.xml
 *.cover
diff --git a/asv.conf.json b/asv.conf.json
new file mode 100644
index 000000000..211c4f51c
--- /dev/null
+++ b/asv.conf.json
@@ -0,0 +1,23 @@
+{
+  "version": 1,
+  "project": "GSTools",
+  "project_url": "https://github.com/jeilealr/GSTools",
+  "repo": ".",
+  "branches": ["main"],
+  "benchmark_dir": "benchmarks",
+  "env_dir": ".asv/env",
+  "results_dir": ".asv/results",
+  "html_dir": ".asv/html",
+  "show_commit_url": "https://github.com/jeilealr/GSTools/commit/",
+  "environment_type": "conda",
+  "conda_channels": ["conda-forge"],
+  "pythons": ["3.12"],
+  "matrix": {
+    "req": {
+      "numpy": [""]
+    }
+  },
+  "install_command": [
+    "in-dir={env_dir} python -m pip install {build_dir}[rust]"
+  ]
+}
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..1c36b5be3
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,64 @@
+# GSTools ASV Benchmarks
+
+This directory contains performance benchmarks for GSTools. Unit tests in
+`tests/` should remain focused on correctness; ASV benchmarks in this
+directory measure runtime and peak memory only.
+
+For a beginner-friendly explanation of ASV and every file added here, read
+[`ASV_TUTORIAL.md`](ASV_TUTORIAL.md).
+
+## Setup
+
+Run ASV commands from the GSTools repository root, where `asv.conf.json`
+is located:
+
+```bash
+cd /Users/lealroja/Documents/UFZ/MPS-Tools/GSTools
+conda install -c conda-forge asv
+asv machine
+```
+
+## Common Commands
+
+```bash
+asv run --quick
+asv run HEAD^!
+asv run
+asv publish
+asv preview
+asv compare HEAD~1 HEAD
+```
+
+`asv run --quick` is the quick development check for ASV 0.6.x. It runs each
+benchmark only once and does not save useful performance results.
+
+`asv run HEAD^!` benchmarks only the current commit. Plain `asv run` follows
+the branches configured in `asv.conf.json`.
+
+If you need to run ASV from another directory, pass the config explicitly:
+
+```bash
+asv --config /Users/lealroja/Documents/UFZ/MPS-Tools/GSTools/asv.conf.json run --quick
+```
+
+## Backend Comparison
+
+Benchmarks are parameterized with readable backend labels:
+
+- `cython_fallback`
+- `rust_core`
+
+ASV tracks each backend separately. Interpret Rust speedup on the same machine
+and same benchmark as:
+
+```text
+speedup = cython_fallback_time / rust_core_time
+```
+
+So:
+
+- `speedup > 1.0` means Rust is faster
+- `speedup = 1.0` means similar performance
+- `speedup < 1.0` means Rust is slower
+
+Do not compare absolute benchmark times across different machines.
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 000000000..c94e5d288
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+"""ASV benchmarks for GSTools."""
diff --git a/benchmarks/benchmark_backends.py b/benchmarks/benchmark_backends.py
new file mode 100644
index 000000000..2469fbd0c
--- /dev/null
+++ b/benchmarks/benchmark_backends.py
@@ -0,0 +1,147 @@
+"""Backend benchmarks for GSTools.
+
+Usage:
+    cd /my/path/to/GSTools
+    conda install -c conda-forge asv
+    asv machine
+    asv run --quick
+    asv run HEAD^!
+    asv run
+    asv publish
+    asv preview
+    asv compare HEAD~1 HEAD
+
+Backend speedup should be interpreted as:
+    speedup = cython_fallback_time / rust_core_time
+
+Values greater than 1.0 mean the Rust backend is faster on the same machine
+for the same benchmark and commit.
+"""
+
+from __future__ import annotations
+
+import contextlib
+
+import numpy as np
+
+import gstools as gs
+
+
+BACKENDS = ("cython_fallback", "rust_core")
+
+
+@contextlib.contextmanager
+def gstools_backend(use_core):
+    """Temporarily force either gstools-core or the Cython fallback."""
+    previous = (gs.config._GSTOOLS_CORE_AVAIL, gs.config.USE_GSTOOLS_CORE)
+    try:
+        if use_core:
+            if not previous[0]:
+                raise NotImplementedError("gstools_core is not available")
+            gs.config._GSTOOLS_CORE_AVAIL = True
+            gs.config.USE_GSTOOLS_CORE = True
+        else:
+            gs.config._GSTOOLS_CORE_AVAIL = False
+            gs.config.USE_GSTOOLS_CORE = False
+        yield
+    finally:
+        gs.config._GSTOOLS_CORE_AVAIL, gs.config.USE_GSTOOLS_CORE = previous
+
+
+def _use_core(backend):
+    if backend == "rust_core":
+        return True
+    if backend == "cython_fallback":
+        return False
+    raise ValueError(f"Unknown backend: {backend}")
+
+
+class BackendBenchmarks:
+    """Runtime and peak-memory benchmarks for backend-dispatched operations."""
+
+    params = BACKENDS
+    param_names = ["backend"]
+
+    def setup_cache(self):
+        """Create deterministic data once per benchmark environment."""
+        srf_x = np.random.RandomState(20220425).rand(2000) * 100.0
+        srf_y = np.random.RandomState(20220426).rand(2000) * 100.0
+
+        vario_x = np.random.RandomState(20220427).rand(900) * 100.0
+        vario_y = np.random.RandomState(20220428).rand(900) * 100.0
+        vario_field = np.sin(vario_x / 10.0) + np.cos(vario_y / 15.0)
+        vario_bins = np.linspace(0.0, 60.0, 16)
+
+        rng = np.random.RandomState(20220429)
+        cond_x = rng.rand(40) * 50.0
+        cond_y = rng.rand(40) * 50.0
+        cond_val = np.sin(cond_x / 8.0) + np.cos(cond_y / 9.0)
+        target_pos = (rng.rand(1000) * 50.0, rng.rand(1000) * 50.0)
+
+        return {
+            "srf": (srf_x, srf_y),
+            "variogram": ((vario_x, vario_y), vario_field, vario_bins),
+            "krige": ((cond_x, cond_y), cond_val, target_pos),
+        }
+
+    def setup(self, data, backend):
+        """Skip only the Rust parameter when gstools-core is unavailable."""
+        if backend == "rust_core" and not gs.config._GSTOOLS_CORE_AVAIL:
+            raise NotImplementedError("gstools_core is not available")
+
+    def time_srf(self, data, backend):
+        with gstools_backend(_use_core(backend)):
+            self._run_srf(data)
+
+    def peakmem_srf(self, data, backend):
+        with gstools_backend(_use_core(backend)):
+            self._run_srf(data)
+
+    def time_variogram(self, data, backend):
+        with gstools_backend(_use_core(backend)):
+            self._run_variogram(data)
+
+    def peakmem_variogram(self, data, backend):
+        with gstools_backend(_use_core(backend)):
+            self._run_variogram(data)
+
+    def time_krige(self, data, backend):
+        with gstools_backend(_use_core(backend)):
+            self._run_krige(data)
+
+    def peakmem_krige(self, data, backend):
+        with gstools_backend(_use_core(backend)):
+            self._run_krige(data)
+
+    def _run_srf(self, data):
+        x, y = data["srf"]
+        model = gs.Exponential(dim=2, var=2.0, len_scale=8.0)
+        srf = gs.SRF(model, mean=1.0, seed=20220425, mode_no=512)
+        return srf((x, y), mesh_type="unstructured")
+
+    def _run_variogram(self, data):
+        pos, field, bins = data["variogram"]
+        return gs.vario_estimate(
+            pos,
+            field,
+            bins,
+            mesh_type="unstructured",
+            return_counts=True,
+        )
+
+    def _run_krige(self, data):
+        cond_pos, cond_val, target_pos = data["krige"]
+        model = gs.Exponential(dim=2, var=1.5, len_scale=12.0, nugget=0.05)
+        krige = gs.Krige(
+            model,
+            cond_pos,
+            cond_val,
+            exact=False,
+            cond_err=0.05,
+        )
+        return krige(
+            target_pos,
+            mesh_type="unstructured",
+            return_var=True,
+            store=False,
+        )
diff --git a/tools/asv_speedup_summary.py b/tools/asv_speedup_summary.py
new file mode 100644
index 000000000..01ebf3700
--- /dev/null
+++ b/tools/asv_speedup_summary.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+"""Print Rust-vs-Cython speedups from local ASV result files.
+
+The summary is optional. ASV itself remains the source of truth for benchmark
+storage and visualization.
+
+Usage:
+    python tools/asv_speedup_summary.py
+    python tools/asv_speedup_summary.py --results-dir .asv/results
+
+Speedup is calculated as:
+    cython_fallback_time / rust_core_time
+
+Values greater than 1.0 mean Rust was faster on the same machine, commit,
+environment, and benchmark.
+"""
+
+from __future__ import annotations
+
+import argparse
+import itertools
+import json
+import math
+from pathlib import Path
+
+
+BACKENDS = ("cython_fallback", "rust_core")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--results-dir",
+        default=".asv/results",
+        type=Path,
+        help="Path to the ASV results directory.",
+    )
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="Include non-time benchmarks as ratios too.",
+    )
+    return parser.parse_args()
+
+
+def iter_result_files(results_dir):
+    for path in sorted(results_dir.glob("**/*.json")):
+        if path.name in {"benchmarks.json", "machine.json"}:
+            continue
+        yield path
+
+
+def load_json(path):
+    try:
+        with path.open(encoding="utf8") as handle:
+            return json.load(handle)
+    except json.JSONDecodeError:
+        return None
+
+
+def result_entry(raw_result, result_columns):
+    if isinstance(raw_result, dict):
+        return raw_result
+    if isinstance(raw_result, list) and result_columns:
+        return dict(zip(result_columns, raw_result))
+    return {"result": raw_result, "params": []}
+
+
+def is_number(value):
+    return isinstance(value, (int, float)) and not math.isnan(value)
+
+
+def backend_values(entry):
+    result = entry.get("result")
+    params = entry.get("params") or []
+    if not isinstance(result, list) or not params:
+        return {}
+
+    values = {}
+    combinations = itertools.product(*params)
+    for combo, value in zip(combinations, result):
+        if not is_number(value):
+            continue
+        combo_values = [str(item).strip("'\"") for item in combo]
+        for backend in BACKENDS:
+            if backend in combo_values:
+                values[backend] = float(value)
+    return values
+
+
+def short_benchmark_name(name):
+    return name.rsplit(".", maxsplit=1)[-1]
+
+
+def collect_speedups(results_dir, include_all):
+    rows = []
+    for path in iter_result_files(results_dir):
+        data = load_json(path)
+        if not data:
+            continue
+        result_columns = data.get("result_columns", [])
+        commit = data.get("commit_hash", "unknown")[:8]
+        env_name = data.get("env_name", path.stem)
+        results = data.get("results", {})
+        for benchmark, raw_result in results.items():
+            if not include_all and ".time_" not in benchmark:
+                continue
+            values = backend_values(result_entry(raw_result, result_columns))
+            cython = values.get("cython_fallback")
+            rust = values.get("rust_core")
+            if not is_number(cython) or not is_number(rust) or rust == 0:
+                continue
+            rows.append(
+                {
+                    "commit": commit,
+                    "env": env_name,
+                    "benchmark": short_benchmark_name(benchmark),
+                    "cython": cython,
+                    "rust": rust,
+                    "speedup": cython / rust,
+                }
+            )
+    return rows
+
+
+def print_table(rows):
+    if not rows:
+        print("No matching Rust-vs-Cython ASV results found.")
+        return
+
+    headers = [
+        "commit",
+        "env",
+        "benchmark",
+        "cython",
+        "rust",
+        "speedup",
+    ]
+    table = [
+        [
+            row["commit"],
+            row["env"],
+            row["benchmark"],
+            f"{row['cython']:.6g}",
+            f"{row['rust']:.6g}",
+            f"{row['speedup']:.3f}x",
+        ]
+        for row in rows
+    ]
+    widths = [
+        max(len(str(item)) for item in column)
+        for column in zip(headers, *table)
+    ]
+
+    def fmt(row):
+        return "  ".join(
+            str(item).ljust(width) for item, width in zip(row, widths)
+        )
+
+    print(fmt(headers))
+    print(fmt(["-" * width for width in widths]))
+    for row in table:
+        print(fmt(row))
+
+
+def main():
+    args = parse_args()
+    rows = collect_speedups(args.results_dir, args.all)
+    print_table(rows)
+
+
+if __name__ == "__main__":
+    main()

From bfba184514a13fdcad8037b1948afd88dc26fbef Mon Sep 17 00:00:00 2001
From: Jeisson Leal <jeisson-javier.leal-rojas@ufz.de>
Date: Thu, 7 May 2026 16:36:15 +0200
Subject: [PATCH 2/3] First benchmarking: current status, both backends, ASV
 and cProfile

---
 benchmarks/README.md                          | 451 ++++++++++++++++--
 benchmarks/benchmark_backends.py              | 226 ++++++---
 .../tools}/asv_speedup_summary.py             | 104 +++-
 .../tools/profile_benchmark_workflows.py      | 192 ++++++++
 pyproject.toml                                |   1 +
 5 files changed, 867 insertions(+), 107 deletions(-)
 rename {tools => benchmarks/tools}/asv_speedup_summary.py (56%)
 create mode 100644 benchmarks/tools/profile_benchmark_workflows.py

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 1c36b5be3..061062be4 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,64 +1,455 @@
-# GSTools ASV Benchmarks
+# GSTools Benchmark Guide
 
-This directory contains performance benchmarks for GSTools. Unit tests in
-`tests/` should remain focused on correctness; ASV benchmarks in this
-directory measure runtime and peak memory only.
+This directory contains the Airspeed Velocity ([ASV](https://github.com/airspeed-velocity/asv/)) benchmark suite for GSTools and a complementary profiling helper implemented with cProfile (part of the Python standard library).
 
-For a beginner-friendly explanation of ASV and every file added here, read
-[`ASV_TUTORIAL.md`](ASV_TUTORIAL.md).
+This is a measurement-first guide: benchmark real workflows, inspect the
+results, profile the slow paths, and then decide what to optimize.
+
+Unit tests in `tests/` answer "is the code correct?". The ASV benchmarks in
+`benchmarks/` answer "how fast is this workflow, how much memory does it use,
+and did that change across commits?". The complementary cProfile helper
+answers "inside this workflow, which Python functions are taking most of the
+time right now?".
+
+The benchmarks compare two GSTools backends, which gives more context for
+deciding where optimization work should go:
+
+- `cython_fallback`: the default Cython-backed fallback implementation from
+  [gstools-cython](https://github.com/GeoStat-Framework/GSTools-Cython).
+- `rust_core`: the Rust-backed implementation from
+  [gstools_core](https://github.com/GeoStat-Framework/GSTools-Core).
+
+## Index
+
+- [Setup](#setup)
+- [Benchmarking Scripts](#benchmarking-scripts)
+- [ASV Configuration](#asv-configuration)
+- [Benchmark Naming](#benchmark-naming)
+- [Benchmark Coverage](#benchmark-coverage)
+- [Benchmark Classes](#benchmark-classes)
+- [VariogramWorkflowBenchmarks](#variogramworkflowbenchmarks)
+- [KrigingWorkflowBenchmarks](#krigingworkflowbenchmarks)
+- [RandomFieldWorkflowBenchmarks](#randomfieldworkflowbenchmarks)
+- [Running The Benchmarks](#running-the-benchmarks)
+- [Profiling With cProfile](#profiling-with-cprofile)
+- [More ASV Commands](#more-asv-commands)
+- [External Reference](#external-reference)
 
 ## Setup
 
-Run ASV commands from the GSTools repository root, where `asv.conf.json`
-is located:
+The regular installation commands in the main `README.md` install GSTools for
+normal use. For benchmark work, install this local checkout with the optional
+benchmark dependencies.
+
+1. Move to the GSTools repository root:
 
 ```bash
-cd /Users/lealroja/Documents/UFZ/MPS-Tools/GSTools
-conda install -c conda-forge asv
-asv machine
+cd /path/to/GSTools
 ```
 
-## Common Commands
+2. Install GSTools in editable mode with the benchmark tooling and Rust backend:
 
 ```bash
-asv run --quick
-asv run HEAD^!
-asv run
-asv publish
-asv preview
-asv compare HEAD~1 HEAD
+python -m pip install -e ".[benchmark,rust]"
+```
+
+3. Create a machine profile once per computer:
+
+```bash
+asv machine --yes
+```
+
+Notes:
+
+- The machine profile records local hardware information so ASV can label
+  results correctly. Do not compare absolute times across different machines.
+- You can also install ASV with conda or pip, and you can install the Rust
+  backend package from
+  [gstools_core](https://github.com/GeoStat-Framework/GSTools-Core) directly.
+
+## Benchmarking Scripts
+
+The benchmarking setup currently consists of:
+
+- `asv.conf.json`: tells ASV how to build GSTools, where benchmarks live, where
+  to store results, and which Python/environment matrix to use.
+- `benchmarks/benchmark_backends.py`: contains the ASV benchmark classes.
+- `benchmarks/README.md`: this practical guide.
+- `benchmarks/tools/asv_speedup_summary.py`: reads `.asv/results/` and prints
+  Rust-vs-Cython speedup ratios.
+- `benchmarks/tools/profile_benchmark_workflows.py`: runs one representative
+  workflow from `benchmark_backends.py` under Python's built-in `cProfile`, so
+  you can see which functions take time in the current checkout.
+
+Do not run `benchmarks/benchmark_backends.py` directly with Python. ASV loads
+that file, discovers benchmark classes and methods, and runs them inside
+isolated benchmark environments. The scripts in `benchmarks/tools/` are
+different: run them directly with Python. The profiling helper can run against
+the current checkout at any time; the speedup-summary helper needs saved ASV
+results in `.asv/results/`.
+
+### ASV Configuration
+
+The repo root `asv.conf.json` is tailored to this GSTools checkout (excerpt):
+
+```json
+{
+  "repo": ".",
+  "branches": ["main"],
+  "benchmark_dir": "benchmarks",
+  "env_dir": ".asv/env",
+  "results_dir": ".asv/results",
+  "html_dir": ".asv/html",
+  "environment_type": "conda",
+  "pythons": ["3.12"],
+  "install_command": [
+    "in-dir={env_dir} python -m pip install {build_dir}[rust]"
+  ]
+}
+```
+
+Important details:
+
+- `install_command` installs the checked-out GSTools revision with the `[rust]`
+  extra, so `gstools_core` should be available for Rust backend measurements.
+  ASV still needs its own `install_command` because it creates isolated
+  environments for the commits it benchmarks.
+- This is separate from any editable install in your active development
+  environment, such as `python -m pip install -e ".[benchmark,rust]"`.
+  The editable install is only needed when you want your active environment to
+  import the current checkout directly, for example when running
+  `benchmarks/tools/profile_benchmark_workflows.py` with `--backend rust_core`.
+- ASV and the cProfile helper use different environments. ASV runs
+  `benchmarks/benchmark_backends.py` inside `.asv/env/`; the cProfile helper
+  imports the same benchmark classes but runs them in your active Python
+  environment.
+
+ASV creates these generated directories:
+
+```text
+.asv/env/      benchmark environments
+.asv/results/  local benchmark result JSON files
+.asv/html/     generated local benchmark website
 ```
 
-`asv run --quick` is the quick development check for ASV 0.6.x. It runs each
-benchmark only once and does not save useful performance results.
+Those directories are machine-specific generated artifacts. They should
+normally stay out of git.
+
+If needed, users can list more than one branch or Python version in
+`asv.conf.json`. For example:
 
-`asv run HEAD^!` benchmarks only the current commit. Plain `asv run` follows
-the branches configured in `asv.conf.json`.
+```json
+"branches": ["main", "my-feature-branch"]
+```
 
-If you need to run ASV from another directory, pass the config explicitly:
+Users can also benchmark any explicit branch, commit, tag, or range without
+changing `asv.conf.json`:
 
 ```bash
-asv --config /Users/lealroja/Documents/UFZ/MPS-Tools/GSTools/asv.conf.json run --quick
+asv run my-feature-branch^! --bench benchmark_backends
+asv run main..my-feature-branch --bench benchmark_backends
+```
+
+ASV checks out package code at each git commit being benchmarked. Commit source
+changes before benchmarking them with ASV. Otherwise ASV may benchmark the last
+committed package code rather than your uncommitted source changes.
+
+
+### Benchmark Naming
+
+ASV recognizes benchmark methods by name:
+
+- methods starting with `time_` measure runtime
+- methods starting with `peakmem_` measure peak memory
+- `setup_cache()` creates reusable data once per benchmark environment
+- `setup()` can skip or prepare individual parameter combinations
+
+## Benchmark Coverage
+
+### Shared Constants
+
+```python
+BACKENDS = ("cython_fallback", "rust_core")
+VARIOGRAM_CASES = (
+    "full_900",
+    "sampled_5000_to_1500",
+    "sampled_15000_to_4500",
+)
+KRIGE_CASES = ("small_30x500", "large_120x2000", "extra_large_360x6000")
+FIELD_CASES = (
+    "srf_unstructured_randmeth",
+    "srf_structured_randmeth",
+    "srf_structured_fourier",
+    "condsrf_unstructured",
+)
 ```
 
-## Backend Comparison
+These constants define parameter labels shown in ASV results.
 
-Benchmarks are parameterized with readable backend labels:
+`BACKENDS` compares:
 
 - `cython_fallback`
 - `rust_core`
 
-ASV tracks each backend separately. Interpret Rust speedup on the same machine
-and same benchmark as:
+### Shared Helpers
+
+`gstools_backend(use_core)` temporarily forces GSTools to use either the Cython
+fallback backend or the Rust `gstools_core` backend.
+
+`_random_points(seed, count, scale)` creates deterministic 2D point clouds.
+
+`_smooth_field(x, y)` creates deterministic synthetic values:
+
+```python
+np.sin(x / 10.0) + np.cos(y / 15.0)
+```
+
+`_make_variogram_data(...)` creates positions, field values, and bins for
+variogram estimation.
+
+`_make_krige_data(...)` creates conditioning points, conditioning values, and
+target points for kriging and conditioned random fields.
+
+The fixed random seeds are intentional. They keep benchmark inputs stable so
+changes in results are more likely to come from code changes, not new random
+data.
+
+### Benchmark Classes
+
+The ASV benchmarking is organized around workflow classes. Each workflow class
+compares `cython_fallback` and `rust_core`, and each class includes both
+runtime and peak-memory methods.
+
+The suite currently measures:
+
+- `VariogramWorkflowBenchmarks`: full pairwise work vs sampled large work
+- `KrigingWorkflowBenchmarks`: global kriging systems at three scales
+- `RandomFieldWorkflowBenchmarks`: unstructured SRF, structured SRF, Fourier
+  SRF, and conditioned SRF
+
+This keeps the ASV suite focused on representative workflows rather than
+separate duplicate backend checks.
+
+#### VariogramWorkflowBenchmarks
+
+This class measures variogram estimation cases:
+
+```text
+full_900
+sampled_5000_to_1500
+sampled_15000_to_4500
+```
+
+The labels mean:
+
+- `full_900`: create 900 scattered points and use all 900 points for the
+  variogram calculation.
+- `sampled_5000_to_1500`: create 5,000 scattered points, then randomly select
+  1,500 of those points for the variogram calculation.
+- `sampled_15000_to_4500`: create 15,000 scattered points, then randomly select
+  4,500 of those points for the variogram calculation.
+
+The sampled cases still represent larger input datasets, but the variogram
+calculation is done on the randomly selected subset so the pairwise work stays
+practical.
+
+#### KrigingWorkflowBenchmarks
+
+This class measures global kriging at three scales:
+
+```text
+small_30x500
+large_120x2000
+extra_large_360x6000
+```
+
+The labels mean:
+
+- `small_30x500`: 30 conditioning points, 500 target points
+- `large_120x2000`: 120 conditioning points, 2,000 target points
+- `extra_large_360x6000`: 360 conditioning points, 6,000 target points
+
+#### RandomFieldWorkflowBenchmarks
+
+This class measures SRF and CondSRF generation workflows:
+
+```text
+srf_unstructured_randmeth
+srf_structured_randmeth
+srf_structured_fourier
+condsrf_unstructured
+```
+
+The cases are:
+
+- `srf_unstructured_randmeth`: SRF using RandMeth on 2,000 unstructured points
+- `srf_structured_randmeth`: SRF using RandMeth on a 64 by 64 structured grid
+- `srf_structured_fourier`: SRF using the Fourier generator on a 64 by 64
+  structured grid
+- `condsrf_unstructured`: conditioned SRF with 40 conditioning points and 1,000
+  target points
+
+## Running The Benchmarks
+
+Check that the benchmark module imports and runs:
+
+```bash
+asv run --quick --show-stderr --bench benchmark_backends
+```
+
+Save a baseline for the current commit:
+
+```bash
+asv run HEAD^! --bench benchmark_backends
+```
+
+Run the last five commits on a linear branch:
+
+```bash
+asv run HEAD~5..HEAD --bench benchmark_backends
+```
+
+Build and open the local website:
+
+```bash
+asv publish
+asv preview
+```
+
+Then open the printed local URL, for example:
+
+```text
+http://127.0.0.1:8082/#/
+```
+(or any other `http://127.0.0.1:<port>/#/` URL shown by the running preview).
+
+After ASV has saved results, print explicit Rust-vs-Cython speedup ratios:
+
+```bash
+python benchmarks/tools/asv_speedup_summary.py
+```
+
+The helper reads `.asv/results/` and reports:
 
 ```text
 speedup = cython_fallback_time / rust_core_time
 ```
 
-So:
+Interpret the ratio as:
 
 - `speedup > 1.0` means Rust is faster
 - `speedup = 1.0` means similar performance
 - `speedup < 1.0` means Rust is slower
 
-Do not compare absolute benchmark times across different machines.
+The browser report shows ASV plots and trends. The speedup helper prints the
+backend ratio explicitly in the terminal. By default, it skips rows from older
+saved results that belong to legacy duplicate benchmarks removed since then.
+
+## Profiling With cProfile
+
+`cProfile` is useful for the current checkout. It does not update the ASV
+browser report. Instead, it prints a table in the terminal showing which Python
+functions consumed time while one workflow ran.
+
+The helper script is:
+
+```text
+benchmarks/tools/profile_benchmark_workflows.py
+```
+
+It imports the ASV benchmark classes from `benchmark_backends.py`, selects one
+case, forces one backend, and runs that case under `cProfile`.
+
+List available cases:
+
+```bash
+python benchmarks/tools/profile_benchmark_workflows.py --list
+```
+
+Profile selected cases:
+
+```bash
+python benchmarks/tools/profile_benchmark_workflows.py --case variogram-sampled --backend rust_core --limit 10
+python benchmarks/tools/profile_benchmark_workflows.py --case variogram-extra-large --backend rust_core --limit 10
+python benchmarks/tools/profile_benchmark_workflows.py --case krige-large --backend rust_core --limit 10
+python benchmarks/tools/profile_benchmark_workflows.py --case krige-extra-large --backend rust_core --limit 10
+python benchmarks/tools/profile_benchmark_workflows.py --case condsrf --backend rust_core --limit 10
+```
+
+Useful options:
+
+- `--case`: choose one workflow, or use `all`
+- `--backend`: choose `cython_fallback` or `rust_core`
+- `--limit`: number of function rows to print from the cProfile table
+- `--sort cumtime`: sort by cumulative time, usually the best first view
+- `--sort tottime`: sort by time spent directly in each function
+- `--repeat`: repeat a workflow inside the profiler
+
+For example, `--limit 10` means "print the top 10 function rows after sorting".
+
+## More ASV Commands
+
+Save results for only the current commit:
+
+```bash
+asv run HEAD^! --bench benchmark_backends
+```
+
+Compare current commit with previous commit:
+
+```bash
+asv run HEAD~1^! --bench benchmark_backends
+asv run HEAD^! --bench benchmark_backends
+asv compare HEAD~1 HEAD
+```
+
+Compare local `main` with the current branch tip:
+
+```bash
+asv run main^! --bench benchmark_backends
+asv run HEAD^! --bench benchmark_backends
+asv compare main HEAD
+```
+
+Compare remote `main` with the current branch tip:
+
+```bash
+git fetch origin main
+asv run origin/main^! --bench benchmark_backends
+asv run HEAD^! --bench benchmark_backends
+asv compare origin/main HEAD
+```
+
+On a linear branch, `HEAD~5..HEAD` benchmarks:
+
+```text
+HEAD~4
+HEAD~3
+HEAD~2
+HEAD~1
+HEAD
+```
+
+Run a selected list of commits:
+
+```bash
+git rev-parse HEAD HEAD~3 main e20c88f7 > /tmp/gstools-asv-commits.txt
+asv run HASHFILE:/tmp/gstools-asv-commits.txt --bench benchmark_backends
+```
+
+Use full commit hashes when sharing results. Short hashes and branch names are
+fine locally but can become ambiguous later.
+
+If running ASV from outside the repo root, pass the config explicitly:
+
+```bash
+asv --config /path/to/GSTools/asv.conf.json run --quick --bench benchmark_backends
+```
+
+## External Reference
+
+For complete ASV command syntax, see:
+
+```text
+https://asv.readthedocs.io/en/stable/commands.html
+```
diff --git a/benchmarks/benchmark_backends.py b/benchmarks/benchmark_backends.py
index 2469fbd0c..517c61893 100644
--- a/benchmarks/benchmark_backends.py
+++ b/benchmarks/benchmark_backends.py
@@ -1,11 +1,11 @@
-"""Backend benchmarks for GSTools.
+"""Workflow benchmarks for GSTools backends.
 
 Usage:
-    cd /my/path/to/GSTools
-    conda install -c conda-forge asv
-    asv machine
-    asv run --quick
-    asv run HEAD^!
+    cd /path/to/GSTools
+    python -m pip install -e ".[benchmark]"
+    asv machine --yes
+    asv run --quick --show-stderr --bench benchmark_backends
+    asv run HEAD^! --bench benchmark_backends
     asv run
     asv publish
     asv preview
@@ -28,6 +28,18 @@
 
 
 BACKENDS = ("cython_fallback", "rust_core")
+VARIOGRAM_CASES = (
+    "full_900",
+    "sampled_5000_to_1500",
+    "sampled_15000_to_4500",
+)
+KRIGE_CASES = ("small_30x500", "large_120x2000", "extra_large_360x6000")
+FIELD_CASES = (
+    "srf_unstructured_randmeth",
+    "srf_structured_randmeth",
+    "srf_structured_fourier",
+    "condsrf_unstructured",
+)
 
 
 @contextlib.contextmanager
@@ -56,81 +68,181 @@ def _use_core(backend):
     raise ValueError(f"Unknown backend: {backend}")
 
 
-class BackendBenchmarks:
-    """Runtime and peak-memory benchmarks for backend-dispatched operations."""
+def _random_points(seed, count, scale):
+    rng = np.random.RandomState(seed)
+    return rng.rand(count) * scale, rng.rand(count) * scale
 
-    params = BACKENDS
-    param_names = ["backend"]
 
-    def setup_cache(self):
-        """Create deterministic data once per benchmark environment."""
-        srf_x = np.random.RandomState(20220425).rand(2000) * 100.0
-        srf_y = np.random.RandomState(20220426).rand(2000) * 100.0
+def _smooth_field(x, y):
+    return np.sin(x / 10.0) + np.cos(y / 15.0)
+
+
+def _make_variogram_data(seed, count, scale=100.0):
+    x, y = _random_points(seed, count, scale)
+    field = _smooth_field(x, y)
+    bins = np.linspace(0.0, scale * 0.6, 16)
+    return (x, y), field, bins
+
 
-        vario_x = np.random.RandomState(20220427).rand(900) * 100.0
-        vario_y = np.random.RandomState(20220428).rand(900) * 100.0
-        vario_field = np.sin(vario_x / 10.0) + np.cos(vario_y / 15.0)
-        vario_bins = np.linspace(0.0, 60.0, 16)
+def _make_krige_data(seed, cond_count, target_count, scale=50.0):
+    rng = np.random.RandomState(seed)
+    cond_x = rng.rand(cond_count) * scale
+    cond_y = rng.rand(cond_count) * scale
+    cond_val = _smooth_field(cond_x, cond_y)
+    target_pos = (
+        rng.rand(target_count) * scale,
+        rng.rand(target_count) * scale,
+    )
+    return (cond_x, cond_y), cond_val, target_pos
 
-        rng = np.random.RandomState(20220429)
-        cond_x = rng.rand(40) * 50.0
-        cond_y = rng.rand(40) * 50.0
-        cond_val = np.sin(cond_x / 8.0) + np.cos(cond_y / 9.0)
-        target_pos = (rng.rand(1000) * 50.0, rng.rand(1000) * 50.0)
 
+class VariogramWorkflowBenchmarks:
+    """Variogram workflow benchmarks by case and backend."""
+
+    params = [VARIOGRAM_CASES, BACKENDS]
+    param_names = ["case", "backend"]
+
+    def setup_cache(self):
         return {
-            "srf": (srf_x, srf_y),
-            "variogram": ((vario_x, vario_y), vario_field, vario_bins),
-            "krige": ((cond_x, cond_y), cond_val, target_pos),
+            "full_900": _make_variogram_data(20220501, 900),
+            "sampled_5000_to_1500": _make_variogram_data(20220502, 5000),
+            "sampled_15000_to_4500": _make_variogram_data(20220503, 15000),
         }
 
-    def setup(self, data, backend):
-        """Skip only the Rust parameter when gstools-core is unavailable."""
+    def setup(self, data, case, backend):
         if backend == "rust_core" and not gs.config._GSTOOLS_CORE_AVAIL:
             raise NotImplementedError("gstools_core is not available")
 
-    def time_srf(self, data, backend):
+    def time_variogram_estimate(self, data, case, backend):
         with gstools_backend(_use_core(backend)):
-            self._run_srf(data)
+            self._run_variogram(data, case)
 
-    def peakmem_srf(self, data, backend):
+    def peakmem_variogram_estimate(self, data, case, backend):
         with gstools_backend(_use_core(backend)):
-            self._run_srf(data)
+            self._run_variogram(data, case)
+
+    def _run_variogram(self, data, case):
+        pos, field, bins = data[case]
+        kwargs = {}
+        if case == "sampled_5000_to_1500":
+            kwargs = {"sampling_size": 1500, "sampling_seed": 20220504}
+        if case == "sampled_15000_to_4500":
+            kwargs = {"sampling_size": 4500, "sampling_seed": 20220505}
+        return gs.vario_estimate(
+            pos,
+            field,
+            bins,
+            mesh_type="unstructured",
+            return_counts=True,
+            **kwargs,
+        )
+
 
-    def time_variogram(self, data, backend):
+class KrigingWorkflowBenchmarks:
+    """Global kriging workflow benchmarks by case and backend."""
+
+    params = [KRIGE_CASES, BACKENDS]
+    param_names = ["case", "backend"]
+
+    def setup_cache(self):
+        return {
+            "small_30x500": _make_krige_data(20220506, 30, 500),
+            "large_120x2000": _make_krige_data(20220507, 120, 2000),
+            "extra_large_360x6000": _make_krige_data(20220508, 360, 6000),
+        }
+
+    def setup(self, data, case, backend):
+        if backend == "rust_core" and not gs.config._GSTOOLS_CORE_AVAIL:
+            raise NotImplementedError("gstools_core is not available")
+
+    def time_global_krige(self, data, case, backend):
         with gstools_backend(_use_core(backend)):
-            self._run_variogram(data)
+            self._run_krige(data, case)
 
-    def peakmem_variogram(self, data, backend):
+    def peakmem_global_krige(self, data, case, backend):
         with gstools_backend(_use_core(backend)):
-            self._run_variogram(data)
+            self._run_krige(data, case)
+
+    def _run_krige(self, data, case):
+        cond_pos, cond_val, target_pos = data[case]
+        model = gs.Exponential(dim=2, var=1.5, len_scale=12.0, nugget=0.05)
+        krige = gs.Krige(
+            model,
+            cond_pos,
+            cond_val,
+            exact=False,
+            cond_err=0.05,
+        )
+        return krige(
+            target_pos,
+            mesh_type="unstructured",
+            return_var=True,
+            store=False,
+        )
+
+
+class RandomFieldWorkflowBenchmarks:
+    """SRF and CondSRF workflow benchmarks by case and backend."""
+
+    params = [FIELD_CASES, BACKENDS]
+    param_names = ["case", "backend"]
+
+    def setup_cache(self):
+        return {
+            "unstructured_pos": _random_points(20220509, 2000, 100.0),
+            "structured_pos": (
+                np.linspace(0.0, 100.0, 64),
+                np.linspace(0.0, 100.0, 64),
+            ),
+            "condsrf": _make_krige_data(20220510, 40, 1000),
+        }
 
-    def time_krige(self, data, backend):
+    def setup(self, data, case, backend):
+        if backend == "rust_core" and not gs.config._GSTOOLS_CORE_AVAIL:
+            raise NotImplementedError("gstools_core is not available")
+
+    def time_field_generation(self, data, case, backend):
         with gstools_backend(_use_core(backend)):
-            self._run_krige(data)
+            self._run_field(data, case)
 
-    def peakmem_krige(self, data, backend):
+    def peakmem_field_generation(self, data, case, backend):
         with gstools_backend(_use_core(backend)):
-            self._run_krige(data)
+            self._run_field(data, case)
+
+    def _run_field(self, data, case):
+        if case == "srf_unstructured_randmeth":
+            return self._run_srf_unstructured(data)
+        if case == "srf_structured_randmeth":
+            return self._run_srf_structured(data)
+        if case == "srf_structured_fourier":
+            return self._run_srf_fourier(data)
+        if case == "condsrf_unstructured":
+            return self._run_condsrf(data)
+        raise ValueError(f"Unknown field benchmark case: {case}")
 
-    def _run_srf(self, data):
-        x, y = data["srf"]
+    def _run_srf_unstructured(self, data):
         model = gs.Exponential(dim=2, var=2.0, len_scale=8.0)
-        srf = gs.SRF(model, mean=1.0, seed=20220425, mode_no=512)
-        return srf((x, y), mesh_type="unstructured")
+        srf = gs.SRF(model, mean=1.0, seed=20220508, mode_no=512)
+        return srf(data["unstructured_pos"], mesh_type="unstructured")
 
-    def _run_variogram(self, data):
-        pos, field, bins = data["variogram"]
-        return gs.vario_estimate(
-            pos,
-            field,
-            bins,
-            mesh_type="unstructured",
-            return_counts=True,
+    def _run_srf_structured(self, data):
+        model = gs.Exponential(dim=2, var=2.0, len_scale=8.0)
+        srf = gs.SRF(model, mean=1.0, seed=20220509, mode_no=512)
+        return srf(data["structured_pos"], mesh_type="structured")
+
+    def _run_srf_fourier(self, data):
+        model = gs.Gaussian(dim=2, var=2.0, len_scale=30.0)
+        srf = gs.SRF(
+            model,
+            generator="Fourier",
+            period=[100.0, 100.0],
+            mode_no=[32, 32],
+            seed=20220510,
         )
+        return srf(data["structured_pos"], mesh_type="structured")
 
-    def _run_krige(self, data):
-        cond_pos, cond_val, target_pos = data["krige"]
+    def _run_condsrf(self, data):
+        cond_pos, cond_val, target_pos = data["condsrf"]
         model = gs.Exponential(dim=2, var=1.5, len_scale=12.0, nugget=0.05)
         krige = gs.Krige(
             model,
@@ -139,9 +251,11 @@ def _run_krige(self, data):
             exact=False,
             cond_err=0.05,
         )
-        return krige(
+        cond_srf = gs.CondSRF(krige, seed=20220511, mode_no=512)
+        return cond_srf(
             target_pos,
             mesh_type="unstructured",
-            return_var=True,
+            seed=20220512,
             store=False,
+            krige_store=False,
         )
diff --git a/tools/asv_speedup_summary.py b/benchmarks/tools/asv_speedup_summary.py
similarity index 56%
rename from tools/asv_speedup_summary.py
rename to benchmarks/tools/asv_speedup_summary.py
index 01ebf3700..d341d2178 100644
--- a/tools/asv_speedup_summary.py
+++ b/benchmarks/tools/asv_speedup_summary.py
@@ -5,14 +5,15 @@
 storage and visualization.
 
 Usage:
-    python tools/asv_speedup_summary.py
-    python tools/asv_speedup_summary.py --results-dir .asv/results
+    python benchmarks/tools/asv_speedup_summary.py
+    python benchmarks/tools/asv_speedup_summary.py --results-dir .asv/results
+    python benchmarks/tools/asv_speedup_summary.py --include-legacy
 
 Speedup is calculated as:
     cython_fallback_time / rust_core_time
 
 Values greater than 1.0 mean Rust was faster on the same machine, commit,
-environment, and benchmark.
+environment, benchmark, and non-backend parameter combination.
 """
 
 from __future__ import annotations
@@ -25,6 +26,14 @@
 
 
 BACKENDS = ("cython_fallback", "rust_core")
+LEGACY_BENCHMARKS = {
+    "time_srf",
+    "peakmem_srf",
+    "time_variogram",
+    "peakmem_variogram",
+    "time_krige",
+    "peakmem_krige",
+}
 
 
 def parse_args():
@@ -40,6 +49,11 @@ def parse_args():
         action="store_true",
         help="Include non-time benchmarks as ratios too.",
     )
+    parser.add_argument(
+        "--include-legacy",
+        action="store_true",
+        help="Include removed BackendBenchmarks rows from older saved results.",
+    )
     return parser.parse_args()
 
 
@@ -70,6 +84,14 @@ def is_number(value):
     return isinstance(value, (int, float)) and not math.isnan(value)
 
 
+def flatten_values(values):
+    if isinstance(values, list):
+        for value in values:
+            yield from flatten_values(value)
+        return
+    yield values
+
+
 def backend_values(entry):
     result = entry.get("result")
     params = entry.get("params") or []
@@ -78,7 +100,7 @@ def backend_values(entry):
 
     values = {}
     combinations = itertools.product(*params)
-    for combo, value in zip(combinations, result):
+    for combo, value in zip(combinations, flatten_values(result)):
         if not is_number(value):
             continue
         combo_values = [str(item).strip("'\"") for item in combo]
@@ -88,11 +110,40 @@ def backend_values(entry):
     return values
 
 
+def backend_rows(entry):
+    result = entry.get("result")
+    params = entry.get("params") or []
+    if not isinstance(result, list) or not params:
+        return []
+
+    rows = []
+    combinations = itertools.product(*params)
+    for combo, value in zip(combinations, flatten_values(result)):
+        if not is_number(value):
+            continue
+        combo_values = [str(item).strip("'\"") for item in combo]
+        backend = next(
+            (candidate for candidate in BACKENDS if candidate in combo_values),
+            None,
+        )
+        if backend is None:
+            continue
+        case_values = [item for item in combo_values if item not in BACKENDS]
+        rows.append(
+            {
+                "backend": backend,
+                "case": "/".join(case_values) if case_values else "-",
+                "value": float(value),
+            }
+        )
+    return rows
+
+
 def short_benchmark_name(name):
     return name.rsplit(".", maxsplit=1)[-1]
 
 
-def collect_speedups(results_dir, include_all):
+def collect_speedups(results_dir, include_all, include_legacy):
     rows = []
     for path in iter_result_files(results_dir):
         data = load_json(path)
@@ -103,23 +154,32 @@ def collect_speedups(results_dir, include_all):
         env_name = data.get("env_name", path.stem)
         results = data.get("results", {})
         for benchmark, raw_result in results.items():
-            if not include_all and ".time_" not in benchmark:
+            benchmark_name = short_benchmark_name(benchmark)
+            if not include_legacy and benchmark_name in LEGACY_BENCHMARKS:
                 continue
-            values = backend_values(result_entry(raw_result, result_columns))
-            cython = values.get("cython_fallback")
-            rust = values.get("rust_core")
-            if not is_number(cython) or not is_number(rust) or rust == 0:
+            if not include_all and ".time_" not in benchmark:
                 continue
-            rows.append(
-                {
-                    "commit": commit,
-                    "env": env_name,
-                    "benchmark": short_benchmark_name(benchmark),
-                    "cython": cython,
-                    "rust": rust,
-                    "speedup": cython / rust,
-                }
-            )
+            by_case = {}
+            for row in backend_rows(result_entry(raw_result, result_columns)):
+                by_case.setdefault(row["case"], {})[row["backend"]] = row[
+                    "value"
+                ]
+            for case, values in by_case.items():
+                cython = values.get("cython_fallback")
+                rust = values.get("rust_core")
+                if not is_number(cython) or not is_number(rust) or rust == 0:
+                    continue
+                rows.append(
+                    {
+                        "commit": commit,
+                        "env": env_name,
+                        "benchmark": benchmark_name,
+                        "case": case,
+                        "cython": cython,
+                        "rust": rust,
+                        "speedup": cython / rust,
+                    }
+                )
     return rows
 
 
@@ -132,6 +192,7 @@ def print_table(rows):
         "commit",
         "env",
         "benchmark",
+        "case",
         "cython",
         "rust",
         "speedup",
@@ -141,6 +202,7 @@ def print_table(rows):
             row["commit"],
             row["env"],
             row["benchmark"],
+            row["case"],
             f"{row['cython']:.6g}",
             f"{row['rust']:.6g}",
             f"{row['speedup']:.3f}x",
@@ -165,7 +227,7 @@ def fmt(row):
 
 def main():
     args = parse_args()
-    rows = collect_speedups(args.results_dir, args.all)
+    rows = collect_speedups(args.results_dir, args.all, args.include_legacy)
     print_table(rows)
 
 
diff --git a/benchmarks/tools/profile_benchmark_workflows.py b/benchmarks/tools/profile_benchmark_workflows.py
new file mode 100644
index 000000000..7f953e0dd
--- /dev/null
+++ b/benchmarks/tools/profile_benchmark_workflows.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+"""Profile the representative GSTools benchmark workflows with cProfile.
+
+This is a quick measurement helper. ASV remains the source of truth for saved
+benchmark results, while this script helps identify the top cumulative Python
+call sites before making algorithmic changes.
+
+Usage:
+    cd /path/to/MPS-Tools/GSTools
+    python benchmarks/tools/profile_benchmark_workflows.py --list
+    python benchmarks/tools/profile_benchmark_workflows.py --case variogram-sampled
+    python benchmarks/tools/profile_benchmark_workflows.py --case krige-large \
+        --backend rust_core --limit 30
+"""
+
+from __future__ import annotations
+
+import argparse
+import cProfile
+from pathlib import Path
+import pstats
+import sys
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(REPO_ROOT))
+sys.path.insert(0, str(REPO_ROOT / "src"))
+
+
+CASES = {
+    "variogram-full": (
+        "VariogramWorkflowBenchmarks",
+        "time_variogram_estimate",
+        ("full_900",),
+    ),
+    "variogram-sampled": (
+        "VariogramWorkflowBenchmarks",
+        "time_variogram_estimate",
+        ("sampled_5000_to_1500",),
+    ),
+    "variogram-extra-large": (
+        "VariogramWorkflowBenchmarks",
+        "time_variogram_estimate",
+        ("sampled_15000_to_4500",),
+    ),
+    "krige-small": (
+        "KrigingWorkflowBenchmarks",
+        "time_global_krige",
+        ("small_30x500",),
+    ),
+    "krige-large": (
+        "KrigingWorkflowBenchmarks",
+        "time_global_krige",
+        ("large_120x2000",),
+    ),
+    "krige-extra-large": (
+        "KrigingWorkflowBenchmarks",
+        "time_global_krige",
+        ("extra_large_360x6000",),
+    ),
+    "srf-unstructured": (
+        "RandomFieldWorkflowBenchmarks",
+        "time_field_generation",
+        ("srf_unstructured_randmeth",),
+    ),
+    "srf-structured": (
+        "RandomFieldWorkflowBenchmarks",
+        "time_field_generation",
+        ("srf_structured_randmeth",),
+    ),
+    "srf-fourier": (
+        "RandomFieldWorkflowBenchmarks",
+        "time_field_generation",
+        ("srf_structured_fourier",),
+    ),
+    "condsrf": (
+        "RandomFieldWorkflowBenchmarks",
+        "time_field_generation",
+        ("condsrf_unstructured",),
+    ),
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--case",
+        default="all",
+        choices=["all", *CASES],
+        help="Workflow to profile. Defaults to all workflows.",
+    )
+    parser.add_argument(
+        "--repeat",
+        default=1,
+        type=int,
+        help="Number of times to run each selected workflow.",
+    )
+    parser.add_argument(
+        "--limit",
+        default=25,
+        type=int,
+        help="Number of cProfile rows to print per workflow.",
+    )
+    parser.add_argument(
+        "--sort",
+        default="cumtime",
+        choices=["cumtime", "tottime", "calls"],
+        help="pstats sort key.",
+    )
+    parser.add_argument(
+        "--backend",
+        default="rust_core",
+        choices=["cython_fallback", "rust_core"],
+        help="Backend label to force while profiling.",
+    )
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="List available workflow cases and exit.",
+    )
+    return parser.parse_args()
+
+
+def iter_selected(case):
+    if case == "all":
+        yield from CASES.items()
+        return
+    yield case, CASES[case]
+
+
+def load_suite_class(class_name):
+    try:
+        from benchmarks import benchmark_backends
+    except ModuleNotFoundError as err:
+        print(
+            "Could not import GSTools benchmark dependencies. Activate the "
+            "GSTools benchmark environment or install the project dependencies "
+            f"first. Original error: {err}",
+            file=sys.stderr,
+        )
+        raise SystemExit(1) from err
+    return getattr(benchmark_backends, class_name)
+
+
+def run_case(
+    name,
+    class_name,
+    method_name,
+    params,
+    repeat,
+    limit,
+    sort,
+    backend,
+):
+    suite_cls = load_suite_class(class_name)
+    suite = suite_cls()
+    data = suite.setup_cache()
+    method = getattr(suite, method_name)
+
+    profiler = cProfile.Profile()
+    profiler.enable()
+    for _ in range(repeat):
+        method(data, *params, backend)
+    profiler.disable()
+
+    print(f"\n== {name} [{backend}] ==")
+    stats = pstats.Stats(profiler, stream=sys.stdout)
+    stats.strip_dirs().sort_stats(sort).print_stats(limit)
+
+
+def main():
+    args = parse_args()
+    if args.list:
+        for name in CASES:
+            print(name)
+        return
+
+    for name, (suite_cls, method_name, params) in iter_selected(args.case):
+        run_case(
+            name,
+            suite_cls,
+            method_name,
+            params,
+            args.repeat,
+            args.limit,
+            args.sort,
+            args.backend,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 885bcd774..f39ea2eab 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,6 +66,7 @@ doc = [
 ]
 plotting = ["matplotlib>=3.7", "pyvista>=0.40"]
 rust = ["gstools_core>=1.0.0"]
+benchmark = ["asv"]
 test = ["pytest-cov>=3"]
 lint = ["ruff"]
 

From f8870650ec99193498736a3a5061a28920b0d726 Mon Sep 17 00:00:00 2001
From: Jeisson Leal <jeisson-javier.leal-rojas@ufz.de>
Date: Fri, 15 May 2026 16:53:34 +0200
Subject: [PATCH 3/3] fix parallelisation in benchmarking, adding macos
 parallelisation

---
 .gitignore                                    |   5 +-
 asv.conf.json                                 |  11 +-
 asv.macos-openmp.conf.json                    |  36 ++
 benchmarks/README.md                          | 462 +++++++++++++++---
 benchmarks/benchmark_backends.py              | 100 +++-
 benchmarks/tools/asv_speedup_summary.py       |  28 +-
 benchmarks/tools/check_cython_openmp.py       | 103 ++++
 .../tools/install_macos_openmp_cython.py      | 139 ++++++
 .../tools/profile_benchmark_workflows.py      |  36 +-
 9 files changed, 798 insertions(+), 122 deletions(-)
 create mode 100644 asv.macos-openmp.conf.json
 create mode 100644 benchmarks/tools/check_cython_openmp.py
 create mode 100644 benchmarks/tools/install_macos_openmp_cython.py

diff --git a/.gitignore b/.gitignore
index aa7b7f9fd..1cec6e510 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,9 +41,8 @@ htmlcov/
 .coverage
 .coverage.*
 .cache
-.asv/env/
-.asv/results/
-.asv/html/
+.asv/*
+.asv-openmp/*
 nosetests.xml
 coverage.xml
 *.cover
diff --git a/asv.conf.json b/asv.conf.json
index 211c4f51c..289cc426a 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -14,10 +14,17 @@
   "pythons": ["3.12"],
   "matrix": {
     "req": {
-      "numpy": [""]
+      "emcee": [""],
+      "hankel": [""],
+      "meshio": [""],
+      "numpy": [""],
+      "pyevtk": [""],
+      "scipy": [""],
+      "gstools-cython": [""]
     }
   },
   "install_command": [
-    "in-dir={env_dir} python -m pip install {build_dir}[rust]"
+    "in-dir={env_dir} python -m pip install gstools_core>=1.0.0",
+    "in-dir={env_dir} python -m pip install --no-deps {build_dir}"
   ]
 }
diff --git a/asv.macos-openmp.conf.json b/asv.macos-openmp.conf.json
new file mode 100644
index 000000000..9df8067ab
--- /dev/null
+++ b/asv.macos-openmp.conf.json
@@ -0,0 +1,36 @@
+{
+  "version": 1,
+  "project": "GSTools",
+  "project_url": "https://github.com/jeilealr/GSTools",
+  "repo": ".",
+  "branches": ["main"],
+  "benchmark_dir": "benchmarks",
+  "env_dir": ".asv-openmp/env",
+  "results_dir": ".asv-openmp/results",
+  "html_dir": ".asv-openmp/html",
+  "show_commit_url": "https://github.com/jeilealr/GSTools/commit/",
+  "environment_type": "conda",
+  "conda_channels": ["conda-forge"],
+  "pythons": ["3.12"],
+  "matrix": {
+    "req": {
+      "cython": [""],
+      "emcee": [""],
+      "extension-helpers": [""],
+      "hankel": [""],
+      "llvm-openmp": [""],
+      "meshio": [""],
+      "numpy": [""],
+      "pyevtk": [""],
+      "scipy": [""],
+      "setuptools": [""],
+      "wheel": [""]
+    }
+  },
+  "install_command": [
+    "in-dir={env_dir} python -m pip install gstools_core>=1.0.0",
+    "in-dir={env_dir} python {conf_dir}/benchmarks/tools/install_macos_openmp_cython.py {env_dir}",
+    "in-dir={env_dir} python {conf_dir}/benchmarks/tools/check_cython_openmp.py --fail-if-no-openmp",
+    "in-dir={env_dir} python -m pip install --no-deps {build_dir}"
+  ]
+}
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 061062be4..b4df15b6a 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -2,8 +2,8 @@
 
 This directory contains the Airspeed Velocity ([ASV](https://github.com/airspeed-velocity/asv/)) benchmark suite for GSTools and a complementary profiling helper implemented with cProfile (part of the Python standard library).
 
-This is a measurement-first guide: benchmark real workflows, inspect the
-results, profile the slow paths, and then decide what to optimize.
+This guide shows how to benchmark GSTools, inspect the
+results, profile where runtime is spent, and then decide what to optimize.
 
 Unit tests in `tests/` answer "is the code correct?". The ASV benchmarks in
 `benchmarks/` answer "how fast is this workflow, how much memory does it use,
@@ -23,23 +23,54 @@ deciding where optimization work should go:
 
 - [Setup](#setup)
 - [Benchmarking Scripts](#benchmarking-scripts)
-- [ASV Configuration](#asv-configuration)
-- [Benchmark Naming](#benchmark-naming)
+  - [ASV Configuration](#asv-configuration)
+  - [Benchmark Naming](#benchmark-naming)
 - [Benchmark Coverage](#benchmark-coverage)
-- [Benchmark Classes](#benchmark-classes)
-- [VariogramWorkflowBenchmarks](#variogramworkflowbenchmarks)
-- [KrigingWorkflowBenchmarks](#krigingworkflowbenchmarks)
-- [RandomFieldWorkflowBenchmarks](#randomfieldworkflowbenchmarks)
+  - [Shared Constants](#shared-constants)
+  - [Shared Helpers](#shared-helpers)
+  - [Benchmark Classes](#benchmark-classes)
+    - [VariogramWorkflowBenchmarks](#variogramworkflowbenchmarks)
+    - [KrigingWorkflowBenchmarks](#krigingworkflowbenchmarks)
+    - [RandomFieldWorkflowBenchmarks](#randomfieldworkflowbenchmarks)
 - [Running The Benchmarks](#running-the-benchmarks)
-- [Profiling With cProfile](#profiling-with-cprofile)
+  - [Baseline Benchmark](#baseline-benchmark)
+    - [Current Commit Baseline](#current-commit-baseline)
+    - [Several Commits Baseline](#several-commits-baseline)
+    - [Summary of Results](#summary-of-results)
+    - [Visualization of Results](#visualization-of-results)
+    - [Profiling With cProfile](#profiling-with-cprofile)
+- [Optional Parallelisation with OpenMP](#optional-parallelisation-with-openmp)
+  - [Shared OpenMP Rule](#shared-openmp-rule)
+  - [macOS Example](#macos-example)
+    - [What The macOS OpenMP Config Does](#what-the-macos-openmp-config-does)
+    - [Run On macOS](#run-on-macos)
+    - [Interpreting The macOS OpenMP Run](#interpreting-the-macos-openmp-run)
+  - [Windows Example](#windows-example)
+  - [Linux Example](#linux-example)
+  - [HPC Example](#hpc-example)
+  - [Profiling With cProfile for Multiple Threads](#profiling-with-cprofile-for-multiple-threads)
 - [More ASV Commands](#more-asv-commands)
 - [External Reference](#external-reference)
 
 ## Setup
 
 The regular installation commands in the main `README.md` install GSTools for
-normal use. For benchmark work, install this local checkout with the optional
-benchmark dependencies.
+normal use. This benchmark guide uses conda because `asv.conf.json` sets
+`environment_type` to `"conda"`, so ASV uses conda to create the isolated environments for the commits it measures.
+
+The default benchmark configuration intentionally compares both backends with
+one GSTools thread:
+
+```text
+gstools.config.NUM_THREADS = 1
+```
+
+That keeps the first comparison simple: Cython fallback vs Rust core without
+parallelism as a confounding factor. Parallel/OpenMP scaling is treated as a
+separate optional experiment because the correct Cython OpenMP build depends on
+the user's operating system, compiler, and runtime environment.
+
+To run the benchmark and the optional cProfile helper, follow these steps:
 
 1. Move to the GSTools repository root:
 
@@ -47,25 +78,28 @@ benchmark dependencies.
 cd /path/to/GSTools
 ```
 
-2. Install GSTools in editable mode with the benchmark tooling and Rust backend:
+2. Create and activate a conda environment for local benchmark work:
 
 ```bash
-python -m pip install -e ".[benchmark,rust]"
+conda create -n gstools-benchmark -c conda-forge python=3.12 asv packaging
+conda activate gstools-benchmark
 ```
 
-3. Create a machine profile once per computer:
+If you already have a suitable conda environment, activate that instead.
+
+3. If you use an existing environment, make sure ASV is installed:
 
 ```bash
-asv machine --yes
+conda install -c conda-forge asv
 ```
 
-Notes:
+4. Create a machine profile once per computer:
 
-- The machine profile records local hardware information so ASV can label
-  results correctly. Do not compare absolute times across different machines.
-- You can also install ASV with conda or pip, and you can install the Rust
-  backend package from
-  [gstools_core](https://github.com/GeoStat-Framework/GSTools-Core) directly.
+```bash
+asv machine --yes
+```
+
+The machine profile records local hardware information so ASV can label results correctly. Do not compare absolute times across different machines.
 
 ## Benchmarking Scripts
 
@@ -73,6 +107,8 @@ The benchmarking setup currently consists of:
 
 - `asv.conf.json`: tells ASV how to build GSTools, where benchmarks live, where
   to store results, and which Python/environment matrix to use.
+- `asv.macos-openmp.conf.json`: optional macOS-specific ASV configuration that
+  builds `gstools-cython` from source with OpenMP inside ASV's own environment.
 - `benchmarks/benchmark_backends.py`: contains the ASV benchmark classes.
 - `benchmarks/README.md`: this practical guide.
 - `benchmarks/tools/asv_speedup_summary.py`: reads `.asv/results/` and prints
@@ -80,13 +116,13 @@ The benchmarking setup currently consists of:
 - `benchmarks/tools/profile_benchmark_workflows.py`: runs one representative
   workflow from `benchmark_backends.py` under Python's built-in `cProfile`, so
   you can see which functions take time in the current checkout.
+- `benchmarks/tools/check_cython_openmp.py`: optional helper for checking
+  whether the active Python environment's GSTools-Cython extensions detect
+  OpenMP parallel support.
+- `benchmarks/tools/install_macos_openmp_cython.py`: helper used only by
+  `asv.macos-openmp.conf.json` to compile `gstools-cython` with `llvm-openmp`
+  on macOS.
 
-Do not run `benchmarks/benchmark_backends.py` directly with Python. ASV loads
-that file, discovers benchmark classes and methods, and runs them inside
-isolated benchmark environments. The scripts in `benchmarks/tools/` are
-different: run them directly with Python. The profiling helper can run against
-the current checkout at any time; the speedup-summary helper needs saved ASV
-results in `.asv/results/`.
 
 ### ASV Configuration
 
@@ -102,27 +138,46 @@ The repo root `asv.conf.json` is tailored to this GSTools checkout:
   "html_dir": ".asv/html",
   "environment_type": "conda",
   "pythons": ["3.12"],
+  "matrix": {
+    "req": {
+      "emcee": [""],
+      "hankel": [""],
+      "meshio": [""],
+      "numpy": [""],
+      "pyevtk": [""],
+      "scipy": [""],
+      "gstools-cython": [""]
+    }
+  },
   "install_command": [
-    "in-dir={env_dir} python -m pip install {build_dir}[rust]"
+    "in-dir={env_dir} python -m pip install gstools_core>=1.0.0",
+    "in-dir={env_dir} python -m pip install --no-deps {build_dir}"
   ]
 }
 ```
 
 Important details:
 
-- `install_command` installs the checked-out GSTools revision with the `[rust]`
-  extra, so `gstools_core` should be available for Rust backend measurements.
-  ASV still needs its own `install_command` because it creates isolated
+- `environment_type: "conda"` means conda is required for the ASV workflow in
+  this guide. ASV creates isolated conda environments for the commits it
+  benchmarks.
+- `pythons: ["3.12"]` means ASV creates Python 3.12 benchmark environments.
+  Keep this pinned unless you intentionally validate a newer Python/GSTools
+  backend stack.
+- `matrix.req` asks ASV to install GSTools runtime dependencies before
+  installing the checked-out GSTools source. It includes `gstools-cython`
+  explicitly because the GSTools commit is installed with `--no-deps`.
+- `{build_dir}` is ASV's temporary checkout/build directory for the exact
+  GSTools commit being benchmarked.
+- `install_command` installs the checked-out GSTools revision with `--no-deps`.
+  It also installs `gstools_core` with pip because `gstools-core` is not
+  available as a conda package in every solver/platform combination.
+- ASV still needs its own `install_command` because it creates isolated
   environments for the commits it benchmarks.
-- This is separate from any editable install in your active development
-  environment, such as `python -m pip install -e ".[benchmark,rust]"`.
-  The editable install is only needed when you want your active environment to
-  import the current checkout directly, for example when running
-  `benchmarks/tools/profile_benchmark_workflows.py` with `--backend rust_core`.
-- ASV and the cProfile helper use different environments. ASV runs
-  `benchmarks/benchmark_backends.py` inside `.asv/env/`; the cProfile helper
-  imports the same benchmark classes but runs them in your active Python
-  environment.
+- Run the cProfile helper with the Python executable from ASV's isolated
+  environment, for example `.asv/env/<env-id>/bin/python`. In that mode, the
+  ASV environment provides dependencies while the helper imports the current
+  checkout through the repo `src/` path.
 
 ASV creates these generated directories:
 
@@ -154,7 +209,6 @@ ASV checks out package code at each git commit being benchmarked. Commit source
 changes before benchmarking them with ASV. Otherwise ASV may benchmark the last
 committed package code rather than your uncommitted source changes.
 
-
 ### Benchmark Naming
 
 ASV recognizes benchmark methods by name:
@@ -166,10 +220,21 @@ ASV recognizes benchmark methods by name:
 
 ## Benchmark Coverage
 
+This section describes what is measured by the ASV suite and how the benchmark
+labels map to real GSTools workflows. The goal is to cover representative
+operations that are relevant for geostatistical work, not isolated
+micro-functions.
+
+The current suite measures runtime and peak memory for variogram estimation,
+global kriging, spatial random field generation, and conditioned random field
+generation. Each workflow is run with both backends so the results can show
+both absolute performance and Rust-vs-Cython differences.
+
 ### Shared Constants
 
 ```python
 BACKENDS = ("cython_fallback", "rust_core")
+THREAD_COUNTS = _configured_thread_counts()
 VARIOGRAM_CASES = (
     "full_900",
     "sampled_5000_to_1500",
@@ -191,10 +256,18 @@ These constants define parameter labels shown in ASV results.
 - `cython_fallback`
 - `rust_core`
 
+`THREAD_COUNTS` defaults to:
+
+- `threads_1`: force `gstools.config.NUM_THREADS = 1`
+
+That is the default because the first benchmark target is a clean Cython-vs-Rust
+backend comparison without parallelism.
+
 ### Shared Helpers
 
-`gstools_backend(use_core)` temporarily forces GSTools to use either the Cython
-fallback backend or the Rust `gstools_core` backend.
+`gstools_backend(use_core, num_threads)` temporarily forces GSTools to use
+either the Cython fallback backend or the Rust `gstools_core` backend, and
+sets `gstools.config.NUM_THREADS` for that benchmark run.
 
 `_random_points(seed, count, scale)` creates deterministic 2D point clouds.
 
@@ -291,45 +364,39 @@ The cases are:
 
 ## Running The Benchmarks
 
-Check that the benchmark module imports and runs:
+### Baseline Benchmark
 
-```bash
-asv run --quick --show-stderr --bench benchmark_backends
-```
+The baseline benchmark is the first result set to create before doing any
+optimization work. It uses the default ASV configuration, so each workflow is
+measured with `threads_1` for both `cython_fallback` and `rust_core`.
+
+#### Current Commit Baseline
 
-Save a baseline for the current commit:
+- Save a baseline for the current commit:
 
 ```bash
 asv run HEAD^! --bench benchmark_backends
 ```
 
-Run the last five commits on a linear branch:
+#### Several Commits Baseline
 
-```bash
-asv run HEAD~5..HEAD --bench benchmark_backends
-```
+As mentioned previously, ASV can also compare several commits. Here we run the last five commits:
 
-Build and open the local website:
+- Run the last five commits on the current branch:
 
 ```bash
-asv publish
-asv preview
+asv run HEAD~5..HEAD --bench benchmark_backends
 ```
 
-Then open the printed local URL,  for example:
-
-```text
-http://127.0.0.1:8082/#/
-```
-(or any other `http://127.0.0.1:<port>/#/` URL shown by the running preview).
+#### Summary of Results
 
-After ASV has saved results, print explicit Rust-vs-Cython speedup ratios:
+After running ASV, inspect the explicit Rust-vs-Cython speedup ratios:
 
 ```bash
 python benchmarks/tools/asv_speedup_summary.py
 ```
 
-The helper reads `.asv/results/` and reports:
+The helper reads `.asv/results/` and reports ratios per case and thread label:
 
 ```text
 speedup = cython_fallback_time / rust_core_time
@@ -341,14 +408,39 @@ Interpret the ratio as:
 - `speedup = 1.0` means similar performance
 - `speedup < 1.0` means Rust is slower
 
-The browser report shows ASV plots and trends. The speedup helper prints the
-backend ratio explicitly in the terminal. By default, the helper skips removed
-legacy duplicate rows from older saved results.
+The speedup helper prints the backend ratio explicitly in the terminal. By
+default, the helper skips removed legacy duplicate rows from older saved
+results.
 
-## Profiling With cProfile
+#### Visualization of Results
 
-`cProfile` is useful for the current checkout. It does not update the ASV
-browser report. Instead, it prints a table in the terminal showing which Python
+You can inspect the results in the ASV browser report by building and opening
+the local website:
+
+```bash
+asv publish
+asv preview
+```
+
+Then open the printed local URL, for example:
+
+```text
+http://127.0.0.1:8082/#/
+```
+(or any other `http://127.0.0.1:<port>/#/` URL shown by the running preview).
+
+The browser report shows ASV plots and trends. ASV plot views do not draw a line/graph when there is only one x-axis point; therefore, running `asv run HEAD^! --bench benchmark_backends` alone will most likely not display any graphs.
+
+For the default benchmark run, the `threads` column should show `threads_1`.
+If you later run the
+[optional OpenMP scaling experiment](#optional-parallelisation-with-openmp),
+the same column can be used to compare several thread counts.
+
+
+### Profiling With cProfile
+
+`cProfile` does not update the ASV results shown in the browser report.
+Instead, it prints a table in the terminal showing which Python
 functions consumed time while one workflow ran.
 
 The helper script is:
@@ -360,26 +452,244 @@ benchmarks/tools/profile_benchmark_workflows.py
 It imports the ASV benchmark classes from `benchmark_backends.py`, selects one
 case, forces one backend, and runs that case under `cProfile`.
 
+Since ASV has already created an isolated Python environment, select that
+environment to execute the profiling helper:
+
+```bash
+ASV_ENV="$(ls -td .asv/env/* | head -n 1)"
+ASV_PYTHON="$ASV_ENV/bin/python"
+```
+
+The helper still profiles the current checkout because
+`profile_benchmark_workflows.py` adds the repository `src/` directory to
+`sys.path`. The ASV environment provides the installed dependencies, including
+`gstools-cython` and `gstools_core`.
+
 List available cases:
 
 ```bash
-python benchmarks/tools/profile_benchmark_workflows.py --list
+"$ASV_PYTHON" benchmarks/tools/profile_benchmark_workflows.py --list
 ```
 
-Profile selected cases:
+Example profiling commands for selected cases:
 
 ```bash
-python benchmarks/tools/profile_benchmark_workflows.py --case variogram-sampled --backend rust_core --limit 10
-python benchmarks/tools/profile_benchmark_workflows.py --case variogram-extra-large --backend rust_core --limit 10
-python benchmarks/tools/profile_benchmark_workflows.py --case krige-large --backend rust_core --limit 10
-python benchmarks/tools/profile_benchmark_workflows.py --case krige-extra-large --backend rust_core --limit 10
-python benchmarks/tools/profile_benchmark_workflows.py --case condsrf --backend rust_core --limit 10
+"$ASV_PYTHON" benchmarks/tools/profile_benchmark_workflows.py --case variogram-sampled --backend rust_core --threads threads_1 --limit 10
+"$ASV_PYTHON" benchmarks/tools/profile_benchmark_workflows.py --case variogram-extra-large --backend rust_core --threads threads_1 --limit 10
+"$ASV_PYTHON" benchmarks/tools/profile_benchmark_workflows.py --case krige-large --backend rust_core --threads threads_1 --limit 10
+"$ASV_PYTHON" benchmarks/tools/profile_benchmark_workflows.py --case krige-extra-large --backend rust_core --threads threads_1 --limit 10
+"$ASV_PYTHON" benchmarks/tools/profile_benchmark_workflows.py --case condsrf --backend rust_core --threads threads_1 --limit 10
+```
+
+## Optional Parallelisation with OpenMP
+
+This section collects optional workflows for testing Cython and Rust with
+several thread counts. OpenMP setup is platform-dependent, so each operating
+system should have its own tested instructions.
+
+The default setup above remains the recommended baseline: one thread, normal
+ASV environment, and no extra OpenMP build steps. Use this section only when
+you explicitly want to measure backend scaling with multiple thread counts.
+
+### Shared OpenMP Rule
+
+The benchmark code can be run with several thread labels by setting, for example,
+`GSTOOLS_BENCHMARK_THREADS=1,2,4,8,16`. That only passes different
+`gstools.config.NUM_THREADS` values to GSTools. It does not, by itself, make
+the Cython backend parallel.
+
+For Cython OpenMP scaling, the Cython extension must be compiled with OpenMP
+support inside the same ASV environment that runs the benchmark. Always verify
+that environment before interpreting Cython scaling results:
+
+```bash
+ASV_ENV="$(ls -td .asv-openmp/env/* | head -n 1)"
+"$ASV_ENV/bin/python" benchmarks/tools/check_cython_openmp.py --fail-if-no-openmp
+```
+
+If the check fails, the benchmark may still run, but the Cython backend should
+not be interpreted as an OpenMP-enabled Cython run.
+
+### macOS Example
+
+This is the currently tested OpenMP workflow. It is separate from the
+default setup above.
+
+The default ASV configuration, `asv.conf.json`, stays conservative: it is the
+one-thread baseline and uses the normal conda-forge `gstools-cython` package.
+The default `.asv/env/` environment does not provide Cython OpenMP support. That is why this section uses a second ASV configuration:
+
+```text
+asv.macos-openmp.conf.json
+```
+
+This OpenMP config creates separate generated directories:
+
+```text
+.asv-openmp/env/
+.asv-openmp/results/
+.asv-openmp/html/
+```
+
+That keeps the OpenMP experiment separate from the default `.asv/` baseline.
+
+#### What The macOS OpenMP Config Does
+
+`asv.macos-openmp.conf.json` asks conda to install the build/runtime pieces
+needed for the macOS OpenMP experiment:
+
+```text
+llvm-openmp
+cython
+extension-helpers
+setuptools
+wheel
+```
+
+During ASV installation, it runs:
+
+```bash
+benchmarks/tools/install_macos_openmp_cython.py
+```
+
+That helper compiles `gstools-cython` from source inside ASV's own environment,
+not inside your active conda environment. This matters because ASV benchmarks
+the packages installed under `.asv-openmp/env/`.
+
+Internally, the helper sets:
+
+```text
+GSTOOLS_BUILD_PARALLEL=1
+CC=<ASV OpenMP env>/bin/gstools-asv-clang-openmp
+CXX=<ASV OpenMP env>/bin/gstools-asv-clang-openmp++
+```
+
+The wrapper translates the plain `-fopenmp` flag used by the Cython build into
+Apple-clang-compatible compiler and linker arguments that use conda's
+`llvm-openmp`.
+
+#### Run On macOS
+
+In the previous section, the default config gives a quick overview for both
+backends with `threads_1`. In this section, the OpenMP config runs several
+thread labels: `threads_1`, `threads_2`, `threads_4`, `threads_8`, and
+`threads_16`.
+
+Start from the GSTools repository root:
+
+```bash
+cd /path/to/GSTools
+```
+
+Create a clean driver environment. This environment only runs ASV; ASV will
+create the real benchmark environment under `.asv-openmp/env/`.
+
+```bash
+conda create -n gstools-benchmark -c conda-forge python=3.12 asv
+conda activate gstools-benchmark
+```
+
+Create the ASV machine profile once:
+
+```bash
+asv --config asv.macos-openmp.conf.json machine --yes
+```
+
+Run a quick current-commit OpenMP check. This builds the OpenMP-enabled
+`gstools-cython` package inside `.asv-openmp/env/` and runs the benchmark suite:
+
+```bash
+GSTOOLS_BENCHMARK_THREADS=1,2,4,8,16 \
+asv --config asv.macos-openmp.conf.json run HEAD^! --quick --bench benchmark_backends --show-stderr
+```
+
+Verify that the ASV OpenMP environment really uses Cython OpenMP:
+
+```bash
+ASV_OPENMP_ENV="$(ls -td .asv-openmp/env/* | head -n 1)"
+"$ASV_OPENMP_ENV/bin/python" benchmarks/tools/check_cython_openmp.py --verbose
+"$ASV_OPENMP_ENV/bin/python" benchmarks/tools/check_cython_openmp.py --fail-if-no-openmp
+```
+
+Expected result on the tested Mac M2 setup:
+
+```text
+variogram default None -> 10
+field default None -> 10
+krige default None -> 10
+OpenMP check: PASS
+```
+
+If that check passes, run the last-five-commits OpenMP benchmark:
+
+```bash
+GSTOOLS_BENCHMARK_THREADS=1,2,4,8,16 \
+asv --config asv.macos-openmp.conf.json run HEAD~5..HEAD --bench benchmark_backends --show-stderr
+```
+
+Print Rust-vs-Cython ratios from the OpenMP result folder:
+
+```bash
+python benchmarks/tools/asv_speedup_summary.py --results-dir .asv-openmp/results
+```
+
+Build and preview the OpenMP browser report:
+
+```bash
+asv --config asv.macos-openmp.conf.json publish
+asv --config asv.macos-openmp.conf.json preview
+```
+
+#### Interpreting The macOS OpenMP Run
+
+- Use default `asv.conf.json` for the reproducible one-thread baseline.
+- Use `asv.macos-openmp.conf.json` for the macOS OpenMP experiment.
+- Only claim Cython OpenMP scaling if `check_cython_openmp.py` passes inside
+  `.asv-openmp/env/...`.
+- The active `gstools-benchmark` conda environment does not need `gstools`
+  installed. It only needs ASV. The benchmarked GSTools packages live inside
+  `.asv-openmp/env/...`.
+
+This workflow is intended for macOS systems that use Apple clang with conda's
+`llvm-openmp`. It should be portable across many macOS machines, including
+Apple Silicon and Intel Macs, but it is not guaranteed for every macOS setup.
+
+It is not guaranteed to run without local changes on:
+
+- older macOS versions
+- systems missing Xcode command-line tools
+- systems with a nonstandard compiler setup
+- HPC or managed macOS environments
+- unusual conda installations
+
+Do not assume this exact OpenMP setup applies to Linux, Windows, or HPC systems.
+
+### Windows Example
+
+### Linux Example
+
+### HPC Example
+
+### Profiling With cProfile for Multiple Threads
+
+To profile how a workflow changes across configured thread counts, run the
+same cProfile case several times with the OpenMP ASV environment:
+
+```bash
+ASV_OPENMP_ENV="$(ls -td .asv-openmp/env/* | head -n 1)"
+ASV_OPENMP_PYTHON="$ASV_OPENMP_ENV/bin/python"
+
+for threads in threads_1 threads_2 threads_4 threads_8 threads_16; do
+  "$ASV_OPENMP_PYTHON" benchmarks/tools/profile_benchmark_workflows.py --case krige-extra-large --backend rust_core --threads "$threads" --limit 10
+done
 ```
 
 Useful options:
 
 - `--case`: choose one workflow, or use `all`
 - `--backend`: choose `cython_fallback` or `rust_core`
+- `--threads`: choose `threads_1`, `threads_2`, `threads_4`, `threads_8`,
+  or `threads_16`
 - `--limit`: number of function rows to print from the cProfile table
 - `--sort cumtime`: sort by cumulative time, usually the best first view
 - `--sort tottime`: sort by time spent directly in each function
diff --git a/benchmarks/benchmark_backends.py b/benchmarks/benchmark_backends.py
index 517c61893..c9526eb79 100644
--- a/benchmarks/benchmark_backends.py
+++ b/benchmarks/benchmark_backends.py
@@ -2,7 +2,7 @@
 
 Usage:
     cd /path/to/MPS-Tools/GSTools
-    python -m pip install -e ".[benchmark]"
+    # See benchmarks/README.md for ASV and optional cProfile setup.
     asv machine --yes
     asv run --quick --show-stderr --bench benchmark_backends
     asv run HEAD^! --bench benchmark_backends
@@ -15,12 +15,17 @@
     speedup = cython_fallback_time / rust_core_time
 
 Values greater than 1.0 mean the Rust backend is faster on the same machine
-for the same benchmark and commit.
+for the same benchmark, commit, and thread label.
+
+By default the suite uses one GSTools thread. For local OpenMP scaling
+experiments, set GSTOOLS_BENCHMARK_THREADS, for example:
+    GSTOOLS_BENCHMARK_THREADS=1,2,4,8,16 asv run HEAD^!
 """
 
 from __future__ import annotations
 
 import contextlib
+import os
 
 import numpy as np
 
@@ -28,6 +33,29 @@
 
 
 BACKENDS = ("cython_fallback", "rust_core")
+
+
+def _configured_thread_counts():
+    raw = os.environ.get("GSTOOLS_BENCHMARK_THREADS", "1")
+    thread_counts = []
+    for item in raw.split(","):
+        item = item.strip()
+        if not item:
+            continue
+        if item.startswith("threads_"):
+            label = item
+            value = item.removeprefix("threads_")
+        else:
+            label = f"threads_{item}"
+            value = item
+        int(value)
+        thread_counts.append(label)
+    if not thread_counts:
+        raise ValueError("GSTOOLS_BENCHMARK_THREADS did not define threads")
+    return tuple(thread_counts)
+
+
+THREAD_COUNTS = _configured_thread_counts()
 VARIOGRAM_CASES = (
     "full_900",
     "sampled_5000_to_1500",
@@ -43,9 +71,13 @@
 
 
 @contextlib.contextmanager
-def gstools_backend(use_core):
-    """Temporarily force either gstools-core or the Cython fallback."""
-    previous = (gs.config._GSTOOLS_CORE_AVAIL, gs.config.USE_GSTOOLS_CORE)
+def gstools_backend(use_core, num_threads):
+    """Temporarily force backend and GSTools thread count."""
+    previous = (
+        gs.config._GSTOOLS_CORE_AVAIL,
+        gs.config.USE_GSTOOLS_CORE,
+        gs.config.NUM_THREADS,
+    )
     try:
         if use_core:
             if not previous[0]:
@@ -55,9 +87,14 @@ def gstools_backend(use_core):
         else:
             gs.config._GSTOOLS_CORE_AVAIL = False
             gs.config.USE_GSTOOLS_CORE = False
+        gs.config.NUM_THREADS = num_threads
         yield
     finally:
-        gs.config._GSTOOLS_CORE_AVAIL, gs.config.USE_GSTOOLS_CORE = previous
+        (
+            gs.config._GSTOOLS_CORE_AVAIL,
+            gs.config.USE_GSTOOLS_CORE,
+            gs.config.NUM_THREADS,
+        ) = previous
 
 
 def _use_core(backend):
@@ -68,6 +105,12 @@ def _use_core(backend):
     raise ValueError(f"Unknown backend: {backend}")
 
 
+def _num_threads(thread_count):
+    if thread_count.startswith("threads_"):
+        return int(thread_count.removeprefix("threads_"))
+    raise ValueError(f"Unknown thread count: {thread_count}")
+
+
 def _random_points(seed, count, scale):
     rng = np.random.RandomState(seed)
     return rng.rand(count) * scale, rng.rand(count) * scale
@@ -99,8 +142,8 @@ def _make_krige_data(seed, cond_count, target_count, scale=50.0):
 class VariogramWorkflowBenchmarks:
     """Variogram workflow benchmarks by case and backend."""
 
-    params = [VARIOGRAM_CASES, BACKENDS]
-    param_names = ["case", "backend"]
+    params = [VARIOGRAM_CASES, BACKENDS, THREAD_COUNTS]
+    param_names = ["case", "backend", "threads"]
 
     def setup_cache(self):
         return {
@@ -109,16 +152,17 @@ def setup_cache(self):
             "sampled_15000_to_4500": _make_variogram_data(20220503, 15000),
         }
 
-    def setup(self, data, case, backend):
+    def setup(self, data, case, backend, threads):
         if backend == "rust_core" and not gs.config._GSTOOLS_CORE_AVAIL:
             raise NotImplementedError("gstools_core is not available")
+        _num_threads(threads)
 
-    def time_variogram_estimate(self, data, case, backend):
-        with gstools_backend(_use_core(backend)):
+    def time_variogram_estimate(self, data, case, backend, threads):
+        with gstools_backend(_use_core(backend), _num_threads(threads)):
             self._run_variogram(data, case)
 
-    def peakmem_variogram_estimate(self, data, case, backend):
-        with gstools_backend(_use_core(backend)):
+    def peakmem_variogram_estimate(self, data, case, backend, threads):
+        with gstools_backend(_use_core(backend), _num_threads(threads)):
             self._run_variogram(data, case)
 
     def _run_variogram(self, data, case):
@@ -141,8 +185,8 @@ def _run_variogram(self, data, case):
 class KrigingWorkflowBenchmarks:
     """Global kriging workflow benchmarks by case and backend."""
 
-    params = [KRIGE_CASES, BACKENDS]
-    param_names = ["case", "backend"]
+    params = [KRIGE_CASES, BACKENDS, THREAD_COUNTS]
+    param_names = ["case", "backend", "threads"]
 
     def setup_cache(self):
         return {
@@ -151,16 +195,17 @@ def setup_cache(self):
             "extra_large_360x6000": _make_krige_data(20220508, 360, 6000),
         }
 
-    def setup(self, data, case, backend):
+    def setup(self, data, case, backend, threads):
         if backend == "rust_core" and not gs.config._GSTOOLS_CORE_AVAIL:
             raise NotImplementedError("gstools_core is not available")
+        _num_threads(threads)
 
-    def time_global_krige(self, data, case, backend):
-        with gstools_backend(_use_core(backend)):
+    def time_global_krige(self, data, case, backend, threads):
+        with gstools_backend(_use_core(backend), _num_threads(threads)):
             self._run_krige(data, case)
 
-    def peakmem_global_krige(self, data, case, backend):
-        with gstools_backend(_use_core(backend)):
+    def peakmem_global_krige(self, data, case, backend, threads):
+        with gstools_backend(_use_core(backend), _num_threads(threads)):
             self._run_krige(data, case)
 
     def _run_krige(self, data, case):
@@ -184,8 +229,8 @@ def _run_krige(self, data, case):
 class RandomFieldWorkflowBenchmarks:
     """SRF and CondSRF workflow benchmarks by case and backend."""
 
-    params = [FIELD_CASES, BACKENDS]
-    param_names = ["case", "backend"]
+    params = [FIELD_CASES, BACKENDS, THREAD_COUNTS]
+    param_names = ["case", "backend", "threads"]
 
     def setup_cache(self):
         return {
@@ -197,16 +242,17 @@ def setup_cache(self):
             "condsrf": _make_krige_data(20220510, 40, 1000),
         }
 
-    def setup(self, data, case, backend):
+    def setup(self, data, case, backend, threads):
         if backend == "rust_core" and not gs.config._GSTOOLS_CORE_AVAIL:
             raise NotImplementedError("gstools_core is not available")
+        _num_threads(threads)
 
-    def time_field_generation(self, data, case, backend):
-        with gstools_backend(_use_core(backend)):
+    def time_field_generation(self, data, case, backend, threads):
+        with gstools_backend(_use_core(backend), _num_threads(threads)):
             self._run_field(data, case)
 
-    def peakmem_field_generation(self, data, case, backend):
-        with gstools_backend(_use_core(backend)):
+    def peakmem_field_generation(self, data, case, backend, threads):
+        with gstools_backend(_use_core(backend), _num_threads(threads)):
             self._run_field(data, case)
 
     def _run_field(self, data, case):
diff --git a/benchmarks/tools/asv_speedup_summary.py b/benchmarks/tools/asv_speedup_summary.py
index d341d2178..b3239d702 100644
--- a/benchmarks/tools/asv_speedup_summary.py
+++ b/benchmarks/tools/asv_speedup_summary.py
@@ -13,7 +13,7 @@
     cython_fallback_time / rust_core_time
 
 Values greater than 1.0 mean Rust was faster on the same machine, commit,
-environment, benchmark, and non-backend parameter combination.
+environment, benchmark, case, and thread-count combination.
 """
 
 from __future__ import annotations
@@ -26,6 +26,7 @@
 
 
 BACKENDS = ("cython_fallback", "rust_core")
+THREAD_PREFIX = "threads_"
 LEGACY_BENCHMARKS = {
     "time_srf",
     "peakmem_srf",
@@ -128,11 +129,24 @@ def backend_rows(entry):
         )
         if backend is None:
             continue
-        case_values = [item for item in combo_values if item not in BACKENDS]
+        case_values = [
+            item
+            for item in combo_values
+            if item not in BACKENDS and not item.startswith(THREAD_PREFIX)
+        ]
+        threads = next(
+            (
+                item
+                for item in combo_values
+                if item.startswith(THREAD_PREFIX)
+            ),
+            "-",
+        )
         rows.append(
             {
                 "backend": backend,
                 "case": "/".join(case_values) if case_values else "-",
+                "threads": threads,
                 "value": float(value),
             }
         )
@@ -161,10 +175,9 @@ def collect_speedups(results_dir, include_all, include_legacy):
                 continue
             by_case = {}
             for row in backend_rows(result_entry(raw_result, result_columns)):
-                by_case.setdefault(row["case"], {})[row["backend"]] = row[
-                    "value"
-                ]
-            for case, values in by_case.items():
+                key = (row["case"], row["threads"])
+                by_case.setdefault(key, {})[row["backend"]] = row["value"]
+            for (case, threads), values in by_case.items():
                 cython = values.get("cython_fallback")
                 rust = values.get("rust_core")
                 if not is_number(cython) or not is_number(rust) or rust == 0:
@@ -175,6 +188,7 @@ def collect_speedups(results_dir, include_all, include_legacy):
                         "env": env_name,
                         "benchmark": benchmark_name,
                         "case": case,
+                        "threads": threads,
                         "cython": cython,
                         "rust": rust,
                         "speedup": cython / rust,
@@ -193,6 +207,7 @@ def print_table(rows):
         "env",
         "benchmark",
         "case",
+        "threads",
         "cython",
         "rust",
         "speedup",
@@ -203,6 +218,7 @@ def print_table(rows):
             row["env"],
             row["benchmark"],
             row["case"],
+            row["threads"],
             f"{row['cython']:.6g}",
             f"{row['rust']:.6g}",
             f"{row['speedup']:.3f}x",
diff --git a/benchmarks/tools/check_cython_openmp.py b/benchmarks/tools/check_cython_openmp.py
new file mode 100644
index 000000000..02fe73db3
--- /dev/null
+++ b/benchmarks/tools/check_cython_openmp.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+"""Check whether GSTools-Cython detects OpenMP parallel support.
+
+This script verifies the active Python environment. Use it with the editable
+development environment or with an ASV-created environment.
+
+Examples:
+    python benchmarks/tools/check_cython_openmp.py
+    python benchmarks/tools/check_cython_openmp.py --fail-if-no-openmp
+    python benchmarks/tools/check_cython_openmp.py --verbose
+    .asv/env/<hash>/bin/python3 benchmarks/tools/check_cython_openmp.py
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib
+import sys
+
+
+MODULES = {
+    "variogram": "gstools_cython.variogram",
+    "field": "gstools_cython.field",
+    "krige": "gstools_cython.krige",
+}
+EXPLICIT_THREAD_COUNTS = (1, 2, 4, 8, 16)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--fail-if-no-openmp",
+        action="store_true",
+        help="Exit with status 1 if OpenMP thread detection reports <= 1.",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Print per-module default and explicit thread-count values.",
+    )
+    return parser.parse_args()
+
+
+def package_version(package_name):
+    try:
+        package = importlib.import_module(package_name)
+    except ModuleNotFoundError:
+        return "not installed"
+    return getattr(package, "__version__", "unknown")
+
+
+def check_module(label, module_name):
+    module = importlib.import_module(module_name)
+    default_threads = module.set_num_threads(None)
+    explicit = {
+        count: module.set_num_threads(count)
+        for count in EXPLICIT_THREAD_COUNTS
+    }
+    return label, default_threads, explicit
+
+
+def main():
+    args = parse_args()
+
+    print(f"python: {sys.executable}")
+    print(f"gstools: {package_version('gstools')}")
+    print(f"gstools_cython: {package_version('gstools_cython')}")
+    print(f"gstools_core: {package_version('gstools_core')}")
+    if args.verbose:
+        print(
+            "OpenMP evidence: default None should be >1. "
+            "Explicit values only prove the wrapper accepts the requested count."
+        )
+
+    default_values = []
+    for label, module_name in MODULES.items():
+        try:
+            label, default_threads, explicit = check_module(label, module_name)
+        except ModuleNotFoundError as err:
+            print(f"OpenMP check: FAIL. Missing module: {err.name}")
+            return 1
+        default_values.append(default_threads)
+        if args.verbose:
+            explicit_text = ", ".join(
+                f"{request}->{actual}" for request, actual in explicit.items()
+            )
+            print(f"{label} default None -> {default_threads}")
+            print(f"{label} explicit -> {explicit_text}")
+
+    if min(default_values) > 1:
+        print("OpenMP check: PASS")
+        return 0
+
+    print(
+        "OpenMP check: FAIL. GSTools-Cython reports one default thread. "
+        "Explicit thread values may be accepted by the wrapper, but this does "
+        "not prove that the compiled extension is using OpenMP."
+    )
+    return 1 if args.fail_if_no_openmp else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/tools/install_macos_openmp_cython.py b/benchmarks/tools/install_macos_openmp_cython.py
new file mode 100644
index 000000000..09f0840cc
--- /dev/null
+++ b/benchmarks/tools/install_macos_openmp_cython.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+"""Install GSTools-Cython with OpenMP inside a macOS ASV environment.
+
+This helper is intentionally macOS-specific. It is called from
+``asv.macos-openmp.conf.json`` after ASV has created a conda environment that
+contains ``llvm-openmp``.
+"""
+
+from __future__ import annotations
+
+import os
+import platform
+import stat
+import subprocess
+import sys
+from pathlib import Path
+
+
+def run(command, env=None, check=True):
+    """Echo a command to stdout, then execute it with ``subprocess.run``.
+
+    Parameters
+    ----------
+    command : sequence
+        Program and arguments. Each part is converted with ``str`` for the
+        echoed line only; ``subprocess.run`` receives *command* unchanged.
+    env : mapping, optional
+        Environment handed to ``subprocess.run``.
+    check : bool, optional
+        Forwarded to ``subprocess.run``; when true, a non-zero exit status
+        raises ``subprocess.CalledProcessError``.
+
+    Returns
+    -------
+    subprocess.CompletedProcess
+        The result object returned by ``subprocess.run``.
+    """
+    echoed = " ".join(map(str, command))
+    # Flush so the echo is emitted before the child process starts.
+    print(f"+ {echoed}", flush=True)
+    return subprocess.run(command, env=env, check=check)
+
+
+def write_wrapper(path, force_cxx=False):
+    """Write an executable bash compiler wrapper to *path*.
+
+    Two scripts can be produced:
+
+    * Default: a wrapper that execs the real compiler and rewrites every
+      ``-fopenmp`` argument. The C++ compiler (``GSTOOLS_REAL_CXX``, default
+      ``/usr/bin/clang++``) is chosen when ``GSTOOLS_FORCE_CXX`` is ``1`` or
+      the script's own name contains ``++``; otherwise the C compiler
+      (``GSTOOLS_REAL_CC``, default ``/usr/bin/clang``) is used. If ``-c`` is
+      among the arguments, ``-fopenmp`` becomes
+      ``-Xpreprocessor -fopenmp -I<prefix>/include``; otherwise it becomes
+      ``-L<prefix>/lib -lomp -Wl,-rpath,<prefix>/lib``. ``<prefix>`` is taken
+      from ``GSTOOLS_OPENMP_PREFIX`` or, failing that, ``CONDA_PREFIX``.
+    * ``force_cxx=True``: a short shim that sets ``GSTOOLS_FORCE_CXX=1`` and
+      execs ``gstools-asv-clang-openmp`` from its own directory.
+
+    Parameters
+    ----------
+    path : pathlib.Path
+        Destination file; it is overwritten and gets the execute bits for
+        user, group and others added to its current mode.
+    force_cxx : bool, optional
+        Select the C++ shim instead of the full wrapper.
+    """
+    if force_cxx:
+        script = """#!/bin/bash
+GSTOOLS_FORCE_CXX=1 exec "$(dirname "$0")/gstools-asv-clang-openmp" "$@"
+"""
+    else:
+        script = """#!/bin/bash
+set -e
+prefix="${GSTOOLS_OPENMP_PREFIX:-${CONDA_PREFIX:-}}"
+name="$(basename "$0")"
+if [[ "${GSTOOLS_FORCE_CXX:-0}" == "1" || "$name" == *++* ]]; then
+  real="${GSTOOLS_REAL_CXX:-/usr/bin/clang++}"
+else
+  real="${GSTOOLS_REAL_CC:-/usr/bin/clang}"
+fi
+is_compile=0
+for arg in "$@"; do
+  [[ "$arg" == "-c" ]] && is_compile=1
+done
+args=()
+for arg in "$@"; do
+  if [[ "$arg" == "-fopenmp" ]]; then
+    if [[ "$is_compile" == "1" ]]; then
+      args+=("-Xpreprocessor" "-fopenmp" "-I${prefix}/include")
+    else
+      args+=("-L${prefix}/lib" "-lomp" "-Wl,-rpath,${prefix}/lib")
+    fi
+  else
+    args+=("$arg")
+  fi
+done
+exec "$real" "${args[@]}"
+"""
+    path.write_text(script, encoding="utf8")
+    # Add execute permission for everyone while keeping the existing mode bits.
+    executable_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
+    path.chmod(path.stat().st_mode | executable_bits)
+
+
+def main():
+    """Reinstall gstools-cython from source with OpenMP in an ASV env.
+
+    Exactly one command-line argument is expected: the ASV environment
+    directory. After validating the platform and the presence of
+    ``include/omp.h`` and ``lib/libomp.dylib`` in that directory, the
+    function writes the two compiler wrappers into ``<env>/bin``, uninstalls
+    any existing gstools-cython, and rebuilds it with pip using the wrappers
+    as ``CC``/``CXX``.
+
+    Returns
+    -------
+    int
+        ``2`` on a usage error, on a non-macOS platform, or when the OpenMP
+        header/library are missing; ``0`` after the install step.
+
+    Raises
+    ------
+    subprocess.CalledProcessError
+        If the ``pip install`` command exits with a non-zero status.
+    """
+
+    def abort(message):
+        # All early exits report on stderr and share exit status 2.
+        print(message, file=sys.stderr)
+        return 2
+
+    if len(sys.argv) != 2:
+        return abort("Usage: install_macos_openmp_cython.py <asv-env-dir>")
+
+    if platform.system() != "Darwin":
+        return abort(
+            "This helper is macOS-specific. Use the default ASV config or "
+            "write an OpenMP setup for this platform."
+        )
+
+    env_dir = Path(sys.argv[1]).resolve()
+    include_dir = env_dir / "include"
+    lib_dir = env_dir / "lib"
+    omp_header = include_dir / "omp.h"
+    omp_lib = lib_dir / "libomp.dylib"
+    if not (omp_header.exists() and omp_lib.exists()):
+        return abort(
+            "llvm-openmp was not found in the ASV environment. Expected "
+            f"{omp_header} and {omp_lib}."
+        )
+
+    bin_dir = env_dir / "bin"
+    cc_wrapper = bin_dir / "gstools-asv-clang-openmp"
+    cxx_wrapper = bin_dir / "gstools-asv-clang-openmp++"
+    write_wrapper(cc_wrapper)
+    write_wrapper(cxx_wrapper, force_cxx=True)
+
+    # Inherit the current environment and point the build at the wrappers.
+    # NOTE(review): GSTOOLS_BUILD_PARALLEL is presumably read by the
+    # gstools-cython build to enable OpenMP - confirm against its setup.
+    build_env = {
+        **os.environ,
+        "GSTOOLS_BUILD_PARALLEL": "1",
+        "GSTOOLS_OPENMP_PREFIX": str(env_dir),
+        "CC": str(cc_wrapper),
+        "CXX": str(cxx_wrapper),
+        "CFLAGS": f"-I{include_dir}",
+        "LDFLAGS": f"-L{lib_dir}",
+    }
+
+    pip = [sys.executable, "-m", "pip"]
+    # check=False: a non-zero exit from the uninstall must not stop the script.
+    run(
+        pip + ["uninstall", "-y", "gstools-cython", "gstools_cython"],
+        env=build_env,
+        check=False,
+    )
+    run(
+        pip
+        + [
+            "install",
+            "--no-build-isolation",
+            "--no-cache-dir",
+            "--force-reinstall",
+            "--no-binary=gstools-cython",
+            "--no-deps",
+            "gstools-cython",
+        ],
+        env=build_env,
+    )
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/tools/profile_benchmark_workflows.py b/benchmarks/tools/profile_benchmark_workflows.py
index 7f953e0dd..2fd2fdff5 100644
--- a/benchmarks/tools/profile_benchmark_workflows.py
+++ b/benchmarks/tools/profile_benchmark_workflows.py
@@ -7,10 +7,13 @@
 
 Usage:
     cd /path/to/MPS-Tools/GSTools
-    python benchmarks/tools/profile_benchmark_workflows.py --list
-    python benchmarks/tools/profile_benchmark_workflows.py --case variogram-sampled
-    python benchmarks/tools/profile_benchmark_workflows.py --case krige-large \
-        --backend rust_core --limit 30
+    ASV_ENV="$(ls -td .asv/env/* | head -n 1)"
+    "$ASV_ENV/bin/python" benchmarks/tools/profile_benchmark_workflows.py --list
+    "$ASV_ENV/bin/python" benchmarks/tools/profile_benchmark_workflows.py \
+        --case variogram-sampled
+    "$ASV_ENV/bin/python" benchmarks/tools/profile_benchmark_workflows.py \
+        --case krige-large \
+        --backend rust_core --threads threads_1 --limit 30
 """
 
 from __future__ import annotations
@@ -80,6 +83,14 @@
     ),
 }
 
+# Thread-count labels offered as choices for the --threads option.
+THREAD_COUNTS = tuple(f"threads_{count}" for count in (1, 2, 4, 8, 16))
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description=__doc__)
@@ -113,6 +124,12 @@ def parse_args():
         choices=["cython_fallback", "rust_core"],
         help="Backend label to force while profiling.",
     )
+    parser.add_argument(
+        "--threads",
+        default="threads_1",
+        choices=THREAD_COUNTS,
+        help="GSTools thread count label.",
+    )
     parser.add_argument(
         "--list",
         action="store_true",
@@ -134,8 +151,9 @@ def load_suite_class(class_name):
     except ModuleNotFoundError as err:
         print(
             "Could not import GSTools benchmark dependencies. Activate the "
-            "GSTools benchmark environment or install the project dependencies "
-            f"first. Original error: {err}",
+            "GSTools benchmark environment, run this script with an ASV env "
+            "Python from .asv/env/<env-id>/bin/python, or install the project "
+            f"dependencies first. Original error: {err}",
             file=sys.stderr,
         )
         raise SystemExit(1) from err
@@ -151,6 +169,7 @@ def run_case(
     limit,
     sort,
     backend,
+    threads,
 ):
     suite_cls = load_suite_class(class_name)
     suite = suite_cls()
@@ -160,10 +179,10 @@ def run_case(
     profiler = cProfile.Profile()
     profiler.enable()
     for _ in range(repeat):
-        method(data, *params, backend)
+        method(data, *params, backend, threads)
     profiler.disable()
 
-    print(f"\n== {name} [{backend}] ==")
+    print(f"\n== {name} [{backend}, {threads}] ==")
     stats = pstats.Stats(profiler, stream=sys.stdout)
     stats.strip_dirs().sort_stats(sort).print_stats(limit)
 
@@ -185,6 +204,7 @@ def main():
             args.limit,
             args.sort,
             args.backend,
+            args.threads,
         )