Commit dd1c8b2

Merge pull request #23 from quantifyearth/mwd-snakemake
Move to snakemake
2 parents 5187a34 + c61bc31

26 files changed

Lines changed: 1194 additions & 350 deletions

.github/workflows/python-package.yml

Lines changed: 6 additions & 6 deletions

```diff
@@ -12,7 +12,7 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
-    container: ghcr.io/osgeo/gdal:ubuntu-small-3.11.4
+    container: ghcr.io/osgeo/gdal:ubuntu-small-3.12.1
     strategy:
       fail-fast: false
       matrix:
@@ -22,7 +22,7 @@ jobs:
       - name: Install system
         run: |
           apt-get update -qqy
-          apt-get install -y git python3-pip libpq5 libpq-dev r-base libtirpc-dev shellcheck
+          apt-get install -y git python3-pip libpq5 libpq-dev r-base libtirpc-dev
       - uses: actions/checkout@v4
         with:
           submodules: 'true'
@@ -35,8 +35,9 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-         python -m pip install gdal[numpy]==3.11.4
+         python -m pip install gdal[numpy]==3.12.1
          python -m pip install -r requirements.txt
+         python -m pip install snakefmt

      - name: Lint with pylint
        run: python3 -m pylint utils prepare_layers prepare_species threats
@@ -47,6 +48,5 @@ jobs:
      - name: Tests
        run: python3 -m pytest ./tests

-     - name: Script checks
-       run: |
-         shellcheck ./scripts/run.sh
+     - name: Snakemake format check
+       run: snakefmt --check workflow/
```

.gitignore

Lines changed: 2 additions & 0 deletions

```diff
@@ -3,6 +3,8 @@ __pycache__/
 *.py[cod]
 *$py.class

+.snakemake/
+
 # C extensions
 *.so

```

Dockerfile

Lines changed: 22 additions & 14 deletions

```diff
@@ -1,45 +1,40 @@
+# Build stage for reclaimer (used to download from Zenodo)
 FROM golang:latest AS reclaimerbuild
 RUN git clone https://github.com/quantifyearth/reclaimer.git
 WORKDIR /go/reclaimer
 RUN go mod tidy
 RUN go build

-FROM golang:latest AS littlejohnbuild
-RUN git clone https://github.com/quantifyearth/littlejohn.git
-WORKDIR /go/littlejohn
-RUN go mod tidy
-RUN go build
-
-FROM ghcr.io/osgeo/gdal:ubuntu-small-3.11.4
+FROM ghcr.io/osgeo/gdal:ubuntu-small-3.12.1

 RUN apt-get update -qqy && \
     apt-get install -qy \
         git \
         cmake \
         python3-pip \
-        shellcheck \
         r-base \
         libpq-dev \
         libtirpc-dev \
     && rm -rf /var/lib/apt/lists/* \
     && rm -rf /var/cache/apt/*

 COPY --from=reclaimerbuild /go/reclaimer/reclaimer /bin/reclaimer
-COPY --from=littlejohnbuild /go/littlejohn/littlejohn /bin/littlejohn

 RUN rm /usr/lib/python3.*/EXTERNALLY-MANAGED
-RUN pip install gdal[numpy]==3.11.4
+RUN pip install gdal[numpy]==3.12.1

 COPY requirements.txt /tmp/
 RUN pip install -r /tmp/requirements.txt

+# Snakemake linting/formatting tools
+RUN pip install snakefmt
+
 RUN mkdir /root/R
 ENV R_LIBS_USER=/root/R
 RUN Rscript -e 'install.packages(c("lme4","lmerTest","emmeans"), repos="https://cloud.r-project.org")'

 COPY ./ /root/star
 WORKDIR /root/star
-RUN chmod 755 ./scripts/run.sh

 # We create a DATADIR - this should be mapped at container creation
 # time to a volume somewhere else
@@ -53,6 +48,19 @@ ENV VIRTUAL_ENV=/usr
 ENV PYTHONPATH=/root/star

 RUN python3 -m pytest ./tests
-RUN python3 -m pylint prepare_layers prepare_species utils tests
-RUN python3 -m mypy prepare_layers prepare_species utils tests
-RUN shellcheck ./scripts/run.sh
+RUN python3 -m pylint prepare_layers prepare_species threats utils tests
+RUN python3 -m mypy prepare_layers prepare_species threats utils tests
+
+# Snakemake validation
+RUN snakefmt --check workflow/
+# RUN snakemake --snakefile workflow/Snakefile --lint
+
+# Copy and set up entrypoint script
+COPY docker-entrypoint.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
+
+# Default command runs the full Snakemake pipeline
+# Use --cores to specify parallelism, e.g.: docker run ... --cores 8
+# Logs are written to $DATADIR/logs/ and .snakemake/ metadata is stored in $DATADIR/
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
+CMD ["--cores", "4", "all"]
```

README.md

Lines changed: 35 additions & 5 deletions

````diff
@@ -2,8 +2,6 @@

 An implementation of the threat based [STAR biodiversity metric by Muir et al](https://www.nature.com/articles/s41559-021-01432-0) (also known as STAR(t)).

-See [method.md](method.md) for a description of the methodology, or `scripts/run.sh` for how to execute the pipeline.
-
 ## Checking out the code

 The code is available on github, and can be checked out from there:
@@ -23,7 +21,6 @@ There are some additional inputs required to run the pipeline, which should be p

 The script also assumes you have a Postgres database with the IUCN Redlist database in it.

-
 ## Species data acquisition

 There are two scripts for getting the species data from the Redlist. For those in the IUCN with access to the database version of the redlist, use `extract_species_data_psql.py`.
@@ -34,6 +31,20 @@ For those outside the IUCN, there is a script called `extract_species_data_redli

 There are two ways to run the pipeline. The easiest way is to use Docker if you have it available to you, as it will manage all the dependencies for you. But you can check out and run it locally if you want to also, but it requires a little more effort.

+Either way, the pipeline itself is run using [Snakemake](https://snakemake.readthedocs.io/en/stable/), a tool designed to run data-science pipelines made up of many different scripts and sources of information. Snakemake tracks dependencies, making it easier to re-run the pipeline: only the parts that depend on what changed will be rerun. However, in STAR the initial processing of raster layers is very slow, so we've configured Snakemake to never regenerate those unless the generated rasters have been deleted manually.
+
+Because sometimes you do not need to run the whole pipeline for a specific job, the Snakemake workflow has multiple targets you can invoke:
+
+* prepaer: Generate the necessary input rasters for the STAR pipeline.
+* species_data: Extract species data into GeoJSON files from the Redlist database.
+* aohs: Just generate the species AOHs and summary CSV.
+* validation: Run model validation.
+* occurrence_validation: Run occurrence validation - this can be VERY SLOW as it fetches occurrence data from GBIF.
+* threats: Generate the STAR(t) raster layers.
+* all: Do everything except occurrence validation.
+
+There is a configuration file in `config/config.yaml` that is used to set experimental parameters, such as which taxa to run the pipeline for.
+
 ### Running with Docker

 There is included a docker file, which is based on the GDAL container image, which is set up to install everything ready to use. You can build that using:
@@ -42,15 +53,21 @@ There is included a docker file, which is based on the GDAL container image, whi
 $ docker buildx build -t star .
 ```

+Note that depending on how many CPU cores you provide, you will probably need to give Docker more memory than the out-of-the-box setting (which is a few GB). We recommend giving it as much as you can.
+
 You can then invoke the run script using this. You should map an external folder into the container as a place to store the intermediary data and final results, and you should provide details about the Postgres instance with the IUCN redlist:

 ```shell
 $ docker run --rm -v /some/local/dir:/data \
+    -p 5432:5432 \
     -e DB_HOST=localhost \
     -e DB_NAME=iucnredlist \
     -e DB_PASSWORD=supersecretpassword \
     -e DB_USER=postgres \
-    star ./scripts/run.sh
+    -e GBIF_USERNAME=myusername \
+    -e GBIF_PASSWORD=mypassword \
+    -e GBIF_EMAIL=myemail \
+    star --cores 8 all
 ```

 ### Running without Docker
@@ -61,7 +78,6 @@ If you prefer not to use Docker, you will need:
 * GDAL
 * R (required for validation)
 * [Reclaimer](https://github.com/quantifyearth/reclaimer/) - a Go tool for fetching data from Zenodo
-* [Littlejohn](https://github.com/quantifyearth/littlejohn/) - a Go tool for running scripts in parallel

 If you are using macOS please note that the default Python install that Apple ships is now several years out of date (Python 3.9, released Oct 2020) and you'll need to install a more recent version (for example, using [homebrew](https://brew.sh)).

@@ -91,6 +107,20 @@ export DB_PASSWORD=supersecretpassword
 export DB_USER=postgres
 ```

+If on macOS then you can set the following extra flag to use GPU acceleration:
+
+```shell
+export YIRGACHEFFE_BACKEND=MLX
+```
+
+For occurrence validation you will need a GBIF account and have to set the details as follows:
+
+```shell
+export GBIF_USERNAME=myusername
+export GBIF_PASSWORD=mypassword
+export GBIF_EMAIL=myemail
+```
+
 Once you have all that you can then run the pipeline:

 ```shell
````

config/config.yaml

Lines changed: 37 additions & 0 deletions

```diff
@@ -0,0 +1,37 @@
+# STAR Pipeline Configuration
+# ===========================
+
+# Taxonomic classes to process
+taxa:
+  - AMPHIBIA
+  - AVES
+  - MAMMALIA
+  - REPTILIA
+
+# Scenario for habitat layers (for future expansion)
+scenario: current
+
+# Projection used throughout the pipeline
+projection: "ESRI:54009"
+
+# Scale for habitat processing (meters)
+habitat_scale: 992.292720200000133
+
+# Input data files (relative to datadir)
+# These are expected to be pre-downloaded in the Zenodo subfolder
+inputs:
+  zenodo_mask: "Zenodo/CGLS100Inland_withGADMIslands.tif"
+  zenodo_elevation_max: "Zenodo/FABDEM_1km_max_patched.tif"
+  zenodo_elevation_min: "Zenodo/FABDEM_1km_min_patched.tif"
+  zenodo_islands: "Zenodo/MissingLandcover_1km_cover.tif"
+  crosswalk_source: "data/crosswalk_bin_T.csv"
+
+# Optional input files (pipeline will check if these exist)
+optional_inputs:
+  birdlife_elevations: "BL_Species_Elevations_2023.csv"
+  species_excludes: "SpeciesList_generalisedRangePolygons.csv"
+
+# Zenodo configuration for downloading raw habitat
+zenodo:
+  habitat_id: 3939050
+  habitat_filename: "PROBAV_LC100_global_v3.0.1_2019-nrt_Discrete-Classification-map_EPSG-4326.tif"
```
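As a sketch of how these settings flow into the pipeline: Snakemake parses `config/config.yaml` into a plain `config` dict available to the workflow and its scripts. The dict literal below just mirrors a subset of the file above, and the `aohs_*` names are hypothetical, purely to show the access pattern:

```python
# Stand-in for the dict Snakemake builds from config/config.yaml.
config = {
    "taxa": ["AMPHIBIA", "AVES", "MAMMALIA", "REPTILIA"],
    "projection": "ESRI:54009",
    "habitat_scale": 992.292720200000133,
    "inputs": {"zenodo_mask": "Zenodo/CGLS100Inland_withGADMIslands.tif"},
}

def mask_path(datadir: str) -> str:
    # Input paths in the config are documented as relative to the data dir.
    return f"{datadir}/{config['inputs']['zenodo_mask']}"

# e.g. expand per-taxon work items from the taxa list (names are made up)
per_taxon_targets = [f"aohs_{taxon.lower()}" for taxon in config["taxa"]]
```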

docker-entrypoint.sh

Lines changed: 20 additions & 0 deletions

```diff
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+
+# Ensure logs directory exists
+mkdir -p "${DATADIR}/logs"
+
+# Change to DATADIR so .snakemake/ metadata is stored there
+cd "${DATADIR}"
+
+# Generate timestamped log filename
+LOG_FILE="${DATADIR}/logs/snakemake_$(date +%Y%m%d_%H%M%S).log"
+
+echo "Snakemake logs will be written to: ${LOG_FILE}"
+
+# Run snakemake with all passed arguments, capturing output to log file
+exec snakemake \
+    --snakefile /root/star/workflow/Snakefile \
+    --scheduler greedy \
+    "$@" \
+    2>&1 | tee "${LOG_FILE}"
```

prepare_layers/convert_crosswalk.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -2,6 +2,7 @@
 from pathlib import Path

 import pandas as pd
+from snakemake_argparse_bridge import snakemake_compatible

 # Take from https://www.iucnredlist.org/resources/habitat-classification-scheme
 IUCN_HABITAT_CODES = {
@@ -57,6 +58,10 @@ def convert_crosswalk(
     df = pd.DataFrame(res, columns=["code", "value"])
     df.to_csv(output_path, index=False)

+@snakemake_compatible(mapping={
+    "original_path": "input.original",
+    "output_path": "output.crosswalk",
+})
 def main() -> None:
     parser = argparse.ArgumentParser(description="Convert IUCN crosswalk to minimal common format.")
     parser.add_argument(
```
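The `snakemake_compatible` decorator added here does double duty: under plain CLI use, `main()` parses `sys.argv` as before, while under Snakemake the rule's `input`/`output` attributes supply the equivalent arguments. A minimal sketch of how such a bridge *could* work — this is a guess at the mechanism, not the library's actual implementation, and the dashed flag names are an assumption:

```python
import sys
from types import SimpleNamespace

def snakemake_compatible(mapping):
    # Hypothetical re-implementation: if a global `snakemake` object exists
    # (Snakemake injects one into scripts it runs), resolve each dotted path
    # like "input.original" against it and rewrite sys.argv so an unchanged
    # argparse-based main() picks the values up.
    def decorator(func):
        def wrapper():
            smk = globals().get("snakemake")
            if smk is not None:
                argv = [sys.argv[0]]
                for arg_name, dotted in mapping.items():
                    section, attr = dotted.split(".")
                    value = getattr(getattr(smk, section), attr)
                    argv.append(f"--{arg_name.replace('_', '-')}")
                    argv.append(str(value))
                sys.argv = argv
            return func()
        return wrapper
    return decorator

# Simulate running under Snakemake with a stand-in object.
snakemake = SimpleNamespace(
    input=SimpleNamespace(original="crosswalk_bin_T.csv"),
    output=SimpleNamespace(crosswalk="crosswalk.csv"),
)

@snakemake_compatible(mapping={
    "original_path": "input.original",
    "output_path": "output.crosswalk",
})
def fake_main():
    return list(sys.argv)

rewritten = fake_main()
```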

prepare_layers/remove_nans_from_mask.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -3,6 +3,7 @@
 from pathlib import Path

 import yirgacheffe as yg
+from snakemake_argparse_bridge import snakemake_compatible

 def remove_nans_from_mask(
     input_path: Path,
@@ -13,6 +14,10 @@ def remove_nans_from_mask(
     converted = layer.nan_to_num()
     converted.to_geotiff(output_path)

+@snakemake_compatible(mapping={
+    "original_path": "input.original",
+    "output_path": "output.mask",
+})
 def main() -> None:
     parser = argparse.ArgumentParser(description="Convert NaNs to zeros in mask layers")
     parser.add_argument(
```
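For readers unfamiliar with the `nan_to_num` operation used by this script, it replaces NaN pixels in the mask with a fill value (zero). Conceptually, over a flat list of pixel values rather than a raster layer, it is just:

```python
import math

def nan_to_num(values: list[float], fill: float = 0.0) -> list[float]:
    # Conceptual equivalent of the layer-level nan_to_num call above:
    # replace every NaN with the fill value, leave other pixels alone.
    return [fill if math.isnan(v) else v for v in values]
```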

prepare_species/apply_birdlife_data.py

Lines changed: 25 additions & 1 deletion

```diff
@@ -1,10 +1,12 @@
 import argparse
 import math
+import os
 from pathlib import Path

 import aoh
 import geopandas as gpd
 import pandas as pd
+from snakemake_argparse_bridge import snakemake_compatible

 # Columns from current BirdLife data overrides:
 # SIS ID
@@ -24,6 +26,7 @@
 def apply_birdlife_data(
     geojson_directory_path: Path,
     overrides_path: Path,
+    sentinel_path: Path | None,
 ) -> None:
     overrides = pd.read_csv(overrides_path, encoding="latin1")

@@ -51,6 +54,18 @@ def apply_birdlife_data(
         res = gpd.GeoDataFrame(data.to_frame().transpose(), crs=species_info.crs, geometry="geometry")
         res.to_file(path, driver="GeoJSON")

+    # This script modifies the GeoJSON files, but snakemake needs one
+    # output to say when this is done, so if we're in snakemake mode we touch a sentinel file to
+    # let it know we've done. One day this should be another decorator.
+    if sentinel_path is not None:
+        os.makedirs(sentinel_path.parent, exist_ok=True)
+        sentinel_path.touch()
+
+@snakemake_compatible(mapping={
+    "geojson_directory_path": "params.geojson_dir",
+    "overrides": "input.overrides",
+    "sentinel_path": "output.sentinel",
+})
 def main() -> None:
     parser = argparse.ArgumentParser(description="Process agregate species data to per-species-file.")
     parser.add_argument(
@@ -67,11 +82,20 @@ def main() -> None:
         required=True,
         dest="overrides",
     )
+    parser.add_argument(
+        '--sentinel',
+        type=Path,
+        help='Generate a sentinel file on completion for snakemake to track',
+        required=False,
+        default=None,
+        dest='sentinel_path',
+    )
     args = parser.parse_args()

     apply_birdlife_data(
         args.geojson_directory_path,
-        args.overrides
+        args.overrides,
+        args.sentinel_path,
     )

 if __name__ == "__main__":
```

prepare_species/common.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -161,9 +161,9 @@ def process_systems(
     return systems

 def process_threats(
-    threat_data: list[tuple[int, str, str]],
+    threat_data: list[tuple[str, str, str]],
     report: SpeciesReport,
-) -> list[tuple[int, int]]:
+) -> list[tuple[str, int]]:
     cleaned_threats = []
     for code, scope, severity in threat_data:
         if scope is None or scope.lower() == "unknown":
```
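The `int` to `str` type change reflects that IUCN threat codes are dotted, hierarchical identifiers (e.g. `2.3.1`), which cannot round-trip through `int` without losing structure. A simplified stand-in for this kind of filtering, assuming unknown-scope records are dropped and using a made-up severity weighting (the repo's actual scoring is richer):

```python
def clean_threats(threat_data: list[tuple[str, str, str]]) -> list[tuple[str, int]]:
    # Each record is (code, scope, severity); codes stay as strings because
    # "2.3.1" has no faithful integer representation.
    cleaned: list[tuple[str, int]] = []
    for code, scope, severity in threat_data:
        if scope is None or scope.lower() == "unknown":
            continue
        # Hypothetical weighting, for illustration only.
        weight = 2 if severity and severity.lower() == "rapid declines" else 1
        cleaned.append((code, weight))
    return cleaned

result = clean_threats([
    ("2.3.1", "Majority (50-90%)", "Rapid declines"),  # dotted code, kept
    ("11.1", "Unknown", "Slow, significant declines"),  # unknown scope, dropped
])
```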
