From 04b8d7653c7ab1f2d6f509ac250413599a6d8af4 Mon Sep 17 00:00:00 2001
From: Jared Lewis <jared@jared.kiwi.nz>
Date: Thu, 27 Feb 2025 17:16:44 +1100
Subject: [PATCH 1/6] chore: Refactor into a package

---
 pyproject.toml                           |   5 +
 ruff.toml                                |   1 +
 scripts/fetch_test_data.py               | 271 +----------------------
 src/sample_data/__init__.py              |  14 ++
 src/sample_data/data_request/__init__.py |   5 +
 src/sample_data/data_request/base.py     |  27 +++
 src/sample_data/data_request/cmip6.py    | 125 +++++++++++
 src/sample_data/data_request/obs4mips.py | 136 ++++++++++++
 uv.lock                                  |   2 +-
 9 files changed, 320 insertions(+), 266 deletions(-)
 create mode 100644 src/sample_data/__init__.py
 create mode 100644 src/sample_data/data_request/__init__.py
 create mode 100644 src/sample_data/data_request/base.py
 create mode 100644 src/sample_data/data_request/cmip6.py
 create mode 100644 src/sample_data/data_request/obs4mips.py

diff --git a/pyproject.toml b/pyproject.toml
index 841946eb..65872512 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,11 @@ dev-dependencies = [
     "bump-my-version>=0.29.0",
 ]
 
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+
 
 [tool.coverage.run]
 source = ["packages"]
diff --git a/ruff.toml b/ruff.toml
index 9268df3f..3a5a3cc6 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -23,6 +23,7 @@ ignore = [
     "D200",
     "D400",
     "UP007",
+    "S101" # Use of `assert` detected
 ]
 
 [lint.per-file-ignores]
diff --git a/scripts/fetch_test_data.py b/scripts/fetch_test_data.py
index 47a3bbfe..e3f7d727 100644
--- a/scripts/fetch_test_data.py
+++ b/scripts/fetch_test_data.py
@@ -1,269 +1,13 @@
-import os
-import pathlib
-from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any
 
 import pandas as pd
 import pooch
 import xarray as xr
 from intake_esgf import ESGFCatalog
 
-OUTPUT_PATH = Path("data")
-
-
-class DataRequest(ABC):
-    """
-    Represents a request for a dataset
-
-    A polymorphic association is used to capture the different types of datasets as each
-    dataset type may have different metadata fields and may need to be handled
-    differently to generate the sample data.
-    """
-
-    def __init__(self, remove_ensembles: bool, time_span: tuple[str, str]):
-        self.remove_ensembles = remove_ensembles
-        self.time_span = time_span
-
-    @abstractmethod
-    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
-        """Downscale the dataset to a smaller size."""
-        pass
-
-    @abstractmethod
-    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
-        """Create the output filename for the dataset."""
-        pass
-
-
-class CMIP6Request(DataRequest):
-    """
-    Represents a CMIP6 dataset request
-
-    """
-
-    def __init__(self, facets: dict[str, Any], remove_ensembles: bool, time_span: tuple[str, str] | None):
-        self.avail_facets = [
-            "mip_era",
-            "activity_drs",
-            "institution_id",
-            "source_id",
-            "experiment_id",
-            "member_id",
-            "table_id",
-            "variable_id",
-            "grid_label",
-            "version",
-            "data_node",
-        ]
-
-        self.facets = facets
-
-        super().__init__(remove_ensembles, time_span)
-
-        self.cmip6_path_items = [
-            "mip_era",
-            "activity_drs",
-            "institution_id",
-            "source_id",
-            "experiment_id",
-            "member_id",
-            "table_id",
-            "variable_id",
-            "grid_label",
-        ]
-
-        self.cmip6_filename_paths = [
-            "variable_id",
-            "table_id",
-            "source_id",
-            "experiment_id",
-            "member_id",
-            "grid_label",
-        ]
-
-        assert all(key in self.avail_facets for key in self.cmip6_path_items), "Error message"
-        assert all(key in self.avail_facets for key in self.cmip6_filename_paths), "Error message"
-
-    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
-        """
-        Downscale the dataset to a smaller size.
-
-        Parameters
-        ----------
-        dataset
-            The dataset to downscale
-        time_span
-            The time span to extract from a dataset
-
-        Returns
-        -------
-        xr.Dataset
-            The downscaled dataset
-        """
-        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
-        has_ij = "i" in dataset.dims and "j" in dataset.dims
-
-        if has_latlon:
-            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
-            result = dataset.interp(lat=dataset.lat[:10], lon=dataset.lon[:10])
-        elif has_ij:
-            # 2d lat/lon grid (generally ocean variables)
-            # Choose a starting point around the middle of the grid to maximise chance that it has values
-            # TODO: Be smarter about this?
-            j_midpoint = len(dataset.j) // 2
-            result = dataset.interp(i=dataset.i[:10], j=dataset.j[j_midpoint : j_midpoint + 10])
-        else:
-            raise ValueError("Cannot decimate this grid: too many dimensions")
-
-        if "time" in dataset.dims and time_span is not None:
-            result = result.sel(time=slice(*time_span))
-            if result.time.size == 0:
-                result = None
-
-        return result
-
-    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
-        """
-        Create the output filename for the dataset.
+from sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest
 
-        Parameters
-        ----------
-        ds
-            Loaded dataset
-
-        Returns
-        -------
-            The output filename
-        """
-        output_path = (
-            Path(os.path.join(*[metadata[item] for item in self.cmip6_path_items]))
-            / f"v{metadata['version']}"
-        )
-        filename_prefix = "_".join([metadata[item] for item in self.cmip6_filename_paths])
-
-        if "time" in ds.dims:
-            time_range = (
-                f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
-            )
-            filename = f"{filename_prefix}_{time_range}.nc"
-        else:
-            filename = f"{filename_prefix}.nc"
-
-        return output_path / filename
-
-
-class Obs4MIPsRequest(DataRequest):
-    """
-    Represents a Obs4MIPs dataset request
-
-    """
-
-    def __init__(self, facets: dict[str, Any], remove_ensembles: bool, time_span: tuple[str, str] | None):
-        self.avail_facets = [
-            "activity_id",
-            "institution_id",
-            "source_id",
-            "frequency",
-            "variable_id",
-            "grid_label",
-            "version",
-            "data_node",
-        ]
-
-        self.facets = facets
-
-        super().__init__(remove_ensembles, time_span)
-
-        self.obs4mips_path_items = [
-            "activity_id",
-            "institution_id",
-            "source_id",
-            "variable_id",
-            "grid_label",
-        ]
-
-        self.obs4mips_filename_paths = [
-            "variable_id",
-            "source_id",
-            "grid_label",
-        ]
-
-        assert all(key in self.avail_facets for key in self.obs4mips_path_items), "Error message"
-        assert all(key in self.avail_facets for key in self.obs4mips_filename_paths), "Error message"
-
-    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
-        """
-        Downscale the dataset to a smaller size.
-
-        Parameters
-        ----------
-        dataset
-            The dataset to downscale
-        time_span
-            The time span to extract from a dataset
-
-        Returns
-        -------
-        xr.Dataset
-            The downscaled dataset
-        """
-        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
-        has_ij = "i" in dataset.dims and "j" in dataset.dims
-
-        if has_latlon:
-            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
-            result = dataset.interp(lat=dataset.lat[:10], lon=dataset.lon[:10])
-        elif has_ij:
-            # 2d lat/lon grid (generally ocean variables)
-            # Choose a starting point around the middle of the grid to maximise chance that it has values
-            # TODO: Be smarter about this?
-            j_midpoint = len(dataset.j) // 2
-            result = dataset.interp(i=dataset.i[:10], j=dataset.j[j_midpoint : j_midpoint + 10])
-        else:
-            raise ValueError("Cannot decimate this grid: too many dimensions")
-
-        if "time" in dataset.dims and time_span is not None:
-            result = result.sel(time=slice(*time_span))
-            if result.time.size == 0:
-                result = None
-
-        return result
-
-    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
-        """
-        Create the output filename for the dataset.
-
-        Parameters
-        ----------
-        ds
-            Loaded dataset
-
-        Returns
-        -------
-            The output filename
-        """
-        output_path = (
-            Path(os.path.join(*[metadata[item] for item in self.obs4mips_path_items]))
-            / f"v{metadata['version']}"
-        )
-        if ds_filename.name.split("_")[0] == ds.variable_id:
-            filename_prefix = "_".join([metadata[item] for item in self.obs4mips_filename_paths])
-        else:
-            filename_prefix = ds_filename.name.split("_")[0] + "_"
-            filename_prefix += "_".join(
-                [metadata[item] for item in self.obs4mips_filename_paths if item != "variable_id"]
-            )
-
-        if "time" in ds.dims:
-            time_range = (
-                f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
-            )
-            filename = f"{filename_prefix}_{time_range}.nc"
-        else:
-            filename = f"{filename_prefix}.nc"
-
-        return output_path / filename
+OUTPUT_PATH = Path("data")
 
 
 def fetch_datasets(request: DataRequest) -> pd.DataFrame:
@@ -272,15 +16,12 @@ def fetch_datasets(request: DataRequest) -> pd.DataFrame:
 
     Parameters
     ----------
-    search_facets
-        Facets to search for
-    remove_ensembles
-        Whether to remove ensembles from the dataset
-        (i.e. include only a single ensemble member)
+    request
+        The request object
 
     Returns
     -------
-    List of paths to the fetched datasets
+        DataFrame that contains metadata and paths to the fetched datasets
     """
     cat = ESGFCatalog()
 
@@ -351,7 +92,7 @@ def create_sample_dataset(request: DataRequest):
             ds_decimated.to_netcdf(output_filename)
 
     # Regenerate the registry.txt file
-    pooch.make_registry(OUTPUT_PATH, "registry.txt")
+    pooch.make_registry(str(OUTPUT_PATH), "registry.txt")
 
 
 if __name__ == "__main__":
diff --git a/src/sample_data/__init__.py b/src/sample_data/__init__.py
new file mode 100644
index 00000000..dfc754d4
--- /dev/null
+++ b/src/sample_data/__init__.py
@@ -0,0 +1,14 @@
+"""
+REF sample data
+"""
+
+import importlib.metadata
+
+__version__ = importlib.metadata.version("sample_data")
+
+
+from .data_request.base import DataRequest
+from .data_request.cmip6 import CMIP6Request
+from .data_request.obs4mips import Obs4MIPsRequest
+
+__all__ = ["DataRequest", "CMIP6Request", "Obs4MIPsRequest"]
diff --git a/src/sample_data/data_request/__init__.py b/src/sample_data/data_request/__init__.py
new file mode 100644
index 00000000..30cc3a36
--- /dev/null
+++ b/src/sample_data/data_request/__init__.py
@@ -0,0 +1,5 @@
+"""
+Data requests
+
+Provides an abstraction over the different possible data queries that intake-esgf can perform
+"""
diff --git a/src/sample_data/data_request/base.py b/src/sample_data/data_request/base.py
new file mode 100644
index 00000000..1c1cabd5
--- /dev/null
+++ b/src/sample_data/data_request/base.py
@@ -0,0 +1,27 @@
+import pathlib
+from typing import Protocol
+
+import pandas as pd
+import xarray as xr
+
+
+class DataRequest(Protocol):
+    """
+    Represents a request for a dataset
+
+    A polymorphic association is used to capture the different types of datasets as each
+    dataset type may have different metadata fields and may need to be handled
+    differently to generate the sample data.
+    """
+
+    facets: dict[str, str | tuple[str, ...]]
+    remove_ensembles: bool
+    time_span: tuple[str, str]
+
+    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
+        """Downscale the dataset to a smaller size."""
+        ...
+
+    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
+        """Create the output filename for the dataset."""
+        ...
diff --git a/src/sample_data/data_request/cmip6.py b/src/sample_data/data_request/cmip6.py
new file mode 100644
index 00000000..6a1dd018
--- /dev/null
+++ b/src/sample_data/data_request/cmip6.py
@@ -0,0 +1,125 @@
+import os.path
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+import xarray as xr
+
+from sample_data.data_request.base import DataRequest
+
+
+class CMIP6Request(DataRequest):
+    """
+    Represents a CMIP6 dataset request
+
+    """
+
+    cmip6_path_items = (
+        "mip_era",
+        "activity_drs",
+        "institution_id",
+        "source_id",
+        "experiment_id",
+        "member_id",
+        "table_id",
+        "variable_id",
+        "grid_label",
+    )
+
+    cmip6_filename_paths = (
+        "variable_id",
+        "table_id",
+        "source_id",
+        "experiment_id",
+        "member_id",
+        "grid_label",
+    )
+
+    def __init__(self, facets: dict[str, Any], remove_ensembles: bool, time_span: tuple[str, str] | None):
+        self.avail_facets = [
+            "mip_era",
+            "activity_drs",
+            "institution_id",
+            "source_id",
+            "experiment_id",
+            "member_id",
+            "table_id",
+            "variable_id",
+            "grid_label",
+            "version",
+            "data_node",
+        ]
+
+        self.facets = facets
+        self.remove_ensembles = remove_ensembles
+        self.time_span = time_span
+
+        assert all(key in self.avail_facets for key in self.cmip6_path_items), "Error message"
+        assert all(key in self.avail_facets for key in self.cmip6_filename_paths), "Error message"
+
+    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
+        """
+        Downscale the dataset to a smaller size.
+
+        Parameters
+        ----------
+        dataset
+            The dataset to downscale
+        time_span
+            The time span to extract from a dataset
+
+        Returns
+        -------
+        xr.Dataset
+            The downscaled dataset
+        """
+        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
+        has_ij = "i" in dataset.dims and "j" in dataset.dims
+
+        if has_latlon:
+            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
+            result = dataset.interp(lat=dataset.lat[:10], lon=dataset.lon[:10])
+        elif has_ij:
+            # 2d lat/lon grid (generally ocean variables)
+            # Choose a starting point around the middle of the grid to maximise chance that it has values
+            # TODO: Be smarter about this?
+            j_midpoint = len(dataset.j) // 2
+            result = dataset.interp(i=dataset.i[:10], j=dataset.j[j_midpoint : j_midpoint + 10])
+        else:
+            raise ValueError("Cannot decimate this grid: too many dimensions")
+
+        if "time" in dataset.dims and time_span is not None:
+            result = result.sel(time=slice(*time_span))
+            if result.time.size == 0:
+                result = None
+
+        return result
+
+    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> Path:
+        """
+        Create the output filename for the dataset.
+
+        Parameters
+        ----------
+        ds
+            Loaded dataset
+
+        Returns
+        -------
+            The output filename
+        """
+        output_path = (
+            Path(os.path.join(*[metadata[item] for item in self.cmip6_path_items]))
+            / f"v{metadata['version']}"
+        )
+        filename_prefix = "_".join([metadata[item] for item in self.cmip6_filename_paths])
+
+        if "time" in ds.dims:
+            time_range = (
+                f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
+            )
+            filename = f"{filename_prefix}_{time_range}.nc"
+        else:
+            filename = f"{filename_prefix}.nc"
+
+        return output_path / filename
diff --git a/src/sample_data/data_request/obs4mips.py b/src/sample_data/data_request/obs4mips.py
new file mode 100644
index 00000000..41ba0080
--- /dev/null
+++ b/src/sample_data/data_request/obs4mips.py
@@ -0,0 +1,136 @@
+import os.path
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+import xarray as xr
+
+from sample_data.data_request.base import DataRequest
+
+
+class Obs4MIPsRequest(DataRequest):
+    """
+    Represents an Obs4MIPs dataset request
+    """
+
+    obs4mips_path_items = (
+        "activity_id",
+        "institution_id",
+        "source_id",
+        "variable_id",
+        "grid_label",
+    )
+
+    obs4mips_filename_paths = (
+        "variable_id",
+        "source_id",
+        "grid_label",
+    )
+
+    def __init__(self, facets: dict[str, Any], remove_ensembles: bool, time_span: tuple[str, str] | None):
+        self.avail_facets = [
+            "activity_id",
+            "institution_id",
+            "source_id",
+            "frequency",
+            "variable_id",
+            "grid_label",
+            "version",
+            "data_node",
+        ]
+
+        self.facets = facets
+        self.remove_ensembles = remove_ensembles
+        self.time_span = time_span
+
+        super().__init__(remove_ensembles, time_span)
+
+        self.obs4mips_path_items = [
+            "activity_id",
+            "institution_id",
+            "source_id",
+            "variable_id",
+            "grid_label",
+        ]
+
+        self.obs4mips_filename_paths = [
+            "variable_id",
+            "source_id",
+            "grid_label",
+        ]
+
+        assert all(key in self.avail_facets for key in self.obs4mips_path_items), "Error message"
+        assert all(key in self.avail_facets for key in self.obs4mips_filename_paths), "Error message"
+
+    def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | None) -> xr.Dataset | None:
+        """
+        Downscale the dataset to a smaller size.
+
+        Parameters
+        ----------
+        dataset
+            The dataset to downscale
+        time_span
+            The time span to extract from a dataset
+
+        Returns
+        -------
+        xr.Dataset
+            The downscaled dataset
+        """
+        has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
+        has_ij = "i" in dataset.dims and "j" in dataset.dims
+
+        if has_latlon:
+            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
+            result = dataset.interp(lat=dataset.lat[:10], lon=dataset.lon[:10])
+        elif has_ij:
+            # 2d lat/lon grid (generally ocean variables)
+            # Choose a starting point around the middle of the grid to maximise chance that it has values
+            # TODO: Be smarter about this?
+            j_midpoint = len(dataset.j) // 2
+            result = dataset.interp(i=dataset.i[:10], j=dataset.j[j_midpoint : j_midpoint + 10])
+        else:
+            raise ValueError("Cannot decimate this grid: too many dimensions")
+
+        if "time" in dataset.dims and time_span is not None:
+            result = result.sel(time=slice(*time_span))
+            if result.time.size == 0:
+                result = None
+
+        return result
+
+    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> Path:
+        """
+        Create the output filename for the dataset.
+
+        Parameters
+        ----------
+        ds
+            Loaded dataset
+
+        Returns
+        -------
+            The output filename
+        """
+        output_path = (
+            Path(os.path.join(*[metadata[item] for item in self.obs4mips_path_items]))
+            / f"v{metadata['version']}"
+        )
+        if ds_filename.name.split("_")[0] == ds.variable_id:
+            filename_prefix = "_".join([metadata[item] for item in self.obs4mips_filename_paths])
+        else:
+            filename_prefix = ds_filename.name.split("_")[0] + "_"
+            filename_prefix += "_".join(
+                [metadata[item] for item in self.obs4mips_filename_paths if item != "variable_id"]
+            )
+
+        if "time" in ds.dims:
+            time_range = (
+                f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
+            )
+            filename = f"{filename_prefix}_{time_range}.nc"
+        else:
+            filename = f"{filename_prefix}.nc"
+
+        return output_path / filename
diff --git a/uv.lock b/uv.lock
index 4f771d9a..ab0ad311 100644
--- a/uv.lock
+++ b/uv.lock
@@ -244,7 +244,7 @@ wheels = [
 [[package]]
 name = "cmip-ref-sample-data"
 version = "0.3.2"
-source = { virtual = "." }
+source = { editable = "." }
 dependencies = [
     { name = "intake-esgf" },
     { name = "matplotlib" },

From 1563a566e629b328fe335014544040bf13f4d89d Mon Sep 17 00:00:00 2001
From: Jared Lewis <jared@jared.kiwi.nz>
Date: Thu, 27 Feb 2025 17:34:53 +1100
Subject: [PATCH 2/6] chore: Refactor as a typer app

---
 pyproject.toml                                |   7 +-
 scripts/fetch_test_data.py                    | 179 ++++++++++--------
 .../__init__.py                               |   2 +-
 .../data_request/__init__.py                  |   0
 .../data_request/base.py                      |   2 +-
 .../data_request/cmip6.py                     |   4 +-
 .../data_request/obs4mips.py                  |   4 +-
 uv.lock                                       | 108 +++++++----
 8 files changed, 175 insertions(+), 131 deletions(-)
 mode change 100644 => 100755 scripts/fetch_test_data.py
 rename src/{sample_data => ref_sample_data}/__init__.py (80%)
 rename src/{sample_data => ref_sample_data}/data_request/__init__.py (100%)
 rename src/{sample_data => ref_sample_data}/data_request/base.py (87%)
 rename src/{sample_data => ref_sample_data}/data_request/cmip6.py (95%)
 rename src/{sample_data => ref_sample_data}/data_request/obs4mips.py (96%)

diff --git a/pyproject.toml b/pyproject.toml
index 65872512..fb7bdf31 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [project]
-name = "cmip-ref-sample-data"
+name = "ref-sample-data"
 version = "0.3.2"
 description = "CMIP Rapid Evaluation Framework Sample Data"
 readme = "README.md"
@@ -13,6 +13,7 @@ dependencies = [
     "matplotlib>=3.10.0",
     "scipy>=1.15.0",
     "xarray>=2024.10.0",
+    "typer>=0.15.1",
 ]
 
 [project.license]
@@ -33,10 +34,8 @@ dev-dependencies = [
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 
-
-
 [tool.coverage.run]
-source = ["packages"]
+source = ["src"]
 branch = true
 
 [tool.coverage.report]
diff --git a/scripts/fetch_test_data.py b/scripts/fetch_test_data.py
old mode 100644
new mode 100755
index e3f7d727..4cd5cd79
--- a/scripts/fetch_test_data.py
+++ b/scripts/fetch_test_data.py
@@ -1,13 +1,16 @@
+import pathlib
 from pathlib import Path
 
 import pandas as pd
 import pooch
+import typer
 import xarray as xr
 from intake_esgf import ESGFCatalog
 
-from sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest
+from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest
 
 OUTPUT_PATH = Path("data")
+app = typer.Typer()
 
 
 def fetch_datasets(request: DataRequest) -> pd.DataFrame:
@@ -37,7 +40,7 @@ def fetch_datasets(request: DataRequest) -> pd.DataFrame:
     return merged_df
 
 
-def deduplicate_datasets(request: DataRequest) -> pd.DataFrame:
+def deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
     """
     Deduplicate a dataset collection.
 
@@ -54,7 +57,6 @@ def deduplicate_datasets(request: DataRequest) -> pd.DataFrame:
     pd.DataFrame
         The deduplicated dataset collection spanning the times requested
     """
-    datasets = fetch_datasets(request)
 
     def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
         first = group.iloc[0].copy()
@@ -66,28 +68,36 @@ def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
     return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()
 
 
-def create_sample_dataset(request: DataRequest):
+def process_sample_data_request(request: DataRequest, decimate: bool, output_directory: Path) -> None:
     """
-    Create the output filename for the dataset.
+    Fetch and create sample datasets
 
     Parameters
     ----------
-    ds
-        Loaded dataset
+    request
+        The request to execute
 
-    Returns
-    -------
-        The output filename
+        This may be different types of requests, such as CMIP6Request or Obs4MIPsRequest.
+    decimate
+        Whether to decimate the datasets
+    output_directory
+        The directory to write the output to
     """
-    datasets = deduplicate_datasets(request)
+    datasets = fetch_datasets(request)
+    datasets = deduplicate_datasets(datasets)
+
     for _, dataset in datasets.iterrows():
         for ds_filename in dataset["files"]:
             ds_orig = xr.open_dataset(ds_filename)
-            ds_decimated = request.decimate_dataset(ds_orig, request.time_span)
+
+            if decimate:
+                ds_decimated = request.decimate_dataset(ds_orig, request.time_span)
+            else:
+                ds_decimated = ds_orig
             if ds_decimated is None:
                 continue
 
-            output_filename = OUTPUT_PATH / request.create_out_filename(dataset, ds_decimated, ds_filename)
+            output_filename = output_directory / request.generate_filename(dataset, ds_decimated, ds_filename)
             output_filename.parent.mkdir(parents=True, exist_ok=True)
             ds_decimated.to_netcdf(output_filename)
 
@@ -95,77 +105,86 @@ def create_sample_dataset(request: DataRequest):
     pooch.make_registry(str(OUTPUT_PATH), "registry.txt")
 
 
-if __name__ == "__main__":
-    datasets_to_fetch = [
-        # Example metric data
-        CMIP6Request(
-            facets=dict(
-                source_id="ACCESS-ESM1-5",
-                frequency=["fx", "mon"],
-                variable_id=["areacella", "tas", "tos", "rsut", "rlut", "rsdt"],
-                experiment_id=["ssp126", "historical"],
-            ),
-            remove_ensembles=True,
-            time_span=("2000", "2025"),
+DATASETS_TO_FETCH = [
+    # Example metric data
+    CMIP6Request(
+        facets=dict(
+            source_id="ACCESS-ESM1-5",
+            frequency=["fx", "mon"],
+            variable_id=["areacella", "tas", "tos", "rsut", "rlut", "rsdt"],
+            experiment_id=["ssp126", "historical"],
         ),
-        # ESMValTool ECS data
-        CMIP6Request(
-            facets=dict(
-                source_id="ACCESS-ESM1-5",
-                frequency=["fx", "mon"],
-                variable_id=["areacella", "rlut", "rsdt", "rsut", "tas"],
-                experiment_id=["abrupt-4xCO2", "piControl"],
-            ),
-            remove_ensembles=True,
-            time_span=("0101", "0125"),
+        remove_ensembles=True,
+        time_span=("2000", "2025"),
+    ),
+    # ESMValTool ECS data
+    CMIP6Request(
+        facets=dict(
+            source_id="ACCESS-ESM1-5",
+            frequency=["fx", "mon"],
+            variable_id=["areacella", "rlut", "rsdt", "rsut", "tas"],
+            experiment_id=["abrupt-4xCO2", "piControl"],
         ),
-        # ESMValTool TCR data
-        CMIP6Request(
-            facets=dict(
-                source_id="ACCESS-ESM1-5",
-                frequency=["fx", "mon"],
-                variable_id=["areacella", "tas"],
-                experiment_id=["1pctCO2", "piControl"],
-            ),
-            remove_ensembles=True,
-            time_span=("0101", "0180"),
+        remove_ensembles=True,
+        time_span=("0101", "0125"),
+    ),
+    # ESMValTool TCR data
+    CMIP6Request(
+        facets=dict(
+            source_id="ACCESS-ESM1-5",
+            frequency=["fx", "mon"],
+            variable_id=["areacella", "tas"],
+            experiment_id=["1pctCO2", "piControl"],
         ),
-        # ILAMB data
-        CMIP6Request(
-            facets=dict(
-                source_id="ACCESS-ESM1-5",
-                frequency=["fx", "mon"],
-                variable_id=["areacella", "sftlf", "gpp", "pr"],
-                experiment_id=["historical"],
-            ),
-            remove_ensembles=True,
-            time_span=("2000", "2025"),
+        remove_ensembles=True,
+        time_span=("0101", "0180"),
+    ),
+    # ILAMB data
+    CMIP6Request(
+        facets=dict(
+            source_id="ACCESS-ESM1-5",
+            frequency=["fx", "mon"],
+            variable_id=["areacella", "sftlf", "gpp", "pr"],
+            experiment_id=["historical"],
         ),
-        # PMP PDO data
-        CMIP6Request(
-            facets=dict(
-                source_id="ACCESS-ESM1-5",
-                frequency=["fx", "mon"],
-                variable_id=["areacella", "ts"],
-                experiment_id=["historical", "hist-GHG"],
-                variant_label=["r1i1p1f1", "r2i1p1f1"],
-            ),
-            remove_ensembles=False,
-            time_span=("2000", "2025"),
+        remove_ensembles=True,
+        time_span=("2000", "2025"),
+    ),
+    # PMP PDO data
+    CMIP6Request(
+        facets=dict(
+            source_id="ACCESS-ESM1-5",
+            frequency=["fx", "mon"],
+            variable_id=["areacella", "ts"],
+            experiment_id=["historical", "hist-GHG"],
+            variant_label=["r1i1p1f1", "r2i1p1f1"],
         ),
-        # Obs4MIPs AIRS data
-        Obs4MIPsRequest(
-            facets=dict(
-                project="obs4MIPs",
-                institution_id="NASA-JPL",
-                frequency="mon",
-                source_id="AIRS-2-1",
-                variable_id="ta",
-            ),
-            remove_ensembles=False,
-            time_span=("2002", "2016"),
+        remove_ensembles=False,
+        time_span=("2000", "2025"),
+    ),
+    # Obs4MIPs AIRS data
+    Obs4MIPsRequest(
+        facets=dict(
+            project="obs4MIPs",
+            institution_id="NASA-JPL",
+            frequency="mon",
+            source_id="AIRS-2-1",
+            variable_id="ta",
         ),
-    ]
+        remove_ensembles=False,
+        time_span=("2002", "2016"),
+    ),
+]
+
 
-    for dataset_requested in datasets_to_fetch:
-        create_sample_dataset(dataset_requested)
+@app.command()
+def create_sample_data(decimate: bool = True, output: Path = OUTPUT_PATH) -> None:
+    """Fetch and create sample datasets"""
+    for dataset_requested in DATASETS_TO_FETCH:
+        process_sample_data_request(
+            dataset_requested, decimate=decimate, output_directory=pathlib.Path(output)
+        )
+
+
+if __name__ == "__main__":
+    app()
diff --git a/src/sample_data/__init__.py b/src/ref_sample_data/__init__.py
similarity index 80%
rename from src/sample_data/__init__.py
rename to src/ref_sample_data/__init__.py
index dfc754d4..c33270f0 100644
--- a/src/sample_data/__init__.py
+++ b/src/ref_sample_data/__init__.py
@@ -4,7 +4,7 @@
 
 import importlib.metadata
 
-__version__ = importlib.metadata.version("sample_data")
+__version__ = importlib.metadata.version("ref_sample_data")
 
 
 from .data_request.base import DataRequest
diff --git a/src/sample_data/data_request/__init__.py b/src/ref_sample_data/data_request/__init__.py
similarity index 100%
rename from src/sample_data/data_request/__init__.py
rename to src/ref_sample_data/data_request/__init__.py
diff --git a/src/sample_data/data_request/base.py b/src/ref_sample_data/data_request/base.py
similarity index 87%
rename from src/sample_data/data_request/base.py
rename to src/ref_sample_data/data_request/base.py
index 1c1cabd5..a2f7e751 100644
--- a/src/sample_data/data_request/base.py
+++ b/src/ref_sample_data/data_request/base.py
@@ -22,6 +22,6 @@ def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | Non
         """Downscale the dataset to a smaller size."""
         ...
 
-    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
+    def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> pathlib.Path:
         """Create the output filename for the dataset."""
         ...
diff --git a/src/sample_data/data_request/cmip6.py b/src/ref_sample_data/data_request/cmip6.py
similarity index 95%
rename from src/sample_data/data_request/cmip6.py
rename to src/ref_sample_data/data_request/cmip6.py
index 6a1dd018..922dfa62 100644
--- a/src/sample_data/data_request/cmip6.py
+++ b/src/ref_sample_data/data_request/cmip6.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import xarray as xr
 
-from sample_data.data_request.base import DataRequest
+from ref_sample_data.data_request.base import DataRequest
 
 
 class CMIP6Request(DataRequest):
@@ -95,7 +95,7 @@ def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | Non
 
         return result
 
-    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> Path:
+    def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> Path:
         """
         Create the output filename for the dataset.
 
diff --git a/src/sample_data/data_request/obs4mips.py b/src/ref_sample_data/data_request/obs4mips.py
similarity index 96%
rename from src/sample_data/data_request/obs4mips.py
rename to src/ref_sample_data/data_request/obs4mips.py
index 41ba0080..1d9c6fe9 100644
--- a/src/sample_data/data_request/obs4mips.py
+++ b/src/ref_sample_data/data_request/obs4mips.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import xarray as xr
 
-from sample_data.data_request.base import DataRequest
+from ref_sample_data.data_request.base import DataRequest
 
 
 class Obs4MIPsRequest(DataRequest):
@@ -100,7 +100,7 @@ def decimate_dataset(self, dataset: xr.Dataset, time_span: tuple[str, str] | Non
 
         return result
 
-    def create_out_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> Path:
+    def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: str) -> Path:
         """
         Create the output filename for the dataset.
 
diff --git a/uv.lock b/uv.lock
index ab0ad311..99e2665b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -241,47 +241,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/48/41/e1d85ca3cab0b674e277c8c4f678cf66a91cd2cecf93df94353a606fe0db/cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e", size = 22021 },
 ]
 
-[[package]]
-name = "cmip-ref-sample-data"
-version = "0.3.2"
-source = { editable = "." }
-dependencies = [
-    { name = "intake-esgf" },
-    { name = "matplotlib" },
-    { name = "pooch" },
-    { name = "scipy" },
-    { name = "xarray" },
-]
-
-[package.dev-dependencies]
-dev = [
-    { name = "bump-my-version" },
-    { name = "liccheck" },
-    { name = "pip" },
-    { name = "pre-commit" },
-    { name = "ruff" },
-    { name = "towncrier" },
-]
-
-[package.metadata]
-requires-dist = [
-    { name = "intake-esgf" },
-    { name = "matplotlib", specifier = ">=3.10.0" },
-    { name = "pooch" },
-    { name = "scipy", specifier = ">=1.15.0" },
-    { name = "xarray", specifier = ">=2024.10.0" },
-]
-
-[package.metadata.requires-dev]
-dev = [
-    { name = "bump-my-version", specifier = ">=0.29.0" },
-    { name = "liccheck", specifier = ">=0.9.2" },
-    { name = "pip", specifier = ">=24.3.1" },
-    { name = "pre-commit", specifier = ">=3.3.1" },
-    { name = "ruff", specifier = ">=0.6.9" },
-    { name = "towncrier", specifier = ">=24.8.0" },
-]
-
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -1478,6 +1437,49 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ad/3f/11dd4cd4f39e05128bfd20138faea57bec56f9ffba6185d276e3107ba5b2/questionary-2.1.0-py3-none-any.whl", hash = "sha256:44174d237b68bc828e4878c763a9ad6790ee61990e0ae72927694ead57bab8ec", size = 36747 },
 ]
 
+[[package]]
+name = "ref-sample-data"
+version = "0.3.2"
+source = { editable = "." }
+dependencies = [
+    { name = "intake-esgf" },
+    { name = "matplotlib" },
+    { name = "pooch" },
+    { name = "scipy" },
+    { name = "typer" },
+    { name = "xarray" },
+]
+
+[package.dev-dependencies]
+dev = [
+    { name = "bump-my-version" },
+    { name = "liccheck" },
+    { name = "pip" },
+    { name = "pre-commit" },
+    { name = "ruff" },
+    { name = "towncrier" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "intake-esgf" },
+    { name = "matplotlib", specifier = ">=3.10.0" },
+    { name = "pooch" },
+    { name = "scipy", specifier = ">=1.15.0" },
+    { name = "typer", specifier = ">=0.15.1" },
+    { name = "xarray", specifier = ">=2024.10.0" },
+]
+
+[package.metadata.requires-dev]
+dev = [
+    { name = "bump-my-version", specifier = ">=0.29.0" },
+    { name = "liccheck", specifier = ">=0.9.2" },
+    { name = "pip", specifier = ">=24.3.1" },
+    { name = "pre-commit", specifier = ">=3.3.1" },
+    { name = "ruff", specifier = ">=0.6.9" },
+    { name = "towncrier", specifier = ">=24.8.0" },
+]
+
 [[package]]
 name = "requests"
 version = "2.32.3"
@@ -1605,6 +1607,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6a/23/8146aad7d88f4fcb3a6218f41a60f6c2d4e3a72de72da1825dc7c8f7877c/semantic_version-2.10.0-py2.py3-none-any.whl", hash = "sha256:de78a3b8e0feda74cabc54aab2da702113e33ac9d9eb9d2389bcf1f58b7d9177", size = 15552 },
 ]
 
+[[package]]
+name = "shellingham"
+version = "1.5.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755 },
+]
+
 [[package]]
 name = "six"
 version = "1.16.0"
@@ -1704,6 +1715,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 },
 ]
 
+[[package]]
+name = "typer"
+version = "0.15.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "click" },
+    { name = "rich" },
+    { name = "shellingham" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cb/ce/dca7b219718afd37a0068f4f2530a727c2b74a8b6e8e0c0080a4c0de4fcd/typer-0.15.1.tar.gz", hash = "sha256:a0588c0a7fa68a1978a069818657778f86abe6ff5ea6abf472f940a08bfe4f0a", size = 99789 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/cc/0a838ba5ca64dc832aa43f727bd586309846b0ffb2ce52422543e6075e8a/typer-0.15.1-py3-none-any.whl", hash = "sha256:7994fb7b8155b64d3402518560648446072864beefd44aa2dc36972a5972e847", size = 44908 },
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.12.2"

From 5094c8ac0d48e57de0ad4961b554fc3fc2e13b9a Mon Sep 17 00:00:00 2001
From: Jared Lewis <jared@jared.kiwi.nz>
Date: Thu, 27 Feb 2025 17:36:43 +1100
Subject: [PATCH 3/6] chore: Ignore expected output directory

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 3eb1fc7c..f281cf16 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,4 +151,5 @@ dmypy.json
 
 # Generated output
 out
+data-raw
 .ref

From 541ecc1edd9e7118dc80ed925e932235bd6323a5 Mon Sep 17 00:00:00 2001
From: Jared Lewis <jared@jared.kiwi.nz>
Date: Thu, 27 Feb 2025 17:38:03 +1100
Subject: [PATCH 4/6] docs: Changelog

---
 changelog/16.feature.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog/16.feature.md

diff --git a/changelog/16.feature.md b/changelog/16.feature.md
new file mode 100644
index 00000000..64559109
--- /dev/null
+++ b/changelog/16.feature.md
@@ -0,0 +1 @@
+Allow for the fetching of non-decimated datasets

From dc7bb575ea41815786cdecb054cc8d1597821cbd Mon Sep 17 00:00:00 2001
From: Jared Lewis <jared@jared.kiwi.nz>
Date: Thu, 27 Feb 2025 17:50:29 +1100
Subject: [PATCH 5/6] chore: Optionally cache ESGF data

---
 .github/actions/regenerate/action.yml |  1 +
 .github/actions/setup/action.yml      | 11 +++++++++++
 .github/workflows/ci.yaml             |  1 +
 3 files changed, 13 insertions(+)

diff --git a/.github/actions/regenerate/action.yml b/.github/actions/regenerate/action.yml
index dfc25062..e3f70817 100644
--- a/.github/actions/regenerate/action.yml
+++ b/.github/actions/regenerate/action.yml
@@ -6,6 +6,7 @@ runs:
     - uses: ./.github/actions/setup
       with:
         python-version: 3.12
+        cache-esgf: true
 
     - name: Verify registry
       shell: bash
diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index 898b566e..f4bbf7c0 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -9,6 +9,10 @@ inputs:
     description: "The version of uv to use"
     required: true
     default: ">=0.4.20"
+  cache-esgf:
+    description: "Cache any downloaded ESGF data"
+    required: false
+    default: "false"
 
 runs:
   using: "composite"
@@ -27,3 +31,14 @@ runs:
       shell: bash
       run: |
         uv sync --all-extras --dev --locked
+    - name: Cache downloaded ESGF data
+      uses: actions/cache@v4
+      if: ${{ inputs.cache-esgf == 'true' }}
+      with:
+        path: |
+          ~/.esgf
+        # A cache entry is never overwritten once its key exists, so key on the
+        # fetch script (which lists the datasets) and fall back to an older cache.
+        key: esgf-${{ hashFiles('scripts/fetch_test_data.py') }}
+        restore-keys: |
+          esgf-
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 82a109de..cd5cda4f 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -36,6 +36,7 @@ jobs:
       - uses: ./.github/actions/setup
         with:
           python-version: ${{ matrix.python-version }}
+          cache-esgf: true
 
       - name: Verify registry
         run: |

From f256b0e48d5ed0b19217c06d51afc6f46097c27a Mon Sep 17 00:00:00 2001
From: Jared Lewis <jared@jared.kiwi.nz>
Date: Thu, 27 Feb 2025 21:04:54 +1100
Subject: [PATCH 6/6] chore: Add quiet mode on CI

---
 .github/actions/regenerate/action.yml |  2 ++
 .github/workflows/ci.yaml             |  2 ++
 scripts/fetch_test_data.py            | 23 +++++++++++++++++------
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/.github/actions/regenerate/action.yml b/.github/actions/regenerate/action.yml
index e3f70817..0ad55a89 100644
--- a/.github/actions/regenerate/action.yml
+++ b/.github/actions/regenerate/action.yml
@@ -10,6 +10,8 @@ runs:
 
     - name: Verify registry
       shell: bash
+      env:
+        QUIET: true
       run: |
         git config --global user.name "$GITHUB_ACTOR"
         git config --global user.email "$CI_COMMIT_EMAIL"
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index cd5cda4f..46629d46 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -39,6 +39,8 @@ jobs:
           cache-esgf: true
 
       - name: Verify registry
+        env:
+          QUIET: true
         run: |
           make fetch-test-data
           git diff --exit-code
diff --git a/scripts/fetch_test_data.py b/scripts/fetch_test_data.py
index 4cd5cd79..64cc9494 100755
--- a/scripts/fetch_test_data.py
+++ b/scripts/fetch_test_data.py
@@ -1,5 +1,6 @@
 import pathlib
 from pathlib import Path
+from typing import Annotated
 
 import pandas as pd
 import pooch
@@ -13,7 +14,7 @@
 app = typer.Typer()
 
 
-def fetch_datasets(request: DataRequest) -> pd.DataFrame:
+def fetch_datasets(request: DataRequest, quiet: bool) -> pd.DataFrame:
     """
     Fetch the datasets from ESGF.
 
@@ -21,6 +22,8 @@ def fetch_datasets(request: DataRequest) -> pd.DataFrame:
     ----------
     request
         The request object
+    quiet
+        Whether to suppress progress messages from intake-esgf
 
     Returns
     -------
@@ -32,7 +35,7 @@ def fetch_datasets(request: DataRequest) -> pd.DataFrame:
     if request.remove_ensembles:
         cat.remove_ensembles()
 
-    path_dict = cat.to_path_dict(prefer_streaming=False, minimal_keys=False)
+    path_dict = cat.to_path_dict(prefer_streaming=False, minimal_keys=False, quiet=quiet)
     merged_df = cat.df.merge(pd.Series(path_dict, name="files"), left_on="key", right_index=True)
     if request.time_span:
         merged_df["time_start"] = request.time_span[0]
@@ -68,7 +71,9 @@ def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
     return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()
 
 
-def process_sample_data_request(request: DataRequest, decimate: bool, output_directory: Path) -> None:
+def process_sample_data_request(
+    request: DataRequest, decimate: bool, output_directory: Path, quiet: bool
+) -> None:
     """
     Fetch and create sample datasets
 
@@ -82,8 +87,10 @@ def process_sample_data_request(request: DataRequest, decimate: bool, output_dir
         Whether to decimate the datasets
     output_directory
         The directory to write the output to
+    quiet
+        Whether to suppress progress messages
     """
-    datasets = fetch_datasets(request)
+    datasets = fetch_datasets(request, quiet)
     datasets = deduplicate_datasets(datasets)
 
     for _, dataset in datasets.iterrows():
@@ -178,11 +185,15 @@ def process_sample_data_request(request: DataRequest, decimate: bool, output_dir
 
 
 @app.command()
-def create_sample_data(decimate: bool = True, output: Path = OUTPUT_PATH) -> None:
+def create_sample_data(
+    decimate: bool = True,
+    output: Path = OUTPUT_PATH,
+    quiet: Annotated[bool, typer.Option(envvar="QUIET")] = False,  # --quiet flag; CI sets QUIET env var
+) -> None:
     """Fetch and create sample datasets"""
     for dataset_requested in DATASETS_TO_FETCH:
         process_sample_data_request(
-            dataset_requested, decimate=decimate, output_directory=pathlib.Path(output)
+            dataset_requested, decimate=decimate, output_directory=pathlib.Path(output), quiet=quiet
         )