Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/actions/regenerate/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ runs:
steps:
- uses: ./.github/actions/setup
with:
python-version: 3.12
cache-esgf: true

- name: Verify registry
Expand Down
10 changes: 4 additions & 6 deletions .github/actions/setup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,11 @@ runs:
- name: Install pixi
uses: prefix-dev/setup-pixi@v0.8.3
with:
pixi-version: "latest"
pixi-version: "v0.40.2"
cache: true
- name: Install the project
shell: bash
run: |
# Only installs if the lock file is up-to-date with the manifest
pixi install --locked
# Frozen is needed as the ref git dependency was not playing nice with a fully locked environment
frozen: true
log-level: "v"
- name: Cache downloaded ESGF data
uses: actions/cache@v4
if: ${{ inputs.cache-esgf == 'true' }}
Expand Down
1 change: 1 addition & 0 deletions changelog/24.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add datasets from PMP that are not yet published on obs4MIPs
Binary file not shown.
Binary file not shown.
643 changes: 642 additions & 1 deletion pixi.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ xesmf = ">=0.8.7,<0.9"
[tool.pixi.pypi-dependencies]
# Add any dependencies that aren't available on conda-forge here
ref_sample_data = { path = ".", editable = true }
# TODO: Pin a release
# This rev includes the PMP reference data
cmip-ref = { git = "https://github.com/Climate-REF/climate-ref", subdirectory = "packages/ref", rev = "7ea9c966fc44b91e4b0e3d8b31f6f2c3f1445677" }

[tool.pixi.feature.dev.dependencies]
ruff = "*"
Expand Down
2 changes: 2 additions & 0 deletions registry.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,5 @@ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/
obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taNobs_AIRS-2-1_gn_200209-201609.nc 3489895fc6cdd936ae64fa64fa221474e50f6b6bf347458c82d9a61f945f2d9d
obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taStderr_AIRS-2-1_gn_200209-201609.nc 81e12ba5c6b058ace93737a3b69b317d2beb17e07fd6aa9f709b3e528ebfb4a2
obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/ta_AIRS-2-1_gn_200209-201609.nc a72d7172cd0c9df9eb0199082b196655490e5628fbb6a61ed1e7f8f83c610c0b
obs4REF/obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc 4f9a9270d001fc30488b49cdafe28e77db88e78e981ab580f0fae209f849a2da
obs4REF/obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc 357e8915cc2ad30af1dd02cbecfb55f3083c13f54a11912e2f28396ccc84bd9c
34 changes: 3 additions & 31 deletions scripts/fetch_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,16 @@
from pathlib import Path
from typing import Annotated

import pandas as pd
import pooch
import typer
import xarray as xr

from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest
from ref_sample_data import CMIP6Request, DataRequest, Obs4MIPsRequest, Obs4REFRequest

OUTPUT_PATH = Path("data")
app = typer.Typer()


def deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
    """
    Deduplicate a dataset collection.

    Uses the metadata from the first dataset in each group,
    but expands the time range to the min/max timespan of the group.

    Parameters
    ----------
    datasets
        The dataset collection

    Returns
    -------
    pd.DataFrame
        The deduplicated dataset collection spanning the times requested
    """
    collapsed: dict = {}
    for group_key, group in datasets.groupby("key"):
        # Drop the grouping column so only the per-dataset metadata remains
        members = group.drop(columns="key")

        # Keep the first row's metadata, widened to the group's full time span
        representative = members.iloc[0].copy()
        representative.time_start = members.time_start.min()
        representative.time_end = members.time_end.max()
        collapsed[group_key] = representative

    result = pd.DataFrame.from_dict(collapsed, orient="index")
    result.index.name = "key"
    return result.reset_index()


def process_sample_data_request(
request: DataRequest, decimate: bool, output_directory: Path, quiet: bool
) -> None:
Expand All @@ -61,7 +32,6 @@ def process_sample_data_request(
Whether to suppress progress messages
"""
datasets = request.fetch_datasets()
datasets = deduplicate_datasets(datasets)

for _, dataset in datasets.iterrows():
for ds_filename in dataset["files"]:
Expand Down Expand Up @@ -183,6 +153,8 @@ def process_sample_data_request(
remove_ensembles=False,
time_span=("2002", "2016"),
),
# All unpublished obs4mips datasets
Obs4REFRequest(),
]


Expand Down
3 changes: 2 additions & 1 deletion src/ref_sample_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@
from .data_request.base import DataRequest
from .data_request.cmip6 import CMIP6Request
from .data_request.obs4mips import Obs4MIPsRequest
from .data_request.obs4ref import Obs4REFRequest

__all__ = ["CMIP6Request", "DataRequest", "Obs4MIPsRequest"]
__all__ = ["CMIP6Request", "DataRequest", "Obs4MIPsRequest", "Obs4REFRequest"]
30 changes: 29 additions & 1 deletion src/ref_sample_data/data_request/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,34 @@ def generate_filename(
...


def _deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
"""
Deduplicate a dataset collection.

Uses the metadata from the first dataset in each group,
but expands the time range to the min/max timespan of the group.

Parameters
----------
datasets
The dataset collection

Returns
-------
pd.DataFrame
The deduplicated dataset collection spanning the times requested
"""

def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
first = group.iloc[0].copy()
first.time_start = group.time_start.min()
first.time_end = group.time_end.max()

return first

return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()


class IntakeESGFDataRequest(DataRequest):
"""
A data request that fetches datasets from ESGF using intake-esgf.
Expand All @@ -56,4 +84,4 @@ def fetch_datasets(self) -> pd.DataFrame:
if self.time_span:
merged_df["time_start"] = self.time_span[0]
merged_df["time_end"] = self.time_span[1]
return merged_df
return _deduplicate_datasets(merged_df)
2 changes: 0 additions & 2 deletions src/ref_sample_data/data_request/cmip6.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,6 @@ def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
----------
dataset
The dataset to downscale
time_span
The time span to extract from a dataset

Returns
-------
Expand Down
2 changes: 0 additions & 2 deletions src/ref_sample_data/data_request/obs4mips.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
----------
dataset
The dataset to downscale
time_span
The time span to extract from a dataset

Returns
-------
Expand Down
88 changes: 88 additions & 0 deletions src/ref_sample_data/data_request/obs4ref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import pathlib
from pathlib import Path

import pandas as pd
import xarray as xr
from cmip_ref.dataset_registry import build_reference_data_registry

from ref_sample_data.data_request.base import DataRequest
from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear


class Obs4REFRequest(DataRequest):
    """
    Fetch the unpublished Obs4MIPs datasets from the PMP registry

    This includes all files that would be downloaded if you ran:
    ```
    ref datasets fetch-obs4ref-data --output-data ...
    ```
    """

    def fetch_datasets(self) -> pd.DataFrame:
        """
        Fetch the datasets from the source

        Returns a dataframe of the metadata and paths to the fetched datasets.
        """
        registry = build_reference_data_registry()

        # One row per registry entry: the registry key plus the locally fetched file
        records = [
            {"key": name, "files": [registry.fetch(name)]}
            for name in registry.registry.keys()
        ]
        return pd.DataFrame(records)

    def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
        """
        Downscale the dataset to a smaller size.

        Parameters
        ----------
        dataset
            The dataset to downscale

        Returns
        -------
        xr.Dataset
            The downscaled dataset
        """
        dims = dataset.dims

        if "lat" in dims and "lon" in dims:
            # 1d lat/lon coordinates are expected for a rectilinear grid
            assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
            return decimate_rectilinear(dataset)

        if "i" in dims and "j" in dims:
            # 2d curvilinear grid (generally ocean variables)
            return decimate_curvilinear(dataset)

        raise ValueError("Cannot decimate this grid: too many dimensions")

    def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: pathlib.Path) -> Path:
        """
        Create the output filename for the dataset.

        Parameters
        ----------
        metadata
            Metadata from the file
        ds
            Loaded dataset

        ds_filename:
            Filename of the dataset (Unused)

        Returns
        -------
        The output filename
        """
        # Registry keys already encode the directory layout; nest them under obs4REF
        return Path("obs4REF") / metadata.key
Loading