diff --git a/cf_xarray/accessor.py b/cf_xarray/accessor.py
index c89cc0bf..2d828963 100644
--- a/cf_xarray/accessor.py
+++ b/cf_xarray/accessor.py
@@ -556,6 +556,41 @@ def _parse_grid_mapping_attribute(
     return Frozen(result)
 
 
+def _hashable_attrs(attrs: Mapping[Any, Any]) -> tuple:
+    """Return an order-independent tuple representation of an attrs mapping.
+
+    List- and array-valued attributes (e.g. ``standard_parallel``) are coerced
+    to tuples so the result can be used as an ``lru_cache`` key.
+
+    Only top-level values are converted. Nested lists or mapping-valued
+    attributes are left untouched, so the result may still be unhashable;
+    callers must check with ``hash()`` before using it as a cache key.
+    """
+    frozen = []
+    for key, value in attrs.items():
+        if hasattr(value, "tolist"):  # numpy scalars/arrays
+            value = value.tolist()
+        if isinstance(value, list | tuple):
+            value = tuple(value)
+        frozen.append((key, value))
+    frozen.sort(key=lambda kv: repr(kv[0]))
+    return tuple(frozen)
+
+
+@functools.lru_cache(maxsize=256)
+def _crs_from_cf_attrs(attrs_items: tuple) -> Any:
+    """Build a ``pyproj.CRS`` from frozen CF grid-mapping attrs (memoized).
+
+    ``pyproj.CRS.from_cf`` re-parses the datum/ellipsoid on every call, which is
+    expensive for grid mappings carrying explicit ellipsoid parameters (e.g.
+    geostationary). A dataset routinely references the same grid mapping from
+    many variables, so cache on the attribute items.
+    """
+    import pyproj
+
+    return pyproj.CRS.from_cf(dict(attrs_items))
+
+
 def _create_grid_mapping(
     var_name: str,
     ds: Dataset,
@@ -669,7 +704,15 @@ def _create_grid_mapping(
             }
         )
     else:
-        crs = pyproj.CRS.from_cf(var.attrs)
+        attrs_key = _hashable_attrs(var.attrs)
+        try:
+            hash(attrs_key)
+        except TypeError:
+            # Nested or mapping-valued attrs cannot be an ``lru_cache`` key;
+            # build the CRS directly, as was done before memoization.
+            crs = pyproj.CRS.from_cf(var.attrs)
+        else:
+            crs = _crs_from_cf_attrs(attrs_key)
 
     # Get associated coordinate variables, fallback to dimension names
     coordinates: list[Hashable] = grid_mapping_dict.get(var_name, [])
diff --git a/cf_xarray/tests/test_accessor.py b/cf_xarray/tests/test_accessor.py
index d1a06bbd..ef7dee10 100644
--- a/cf_xarray/tests/test_accessor.py
+++ b/cf_xarray/tests/test_accessor.py
@@ -1254,6 +1254,44 @@ def test_grid_mappings_property():
     assert gm.coordinates == ("x", "y")
 
 
+@requires_pyproj
+def test_grid_mappings_crs_construction_is_cached(monkeypatch):
+    """``pyproj.CRS.from_cf`` is memoized per grid-mapping attrs.
+
+    Building the CRS re-parses the datum/ellipsoid on every call. A dataset
+    references the same grid mapping from many variables and the property may
+    be accessed repeatedly, so each distinct grid mapping should be built once.
+    """
+    import pyproj
+
+    from ..accessor import _crs_from_cf_attrs
+
+    _crs_from_cf_attrs.cache_clear()
+
+    from ..datasets import hrrrds
+
+    ds = hrrrds
+
+    calls = {"n": 0}
+    orig = pyproj.CRS.from_cf
+
+    def counting_from_cf(*args, **kwargs):
+        calls["n"] += 1
+        return orig(*args, **kwargs)
+
+    monkeypatch.setattr(pyproj.CRS, "from_cf", staticmethod(counting_from_cf))
+
+    # Repeated property accesses, including via a DataArray, must not rebuild
+    # the same CRS: hrrrds has 3 distinct grid mappings, each built once.
+    ds.cf.grid_mappings
+    ds.cf.grid_mappings
+    ds.foo.cf.grid_mappings
+
+    assert calls["n"] == 3
+
+    _crs_from_cf_attrs.cache_clear()
+
+
 @requires_pyproj
 def test_grid_mappings_coordinates_attribute():
     """Test that coordinates attribute is always populated correctly for DataArray grid mappings."""