From ed1bfd4e9844e0672a68010881188d948c5eca5a Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Sat, 27 Jun 2026 12:53:15 +0200 Subject: [PATCH 1/3] Add open_dataset and open_mfdataset wrappers to mpas_tools.io Add thin wrappers around xarray.open_dataset and xarray.open_mfdataset that select the NetCDF engine from the module-level mpas_tools.io.default_engine variable when no engine is passed explicitly. xarray.open_dataset sniffs a file for "magic bits" to auto-select a backend, and that probe can crash on NETCDF3_64BIT_DATA (CDF5) files (see https://github.com/E3SM-Project/polaris/issues/624). Specifying an engine explicitly avoids the sniffing. Since xarray has no global default-engine setting, reusing the existing default_engine variable (already consumed by write_netcdf) gives downstream tools a single, process-wide knob for both reading and writing without modifying every call site. The logger argument is included for API symmetry with write_netcdf and future diagnostics; no error recovery is performed because the CDF5 failure is a hard crash that cannot be caught. Co-Authored-By: Claude Opus 4.8 --- conda_package/mpas_tools/io.py | 90 ++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/conda_package/mpas_tools/io.py b/conda_package/mpas_tools/io.py index b6ebec839..e50323ef3 100644 --- a/conda_package/mpas_tools/io.py +++ b/conda_package/mpas_tools/io.py @@ -6,6 +6,7 @@ import netCDF4 import numpy +import xarray from mpas_tools.logging import check_call @@ -180,6 +181,95 @@ def write_netcdf( os.remove(out_filename) +def open_dataset(filename, engine=None, logger=None, **kwargs): + """ + Open an ``xarray.Dataset`` from a NetCDF file, accounting for quirks + specific to MPAS components. This is a thin wrapper around + :py:func:`xarray.open_dataset` that selects the NetCDF ``engine`` from + ``mpas_tools.io.default_engine`` when ``engine`` is not given. 
+ + Specifying an ``engine`` explicitly is important because + :py:func:`xarray.open_dataset` otherwise sniffs the file for "magic bits" + to auto-select a backend, and that probe can crash on + ``NETCDF3_64BIT_DATA`` (CDF5) files. Setting + ``mpas_tools.io.default_engine`` (e.g. to ``'netcdf4'``) provides a single, + process-wide way to avoid that crash without modifying every call site. + + Parameters + ---------- + filename : str or path-like or file-like + The path to the NetCDF file to open, passed on to + :py:func:`xarray.open_dataset` + + engine : {'netcdf4', 'scipy', 'h5netcdf'}, optional + The library to use for reading the NetCDF file. The default is + ``mpas_tools.io.default_engine``, which can be modified but which + defaults to ``None`` (xarray auto-selects the backend) + + logger : logging.Logger, optional + A logger to write messages to. Reserved for future diagnostics; no + error recovery is performed because the CDF5 backend-sniffing failure + is a hard crash that cannot be caught. + + **kwargs + Additional keyword arguments passed on to + :py:func:`xarray.open_dataset` (e.g. ``decode_times``, ``decode_cf``, + ``mask_and_scale``) + + Returns + ------- + ds : xarray.Dataset + The opened dataset + """ + if engine is None: + engine = default_engine + + return xarray.open_dataset(filename, engine=engine, **kwargs) + + +def open_mfdataset(paths, engine=None, logger=None, **kwargs): + """ + Open a multi-file ``xarray.Dataset`` from NetCDF files, accounting for + quirks specific to MPAS components. This is a thin wrapper around + :py:func:`xarray.open_mfdataset` that selects the NetCDF ``engine`` from + ``mpas_tools.io.default_engine`` when ``engine`` is not given. + + See :py:func:`mpas_tools.io.open_dataset` for why specifying an ``engine`` + explicitly (via ``mpas_tools.io.default_engine``) is useful for + ``NETCDF3_64BIT_DATA`` (CDF5) files. 
+ + Parameters + ---------- + paths : str or sequence of str or path-like + The paths to the NetCDF files to open, passed on to + :py:func:`xarray.open_mfdataset` + + engine : {'netcdf4', 'scipy', 'h5netcdf'}, optional + The library to use for reading the NetCDF files. The default is + ``mpas_tools.io.default_engine``, which can be modified but which + defaults to ``None`` (xarray auto-selects the backend) + + logger : logging.Logger, optional + A logger to write messages to. Reserved for future diagnostics; no + error recovery is performed because the CDF5 backend-sniffing failure + is a hard crash that cannot be caught. + + **kwargs + Additional keyword arguments passed on to + :py:func:`xarray.open_mfdataset` (e.g. ``combine``, ``concat_dim``, + ``decode_times``) + + Returns + ------- + ds : xarray.Dataset + The opened dataset + """ + if engine is None: + engine = default_engine + + return xarray.open_mfdataset(paths, engine=engine, **kwargs) + + def update_history(ds): """Add or append history to attributes of a data set""" From d47af244b1c84d1b4827ffb6b6207fd344ffe396 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Sat, 27 Jun 2026 12:53:16 +0200 Subject: [PATCH 2/3] Add tests for open_dataset and open_mfdataset Cover the new wrappers: a basic write/read round trip, opening a NETCDF3_64BIT_DATA (CDF5) file with an explicit engine (exercising the backend-sniffing crash the wrappers work around), resolving the engine from mpas_tools.io.default_engine when engine is None (restoring the global afterward), and a multi-file open_mfdataset smoke test. 
Co-Authored-By: Claude Opus 4.8 --- conda_package/tests/test_io.py | 61 +++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/conda_package/tests/test_io.py b/conda_package/tests/test_io.py index b55271043..0eefedd41 100644 --- a/conda_package/tests/test_io.py +++ b/conda_package/tests/test_io.py @@ -5,7 +5,8 @@ import pytest import xarray as xr -from mpas_tools.io import write_netcdf +import mpas_tools.io +from mpas_tools.io import open_dataset, open_mfdataset, write_netcdf from mpas_tools.logging import LoggingContext from .util import get_test_data_file @@ -59,6 +60,64 @@ def test_write_netcdf_cdf5_format(tmp_path): assert not os.path.exists(tmp_file) +def test_open_dataset_basic(tmp_path): + # Write a file then read it back via the wrapper + arr = np.array([1.0, 2.0, 3.0], dtype=np.float32) + ds = xr.Dataset({'foo': (('x',), arr)}) + out_file = tmp_path / 'test_open_basic.nc' + write_netcdf(ds, str(out_file)) + ds2 = open_dataset(str(out_file)) + assert set(ds.dims) == set(ds2.dims) + assert 'foo' in ds2.data_vars + np.testing.assert_array_equal(ds2['foo'].values, arr) + ds2.close() + + +def test_open_dataset_cdf5(tmp_path): + # Opening a CDF5 (NETCDF3_64BIT_DATA) file with an explicit engine should + # succeed; this exercises the bug the wrapper works around. 
+ arr = np.array([1.0, 2.0, 3.0], dtype=np.float32) + ds = xr.Dataset({'foo': (('x',), arr)}) + out_file = tmp_path / 'test_open_cdf5.nc' + write_netcdf(ds, str(out_file), format='NETCDF3_64BIT_DATA') + ds2 = open_dataset(str(out_file), engine='netcdf4') + np.testing.assert_array_equal(ds2['foo'].values, arr) + ds2.close() + + +def test_open_dataset_default_engine(tmp_path): + # When engine is None, the wrapper should use mpas_tools.io.default_engine + arr = np.array([1.0, 2.0, 3.0], dtype=np.float32) + ds = xr.Dataset({'foo': (('x',), arr)}) + out_file = tmp_path / 'test_open_default_engine.nc' + write_netcdf(ds, str(out_file), format='NETCDF3_64BIT_DATA') + saved_engine = mpas_tools.io.default_engine + try: + mpas_tools.io.default_engine = 'netcdf4' + ds2 = open_dataset(str(out_file)) + np.testing.assert_array_equal(ds2['foo'].values, arr) + ds2.close() + finally: + mpas_tools.io.default_engine = saved_engine + + +def test_open_mfdataset(tmp_path): + # Smoke test: write two files along a dimension and open them combined + out_files = [] + for index in range(2): + arr = np.array([index], dtype=np.float32) + ds = xr.Dataset({'foo': (('Time',), arr)}) + out_file = tmp_path / f'test_open_mf_{index}.nc' + write_netcdf(ds, str(out_file)) + out_files.append(str(out_file)) + ds2 = open_mfdataset( + out_files, engine='netcdf4', combine='nested', concat_dim='Time' + ) + assert ds2.sizes['Time'] == 2 + np.testing.assert_array_equal(ds2['foo'].values, [0.0, 1.0]) + ds2.close() + + def test_write_netcdf_int64_conversion_and_attr(tmp_path): # Create a dataset with int64 variable and an attribute arr = np.array([1, 2, 3], dtype=np.int64) From 07f776fc3e2db376fd9a52b433d22ec9128f68ba Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Sat, 27 Jun 2026 12:53:16 +0200 Subject: [PATCH 3/3] Document open_dataset and open_mfdataset Add the new functions to the I/O autosummary in api.rst and describe them in io.rst, including why specifying an engine via mpas_tools.io.default_engine 
avoids the CDF5 backend-sniffing crash and an example of setting the default engine. Co-Authored-By: Claude Opus 4.8 --- conda_package/docs/api.rst | 2 ++ conda_package/docs/io.rst | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/conda_package/docs/api.rst b/conda_package/docs/api.rst index 0cd784d01..e9dcac9b5 100644 --- a/conda_package/docs/api.rst +++ b/conda_package/docs/api.rst @@ -151,6 +151,8 @@ I/O :toctree: generated/ write_netcdf + open_dataset + open_mfdataset Parallelism ----------- diff --git a/conda_package/docs/io.rst b/conda_package/docs/io.rst index 12d412c35..7ecbcd92a 100644 --- a/conda_package/docs/io.rst +++ b/conda_package/docs/io.rst @@ -26,3 +26,31 @@ Example usage: # Create a simple dataset ds = xr.Dataset({'foo': (('x',), [1, 2, 3])}) write_netcdf(ds, 'output.nc') + +open_dataset and open_mfdataset +=============================== + +The :py:func:`mpas_tools.io.open_dataset()` and +:py:func:`mpas_tools.io.open_mfdataset()` functions are thin wrappers around +:py:func:`xarray.open_dataset()` and :py:func:`xarray.open_mfdataset()`. They +select the NetCDF ``engine`` from the module-level +:py:data:`mpas_tools.io.default_engine` variable when an ``engine`` is not +passed explicitly. Because ``default_engine`` defaults to ``None``, xarray still auto-selects the backend until it is set. + +This is useful because :py:func:`xarray.open_dataset()` otherwise sniffs the +file for "magic bits" to auto-select a backend, and that probe can crash on +``NETCDF3_64BIT_DATA`` (CDF5) files. xarray provides no global way to set a +default engine, so ``mpas_tools.io.default_engine`` offers a single, +process-wide knob that applies to both reading and writing. + +Example usage: + +.. code-block:: python + + import mpas_tools.io + from mpas_tools.io import open_dataset + + # use the netcdf4 engine everywhere to avoid the CDF5 sniffing crash + mpas_tools.io.default_engine = 'netcdf4' + + ds = open_dataset('mesh.nc')