Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions conda_package/docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ I/O
:toctree: generated/

write_netcdf
open_dataset
open_mfdataset

Parallelism
-----------
Expand Down
28 changes: 28 additions & 0 deletions conda_package/docs/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,31 @@ Example usage:
# Create a simple dataset
ds = xr.Dataset({'foo': (('x',), [1, 2, 3])})
write_netcdf(ds, 'output.nc')

open_dataset and open_mfdataset
===============================

The :py:func:`mpas_tools.io.open_dataset()` and
:py:func:`mpas_tools.io.open_mfdataset()` functions are thin wrappers around
:py:func:`xarray.open_dataset()` and :py:func:`xarray.open_mfdataset()`. They
select the NetCDF ``engine`` from the module-level
:py:data:`mpas_tools.io.default_engine` variable when an ``engine`` is not
passed explicitly.

This is useful because :py:func:`xarray.open_dataset()` otherwise inspects the
file's leading bytes (its "magic number") to auto-select a backend, and that
probe can crash on ``NETCDF3_64BIT_DATA`` (CDF5) files. xarray provides no
global way to set a default engine, so ``mpas_tools.io.default_engine`` offers
a single, process-wide setting that applies to both reading and writing.

Example usage:

.. code-block:: python

import mpas_tools.io
from mpas_tools.io import open_dataset

# use the netcdf4 engine everywhere to avoid the CDF5 sniffing crash
mpas_tools.io.default_engine = 'netcdf4'

ds = open_dataset('mesh.nc')
90 changes: 90 additions & 0 deletions conda_package/mpas_tools/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import netCDF4
import numpy
import xarray

from mpas_tools.logging import check_call

Expand Down Expand Up @@ -180,6 +181,95 @@ def write_netcdf(
os.remove(out_filename)


def open_dataset(filename, engine=None, logger=None, **kwargs):
    """
    Read a NetCDF file into an ``xarray.Dataset`` via a thin wrapper around
    :py:func:`xarray.open_dataset`.

    The only thing this wrapper adds is the choice of ``engine``: when the
    caller does not supply one, the module-level
    ``mpas_tools.io.default_engine`` is used instead.

    Passing an ``engine`` explicitly matters because, without one,
    :py:func:`xarray.open_dataset` sniffs the file for "magic bits" to
    auto-select a backend, and that probe can crash on
    ``NETCDF3_64BIT_DATA`` (CDF5) files. Setting
    ``mpas_tools.io.default_engine`` (e.g. to ``'netcdf4'``) avoids that crash
    for the whole process without touching every call site.

    Parameters
    ----------
    filename : str or path-like or file-like
        The NetCDF file to open; forwarded unchanged to
        :py:func:`xarray.open_dataset`

    engine : {'netcdf4', 'scipy', 'h5netcdf'}, optional
        The library used to read the file. If ``None``, the current value of
        ``mpas_tools.io.default_engine`` is used; that variable can be
        modified and defaults to ``None`` (xarray auto-selects the backend)

    logger : logging.Logger, optional
        A logger to write messages to. Currently unused by this function and
        reserved for future diagnostics; no error recovery is performed
        because the CDF5 backend-sniffing failure is a hard crash that cannot
        be caught.

    **kwargs
        Any further keyword arguments are forwarded to
        :py:func:`xarray.open_dataset` (e.g. ``decode_times``, ``decode_cf``,
        ``mask_and_scale``)

    Returns
    -------
    ds : xarray.Dataset
        The opened dataset
    """
    # An explicit ``engine`` always wins; otherwise fall back to the
    # module-level default, which is read at call time so that later changes
    # to ``mpas_tools.io.default_engine`` take effect.
    selected_engine = default_engine if engine is None else engine
    return xarray.open_dataset(filename, engine=selected_engine, **kwargs)


def open_mfdataset(paths, engine=None, logger=None, **kwargs):
    """
    Read several NetCDF files into one ``xarray.Dataset`` via a thin wrapper
    around :py:func:`xarray.open_mfdataset`.

    The only thing this wrapper adds is the choice of ``engine``: when the
    caller does not supply one, the module-level
    ``mpas_tools.io.default_engine`` is used instead.

    :py:func:`mpas_tools.io.open_dataset` explains why choosing an ``engine``
    explicitly (via ``mpas_tools.io.default_engine``) is useful for
    ``NETCDF3_64BIT_DATA`` (CDF5) files.

    Parameters
    ----------
    paths : str or sequence of str or path-like
        The NetCDF files to open; forwarded unchanged to
        :py:func:`xarray.open_mfdataset`

    engine : {'netcdf4', 'scipy', 'h5netcdf'}, optional
        The library used to read the files. If ``None``, the current value of
        ``mpas_tools.io.default_engine`` is used; that variable can be
        modified and defaults to ``None`` (xarray auto-selects the backend)

    logger : logging.Logger, optional
        A logger to write messages to. Currently unused by this function and
        reserved for future diagnostics; no error recovery is performed
        because the CDF5 backend-sniffing failure is a hard crash that cannot
        be caught.

    **kwargs
        Any further keyword arguments are forwarded to
        :py:func:`xarray.open_mfdataset` (e.g. ``combine``, ``concat_dim``,
        ``decode_times``)

    Returns
    -------
    ds : xarray.Dataset
        The opened dataset
    """
    # An explicit ``engine`` always wins; otherwise fall back to the
    # module-level default, which is read at call time so that later changes
    # to ``mpas_tools.io.default_engine`` take effect.
    selected_engine = default_engine if engine is None else engine
    return xarray.open_mfdataset(paths, engine=selected_engine, **kwargs)


def update_history(ds):
"""Add or append history to attributes of a data set"""

Expand Down
61 changes: 60 additions & 1 deletion conda_package/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import pytest
import xarray as xr

from mpas_tools.io import write_netcdf
import mpas_tools.io
from mpas_tools.io import open_dataset, open_mfdataset, write_netcdf
from mpas_tools.logging import LoggingContext

from .util import get_test_data_file
Expand Down Expand Up @@ -59,6 +60,64 @@ def test_write_netcdf_cdf5_format(tmp_path):
assert not os.path.exists(tmp_file)


def test_open_dataset_basic(tmp_path):
    # Write a file then read it back via the wrapper
    arr = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    ds = xr.Dataset({'foo': (('x',), arr)})
    out_file = tmp_path / 'test_open_basic.nc'
    write_netcdf(ds, str(out_file))
    # Use the dataset as a context manager so the file is closed even when
    # one of the assertions below fails
    with open_dataset(str(out_file)) as ds2:
        assert set(ds.dims) == set(ds2.dims)
        assert 'foo' in ds2.data_vars
        np.testing.assert_array_equal(ds2['foo'].values, arr)


def test_open_dataset_cdf5(tmp_path):
    # Opening a CDF5 (NETCDF3_64BIT_DATA) file with an explicit engine should
    # succeed; this exercises the bug the wrapper works around.
    arr = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    ds = xr.Dataset({'foo': (('x',), arr)})
    out_file = tmp_path / 'test_open_cdf5.nc'
    write_netcdf(ds, str(out_file), format='NETCDF3_64BIT_DATA')
    # Use the dataset as a context manager so the file is closed even when
    # the comparison below fails
    with open_dataset(str(out_file), engine='netcdf4') as ds2:
        np.testing.assert_array_equal(ds2['foo'].values, arr)


def test_open_dataset_default_engine(tmp_path):
    # When engine is None, the wrapper should use mpas_tools.io.default_engine
    arr = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    ds = xr.Dataset({'foo': (('x',), arr)})
    out_file = tmp_path / 'test_open_default_engine.nc'
    write_netcdf(ds, str(out_file), format='NETCDF3_64BIT_DATA')
    saved_engine = mpas_tools.io.default_engine
    try:
        mpas_tools.io.default_engine = 'netcdf4'
        # Use the dataset as a context manager so the file is closed even
        # when the comparison below fails
        with open_dataset(str(out_file)) as ds2:
            np.testing.assert_array_equal(ds2['foo'].values, arr)
    finally:
        # Always restore the module-level default so other tests are not
        # affected by this one
        mpas_tools.io.default_engine = saved_engine


def test_open_mfdataset(tmp_path):
    # Smoke test: write two files along a dimension and open them combined
    out_files = []
    for index in range(2):
        arr = np.array([index], dtype=np.float32)
        ds = xr.Dataset({'foo': (('Time',), arr)})
        out_file = tmp_path / f'test_open_mf_{index}.nc'
        write_netcdf(ds, str(out_file))
        out_files.append(str(out_file))
    # Use the dataset as a context manager so the underlying files are closed
    # even when one of the assertions below fails
    with open_mfdataset(
        out_files, engine='netcdf4', combine='nested', concat_dim='Time'
    ) as ds2:
        assert ds2.sizes['Time'] == 2
        np.testing.assert_array_equal(ds2['foo'].values, [0.0, 1.0])


def test_write_netcdf_int64_conversion_and_attr(tmp_path):
# Create a dataset with int64 variable and an attribute
arr = np.array([1, 2, 3], dtype=np.int64)
Expand Down
Loading