
Commit 59bb3f5

Define the supported encodings, and test them all.
1 parent 4996be2 commit 59bb3f5

5 files changed

Lines changed: 180 additions & 94 deletions


lib/iris/fileformats/netcdf/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -29,6 +29,7 @@
     DECODE_TO_STRINGS_ON_READ,
     DEFAULT_READ_ENCODING,
     DEFAULT_WRITE_ENCODING,
+    SUPPORTED_ENCODINGS,
 )
 from .loader import DEBUG, NetCDFDataProxy, load_cubes
 from .saver import (
@@ -53,6 +54,7 @@
     "MESH_ELEMENTS",
     "NetCDFDataProxy",
     "SPATIO_TEMPORAL_AXES",
+    "SUPPORTED_ENCODINGS",
     "Saver",
     "UnknownCellMethodWarning",
     "load_cubes",

lib/iris/fileformats/netcdf/_bytecoding_datasets.py

Lines changed: 107 additions & 43 deletions
@@ -44,6 +44,7 @@
 import contextlib
 import dataclasses
 import threading
+from typing import Callable
 import warnings
 
 import numpy as np
@@ -117,8 +118,8 @@ class VariableEncoder:
     varname: str  # just for the error messages
     dtype: np.dtype
     is_chardata: bool  # just a shortcut for the dtype test
-    read_encoding: str  # IF 'is_chardata': a valid encoding from the codecs package
-    write_encoding: str  # IF 'is_chardata': a valid encoding from the codecs package
+    read_encoding: str  # IF 'is_chardata': one of the supported encodings
+    write_encoding: str  # IF 'is_chardata': one of the supported encodings
     n_chars_dim: int  # IF 'is_chardata': length of associated character dimension
     string_width: int  # IF 'is_chardata': width when viewed as strings (i.e. "Uxx")
 
@@ -138,59 +139,30 @@ def __init__(self, cf_var):
         self.dtype = cf_var.dtype
         self.is_chardata = np.issubdtype(self.dtype, np.bytes_)
         if self.is_chardata:
-            self.read_encoding = self._get_encoding(cf_var, writing=False)
-            self.write_encoding = self._get_encoding(cf_var, writing=True)
+            encoding_attr = getattr(cf_var, "_Encoding", None)
+            self.read_encoding = _identify_encoding(
+                encoding_attr, var_name=cf_var.name, writing=False
+            )
+            self.write_encoding = _identify_encoding(
+                encoding_attr, var_name=cf_var.name, writing=True
+            )
             n_chars_dim = 1  # default to 1 for a scalar var
             if len(cf_var.dimensions) >= 1:
                 dim_name = cf_var.dimensions[-1]
                 if dim_name in cf_var.group().dimensions:
                     n_chars_dim = cf_var.group().dimensions[dim_name].size
             self.n_chars_dim = n_chars_dim
-            self.string_width = self._get_string_width(cf_var)
+            self.string_width = self._get_string_width()
 
-    @staticmethod
-    def _get_encoding(cf_var, writing=False) -> str:
-        """Get the byte encoding defined for this variable (or None)."""
-        result = getattr(cf_var, "_Encoding", None)
-        if result is not None:
-            try:
-                # Accept + normalise naming of encodings
-                result = codecs.lookup(result).name
-                # NOTE: if encoding does not suit data, errors can occur.
-                # For example, _Encoding = "ascii", with non-ascii content.
-            except LookupError:
-                # Unrecognised encoding name : handle this as just a warning
-                msg = (
-                    f"Ignoring unknown encoding for variable {cf_var.name!r}: "
-                    f"_Encoding = {result!r}."
-                )
-                warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
-                warnings.warn(msg, category=warntype)
-                # Proceed as if there is no specified encoding
-                result = None
-
-        if result is None:
-            if writing:
-                result = DEFAULT_WRITE_ENCODING
-            else:
-                result = DEFAULT_READ_ENCODING
-        return result
-
-    def _get_string_width(self, cf_var) -> int:
+    def _get_string_width(self) -> int:
         """Return the string-length defined for this variable."""
         # Work out the actual byte width from the parent dataset dimensions.
-        strlen = self.n_chars_dim
+        n_bytes = self.n_chars_dim
         # Convert the string dimension length (i.e. bytes) to a sufficiently-long
         # string width, depending on the (read) encoding used.
         encoding = self.read_encoding
-        if "utf-16" in encoding:
-            # Each char needs at least 2 bytes -- including a terminator char
-            strlen = (strlen // 2) - 1
-        elif "utf-32" in encoding:
-            # Each char needs exactly 4 bytes -- including a terminator char
-            strlen = (strlen // 4) - 1
-        # "ELSE": assume there can be (at most) as many chars as bytes
-        return strlen
+        n_chars = _ENCODING_WIDTH_TRANSLATIONS[encoding].nbytes_2_nchars(n_bytes)
+        return n_chars
 
     def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray:
         if self.is_chardata:
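A short worked sketch of the byte-length to string-width arithmetic that _get_string_width now delegates to _ENCODING_WIDTH_TRANSLATIONS (the table is added in the next hunk); the 20-byte character dimension is an arbitrary example value:

    n_bytes = 20
    # "ascii" / "utf-8" : nbytes_2_nchars is the identity        --> dtype "U20"
    # "utf-16"          : nbytes_2_nchars(20) = 20 - 2 = 18      --> dtype "U18"
    # "utf-32"          : nbytes_2_nchars(20) = 20 // 4 - 1 = 4  --> dtype "U4"

As the EncodingWidthRelations docstring notes, these are deliberately "safe" (possibly generous) widths rather than exact ones.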
@@ -252,6 +224,98 @@ def context(self, perform_decoding: bool):
 DEFAULT_WRITE_ENCODING = "ascii"
 
 
+@dataclasses.dataclass
+class EncodingWidthRelations:
+    """Encode the default string-width <-> byte-dimension relations.
+
+    These translations are just a "best guess"...
+
+    When translating bytes (dtype S1) to strings (dtype Uxx), the chosen (default)
+    string width may be longer than is needed for the actual content.  But it is at
+    least "safe".
+
+    When translating strings to bytes, we *can* get more bytes than the default
+    byte dimension length, and the code will then truncate
+    (with a warning : see '_identify_encoding').
+    This can be avoided if necessary, in specific cases, by recasting the data to a
+    dtype with greater width (Uxx).
+    """
+
+    nchars_2_nbytes: Callable[[int], int]
+    nbytes_2_nchars: Callable[[int], int]
+
+
+_ENCODING_WIDTH_TRANSLATIONS = {
+    "ascii": EncodingWidthRelations(lambda x: x, lambda x: x),
+    "utf-8": EncodingWidthRelations(lambda x: x, lambda x: x),
+    "utf-16": EncodingWidthRelations(
+        nchars_2_nbytes=lambda x: x + 2,
+        nbytes_2_nchars=lambda x: x - 2,
+    ),
+    "utf-32": EncodingWidthRelations(
+        nchars_2_nbytes=lambda x: (x + 1) * 4,
+        nbytes_2_nchars=lambda x: x // 4 - 1,
+    ),
+}
+SUPPORTED_ENCODINGS = list(_ENCODING_WIDTH_TRANSLATIONS.keys())
+
+
+def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str:
+    """Normalise an encoding name + check it is supported.
+
+    Parameters
+    ----------
+    encoding : Any
+        Select an encoding : None, or a string, or anything printable (via str()).
+    var_name : str
+        Name of the relevant dataset variable (i.e. 'var_name') :
+        used only to produce warning messages.
+    writing : bool
+        Specify whether reading or writing, which affects any *default* return value,
+        i.e. select between DEFAULT_READ_ENCODING / DEFAULT_WRITE_ENCODING.
+
+    If given, and supported, return a normalised encoding name
+    -- i.e. always one of SUPPORTED_ENCODINGS.
+    If not given, or not supported, return the default encoding name.
+
+    If given **but not recognised/supported**, also emit a warning (and return default).
+    """
+    if encoding is not None:
+        encoding = str(encoding)
+
+    result: str | None = None  # not yet 'found' : we will never *return* this
+
+    if encoding is not None:
+        # Normalise the name : NB it must be recognised by Python "codecs".
+        try:
+            result = codecs.lookup(encoding).name
+        except LookupError:
+            pass
+
+    if result is not None:
+        if result not in SUPPORTED_ENCODINGS:
+            # Python "codecs" recognised it, but we don't support it.
+            result = None
+
+    if encoding is not None and result is None:
+        # Unrecognised encoding name : handle this as just a warning
+        msg = (
+            f"Ignoring unsupported encoding for netCDF variable {var_name!r}: "
+            f"_Encoding = {encoding!r} is not recognised as one of the supported "
+            f"encodings, {SUPPORTED_ENCODINGS}."
+        )
+        warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
+        warnings.warn(msg, category=warntype)
+
+    if result is None:
+        if writing:
+            result = DEFAULT_WRITE_ENCODING
+        else:
+            result = DEFAULT_READ_ENCODING
+
+    return result
+
+
 class EncodedVariable(VariableWrapper):
     """A variable wrapper that translates variable data according to byte encodings."""
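For illustration only, a sketch of how _identify_encoding is expected to behave (the None case simply returns whichever default applies; DEFAULT_READ_ENCODING is defined elsewhere in this module):

    # A codecs-recognised spelling is normalised ...
    _identify_encoding("UTF8", var_name="x")    # --> "utf-8"
    # ... a supported name passes straight through ...
    _identify_encoding("ascii", var_name="x")   # --> "ascii"
    # ... and None falls back silently to the read/write default.
    _identify_encoding(None, var_name="x")      # --> DEFAULT_READ_ENCODING
    # A name codecs knows but we do not support warns and returns the default :
    _identify_encoding("latin-1", var_name="x", writing=True)
    # --> "ascii" (DEFAULT_WRITE_ENCODING), plus an IrisCfSaveWarning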

lib/iris/fileformats/netcdf/saver.py

Lines changed: 38 additions & 29 deletions
@@ -1833,17 +1833,46 @@ def _create_generic_cf_array_var(
         if not is_dataless and np.issubdtype(data.dtype, np.str_):
             # Deal with string-type variables.
             # Typically CF label variables, but also possibly ancil-vars ?
+
+            # NOTE: all we are doing here is to calculate the byte dimension length,
+            # based on the dtype and any encoding attribute.
+            # The actual char --> byte data *translation* is done by the variable,
+            # being a _bytecoding_datasets.EncodedVariable.
             string_dimension_depth = data.dtype.itemsize
+
             if data.dtype.kind == "U":
-                encoding = element.attributes.get("_Encoding", "ascii")
-                # TODO: this can fail -- use a sensible warning + default?
-                encoding = codecs.lookup(encoding).name
-                if encoding == "utf-32":
-                    # UTF-32 is a special case -- always 4 exactly bytes per char, plus 4
-                    string_dimension_depth += 4
-                else:
-                    # generally, 4 bytes per char in numpy --> make bytewidth = string-width
-                    string_dimension_depth //= 4
+                # String content (U) instead of bytes (S).
+                # For numpy strings, itemsize is **always** a multiple of 4.
+                if string_dimension_depth % 4 != 0:
+                    msg = (
+                        "Unexpected numpy string 'itemsize' for element "
+                        f"{cube_or_mesh.name()}: "
+                        f"dtype.itemsize = {string_dimension_depth}, expected "
+                        "a multiple of four (always)."
+                    )
+                    raise ValueError(msg)
+                nchars = string_dimension_depth // 4
+
+                encoding_attr = element.attributes.get("_Encoding", "ascii")
+                # Look this up + return a supported encoding name.
+                # NB implements defaults and raises a warning if the given name is not recognised.
+                encoding = bytecoding_datasets._identify_encoding(
+                    encoding=encoding_attr, var_name=cf_name, writing=True
+                )
+                width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding]
+                string_dimension_depth = width_fns.nchars_2_nbytes(nchars)
+            else:
+                if data.dtype.kind != "S" or data.dtype.itemsize != 1:
+                    # Some type of data we don't "understand".
+                    # NB this includes "Sxx" types other than "S1" : it seems that
+                    # netCDF4 can treat Sxx as if it were Uxx, at least if there is an
+                    # _Encoding attribute.  But we don't support that type in Iris.
+                    msg = (
+                        f"Variable {cf_name!r} has unexpected string/character dtype, "
+                        f"{data.dtype} -- should be either 'S' or 'U' type."
+                    )
+                    raise ValueError(msg)
+
             string_dimension_name = "string%d" % string_dimension_depth
 
             # Determine whether to create the string length dimension.
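As a worked example of the new calculation (illustrative values only): a label variable of dtype "<U10" has itemsize 40, so nchars = 40 // 4 = 10, and the byte dimension then follows the translation table:

    # _Encoding = "ascii" / "utf-8" : nchars_2_nbytes(10) = 10             --> "string10"
    # _Encoding = "utf-16"          : nchars_2_nbytes(10) = 10 + 2 = 12    --> "string12"
    # _Encoding = "utf-32"          : nchars_2_nbytes(10) = (10+1)*4 = 44  --> "string44"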
@@ -1861,26 +1890,6 @@ def _create_generic_cf_array_var(
 
             # Create the label coordinate variable.
             cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)
-
-            # # Convert data from an array of strings into a character array
-            # # with an extra string-length dimension.
-            # if len(element_dims) == 1:
-            #     # Scalar variable (only has string dimension).
-            #     data_first = data[0]
-            #     if is_lazy_data(data_first):
-            #         data_first = dask.compute(data_first)
-            #     data = list("%- *s" % (string_dimension_depth, data_first))
-            # else:
-            #     # NOTE: at present, can't do this lazily??
-            #     orig_shape = data.shape
-            #     new_shape = orig_shape + (string_dimension_depth,)
-            #     new_data = np.zeros(new_shape, cf_var.dtype)
-            #     for index in np.ndindex(orig_shape):
-            #         index_slice = tuple(list(index) + [slice(None, None)])
-            #         new_data[index_slice] = list(
-            #             "%- *s" % (string_dimension_depth, data[index])
-            #         )
-            #     data = new_data
         else:
             # A normal (numeric) variable.
             # ensure a valid datatype for the file format.

lib/iris/tests/integration/netcdf/test_stringdata.py

Lines changed: 9 additions & 12 deletions
@@ -19,7 +19,7 @@
 import iris
 from iris.coords import AuxCoord, DimCoord
 from iris.cube import Cube
-from iris.fileformats.netcdf import _thread_safe_nc
+from iris.fileformats.netcdf import SUPPORTED_ENCODINGS, _thread_safe_nc
 
 
 @pytest.fixture(scope="module")
@@ -38,14 +38,7 @@ def all_lazy_auxcoords():
 PERSIST_TESTFILES: str | None = None
 
 NO_ENCODING_STR = "<noencoding>"
-TEST_ENCODINGS = [
-    NO_ENCODING_STR,
-    "ascii",
-    "utf-8",
-    # "iso8859-1",  # a common one-byte-per-char "codepage" type
-    # "utf-16",
-    "utf-32",
-]
+TEST_ENCODINGS = [NO_ENCODING_STR] + SUPPORTED_ENCODINGS
 
 
 #
@@ -255,10 +248,12 @@ def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails):
         assert load_problems_list() == []
         assert cube.shape == (N_XDIM,)
 
-        if encoding != "utf-32":
-            expected_string_width = N_CHARS_DIM
-        else:
+        if encoding == "utf-32":
             expected_string_width = (N_CHARS_DIM // 4) - 1
+        elif encoding == "utf-16":
+            expected_string_width = N_CHARS_DIM - 2
+        else:
+            expected_string_width = N_CHARS_DIM
         assert cube.dtype == f"<U{expected_string_width}"
         cube_data = cube.data
         assert np.all(cube_data == datavar_strings)
@@ -303,6 +298,8 @@ def make_testcube(
     charlen = N_CHARS_DIM
     if encoding_str == "utf-32":
         charlen = charlen // 4 - 1
+    elif encoding_str == "utf-16":
+        charlen = charlen - 2
     strings_dtype = np.dtype(f"U{charlen}")
     coordvar_array = np.array(coordvar_strings, dtype=strings_dtype)
     datavar_array = np.array(datavar_strings, dtype=strings_dtype)
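The expected widths in these tests simply mirror the translation table: for a file whose character dimension holds N_CHARS_DIM bytes, a round-trip load is expected to produce (sketch only; N_CHARS_DIM is defined elsewhere in the test module):

    # n = N_CHARS_DIM (byte length of the character dimension)
    #   no encoding / "ascii" / "utf-8" --> cube dtype f"<U{n}"
    #   "utf-16"                        --> cube dtype f"<U{n - 2}"
    #   "utf-32"                        --> cube dtype f"<U{n // 4 - 1}"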
