
Commit 59bb3f5

Define the supported encodings, and test them all.
1 parent 4996be2 commit 59bb3f5

5 files changed

Lines changed: 180 additions & 94 deletions


lib/iris/fileformats/netcdf/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -29,6 +29,7 @@
     DECODE_TO_STRINGS_ON_READ,
     DEFAULT_READ_ENCODING,
     DEFAULT_WRITE_ENCODING,
+    SUPPORTED_ENCODINGS,
 )
 from .loader import DEBUG, NetCDFDataProxy, load_cubes
 from .saver import (
@@ -53,6 +54,7 @@
     "MESH_ELEMENTS",
     "NetCDFDataProxy",
     "SPATIO_TEMPORAL_AXES",
+    "SUPPORTED_ENCODINGS",
     "Saver",
     "UnknownCellMethodWarning",
     "load_cubes",

lib/iris/fileformats/netcdf/_bytecoding_datasets.py

Lines changed: 107 additions & 43 deletions
@@ -44,6 +44,7 @@
 import contextlib
 import dataclasses
 import threading
+from typing import Callable
 import warnings
 
 import numpy as np
@@ -117,8 +118,8 @@ class VariableEncoder:
     varname: str  # just for the error messages
     dtype: np.dtype
     is_chardata: bool  # just a shortcut for the dtype test
-    read_encoding: str  # IF 'is_chardata': a valid encoding from the codecs package
-    write_encoding: str  # IF 'is_chardata': a valid encoding from the codecs package
+    read_encoding: str  # IF 'is_chardata': one of the supported encodings
+    write_encoding: str  # IF 'is_chardata': one of the supported encodings
     n_chars_dim: int  # IF 'is_chardata': length of associated character dimension
     string_width: int  # IF 'is_chardata': width when viewed as strings (i.e. "Uxx")
 
@@ -138,59 +139,30 @@ def __init__(self, cf_var):
         self.dtype = cf_var.dtype
         self.is_chardata = np.issubdtype(self.dtype, np.bytes_)
         if self.is_chardata:
-            self.read_encoding = self._get_encoding(cf_var, writing=False)
-            self.write_encoding = self._get_encoding(cf_var, writing=True)
+            encoding_attr = getattr(cf_var, "_Encoding", None)
+            self.read_encoding = _identify_encoding(
+                encoding_attr, var_name=cf_var.name, writing=False
+            )
+            self.write_encoding = _identify_encoding(
+                encoding_attr, var_name=cf_var.name, writing=True
+            )
             n_chars_dim = 1  # default to 1 for a scalar var
             if len(cf_var.dimensions) >= 1:
                 dim_name = cf_var.dimensions[-1]
                 if dim_name in cf_var.group().dimensions:
                     n_chars_dim = cf_var.group().dimensions[dim_name].size
             self.n_chars_dim = n_chars_dim
-            self.string_width = self._get_string_width(cf_var)
+            self.string_width = self._get_string_width()
 
-    @staticmethod
-    def _get_encoding(cf_var, writing=False) -> str:
-        """Get the byte encoding defined for this variable (or None)."""
-        result = getattr(cf_var, "_Encoding", None)
-        if result is not None:
-            try:
-                # Accept + normalise naming of encodings
-                result = codecs.lookup(result).name
-                # NOTE: if encoding does not suit data, errors can occur.
-                # For example, _Encoding = "ascii", with non-ascii content.
-            except LookupError:
-                # Unrecognised encoding name : handle this as just a warning
-                msg = (
-                    f"Ignoring unknown encoding for variable {cf_var.name!r}: "
-                    f"_Encoding = {result!r}."
-                )
-                warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
-                warnings.warn(msg, category=warntype)
-                # Proceed as if there is no specified encoding
-                result = None
-
-        if result is None:
-            if writing:
-                result = DEFAULT_WRITE_ENCODING
-            else:
-                result = DEFAULT_READ_ENCODING
-        return result
-
-    def _get_string_width(self, cf_var) -> int:
+    def _get_string_width(self) -> int:
         """Return the string-length defined for this variable."""
         # Work out the actual byte width from the parent dataset dimensions.
-        strlen = self.n_chars_dim
+        n_bytes = self.n_chars_dim
         # Convert the string dimension length (i.e. bytes) to a sufficiently-long
         # string width, depending on the (read) encoding used.
         encoding = self.read_encoding
-        if "utf-16" in encoding:
-            # Each char needs at least 2 bytes -- including a terminator char
-            strlen = (strlen // 2) - 1
-        elif "utf-32" in encoding:
-            # Each char needs exactly 4 bytes -- including a terminator char
-            strlen = (strlen // 4) - 1
-        # "ELSE": assume there can be (at most) as many chars as bytes
-        return strlen
+        n_chars = _ENCODING_WIDTH_TRANSLATIONS[encoding].nbytes_2_nchars(n_bytes)
+        return n_chars
 
     def decode_bytes_to_stringarray(self, data: np.ndarray) -> np.ndarray:
         if self.is_chardata:
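A short worked sketch of the byte-length to string-width arithmetic that _get_string_width now delegates to _ENCODING_WIDTH_TRANSLATIONS (the table is added in the next hunk); the 20-byte character dimension is an arbitrary example value:

    n_bytes = 20
    # "ascii" / "utf-8" : nbytes_2_nchars is the identity        --> dtype "U20"
    # "utf-16"          : nbytes_2_nchars(20) = 20 - 2 = 18      --> dtype "U18"
    # "utf-32"          : nbytes_2_nchars(20) = 20 // 4 - 1 = 4  --> dtype "U4"

As the EncodingWidthRelations docstring notes, these are deliberately "safe" (possibly generous) widths rather than exact ones.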
@@ -252,6 +224,98 @@ def context(self, perform_decoding: bool):
 DEFAULT_WRITE_ENCODING = "ascii"
 
 
+@dataclasses.dataclass
+class EncodingWidthRelations:
+    """Encode the default string-width <-> byte-dimension relations.
+
+    These translations are just a "best guess"...
+
+    When translating bytes (dtype S1) to strings (dtype Uxx), the chosen (default)
+    string width may be longer than is needed for the actual content.  But it is at
+    least "safe".
+
+    When translating strings to bytes, we *can* get more bytes than the default
+    byte dimension length, and the code will then truncate
+    (with a warning : see '_identify_encoding').
+    This can be avoided if necessary, in specific cases, by recasting the data to a
+    dtype with greater width (Uxx).
+    """
+
+    nchars_2_nbytes: Callable[[int], int]
+    nbytes_2_nchars: Callable[[int], int]
+
+
+_ENCODING_WIDTH_TRANSLATIONS = {
+    "ascii": EncodingWidthRelations(lambda x: x, lambda x: x),
+    "utf-8": EncodingWidthRelations(lambda x: x, lambda x: x),
+    "utf-16": EncodingWidthRelations(
+        nchars_2_nbytes=lambda x: x + 2,
+        nbytes_2_nchars=lambda x: x - 2,
+    ),
+    "utf-32": EncodingWidthRelations(
+        nchars_2_nbytes=lambda x: (x + 1) * 4,
+        nbytes_2_nchars=lambda x: x // 4 - 1,
+    ),
+}
+SUPPORTED_ENCODINGS = list(_ENCODING_WIDTH_TRANSLATIONS.keys())
+
+
+def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str:
+    """Normalise an encoding name + check it is supported.
+
+    Parameters
+    ----------
+    encoding : Any
+        Select an encoding : None, or a string, or anything printable (via str()).
+    var_name : str
+        Name of the relevant dataset variable (i.e. 'var_name') :
+        used only to produce warning messages.
+    writing : bool
+        Specify whether reading or writing, which affects any *default* return value,
+        i.e. select between DEFAULT_READ_ENCODING / DEFAULT_WRITE_ENCODING.
+
+    If given, and supported, return a normalised encoding name
+    -- i.e. always one of SUPPORTED_ENCODINGS.
+    If not given, or not supported, return the default encoding name.
+
+    If given **but not recognised/supported**, also emit a warning (and return default).
+    """
+    if encoding is not None:
+        encoding = str(encoding)
+
+    result: str | None = None  # not yet 'found' : we will never *return* this
+
+    if encoding is not None:
+        # Normalise the name : NB it must be recognised by Python "codecs".
+        try:
+            result = codecs.lookup(encoding).name
+        except LookupError:
+            pass
+
+    if result is not None:
+        if result not in SUPPORTED_ENCODINGS:
+            # Python "codecs" recognised it, but we don't support it.
+            result = None
+
+    if encoding is not None and result is None:
+        # Unrecognised encoding name : handle this as just a warning
+        msg = (
+            f"Ignoring unsupported encoding for netCDF variable {var_name!r}: "
+            f"_Encoding = {encoding!r} is not recognised as one of the supported "
+            f"encodings, {SUPPORTED_ENCODINGS}."
+        )
+        warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
+        warnings.warn(msg, category=warntype)
+
+    if result is None:
+        if writing:
+            result = DEFAULT_WRITE_ENCODING
+        else:
+            result = DEFAULT_READ_ENCODING
+
+    return result
+
+
 class EncodedVariable(VariableWrapper):
     """A variable wrapper that translates variable data according to byte encodings."""
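For illustration only, a sketch of how _identify_encoding is expected to behave (the None case simply returns whichever default applies; DEFAULT_READ_ENCODING is defined elsewhere in this module):

    # A codecs-recognised spelling is normalised ...
    _identify_encoding("UTF8", var_name="x")    # --> "utf-8"
    # ... a supported name passes straight through ...
    _identify_encoding("ascii", var_name="x")   # --> "ascii"
    # ... and None falls back silently to the read/write default.
    _identify_encoding(None, var_name="x")      # --> DEFAULT_READ_ENCODING
    # A name codecs knows but we do not support warns and returns the default :
    _identify_encoding("latin-1", var_name="x", writing=True)
    # --> "ascii" (DEFAULT_WRITE_ENCODING), plus an IrisCfSaveWarning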

lib/iris/fileformats/netcdf/saver.py

Lines changed: 38 additions & 29 deletions
@@ -1833,17 +1833,46 @@ def _create_generic_cf_array_var(
         if not is_dataless and np.issubdtype(data.dtype, np.str_):
             # Deal with string-type variables.
             # Typically CF label variables, but also possibly ancil-vars ?
+
+            # NOTE: all we are doing here is to calculate the byte dimension length,
+            # based on the dtype and any encoding attribute.
+            # The actual char --> byte data *translation* is done by the variable,
+            # being a _bytecoding_datasets.EncodedVariable.
             string_dimension_depth = data.dtype.itemsize
+
             if data.dtype.kind == "U":
-                encoding = element.attributes.get("_Encoding", "ascii")
-                # TODO: this can fail -- use a sensible warning + default?
-                encoding = codecs.lookup(encoding).name
-                if encoding == "utf-32":
-                    # UTF-32 is a special case -- always 4 exactly bytes per char, plus 4
-                    string_dimension_depth += 4
-                else:
-                    # generally, 4 bytes per char in numpy --> make bytewidth = string-width
-                    string_dimension_depth //= 4
+                # String content (U) instead of bytes (S).
+                # For numpy strings, itemsize is **always** a multiple of 4.
+                if string_dimension_depth % 4 != 0:
+                    msg = (
+                        "Unexpected numpy string 'itemsize' for element "
+                        f"{cube_or_mesh.name()}: "
+                        f"dtype.itemsize = {string_dimension_depth}, expected "
+                        "a multiple of four (always)."
+                    )
+                    raise ValueError(msg)
+                nchars = string_dimension_depth // 4
+
+                encoding_attr = element.attributes.get("_Encoding", "ascii")
+                # Look this up + return a supported encoding name.
+                # NB implements defaults and raises a warning if the given name is not recognised.
+                encoding = bytecoding_datasets._identify_encoding(
+                    encoding=encoding_attr, var_name=cf_name, writing=True
+                )
+                width_fns = bytecoding_datasets._ENCODING_WIDTH_TRANSLATIONS[encoding]
+                string_dimension_depth = width_fns.nchars_2_nbytes(nchars)
+            else:
+                if data.dtype.kind != "S" or data.dtype.itemsize != 1:
+                    # Some type of data we don't "understand".
+                    # NB this includes "Sxx" types other than "S1" : it seems that
+                    # netCDF4 can treat Sxx as if it were Uxx, at least if there is an
+                    # _Encoding attribute.  But we don't support that type in Iris.
+                    msg = (
+                        f"Variable {cf_name!r} has unexpected string/character dtype, "
+                        f"{data.dtype} -- should be either 'S' or 'U' type."
+                    )
+                    raise ValueError(msg)
+
             string_dimension_name = "string%d" % string_dimension_depth
 
             # Determine whether to create the string length dimension.
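As a worked example of the new calculation (illustrative values only): a label variable of dtype "<U10" has itemsize 40, so nchars = 40 // 4 = 10, and the byte dimension then follows the translation table:

    # _Encoding = "ascii" / "utf-8" : nchars_2_nbytes(10) = 10             --> "string10"
    # _Encoding = "utf-16"          : nchars_2_nbytes(10) = 10 + 2 = 12    --> "string12"
    # _Encoding = "utf-32"          : nchars_2_nbytes(10) = (10+1)*4 = 44  --> "string44"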
@@ -1861,26 +1890,6 @@ def _create_generic_cf_array_var(
 
             # Create the label coordinate variable.
             cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)
-
-            # # Convert data from an array of strings into a character array
-            # # with an extra string-length dimension.
-            # if len(element_dims) == 1:
-            #     # Scalar variable (only has string dimension).
-            #     data_first = data[0]
-            #     if is_lazy_data(data_first):
-            #         data_first = dask.compute(data_first)
-            #     data = list("%- *s" % (string_dimension_depth, data_first))
-            # else:
-            #     # NOTE: at present, can't do this lazily??
-            #     orig_shape = data.shape
-            #     new_shape = orig_shape + (string_dimension_depth,)
-            #     new_data = np.zeros(new_shape, cf_var.dtype)
-            #     for index in np.ndindex(orig_shape):
-            #         index_slice = tuple(list(index) + [slice(None, None)])
-            #         new_data[index_slice] = list(
-            #             "%- *s" % (string_dimension_depth, data[index])
-            #         )
-            #     data = new_data
         else:
             # A normal (numeric) variable.
             # ensure a valid datatype for the file format.

lib/iris/tests/integration/netcdf/test_stringdata.py

Lines changed: 9 additions & 12 deletions
@@ -19,7 +19,7 @@
 import iris
 from iris.coords import AuxCoord, DimCoord
 from iris.cube import Cube
-from iris.fileformats.netcdf import _thread_safe_nc
+from iris.fileformats.netcdf import SUPPORTED_ENCODINGS, _thread_safe_nc
 
 
 @pytest.fixture(scope="module")
@@ -38,14 +38,7 @@ def all_lazy_auxcoords():
 PERSIST_TESTFILES: str | None = None
 
 NO_ENCODING_STR = "<noencoding>"
-TEST_ENCODINGS = [
-    NO_ENCODING_STR,
-    "ascii",
-    "utf-8",
-    # "iso8859-1",  # a common one-byte-per-char "codepage" type
-    # "utf-16",
-    "utf-32",
-]
+TEST_ENCODINGS = [NO_ENCODING_STR] + SUPPORTED_ENCODINGS
 
 
 #
@@ -255,10 +248,12 @@ def test_valid_encodings(self, encoding, readtest_data: SamplefileDetails):
         assert load_problems_list() == []
         assert cube.shape == (N_XDIM,)
 
-        if encoding != "utf-32":
-            expected_string_width = N_CHARS_DIM
-        else:
+        if encoding == "utf-32":
             expected_string_width = (N_CHARS_DIM // 4) - 1
+        elif encoding == "utf-16":
+            expected_string_width = N_CHARS_DIM - 2
+        else:
+            expected_string_width = N_CHARS_DIM
         assert cube.dtype == f"<U{expected_string_width}"
         cube_data = cube.data
         assert np.all(cube_data == datavar_strings)
@@ -303,6 +298,8 @@ def make_testcube(
     charlen = N_CHARS_DIM
     if encoding_str == "utf-32":
         charlen = charlen // 4 - 1
+    elif encoding_str == "utf-16":
+        charlen = charlen - 2
     strings_dtype = np.dtype(f"U{charlen}")
     coordvar_array = np.array(coordvar_strings, dtype=strings_dtype)
     datavar_array = np.array(datavar_strings, dtype=strings_dtype)
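The expected widths in these tests simply mirror the translation table: for a file whose character dimension holds N_CHARS_DIM bytes, a round-trip load is expected to produce (sketch only; N_CHARS_DIM is defined elsewhere in the test module):

    # n = N_CHARS_DIM (byte length of the character dimension)
    #   no encoding / "ascii" / "utf-8" --> cube dtype f"<U{n}"
    #   "utf-16"                        --> cube dtype f"<U{n - 2}"
    #   "utf-32"                        --> cube dtype f"<U{n // 4 - 1}"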
