4444import contextlib
4545import dataclasses
4646import threading
47+ from typing import Callable
4748import warnings
4849
4950import numpy as np
@@ -117,8 +118,8 @@ class VariableEncoder:
117118 varname : str # just for the error messages
118119 dtype : np .dtype
119120 is_chardata : bool # just a shortcut for the dtype test
120- read_encoding : str # IF 'is_chardata': a valid encoding from the codecs package
121- write_encoding : str # IF 'is_chardata': a valid encoding from the codecs package
121+ read_encoding : str # IF 'is_chardata': one of the supported encodings
122+ write_encoding : str # IF 'is_chardata': one of the supported encodings
122123 n_chars_dim : int # IF 'is_chardata': length of associated character dimension
123124 string_width : int # IF 'is_chardata': width when viewed as strings (i.e. "Uxx")
124125
@@ -138,59 +139,30 @@ def __init__(self, cf_var):
138139 self .dtype = cf_var .dtype
139140 self .is_chardata = np .issubdtype (self .dtype , np .bytes_ )
140141 if self .is_chardata :
141- self .read_encoding = self ._get_encoding (cf_var , writing = False )
142- self .write_encoding = self ._get_encoding (cf_var , writing = True )
142+ encoding_attr = getattr (cf_var , "_Encoding" , None )
143+ self .read_encoding = _identify_encoding (
144+ encoding_attr , var_name = cf_var .name , writing = False
145+ )
146+ self .write_encoding = _identify_encoding (
147+ encoding_attr , var_name = cf_var .name , writing = True
148+ )
143149 n_chars_dim = 1 # default to 1 for a scalar var
144150 if len (cf_var .dimensions ) >= 1 :
145151 dim_name = cf_var .dimensions [- 1 ]
146152 if dim_name in cf_var .group ().dimensions :
147153 n_chars_dim = cf_var .group ().dimensions [dim_name ].size
148154 self .n_chars_dim = n_chars_dim
149- self .string_width = self ._get_string_width (cf_var )
155+ self .string_width = self ._get_string_width ()
150156
151- @staticmethod
152- def _get_encoding (cf_var , writing = False ) -> str :
153- """Get the byte encoding defined for this variable (or None)."""
154- result = getattr (cf_var , "_Encoding" , None )
155- if result is not None :
156- try :
157- # Accept + normalise naming of encodings
158- result = codecs .lookup (result ).name
159- # NOTE: if encoding does not suit data, errors can occur.
160- # For example, _Encoding = "ascii", with non-ascii content.
161- except LookupError :
162- # Unrecognised encoding name : handle this as just a warning
163- msg = (
164- f"Ignoring unknown encoding for variable { cf_var .name !r} : "
165- f"_Encoding = { result !r} ."
166- )
167- warntype = IrisCfSaveWarning if writing else IrisCfLoadWarning
168- warnings .warn (msg , category = warntype )
169- # Proceed as if there is no specified encoding
170- result = None
171-
172- if result is None :
173- if writing :
174- result = DEFAULT_WRITE_ENCODING
175- else :
176- result = DEFAULT_READ_ENCODING
177- return result
178-
def _get_string_width(self) -> int:
    """Return the default string width (in characters) for this variable.

    The length of the variable's character dimension (``self.n_chars_dim``, a
    byte count) is converted to a character count using the width relation
    registered for the *read* encoding (``self.read_encoding``).

    Returns
    -------
    int
        The value of ``nbytes_2_nchars(self.n_chars_dim)`` for the read encoding.

    Raises
    ------
    KeyError
        If ``self.read_encoding`` has no entry in ``_ENCODING_WIDTH_TRANSLATIONS``.
    """
    # Look up the byte <-> character relations for the encoding used on read ...
    relations = _ENCODING_WIDTH_TRANSLATIONS[self.read_encoding]
    # ... and apply the bytes-to-characters direction to the dimension length.
    return relations.nbytes_2_nchars(self.n_chars_dim)
194166
195167 def decode_bytes_to_stringarray (self , data : np .ndarray ) -> np .ndarray :
196168 if self .is_chardata :
@@ -252,6 +224,98 @@ def context(self, perform_decoding: bool):
252224DEFAULT_WRITE_ENCODING = "ascii"
253225
254226
@dataclasses.dataclass
class EncodingWidthRelations:
    """Hold the default string-width <-> byte-dimension relations of an encoding.

    These translations are only a "best guess".

    When translating bytes (dtype S1) to strings (dtype Uxx), the chosen (default)
    string width may be longer than is needed for the actual content, but it is at
    least "safe".

    When translating strings to bytes, the encoded content *can* need more bytes
    than the default byte dimension length, in which case the data is truncated
    (with a warning).  This can be avoided if necessary, in specific cases, by
    recasting the data to a dtype with greater width (Uxx).

    Attributes
    ----------
    nchars_2_nbytes : Callable[[int], int]
        Maps a string width (characters) to a default byte dimension length.
    nbytes_2_nchars : Callable[[int], int]
        Maps a byte dimension length to a default string width (characters).
    """

    nchars_2_nbytes: Callable[[int], int]
    nbytes_2_nchars: Callable[[int], int]


_ENCODING_WIDTH_TRANSLATIONS = {
    # Single-byte-per-character guess.  This is exact for "ascii";  for "utf-8" it
    # is only a guess, since a character may occupy more than one byte.
    "ascii": EncodingWidthRelations(lambda x: x, lambda x: x),
    "utf-8": EncodingWidthRelations(lambda x: x, lambda x: x),
    # Python's "utf-16" codec writes a 2-byte BOM, then (at least) 2 bytes per
    # character : so N chars need (N + 1) * 2 bytes, and the inverse follows.
    "utf-16": EncodingWidthRelations(
        nchars_2_nbytes=lambda x: (x + 1) * 2,
        nbytes_2_nchars=lambda x: x // 2 - 1,
    ),
    # Python's "utf-32" codec writes a 4-byte BOM, then exactly 4 bytes per
    # character : so N chars need (N + 1) * 4 bytes.
    "utf-32": EncodingWidthRelations(
        nchars_2_nbytes=lambda x: (x + 1) * 4,
        nbytes_2_nchars=lambda x: x // 4 - 1,
    ),
}
# The (normalised) encoding names for which a width relation is defined.
SUPPORTED_ENCODINGS = list(_ENCODING_WIDTH_TRANSLATIONS.keys())
261+
262+
def _identify_encoding(encoding, var_name: str, writing: bool = False) -> str:
    """Normalise an encoding name and check that it is supported.

    Parameters
    ----------
    encoding : Any
        The requested encoding : None, or a string, or anything printable (it is
        converted with ``str()``).
    var_name : str
        Name of the relevant dataset variable : used only in warning messages.
    writing : bool
        Whether this is for writing rather than reading.  This selects the
        *default* return value, i.e. DEFAULT_WRITE_ENCODING rather than
        DEFAULT_READ_ENCODING, and the category of any warning emitted.

    Returns
    -------
    str
        If 'encoding' is given and supported, its normalised name, which is then
        one of SUPPORTED_ENCODINGS.  Otherwise, the relevant default encoding.

    Warns
    -----
    IrisCfSaveWarning or IrisCfLoadWarning
        If 'encoding' is given, but is either unknown to the Python "codecs"
        package or not one of SUPPORTED_ENCODINGS.
    """
    if encoding is not None:
        requested = str(encoding)

        # Normalise the name : NB it must be recognised by Python "codecs".
        try:
            normalised = codecs.lookup(requested).name
        except LookupError:
            normalised = None

        # Python "codecs" may recognise a name which we still don't support.
        if normalised is not None and normalised in SUPPORTED_ENCODINGS:
            return normalised

        # Unrecognised or unsupported name : this is handled as just a warning.
        warning_category = IrisCfSaveWarning if writing else IrisCfLoadWarning
        warnings.warn(
            f"Ignoring unsupported encoding for netCDF variable {var_name!r}: "
            f"_Encoding = {requested!r}, is not recognised as one of the supported "
            f"encodings, {SUPPORTED_ENCODINGS}.",
            category=warning_category,
        )

    # Nothing usable was specified : fall back on the relevant default.
    return DEFAULT_WRITE_ENCODING if writing else DEFAULT_READ_ENCODING
317+
318+
255319class EncodedVariable (VariableWrapper ):
256320 """A variable wrapper that translates variable data according to byte encodings."""
257321
0 commit comments