2222import time
2323import warnings
2424import zipfile
25- from collections .abc import Container , Iterable , Iterator , Mapping , Reversible , Sequence
25+ from collections .abc import Container , Iterable , Iterator , Reversible , Sequence
2626from contextlib import AbstractContextManager , ExitStack
2727from datetime import date , datetime
2828from os import PathLike
2929from pathlib import Path
3030from struct import Struct , calcsize , error , pack , unpack
31- from types import MappingProxyType , TracebackType
31+ from types import TracebackType
3232from typing import (
3333 IO ,
3434 Any ,
@@ -285,32 +285,57 @@ def _truncate_utf8_str(
285285
286286@functools .cache
287287def _BOM_and_dbf_decoded_pad_bytes (
288+ pad_byte : Literal [b" " , b"\x00 " ],
288289 encoding : str = "utf8" ,
289- ) -> tuple [bytes , Mapping [str , bytes ]]:
290+ ) -> tuple [bytes , dict [ str , bytes ], dict [ str , bytes ], dict [str , bytes ]]:
290291 try :
291292 BOM = "" .encode (encoding )
292293 except UnicodeEncodeError :
293294 BOM = b""
294295
295- tuples : list [tuple [str , bytes ]] = []
296- for pad_byte_str , N in {b" " : 5 , b"\x00 " : 5 , b" \x00 " : 2 }.items ():
297- # Max code unit size under UTF-8, UTF-16, and UTF-32 is 4 bytes.
298- for n in range (1 , N ):
299- pad_bytes = pad_byte_str * n
296+ N : int # code-unit size in bytes (possible length of
297+ # byte strings, that a single code point could encode to)
298+ if encoding .lower ().startswith ("utf32" ):
299+ N = 4
300+ elif encoding .lower ().startswith ("utf16" ):
301+ # Null bytes and ascii spaces don't encode to Surrogate-pairs
302+ N = 2
303+ else :
304+ # Both Ascii and UTF-8 handled here (UTF-8 is backward compatible with ascii)
305+ N = 1
306+
307+ def decoded_code_points_and_bytes (
308+ pad_byte_strs : Iterable [bytes ],
309+ ) -> dict [str , bytes ]:
310+ retval = {}
311+ for pad_bytes in pad_byte_strs :
300312 try :
301313 s : str = (BOM + pad_bytes ).decode (encoding )
302314 except UnicodeDecodeError :
303315 continue
304- tuples .append ((s , pad_bytes ))
305- break
306- return BOM , MappingProxyType (dict (tuples ))
316+ retval [s ] = pad_bytes
317+ return retval
318+
319+ # Max code unit size under UTF-8, UTF-16, and UTF-32 is 4 bytes.
320+ if pad_byte == b"\x00 " :
321+ # Just checking the field, in which asii spaces are technically valid
322+ # even though PyShp historically has converted them to underscores
323+ return BOM , {}, decoded_code_points_and_bytes ([b"\x00 " * N ]), {}
324+ else :
325+ pad_byte_strs = [b" " * i + b"\x00 " * (N - i ) for i in range (N + 1 )]
326+
327+ all_ascii_spaces = decoded_code_points_and_bytes ([b" " ])
328+ mixed = decoded_code_points_and_bytes (pad_byte_strs )
329+ all_null_bytes = decoded_code_points_and_bytes ([b"\x00 " ])
330+
331+ return BOM , all_ascii_spaces , mixed , all_null_bytes
307332
308333
309334def _encode_dbf_string (
310335 s : str ,
311336 size : int ,
312- decode : Decoder | None ,
313- pad_byte : bytes ,
337+ pad_byte : Literal [ b" " , b" \x00 " ] ,
338+ decode : Decoder | None = None ,
314339 encoding : str = "utf8" ,
315340 encodingErrors : str = "strict" ,
316341 strict : bool = True ,
@@ -337,7 +362,7 @@ def _encode_dbf_string(
337362 if len (encoded ) <= size :
338363 if i <= N - 1 :
339364 msg = (
340- f"Dropped { N - i } code points (e.g. characters)! "
365+ f"Dropped { N - i } out of { N } code points (e.g. characters)! "
341366 f"{ s } was truncated to { trimmed } (discarding: { s [i :]} ), "
342367 f"in order to encode it under { size } bytes for the field or field name. "
343368 f"Used: { encoding = } and { encodingErrors = } . "
@@ -358,21 +383,43 @@ def _encode_dbf_string(
358383 f"to a short enough byte string, using { encoding = } , { encodingErrors = } ({ BOM = !r} )"
359384 )
360385
361- _BOM , decoded_pad_bytes = _BOM_and_dbf_decoded_pad_bytes (encoding )
386+ _BOM , all_first , mixed , all_last = _BOM_and_dbf_decoded_pad_bytes (
387+ pad_byte , encoding
388+ )
389+ already_warned = False
362390
363- for suffix , pad_bytes in decoded_pad_bytes .items ():
364- if s .endswith (suffix ):
365- msg = (
366- f"Under the given encoding: { encoding } , "
367- f" the text (field name or 'C' or 'M' field): { s !r} "
368- f" ends with { suffix !r} , which "
369- f"encodes to the pad bytes: { pad_bytes !r} . "
370- "The real end of the actual data may be earlier. "
371- )
372- if strict :
373- raise DbfStringDataLoss (msg )
374- warnings .warn (msg , category = PossibleDataLoss )
375- break
391+ def check_and_trim (decoded_pad_bytes : dict [str , bytes ]) -> None :
392+
393+ nonlocal trimmed , already_warned
394+
395+ for suffix , pad_bytes in decoded_pad_bytes .items ():
396+ if not suffix :
397+ continue
398+ if len (suffix ) >= 2 :
399+ raise ValueError (
400+ f"Multiple code points: { suffix } encoded to: { pad_bytes !r} under { encoding = } "
401+ )
402+ if trimmed .endswith (suffix ):
403+ msg = (
404+ f"Under the given encoding: { encoding } , after truncation to { size } bytes,"
405+ f" the remaining text (field name or 'C' or 'M' field): { trimmed !r} "
406+ f" ends with { suffix !r} , which "
407+ f"encodes to the pad bytes: { pad_bytes !r} . "
408+ "The real end of the actual data may be earlier. "
409+ )
410+ if strict :
411+ raise DbfStringDataLoss (msg )
412+ if not already_warned :
413+ warnings .warn (msg , category = PossibleDataLoss )
414+ already_warned = True
415+ if len (set (pad_bytes )) == 1 : # all same byte => strip all code points
416+ trimmed = trimmed .rstrip (suffix )
417+ else :
418+ trimmed = trimmed .removesuffix (suffix )
419+
420+ check_and_trim (all_last )
421+ check_and_trim (mixed )
422+ check_and_trim (all_first )
376423
377424 if len (encoded ) < size :
378425 padded = encoded .ljust (size , pad_byte )
@@ -384,7 +431,8 @@ def _encode_dbf_string(
384431
385432 with warnings .catch_warnings ():
386433 warnings .simplefilter ("ignore" )
387- # TODO: Fuzz test this to see what it actually catches.
434+ # TODO: Fuzz test this to see what it actually catches,
435+ # as it makes encoding much slower.
388436 decoded = decode (
389437 b = padded ,
390438 encoding = encoding ,
@@ -407,7 +455,7 @@ def _encode_dbf_string(
407455
408456def _try_to_decode_dbf_name_or_text_field (
409457 b : bytes ,
410- pad_bytes : bytes , # Pad bytes will be trimmed ( from the R of b) in their order in the byte-string
458+ pad_bytes : bytes , # Pad bytes will be trimmed from the RHS (end) of b.
411459 encoding : str = "utf8" ,
412460 encodingErrors : str = "strict" ,
413461) -> str :
@@ -537,12 +585,12 @@ def from_unchecked(
537585
538586 if "\x00 " in name :
539587 msg = (
540- "Field names should not contain null characters "
588+ "Field names ought not contain null characters, "
541589 "as null bytes are used for padding in the header. "
542590 f"Got: { name = } "
543591 )
544592 if strict :
545- raise dbfFileException (msg )
593+ raise DbfStringDataLoss (msg )
546594 warnings .warn (msg , category = PossibleDataLoss )
547595
548596 try :
@@ -592,6 +640,7 @@ def from_unchecked(
592640 return inst
593641
594642 @classmethod
643+ @functools .cache
595644 def trim_name_until_encodable (
596645 cls ,
597646 name : str ,
@@ -4216,7 +4265,12 @@ def __init__(
42164265 self .recNum = 0
42174266 self ._is_utf8 = encoding .replace ("-" , "" ).replace ("_" , "" ).lower () == "utf8"
42184267
4219- self ._BOM , self ._decoded_pad_bytes = _BOM_and_dbf_decoded_pad_bytes (encoding )
4268+ (
4269+ self ._BOM ,
4270+ self ._decoded_ascii_spaces ,
4271+ self ._decoded_mixed_bytes ,
4272+ self ._decoded_null_bytes ,
4273+ ) = _BOM_and_dbf_decoded_pad_bytes (b" " , encoding )
42204274
42214275 def field (
42224276 # Types of args should match *Field
@@ -4434,14 +4488,21 @@ def _record(self, record: list[RecordValue]) -> None:
44344488 )
44354489 if self .strict :
44364490 raise DbfStringDataLoss (msg )
4437- warnings .warn (msg )
4491+ warnings .warn (msg , category = PossibleDataLoss )
4492+
4493+ depadded = trimmed
4494+ for byte in [b"\x00 " , b" " ]:
4495+ try :
4496+ decoded_pad_byte = byte .decode (
4497+ self .encoding , self .encodingErrors
4498+ )
4499+ except UnicodeDecodeError :
4500+ continue
4501+ depadded = depadded .rstrip (decoded_pad_byte )
44384502
4439- # TODO: Handle decoded_pad_bytes longer than 1
4440- pad_bytes = "" .join (self ._decoded_pad_bytes )
4441- depadded = trimmed .rstrip (pad_bytes )
44424503 if len (depadded ) < len (trimmed ):
44434504 msg = (
4444- f"Trimmed: { trimmed } , stringified: { str_val } of data: { value } "
4505+ f"Trimmed: { trimmed !r } , stringified: { str_val !r } of data: { value !r } "
44454506 f"ends in decoded pad bytes or decoded null bytes. "
44464507 "Data encoded as null bytes and pad bytes will probably not "
44474508 "be recovered by applications reading the Shapefile or dbf file "
0 commit comments