Skip to content

Commit 08d465d

Browse files
authored
Merge pull request #430 from JamesParrott/Add_unicode_to_hypothesis_tests
Add unicode to hypothesis tests
2 parents afc8e71 + 1905d73 commit 08d465d

2 files changed

Lines changed: 204 additions & 105 deletions

File tree

src/shapefile.py

Lines changed: 99 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222
import time
2323
import warnings
2424
import zipfile
25-
from collections.abc import Container, Iterable, Iterator, Mapping, Reversible, Sequence
25+
from collections.abc import Container, Iterable, Iterator, Reversible, Sequence
2626
from contextlib import AbstractContextManager, ExitStack
2727
from datetime import date, datetime
2828
from os import PathLike
2929
from pathlib import Path
3030
from struct import Struct, calcsize, error, pack, unpack
31-
from types import MappingProxyType, TracebackType
31+
from types import TracebackType
3232
from typing import (
3333
IO,
3434
Any,
@@ -285,32 +285,57 @@ def _truncate_utf8_str(
285285

286286
@functools.cache
287287
def _BOM_and_dbf_decoded_pad_bytes(
288+
pad_byte: Literal[b" ", b"\x00"],
288289
encoding: str = "utf8",
289-
) -> tuple[bytes, Mapping[str, bytes]]:
290+
) -> tuple[bytes, dict[str, bytes], dict[str, bytes], dict[str, bytes]]:
290291
try:
291292
BOM = "".encode(encoding)
292293
except UnicodeEncodeError:
293294
BOM = b""
294295

295-
tuples: list[tuple[str, bytes]] = []
296-
for pad_byte_str, N in {b" ": 5, b"\x00": 5, b" \x00": 2}.items():
297-
# Max code unit size under UTF-8, UTF-16, and UTF-32 is 4 bytes.
298-
for n in range(1, N):
299-
pad_bytes = pad_byte_str * n
296+
N: int # code-unit size in bytes (possible length of
297+
# byte strings, that a single code point could encode to)
298+
if encoding.lower().startswith("utf32"):
299+
N = 4
300+
elif encoding.lower().startswith("utf16"):
301+
# Null bytes and ascii spaces don't encode to Surrogate-pairs
302+
N = 2
303+
else:
304+
# Both ASCII and UTF-8 are handled here (UTF-8 is backward compatible with ASCII)
305+
N = 1
306+
307+
def decoded_code_points_and_bytes(
308+
pad_byte_strs: Iterable[bytes],
309+
) -> dict[str, bytes]:
310+
retval = {}
311+
for pad_bytes in pad_byte_strs:
300312
try:
301313
s: str = (BOM + pad_bytes).decode(encoding)
302314
except UnicodeDecodeError:
303315
continue
304-
tuples.append((s, pad_bytes))
305-
break
306-
return BOM, MappingProxyType(dict(tuples))
316+
retval[s] = pad_bytes
317+
return retval
318+
319+
# Max code unit size under UTF-8, UTF-16, and UTF-32 is 4 bytes.
320+
if pad_byte == b"\x00":
321+
# Just checking the field, in which ASCII spaces are technically valid
322+
# even though PyShp historically has converted them to underscores
323+
return BOM, {}, decoded_code_points_and_bytes([b"\x00" * N]), {}
324+
else:
325+
pad_byte_strs = [b" " * i + b"\x00" * (N - i) for i in range(N + 1)]
326+
327+
all_ascii_spaces = decoded_code_points_and_bytes([b" "])
328+
mixed = decoded_code_points_and_bytes(pad_byte_strs)
329+
all_null_bytes = decoded_code_points_and_bytes([b"\x00"])
330+
331+
return BOM, all_ascii_spaces, mixed, all_null_bytes
307332

308333

309334
def _encode_dbf_string(
310335
s: str,
311336
size: int,
312-
decode: Decoder | None,
313-
pad_byte: bytes,
337+
pad_byte: Literal[b" ", b"\x00"],
338+
decode: Decoder | None = None,
314339
encoding: str = "utf8",
315340
encodingErrors: str = "strict",
316341
strict: bool = True,
@@ -337,7 +362,7 @@ def _encode_dbf_string(
337362
if len(encoded) <= size:
338363
if i <= N - 1:
339364
msg = (
340-
f"Dropped {N - i} code points (e.g. characters)! "
365+
f"Dropped {N - i} out of {N} code points (e.g. characters)! "
341366
f"{s} was truncated to {trimmed} (discarding: {s[i:]}), "
342367
f"in order to encode it under {size} bytes for the field or field name. "
343368
f"Used: {encoding=} and {encodingErrors=}. "
@@ -358,21 +383,43 @@ def _encode_dbf_string(
358383
f"to a short enough byte string, using {encoding=}, {encodingErrors=} ({BOM=!r})"
359384
)
360385

361-
_BOM, decoded_pad_bytes = _BOM_and_dbf_decoded_pad_bytes(encoding)
386+
_BOM, all_first, mixed, all_last = _BOM_and_dbf_decoded_pad_bytes(
387+
pad_byte, encoding
388+
)
389+
already_warned = False
362390

363-
for suffix, pad_bytes in decoded_pad_bytes.items():
364-
if s.endswith(suffix):
365-
msg = (
366-
f"Under the given encoding: {encoding}, "
367-
f" the text (field name or 'C' or 'M' field): {s!r} "
368-
f" ends with {suffix!r}, which "
369-
f"encodes to the pad bytes: {pad_bytes!r}. "
370-
"The real end of the actual data may be earlier. "
371-
)
372-
if strict:
373-
raise DbfStringDataLoss(msg)
374-
warnings.warn(msg, category=PossibleDataLoss)
375-
break
391+
def check_and_trim(decoded_pad_bytes: dict[str, bytes]) -> None:
392+
393+
nonlocal trimmed, already_warned
394+
395+
for suffix, pad_bytes in decoded_pad_bytes.items():
396+
if not suffix:
397+
continue
398+
if len(suffix) >= 2:
399+
raise ValueError(
400+
f"Multiple code points: {suffix} encoded to: {pad_bytes!r} under {encoding=}"
401+
)
402+
if trimmed.endswith(suffix):
403+
msg = (
404+
f"Under the given encoding: {encoding}, after truncation to {size} bytes,"
405+
f" the remaining text (field name or 'C' or 'M' field): {trimmed!r} "
406+
f" ends with {suffix!r}, which "
407+
f"encodes to the pad bytes: {pad_bytes!r}. "
408+
"The real end of the actual data may be earlier. "
409+
)
410+
if strict:
411+
raise DbfStringDataLoss(msg)
412+
if not already_warned:
413+
warnings.warn(msg, category=PossibleDataLoss)
414+
already_warned = True
415+
if len(set(pad_bytes)) == 1: # all same byte => strip all code points
416+
trimmed = trimmed.rstrip(suffix)
417+
else:
418+
trimmed = trimmed.removesuffix(suffix)
419+
420+
check_and_trim(all_last)
421+
check_and_trim(mixed)
422+
check_and_trim(all_first)
376423

377424
if len(encoded) < size:
378425
padded = encoded.ljust(size, pad_byte)
@@ -384,7 +431,8 @@ def _encode_dbf_string(
384431

385432
with warnings.catch_warnings():
386433
warnings.simplefilter("ignore")
387-
# TODO: Fuzz test this to see what it actually catches.
434+
# TODO: Fuzz test this to see what it actually catches,
435+
# as it makes encoding much slower.
388436
decoded = decode(
389437
b=padded,
390438
encoding=encoding,
@@ -407,7 +455,7 @@ def _encode_dbf_string(
407455

408456
def _try_to_decode_dbf_name_or_text_field(
409457
b: bytes,
410-
pad_bytes: bytes, # Pad bytes will be trimmed (from the R of b) in their order in the byte-string
458+
pad_bytes: bytes, # Pad bytes will be trimmed from the RHS (end) of b.
411459
encoding: str = "utf8",
412460
encodingErrors: str = "strict",
413461
) -> str:
@@ -537,12 +585,12 @@ def from_unchecked(
537585

538586
if "\x00" in name:
539587
msg = (
540-
"Field names should not contain null characters "
588+
"Field names ought not contain null characters, "
541589
"as null bytes are used for padding in the header. "
542590
f"Got: {name=} "
543591
)
544592
if strict:
545-
raise dbfFileException(msg)
593+
raise DbfStringDataLoss(msg)
546594
warnings.warn(msg, category=PossibleDataLoss)
547595

548596
try:
@@ -592,6 +640,7 @@ def from_unchecked(
592640
return inst
593641

594642
@classmethod
643+
@functools.cache
595644
def trim_name_until_encodable(
596645
cls,
597646
name: str,
@@ -4216,7 +4265,12 @@ def __init__(
42164265
self.recNum = 0
42174266
self._is_utf8 = encoding.replace("-", "").replace("_", "").lower() == "utf8"
42184267

4219-
self._BOM, self._decoded_pad_bytes = _BOM_and_dbf_decoded_pad_bytes(encoding)
4268+
(
4269+
self._BOM,
4270+
self._decoded_ascii_spaces,
4271+
self._decoded_mixed_bytes,
4272+
self._decoded_null_bytes,
4273+
) = _BOM_and_dbf_decoded_pad_bytes(b" ", encoding)
42204274

42214275
def field(
42224276
# Types of args should match *Field
@@ -4434,14 +4488,21 @@ def _record(self, record: list[RecordValue]) -> None:
44344488
)
44354489
if self.strict:
44364490
raise DbfStringDataLoss(msg)
4437-
warnings.warn(msg)
4491+
warnings.warn(msg, category=PossibleDataLoss)
4492+
4493+
depadded = trimmed
4494+
for byte in [b"\x00", b" "]:
4495+
try:
4496+
decoded_pad_byte = byte.decode(
4497+
self.encoding, self.encodingErrors
4498+
)
4499+
except UnicodeDecodeError:
4500+
continue
4501+
depadded = depadded.rstrip(decoded_pad_byte)
44384502

4439-
# TODO: Handle decoded_pad_bytes longer than 1
4440-
pad_bytes = "".join(self._decoded_pad_bytes)
4441-
depadded = trimmed.rstrip(pad_bytes)
44424503
if len(depadded) < len(trimmed):
44434504
msg = (
4444-
f"Trimmed: {trimmed}, stringified: {str_val} of data: {value} "
4505+
f"Trimmed: {trimmed!r}, stringified: {str_val!r} of data: {value!r} "
44454506
f"ends in decoded pad bytes or decoded null bytes. "
44464507
"Data encoded as null bytes and pad bytes will probably not "
44474508
"be recovered by applications reading the Shapefile or dbf file "

0 commit comments

Comments
 (0)