diff --git a/changes/0000.removal.md b/changes/0000.removal.md new file mode 100644 index 0000000000..075e4a15a9 --- /dev/null +++ b/changes/0000.removal.md @@ -0,0 +1,17 @@ +The ``Endian`` (``zarr.codecs.bytes.Endian``) and ``ShardingCodecIndexLocation`` +(``zarr.codecs.ShardingCodecIndexLocation``) enums are now deprecated. Pass the +equivalent literal string instead (e.g. ``"little"`` / ``"big"``, ``"start"`` / +``"end"``). The enum classes remain importable but emit ``DeprecationWarning`` +on member access, and will be removed in a future release. ``BytesCodec.endian`` +and ``ShardingCodec.index_location`` are now plain strings rather than enum +members. + +Two follow-on changes from this deprecation: + +- ``NDBuffer.byteorder`` now returns a literal string (``"little"`` or + ``"big"``) rather than an ``Endian`` member. Subclasses overriding this + property should update their return type. +- The module-level binding ``zarr.codecs.bytes.default_system_endian`` was + removed. ``BytesCodec()`` continues to default to ``sys.byteorder``; + external callers that imported ``default_system_endian`` should use + ``sys.byteorder`` directly. diff --git a/src/zarr/codecs/_deprecated_enum.py b/src/zarr/codecs/_deprecated_enum.py new file mode 100644 index 0000000000..af4b5a33df --- /dev/null +++ b/src/zarr/codecs/_deprecated_enum.py @@ -0,0 +1,49 @@ +"""Helpers for deprecating string-valued enums in favor of literal strings. + +See PR #3963 for context on the deprecation pattern. +""" + +from __future__ import annotations + +import warnings +from enum import Enum + + +class _DeprecatedStrEnumMeta(type): + """ + Metaclass for legacy enum-like classes. Accessing a member name on the + class (e.g. `LegacyShim.foo`) emits a `DeprecationWarning` and returns + the equivalent string. Members are declared by setting a `_members` + class attribute mapping each member name to its string value. 
+ """ + + _members: dict[str, str] + + def __getattr__(cls, name: str) -> str: + members: dict[str, str] = type.__getattribute__(cls, "_members") + if name in members: + warnings.warn( + f"{cls.__name__}.{name} is deprecated; pass the string {members[name]!r} instead.", + DeprecationWarning, + stacklevel=2, + ) + return members[name] + raise AttributeError(name) + + +def _coerce_enum_input(value: object, param_name: str, codec_name: str) -> object: + """ + If `value` is a real `enum.Enum` instance, emit a deprecation warning + naming `codec_name` and return `value.value`. Otherwise return `value` + unchanged. The third argument lets the warning text name the actual + codec (e.g. `BloscCodec`, `BytesCodec`, `ShardingCodec`). + """ + if isinstance(value, Enum): + warnings.warn( + f"Passing an enum to {codec_name}(..., {param_name}=...) is deprecated; " + "pass the equivalent literal string instead.", + DeprecationWarning, + stacklevel=3, + ) + return value.value + return value diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 8a20282060..087de716fc 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -1,9 +1,7 @@ from __future__ import annotations import asyncio -import warnings from dataclasses import dataclass, field, replace -from enum import Enum from functools import cached_property from typing import TYPE_CHECKING, ClassVar, Final, Literal, NotRequired, TypedDict @@ -12,6 +10,7 @@ from packaging.version import Version from zarr.abc.codec import BytesBytesCodec +from zarr.codecs._deprecated_enum import _coerce_enum_input, _DeprecatedStrEnumMeta from zarr.core.buffer.cpu import as_numpy_array_wrapper from zarr.core.common import JSON, NamedRequiredConfig, parse_named_configuration from zarr.core.dtype.common import HasItemSize @@ -59,27 +58,6 @@ class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): """ -class _DeprecatedStrEnumMeta(type): - """ - Metaclass for the legacy `BloscShuffle` / `BloscCname` 
classes. Accessing - a member name (e.g. `BloscShuffle.bitshuffle`) emits a `DeprecationWarning` - and returns the equivalent string. - """ - - _members: dict[str, str] - - def __getattr__(cls, name: str) -> str: - members: dict[str, str] = type.__getattribute__(cls, "_members") - if name in members: - warnings.warn( - f"{cls.__name__}.{name} is deprecated; pass the string {members[name]!r} instead.", - DeprecationWarning, - stacklevel=2, - ) - return members[name] - raise AttributeError(name) - - class BloscShuffle(metaclass=_DeprecatedStrEnumMeta): """ Deprecated. Pass a literal string (`"noshuffle"`, `"shuffle"`, or @@ -149,22 +127,6 @@ def parse_blocksize(data: JSON) -> int: raise TypeError(f"Value should be an int. Got {type(data)} instead.") -def _coerce_enum_input(value: object, param_name: str) -> object: - """ - If `value` is a real `enum.Enum` instance, emit a deprecation warning - and return `value.value`. Otherwise return `value` unchanged. - """ - if isinstance(value, Enum): - warnings.warn( - f"Passing an enum to BloscCodec(..., {param_name}=...) 
is deprecated; " - "pass the equivalent literal string instead.", - DeprecationWarning, - stacklevel=3, - ) - return value.value - return value - - def _parse_cname(data: object) -> BloscCnameLiteral: if isinstance(data, str) and data in BLOSC_CNAME: return data # type: ignore[return-value] @@ -285,8 +247,8 @@ def __init__( shuffle = "bitshuffle" self._tunable_attrs.update({"shuffle"}) - cname = _coerce_enum_input(cname, "cname") # type: ignore[assignment] - shuffle = _coerce_enum_input(shuffle, "shuffle") # type: ignore[assignment] + cname = _coerce_enum_input(cname, "cname", "BloscCodec") # type: ignore[assignment] + shuffle = _coerce_enum_input(shuffle, "shuffle", "BloscCodec") # type: ignore[assignment] typesize_parsed = parse_typesize(typesize) cname_parsed = _parse_cname(cname) diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 48f8b57c49..1c67c65e98 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -3,12 +3,12 @@ import sys import warnings from dataclasses import dataclass, replace -from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, ClassVar, Final, Literal from zarr.abc.codec import ArrayBytesCodec +from zarr.codecs._deprecated_enum import _coerce_enum_input, _DeprecatedStrEnumMeta from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import JSON, parse_named_configuration from zarr.core.dtype.common import HasEndianness from zarr.core.dtype.npy.structured import Struct @@ -18,16 +18,25 @@ from zarr.core.array_spec import ArraySpec -class Endian(Enum): +EndianLiteral = Literal["little", "big"] +"""Byte order of multi-byte numeric data.""" + +ENDIAN: Final = ("little", "big") + + +class Endian(metaclass=_DeprecatedStrEnumMeta): """ - Enum for endian type used by bytes codec. + Deprecated. Pass a literal string (`"little"` or `"big"`) directly to + `BytesCodec` instead. 
""" - big = "big" - little = "little" + _members: ClassVar[dict[str, str]] = {"little": "little", "big": "big"} -default_system_endian = Endian(sys.byteorder) +def _parse_endian(data: object) -> EndianLiteral: + if isinstance(data, str) and data in ENDIAN: + return data # type: ignore[return-value] + raise ValueError(f"endian must be one of {list(ENDIAN)!r}. Got {data!r}.") @dataclass(frozen=True) @@ -36,10 +45,14 @@ class BytesCodec(ArrayBytesCodec): is_fixed_size = True - endian: Endian | None + endian: EndianLiteral | None - def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None: - endian_parsed = None if endian is None else parse_enum(endian, Endian) + def __init__(self, *, endian: Endian | EndianLiteral | None = sys.byteorder) -> None: + if endian is None: + endian_parsed: EndianLiteral | None = None + else: + coerced = _coerce_enum_input(endian, "endian", "BytesCodec") + endian_parsed = _parse_endian(coerced) object.__setattr__(self, "endian", endian_parsed) @@ -55,7 +68,7 @@ def to_dict(self) -> dict[str, JSON]: if self.endian is None: return {"name": "bytes"} else: - return {"name": "bytes", "configuration": {"endian": self.endian.value}} + return {"name": "bytes", "configuration": {"endian": self.endian}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if isinstance(array_spec.dtype, Struct): @@ -67,7 +80,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: UserWarning, stacklevel=2, ) - return replace(self, endian=Endian.little) + return replace(self, endian="little") else: if self.endian is not None: return replace(self, endian=None) @@ -85,8 +98,7 @@ def _decode_sync( chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> NDBuffer: - # TODO: remove endianness enum in favor of literal union - endian_str = self.endian.value if self.endian is not None else None + endian_str = self.endian if isinstance(chunk_spec.dtype, HasEndianness): dtype = replace(chunk_spec.dtype, 
endianness=endian_str).to_native_dtype() # type: ignore[call-arg] else: @@ -121,9 +133,7 @@ def _encode_sync( and self.endian is not None and self.endian != chunk_array.byteorder ): - # type-ignore is a numpy bug - # see https://github.com/numpy/numpy/issues/26473 - new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) # type: ignore[arg-type] + new_dtype = chunk_array.dtype.newbyteorder(self.endian) chunk_array = chunk_array.astype(new_dtype) nd_array = chunk_array.as_ndarray_like() diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 609e32f87d..0906013d62 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -2,10 +2,9 @@ from collections.abc import Iterable, Mapping, MutableMapping, Sequence from dataclasses import dataclass, replace -from enum import Enum from functools import lru_cache from operator import itemgetter -from typing import TYPE_CHECKING, Any, NamedTuple, cast +from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, NamedTuple, cast import numpy as np import numpy.typing as npt @@ -24,6 +23,7 @@ RangeByteRequest, SuffixByteRequest, ) +from zarr.codecs._deprecated_enum import _coerce_enum_input, _DeprecatedStrEnumMeta from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -37,7 +37,6 @@ from zarr.core.chunk_grids import ChunkGrid from zarr.core.common import ( ShapeLike, - parse_enum, parse_named_configuration, parse_shapelike, product, @@ -74,17 +73,25 @@ ShardMutableMapping = MutableMapping[tuple[int, ...], Buffer | None] -class ShardingCodecIndexLocation(Enum): +IndexLocation = Literal["start", "end"] +"""Position of the shard index within the encoded shard.""" + +INDEX_LOCATION: Final = ("start", "end") + + +class ShardingCodecIndexLocation(metaclass=_DeprecatedStrEnumMeta): """ - Enum for index location used by the sharding codec. + Deprecated. 
Pass a literal string (`"start"` or `"end"`) directly to + `ShardingCodec` instead. """ - start = "start" - end = "end" + _members: ClassVar[dict[str, str]] = {"start": "start", "end": "end"} -def parse_index_location(data: object) -> ShardingCodecIndexLocation: - return parse_enum(data, ShardingCodecIndexLocation) +def _parse_index_location(data: object) -> IndexLocation: + if isinstance(data, str) and data in INDEX_LOCATION: + return data # type: ignore[return-value] + raise ValueError(f"index_location must be one of {list(INDEX_LOCATION)!r}. Got {data!r}.") @dataclass(frozen=True) @@ -240,7 +247,7 @@ async def from_bytes( shard_index_size = codec._shard_index_size(chunks_per_shard) obj = cls() obj.buf = buf - if codec.index_location == ShardingCodecIndexLocation.start: + if codec.index_location == "start": shard_index_bytes = obj.buf[:shard_index_size] else: shard_index_bytes = obj.buf[-shard_index_size:] @@ -310,7 +317,7 @@ class ShardingCodec( chunk_shape: tuple[int, ...] codecs: tuple[Codec, ...] index_codecs: tuple[Codec, ...] 
- index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end + index_location: IndexLocation = "end" def __init__( self, @@ -318,12 +325,15 @@ def __init__( chunk_shape: ShapeLike, codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),), index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()), - index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end, + index_location: ShardingCodecIndexLocation | IndexLocation = "end", ) -> None: chunk_shape_parsed = parse_shapelike(chunk_shape) codecs_parsed = parse_codecs(codecs) index_codecs_parsed = parse_codecs(index_codecs) - index_location_parsed = parse_index_location(index_location) + index_location_coerced = _coerce_enum_input( + index_location, "index_location", "ShardingCodec" + ) + index_location_parsed = _parse_index_location(index_location_coerced) object.__setattr__(self, "chunk_shape", chunk_shape_parsed) object.__setattr__(self, "codecs", codecs_parsed) @@ -348,7 +358,7 @@ def __setstate__(self, state: dict[str, Any]) -> None: object.__setattr__(self, "chunk_shape", parse_shapelike(config["chunk_shape"])) object.__setattr__(self, "codecs", parse_codecs(config["codecs"])) object.__setattr__(self, "index_codecs", parse_codecs(config["index_codecs"])) - object.__setattr__(self, "index_location", parse_index_location(config["index_location"])) + object.__setattr__(self, "index_location", _parse_index_location(config["index_location"])) # Use instance-local lru_cache to avoid memory leaks # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) @@ -371,7 +381,7 @@ def to_dict(self) -> dict[str, JSON]: "chunk_shape": self.chunk_shape, "codecs": tuple(s.to_dict() for s in self.codecs), "index_codecs": tuple(s.to_dict() for s in self.index_codecs), - "index_location": self.index_location.value, + "index_location": self.index_location, }, } @@ -659,7 +669,7 @@ async def _encode_shard_dict( return None index_bytes = await 
self._encode_shard_index(index) - if self.index_location == ShardingCodecIndexLocation.start: + if self.index_location == "start": empty_chunks_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64 index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes) index_bytes = await self._encode_shard_index( @@ -766,7 +776,7 @@ async def _load_shard_index_maybe( self, byte_getter: ByteGetter, chunks_per_shard: tuple[int, ...] ) -> _ShardIndex | None: shard_index_size = self._shard_index_size(chunks_per_shard) - if self.index_location == ShardingCodecIndexLocation.start: + if self.index_location == "start": index_bytes = await byte_getter.get( prototype=numpy_buffer_prototype(), byte_range=RangeByteRequest(0, shard_index_size), diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2e8ca5445d..03e67cd342 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -153,7 +153,7 @@ from zarr.abc.codec import CodecPipeline from zarr.abc.store import Store - from zarr.codecs.sharding import ShardingCodecIndexLocation + from zarr.codecs.sharding import IndexLocation from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar from zarr.storage import StoreLike from zarr.types import AnyArray, AnyAsyncArray, ArrayV2, ArrayV3, AsyncArrayV2, AsyncArrayV3 @@ -1743,7 +1743,7 @@ def info(self) -> Any: Order : C Read-only : False Store type : MemoryStore - Codecs : [{'endian': }] + Codecs : [{'endian': 'little'}] No. bytes : 480 """ return self._info() @@ -3919,7 +3919,7 @@ def info(self) -> Any: Order : C Read-only : False Store type : MemoryStore - Codecs : [BytesCodec(endian=)] + Codecs : [BytesCodec(endian='little')] No. bytes : 40 """ return self.async_array.info @@ -4003,7 +4003,7 @@ async def _shards_initialized( class ShardsConfigParam(TypedDict): shape: tuple[int, ...] - index_location: ShardingCodecIndexLocation | None + index_location: IndexLocation | None type ShardsLike = tuple[int, ...] 
| Sequence[Sequence[int]] | ShardsConfigParam | Literal["auto"] @@ -4388,7 +4388,7 @@ async def init_array( if zarr_format is None: zarr_format = _default_zarr_format() - from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation + from zarr.codecs.sharding import ShardingCodec zdtype = parse_dtype(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) @@ -4481,11 +4481,9 @@ async def init_array( codecs_out: tuple[Codec, ...] if inner is not None: inner_chunks_flat = as_regular_shape(inner.outer_chunks) - index_location = None + index_location: IndexLocation = "end" if isinstance(shards, dict): - index_location = ShardingCodecIndexLocation(shards.get("index_location", None)) - if index_location is None: - index_location = ShardingCodecIndexLocation.end + index_location = cast("IndexLocation", shards.get("index_location", "end")) sharding_codec = ShardingCodec( chunk_shape=inner_chunks_flat, codecs=sub_codecs, index_location=index_location ) diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 58a59975b7..6ef1eb1474 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -21,7 +21,7 @@ from collections.abc import Iterable, Sequence from typing import Self - from zarr.codecs.bytes import Endian + from zarr.codecs.bytes import EndianLiteral from zarr.core.common import BytesLike # Everything here is imported into ``zarr.core.buffer`` namespace. @@ -491,15 +491,13 @@ def shape(self) -> tuple[int, ...]: return self._data.shape @property - def byteorder(self) -> Endian: - from zarr.codecs.bytes import Endian - + def byteorder(self) -> EndianLiteral: if self.dtype.byteorder == "<": - return Endian.little + return "little" elif self.dtype.byteorder == ">": - return Endian.big + return "big" else: - return Endian(sys.byteorder) + return sys.byteorder def reshape(self, newshape: tuple[int, ...] 
| Literal[-1]) -> Self: return self.__class__(self._data.reshape(newshape)) diff --git a/tests/conftest.py b/tests/conftest.py index 3515acace0..cebd0a3183 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,7 +16,7 @@ import zarr.registry from zarr import AsyncGroup, config from zarr.abc.store import Store -from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation +from zarr.codecs.sharding import IndexLocation, ShardingCodec from zarr.core.array import ( _parse_chunk_encoding_v2, _parse_chunk_encoding_v3, @@ -399,11 +399,9 @@ def create_array_metadata( codecs_out: tuple[Codec, ...] if inner is not None: inner_chunks_flat = as_regular_shape(inner.outer_chunks) - index_location = None + index_location: IndexLocation = "end" if isinstance(shards, dict): - index_location = ShardingCodecIndexLocation(shards.get("index_location", None)) - if index_location is None: - index_location = ShardingCodecIndexLocation.end + index_location = cast("IndexLocation", shards.get("index_location", "end")) sharding_codec = ShardingCodec( chunk_shape=inner_chunks_flat, codecs=sub_codecs, diff --git a/tests/test_codecs/test_bytes.py b/tests/test_codecs/test_bytes.py new file mode 100644 index 0000000000..25c786a405 --- /dev/null +++ b/tests/test_codecs/test_bytes.py @@ -0,0 +1,173 @@ +"""Tests for `BytesCodec` and the deprecation of the `Endian` enum.""" + +from __future__ import annotations + +import enum +import sys +import warnings +from typing import Any, cast + +import pytest + +from zarr.codecs.bytes import ( + ENDIAN, + BytesCodec, + Endian, + EndianLiteral, +) +from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.buffer import default_buffer_prototype +from zarr.core.dtype.npy.int import Int8, Int32 +from zarr.core.dtype.npy.structured import Struct + + +@pytest.mark.parametrize("endian", ENDIAN) +def test_bytes_codec_accepts_all_endians(endian: EndianLiteral) -> None: + """ + Every endian value in ENDIAN is accepted by BytesCodec 
and round-trips + to the same value on the stored attribute. Catches drift between the + EndianLiteral type alias and the runtime ENDIAN tuple. + """ + codec = BytesCodec(endian=endian) + assert codec.endian == endian + + +@pytest.mark.parametrize("endian", ENDIAN) +def test_bytes_codec_json_roundtrip(endian: EndianLiteral) -> None: + """ + BytesCodec.to_dict produces the spec-defined wire shape and the + round-trip through from_dict preserves equality. Asserting the literal + JSON shape catches drift between BytesCodec's runtime representation and + the codec's V3 on-disk form. + """ + codec = BytesCodec(endian=endian) + assert codec.to_dict() == {"name": "bytes", "configuration": {"endian": endian}} + restored = BytesCodec.from_dict(codec.to_dict()) + assert restored == codec + + +@pytest.mark.parametrize( + ("member", "expected"), + [("little", "little"), ("big", "big")], +) +def test_endian_member_access_warns(member: str, expected: str) -> None: + """ + Accessing a member on the deprecated `Endian` class emits a + `DeprecationWarning` and resolves to the equivalent literal string. + """ + with pytest.warns(DeprecationWarning, match=rf"Endian\.{member}"): + value = getattr(Endian, member) + assert value == expected + + +def test_endian_class_imports_silently() -> None: + """ + Importing the deprecated `Endian` class by name must not emit a warning; + only member access does. Guards against `bytes.py` accidentally + triggering its own deprecation warnings at import time. + """ + with warnings.catch_warnings(): + warnings.simplefilter("error") + from zarr.codecs.bytes import Endian as _Endian # noqa: F401 + + +def test_bytes_codec_init_with_enum_instance_warns() -> None: + """ + Passing a foreign `enum.Enum` instance to `BytesCodec.__init__` triggers + the init-level deprecation warning (from `_coerce_enum_input`) and + normalizes the value to the corresponding literal string. 
Covers the + case where a downstream package defined its own enum-shaped class to + bridge between zarr's old API and its own. + """ + + class LegacyEndian(enum.Enum): + little = "little" + + with pytest.warns(DeprecationWarning, match=r"Passing an enum to BytesCodec"): + codec = BytesCodec(endian=cast(Endian, LegacyEndian.little)) + assert codec.endian == "little" + + +def test_bytes_codec_init_with_deprecated_class_member() -> None: + """ + The realistic legacy-upgrade idiom: `BytesCodec(endian=Endian.little)`. + Member access on `Endian` emits one `DeprecationWarning` (from the + metaclass) and resolves to the bare string, which `BytesCodec` then + accepts without further warning. No second warning from + `_coerce_enum_input` because the metaclass already produced a string. + + The `cast` is necessary because the metaclass `__getattr__` is typed + as returning `str`, which does not statically match the codec's + `EndianLiteral` parameter even though the runtime value does. + """ + with pytest.warns(DeprecationWarning, match=r"Endian\.little"): + codec = BytesCodec(endian=cast(EndianLiteral, Endian.little)) + assert codec.endian == "little" + + +def test_bytes_codec_rejects_unknown_endian() -> None: + """ + `BytesCodec.__init__` raises `ValueError` when given a string outside + `ENDIAN`, and the error message names the offending parameter. + """ + kwargs: dict[str, Any] = {"endian": "north"} + with pytest.raises(ValueError, match="endian must be one of"): + BytesCodec(**kwargs) + + +def test_endian_attribute_error_for_unknown_member() -> None: + """ + Attribute access for a name that is not a known member of the + deprecated `Endian` class falls through to `AttributeError`, matching + the behavior of a regular class. 
+ """ + with pytest.raises(AttributeError): + getattr(Endian, "not_a_member") # noqa: B009 + + +def test_bytes_codec_default_endian_matches_system() -> None: + """ + Constructing `BytesCodec()` with no arguments yields a codec whose + `endian` matches `sys.byteorder`. This replaces the previous + `default_system_endian = Endian(sys.byteorder)` module-level binding. + """ + codec = BytesCodec() + assert codec.endian == sys.byteorder + + +def _make_array_spec(dtype: Any) -> ArraySpec: + """Build a minimal ArraySpec around the given dtype for codec.evolve testing.""" + return ArraySpec( + shape=(1,), + dtype=dtype, + fill_value=0, + config=cast(ArrayConfig, {}), + prototype=default_buffer_prototype(), + ) + + +def test_bytes_codec_evolve_structured_multi_byte_fields_warns_and_defaults() -> None: + """ + BytesCodec(endian=None).evolve_from_array_spec(spec) with a structured dtype + whose fields contain multi-byte members emits a UserWarning about the + missing endian and returns a codec with endian set to "little" for legacy + compatibility. + """ + codec = BytesCodec(endian=None) + dtype = Struct(fields=(("a", Int32()), ("b", Int32()))) + spec = _make_array_spec(dtype) + with pytest.warns(UserWarning, match=r"Missing 'endian' for structured dtype"): + evolved = codec.evolve_from_array_spec(spec) + assert evolved.endian == "little" + + +def test_bytes_codec_evolve_structured_single_byte_fields_clears_endian() -> None: + """ + For a structured dtype whose fields are all single-byte, BytesCodec drops + its endian on evolve (endian is meaningless for single-byte content). 
+ """ + codec = BytesCodec(endian="little") + dtype = Struct(fields=(("a", Int8()), ("b", Int8()))) + spec = _make_array_spec(dtype) + evolved = codec.evolve_from_array_spec(spec) + assert evolved.endian is None diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 233cc4cb77..6471746d82 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,5 +1,7 @@ +import enum import pickle -from typing import Any +import warnings +from typing import Any, cast import numpy as np import numpy.typing as npt @@ -13,10 +15,15 @@ from zarr.codecs import ( BloscCodec, ShardingCodec, - ShardingCodecIndexLocation, TransposeCodec, ) +from zarr.codecs.sharding import ( + INDEX_LOCATION, + IndexLocation, + ShardingCodecIndexLocation, +) from zarr.core.buffer import NDArrayLike, default_buffer_prototype +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.storage import StorePath, ZipStore from ..conftest import ArrayRequest @@ -38,7 +45,7 @@ def test_sharding( store: Store, array_fixture: npt.NDArray[Any], - index_location: ShardingCodecIndexLocation, + index_location: IndexLocation, offset: int, ) -> None: """ @@ -76,7 +83,7 @@ def test_sharding( @pytest.mark.parametrize("offset", [0, 10]) def test_sharding_scalar( store: Store, - index_location: ShardingCodecIndexLocation, + index_location: IndexLocation, offset: int, ) -> None: """ @@ -110,7 +117,7 @@ def test_sharding_scalar( indirect=["array_fixture"], ) def test_sharding_partial( - store: Store, array_fixture: npt.NDArray[Any], index_location: ShardingCodecIndexLocation + store: Store, array_fixture: npt.NDArray[Any], index_location: IndexLocation ) -> None: data = array_fixture spath = StorePath(store) @@ -146,7 +153,7 @@ def test_sharding_partial( indirect=["array_fixture"], ) def test_sharding_partial_readwrite( - store: Store, array_fixture: npt.NDArray[Any], index_location: ShardingCodecIndexLocation + store: Store, array_fixture: 
npt.NDArray[Any], index_location: IndexLocation ) -> None: data = array_fixture spath = StorePath(store) @@ -178,7 +185,7 @@ def test_sharding_partial_readwrite( @pytest.mark.parametrize("index_location", ["start", "end"]) @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) def test_sharding_partial_read( - store: Store, array_fixture: npt.NDArray[Any], index_location: ShardingCodecIndexLocation + store: Store, array_fixture: npt.NDArray[Any], index_location: IndexLocation ) -> None: data = array_fixture spath = StorePath(store) @@ -207,7 +214,7 @@ def test_sharding_partial_read( @pytest.mark.parametrize("index_location", ["start", "end"]) @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) def test_sharding_partial_overwrite( - store: Store, array_fixture: npt.NDArray[Any], index_location: ShardingCodecIndexLocation + store: Store, array_fixture: npt.NDArray[Any], index_location: IndexLocation ) -> None: data = array_fixture[:10, :10, :10] spath = StorePath(store) @@ -258,8 +265,8 @@ def test_sharding_partial_overwrite( def test_nested_sharding( store: Store, array_fixture: npt.NDArray[Any], - outer_index_location: ShardingCodecIndexLocation, - inner_index_location: ShardingCodecIndexLocation, + outer_index_location: IndexLocation, + inner_index_location: IndexLocation, ) -> None: data = array_fixture spath = StorePath(store) @@ -306,8 +313,8 @@ def test_nested_sharding( def test_nested_sharding_create_array( store: Store, array_fixture: npt.NDArray[Any], - outer_index_location: ShardingCodecIndexLocation, - inner_index_location: ShardingCodecIndexLocation, + outer_index_location: IndexLocation, + inner_index_location: IndexLocation, ) -> None: data = array_fixture spath = StorePath(store) @@ -406,12 +413,8 @@ def test_pickle() -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) -@pytest.mark.parametrize( - "index_location", [ShardingCodecIndexLocation.start, 
ShardingCodecIndexLocation.end]
-)
-async def test_sharding_with_empty_inner_chunk(
-    store: Store, index_location: ShardingCodecIndexLocation
-) -> None:
+@pytest.mark.parametrize("index_location", ["start", "end"])
+async def test_sharding_with_empty_inner_chunk(store: Store, index_location: IndexLocation) -> None:
     data = np.arange(0, 16 * 16, dtype="uint32").reshape((16, 16))
     fill_value = 1
 
@@ -433,13 +436,10 @@ async def test_sharding_with_empty_inner_chunk(
 
 
 @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
-@pytest.mark.parametrize(
-    "index_location",
-    [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end],
-)
+@pytest.mark.parametrize("index_location", ["start", "end"])
 @pytest.mark.parametrize("chunks_per_shard", [(5, 2), (2, 5), (5, 5)])
 async def test_sharding_with_chunks_per_shard(
-    store: Store, index_location: ShardingCodecIndexLocation, chunks_per_shard: tuple[int]
+    store: Store, index_location: IndexLocation, chunks_per_shard: tuple[int, int]
 ) -> None:
     chunk_shape = (2, 1)
     shape = tuple(x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False))
@@ -554,3 +554,142 @@ def test_sharding_mixed_integer_list_indexing(store: Store) -> None:
     s3 = sharded[0:5, 1, 0:3]
     assert c3.shape == s3.shape == (5, 3)  # type: ignore[union-attr]
     np.testing.assert_array_equal(c3, s3)
+
+
+# --- Tests for ShardingCodecIndexLocation deprecation ---
+
+
+@pytest.mark.parametrize("location", INDEX_LOCATION)
+def test_sharding_codec_accepts_all_index_locations(location: IndexLocation) -> None:
+    """
+    Every value in INDEX_LOCATION is accepted by ShardingCodec and round-trips
+    to the same value on the stored attribute. Catches drift between the
+    IndexLocation type alias and the runtime INDEX_LOCATION tuple.
+    """
+    codec = ShardingCodec(chunk_shape=(1,), index_location=location)
+    assert codec.index_location == location
+
+
+@pytest.mark.parametrize("location", INDEX_LOCATION)
+def test_sharding_codec_json_roundtrip_index_location(
+    location: IndexLocation,
+) -> None:
+    """
+    ShardingCodec.to_dict writes index_location as the bare literal string,
+    and the round-trip through from_dict preserves equality. Asserting the
+    on-disk index_location value (not just the round-trip) catches drift
+    between ShardingCodec's runtime representation and the V3 wire form.
+    """
+    codec = ShardingCodec(chunk_shape=(1,), index_location=location)
+    serialized = codec.to_dict()
+    assert serialized["configuration"]["index_location"] == location  # type: ignore[index, call-overload]
+    restored = ShardingCodec.from_dict(serialized)
+    assert restored == codec
+
+
+@pytest.mark.parametrize(
+    ("member", "expected"),
+    [("start", "start"), ("end", "end")],
+)
+def test_sharding_index_location_member_access_warns(member: str, expected: str) -> None:
+    """
+    Accessing a member on the deprecated ShardingCodecIndexLocation class
+    emits a DeprecationWarning and resolves to the equivalent literal string.
+    """
+    with pytest.warns(DeprecationWarning, match=rf"ShardingCodecIndexLocation\.{member}"):
+        value = getattr(ShardingCodecIndexLocation, member)
+    assert value == expected
+
+
+def test_sharding_index_location_class_imports_silently() -> None:
+    """
+    Importing the deprecated ShardingCodecIndexLocation class by name must not
+    emit a warning; only member access does.
+    """
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        from zarr.codecs.sharding import (  # noqa: F401
+            ShardingCodecIndexLocation as _SCIL,
+        )
+
+
+def test_sharding_codec_init_with_enum_instance_warns() -> None:
+    """
+    Passing a foreign enum.Enum instance to ShardingCodec.__init__ triggers
+    the init-level deprecation warning (from _coerce_enum_input) and
+    normalizes the value to the corresponding literal string. Covers the
+    case where a downstream package defined its own enum-shaped class to
+    bridge between zarr's old API and its own.
+    """
+
+    class LegacyIndexLocation(enum.Enum):
+        end = "end"
+
+    with pytest.warns(DeprecationWarning, match=r"Passing an enum to ShardingCodec"):
+        codec = ShardingCodec(
+            chunk_shape=(1,),
+            index_location=cast(ShardingCodecIndexLocation, LegacyIndexLocation.end),
+        )
+    assert codec.index_location == "end"
+
+
+def test_sharding_codec_init_with_deprecated_class_member() -> None:
+    """
+    The realistic legacy-upgrade idiom: ShardingCodec(index_location=ShardingCodecIndexLocation.end).
+    Member access on ShardingCodecIndexLocation emits one DeprecationWarning
+    (from the metaclass) and resolves to the bare string, which ShardingCodec
+    then accepts without further warning. No second warning from
+    _coerce_enum_input because the metaclass already produced a string.
+
+    The cast is necessary because the metaclass __getattr__ is typed as
+    returning str, which does not statically match the codec's
+    IndexLocation parameter even though the runtime value does.
+    """
+    with pytest.warns(DeprecationWarning, match=r"ShardingCodecIndexLocation\.end"):
+        codec = ShardingCodec(
+            chunk_shape=(1,),
+            index_location=cast(IndexLocation, ShardingCodecIndexLocation.end),
+        )
+    assert codec.index_location == "end"
+
+
+def test_sharding_codec_rejects_unknown_index_location() -> None:
+    """
+    ShardingCodec.__init__ raises ValueError when index_location is outside
+    INDEX_LOCATION, and the error message names the offending parameter.
+    """
+    kwargs: dict[str, Any] = {"chunk_shape": (1,), "index_location": "middle"}
+    with pytest.raises(ValueError, match="index_location must be one of"):
+        ShardingCodec(**kwargs)
+
+
+def test_sharding_index_location_attribute_error_for_unknown_member() -> None:
+    """
+    Attribute access for a name that is not a known member of the deprecated
+    ShardingCodecIndexLocation class falls through to AttributeError.
+    """
+    with pytest.raises(AttributeError):
+        getattr(ShardingCodecIndexLocation, "not_a_member")  # noqa: B009
+
+
+@pytest.mark.parametrize("index_location", INDEX_LOCATION)
+def test_create_array_with_dict_shards_index_location(
+    index_location: IndexLocation,
+) -> None:
+    """
+    zarr.create_array accepts a `ShardsConfigParam`-shaped dict for `shards`
+    with an explicit `index_location`, and the resulting sharding codec
+    stores that value. Covers the `isinstance(shards, dict)` branch in
+    init_array that the tuple-shaped `shards` form doesn't reach.
+    """
+    arr = zarr.create_array(
+        store={},
+        shape=(8,),
+        chunks=(2,),
+        shards={"shape": (4,), "index_location": index_location},
+        dtype="uint8",
+    )
+    assert isinstance(arr.metadata, ArrayV3Metadata)  # needed for mypy
+    sharding = arr.metadata.codecs[0]
+    assert isinstance(sharding, ShardingCodec)
+    assert sharding.index_location == index_location
diff --git a/tests/test_info.py b/tests/test_info.py
index 28c8803c83..08f2318dc2 100644
--- a/tests/test_info.py
+++ b/tests/test_info.py
@@ -74,7 +74,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None:
         Read-only           : True
         Store type          : MemoryStore
         Filters             : ()
-        Serializer          : BytesCodec(endian=<Endian.little: 'little'>)
+        Serializer          : BytesCodec(endian='little')
         Compressors         : ()""")
 
 
@@ -117,7 +117,7 @@ def test_array_info_complete(
         Read-only           : True
         Store type          : MemoryStore
         Filters             : ()
-        Serializer          : BytesCodec(endian=<Endian.little: 'little'>)
+        Serializer          : BytesCodec(endian='little')
         Compressors         : ()
         No. bytes           : {count_bytes} ({count_bytes_formatted})
         No. bytes stored    : {count_bytes_stored} ({count_bytes_stored_formatted})