Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions changes/0000.removal.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
The ``Endian`` (``zarr.codecs.bytes.Endian``) and ``ShardingCodecIndexLocation``
(``zarr.codecs.ShardingCodecIndexLocation``) enums are now deprecated. Pass the
equivalent literal string instead (e.g. ``"little"`` / ``"big"``, ``"start"`` /
``"end"``). The enum classes remain importable but emit ``DeprecationWarning``
on member access, and will be removed in a future release. ``BytesCodec.endian``
and ``ShardingCodec.index_location`` are now plain strings rather than enum
members.

Two follow-on changes from this deprecation:

- ``NDBuffer.byteorder`` now returns a literal string (``"little"`` or
``"big"``) rather than an ``Endian`` member. Subclasses overriding this
property should update their return type.
- The module-level binding ``zarr.codecs.bytes.default_system_endian`` was
removed. ``BytesCodec()`` continues to default to ``sys.byteorder``;
external callers that imported ``default_system_endian`` should use
``sys.byteorder`` directly.
49 changes: 49 additions & 0 deletions src/zarr/codecs/_deprecated_enum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Helpers for deprecating string-valued enums in favor of literal strings.

See PR #3963 for context on the deprecation pattern.
"""

from __future__ import annotations

import warnings
from enum import Enum


class _DeprecatedStrEnumMeta(type):
    """
    Metaclass for legacy enum-like classes.

    Accessing a member name on the class (e.g. `LegacyShim.foo`) emits a
    `DeprecationWarning` and returns the equivalent string. Members are
    declared by setting a `_members` class attribute mapping each member
    name to its string value.

    Any other missing attribute raises an `AttributeError` whose message
    names both the class and the requested attribute.
    """

    _members: dict[str, str]

    def __getattr__(cls, name: str) -> str:
        """
        Resolve a deprecated member name to its string value.

        Parameters
        ----------
        name
            Attribute name that normal class attribute lookup failed to find.

        Returns
        -------
        str
            The string registered for `name` in `_members`.

        Raises
        ------
        AttributeError
            If `name` is not a declared member.
        """
        # Read `_members` through `type.__getattribute__` so a missing
        # mapping cannot re-enter this hook. A class that declares no
        # `_members` is treated as having no deprecated members, so the
        # error below reports the attribute the caller actually asked for.
        try:
            members: dict[str, str] = type.__getattribute__(cls, "_members")
        except AttributeError:
            members = {}

        if name in members:
            warnings.warn(
                f"{cls.__name__}.{name} is deprecated; pass the string {members[name]!r} instead.",
                DeprecationWarning,
                # Attribute the warning to the code that accessed the member.
                stacklevel=2,
            )
            return members[name]
        raise AttributeError(f"type object {cls.__name__!r} has no attribute {name!r}")


def _coerce_enum_input(value: object, param_name: str, codec_name: str) -> object:
    """
    Unwrap a real `enum.Enum` member into its underlying value.

    Anything that is not an `enum.Enum` instance is handed back untouched.
    For an enum member, a `DeprecationWarning` is emitted whose text
    mentions `codec_name` (e.g. `BloscCodec`, `BytesCodec`,
    `ShardingCodec`) and `param_name`, and `value.value` is returned.
    """
    if not isinstance(value, Enum):
        return value

    message = (
        f"Passing an enum to {codec_name}(..., {param_name}=...) is deprecated; "
        "pass the equivalent literal string instead."
    )
    # stacklevel=3 skips this helper and its immediate caller.
    warnings.warn(message, DeprecationWarning, stacklevel=3)
    return value.value
44 changes: 3 additions & 41 deletions src/zarr/codecs/blosc.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from __future__ import annotations

import asyncio
import warnings
from dataclasses import dataclass, field, replace
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING, ClassVar, Final, Literal, NotRequired, TypedDict

Expand All @@ -12,6 +10,7 @@
from packaging.version import Version

from zarr.abc.codec import BytesBytesCodec
from zarr.codecs._deprecated_enum import _coerce_enum_input, _DeprecatedStrEnumMeta
from zarr.core.buffer.cpu import as_numpy_array_wrapper
from zarr.core.common import JSON, NamedRequiredConfig, parse_named_configuration
from zarr.core.dtype.common import HasItemSize
Expand Down Expand Up @@ -59,27 +58,6 @@ class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]):
"""


class _DeprecatedStrEnumMeta(type):
    """
    Metaclass for the legacy `BloscShuffle` / `BloscCname` classes.

    Accessing a member name (e.g. `BloscShuffle.bitshuffle`) emits a
    `DeprecationWarning` and returns the equivalent string. Any other
    missing attribute raises an `AttributeError` whose message names both
    the class and the requested attribute.
    """

    _members: dict[str, str]

    def __getattr__(cls, name: str) -> str:
        """
        Resolve a deprecated member name to its string value.

        Raises
        ------
        AttributeError
            If `name` is not a key of the class's `_members` mapping.
        """
        # Read `_members` through `type.__getattribute__` so a missing
        # mapping cannot re-enter this hook; a class without `_members`
        # is treated as having no deprecated members.
        try:
            members: dict[str, str] = type.__getattribute__(cls, "_members")
        except AttributeError:
            members = {}

        if name in members:
            warnings.warn(
                f"{cls.__name__}.{name} is deprecated; pass the string {members[name]!r} instead.",
                DeprecationWarning,
                # Attribute the warning to the code that accessed the member.
                stacklevel=2,
            )
            return members[name]
        raise AttributeError(f"type object {cls.__name__!r} has no attribute {name!r}")


class BloscShuffle(metaclass=_DeprecatedStrEnumMeta):
"""
Deprecated. Pass a literal string (`"noshuffle"`, `"shuffle"`, or
Expand Down Expand Up @@ -149,22 +127,6 @@ def parse_blocksize(data: JSON) -> int:
raise TypeError(f"Value should be an int. Got {type(data)} instead.")


def _coerce_enum_input(
    value: object, param_name: str, codec_name: str = "BloscCodec"
) -> object:
    """
    If `value` is a real `enum.Enum` instance, emit a deprecation warning
    and return `value.value`. Otherwise return `value` unchanged.

    Parameters
    ----------
    value
        The argument supplied by the caller; may be an enum member or any
        other object.
    param_name
        Name of the constructor parameter, used in the warning text.
    codec_name
        Name of the codec class used in the warning text. Defaults to
        `"BloscCodec"`, so existing two-argument calls produce the same
        message as before.

    Returns
    -------
    object
        `value.value` for enum members, otherwise `value` itself.
    """
    if isinstance(value, Enum):
        warnings.warn(
            f"Passing an enum to {codec_name}(..., {param_name}=...) is deprecated; "
            "pass the equivalent literal string instead.",
            DeprecationWarning,
            # Skip this helper and its immediate caller.
            stacklevel=3,
        )
        return value.value
    return value


def _parse_cname(data: object) -> BloscCnameLiteral:
if isinstance(data, str) and data in BLOSC_CNAME:
return data # type: ignore[return-value]
Expand Down Expand Up @@ -285,8 +247,8 @@ def __init__(
shuffle = "bitshuffle"
self._tunable_attrs.update({"shuffle"})

cname = _coerce_enum_input(cname, "cname") # type: ignore[assignment]
shuffle = _coerce_enum_input(shuffle, "shuffle") # type: ignore[assignment]
cname = _coerce_enum_input(cname, "cname", "BloscCodec") # type: ignore[assignment]
shuffle = _coerce_enum_input(shuffle, "shuffle", "BloscCodec") # type: ignore[assignment]

typesize_parsed = parse_typesize(typesize)
cname_parsed = _parse_cname(cname)
Expand Down
46 changes: 28 additions & 18 deletions src/zarr/codecs/bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
import sys
import warnings
from dataclasses import dataclass, replace
from enum import Enum
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, ClassVar, Final, Literal

from zarr.abc.codec import ArrayBytesCodec
from zarr.codecs._deprecated_enum import _coerce_enum_input, _DeprecatedStrEnumMeta
from zarr.core.buffer import Buffer, NDBuffer
from zarr.core.common import JSON, parse_enum, parse_named_configuration
from zarr.core.common import JSON, parse_named_configuration
from zarr.core.dtype.common import HasEndianness
from zarr.core.dtype.npy.structured import Struct

Expand All @@ -18,16 +18,25 @@
from zarr.core.array_spec import ArraySpec


class Endian(Enum):
EndianLiteral = Literal["little", "big"]
"""Byte order of multi-byte numeric data."""

ENDIAN: Final = ("little", "big")


class Endian(metaclass=_DeprecatedStrEnumMeta):
"""
Enum for endian type used by bytes codec.
Deprecated. Pass a literal string (`"little"` or `"big"`) directly to
`BytesCodec` instead.
"""

big = "big"
little = "little"
_members: ClassVar[dict[str, str]] = {"little": "little", "big": "big"}


default_system_endian = Endian(sys.byteorder)
def _parse_endian(data: object) -> EndianLiteral:
    """
    Validate a byte-order value.

    Returns `data` unchanged when it is a string contained in `ENDIAN`;
    otherwise raises `ValueError` listing the accepted values.
    """
    is_valid = isinstance(data, str) and data in ENDIAN
    if not is_valid:
        raise ValueError(f"endian must be one of {list(ENDIAN)!r}. Got {data!r}.")
    return data  # type: ignore[return-value]


@dataclass(frozen=True)
Expand All @@ -36,10 +45,14 @@ class BytesCodec(ArrayBytesCodec):

is_fixed_size = True

endian: Endian | None
endian: EndianLiteral | None

def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None:
endian_parsed = None if endian is None else parse_enum(endian, Endian)
def __init__(self, *, endian: Endian | EndianLiteral | None = sys.byteorder) -> None:
if endian is None:
endian_parsed: EndianLiteral | None = None
else:
coerced = _coerce_enum_input(endian, "endian", "BytesCodec")
endian_parsed = _parse_endian(coerced)

object.__setattr__(self, "endian", endian_parsed)

Expand All @@ -55,7 +68,7 @@ def to_dict(self) -> dict[str, JSON]:
if self.endian is None:
return {"name": "bytes"}
else:
return {"name": "bytes", "configuration": {"endian": self.endian.value}}
return {"name": "bytes", "configuration": {"endian": self.endian}}

def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
if isinstance(array_spec.dtype, Struct):
Expand All @@ -67,7 +80,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
UserWarning,
stacklevel=2,
)
return replace(self, endian=Endian.little)
return replace(self, endian="little")
else:
if self.endian is not None:
return replace(self, endian=None)
Expand All @@ -85,8 +98,7 @@ def _decode_sync(
chunk_bytes: Buffer,
chunk_spec: ArraySpec,
) -> NDBuffer:
# TODO: remove endianness enum in favor of literal union
endian_str = self.endian.value if self.endian is not None else None
endian_str = self.endian
if isinstance(chunk_spec.dtype, HasEndianness):
dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype() # type: ignore[call-arg]
else:
Expand Down Expand Up @@ -121,9 +133,7 @@ def _encode_sync(
and self.endian is not None
and self.endian != chunk_array.byteorder
):
# type-ignore is a numpy bug
# see https://github.com/numpy/numpy/issues/26473
new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) # type: ignore[arg-type]
new_dtype = chunk_array.dtype.newbyteorder(self.endian)
chunk_array = chunk_array.astype(new_dtype)

nd_array = chunk_array.as_ndarray_like()
Expand Down
44 changes: 27 additions & 17 deletions src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@

from collections.abc import Iterable, Mapping, MutableMapping, Sequence
from dataclasses import dataclass, replace
from enum import Enum
from functools import lru_cache
from operator import itemgetter
from typing import TYPE_CHECKING, Any, NamedTuple, cast
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, NamedTuple, cast

import numpy as np
import numpy.typing as npt
Expand All @@ -24,6 +23,7 @@
RangeByteRequest,
SuffixByteRequest,
)
from zarr.codecs._deprecated_enum import _coerce_enum_input, _DeprecatedStrEnumMeta
from zarr.codecs.bytes import BytesCodec
from zarr.codecs.crc32c_ import Crc32cCodec
from zarr.core.array_spec import ArrayConfig, ArraySpec
Expand All @@ -37,7 +37,6 @@
from zarr.core.chunk_grids import ChunkGrid
from zarr.core.common import (
ShapeLike,
parse_enum,
parse_named_configuration,
parse_shapelike,
product,
Expand Down Expand Up @@ -74,17 +73,25 @@
ShardMutableMapping = MutableMapping[tuple[int, ...], Buffer | None]


class ShardingCodecIndexLocation(Enum):
IndexLocation = Literal["start", "end"]
"""Position of the shard index within the encoded shard."""

INDEX_LOCATION: Final = ("start", "end")


class ShardingCodecIndexLocation(metaclass=_DeprecatedStrEnumMeta):
"""
Enum for index location used by the sharding codec.
Deprecated. Pass a literal string (`"start"` or `"end"`) directly to
`ShardingCodec` instead.
"""

start = "start"
end = "end"
_members: ClassVar[dict[str, str]] = {"start": "start", "end": "end"}


def parse_index_location(data: object) -> ShardingCodecIndexLocation:
return parse_enum(data, ShardingCodecIndexLocation)
def _parse_index_location(data: object) -> IndexLocation:
    """
    Validate a shard index location value.

    Returns `data` unchanged when it is a string contained in
    `INDEX_LOCATION`; otherwise raises `ValueError` listing the accepted
    values.
    """
    is_valid = isinstance(data, str) and data in INDEX_LOCATION
    if not is_valid:
        raise ValueError(f"index_location must be one of {list(INDEX_LOCATION)!r}. Got {data!r}.")
    return data  # type: ignore[return-value]


@dataclass(frozen=True)
Expand Down Expand Up @@ -240,7 +247,7 @@ async def from_bytes(
shard_index_size = codec._shard_index_size(chunks_per_shard)
obj = cls()
obj.buf = buf
if codec.index_location == ShardingCodecIndexLocation.start:
if codec.index_location == "start":
shard_index_bytes = obj.buf[:shard_index_size]
else:
shard_index_bytes = obj.buf[-shard_index_size:]
Expand Down Expand Up @@ -310,20 +317,23 @@ class ShardingCodec(
chunk_shape: tuple[int, ...]
codecs: tuple[Codec, ...]
index_codecs: tuple[Codec, ...]
index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end
index_location: IndexLocation = "end"

def __init__(
self,
*,
chunk_shape: ShapeLike,
codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
index_location: ShardingCodecIndexLocation | IndexLocation = "end",
) -> None:
chunk_shape_parsed = parse_shapelike(chunk_shape)
codecs_parsed = parse_codecs(codecs)
index_codecs_parsed = parse_codecs(index_codecs)
index_location_parsed = parse_index_location(index_location)
index_location_coerced = _coerce_enum_input(
index_location, "index_location", "ShardingCodec"
)
index_location_parsed = _parse_index_location(index_location_coerced)

object.__setattr__(self, "chunk_shape", chunk_shape_parsed)
object.__setattr__(self, "codecs", codecs_parsed)
Expand All @@ -348,7 +358,7 @@ def __setstate__(self, state: dict[str, Any]) -> None:
object.__setattr__(self, "chunk_shape", parse_shapelike(config["chunk_shape"]))
object.__setattr__(self, "codecs", parse_codecs(config["codecs"]))
object.__setattr__(self, "index_codecs", parse_codecs(config["index_codecs"]))
object.__setattr__(self, "index_location", parse_index_location(config["index_location"]))
object.__setattr__(self, "index_location", _parse_index_location(config["index_location"]))

# Use instance-local lru_cache to avoid memory leaks
# object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec))
Expand All @@ -371,7 +381,7 @@ def to_dict(self) -> dict[str, JSON]:
"chunk_shape": self.chunk_shape,
"codecs": tuple(s.to_dict() for s in self.codecs),
"index_codecs": tuple(s.to_dict() for s in self.index_codecs),
"index_location": self.index_location.value,
"index_location": self.index_location,
},
}

Expand Down Expand Up @@ -659,7 +669,7 @@ async def _encode_shard_dict(
return None

index_bytes = await self._encode_shard_index(index)
if self.index_location == ShardingCodecIndexLocation.start:
if self.index_location == "start":
empty_chunks_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64
index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes)
index_bytes = await self._encode_shard_index(
Expand Down Expand Up @@ -766,7 +776,7 @@ async def _load_shard_index_maybe(
self, byte_getter: ByteGetter, chunks_per_shard: tuple[int, ...]
) -> _ShardIndex | None:
shard_index_size = self._shard_index_size(chunks_per_shard)
if self.index_location == ShardingCodecIndexLocation.start:
if self.index_location == "start":
index_bytes = await byte_getter.get(
prototype=numpy_buffer_prototype(),
byte_range=RangeByteRequest(0, shard_index_size),
Expand Down
Loading
Loading