Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3748.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added `array.fill_missing_chunks` configuration option. When set to `False`, reading missing chunks raises a `ChunkNotFoundError` instead of filling them with the array's fill value.
14 changes: 13 additions & 1 deletion docs/user-guide/arrays.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,25 @@ print(f"Shape after second append: {z.shape}")

Zarr arrays are parametrized with a configuration that determines certain aspects of array behavior.

We currently support two configuration options for arrays: `write_empty_chunks` and `order`.
We currently support three configuration options for arrays: `write_empty_chunks`, `fill_missing_chunks`, and `order`.

| field | type | default | description |
| - | - | - | - |
| `write_empty_chunks` | `bool` | `False` | Controls whether empty chunks are written to storage. See [Empty chunks](performance.md#empty-chunks).
| `fill_missing_chunks` | `bool` | `True` | Controls whether missing chunks are filled with the array's fill value on read. If `False`, reading missing chunks raises a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError].
| `order` | `Literal["C", "F"]` | `"C"` | The memory layout of arrays returned when reading data from the store.

!!! info
The Zarr V3 spec states that readers should interpret an uninitialized chunk as containing the
array's `fill_value`. By default, Zarr-Python follows this behavior: a missing chunk is treated
as uninitialized and filled with the array's `fill_value`. However, if you know that all chunks
have been written (i.e., are initialized), you may want to treat a missing chunk as an error. Set
`fill_missing_chunks=False` to raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError] instead.

!!! note
`write_empty_chunks=False` skips writing chunks that are entirely the array's fill value.
If `fill_missing_chunks=False`, attempting to read these missing chunks will raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError].

You can specify the configuration when you create an array with the `config` keyword argument.
`config` can be passed as either a `dict` or an `ArrayConfig` object.

Expand Down
1 change: 1 addition & 0 deletions docs/user-guide/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Configuration options include the following:
- Default Zarr format `default_zarr_version`
- Default array order in memory `array.order`
- Whether empty chunks are written to storage `array.write_empty_chunks`
- Whether missing chunks are filled with the array's fill value on read `array.fill_missing_chunks` (default `True`). Set to `False` to raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError] instead.
- Async and threading options, e.g. `async.concurrency` and `threading.max_workers`
- Selections of implementations of codecs, codec pipelines and buffers
- Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more.
Expand Down
8 changes: 6 additions & 2 deletions src/zarr/abc/codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,9 @@ async def encode(
@abstractmethod
async def read(
self,
batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
batch_info: Iterable[
tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]]
],
out: NDBuffer,
drop_axes: tuple[int, ...] = (),
) -> None:
Expand All @@ -439,12 +441,14 @@ async def read(

Parameters
----------
batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]]
batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]]]
Ordered set of information about the chunks.
The first slice selection determines which parts of the chunk will be fetched.
The second slice selection determines where in the output array the chunk data will be written.
The ByteGetter is used to fetch the necessary bytes.
The chunk spec contains information about the construction of an array from the bytes.
The string is the chunk key.
The tuple of ints is the chunk's grid coordinates.

If the Store returns ``None`` for a chunk, then the chunk was not
written and the implementation must set the values of that chunk (or
Expand Down
13 changes: 11 additions & 2 deletions src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,8 @@ async def _decode_single(
chunk_selection,
out_selection,
is_complete_shard,
"/".join(str(c) for c in chunk_coords),
chunk_coords,
)
for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
],
Expand Down Expand Up @@ -511,6 +513,8 @@ async def _decode_partial_single(
chunk_selection,
out_selection,
is_complete_shard,
"/".join(str(c) for c in chunk_coords),
chunk_coords,
)
for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer
],
Expand Down Expand Up @@ -711,17 +715,22 @@ def _get_index_chunk_spec(self, chunks_per_shard: tuple[int, ...]) -> ArraySpec:
dtype=UInt64(endianness="little"),
fill_value=MAX_UINT_64,
config=ArrayConfig(
order="C", write_empty_chunks=False
order="C", write_empty_chunks=False, fill_missing_chunks=True
), # Note: this is hard-coded for simplicity -- it is not surfaced into user code,
prototype=default_buffer_prototype(),
)

def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec:
    """Build the ``ArraySpec`` used to decode a single inner chunk of a shard.

    The spec copies dtype, fill value, and buffer prototype from the
    shard-level spec, substitutes the inner ``chunk_shape``, and forces
    ``fill_missing_chunks=True`` in the config regardless of the array-level
    setting.
    """
    # Because the shard index and inner chunks should be stored
    # together, we detect missing data via the shard index.
    # The inner chunks defined here are thus allowed to return
    # None, even if fill_missing_chunks=False at the array level.
    config = replace(shard_spec.config, fill_missing_chunks=True)
    return ArraySpec(
        shape=self.chunk_shape,
        dtype=shard_spec.dtype,
        fill_value=shard_spec.fill_value,
        config=config,
        prototype=shard_spec.prototype,
    )

Expand Down
4 changes: 3 additions & 1 deletion src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -5616,11 +5616,13 @@ async def _get_selection(
await codec_pipeline.read(
[
(
store_path / metadata.encode_chunk_key(chunk_coords),
store_path / (chunk_key := metadata.encode_chunk_key(chunk_coords)),
metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype),
chunk_selection,
out_selection,
is_complete_chunk,
chunk_key,
chunk_coords,
)
for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer
],
Expand Down
21 changes: 18 additions & 3 deletions src/zarr/core/array_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class ArrayConfigParams(TypedDict):

order: NotRequired[MemoryOrder]
write_empty_chunks: NotRequired[bool]
fill_missing_chunks: NotRequired[bool]


@dataclass(frozen=True)
Expand All @@ -41,17 +42,25 @@ class ArrayConfig:
The memory layout of the arrays returned when reading data from the store.
write_empty_chunks : bool
If True, empty chunks will be written to the store.
fill_missing_chunks : bool
If True, missing chunks will be filled with the array's fill value on read.
If False, reading missing chunks will raise a ``ChunkNotFoundError``.
"""

order: MemoryOrder
write_empty_chunks: bool
fill_missing_chunks: bool

def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None:
def __init__(
self, order: MemoryOrder, write_empty_chunks: bool, fill_missing_chunks: bool
) -> None:
order_parsed = parse_order(order)
write_empty_chunks_parsed = parse_bool(write_empty_chunks)
fill_missing_chunks_parsed = parse_bool(fill_missing_chunks)

object.__setattr__(self, "order", order_parsed)
object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed)
object.__setattr__(self, "fill_missing_chunks", fill_missing_chunks_parsed)

@classmethod
def from_dict(cls, data: ArrayConfigParams) -> Self:
Expand All @@ -62,7 +71,9 @@ def from_dict(cls, data: ArrayConfigParams) -> Self:
"""
kwargs_out: ArrayConfigParams = {}
for f in fields(ArrayConfig):
field_name = cast("Literal['order', 'write_empty_chunks']", f.name)
field_name = cast(
"Literal['order', 'write_empty_chunks', 'fill_missing_chunks']", f.name
)
if field_name not in data:
kwargs_out[field_name] = zarr_config.get(f"array.{field_name}")
else:
Expand All @@ -73,7 +84,11 @@ def to_dict(self) -> ArrayConfigParams:
"""
Serialize an instance of this class to a dict.
"""
return {"order": self.order, "write_empty_chunks": self.write_empty_chunks}
return {
"order": self.order,
"write_empty_chunks": self.write_empty_chunks,
"fill_missing_chunks": self.fill_missing_chunks,
}


ArrayConfigLike = ArrayConfig | ArrayConfigParams
Expand Down
36 changes: 27 additions & 9 deletions src/zarr/core/codec_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from zarr.core.common import concurrent_map
from zarr.core.config import config
from zarr.core.indexing import SelectorTuple, is_scalar
from zarr.errors import ZarrUserWarning
from zarr.errors import ChunkNotFoundError, ZarrUserWarning
from zarr.registry import register_pipeline

if TYPE_CHECKING:
Expand Down Expand Up @@ -248,7 +248,9 @@ async def encode_partial_batch(

async def read_batch(
self,
batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
batch_info: Iterable[
tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]]
],
out: NDBuffer,
drop_axes: tuple[int, ...] = (),
) -> None:
Expand All @@ -259,13 +261,17 @@ async def read_batch(
for byte_getter, chunk_spec, chunk_selection, *_ in batch_info
]
)
for chunk_array, (_, chunk_spec, _, out_selection, _) in zip(
for chunk_array, (_, chunk_spec, _, out_selection, _, chunk_key, chunk_coords) in zip(
chunk_array_batch, batch_info, strict=False
):
if chunk_array is not None:
out[out_selection] = chunk_array
else:
elif chunk_spec.config.fill_missing_chunks:
out[out_selection] = fill_value_or_default(chunk_spec)
else:
raise ChunkNotFoundError(
f"chunk '{chunk_key}' at grid position {chunk_coords} not found in store"
)
else:
chunk_bytes_batch = await concurrent_map(
[(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info],
Expand All @@ -280,16 +286,26 @@ async def read_batch(
)
],
)
for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
chunk_array_batch, batch_info, strict=False
):
for chunk_array, (
_,
chunk_spec,
chunk_selection,
out_selection,
_,
chunk_key,
chunk_coords,
) in zip(chunk_array_batch, batch_info, strict=False):
if chunk_array is not None:
tmp = chunk_array[chunk_selection]
if drop_axes != ():
tmp = tmp.squeeze(axis=drop_axes)
out[out_selection] = tmp
else:
elif chunk_spec.config.fill_missing_chunks:
out[out_selection] = fill_value_or_default(chunk_spec)
else:
raise ChunkNotFoundError(
f"chunk '{chunk_key}' at grid position {chunk_coords} not found in store"
)

def _merge_chunk_array(
self,
Expand Down Expand Up @@ -466,7 +482,9 @@ async def encode(

async def read(
self,
batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
batch_info: Iterable[
tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]]
],
out: NDBuffer,
drop_axes: tuple[int, ...] = (),
) -> None:
Expand Down
1 change: 1 addition & 0 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def enable_gpu(self) -> ConfigSet:
"array": {
"order": "C",
"write_empty_chunks": False,
"fill_missing_chunks": True,
"target_shard_size_bytes": None,
},
"async": {"concurrency": 10, "timeout": None},
Expand Down
6 changes: 6 additions & 0 deletions src/zarr/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,9 @@ class BoundsCheckError(IndexError): ...


class ArrayIndexError(IndexError):
    """An ``IndexError`` raised for invalid array indexing operations.

    NOTE(review): only the bare definition is visible in this chunk; the exact
    raise sites are elsewhere — confirm semantics against callers.
    """

    ...


class ChunkNotFoundError(BaseZarrError):
    """
    Raised when a chunk that was expected to exist in storage was not retrieved successfully.

    When the ``array.fill_missing_chunks`` configuration option is ``False``,
    reading a chunk that is absent from the store raises this error instead of
    filling the output with the array's fill value.
    """
97 changes: 96 additions & 1 deletion tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from zarr.core.codec_pipeline import BatchedCodecPipeline
from zarr.core.config import BadConfigError, config
from zarr.core.indexing import SelectorTuple
from zarr.errors import ZarrUserWarning
from zarr.errors import ChunkNotFoundError, ZarrUserWarning
from zarr.registry import (
fully_qualified_name,
get_buffer_class,
Expand Down Expand Up @@ -53,6 +53,7 @@ def test_config_defaults_set() -> None:
"array": {
"order": "C",
"write_empty_chunks": False,
"fill_missing_chunks": True,
"target_shard_size_bytes": None,
},
"async": {"concurrency": 10, "timeout": None},
Expand Down Expand Up @@ -319,6 +320,100 @@ class NewCodec2(BytesCodec):
get_codec_class("new_codec")


@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"shards": (4, 4)},
        {"compressors": None},
    ],
    ids=["partial_decode", "full_decode"],
)
def test_config_fill_missing_chunks(store: Store, kwargs: dict[str, Any]) -> None:
    """Reading an array with no stored chunks fills with the fill value by
    default, but raises ChunkNotFoundError when array.fill_missing_chunks is
    False; once all chunks exist, reads succeed either way."""
    created = zarr.create_array(
        store=store,
        shape=(4, 4),
        chunks=(2, 2),
        dtype="int32",
        fill_value=42,
        **kwargs,
    )

    # No chunks written yet: the default config substitutes the fill value.
    expected_fill = np.full((4, 4), 42, dtype="int32")
    assert np.array_equal(zarr.open_array(store)[:], expected_fill)

    # Disabling fill_missing_chunks turns the same read into an error.
    with config.set({"array.fill_missing_chunks": False}):
        with pytest.raises(ChunkNotFoundError):
            zarr.open_array(store)[:]

    # After every chunk has been written, the strict setting is satisfied.
    written = np.arange(16, dtype="int32").reshape(4, 4)
    created[:] = written
    with config.set({"array.fill_missing_chunks": False}):
        assert np.array_equal(zarr.open_array(store)[:], written)


@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
def test_config_fill_missing_chunks_sharded_inner(store: Store) -> None:
    """Missing inner chunks within a shard are always filled with the array's
    fill value, even when fill_missing_chunks=False."""
    created = zarr.create_array(
        store=store,
        shape=(8, 4),
        chunks=(2, 2),
        shards=(4, 4),
        dtype="int32",
        fill_value=42,
    )

    # Populate a single inner chunk of the first shard; the second shard
    # receives no data at all.
    created[0:2, 0:2] = np.ones((2, 2), dtype="int32")

    with config.set({"array.fill_missing_chunks": False}):
        reopened = zarr.open_array(store)

        # The first shard exists, so its absent inner chunks come back as
        # fill values rather than errors.
        expected = np.full((4, 4), 42, dtype="int32")
        expected[0:2, 0:2] = 1
        assert np.array_equal(reopened[:4], expected)

        # The second shard was never written: strict mode rejects the read.
        with pytest.raises(ChunkNotFoundError):
            reopened[4:]


@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
def test_config_fill_missing_chunks_write_empty_chunks(store: Store) -> None:
    """write_empty_chunks=False drops chunks equal to fill_value, which then
    appear missing to fill_missing_chunks=False."""
    strict = zarr.create_array(
        store=store,
        shape=(4,),
        chunks=(2,),
        dtype="int32",
        fill_value=0,
        config={"write_empty_chunks": False, "fill_missing_chunks": False},
    )

    # Non-fill data is persisted, so the strict read succeeds.
    strict[:] = [1, 2, 3, 4]
    assert np.array_equal(strict[:], [1, 2, 3, 4])

    # Overwriting with the fill value deletes the chunks under
    # write_empty_chunks=False, so the strict read now fails.
    strict[:] = 0
    with pytest.raises(ChunkNotFoundError):
        strict[:]

    # Forcing empty chunks to be written keeps them present in the store.
    with config.set({"array.write_empty_chunks": True}):
        reopened = zarr.open_array(store)
        reopened[:] = 0
        assert np.array_equal(reopened[:], [0, 0, 0, 0])


@pytest.mark.parametrize(
"key",
[
Expand Down
Loading