diff --git a/changes/3748.feature.md b/changes/3748.feature.md new file mode 100644 index 0000000000..00553e73fb --- /dev/null +++ b/changes/3748.feature.md @@ -0,0 +1 @@ +Added `array.fill_missing_chunks` configuration option. When set to `False`, reading missing chunks raises a `ChunkNotFoundError` instead of filling them with the array's fill value. diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index a44c096b73..727d8b05e8 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -158,13 +158,25 @@ print(f"Shape after second append: {z.shape}") Zarr arrays are parametrized with a configuration that determines certain aspects of array behavior. -We currently support two configuration options for arrays: `write_empty_chunks` and `order`. +We currently support three configuration options for arrays: `write_empty_chunks`, `fill_missing_chunks`, and `order`. | field | type | default | description | | - | - | - | - | | `write_empty_chunks` | `bool` | `False` | Controls whether empty chunks are written to storage. See [Empty chunks](performance.md#empty-chunks). +| `fill_missing_chunks` | `bool` | `True` | Controls whether missing chunks are filled with the array's fill value on read. If `False`, reading missing chunks raises a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError]. | `order` | `Literal["C", "F"]` | `"C"` | The memory layout of arrays returned when reading data from the store. +!!! info + The Zarr V3 spec states that readers should interpret an uninitialized chunk as containing the + array's `fill_value`. By default, Zarr-Python follows this behavior: a missing chunk is treated + as uninitialized and filled with the array's `fill_value`. However, if you know that all chunks + have been written (i.e., are initialized), you may want to treat a missing chunk as an error. Set + `fill_missing_chunks=False` to raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError] instead. + +!!! note + `write_empty_chunks=False` skips writing chunks that are entirely the array's fill value. + If `fill_missing_chunks=False`, attempting to read these missing chunks will raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError]. + You can specify the configuration when you create an array with the `config` keyword argument. `config` can be passed as either a `dict` or an `ArrayConfig` object. diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index 21fe9b5def..613a51abf9 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -30,6 +30,7 @@ Configuration options include the following: - Default Zarr format `default_zarr_version` - Default array order in memory `array.order` - Whether empty chunks are written to storage `array.write_empty_chunks` +- Whether missing chunks are filled with the array's fill value on read `array.fill_missing_chunks` (default `True`). Set to `False` to raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError] instead. - Async and threading options, e.g. `async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 3ec5ec522b..a45f582832 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -430,7 +430,9 @@ async def encode( @abstractmethod async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + batch_info: Iterable[ + tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]] + ], out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: @@ -439,12 +441,14 @@ async def read( Parameters ---------- - batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]] + batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]]] Ordered set of information about the chunks. The first slice selection determines which parts of the chunk will be fetched. The second slice selection determines where in the output array the chunk data will be written. The ByteGetter is used to fetch the necessary bytes. The chunk spec contains information about the construction of an array from the bytes. + The string is the chunk key. + The tuple of ints is the chunk's grid coordinates. If the Store returns ``None`` for a chunk, then the chunk was not written and the implementation must set the values of that chunk (or diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 85162c2f74..7ba2858254 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -439,6 +439,8 @@ async def _decode_single( chunk_selection, out_selection, is_complete_shard, + "/".join(str(c) for c in chunk_coords), + chunk_coords, ) for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer ], @@ -511,6 +513,8 @@ async def _decode_partial_single( chunk_selection, out_selection, is_complete_shard, + "/".join(str(c) for c in chunk_coords), + chunk_coords, ) for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer ], @@ -711,17 +715,22 @@ def _get_index_chunk_spec(self, chunks_per_shard: tuple[int, ...]) -> ArraySpec: dtype=UInt64(endianness="little"), fill_value=MAX_UINT_64, config=ArrayConfig( - order="C", write_empty_chunks=False + order="C", write_empty_chunks=False, fill_missing_chunks=True ), # Note: this is hard-coded for simplicity -- it is not surfaced into user code, prototype=default_buffer_prototype(), ) def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec: + # Because the shard index and inner chunks should be stored + # together, we detect missing data via the shard index. + # The inner chunks defined here are thus allowed to return + # None, even if fill_missing_chunks=False at the array level. + config = replace(shard_spec.config, fill_missing_chunks=True) return ArraySpec( shape=self.chunk_shape, dtype=shard_spec.dtype, fill_value=shard_spec.fill_value, - config=shard_spec.config, + config=config, prototype=shard_spec.prototype, ) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 486216fa32..6ea8607c3f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -5616,11 +5616,13 @@ async def _get_selection( await codec_pipeline.read( [ ( - store_path / metadata.encode_chunk_key(chunk_coords), + store_path / (chunk_key := metadata.encode_chunk_key(chunk_coords)), metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), chunk_selection, out_selection, is_complete_chunk, + chunk_key, + chunk_coords, ) for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer ], diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 421dfbf145..b173534d93 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -28,6 +28,7 @@ class ArrayConfigParams(TypedDict): order: NotRequired[MemoryOrder] write_empty_chunks: NotRequired[bool] + fill_missing_chunks: NotRequired[bool] @dataclass(frozen=True) @@ -41,17 +42,25 @@ class ArrayConfig: The memory layout of the arrays returned when reading data from the store. write_empty_chunks : bool If True, empty chunks will be written to the store. + fill_missing_chunks : bool + If True, missing chunks will be filled with the array's fill value on read. + If False, reading missing chunks will raise a ``ChunkNotFoundError``. """ order: MemoryOrder write_empty_chunks: bool + fill_missing_chunks: bool - def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None: + def __init__( + self, order: MemoryOrder, write_empty_chunks: bool, fill_missing_chunks: bool + ) -> None: order_parsed = parse_order(order) write_empty_chunks_parsed = parse_bool(write_empty_chunks) + fill_missing_chunks_parsed = parse_bool(fill_missing_chunks) object.__setattr__(self, "order", order_parsed) object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) + object.__setattr__(self, "fill_missing_chunks", fill_missing_chunks_parsed) @classmethod def from_dict(cls, data: ArrayConfigParams) -> Self: @@ -62,7 +71,9 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: """ kwargs_out: ArrayConfigParams = {} for f in fields(ArrayConfig): - field_name = cast("Literal['order', 'write_empty_chunks']", f.name) + field_name = cast( + "Literal['order', 'write_empty_chunks', 'fill_missing_chunks']", f.name + ) if field_name not in data: kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") else: @@ -73,7 +84,11 @@ def to_dict(self) -> ArrayConfigParams: """ Serialize an instance of this class to a dict. """ - return {"order": self.order, "write_empty_chunks": self.write_empty_chunks} + return { + "order": self.order, + "write_empty_chunks": self.write_empty_chunks, + "fill_missing_chunks": self.fill_missing_chunks, + } ArrayConfigLike = ArrayConfig | ArrayConfigParams diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index fd557ac43e..a6f34eba28 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -17,7 +17,7 @@ from zarr.core.common import concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar -from zarr.errors import ZarrUserWarning +from zarr.errors import ChunkNotFoundError, ZarrUserWarning from zarr.registry import register_pipeline if TYPE_CHECKING: @@ -248,7 +248,9 @@ async def encode_partial_batch( async def read_batch( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + batch_info: Iterable[ + tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]] + ], out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: @@ -259,13 +261,17 @@ async def read_batch( for byte_getter, chunk_spec, chunk_selection, *_ in batch_info ] ) - for chunk_array, (_, chunk_spec, _, out_selection, _) in zip( + for chunk_array, (_, chunk_spec, _, out_selection, _, chunk_key, chunk_coords) in zip( chunk_array_batch, batch_info, strict=False ): if chunk_array is not None: out[out_selection] = chunk_array - else: + elif chunk_spec.config.fill_missing_chunks: out[out_selection] = fill_value_or_default(chunk_spec) + else: + raise ChunkNotFoundError( + f"chunk '{chunk_key}' at grid position {chunk_coords} not found in store" + ) else: chunk_bytes_batch = await concurrent_map( [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], @@ -280,16 +286,26 @@ async def read_batch( ) ], ) - for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( - chunk_array_batch, batch_info, strict=False - ): + for chunk_array, ( + _, + chunk_spec, + chunk_selection, + out_selection, + _, + chunk_key, + chunk_coords, + ) in zip(chunk_array_batch, batch_info, strict=False): if chunk_array is not None: tmp = chunk_array[chunk_selection] if drop_axes != (): tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp - else: + elif chunk_spec.config.fill_missing_chunks: out[out_selection] = fill_value_or_default(chunk_spec) + else: + raise ChunkNotFoundError( + f"chunk '{chunk_key}' at grid position {chunk_coords} not found in store" + ) def _merge_chunk_array( self, @@ -466,7 +482,9 @@ async def encode( async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + batch_info: Iterable[ + tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]] + ], out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index f8f8ea4f5f..6acdd25999 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -96,6 +96,7 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, + "fill_missing_chunks": True, "target_shard_size_bytes": None, }, "async": {"concurrency": 10, "timeout": None}, diff --git a/src/zarr/errors.py b/src/zarr/errors.py index bcd6a08deb..8adfce13ea 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -144,3 +144,9 @@ class BoundsCheckError(IndexError): ... class ArrayIndexError(IndexError): ... + + +class ChunkNotFoundError(BaseZarrError): + """ + Raised when a chunk that was expected to exist in storage was not retrieved successfully. + """ diff --git a/tests/test_config.py b/tests/test_config.py index c3102e8efe..e507a5fc5b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -23,7 +23,7 @@ from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config from zarr.core.indexing import SelectorTuple -from zarr.errors import ZarrUserWarning +from zarr.errors import ChunkNotFoundError, ZarrUserWarning from zarr.registry import ( fully_qualified_name, get_buffer_class, @@ -53,6 +53,7 @@ def test_config_defaults_set() -> None: "array": { "order": "C", "write_empty_chunks": False, + "fill_missing_chunks": True, "target_shard_size_bytes": None, }, "async": {"concurrency": 10, "timeout": None}, @@ -319,6 +320,100 @@ class NewCodec2(BytesCodec): get_codec_class("new_codec") +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +@pytest.mark.parametrize( + "kwargs", + [ + {"shards": (4, 4)}, + {"compressors": None}, + ], + ids=["partial_decode", "full_decode"], +) +def test_config_fill_missing_chunks(store: Store, kwargs: dict[str, Any]) -> None: + arr = zarr.create_array( + store=store, + shape=(4, 4), + chunks=(2, 2), + dtype="int32", + fill_value=42, + **kwargs, + ) + + # default behavior: missing chunks are filled with the fill value + result = zarr.open_array(store)[:] + assert np.array_equal(result, np.full((4, 4), 42, dtype="int32")) + + # with fill_missing_chunks=False, reading missing chunks raises an error + with config.set({"array.fill_missing_chunks": False}): + with pytest.raises(ChunkNotFoundError): + zarr.open_array(store)[:] + + # after writing data, all chunks exist and no error is raised + arr[:] = np.arange(16, dtype="int32").reshape(4, 4) + with config.set({"array.fill_missing_chunks": False}): + result = zarr.open_array(store)[:] + assert np.array_equal(result, np.arange(16, dtype="int32").reshape(4, 4)) + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_config_fill_missing_chunks_sharded_inner(store: Store) -> None: + """Missing inner chunks within a shard are always filled with the array's + fill value, even when fill_missing_chunks=False.""" + arr = zarr.create_array( + store=store, + shape=(8, 4), + chunks=(2, 2), + shards=(4, 4), + dtype="int32", + fill_value=42, + ) + + # write only one inner chunk in the first shard, leaving the second shard empty + arr[0:2, 0:2] = np.ones((2, 2), dtype="int32") + + with config.set({"array.fill_missing_chunks": False}): + a = zarr.open_array(store) + + # first shard exists: missing inner chunks are filled, no error + result = a[:4] + expected = np.full((4, 4), 42, dtype="int32") + expected[0:2, 0:2] = 1 + assert np.array_equal(result, expected) + + # second shard is entirely missing: raises an error + with pytest.raises(ChunkNotFoundError): + a[4:] + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_config_fill_missing_chunks_write_empty_chunks(store: Store) -> None: + """write_empty_chunks=False drops chunks equal to fill_value, which then + appear missing to fill_missing_chunks=False.""" + arr = zarr.create_array( + store=store, + shape=(4,), + chunks=(2,), + dtype="int32", + fill_value=0, + config={"write_empty_chunks": False, "fill_missing_chunks": False}, + ) + + # write non-fill-value data: chunks are stored + arr[:] = [1, 2, 3, 4] + assert np.array_equal(arr[:], [1, 2, 3, 4]) + + # overwrite with fill_value: chunks are dropped by write_empty_chunks=False + arr[:] = 0 + with pytest.raises(ChunkNotFoundError): + arr[:] + + # with write_empty_chunks=True, chunks are kept and no error is raised + with config.set({"array.write_empty_chunks": True}): + arr = zarr.open_array(store) + arr[:] = 0 + assert np.array_equal(arr[:], [0, 0, 0, 0]) + + @pytest.mark.parametrize( "key", [