From a07764e0cdd6c6f2c0d5d3ebaaaa8714be245408 Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 9 Oct 2025 11:39:38 +0100 Subject: [PATCH 01/12] Add `codec_pipeline.fill_missing_chunks` config --- src/zarr/core/codec_pipeline.py | 12 +++++++++--- src/zarr/errors.py | 3 +++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index fd557ac43e..b0c47b9ac1 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -17,7 +17,7 @@ from zarr.core.common import concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar -from zarr.errors import ZarrUserWarning +from zarr.errors import MissingChunkError, ZarrUserWarning from zarr.registry import register_pipeline if TYPE_CHECKING: @@ -265,7 +265,10 @@ async def read_batch( if chunk_array is not None: out[out_selection] = chunk_array else: - out[out_selection] = fill_value_or_default(chunk_spec) + if config.get("codec_pipeline.fill_missing_chunks", True): + out[out_selection] = fill_value_or_default(chunk_spec) + else: + raise MissingChunkError() else: chunk_bytes_batch = await concurrent_map( [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], @@ -289,7 +292,10 @@ async def read_batch( tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: - out[out_selection] = fill_value_or_default(chunk_spec) + if config.get("codec_pipeline.fill_missing_chunks", True): + out[out_selection] = fill_value_or_default(chunk_spec) + else: + raise MissingChunkError() def _merge_chunk_array( self, diff --git a/src/zarr/errors.py b/src/zarr/errors.py index bcd6a08deb..304f3080e7 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -144,3 +144,6 @@ class BoundsCheckError(IndexError): ... class ArrayIndexError(IndexError): ... + + +class MissingChunkError(IndexError): ... From 7438a03d349a80d80804bd70564035c4e2cf7bce Mon Sep 17 00:00:00 2001 From: williamsnell Date: Thu, 5 Mar 2026 17:39:59 +1300 Subject: [PATCH 02/12] Set default for `fill_missing_chunks` in config.py. Add test replicating example in zarr-python #486. --- src/zarr/core/config.py | 1 + tests/test_config.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index f8f8ea4f5f..2b10f8d0cf 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -104,6 +104,7 @@ def enable_gpu(self) -> ConfigSet: "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", "batch_size": 1, + "fill_missing_chunks": True, }, "codecs": { "blosc": "zarr.codecs.blosc.BloscCodec", diff --git a/tests/test_config.py b/tests/test_config.py index c3102e8efe..b1fa43ffc5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -23,7 +23,7 @@ from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config from zarr.core.indexing import SelectorTuple -from zarr.errors import ZarrUserWarning +from zarr.errors import MissingChunkError, ZarrUserWarning from zarr.registry import ( fully_qualified_name, get_buffer_class, @@ -61,6 +61,7 @@ def test_config_defaults_set() -> None: "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", "batch_size": 1, + "fill_missing_chunks": True, }, "codecs": { "blosc": "zarr.codecs.blosc.BloscCodec", @@ -319,6 +320,26 @@ class NewCodec2(BytesCodec): get_codec_class("new_codec") +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_config_fill_missing_chunks(store: Store) -> None: + arr = zarr.create_array(store=store, shape=(3, 3), chunks=(2, 2), dtype="int32", fill_value=42) + + # default behavior: missing chunks are filled with the fill value + result = zarr.open_array(store)[:] + assert np.array_equal(result, np.full((3, 3), 42, dtype="int32")) + + # with fill_missing_chunks=False, reading missing chunks raises an error + with config.set({"codec_pipeline.fill_missing_chunks": False}): + with pytest.raises(MissingChunkError): + zarr.open_array(store)[:] + + # after writing data, all chunks exist and no error is raised + arr[:] = np.arange(9, dtype="int32").reshape(3, 3) + with config.set({"codec_pipeline.fill_missing_chunks": False}): + result = zarr.open_array(store)[:] + assert np.array_equal(result, np.arange(9, dtype="int32").reshape(3, 3)) + + @pytest.mark.parametrize( "key", [ From 38e5acf9cce3d437bedebab5bd4eef67196c9ac5 Mon Sep 17 00:00:00 2001 From: williamsnell Date: Thu, 5 Mar 2026 17:54:09 +1300 Subject: [PATCH 03/12] Add fill_missing_chunks to examples of config options. --- docs/user-guide/config.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index 21fe9b5def..84879775f4 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -30,6 +30,7 @@ Configuration options include the following: - Default Zarr format `default_zarr_version` - Default array order in memory `array.order` - Whether empty chunks are written to storage `array.write_empty_chunks` +- Whether missing chunks are filled with the fill value on read `codec_pipeline.fill_missing_chunks` (default `True`). Set to `False` to raise a `MissingChunkError` instead. - Async and threading options, e.g. `async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. From 1d51b37e3da99a720c0b2039d84f8608e42a3539 Mon Sep 17 00:00:00 2001 From: williamsnell Date: Thu, 5 Mar 2026 18:04:15 +1300 Subject: [PATCH 04/12] Add to /changes --- changes/3748.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/3748.feature.md diff --git a/changes/3748.feature.md b/changes/3748.feature.md new file mode 100644 index 0000000000..37acbfaea0 --- /dev/null +++ b/changes/3748.feature.md @@ -0,0 +1 @@ +Added `codec_pipeline.fill_missing_chunks` configuration option. When set to `False`, reading missing chunks raises a `MissingChunkError` instead of filling them with the fill value. From ad3e2ed689213b6a1a5e5f6d60283ca81d66c5ec Mon Sep 17 00:00:00 2001 From: williamsnell Date: Thu, 5 Mar 2026 18:15:55 +1300 Subject: [PATCH 05/12] Parameterize tests to make sure we hit both branches of `if self.supports_partial_decode`. --- tests/test_config.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index b1fa43ffc5..b27ec27932 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -321,12 +321,27 @@ class NewCodec2(BytesCodec): @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) -def test_config_fill_missing_chunks(store: Store) -> None: - arr = zarr.create_array(store=store, shape=(3, 3), chunks=(2, 2), dtype="int32", fill_value=42) +@pytest.mark.parametrize( + "kwargs", + [ + {"shards": (4, 4)}, + {"compressors": None}, + ], + ids=["partial_decode", "full_decode"], +) +def test_config_fill_missing_chunks(store: Store, kwargs: dict) -> None: + arr = zarr.create_array( + store=store, + shape=(4, 4), + chunks=(2, 2), + dtype="int32", + fill_value=42, + **kwargs, + ) # default behavior: missing chunks are filled with the fill value result = zarr.open_array(store)[:] - assert np.array_equal(result, np.full((3, 3), 42, dtype="int32")) + assert np.array_equal(result, np.full((4, 4), 42, dtype="int32")) # with fill_missing_chunks=False, reading missing chunks raises an error with config.set({"codec_pipeline.fill_missing_chunks": False}): @@ -334,10 +349,10 @@ def test_config_fill_missing_chunks(store: Store) -> None: zarr.open_array(store)[:] # after writing data, all chunks exist and no error is raised - arr[:] = np.arange(9, dtype="int32").reshape(3, 3) + arr[:] = np.arange(16, dtype="int32").reshape(4, 4) with config.set({"codec_pipeline.fill_missing_chunks": False}): result = zarr.open_array(store)[:] - assert np.array_equal(result, np.arange(9, dtype="int32").reshape(3, 3)) + assert np.array_equal(result, np.arange(16, dtype="int32").reshape(4, 4)) @pytest.mark.parametrize( From 2c9b31bf141e7236b10a51836d43bb3a0b849e10 Mon Sep 17 00:00:00 2001 From: williamsnell Date: Thu, 5 Mar 2026 20:21:04 +1300 Subject: [PATCH 06/12] Fix lint errors: remove parentheses, type kwargs. --- src/zarr/core/codec_pipeline.py | 4 ++-- tests/test_config.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index b0c47b9ac1..68eb648a18 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -268,7 +268,7 @@ async def read_batch( if config.get("codec_pipeline.fill_missing_chunks", True): out[out_selection] = fill_value_or_default(chunk_spec) else: - raise MissingChunkError() + raise MissingChunkError else: chunk_bytes_batch = await concurrent_map( [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], @@ -295,7 +295,7 @@ async def read_batch( if config.get("codec_pipeline.fill_missing_chunks", True): out[out_selection] = fill_value_or_default(chunk_spec) else: - raise MissingChunkError() + raise MissingChunkError def _merge_chunk_array( self, diff --git a/tests/test_config.py b/tests/test_config.py index b27ec27932..b07532b222 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -329,7 +329,7 @@ class NewCodec2(BytesCodec): ], ids=["partial_decode", "full_decode"], ) -def test_config_fill_missing_chunks(store: Store, kwargs: dict) -> None: +def test_config_fill_missing_chunks(store: Store, kwargs: dict[str, Any]) -> None: arr = zarr.create_array( store=store, shape=(4, 4), From 2846ed9086cc5dda62b6110e4123b49d5b3281fa Mon Sep 17 00:00:00 2001 From: williamsnell Date: Fri, 6 Mar 2026 09:19:52 +1300 Subject: [PATCH 07/12] Move config from codec_pipeline -> array. Update docs, tests. --- changes/3748.feature.md | 2 +- docs/user-guide/arrays.md | 7 ++++++- docs/user-guide/config.md | 2 +- src/zarr/codecs/sharding.py | 2 +- src/zarr/core/array_spec.py | 21 ++++++++++++++++++--- src/zarr/core/codec_pipeline.py | 4 ++-- src/zarr/core/config.py | 2 +- tests/test_config.py | 6 +++--- 8 files changed, 33 insertions(+), 13 deletions(-) diff --git a/changes/3748.feature.md b/changes/3748.feature.md index 37acbfaea0..8a5d6087b9 100644 --- a/changes/3748.feature.md +++ b/changes/3748.feature.md @@ -1 +1 @@ -Added `codec_pipeline.fill_missing_chunks` configuration option. When set to `False`, reading missing chunks raises a `MissingChunkError` instead of filling them with the fill value. +Added `array.fill_missing_chunks` configuration option. When set to `False`, reading missing chunks raises a `MissingChunkError` instead of filling them with the array's fill value. diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index cd6a93cac9..983aa983f7 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -158,13 +158,18 @@ print(f"Shape after second append: {z.shape}") Zarr arrays are parametrized with a configuration that determines certain aspects of array behavior. -We currently support two configuration options for arrays: `write_empty_chunks` and `order`. +We currently support three configuration options for arrays: `write_empty_chunks`, `fill_missing_chunks`, and `order`. | field | type | default | description | | - | - | - | - | | `write_empty_chunks` | `bool` | `False` | Controls whether empty chunks are written to storage. See [Empty chunks](performance.md#empty-chunks). +| `fill_missing_chunks` | `bool` | `True` | Controls whether missing chunks are filled with the array's fill value on read. If `False`, reading missing chunks raises a `MissingChunkError`. | `order` | `Literal["C", "F"]` | `"C"` | The memory layout of arrays returned when reading data from the store. +!!! note + `write_empty_chunks=False` skips writing chunks that are entirely the array's fill value. + If `fill_missing_chunks=False`, attempting to read these missing chunks will raise an error. + You can specify the configuration when you create an array with the `config` keyword argument. `config` can be passed as either a `dict` or an `ArrayConfig` object. diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index 84879775f4..ee2c9f91c2 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -30,7 +30,7 @@ Configuration options include the following: - Default Zarr format `default_zarr_version` - Default array order in memory `array.order` - Whether empty chunks are written to storage `array.write_empty_chunks` -- Whether missing chunks are filled with the fill value on read `codec_pipeline.fill_missing_chunks` (default `True`). Set to `False` to raise a `MissingChunkError` instead. +- Whether missing chunks are filled with the array's fill value on read `array.fill_missing_chunks` (default `True`). Set to `False` to raise a `MissingChunkError` instead. - Async and threading options, e.g. `async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 85162c2f74..c45618a099 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -711,7 +711,7 @@ def _get_index_chunk_spec(self, chunks_per_shard: tuple[int, ...]) -> ArraySpec: dtype=UInt64(endianness="little"), fill_value=MAX_UINT_64, config=ArrayConfig( - order="C", write_empty_chunks=False + order="C", write_empty_chunks=False, fill_missing_chunks=True ), # Note: this is hard-coded for simplicity -- it is not surfaced into user code, prototype=default_buffer_prototype(), ) diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index 421dfbf145..e8539674cf 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -28,6 +28,7 @@ class ArrayConfigParams(TypedDict): order: NotRequired[MemoryOrder] write_empty_chunks: NotRequired[bool] + fill_missing_chunks: NotRequired[bool] @dataclass(frozen=True) @@ -41,17 +42,25 @@ class ArrayConfig: The memory layout of the arrays returned when reading data from the store. write_empty_chunks : bool If True, empty chunks will be written to the store. + fill_missing_chunks : bool + If True, missing chunks will be filled with the array's fill value on read. + If False, reading missing chunks will raise a ``MissingChunkError``. """ order: MemoryOrder write_empty_chunks: bool + fill_missing_chunks: bool - def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None: + def __init__( + self, order: MemoryOrder, write_empty_chunks: bool, fill_missing_chunks: bool + ) -> None: order_parsed = parse_order(order) write_empty_chunks_parsed = parse_bool(write_empty_chunks) + fill_missing_chunks_parsed = parse_bool(fill_missing_chunks) object.__setattr__(self, "order", order_parsed) object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) + object.__setattr__(self, "fill_missing_chunks", fill_missing_chunks_parsed) @classmethod def from_dict(cls, data: ArrayConfigParams) -> Self: @@ -62,7 +71,9 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: """ kwargs_out: ArrayConfigParams = {} for f in fields(ArrayConfig): - field_name = cast("Literal['order', 'write_empty_chunks']", f.name) + field_name = cast( + "Literal['order', 'write_empty_chunks', 'fill_missing_chunks']", f.name + ) if field_name not in data: kwargs_out[field_name] = zarr_config.get(f"array.{field_name}") else: @@ -73,7 +84,11 @@ def to_dict(self) -> ArrayConfigParams: """ Serialize an instance of this class to a dict. """ - return {"order": self.order, "write_empty_chunks": self.write_empty_chunks} + return { + "order": self.order, + "write_empty_chunks": self.write_empty_chunks, + "fill_missing_chunks": self.fill_missing_chunks, + } ArrayConfigLike = ArrayConfig | ArrayConfigParams diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 68eb648a18..9c3ff37c7f 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -265,7 +265,7 @@ async def read_batch( if chunk_array is not None: out[out_selection] = chunk_array else: - if config.get("codec_pipeline.fill_missing_chunks", True): + if chunk_spec.config.fill_missing_chunks: out[out_selection] = fill_value_or_default(chunk_spec) else: raise MissingChunkError @@ -292,7 +292,7 @@ async def read_batch( tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: - if config.get("codec_pipeline.fill_missing_chunks", True): + if chunk_spec.config.fill_missing_chunks: out[out_selection] = fill_value_or_default(chunk_spec) else: raise MissingChunkError diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 2b10f8d0cf..6acdd25999 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -96,6 +96,7 @@ def enable_gpu(self) -> ConfigSet: "array": { "order": "C", "write_empty_chunks": False, + "fill_missing_chunks": True, "target_shard_size_bytes": None, }, "async": {"concurrency": 10, "timeout": None}, @@ -104,7 +105,6 @@ def enable_gpu(self) -> ConfigSet: "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", "batch_size": 1, - "fill_missing_chunks": True, }, "codecs": { "blosc": "zarr.codecs.blosc.BloscCodec", diff --git a/tests/test_config.py b/tests/test_config.py index b07532b222..b5a0c77408 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -53,6 +53,7 @@ def test_config_defaults_set() -> None: "array": { "order": "C", "write_empty_chunks": False, + "fill_missing_chunks": True, "target_shard_size_bytes": None, }, "async": {"concurrency": 10, "timeout": None}, @@ -61,7 +62,6 @@ def test_config_defaults_set() -> None: "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", "batch_size": 1, - "fill_missing_chunks": True, }, "codecs": { "blosc": "zarr.codecs.blosc.BloscCodec", @@ -344,13 +344,13 @@ def test_config_fill_missing_chunks(store: Store, kwargs: dict[str, Any]) -> Non assert np.array_equal(result, np.full((4, 4), 42, dtype="int32")) # with fill_missing_chunks=False, reading missing chunks raises an error - with config.set({"codec_pipeline.fill_missing_chunks": False}): + with config.set({"array.fill_missing_chunks": False}): with pytest.raises(MissingChunkError): zarr.open_array(store)[:] # after writing data, all chunks exist and no error is raised arr[:] = np.arange(16, dtype="int32").reshape(4, 4) - with config.set({"codec_pipeline.fill_missing_chunks": False}): + with config.set({"array.fill_missing_chunks": False}): result = zarr.open_array(store)[:] assert np.array_equal(result, np.arange(16, dtype="int32").reshape(4, 4)) From de7afd80e1cd44fc89d794f81a0b84bf6146bd4f Mon Sep 17 00:00:00 2001 From: williamsnell Date: Sat, 7 Mar 2026 09:43:34 +1300 Subject: [PATCH 08/12] Delegate missing-shard detection away from _get_chunk_spec. Codify expected behaviour of fill_missing_chunks for both sharding and write_empty_chunks via tests. Use elif to make control flow slightly clearer. --- src/zarr/codecs/sharding.py | 7 +++- src/zarr/core/codec_pipeline.py | 14 ++++---- tests/test_config.py | 59 +++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 9 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index c45618a099..2e9d9370d8 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -717,11 +717,16 @@ def _get_index_chunk_spec(self, chunks_per_shard: tuple[int, ...]) -> ArraySpec: ) def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec: + # Because the shard index and inner chunks should be stored + # together, we detect missing data via the shard index. + # The inner chunks defined here are thus allowed to return + # None, even if fill_missing_chunks=False at the array level. + config = replace(shard_spec.config, fill_missing_chunks=True) return ArraySpec( shape=self.chunk_shape, dtype=shard_spec.dtype, fill_value=shard_spec.fill_value, - config=shard_spec.config, + config=config, prototype=shard_spec.prototype, ) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 9c3ff37c7f..8b75e620ad 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -264,11 +264,10 @@ async def read_batch( ): if chunk_array is not None: out[out_selection] = chunk_array + elif chunk_spec.config.fill_missing_chunks: + out[out_selection] = fill_value_or_default(chunk_spec) else: - if chunk_spec.config.fill_missing_chunks: - out[out_selection] = fill_value_or_default(chunk_spec) - else: - raise MissingChunkError + raise MissingChunkError else: chunk_bytes_batch = await concurrent_map( [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], @@ -291,11 +290,10 @@ async def read_batch( if drop_axes != (): tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp + elif chunk_spec.config.fill_missing_chunks: + out[out_selection] = fill_value_or_default(chunk_spec) else: - if chunk_spec.config.fill_missing_chunks: - out[out_selection] = fill_value_or_default(chunk_spec) - else: - raise MissingChunkError + raise MissingChunkError def _merge_chunk_array( self, diff --git a/tests/test_config.py b/tests/test_config.py index b5a0c77408..7c21d55ded 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -355,6 +355,65 @@ def test_config_fill_missing_chunks(store: Store, kwargs: dict[str, Any]) -> Non assert np.array_equal(result, np.arange(16, dtype="int32").reshape(4, 4)) +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_config_fill_missing_chunks_sharded_inner(store: Store) -> None: + """Missing inner chunks within a shard are always filled with the array's + fill value, even when fill_missing_chunks=False.""" + arr = zarr.create_array( + store=store, + shape=(8, 4), + chunks=(2, 2), + shards=(4, 4), + dtype="int32", + fill_value=42, + ) + + # write only one inner chunk in the first shard, leaving the second shard empty + arr[0:2, 0:2] = np.ones((2, 2), dtype="int32") + + with config.set({"array.fill_missing_chunks": False}): + a = zarr.open_array(store) + + # first shard exists: missing inner chunks are filled, no error + result = a[:4] + expected = np.full((4, 4), 42, dtype="int32") + expected[0:2, 0:2] = 1 + assert np.array_equal(result, expected) + + # second shard is entirely missing: raises an error + with pytest.raises(MissingChunkError): + a[4:] + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_config_fill_missing_chunks_write_empty_chunks(store: Store) -> None: + """write_empty_chunks=False drops chunks equal to fill_value, which then + appear missing to fill_missing_chunks=False.""" + arr = zarr.create_array( + store=store, + shape=(4,), + chunks=(2,), + dtype="int32", + fill_value=0, + config={"write_empty_chunks": False, "fill_missing_chunks": False}, + ) + + # write non-fill-value data: chunks are stored + arr[:] = [1, 2, 3, 4] + assert np.array_equal(arr[:], [1, 2, 3, 4]) + + # overwrite with fill_value: chunks are dropped by write_empty_chunks=False + arr[:] = 0 + with pytest.raises(MissingChunkError): + arr[:] + + # with write_empty_chunks=True, chunks are kept and no error is raised + with config.set({"array.write_empty_chunks": True}): + arr = zarr.open_array(store) + arr[:] = 0 + assert np.array_equal(arr[:], [0, 0, 0, 0]) + + @pytest.mark.parametrize( "key", [ From d4605212f8985a756f1856555fa3d4855be92e36 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 11 Mar 2026 12:51:32 +0100 Subject: [PATCH 09/12] Define ChunkNotFoundError; expose chunk key and chunk index in ChunkNotFoundError --- src/zarr/abc/codec.py | 8 ++++++-- src/zarr/codecs/sharding.py | 4 ++++ src/zarr/core/array.py | 2 ++ src/zarr/core/codec_pipeline.py | 32 +++++++++++++++++++++++--------- src/zarr/errors.py | 5 ++++- tests/test_config.py | 8 ++++---- 6 files changed, 43 insertions(+), 16 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 3ec5ec522b..a45f582832 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -430,7 +430,9 @@ async def encode( @abstractmethod async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + batch_info: Iterable[ + tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]] + ], out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: @@ -439,12 +441,14 @@ async def read( Parameters ---------- - batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]] + batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]]] Ordered set of information about the chunks. The first slice selection determines which parts of the chunk will be fetched. The second slice selection determines where in the output array the chunk data will be written. The ByteGetter is used to fetch the necessary bytes. The chunk spec contains information about the construction of an array from the bytes. + The string is the chunk key. + The tuple of ints is the chunk's grid coordinates. If the Store returns ``None`` for a chunk, then the chunk was not written and the implementation must set the values of that chunk (or diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 2e9d9370d8..7ba2858254 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -439,6 +439,8 @@ async def _decode_single( chunk_selection, out_selection, is_complete_shard, + "/".join(str(c) for c in chunk_coords), + chunk_coords, ) for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer ], @@ -511,6 +513,8 @@ async def _decode_partial_single( chunk_selection, out_selection, is_complete_shard, + "/".join(str(c) for c in chunk_coords), + chunk_coords, ) for chunk_coords, chunk_selection, out_selection, is_complete_shard in indexer ], diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 564d0e915a..b3261f05f7 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -5608,6 +5608,8 @@ async def _get_selection( chunk_selection, out_selection, is_complete_chunk, + metadata.encode_chunk_key(chunk_coords), + chunk_coords, ) for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer ], diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 8b75e620ad..16381182d2 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -17,7 +17,7 @@ from zarr.core.common import concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar -from zarr.errors import MissingChunkError, ZarrUserWarning +from zarr.errors import ChunkNotFoundError, ZarrUserWarning from zarr.registry import register_pipeline if TYPE_CHECKING: @@ -248,7 +248,9 @@ async def encode_partial_batch( async def read_batch( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + batch_info: Iterable[ + tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]] + ], out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: @@ -259,7 +261,7 @@ async def read_batch( for byte_getter, chunk_spec, chunk_selection, *_ in batch_info ] ) - for chunk_array, (_, chunk_spec, _, out_selection, _) in zip( + for chunk_array, (_, chunk_spec, _, out_selection, _, chunk_key, chunk_coords) in zip( chunk_array_batch, batch_info, strict=False ): if chunk_array is not None: @@ -267,7 +269,9 @@ async def read_batch( elif chunk_spec.config.fill_missing_chunks: out[out_selection] = fill_value_or_default(chunk_spec) else: - raise MissingChunkError + raise ChunkNotFoundError( + f"chunk '{chunk_key}' at grid position {chunk_coords} not found in store." + ) else: chunk_bytes_batch = await concurrent_map( [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch_info], @@ -282,9 +286,15 @@ async def read_batch( ) ], ) - for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( - chunk_array_batch, batch_info, strict=False - ): + for chunk_array, ( + _, + chunk_spec, + chunk_selection, + out_selection, + _, + chunk_key, + chunk_coords, + ) in zip(chunk_array_batch, batch_info, strict=False): if chunk_array is not None: tmp = chunk_array[chunk_selection] if drop_axes != (): @@ -293,7 +303,9 @@ async def read_batch( elif chunk_spec.config.fill_missing_chunks: out[out_selection] = fill_value_or_default(chunk_spec) else: - raise MissingChunkError + raise ChunkNotFoundError( + f"chunk '{chunk_key}' at grid position {chunk_coords} not found in store" + ) def _merge_chunk_array( self, @@ -470,7 +482,9 @@ async def encode( async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + batch_info: Iterable[ + tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool, str, tuple[int, ...]] + ], out: NDBuffer, drop_axes: tuple[int, ...] = (), ) -> None: diff --git a/src/zarr/errors.py b/src/zarr/errors.py index 304f3080e7..8adfce13ea 100644 --- a/src/zarr/errors.py +++ b/src/zarr/errors.py @@ -146,4 +146,7 @@ class BoundsCheckError(IndexError): ... class ArrayIndexError(IndexError): ... -class MissingChunkError(IndexError): ... +class ChunkNotFoundError(BaseZarrError): + """ + Raised when a chunk that was expected to exist in storage was not retrieved successfully. + """ diff --git a/tests/test_config.py b/tests/test_config.py index 7c21d55ded..e507a5fc5b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -23,7 +23,7 @@ from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.config import BadConfigError, config from zarr.core.indexing import SelectorTuple -from zarr.errors import MissingChunkError, ZarrUserWarning +from zarr.errors import ChunkNotFoundError, ZarrUserWarning from zarr.registry import ( fully_qualified_name, get_buffer_class, @@ -345,7 +345,7 @@ def test_config_fill_missing_chunks(store: Store, kwargs: dict[str, Any]) -> Non # with fill_missing_chunks=False, reading missing chunks raises an error with config.set({"array.fill_missing_chunks": False}): - with pytest.raises(MissingChunkError): + with pytest.raises(ChunkNotFoundError): zarr.open_array(store)[:] # after writing data, all chunks exist and no error is raised @@ -381,7 +381,7 @@ def test_config_fill_missing_chunks_sharded_inner(store: Store) -> None: assert np.array_equal(result, expected) # second shard is entirely missing: raises an error - with pytest.raises(MissingChunkError): + with pytest.raises(ChunkNotFoundError): a[4:] @@ -404,7 +404,7 @@ def test_config_fill_missing_chunks_write_empty_chunks(store: Store) -> None: # overwrite with fill_value: chunks are dropped by write_empty_chunks=False arr[:] = 0 - with pytest.raises(MissingChunkError): + with pytest.raises(ChunkNotFoundError): arr[:] # with write_empty_chunks=True, chunks are kept and no error is raised From 9c9a096edbe289f8c6d5fd75591faf317f39542e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 11 Mar 2026 13:03:02 +0100 Subject: [PATCH 10/12] update docs --- docs/user-guide/arrays.md | 11 +++++++++-- docs/user-guide/config.md | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index 983aa983f7..0aa10be6d9 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -163,12 +163,19 @@ We currently support three configuration options for arrays: `write_empty_chunks | field | type | default | description | | - | - | - | - | | `write_empty_chunks` | `bool` | `False` | Controls whether empty chunks are written to storage. See [Empty chunks](performance.md#empty-chunks). -| `fill_missing_chunks` | `bool` | `True` | Controls whether missing chunks are filled with the array's fill value on read. If `False`, reading missing chunks raises a `MissingChunkError`. +| `fill_missing_chunks` | `bool` | `True` | Controls whether missing chunks are filled with the array's fill value on read. If `False`, reading missing chunks raises a [`ChunkNotFoundError`][]. | `order` | `Literal["C", "F"]` | `"C"` | The memory layout of arrays returned when reading data from the store. +!!! info + The Zarr V3 spec states that readers should interpret an uninitialized chunk as containing the + array's `fill_value`. By default, Zarr-Python follows this behavior: a missing chunk is treated + as uninitialized and filled with the array's `fill_value`. However, if you know that all chunks + have been written (i.e., are initialized), you may want to treat a missing chunk as an error. Set + `fill_missing_chunks=False` to raise a [`ChunkNotFoundError`][] instead. + !!! note `write_empty_chunks=False` skips writing chunks that are entirely the array's fill value. - If `fill_missing_chunks=False`, attempting to read these missing chunks will raise an error. + If `fill_missing_chunks=False`, attempting to read these missing chunks will raise a [`ChunkNotFoundError`][]. You can specify the configuration when you create an array with the `config` keyword argument. `config` can be passed as either a `dict` or an `ArrayConfig` object. diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index ee2c9f91c2..6d3ab8e804 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -30,7 +30,7 @@ Configuration options include the following: - Default Zarr format `default_zarr_version` - Default array order in memory `array.order` - Whether empty chunks are written to storage `array.write_empty_chunks` -- Whether missing chunks are filled with the array's fill value on read `array.fill_missing_chunks` (default `True`). Set to `False` to raise a `MissingChunkError` instead. +- Whether missing chunks are filled with the array's fill value on read `array.fill_missing_chunks` (default `True`). Set to `False` to raise a [`ChunkNotFoundError`][] instead. - Async and threading options, e.g. `async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. From 70b7c7b46c482f4de89d4238841a91bccfd80376 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 11 Mar 2026 13:11:44 +0100 Subject: [PATCH 11/12] fix links --- docs/user-guide/arrays.md | 6 +++--- docs/user-guide/config.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index 432f76ff69..727d8b05e8 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -163,7 +163,7 @@ We currently support three configuration options for arrays: `write_empty_chunks | field | type | default | description | | - | - | - | - | | `write_empty_chunks` | `bool` | `False` | Controls whether empty chunks are written to storage. See [Empty chunks](performance.md#empty-chunks). -| `fill_missing_chunks` | `bool` | `True` | Controls whether missing chunks are filled with the array's fill value on read. If `False`, reading missing chunks raises a [`ChunkNotFoundError`][]. +| `fill_missing_chunks` | `bool` | `True` | Controls whether missing chunks are filled with the array's fill value on read. If `False`, reading missing chunks raises a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError]. | `order` | `Literal["C", "F"]` | `"C"` | The memory layout of arrays returned when reading data from the store. !!! info @@ -171,11 +171,11 @@ We currently support three configuration options for arrays: `write_empty_chunks array's `fill_value`. By default, Zarr-Python follows this behavior: a missing chunk is treated as uninitialized and filled with the array's `fill_value`. However, if you know that all chunks have been written (i.e., are initialized), you may want to treat a missing chunk as an error. Set - `fill_missing_chunks=False` to raise a [`ChunkNotFoundError`][] instead. + `fill_missing_chunks=False` to raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError] instead. !!! note `write_empty_chunks=False` skips writing chunks that are entirely the array's fill value. - If `fill_missing_chunks=False`, attempting to read these missing chunks will raise a [`ChunkNotFoundError`][]. + If `fill_missing_chunks=False`, attempting to read these missing chunks will raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError]. You can specify the configuration when you create an array with the `config` keyword argument. `config` can be passed as either a `dict` or an `ArrayConfig` object. diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md index 6d3ab8e804..613a51abf9 100644 --- a/docs/user-guide/config.md +++ b/docs/user-guide/config.md @@ -30,7 +30,7 @@ Configuration options include the following: - Default Zarr format `default_zarr_version` - Default array order in memory `array.order` - Whether empty chunks are written to storage `array.write_empty_chunks` -- Whether missing chunks are filled with the array's fill value on read `array.fill_missing_chunks` (default `True`). Set to `False` to raise a [`ChunkNotFoundError`][] instead. +- Whether missing chunks are filled with the array's fill value on read `array.fill_missing_chunks` (default `True`). Set to `False` to raise a [`ChunkNotFoundError`][zarr.errors.ChunkNotFoundError] instead. - Async and threading options, e.g. `async.concurrency` and `threading.max_workers` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with `zarr.config.enable_gpu()`. See GPU support for more. From 8f2cdc4620d6d1b756b750b9f6b6d7c51f4876fd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 11 Mar 2026 13:14:59 +0100 Subject: [PATCH 12/12] cleanup --- changes/3748.feature.md | 2 +- src/zarr/core/array.py | 4 ++-- src/zarr/core/array_spec.py | 2 +- src/zarr/core/codec_pipeline.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/changes/3748.feature.md b/changes/3748.feature.md index 8a5d6087b9..00553e73fb 100644 --- a/changes/3748.feature.md +++ b/changes/3748.feature.md @@ -1 +1 @@ -Added `array.fill_missing_chunks` configuration option. When set to `False`, reading missing chunks raises a `MissingChunkError` instead of filling them with the array's fill value. +Added `array.fill_missing_chunks` configuration option. When set to `False`, reading missing chunks raises a `ChunkNotFoundError` instead of filling them with the array's fill value. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index b3261f05f7..11c9ece15a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -5603,12 +5603,12 @@ async def _get_selection( await codec_pipeline.read( [ ( - store_path / metadata.encode_chunk_key(chunk_coords), + store_path / (chunk_key := metadata.encode_chunk_key(chunk_coords)), metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), chunk_selection, out_selection, is_complete_chunk, - metadata.encode_chunk_key(chunk_coords), + chunk_key, chunk_coords, ) for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index e8539674cf..b173534d93 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -44,7 +44,7 @@ class ArrayConfig: If True, empty chunks will be written to the store. fill_missing_chunks : bool If True, missing chunks will be filled with the array's fill value on read. - If False, reading missing chunks will raise a ``MissingChunkError``. + If False, reading missing chunks will raise a ``ChunkNotFoundError``. """ order: MemoryOrder diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 16381182d2..a6f34eba28 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -270,7 +270,7 @@ async def read_batch( out[out_selection] = fill_value_or_default(chunk_spec) else: raise ChunkNotFoundError( - f"chunk '{chunk_key}' at grid position {chunk_coords} not found in store." + f"chunk '{chunk_key}' at grid position {chunk_coords} not found in store" ) else: chunk_bytes_batch = await concurrent_map(