Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/4073.misc.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Extend the `test_block_indexing` Hypothesis property test to cover rectilinear chunk grids and sharded regular grids, and generalize the `block_indices` strategy in `zarr.testing.strategies` to build its array-space oracle from cumulative chunk offsets (`chunk_sizes` parameter) instead of a uniform chunk size.
96 changes: 83 additions & 13 deletions src/zarr/testing/strategies.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
import math
import sys
from collections.abc import Callable, Mapping
Expand Down Expand Up @@ -599,20 +600,30 @@ def orthogonal_indices(

@st.composite
def block_indices(
draw: st.DrawFn, *, chunk_grid_shape: tuple[int, ...], chunks: tuple[int, ...]
draw: st.DrawFn, *, chunk_sizes: tuple[tuple[int, ...], ...]
) -> tuple[tuple[int | slice, ...], tuple[slice, ...]]:
"""
Strategy for block-selection indexers over a *regular* chunk grid.
Strategy for block-selection indexers over a chunk grid.

Block indexing is basic indexing applied to the block grid (the grid of
chunks), so each axis is drawn with ``basic_indices`` over that axis's chunk
count from ``chunk_grid_shape`` (e.g. ``Array.cdata_shape``), mirroring how
``orthogonal_indices`` reuses ``basic_indices`` per axis. Block indexing only
supports integers and step-1 slices whose start references an existing chunk,
so strided slices and slices starting at the grid edge are filtered out. The
array-space translation assumes a regular (uniform) chunk grid; an over-long
stop into a smaller last chunk is left for numpy to clamp when the oracle is
applied.
count, mirroring how ``orthogonal_indices`` reuses ``basic_indices`` per
axis. ``chunk_sizes`` gives the per-chunk data sizes of the array's *outer*
(block) grid for every axis — i.e. ``Array.write_chunk_sizes``, the grid that
``Array.blocks`` addresses (the shard grid when sharding is used). For
example ``(3, 3, 3, 1)`` for a length-10 axis with a regular chunk size of 3,
or the explicit edges of a rectilinear axis; ``nchunks`` for an axis is
``len(chunk_sizes[axis])``.

The array-space translation uses the cumulative sum of those sizes, matching
``BlockIndexer``'s use of ``dim_grid.chunk_offset``. Because the sizes are
clipped to the array extent, the final offset equals the extent and the
translation is exact for regular (uniform), rectilinear, and sharded grids
alike.

Block indexing only supports integers and step-1 slices whose start
references an existing chunk, so strided slices and slices starting at the
grid edge are filtered out.

Returns
-------
Expand Down Expand Up @@ -640,24 +651,83 @@ def predicate(value: tuple[Any, ...]) -> bool:

block_indexer: list[int | slice] = []
array_indexer: list[slice] = []
for chunk, nchunks in zip(chunks, chunk_grid_shape, strict=True):
(dim_sel,) = draw(
for sizes in chunk_sizes:
nchunks = len(sizes)
# offsets[i] is the array-space start of chunk i; length nchunks + 1.
offsets = list(itertools.accumulate(sizes, initial=0))
dim_strategy = (
basic_indices(min_dims=1, shape=(nchunks,), allow_ellipsis=False)
# normalize bare ints / slices to a 1-tuple, skip the empty tuple
.map(lambda x: (x,) if not isinstance(x, tuple) else x)
.filter(bool)
.filter(supported(nchunks))
)
# basic_indices draws slices far more often than bare integers, so the
# integer (single-block) branch below would only be hit on rare draws.
# Union in an explicit integer so it is reliably exercised — keeping
# coverage deterministic under the derandomized ``ci`` Hypothesis profile.
(dim_sel,) = draw(
dim_strategy | st.integers(min_value=0, max_value=nchunks - 1).map(lambda i: (i,))
)
block_indexer.append(dim_sel)
if isinstance(dim_sel, slice):
start, stop, _ = dim_sel.indices(nchunks)
array_indexer.append(slice(start * chunk, stop * chunk))
array_indexer.append(slice(offsets[start], offsets[stop]))
else:
block = dim_sel % nchunks
array_indexer.append(slice(block * chunk, (block + 1) * chunk))
array_indexer.append(slice(offsets[block], offsets[block + 1]))
return tuple(block_indexer), tuple(array_indexer)


@st.composite
def block_test_arrays(
    draw: st.DrawFn,
) -> tuple[Array[Any], np.ndarray[Any, Any]]:
    """Build a populated zarr array, plus its numpy source, for block-indexing tests.

    A drawn boolean picks one of two arms:

    - **regular**: a regular chunk grid; a shard shape is drawn from
      ``st.none() | shard_shapes(...)``, so the array may or may not be sharded.
    - **rectilinear**: chunks drawn from ``rectilinear_chunks``; ``shards`` is
      always ``None``.

    A Hypothesis ``event`` records which arm (and, for the regular arm, whether
    sharding was drawn) so the distribution shows up in test statistics.

    Returns
    -------
    tuple
        ``(zarray, nparray)`` where ``zarray`` has already been filled with
        ``nparray``. Callers that need per-axis block sizes read them from the
        returned array (e.g. ``zarray.write_chunk_sizes``).
    """
    chunks: tuple[int, ...] | list[list[int]]
    is_regular = draw(st.booleans())
    if is_regular:
        # Regular arm. min_side=1 keeps every axis non-empty, which the
        # shard_shapes strategy relies on (shape // chunk >= 1 per axis).
        regular_shapes = npst.array_shapes(max_dims=4, min_side=1)
        nparray, chunks = draw(np_array_and_chunks(arrays=numpy_arrays(shapes=regular_shapes)))
        shard_strategy = st.none() | shard_shapes(shape=nparray.shape, chunk_shape=chunks)
        shards = draw(shard_strategy)
        label = "block regular unsharded" if shards is None else "block regular sharded"
        event(label)
    else:
        # Rectilinear arm: sharding is never combined with it.
        event("block rectilinear")
        shape = draw(_rectilinear_shapes)
        chunks = draw(rectilinear_chunks(shape=shape))
        dtype = draw(dtypes())
        nparray = draw(numpy_arrays(shapes=st.just(shape), dtype=dtype))
        shards = None

    store = draw(stores)
    # The rectilinear_chunks config flag is switched on only for the rectilinear arm.
    with zarr.config.set({"array.rectilinear_chunks": not is_regular}):
        zarray = zarr.create_array(
            store=store,
            shape=nparray.shape,
            chunks=chunks,
            shards=shards,
            dtype=nparray.dtype,
        )
        zarray[...] = nparray
    return zarray, nparray


def key_ranges(
keys: SearchStrategy[str] = node_names, max_size: int = sys.maxsize
) -> SearchStrategy[list[tuple[str, ByteRequest | None]]]:
Expand Down
24 changes: 9 additions & 15 deletions tests/test_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
arrays,
basic_indices,
block_indices,
block_test_arrays,
complex_rectilinear_arrays,
np_array_and_chunks,
numpy_arrays,
orthogonal_indices,
rectilinear_arrays,
Expand Down Expand Up @@ -261,27 +261,21 @@ def test_mask_indexing(data: st.DataObject) -> None:
@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")
@given(data=st.data())
def test_block_indexing(data: st.DataObject) -> None:
# Block indexing addresses whole chunks on a regular grid; the array-space
# oracle in block_indices() assumes regular, unsharded chunks, so build the
# array directly from a regular chunking rather than drawing one that might
# be rectilinear or sharded.
nparray, chunks = data.draw(
np_array_and_chunks(arrays=numpy_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)))
)
store = data.draw(stores)
zarray = zarr.create_array(store=store, shape=nparray.shape, chunks=chunks, dtype=nparray.dtype)
zarray[...] = nparray
# Block indexing addresses whole blocks of the array's outer chunk grid (whole shards when the array is sharded). block_indices() builds its
# array-space oracle from cumulative chunk offsets, so it works for regular
# (uniform), rectilinear, and sharded grids alike; block_test_arrays draws
# across that matrix (rectilinear + sharded is unsupported and not drawn).
zarray, nparray = data.draw(block_test_arrays())

block_indexer, array_indexer = data.draw(
block_indices(chunk_grid_shape=zarray.cdata_shape, chunks=chunks)
)
block_indexer, array_indexer = data.draw(block_indices(chunk_sizes=zarray.write_chunk_sizes))
expected = nparray[array_indexer]

# sync get, via both the .blocks interface and the dedicated method
assert_array_equal(expected, zarray.blocks[block_indexer])
assert_array_equal(expected, zarray.get_block_selection(block_indexer))

# sync set, via both interfaces
# sync set, via both interfaces; sharded set is broken upstream (GH2834)
assume(zarray.shards is None)
new_data = data.draw(numpy_arrays(shapes=st.just(expected.shape), dtype=nparray.dtype))
nparray[array_indexer] = new_data
zarray.blocks[block_indexer] = new_data
Expand Down
Loading