diff --git a/changes/4073.misc.md b/changes/4073.misc.md new file mode 100644 index 0000000000..bcc6b8281e --- /dev/null +++ b/changes/4073.misc.md @@ -0,0 +1 @@ +Extend the `test_block_indexing` Hypothesis property test to cover rectilinear chunk grids and sharded regular grids, and generalize the `block_indices` strategy in `zarr.testing.strategies` to build its array-space oracle from cumulative chunk offsets (`chunk_sizes` parameter) instead of a uniform chunk size. diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 0ef1ba99bb..f2c83677cc 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -1,3 +1,4 @@ +import itertools import math import sys from collections.abc import Callable, Mapping @@ -599,20 +600,30 @@ def orthogonal_indices( @st.composite def block_indices( - draw: st.DrawFn, *, chunk_grid_shape: tuple[int, ...], chunks: tuple[int, ...] + draw: st.DrawFn, *, chunk_sizes: tuple[tuple[int, ...], ...] ) -> tuple[tuple[int | slice, ...], tuple[slice, ...]]: """ - Strategy for block-selection indexers over a *regular* chunk grid. + Strategy for block-selection indexers over a chunk grid. Block indexing is basic indexing applied to the block grid (the grid of chunks), so each axis is drawn with ``basic_indices`` over that axis's chunk - count from ``chunk_grid_shape`` (e.g. ``Array.cdata_shape``), mirroring how - ``orthogonal_indices`` reuses ``basic_indices`` per axis. Block indexing only - supports integers and step-1 slices whose start references an existing chunk, - so strided slices and slices starting at the grid edge are filtered out. The - array-space translation assumes a regular (uniform) chunk grid; an over-long - stop into a smaller last chunk is left for numpy to clamp when the oracle is - applied. + count, mirroring how ``orthogonal_indices`` reuses ``basic_indices`` per + axis. ``chunk_sizes`` gives the per-chunk data sizes of the array's *outer* + (block) grid for every axis — i.e. ``Array.write_chunk_sizes``, the grid that + ``Array.blocks`` addresses (the shard grid when sharding is used). For + example ``(3, 3, 3, 1)`` for a length-10 axis with a regular chunk size of 3, + or the explicit edges of a rectilinear axis; ``nchunks`` for an axis is + ``len(chunk_sizes[axis])``. + + The array-space translation uses the cumulative sum of those sizes, matching + ``BlockIndexer``'s use of ``dim_grid.chunk_offset``. Because the sizes are + clipped to the array extent, the final offset equals the extent and the + translation is exact for regular (uniform), rectilinear, and sharded grids + alike. + + Block indexing only supports integers and step-1 slices whose start + references an existing chunk, so strided slices and slices starting at the + grid edge are filtered out. Returns ------- @@ -640,24 +651,83 @@ def predicate(value: tuple[Any, ...]) -> bool: block_indexer: list[int | slice] = [] array_indexer: list[slice] = [] - for chunk, nchunks in zip(chunks, chunk_grid_shape, strict=True): - (dim_sel,) = draw( + for sizes in chunk_sizes: + nchunks = len(sizes) + # offsets[i] is the array-space start of chunk i; length nchunks + 1. + offsets = list(itertools.accumulate(sizes, initial=0)) + dim_strategy = ( basic_indices(min_dims=1, shape=(nchunks,), allow_ellipsis=False) # normalize bare ints / slices to a 1-tuple, skip the empty tuple .map(lambda x: (x,) if not isinstance(x, tuple) else x) .filter(bool) .filter(supported(nchunks)) ) + # basic_indices draws slices far more often than bare integers, so the + # integer (single-block) branch below would only be hit on rare draws. + # Union in an explicit integer so it is reliably exercised — keeping + # coverage deterministic under the derandomized ``ci`` Hypothesis profile. + (dim_sel,) = draw( + dim_strategy | st.integers(min_value=0, max_value=nchunks - 1).map(lambda i: (i,)) + ) block_indexer.append(dim_sel) if isinstance(dim_sel, slice): start, stop, _ = dim_sel.indices(nchunks) - array_indexer.append(slice(start * chunk, stop * chunk)) + array_indexer.append(slice(offsets[start], offsets[stop])) else: block = dim_sel % nchunks - array_indexer.append(slice(block * chunk, (block + 1) * chunk)) + array_indexer.append(slice(offsets[block], offsets[block + 1])) return tuple(block_indexer), tuple(array_indexer) +@st.composite +def block_test_arrays( + draw: st.DrawFn, +) -> tuple[Array[Any], np.ndarray[Any, Any]]: + """Draw an array for block-indexing property tests, with its source contents. + + Two arms, selected with equal probability: + + - **regular**: a regular chunk grid, optionally wrapped in sharding. + - **rectilinear**: a variable (rectilinear) chunk grid, always unsharded. + + Returns ``(zarray, nparray)``. The per-axis block sizes the oracle needs are + ``zarray.write_chunk_sizes`` — the array's *outer* (block / shard) grid, which + is exactly the grid ``Array.blocks`` addresses; the caller reads it directly. + """ + chunks: tuple[int, ...] | list[list[int]] + if draw(st.booleans()): + # regular arm, optionally sharded + nparray, chunks = draw( + np_array_and_chunks( + arrays=numpy_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1)) + ) + ) + # min_side=1 chunking guarantees shape // chunk >= 1 on every axis, which + # shard_shapes requires. + shards = draw(st.none() | shard_shapes(shape=nparray.shape, chunk_shape=chunks)) + event("block regular sharded" if shards is not None else "block regular unsharded") + rectilinear = False + else: + # rectilinear arm, always unsharded + event("block rectilinear") + shape = draw(_rectilinear_shapes) + chunks = draw(rectilinear_chunks(shape=shape)) + nparray = draw(numpy_arrays(shapes=st.just(shape), dtype=draw(dtypes()))) + shards, rectilinear = None, True + + store = draw(stores) + with zarr.config.set({"array.rectilinear_chunks": rectilinear}): + zarray = zarr.create_array( + store=store, + shape=nparray.shape, + chunks=chunks, + shards=shards, + dtype=nparray.dtype, + ) + zarray[...] = nparray + return zarray, nparray + + def key_ranges( keys: SearchStrategy[str] = node_names, max_size: int = sys.maxsize ) -> SearchStrategy[list[tuple[str, ByteRequest | None]]]: diff --git a/tests/test_properties.py b/tests/test_properties.py index 994510aca0..3f71fdf493 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -26,8 +26,8 @@ arrays, basic_indices, block_indices, + block_test_arrays, complex_rectilinear_arrays, - np_array_and_chunks, numpy_arrays, orthogonal_indices, rectilinear_arrays, @@ -261,27 +261,21 @@ def test_mask_indexing(data: st.DataObject) -> None: @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @given(data=st.data()) def test_block_indexing(data: st.DataObject) -> None: - # Block indexing addresses whole chunks on a regular grid; the array-space - # oracle in block_indices() assumes regular, unsharded chunks, so build the - # array directly from a regular chunking rather than drawing one that might - # be rectilinear or sharded. - nparray, chunks = data.draw( - np_array_and_chunks(arrays=numpy_arrays(shapes=npst.array_shapes(max_dims=4, min_side=1))) - ) - store = data.draw(stores) - zarray = zarr.create_array(store=store, shape=nparray.shape, chunks=chunks, dtype=nparray.dtype) - zarray[...] = nparray + # Block indexing addresses whole inner chunks. block_indices() builds its + # array-space oracle from cumulative chunk offsets, so it works for regular + # (uniform), rectilinear, and sharded grids alike; block_test_arrays draws + # across that matrix (rectilinear + sharded is unsupported and not drawn). + zarray, nparray = data.draw(block_test_arrays()) - block_indexer, array_indexer = data.draw( - block_indices(chunk_grid_shape=zarray.cdata_shape, chunks=chunks) - ) + block_indexer, array_indexer = data.draw(block_indices(chunk_sizes=zarray.write_chunk_sizes)) expected = nparray[array_indexer] # sync get, via both the .blocks interface and the dedicated method assert_array_equal(expected, zarray.blocks[block_indexer]) assert_array_equal(expected, zarray.get_block_selection(block_indexer)) - # sync set, via both interfaces + # sync set, via both interfaces; sharded set is broken upstream (GH2834) + assume(zarray.shards is None) new_data = data.draw(numpy_arrays(shapes=st.just(expected.shape), dtype=nparray.dtype)) nparray[array_indexer] = new_data zarray.blocks[block_indexer] = new_data