diff --git a/Cargo.toml b/Cargo.toml index 10f33fa..6c4cd9e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.27.1", features = ["abi3-py311"] } -zarrs = { version = "0.23.0", features = ["async", "zlib", "pcodec", "bz2"] } +zarrs = { version = "0.23.6", features = ["async", "zlib", "pcodec", "bz2"] } rayon_iter_concurrent_limit = "0.2.0" rayon = "1.10.0" # fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path diff --git a/README.md b/README.md index f06add7..380435f 100644 --- a/README.md +++ b/README.md @@ -89,38 +89,7 @@ Chunk concurrency is typically favored because: ## Supported Indexing Methods -The following methods will trigger use with the old zarr-python pipeline: - -1. Any `oindex` or `vindex` integer `np.ndarray` indexing with dimensionality >=3 i.e., - - ```python - arr[np.array([...]), :, np.array([...])] - arr[np.array([...]), np.array([...]), np.array([...])] - arr[np.array([...]), np.array([...]), np.array([...])] = ... - arr.oindex[np.array([...]), np.array([...]), np.array([...])] = ... - ``` - -2. Any `vindex` or `oindex` discontinuous integer `np.ndarray` indexing for writes in 2D - - ```python - arr[np.array([0, 5]), :] = ... - arr.oindex[np.array([0, 5]), :] = ... - ``` - -3. `vindex` writes in 2D where both indexers are integer `np.ndarray` indices i.e., - - ```python - arr[np.array([...]), np.array([...])] = ... - ``` - -4. Ellipsis indexing. We have tested some, but others fail even with `zarr-python`'s default codec pipeline. Thus for now we advise proceeding with caution here. - - ```python - arr[0:10, ..., 0:5] - ``` - - -Furthermore, using anything except contiguous (i.e., slices or consecutive integer) `np.ndarray` for numeric data will fall back to the default `zarr-python` implementation. 
+The only supported methods are `slice` and `Ellipsis`-based methods - everything else will fall back to the `zarr-python` implementation (if `codec_pipeline.strict` is not enabled). Please file an issue if you believe we have more holes in our coverage than we are aware of or you wish to contribute! For example, we have an [issue in zarrs for integer-array indexing](https://github.com/LDeakin/zarrs/issues/52) that would unblock a lot of the use of the rust pipeline for that use-case (very useful for mini-batch training perhaps!). diff --git a/python/zarrs/__init__.py b/python/zarrs/__init__.py index cf5b8bd..decded2 100644 --- a/python/zarrs/__init__.py +++ b/python/zarrs/__init__.py @@ -1,6 +1,6 @@ from ._internal import __version__ from .pipeline import ZarrsCodecPipeline as _ZarrsCodecPipeline -from .utils import CollapsedDimensionError, DiscontiguousArrayError +from .utils import UnsupportedIndexTypeError # Need to do this redirection so people can access the pipeline as `zarrs.ZarrsCodecPipeline` instead of `zarrs.pipeline.ZarrsCodecPipeline` @@ -10,7 +10,6 @@ class ZarrsCodecPipeline(_ZarrsCodecPipeline): __all__ = [ "ZarrsCodecPipeline", - "DiscontiguousArrayError", - "CollapsedDimensionError", + "UnsupportedIndexTypeError", "__version__", ] diff --git a/python/zarrs/pipeline.py b/python/zarrs/pipeline.py index a3e4959..87d0f8d 100644 --- a/python/zarrs/pipeline.py +++ b/python/zarrs/pipeline.py @@ -26,9 +26,7 @@ from ._internal import CodecPipelineImpl from .utils import ( - CollapsedDimensionError, - DiscontiguousArrayError, - FillValueNoneError, + UnsupportedIndexTypeError, make_chunk_info_for_rust_with_indices, ) @@ -184,10 +182,8 @@ async def read( ) except ( UnsupportedMetadataError, - DiscontiguousArrayError, - CollapsedDimensionError, + UnsupportedIndexTypeError, UnsupportedDataTypeError, - FillValueNoneError, ): if self.python_impl is None: raise @@ -219,10 +215,8 @@ async def write( ) except ( UnsupportedMetadataError, - DiscontiguousArrayError, - 
CollapsedDimensionError, + UnsupportedIndexTypeError, UnsupportedDataTypeError, - FillValueNoneError, ): if self.python_impl is None: raise diff --git a/python/zarrs/utils.py b/python/zarrs/utils.py index 8a314a7..0b50861 100644 --- a/python/zarrs/utils.py +++ b/python/zarrs/utils.py @@ -6,14 +6,10 @@ from functools import reduce from typing import TYPE_CHECKING, Any -import numpy as np -from zarr.core.indexing import is_integer - from zarrs._internal import ChunkItem if TYPE_CHECKING: from collections.abc import Iterable - from types import EllipsisType from zarr.abc.store import ByteGetter, ByteSetter from zarr.core.array_spec import ArraySpec @@ -26,121 +22,29 @@ def get_max_threads() -> int: return (os.cpu_count() or 1) + 4 -class DiscontiguousArrayError(Exception): - pass - - -class CollapsedDimensionError(Exception): - pass - - -class FillValueNoneError(Exception): +class UnsupportedIndexTypeError(Exception): pass -# This is a (mostly) copy of the function from zarr.core.indexing that fixes: -# DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated -# TODO: Upstream this fix -def make_slice_selection(selection: tuple[np.ndarray | float]) -> list[slice]: - ls: list[slice] = [] - for dim_selection in selection: - if is_integer(dim_selection): - ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) - elif isinstance(dim_selection, np.ndarray): - dim_selection = dim_selection.ravel() - if len(dim_selection) == 1: - ls.append( - slice(int(dim_selection.item()), int(dim_selection.item()) + 1, 1) - ) - else: - diff = np.diff(dim_selection) - if (diff != 1).any() and (diff != 0).any(): - raise DiscontiguousArrayError(diff) - ls.append(slice(dim_selection[0], dim_selection[-1] + 1, 1)) - else: - ls.append(dim_selection) - return ls - - def selector_tuple_to_slice_selection(selector_tuple: SelectorTuple) -> list[slice]: if isinstance(selector_tuple, slice): return [selector_tuple] - if all(isinstance(s, slice) for s in 
selector_tuple): - return list(selector_tuple) - return make_slice_selection(selector_tuple) - - -def resulting_shape_from_index( - array_shape: tuple[int, ...], - index_tuple: tuple[int | slice | EllipsisType | np.ndarray], - drop_axes: tuple[int, ...], - *, - pad: bool, -) -> tuple[int, ...]: - result_shape = [] - advanced_index_shapes = [ - idx.shape for idx in index_tuple if isinstance(idx, np.ndarray) - ] - basic_shape_index = 0 - - # Broadcast all advanced indices, if any - if advanced_index_shapes: - result_shape += np.broadcast_shapes(*advanced_index_shapes) - # Consume dimensions from array_shape - basic_shape_index += len(advanced_index_shapes) - - # Process each remaining index in index_tuple - for idx in index_tuple: - if isinstance(idx, int): - # Integer index reduces dimension, so skip this dimension in array_shape - basic_shape_index += 1 - elif isinstance(idx, slice): - if idx.step is not None and idx.step > 1: - raise DiscontiguousArrayError( - "Step size greater than 1 is not supported" - ) - # Slice keeps dimension, adjust size accordingly - start, stop, _ = idx.indices(array_shape[basic_shape_index]) - result_shape.append(stop - start) - basic_shape_index += 1 - elif idx is Ellipsis: - # Calculate number of dimensions that Ellipsis should fill - num_to_fill = len(array_shape) - len(index_tuple) + 1 - result_shape += array_shape[ - basic_shape_index : basic_shape_index + num_to_fill - ] - basic_shape_index += num_to_fill - elif not isinstance(idx, np.ndarray): - raise ValueError(f"Invalid index type: {type(idx)}") - - # Step 4: Append remaining dimensions from array_shape if fewer indices were used - if basic_shape_index < len(array_shape) and pad: - result_shape += array_shape[basic_shape_index:] - - return tuple(size for idx, size in enumerate(result_shape) if idx not in drop_axes) + if all( + isinstance(s, slice) and (s.step is None or s.step == 1) or isinstance(s, int) + for s in selector_tuple + ): + return list( + s if isinstance(s, slice) 
else slice(s, s + 1) for s in selector_tuple + ) + raise UnsupportedIndexTypeError( + f"Invalid index type detected among indexes: {selector_tuple}" + ) def prod_op(x: Iterable[int]) -> int: return reduce(operator.mul, x, 1) -def get_shape_for_selector( - selector_tuple: SelectorTuple, - shape: tuple[int, ...], - *, - pad: bool, - drop_axes: tuple[int, ...] = (), -) -> tuple[int, ...]: - if isinstance(selector_tuple, slice | np.ndarray): - return resulting_shape_from_index( - shape, - (selector_tuple,), - drop_axes, - pad=pad, - ) - return resulting_shape_from_index(shape, selector_tuple, drop_axes, pad=pad) - - def get_implicit_fill_value(dtype: ZDType, fill_value: Any) -> Any: if fill_value is None: fill_value = dtype.default_scalar() @@ -160,7 +64,7 @@ def make_chunk_info_for_rust_with_indices( drop_axes: tuple[int, ...], shape: tuple[int, ...], ) -> RustChunkInfo: - shape = shape if shape else (1,) # constant array + is_shape_constant = shape == () chunk_info_with_indices: list[ChunkItem] = [] write_empty_chunks: bool = True for ( @@ -171,28 +75,38 @@ def make_chunk_info_for_rust_with_indices( _, ) in batch_info: write_empty_chunks = chunk_spec.config.write_empty_chunks - out_selection_as_slices = selector_tuple_to_slice_selection(out_selection) chunk_selection_as_slices = selector_tuple_to_slice_selection(chunk_selection) - shape_chunk_selection_slices = get_shape_for_selector( - tuple(chunk_selection_as_slices), - chunk_spec.shape, - pad=True, - drop_axes=drop_axes, + out_selection_as_slices_iter = iter( + selector_tuple_to_slice_selection(out_selection) ) - shape_chunk_selection = get_shape_for_selector( - chunk_selection, chunk_spec.shape, pad=True, drop_axes=drop_axes + out_selection_as_slices_expanded = list( + next(out_selection_as_slices_iter) + if not isinstance(c, int) + else slice(0, 1) + for c in chunk_selection ) - if prod_op(shape_chunk_selection) != prod_op(shape_chunk_selection_slices): - raise CollapsedDimensionError( - 
f"{shape_chunk_selection} != {shape_chunk_selection_slices}" - ) + if is_shape_constant: + io_array_shape = (1,) + else: + shape_iter = iter(shape) + try: + io_array_shape = list( + next(shape_iter) if not isinstance(c, int) else 1 + for c in chunk_selection + ) + except RuntimeError as e: + if isinstance(e.__cause__, StopIteration): + raise IndexError( + f"the size of the chunk subset {chunk_selection} and input/output subset {shape} are incompatible" + ) from None + raise chunk_info_with_indices.append( ChunkItem( key=byte_getter.path, chunk_subset=chunk_selection_as_slices, chunk_shape=chunk_spec.shape, - subset=out_selection_as_slices, - shape=shape, + subset=out_selection_as_slices_expanded, + shape=io_array_shape, ) ) return RustChunkInfo(chunk_info_with_indices, write_empty_chunks) diff --git a/src/chunk_item.rs b/src/chunk_item.rs index b759120..1e435bd 100644 --- a/src/chunk_item.rs +++ b/src/chunk_item.rs @@ -31,6 +31,7 @@ pub(crate) struct ChunkItem { pub chunk_subset: ArraySubset, pub subset: ArraySubset, pub shape: Vec, + pub output_shape: Vec, pub num_elements: u64, } @@ -65,6 +66,7 @@ impl ChunkItem { subset, shape: chunk_shape_nonzero_u64, num_elements, + output_shape: shape_nonzero_u64 }) } } diff --git a/src/lib.rs b/src/lib.rs index 1b7c08e..81d9daf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,7 +38,7 @@ mod utils; use crate::concurrency::ChunkConcurrentLimitAndCodecOptions; use crate::store::StoreConfig; -use crate::utils::{PyCodecErrExt, PyErrExt as _, PyUntypedArrayExt as _}; +use crate::utils::{PyCodecErrExt, PyErrExt as _}; // TODO: Use a OnceLock for store with get_or_try_init when stabilised? 
#[gen_stub_pyclass] @@ -288,7 +288,6 @@ impl CodecPipelineImpl { ) -> PyResult<()> { // Get input array let output = Self::nparray_to_unsafe_cell_slice(value)?; - let output_shape: Vec = value.shape_zarr()?; // Adjust the concurrency based on the codec chain and the first chunk description let Some((chunk_concurrent_limit, codec_options)) = @@ -343,7 +342,7 @@ impl CodecPipelineImpl { .fixed_size() .ok_or("variable length data type not supported") .map_py_err::()?, - &output_shape, + bytemuck::must_cast_slice::<_, u64>(&item.output_shape), item.subset.clone(), ) .map_py_err::()? @@ -410,7 +409,6 @@ impl CodecPipelineImpl { } else { InputValue::Constant(FillValue::new(input_slice.to_vec())) }; - let input_shape: Vec = value.shape_zarr()?; // Adjust the concurrency based on the codec chain and the first chunk description let Some((chunk_concurrent_limit, mut codec_options)) = @@ -424,7 +422,7 @@ impl CodecPipelineImpl { let store_chunk = |item: ChunkItem| match &input { InputValue::Array(input) => { let chunk_subset_bytes = input - .extract_array_subset(&item.subset, &input_shape, &self.data_type) + .extract_array_subset(&item.subset, bytemuck::must_cast_slice::<_, u64>(&item.output_shape), &self.data_type) .map_codec_err()?; self.store_chunk_subset_bytes( &item, diff --git a/src/utils.rs b/src/utils.rs index d663b5c..3ededf0 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,7 +1,6 @@ use std::fmt::Display; -use numpy::{PyUntypedArray, PyUntypedArrayMethods}; -use pyo3::{Bound, PyErr, PyResult, PyTypeInfo}; +use pyo3::{PyErr, PyResult, PyTypeInfo}; use zarrs::array::CodecError; use crate::ChunkItem; @@ -38,23 +37,6 @@ impl PyCodecErrExt for Result { } } -pub(crate) trait PyUntypedArrayExt { - fn shape_zarr(&self) -> PyResult>; -} - -impl PyUntypedArrayExt for Bound<'_, PyUntypedArray> { - fn shape_zarr(&self) -> PyResult> { - Ok(if self.shape().is_empty() { - vec![1] // scalar value - } else { - self.shape() - .iter() - .map(|&i| u64::try_from(i)) - .collect::>()? 
- }) - } -} - pub fn is_whole_chunk(item: &ChunkItem) -> bool { item.chunk_subset.start().iter().all(|&o| o == 0) && item.chunk_subset.shape() == bytemuck::must_cast_slice::<_, u64>(&item.shape) diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index 8afbe77..299f3bc 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -37,7 +37,7 @@ def fill_value() -> int: return fill_value_ -non_numpy_indices = [ +indexers = [ pytest.param(slice(1, 3), id="slice_in_chunk"), pytest.param(slice(1, 7), id="slice_across_chunks"), pytest.param(2, id="int"), @@ -45,14 +45,6 @@ def fill_value() -> int: pytest.param(Ellipsis, id="ellipsis"), ] -numpy_indices = [ - pytest.param(np.array([1, 2]), id="contiguous_in_chunk_array"), - pytest.param(np.array([0, 3]), id="discontinuous_in_chunk_array"), - pytest.param(np.array([0, 6]), id="across_chunks_indices_array"), -] - -all_indices = numpy_indices + non_numpy_indices - indexing_method_params = [ pytest.param(lambda x: getattr(x, "oindex"), id="oindex"), pytest.param(lambda x: x, id="vindex"), @@ -106,7 +98,6 @@ def arr(tmp_path: Path, dimensionality: int, format: Literal[2, 3]) -> zarr.Arra def roundtrip_params() -> Generator[ParameterSet]: for format, dimensionality in product(zarr_formats, dimensionalities_): - indexers = non_numpy_indices if dimensionality > 2 else all_indices for index_param_prod in product(indexers, repeat=dimensionality): index = tuple(index_param.values[0] for index_param in index_param_prod) # multi-ellipsis indexing is not supported diff --git a/tests/test_strict_mode.py b/tests/test_strict_mode.py index d634799..5b5becd 100644 --- a/tests/test_strict_mode.py +++ b/tests/test_strict_mode.py @@ -15,10 +15,7 @@ import zarr from zarr.storage import StorePath -from zarrs.pipeline import ( - DiscontiguousArrayError, - UnsupportedDataTypeError, -) +from zarrs.pipeline import UnsupportedDataTypeError, UnsupportedIndexTypeError if TYPE_CHECKING: from zarr.abc.store import Store @@ 
-90,12 +87,23 @@ def test_advanced_indexing( arr[:] = data with ( - pytest.raises(DiscontiguousArrayError) + pytest.raises(UnsupportedIndexTypeError) if codec_pipeline_strict else nullcontext() ): arr[:, ::2] = data[:, ::2] + with ( + pytest.raises(UnsupportedIndexTypeError) + if codec_pipeline_strict + else nullcontext() + ): arr[[0, 2], [0, 1]] = data[[0, 2], [0, 1]] + with ( + pytest.raises(UnsupportedIndexTypeError) + if codec_pipeline_strict + else nullcontext() + ): + arr[np.array([0, 1, 2, 3])] @pytest.mark.parametrize("store", ["local"], indirect=["store"]) def test_supported_operations_still_work( @@ -121,7 +129,3 @@ def test_supported_operations_still_work( # Integer indexing should work assert arr[5, 5] == data[5, 5] - - # Contiguous array indexing should work - indices = np.array([1, 2, 3]) - assert np.array_equal(arr[indices, 0], data[indices, 0])