Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changes/3668.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Exposes the array runtime configuration as an attribute called `config` on the `Array` and
`AsyncArray` classes. The previous `AsyncArray._config` attribute is now a deprecated alias for `AsyncArray.config`.

Adds a `with_config` method to `Array` and `AsyncArray` for creating a new instance with a different runtime configuration, and fixes inaccurate documentation about the default value of the `write_empty_chunks` configuration parameter.
9 changes: 8 additions & 1 deletion docs/user-guide/performance.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,14 @@ This optimization prevents storing redundant objects and can speed up reads, but
added computation during array writes, since the contents of
each chunk must be compared to the fill value, and these advantages are contingent on the content of the array.
If you know that your data will form chunks that are almost always non-empty, then there is no advantage to the optimization described above.
In this case, creating an array with `write_empty_chunks=True` (the default) will instruct Zarr to write every chunk without checking for emptiness.
In this case, creating an array with `write_empty_chunks=True` will instruct Zarr to write every chunk without checking for emptiness.

The default value of `write_empty_chunks` is `False`:

```python exec="true" session="performance" source="above" result="ansi"
arr = zarr.create_array(store={}, shape=(1,), dtype='uint8')
# compare against the singleton by identity, as the test suite does for this flag
assert arr.config.write_empty_chunks is False
```

The following example illustrates the effect of the `write_empty_chunks` flag on
the time required to write an array with different values:
Expand Down
84 changes: 77 additions & 7 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@
from zarr.codecs.sharding import ShardingCodecIndexLocation
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar
from zarr.storage import StoreLike
from zarr.types import AnyArray, AnyAsyncArray, AsyncArrayV2, AsyncArrayV3
from zarr.types import AnyArray, AnyAsyncArray, ArrayV2, ArrayV3, AsyncArrayV2, AsyncArrayV3


# Array and AsyncArray are defined in the base ``zarr`` namespace
Expand Down Expand Up @@ -300,14 +300,14 @@ class AsyncArray(Generic[T_ArrayMetadata]):
The path to the Zarr store.
codec_pipeline : CodecPipeline
The codec pipeline used for encoding and decoding chunks.
_config : ArrayConfig
config : ArrayConfig
The runtime configuration of the array.
"""

metadata: T_ArrayMetadata
store_path: StorePath
codec_pipeline: CodecPipeline = field(init=False)
_config: ArrayConfig
config: ArrayConfig

@overload
def __init__(
Expand Down Expand Up @@ -336,7 +336,7 @@ def __init__(

object.__setattr__(self, "metadata", metadata_parsed)
object.__setattr__(self, "store_path", store_path)
object.__setattr__(self, "_config", config_parsed)
object.__setattr__(self, "config", config_parsed)
object.__setattr__(
self,
"codec_pipeline",
Expand Down Expand Up @@ -1012,6 +1012,11 @@ async def example():
def store(self) -> Store:
return self.store_path.store

@property
@deprecated("Use AsyncArray.config instead.", category=ZarrDeprecationWarning)
def _config(self) -> ArrayConfig:
    """
    Deprecated alias for `AsyncArray.config`.

    Returns
    -------
    ArrayConfig
        The same object as `self.config`.
    """
    runtime_config = self.config
    return runtime_config

@property
def ndim(self) -> int:
"""Returns the number of dimensions in the Array.
Expand Down Expand Up @@ -1165,7 +1170,7 @@ def order(self) -> MemoryOrder:
if self.metadata.zarr_format == 2:
return self.metadata.order
else:
return self._config.order
return self.config.order

@property
def attrs(self) -> dict[str, JSON]:
Expand Down Expand Up @@ -1298,6 +1303,35 @@ def _nshards(self) -> int:
"""
return product(self._shard_grid_shape)

@overload
def with_config(self: AsyncArrayV2, config: ArrayConfigLike) -> AsyncArrayV2: ...

@overload
def with_config(self: AsyncArrayV3, config: ArrayConfigLike) -> AsyncArrayV3: ...

def with_config(self, config: ArrayConfigLike) -> Self:
    """
    Return a new AsyncArray with the same metadata and store path as this one, but with a
    new runtime configuration.

    Parameters
    ----------
    config : ArrayConfigLike
        The runtime config for the new AsyncArray. An `ArrayConfig` instance is used as-is.
        A dict is layered over the current array's config, so any keys it does not specify
        are inherited from the current array's config rather than from global defaults.

    Returns
    -------
    Self
        A new AsyncArray of the same type as this one.
    """
    if isinstance(config, ArrayConfig):
        new_config = config
    else:
        # `replace` starts from every field of the current config and overrides only the
        # keys present in `config`, so inheritance does not depend on a hand-maintained
        # list of field names.
        new_config = replace(self.config, **config)
    return type(self)(metadata=self.metadata, store_path=self.store_path, config=new_config)

async def nchunks_initialized(self) -> int:
"""
Calculate the number of chunks that have been initialized in storage.
Expand Down Expand Up @@ -1570,7 +1604,7 @@ async def _get_selection(
)
if product(indexer.shape) > 0:
# need to use the order from the metadata for v2
_config = self._config
_config = self.config
if self.metadata.zarr_format == 2:
_config = replace(_config, order=self.order)

Expand Down Expand Up @@ -1741,7 +1775,7 @@ async def _set_selection(
value_buffer = prototype.nd_buffer.from_ndarray_like(value)

# need to use the order from the metadata for v2
_config = self._config
_config = self.config
if self.metadata.zarr_format == 2:
_config = replace(_config, order=self.metadata.order)

Expand Down Expand Up @@ -2063,6 +2097,19 @@ def async_array(self) -> AsyncArray[T_ArrayMetadata]:
"""
return self._async_array

@property
def config(self) -> ArrayConfig:
    """
    The runtime configuration for this array.

    This property is read-only. To work with a different runtime configuration, call
    `Array.with_config`, which creates a new `Array` with the modified configuration.

    Returns
    -------
    ArrayConfig
        The runtime configuration of the underlying async array.
    """
    underlying = self.async_array
    return underlying.config

@classmethod
@deprecated("Use zarr.create_array instead.", category=ZarrDeprecationWarning)
def create(
Expand Down Expand Up @@ -2524,6 +2571,29 @@ def _nshards(self) -> int:
"""
return self.async_array._nshards

@overload
def with_config(self: ArrayV2, config: ArrayConfigLike) -> ArrayV2: ...

@overload
def with_config(self: ArrayV3, config: ArrayConfigLike) -> ArrayV3: ...

def with_config(self, config: ArrayConfigLike) -> Self:
    """
    Return a copy of this Array with a new runtime configuration.

    Parameters
    ----------
    config : ArrayConfigLike
        The runtime config for the new Array. Any keys not specified will be inherited
        from the current array's config.

    Returns
    -------
    Self
        A new Array of the same type as this one, wrapping the async array returned by
        the underlying async array's `with_config`.
    """
    # the configuration handling itself is delegated to the wrapped async array
    reconfigured = self._async_array.with_config(config)
    return type(self)(reconfigured)

@property
def nbytes(self) -> int:
"""
Expand Down
6 changes: 6 additions & 0 deletions src/zarr/core/array_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ def from_dict(cls, data: ArrayConfigParams) -> Self:
kwargs_out[field_name] = data[field_name]
return cls(**kwargs_out)

def to_dict(self) -> ArrayConfigParams:
    """
    Serialize this instance to a dict.

    Returns
    -------
    ArrayConfigParams
        A dict with the keys ``"order"`` and ``"write_empty_chunks"``, holding the values
        of the attributes of the same names.
    """
    params: ArrayConfigParams = {
        "order": self.order,
        "write_empty_chunks": self.write_empty_chunks,
    }
    return params


ArrayConfigLike = ArrayConfig | ArrayConfigParams

Expand Down
2 changes: 1 addition & 1 deletion tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def test_open_array_respects_write_empty_chunks_config(zarr_format: ZarrFormat)
arr2 = zarr.open(store=store, path="test_array", config={"write_empty_chunks": True})
assert isinstance(arr2, zarr.Array)

assert arr2.async_array._config.write_empty_chunks is True
assert arr2.async_array.config.write_empty_chunks is True

arr2[0:5] = np.zeros(5)
assert arr2.nchunks_initialized == 1
Expand Down
42 changes: 40 additions & 2 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
default_filters_v2,
default_serializer_v3,
)
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams
from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype
from zarr.core.chunk_grids import _auto_partition
from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams
Expand Down Expand Up @@ -889,7 +890,7 @@ def test_write_empty_chunks_behavior(
config={"write_empty_chunks": write_empty_chunks},
)

assert arr.async_array._config.write_empty_chunks == write_empty_chunks
assert arr.async_array.config.write_empty_chunks == write_empty_chunks

# initialize the store with some non-fill value chunks
arr[:] = fill_value + 1
Expand Down Expand Up @@ -1562,7 +1563,7 @@ async def test_write_empty_chunks_config(write_empty_chunks: bool, store: Store)
"""
with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}):
arr = await create_array(store, shape=(2, 2), dtype="i4")
assert arr._config.write_empty_chunks == write_empty_chunks
assert arr.config.write_empty_chunks == write_empty_chunks

@staticmethod
@pytest.mark.parametrize("path", [None, "", "/", "/foo", "foo", "foo/bar"])
Expand Down Expand Up @@ -2194,3 +2195,40 @@ def test_create_array_with_data_num_gets(
# one get for the metadata and one per shard.
# Note: we don't actually need one get per shard, but this is the current behavior
assert store.counter["get"] == 1 + num_shards


@pytest.mark.parametrize("config", [{}, {"write_empty_chunks": True}, {"order": "C"}])
def test_with_config(config: ArrayConfigParams) -> None:
    """
    Test that `AsyncArray.with_config` and `Array.with_config` create a copy of the source
    array with a new runtime configuration, inheriting any key that the new configuration
    does not specify from the source array.
    """
    # the config we start with
    source_config: ArrayConfigParams = {"write_empty_chunks": False, "order": "F"}
    source_array = zarr.create_array({}, shape=(1,), dtype="uint8", config=source_config)

    async_result = source_array._async_array.with_config(config).config.to_dict()
    sync_result = source_array.with_config(config).config.to_dict()

    for key in source_config:
        # an overridden key takes the new value; every other key keeps the source value
        expected = config[key] if key in config else source_config[key]  # type: ignore[literal-required]
        assert async_result[key] == expected  # type: ignore[literal-required]
        assert sync_result[key] == expected  # type: ignore[literal-required]


def test_with_config_polymorphism() -> None:
    """
    Test that `AsyncArray.with_config` and `Array.with_config` accept dicts and full array config
    objects, and that both forms produce the requested configuration.
    """
    source_config: ArrayConfig = ArrayConfig.from_dict({"write_empty_chunks": False, "order": "F"})
    source_config_dict = source_config.to_dict()

    # Start from a config that differs from `source_config` in every field, so the test
    # cannot pass if `with_config` silently ignores its argument.
    arr = zarr.create_array(
        {}, shape=(1,), dtype="uint8", config={"write_empty_chunks": True, "order": "C"}
    )
    arr_source_config = arr.with_config(source_config)
    arr_source_config_dict = arr.with_config(source_config_dict)

    assert arr_source_config.config == source_config
    assert arr_source_config_dict.config == source_config
    assert arr_source_config.config == arr_source_config_dict.config