From e27ce5abb096798dfdc1a187789c322932efde0b Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 13 Mar 2026 12:12:23 -0400 Subject: [PATCH 1/2] Fix: List prefix does not do overeager string match --- src/zarr/storage/_memory.py | 3 +- src/zarr/storage/_utils.py | 11 ++++++++ src/zarr/storage/_zip.py | 2 ++ src/zarr/testing/store.py | 49 ++++++++++++++++++++++++--------- tests/test_store/test_memory.py | 6 ---- 5 files changed, 51 insertions(+), 20 deletions(-) diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index e6f9b7a512..6f18a23014 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -7,7 +7,7 @@ from zarr.core.buffer import Buffer, gpu from zarr.core.buffer.core import default_buffer_prototype from zarr.core.common import concurrent_map -from zarr.storage._utils import _normalize_byte_range_index +from zarr.storage._utils import _normalize_byte_range_index, _normalize_prefix if TYPE_CHECKING: from collections.abc import AsyncIterator, Iterable, MutableMapping @@ -152,6 +152,7 @@ async def list(self) -> AsyncIterator[str]: async def list_prefix(self, prefix: str) -> AsyncIterator[str]: # docstring inherited # note: we materialize all dict keys into a list here so we can mutate the dict in-place (e.g. in delete_prefix) + prefix = _normalize_prefix(prefix) for key in list(self._store_dict): if key.startswith(prefix): yield key diff --git a/src/zarr/storage/_utils.py b/src/zarr/storage/_utils.py index 10ac395b36..b5f3632a90 100644 --- a/src/zarr/storage/_utils.py +++ b/src/zarr/storage/_utils.py @@ -13,6 +13,17 @@ from zarr.core.buffer import Buffer +def _normalize_prefix(prefix: str) -> str: + """Normalize a store prefix to ensure it has a trailing slash. + + This ensures that prefix matching uses directory-like semantics, + so that e.g. prefix "a" does not match keys under "a_extra/". + """ + if prefix != "" and not prefix.endswith("/"): + return prefix + "/" + return prefix + + def normalize_path(path: str | bytes | Path | None) -> str: if path is None: result = "" diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index 72bf9e335a..dfdd278d47 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -16,6 +16,7 @@ SuffixByteRequest, ) from zarr.core.buffer import Buffer, BufferPrototype +from zarr.storage._utils import _normalize_prefix if TYPE_CHECKING: from collections.abc import AsyncIterator, Iterable @@ -261,6 +262,7 @@ async def list(self) -> AsyncIterator[str]: async def list_prefix(self, prefix: str) -> AsyncIterator[str]: # docstring inherited + prefix = _normalize_prefix(prefix) async for key in self.list(): if key.startswith(prefix): yield key diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index 1b8e85ed98..f234b39fec 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -442,23 +442,46 @@ async def test_list(self, store: S) -> None: async def test_list_prefix(self, store: S) -> None: """ Test that the `list_prefix` method works as intended. Given a prefix, it should return - all the keys in storage that start with this prefix. + all the keys under that prefix, treating the prefix as a directory path. """ - prefixes = ("", "a/", "a/b/", "a/b/c/") data = self.buffer_cls.from_bytes(b"") - fname = "zarr.json" - store_dict = {p + fname: data for p in prefixes} - + store_dict = { + "zarr.json": data, + "a/zarr.json": data, + "a/b/zarr.json": data, + "a/b/c/zarr.json": data, + "a_extra/zarr.json": data, + } await store._set_many(store_dict.items()) + all_keys = sorted(store_dict.keys()) + + a_keys = ["a/b/c/zarr.json", "a/b/zarr.json", "a/zarr.json"] + ab_keys = ["a/b/c/zarr.json", "a/b/zarr.json"] + + # query prefix -> expected keys + test_cases: dict[str, list[str]] = { + # empty prefix returns everything + "": all_keys, + # with trailing / + "a/": a_keys, + "a/b/": ab_keys, + "a/b/c/": ["a/b/c/zarr.json"], + "a_extra/": ["a_extra/zarr.json"], + # without trailing / should behave the same as with / + "a": a_keys, + "a/b": ab_keys, + "a/b/c": ["a/b/c/zarr.json"], + "a_extra": ["a_extra/zarr.json"], + # partial prefix that doesn't match any directory + "a_e": [], + # prefix that doesn't match anything + "b": [], + "b/": [], + } - for prefix in prefixes: - observed = tuple(sorted(await _collect_aiterator(store.list_prefix(prefix)))) - expected: tuple[str, ...] = () - for key in store_dict: - if key.startswith(prefix): - expected += (key,) - expected = tuple(sorted(expected)) - assert observed == expected + for prefix, expected in test_cases.items(): + observed = sorted(await _collect_aiterator(store.list_prefix(prefix))) + assert observed == expected, f"list_prefix({prefix!r}): {observed} != {expected}" async def test_list_empty_path(self, store: S) -> None: """ diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py index 03c8b24271..1a9145566f 100644 --- a/tests/test_store/test_memory.py +++ b/tests/test_store/test_memory.py @@ -57,9 +57,6 @@ def test_store_supports_writes(self, store: MemoryStore) -> None: def test_store_supports_listing(self, store: MemoryStore) -> None: assert store.supports_listing - async def test_list_prefix(self, store: MemoryStore) -> None: - assert True - @pytest.mark.parametrize("dtype", ["uint8", "float32", "int64"]) @pytest.mark.parametrize("zarr_format", [2, 3]) async def test_deterministic_size( @@ -163,9 +160,6 @@ def test_store_supports_writes(self, store: GpuMemoryStore) -> None: def test_store_supports_listing(self, store: GpuMemoryStore) -> None: assert store.supports_listing - async def test_list_prefix(self, store: GpuMemoryStore) -> None: - assert True - def test_dict_reference(self, store: GpuMemoryStore) -> None: store_dict: dict[str, Any] = {} result = GpuMemoryStore(store_dict=store_dict) From 231d256dd482a32f5f4585ac691546dd28eddc85 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 13 Mar 2026 12:51:58 -0400 Subject: [PATCH 2/2] junk comments --- src/zarr/testing/store.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index f234b39fec..6e8e02ecdc 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -458,23 +458,17 @@ async def test_list_prefix(self, store: S) -> None: a_keys = ["a/b/c/zarr.json", "a/b/zarr.json", "a/zarr.json"] ab_keys = ["a/b/c/zarr.json", "a/b/zarr.json"] - # query prefix -> expected keys test_cases: dict[str, list[str]] = { - # empty prefix returns everything "": all_keys, - # with trailing / "a/": a_keys, "a/b/": ab_keys, "a/b/c/": ["a/b/c/zarr.json"], "a_extra/": ["a_extra/zarr.json"], - # without trailing / should behave the same as with / "a": a_keys, "a/b": ab_keys, "a/b/c": ["a/b/c/zarr.json"], "a_extra": ["a_extra/zarr.json"], - # partial prefix that doesn't match any directory "a_e": [], - # prefix that doesn't match anything "b": [], "b/": [], }