
Commit aa58ffa

Workaround for KeyError 'ome' when writing multiscale images (#1115)
* fix: workaround for KeyError 'ome' when writing multiscale with processes scheduler (#1024)

  Re-open the zarr group after da.compute() in _write_raster_datatree so that the main process picks up metadata written by ome-zarr-py's delayed write_multiscales_metadata() task, which runs in a subprocess under the 'processes' scheduler and leaves the original in-memory GroupMetadata stale. The fresh group is returned to _write_raster so the subsequent spatialdata attrs write also uses the correct on-disk state.

  Upstream issue: ome/ome-zarr-py#580

* fix: use use_consolidated=False when re-opening group after compute

  The re-open after da.compute() was inheriting stale consolidated metadata, which caused KeyError when newly written children (e.g. labels3d_multiscale_xarray) were not yet reflected in it. Passing use_consolidated=False forces zarr to read group membership directly from the store instead.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
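The mechanism is not specific to spatialdata. Below is a minimal sketch of the same failure pattern using plain zarr-python and multiprocessing; it assumes zarr-python v3 semantics (group metadata is read once at open time and cached in memory), and the path example.zarr is illustrative:

import multiprocessing

import zarr


def write_metadata_in_subprocess(path: str) -> None:
    # Stand-in for ome-zarr-py's delayed write_multiscales_metadata() task,
    # which runs in a worker process under dask's 'processes' scheduler.
    g = zarr.open_group(path, mode="r+")
    g.attrs["ome"] = {"multiscales": []}  # written to zarr.json on disk


if __name__ == "__main__":
    path = "example.zarr"
    group = zarr.open_group(path, mode="w")  # parent process holds this Group

    p = multiprocessing.Process(target=write_metadata_in_subprocess, args=(path,))
    p.start()
    p.join()

    # On disk, zarr.json now contains the 'ome' attribute, but the parent's
    # Group still carries the GroupMetadata it loaded at open time:
    print(dict(group.attrs))  # {} -- so group.attrs["ome"] raises KeyError

    # The workaround from this commit: re-open to force a fresh read from
    # the store, bypassing any consolidated metadata.
    fresh = zarr.open_group(path, mode="r+", use_consolidated=False)
    print(dict(fresh.attrs))  # {'ome': {'multiscales': []}}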
1 parent 8d7f72f · commit aa58ffa

2 files changed, with 34 additions and 3 deletions


src/spatialdata/_io/io_raster.py

Lines changed: 12 additions & 2 deletions
@@ -316,7 +316,7 @@ def _write_raster(
             **metadata,
         )
     elif isinstance(raster_data, DataTree):
-        _write_raster_datatree(
+        group = _write_raster_datatree(
             raster_type,
             group,
             name,
@@ -409,7 +409,7 @@ def _write_raster_datatree(
     raster_format: RasterFormatType,
     storage_options: JSONDict | list[JSONDict] | None = None,
     **metadata: str | JSONDict | list[JSONDict],
-) -> None:
+) -> zarr.Group:
     """Write raster data of type DataTree to disk.

     Parameters
@@ -460,13 +460,23 @@ def _write_raster_datatree(
     # os.replace is called. These can also be alleviated by using 'single-threaded' scheduler.
     da.compute(*dask_delayed, optimize_graph=False)

+    # Workaround for https://github.com/scverse/spatialdata/issues/1024.
+    # ome-zarr-py bundles write_multiscales_metadata() as a dask.delayed task in the compute=False
+    # code path (see https://github.com/ome/ome-zarr-py/issues/580). When da.compute() runs with
+    # the 'processes' scheduler that task executes in a subprocess: the on-disk zarr.json is written
+    # correctly, but the zarr.Group held in this process keeps its original in-memory GroupMetadata
+    # and never sees the update. Re-opening the group forces a fresh read from the store.
+    # This workaround should not be needed once https://github.com/ome/ome-zarr-py/issues/580 is fixed.
+    group = zarr.open_group(store=group.store, path=group.path, mode="r+", use_consolidated=False)
+
     trans_group = group["labels"][element_name] if raster_type == "labels" else group
     overwrite_coordinate_transformations_raster(
         group=trans_group,
         transformations=transformations,
         axes=tuple(input_axes),
         raster_format=raster_format,
     )
+    return group


 def write_image(
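The second commit addresses a subtlety of the re-open itself: if the store already contains consolidated metadata, a default zarr.open_group() will trust that snapshot and miss children written after it was taken. A short sketch of that behavior, again assuming zarr-python v3 (the group names here are illustrative, echoing the labels3d_multiscale_xarray case from the commit message):

import zarr

path = "consolidated.zarr"
root = zarr.open_group(path, mode="w")
root.create_group("images")
zarr.consolidate_metadata(path)  # snapshot of the current group membership

# A child written after consolidation is on disk but absent from the snapshot.
root.create_group("labels3d_multiscale_xarray")

reopened = zarr.open_group(path, mode="r+")  # may use the stale consolidated snapshot
print("labels3d_multiscale_xarray" in reopened)  # can be False -> downstream KeyError

# use_consolidated=False reads membership directly from the store instead.
fresh = zarr.open_group(path, mode="r+", use_consolidated=False)
print("labels3d_multiscale_xarray" in fresh)  # True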

tests/io/test_pyramids_performance.py

Lines changed: 22 additions & 1 deletion
@@ -10,10 +10,11 @@
 import xarray as xr
 import zarr

-from spatialdata import SpatialData
+from spatialdata import SpatialData, read_zarr
 from spatialdata._io import write_image
 from spatialdata._io.format import CurrentRasterFormat
 from spatialdata.models import Image2DModel
+from spatialdata.testing import assert_spatial_data_objects_are_identical

 if TYPE_CHECKING:
     import _pytest.fixtures
@@ -95,3 +96,23 @@ def test_write_image_multiscale_performance(sdata_with_image: SpatialData, tmp_p
     # In addition, we could do use a mock side effect to check that the entry points from within spatialdata are within
     # the expected range.
     assert actual_num_chunk_reads in range(0, num_chunks_scale0.item() * 2 + 1)
+
+
+@pytest.mark.parametrize("scheduler", ["threads", "processes"])
+def test_write_multiscale_image_dask_scheduler(tmp_path: Path, scheduler: str) -> None:
+    # Regression test for https://github.com/scverse/spatialdata/issues/1024.
+    # Writing a multiscale image with the 'processes' Dask scheduler previously raised
+    # KeyError: 'ome' because ome-zarr-py runs write_multiscales_metadata() as a
+    # dask.delayed task (https://github.com/ome/ome-zarr-py/issues/580): the metadata
+    # write occurs in a subprocess and the zarr.Group in the main process is never
+    # refreshed, so subsequent metadata reads fail.
+    rng = np.random.default_rng(0)
+    arr = dask.array.from_array(rng.random((3, 64, 64)).astype("float32"), chunks=(3, 32, 32))
+    image = Image2DModel.parse(arr, dims=["c", "y", "x"], scale_factors=[2])
+    sdata = SpatialData(images={"img": image})
+
+    store_path = tmp_path / "test.zarr"
+    with dask.config.set(scheduler=scheduler):
+        sdata.write(store_path)
+
+    assert_spatial_data_objects_are_identical(sdata, read_zarr(store_path))
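To run the regression test locally, assuming a standard pytest setup from the repository root, an invocation along these lines covers both parametrizations:

python -m pytest tests/io/test_pyramids_performance.py -k test_write_multiscale_image_dask_scheduler

Before this commit the 'processes' case failed with KeyError: 'ome'; the 'threads' case passed and is kept as a guard against regressions in the fixed path.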
