Skip to content
13 changes: 11 additions & 2 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -1051,11 +1051,12 @@ def downloads_gen():
else:
return

yield {"status": "deleting extra files"}
remote_paths = set(map(str, entries))
zarr_basepath = Path(download_path)
dirs = deque([zarr_basepath])

announced: bool = False
empty_dirs: deque[Path] = deque()
dirs: deque[Path] = deque([zarr_basepath])
while dirs:
d = dirs.popleft()
is_empty = True
Expand All @@ -1066,7 +1067,11 @@ def downloads_gen():
p.is_file()
and p.relative_to(zarr_basepath).as_posix() not in remote_paths
):
if not announced:
announced = True
yield {"status": "deleting extra files"}
try:
lgr.debug("Deleting extra Zarr file %s", p)
p.unlink()
except OSError:
is_empty = False
Expand All @@ -1079,6 +1084,10 @@ def downloads_gen():
empty_dirs.append(d)
while empty_dirs:
d = empty_dirs.popleft()
if not announced:
announced = True
yield {"status": "deleting extra files"}
lgr.debug("Removing now-empty Zarr directory %s", d)
d.rmdir()
if d.parent != zarr_basepath and not any(d.parent.iterdir()):
empty_dirs.append(d.parent)
Expand Down
67 changes: 67 additions & 0 deletions dandi/tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,73 @@ def test_download_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset) -> None:
)


def _zarr_download_statuses(
dandiset: SampleDandiset,
output_dir: Path,
existing: DownloadExisting = DownloadExisting.ERROR,
) -> list[dict]:
return list(
Downloader(
url=DandisetURL(
instance=dandiset.api.instance,
dandiset_id=dandiset.dandiset.identifier,
version_id=dandiset.dandiset.version_id,
),
output_dir=output_dir,
existing=existing,
get_metadata=True,
get_assets=True,
preserve_tree=False,
jobs_per_zarr=None,
on_error="raise",
).download_generator()
)


@pytest.mark.ai_generated
def test_download_zarr_clean_no_deleting_status(
tmp_path: Path, zarr_dandiset: SampleDandiset
) -> None:
# On a fresh, first-time download every local file corresponds to a remote
# entry, so nothing is deleted and the "deleting extra files" status must
# not be emitted.
statuses = _zarr_download_statuses(zarr_dandiset, tmp_path)
assert not any(s.get("status") == "deleting extra files" for s in statuses)
assert_dirtrees_eq(
zarr_dandiset.dspath / "sample.zarr",
tmp_path / zarr_dandiset.dandiset_id / "sample.zarr",
)


@pytest.mark.ai_generated
def test_download_zarr_deletes_orphan_files(
tmp_path: Path, zarr_dandiset: SampleDandiset
) -> None:
# First download cleanly.
download(zarr_dandiset.dandiset.version_api_url, tmp_path)
zarr_root = tmp_path / zarr_dandiset.dandiset_id / "sample.zarr"
# Introduce orphaned local files (and a now-extra subdirectory) that do not
# correspond to any remote Zarr entry.
orphan_file = zarr_root / "orphan.bin"
orphan_file.write_bytes(b"not a real zarr entry")
orphan_dir = zarr_root / "orphan_dir"
orphan_dir.mkdir()
(orphan_dir / "nested_orphan.bin").write_bytes(b"also extra")
assert orphan_file.exists()
assert orphan_dir.exists()
# Re-download: cleanup must remove the orphans and announce it.
statuses = _zarr_download_statuses(
zarr_dandiset, tmp_path, existing=DownloadExisting.OVERWRITE
)
assert any(s.get("status") == "deleting extra files" for s in statuses)
assert not orphan_file.exists()
assert not orphan_dir.exists()
assert_dirtrees_eq(
zarr_dandiset.dspath / "sample.zarr",
zarr_root,
)


def test_download_different_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset) -> None:
dd = tmp_path / zarr_dandiset.dandiset_id
dd.mkdir()
Expand Down
Loading