diff --git a/dandi/download.py b/dandi/download.py index 5d9b77f1e..438f2e4d8 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1051,11 +1051,12 @@ def downloads_gen(): else: return - yield {"status": "deleting extra files"} remote_paths = set(map(str, entries)) zarr_basepath = Path(download_path) - dirs = deque([zarr_basepath]) + + announced: bool = False empty_dirs: deque[Path] = deque() + dirs: deque[Path] = deque([zarr_basepath]) while dirs: d = dirs.popleft() is_empty = True @@ -1066,7 +1067,11 @@ def downloads_gen(): p.is_file() and p.relative_to(zarr_basepath).as_posix() not in remote_paths ): + if not announced: + announced = True + yield {"status": "deleting extra files"} try: + lgr.debug("Deleting extra Zarr file %s", p) p.unlink() except OSError: is_empty = False @@ -1079,6 +1084,10 @@ def downloads_gen(): empty_dirs.append(d) while empty_dirs: d = empty_dirs.popleft() + if not announced: + announced = True + yield {"status": "deleting extra files"} + lgr.debug("Removing now-empty Zarr directory %s", d) d.rmdir() if d.parent != zarr_basepath and not any(d.parent.iterdir()): empty_dirs.append(d.parent) diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 9da449667..731244504 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -456,6 +456,73 @@ def test_download_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset) -> None: ) +def _zarr_download_statuses( + dandiset: SampleDandiset, + output_dir: Path, + existing: DownloadExisting = DownloadExisting.ERROR, +) -> list[dict]: + return list( + Downloader( + url=DandisetURL( + instance=dandiset.api.instance, + dandiset_id=dandiset.dandiset.identifier, + version_id=dandiset.dandiset.version_id, + ), + output_dir=output_dir, + existing=existing, + get_metadata=True, + get_assets=True, + preserve_tree=False, + jobs_per_zarr=None, + on_error="raise", + ).download_generator() + ) + + +@pytest.mark.ai_generated +def test_download_zarr_clean_no_deleting_status( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + # On a fresh, first-time download every local file corresponds to a remote + # entry, so nothing is deleted and the "deleting extra files" status must + # not be emitted. + statuses = _zarr_download_statuses(zarr_dandiset, tmp_path) + assert not any(s.get("status") == "deleting extra files" for s in statuses) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +@pytest.mark.ai_generated +def test_download_zarr_deletes_orphan_files( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + # First download cleanly. + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + zarr_root = tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" + # Introduce orphaned local files (and a now-extra subdirectory) that do not + # correspond to any remote Zarr entry. + orphan_file = zarr_root / "orphan.bin" + orphan_file.write_bytes(b"not a real zarr entry") + orphan_dir = zarr_root / "orphan_dir" + orphan_dir.mkdir() + (orphan_dir / "nested_orphan.bin").write_bytes(b"also extra") + assert orphan_file.exists() + assert orphan_dir.exists() + # Re-download: cleanup must remove the orphans and announce it. + statuses = _zarr_download_statuses( + zarr_dandiset, tmp_path, existing=DownloadExisting.OVERWRITE + ) + assert any(s.get("status") == "deleting extra files" for s in statuses) + assert not orphan_file.exists() + assert not orphan_dir.exists() + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + zarr_root, + ) + + def test_download_different_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset) -> None: dd = tmp_path / zarr_dandiset.dandiset_id dd.mkdir()