From db6bf4b94182e9816f1a6a6195b6e1fdad1125c4 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 16:35:19 +0000 Subject: [PATCH 01/10] Only announce/perform Zarr cleanup when there are extra files to delete In _download_zarr the {"status": "deleting extra files"} progress message was emitted on every Zarr download, even on a fresh first-time download where nothing is stale, which misleadingly implied files were being removed. Scan the directory tree first to determine whether any stale files or now-empty directories actually need removal, and only yield the status (and perform the unlink/rmdir work) when there is something to delete. Add per-path DEBUG logging before each unlink and rmdir so deletions are auditable. The whole-tree Zarr checksum verification still runs unconditionally, and the OSError handling and empty-directory pruning behavior are preserved. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01LFHXuWAevoNHvEgw3QhGtU --- dandi/download.py | 57 +++++++++++++++++++++++------- dandi/tests/test_download.py | 67 ++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 13 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index 5d9b77f1e..d6b926791 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1051,12 +1051,17 @@ def downloads_gen(): else: return - yield {"status": "deleting extra files"} remote_paths = set(map(str, entries)) zarr_basepath = Path(download_path) + + # First, scan the tree (without modifying anything) to determine whether + # there are any stale files or now-empty directories to remove. On a + # fresh, first-time download every file on disk corresponds to a remote + # entry, so nothing needs to be deleted and we must not misleadingly + # announce "deleting extra files". + has_extra = False dirs = deque([zarr_basepath]) - empty_dirs: deque[Path] = deque() - while dirs: + while dirs and not has_extra: d = dirs.popleft() is_empty = True for p in list(d.iterdir()): @@ -1066,22 +1071,48 @@ def downloads_gen(): p.is_file() and p.relative_to(zarr_basepath).as_posix() not in remote_paths ): - try: - p.unlink() - except OSError: - is_empty = False + has_extra = True + break elif p.is_dir(): dirs.append(p) is_empty = False else: is_empty = False if is_empty and d != zarr_basepath: - empty_dirs.append(d) - while empty_dirs: - d = empty_dirs.popleft() - d.rmdir() - if d.parent != zarr_basepath and not any(d.parent.iterdir()): - empty_dirs.append(d.parent) + has_extra = True + + if has_extra: + yield {"status": "deleting extra files"} + dirs = deque([zarr_basepath]) + empty_dirs: deque[Path] = deque() + while dirs: + d = dirs.popleft() + is_empty = True + for p in list(d.iterdir()): + if exclude_from_zarr(p): + is_empty = False + elif ( + p.is_file() + and p.relative_to(zarr_basepath).as_posix() not in remote_paths + ): + try: + lgr.debug("Deleting extra Zarr file %s", p) + p.unlink() + except OSError: + is_empty = False + elif p.is_dir(): + dirs.append(p) + is_empty = False + else: + is_empty = False + if is_empty and d != zarr_basepath: + empty_dirs.append(d) + while empty_dirs: + d = empty_dirs.popleft() + lgr.debug("Removing now-empty Zarr directory %s", d) + d.rmdir() + if d.parent != zarr_basepath and not any(d.parent.iterdir()): + empty_dirs.append(d.parent) if "skipped" not in final_out["message"]: zarr_checksum = asset.get_digest().value diff --git a/dandi/tests/test_download.py b/dandi/tests/test_download.py index 9da449667..731244504 100644 --- a/dandi/tests/test_download.py +++ b/dandi/tests/test_download.py @@ -456,6 +456,73 @@ def test_download_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset) -> None: ) +def _zarr_download_statuses( + dandiset: SampleDandiset, + output_dir: Path, + existing: DownloadExisting = DownloadExisting.ERROR, +) -> list[dict]: + return list( + Downloader( + url=DandisetURL( + instance=dandiset.api.instance, + dandiset_id=dandiset.dandiset.identifier, + version_id=dandiset.dandiset.version_id, + ), + output_dir=output_dir, + existing=existing, + get_metadata=True, + get_assets=True, + preserve_tree=False, + jobs_per_zarr=None, + on_error="raise", + ).download_generator() + ) + + +@pytest.mark.ai_generated +def test_download_zarr_clean_no_deleting_status( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + # On a fresh, first-time download every local file corresponds to a remote + # entry, so nothing is deleted and the "deleting extra files" status must + # not be emitted. + statuses = _zarr_download_statuses(zarr_dandiset, tmp_path) + assert not any(s.get("status") == "deleting extra files" for s in statuses) + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + tmp_path / zarr_dandiset.dandiset_id / "sample.zarr", + ) + + +@pytest.mark.ai_generated +def test_download_zarr_deletes_orphan_files( + tmp_path: Path, zarr_dandiset: SampleDandiset +) -> None: + # First download cleanly. + download(zarr_dandiset.dandiset.version_api_url, tmp_path) + zarr_root = tmp_path / zarr_dandiset.dandiset_id / "sample.zarr" + # Introduce orphaned local files (and a now-extra subdirectory) that do not + # correspond to any remote Zarr entry. + orphan_file = zarr_root / "orphan.bin" + orphan_file.write_bytes(b"not a real zarr entry") + orphan_dir = zarr_root / "orphan_dir" + orphan_dir.mkdir() + (orphan_dir / "nested_orphan.bin").write_bytes(b"also extra") + assert orphan_file.exists() + assert orphan_dir.exists() + # Re-download: cleanup must remove the orphans and announce it. + statuses = _zarr_download_statuses( + zarr_dandiset, tmp_path, existing=DownloadExisting.OVERWRITE + ) + assert any(s.get("status") == "deleting extra files" for s in statuses) + assert not orphan_file.exists() + assert not orphan_dir.exists() + assert_dirtrees_eq( + zarr_dandiset.dspath / "sample.zarr", + zarr_root, + ) + + def test_download_different_zarr(tmp_path: Path, zarr_dandiset: SampleDandiset) -> None: dd = tmp_path / zarr_dandiset.dandiset_id dd.mkdir() From 851c10274f6f45e2f9b302c196a0b9248ab4cdff Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 16:48:28 +0000 Subject: [PATCH 02/10] Consolidate cleanup-scan comment to a single line Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01LFHXuWAevoNHvEgw3QhGtU --- dandi/download.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index d6b926791..111affe8a 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1054,11 +1054,7 @@ def downloads_gen(): remote_paths = set(map(str, entries)) zarr_basepath = Path(download_path) - # First, scan the tree (without modifying anything) to determine whether - # there are any stale files or now-empty directories to remove. On a - # fresh, first-time download every file on disk corresponds to a remote - # entry, so nothing needs to be deleted and we must not misleadingly - # announce "deleting extra files". + # Scan the tree (without modifying it) to see if any stale files or now-empty directories need removing, so a clean download doesn't misleadingly announce "deleting extra files". has_extra = False dirs = deque([zarr_basepath]) while dirs and not has_extra: From 90d4eb9c25844b173ce07ef459b724de1f82cd0d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 16:49:19 +0000 Subject: [PATCH 03/10] Use descriptive loop variable names in Zarr cleanup Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01LFHXuWAevoNHvEgw3QhGtU --- dandi/download.py | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index 111affe8a..46190e8b7 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1058,23 +1058,23 @@ def downloads_gen(): has_extra = False dirs = deque([zarr_basepath]) while dirs and not has_extra: - d = dirs.popleft() + dir = dirs.popleft() is_empty = True - for p in list(d.iterdir()): - if exclude_from_zarr(p): + for path in list(dir.iterdir()): + if exclude_from_zarr(path): is_empty = False elif ( - p.is_file() - and p.relative_to(zarr_basepath).as_posix() not in remote_paths + path.is_file() + and path.relative_to(zarr_basepath).as_posix() not in remote_paths ): has_extra = True break - elif p.is_dir(): - dirs.append(p) + elif path.is_dir(): + dirs.append(path) is_empty = False else: is_empty = False - if is_empty and d != zarr_basepath: + if is_empty and dir != zarr_basepath: has_extra = True if has_extra: @@ -1082,33 +1082,33 @@ def downloads_gen(): dirs = deque([zarr_basepath]) empty_dirs: deque[Path] = deque() while dirs: - d = dirs.popleft() + dir = dirs.popleft() is_empty = True - for p in list(d.iterdir()): - if exclude_from_zarr(p): + for path in list(dir.iterdir()): + if exclude_from_zarr(path): is_empty = False elif ( - p.is_file() - and p.relative_to(zarr_basepath).as_posix() not in remote_paths + path.is_file() + and path.relative_to(zarr_basepath).as_posix() not in remote_paths ): try: - lgr.debug("Deleting extra Zarr file %s", p) - p.unlink() + lgr.debug("Deleting extra Zarr file %s", path) + path.unlink() except OSError: is_empty = False - elif p.is_dir(): - dirs.append(p) + elif path.is_dir(): + dirs.append(path) is_empty = False else: is_empty = False - if is_empty and d != zarr_basepath: - empty_dirs.append(d) + if is_empty and dir != zarr_basepath: + empty_dirs.append(dir) while empty_dirs: - d = empty_dirs.popleft() - lgr.debug("Removing now-empty Zarr directory %s", d) - d.rmdir() - if d.parent != zarr_basepath and not any(d.parent.iterdir()): - empty_dirs.append(d.parent) + dir = empty_dirs.popleft() + lgr.debug("Removing now-empty Zarr directory %s", dir) + dir.rmdir() + if dir.parent != zarr_basepath and not any(dir.parent.iterdir()): + empty_dirs.append(dir.parent) if "skipped" not in final_out["message"]: zarr_checksum = asset.get_digest().value From 8667e1248d009ec5fc244a778217e21241ff91c2 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:49:43 -0400 Subject: [PATCH 04/10] Apply suggestion from @CodyCBakerPhD --- dandi/download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dandi/download.py b/dandi/download.py index 46190e8b7..d88344363 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1103,6 +1103,7 @@ def downloads_gen(): is_empty = False if is_empty and dir != zarr_basepath: empty_dirs.append(dir) + while empty_dirs: dir = empty_dirs.popleft() lgr.debug("Removing now-empty Zarr directory %s", dir) From 622aa29a0ffff01262813c9a56863f5d18e188fd Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:52:53 -0400 Subject: [PATCH 05/10] Apply suggestion from @CodyCBakerPhD --- dandi/download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dandi/download.py b/dandi/download.py index d88344363..cc22ab983 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1054,7 +1054,8 @@ def downloads_gen(): remote_paths = set(map(str, entries)) zarr_basepath = Path(download_path) - # Scan the tree (without modifying it) to see if any stale files or now-empty directories need removing, so a clean download doesn't misleadingly announce "deleting extra files". + # Scan the tree to see if any stale files or now-empty directories need removing + # So a clean download doesn't misleadingly announce "deleting extra files" has_extra = False dirs = deque([zarr_basepath]) while dirs and not has_extra: From e65aadd78760a59786fd9268c7427bbcf6ee331b Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 16:52:54 +0000 Subject: [PATCH 06/10] Consolidate Zarr cleanup into a single tree scan Replace the separate detection and deletion walks with one non-mutating scan that collects the stale files and resulting empty directories, then only announces and performs the removal when there is something to delete. A guarded rmdir preserves the previous OSError-on-unlink behavior (a file that can't be removed keeps its directory in place). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01LFHXuWAevoNHvEgw3QhGtU --- dandi/download.py | 56 +++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index cc22ab983..8fd6e6648 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1054,11 +1054,12 @@ def downloads_gen(): remote_paths = set(map(str, entries)) zarr_basepath = Path(download_path) - # Scan the tree to see if any stale files or now-empty directories need removing + # Scan the tree (without modifying it) for stale files and resulting empty directories # So a clean download doesn't misleadingly announce "deleting extra files" - has_extra = False + to_delete: list[Path] = [] + empty_dirs: deque[Path] = deque() dirs = deque([zarr_basepath]) - while dirs and not has_extra: + while dirs: dir = dirs.popleft() is_empty = True for path in list(dir.iterdir()): @@ -1068,49 +1069,32 @@ def downloads_gen(): path.is_file() and path.relative_to(zarr_basepath).as_posix() not in remote_paths ): - has_extra = True - break + to_delete.append(path) elif path.is_dir(): dirs.append(path) is_empty = False else: is_empty = False if is_empty and dir != zarr_basepath: - has_extra = True + empty_dirs.append(dir) - if has_extra: + if to_delete or empty_dirs: yield {"status": "deleting extra files"} - dirs = deque([zarr_basepath]) - empty_dirs: deque[Path] = deque() - while dirs: - dir = dirs.popleft() - is_empty = True - for path in list(dir.iterdir()): - if exclude_from_zarr(path): - is_empty = False - elif ( - path.is_file() - and path.relative_to(zarr_basepath).as_posix() not in remote_paths - ): - try: - lgr.debug("Deleting extra Zarr file %s", path) - path.unlink() - except OSError: - is_empty = False - elif path.is_dir(): - dirs.append(path) - is_empty = False - else: - is_empty = False - if is_empty and dir != zarr_basepath: - empty_dirs.append(dir) - + for path in to_delete: + lgr.debug("Deleting extra Zarr file %s", path) + try: + path.unlink() + except OSError: + # A file we couldn't remove keeps its directory non-empty, so + # the rmdir guard below will leave that directory in place. + pass while empty_dirs: dir = empty_dirs.popleft() - lgr.debug("Removing now-empty Zarr directory %s", dir) - dir.rmdir() - if dir.parent != zarr_basepath and not any(dir.parent.iterdir()): - empty_dirs.append(dir.parent) + if not any(dir.iterdir()): + lgr.debug("Removing now-empty Zarr directory %s", dir) + dir.rmdir() + if dir.parent != zarr_basepath and not any(dir.parent.iterdir()): + empty_dirs.append(dir.parent) if "skipped" not in final_out["message"]: zarr_checksum = asset.get_digest().value From 14295c321bcac0c2ea453b3f6a0d1f7936a40793 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 17:01:24 +0000 Subject: [PATCH 07/10] Annotate dirs deque in Zarr cleanup Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01LFHXuWAevoNHvEgw3QhGtU --- dandi/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandi/download.py b/dandi/download.py index 8fd6e6648..ae2600dba 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1058,7 +1058,7 @@ def downloads_gen(): # So a clean download doesn't misleadingly announce "deleting extra files" to_delete: list[Path] = [] empty_dirs: deque[Path] = deque() - dirs = deque([zarr_basepath]) + dirs: deque[Path] = deque([zarr_basepath]) while dirs: dir = dirs.popleft() is_empty = True From 976122c6cc5687165a09867c05c804f68a1baf17 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 19:04:04 +0000 Subject: [PATCH 08/10] Use single-pass Zarr cleanup with lazy 'deleting' announcement Replace the separate collection scan with the original single mutating walk, yielding the 'deleting extra files' status lazily on the first actual removal via a flag. This avoids the extra full-tree traversal and restores the exact original OSError-on-unlink handling (no rmdir guard needed). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01LFHXuWAevoNHvEgw3QhGtU --- dandi/download.py | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index ae2600dba..073f1be42 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1054,9 +1054,10 @@ def downloads_gen(): remote_paths = set(map(str, entries)) zarr_basepath = Path(download_path) - # Scan the tree (without modifying it) for stale files and resulting empty directories - # So a clean download doesn't misleadingly announce "deleting extra files" - to_delete: list[Path] = [] + # Walk the tree, removing stale files (and the directories they leave empty). + # The "deleting extra files" status is yielded lazily on the first removal so a + # clean download doesn't misleadingly announce it when there's nothing to delete. + announced = False empty_dirs: deque[Path] = deque() dirs: deque[Path] = deque([zarr_basepath]) while dirs: @@ -1069,7 +1070,14 @@ def downloads_gen(): path.is_file() and path.relative_to(zarr_basepath).as_posix() not in remote_paths ): - to_delete.append(path) + if not announced: + announced = True + yield {"status": "deleting extra files"} + try: + lgr.debug("Deleting extra Zarr file %s", path) + path.unlink() + except OSError: + is_empty = False elif path.is_dir(): dirs.append(path) is_empty = False @@ -1077,24 +1085,15 @@ def downloads_gen(): is_empty = False if is_empty and dir != zarr_basepath: empty_dirs.append(dir) - - if to_delete or empty_dirs: - yield {"status": "deleting extra files"} - for path in to_delete: - lgr.debug("Deleting extra Zarr file %s", path) - try: - path.unlink() - except OSError: - # A file we couldn't remove keeps its directory non-empty, so - # the rmdir guard below will leave that directory in place. - pass - while empty_dirs: - dir = empty_dirs.popleft() - if not any(dir.iterdir()): - lgr.debug("Removing now-empty Zarr directory %s", dir) - dir.rmdir() - if dir.parent != zarr_basepath and not any(dir.parent.iterdir()): - empty_dirs.append(dir.parent) + while empty_dirs: + dir = empty_dirs.popleft() + if not announced: + announced = True + yield {"status": "deleting extra files"} + lgr.debug("Removing now-empty Zarr directory %s", dir) + dir.rmdir() + if dir.parent != zarr_basepath and not any(dir.parent.iterdir()): + empty_dirs.append(dir.parent) if "skipped" not in final_out["message"]: zarr_checksum = asset.get_digest().value From dd150f7d59f5a5a8c5f306cc7766ac96c7964666 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:27:36 -0400 Subject: [PATCH 09/10] Apply suggestions from code review Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> --- dandi/download.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index 073f1be42..84d70cea0 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1054,10 +1054,7 @@ def downloads_gen(): remote_paths = set(map(str, entries)) zarr_basepath = Path(download_path) - # Walk the tree, removing stale files (and the directories they leave empty). - # The "deleting extra files" status is yielded lazily on the first removal so a - # clean download doesn't misleadingly announce it when there's nothing to delete. - announced = False + announced: bool = False empty_dirs: deque[Path] = deque() dirs: deque[Path] = deque([zarr_basepath]) while dirs: From de363123186f4eec0bb9376820193512dc149f19 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 19:28:45 +0000 Subject: [PATCH 10/10] Restore original d/p loop variable names to minimize diff Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01LFHXuWAevoNHvEgw3QhGtU --- dandi/download.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/dandi/download.py b/dandi/download.py index 84d70cea0..438f2e4d8 100644 --- a/dandi/download.py +++ b/dandi/download.py @@ -1058,39 +1058,39 @@ def downloads_gen(): empty_dirs: deque[Path] = deque() dirs: deque[Path] = deque([zarr_basepath]) while dirs: - dir = dirs.popleft() + d = dirs.popleft() is_empty = True - for path in list(dir.iterdir()): - if exclude_from_zarr(path): + for p in list(d.iterdir()): + if exclude_from_zarr(p): is_empty = False elif ( - path.is_file() - and path.relative_to(zarr_basepath).as_posix() not in remote_paths + p.is_file() + and p.relative_to(zarr_basepath).as_posix() not in remote_paths ): if not announced: announced = True yield {"status": "deleting extra files"} try: - lgr.debug("Deleting extra Zarr file %s", path) - path.unlink() + lgr.debug("Deleting extra Zarr file %s", p) + p.unlink() except OSError: is_empty = False - elif path.is_dir(): - dirs.append(path) + elif p.is_dir(): + dirs.append(p) is_empty = False else: is_empty = False - if is_empty and dir != zarr_basepath: - empty_dirs.append(dir) + if is_empty and d != zarr_basepath: + empty_dirs.append(d) while empty_dirs: - dir = empty_dirs.popleft() + d = empty_dirs.popleft() if not announced: announced = True yield {"status": "deleting extra files"} - lgr.debug("Removing now-empty Zarr directory %s", dir) - dir.rmdir() - if dir.parent != zarr_basepath and not any(dir.parent.iterdir()): - empty_dirs.append(dir.parent) + lgr.debug("Removing now-empty Zarr directory %s", d) + d.rmdir() + if d.parent != zarr_basepath and not any(d.parent.iterdir()): + empty_dirs.append(d.parent) if "skipped" not in final_out["message"]: zarr_checksum = asset.get_digest().value