From 15e4c618b4beed38aa18c4e02a3cb3ab952d80a9 Mon Sep 17 00:00:00 2001 From: Peter Bull <1799186+pjbull@users.noreply.github.com> Date: Mon, 29 Jun 2026 08:43:40 -0700 Subject: [PATCH] Fix rglob TypeError when blob and directory share a name (#431) When a cloud bucket contains both a blob and a "directory" prefix with the same name, _build_subtree could either crash with 'TypeError: NoneType object is not subscriptable' or silently overwrite already-discovered descendants. Cloud object stores allow this layout even though local filesystems do not. The tree builder now treats a file leaf and a directory subtree as mergeable at the same path: the leaf is promoted to a subtree when a deeper entry arrives, and an existing subtree is preserved when a same named file or directory entry is processed later. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- HISTORY.md | 1 + cloudpathlib/cloudpath.py | 16 +++++++++++++++- tests/test_cloudpath_file_io.py | 25 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 64e08d5b..3d417ebc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ - Changed `S3Client._get_metadata` to read object metadata with `HeadObject` instead of `GetObject`, so `stat`, `etag`, and `size` no longer open the object body. Also fixes a `KeyError` on `ContentLength` against S3-compatible gateways that drop `Content-Length` from `GetObject` responses. (Issue [#564](https://github.com/drivendataorg/cloudpathlib/issues/564), PR [#565](https://github.com/drivendataorg/cloudpathlib/pull/565)) - Added a `lazy` keyword argument to `CloudPath.walk`. By default (`lazy=False`) the existing fast behavior is preserved: the whole subtree is fetched up front with a single recursive listing. Passing `lazy=True` lists each directory on demand so that, when `top_down=True`, callers can prune subdirectories by modifying `dirnames` in-place (à la `os.walk` / `Path.walk`) to skip fetching the contents of those subtrees entirely — dramatically reducing API calls for large, sparsely-traversed trees. (Issue [#518](https://github.com/drivendataorg/cloudpathlib/issues/518)) - Fixed S3 copy and move operations to forward copy-specific extra args such as `CopySourceSSECustomerKey`, and added `addressing_style="virtual"` support for `S3Client` (Issues [#500](https://github.com/drivendataorg/cloudpathlib/issues/500), [#527](https://github.com/drivendataorg/cloudpathlib/issues/527)) +- Fixed `CloudPath.rglob` raising `TypeError: 'NoneType' object is not subscriptable` when a bucket contains both a blob and a "directory" prefix sharing the same name (a layout cloud object stores allow but local filesystems do not). The internal `_build_subtree` now merges file and directory entries at the same path instead of overwriting one with the other. (Issue [#431](https://github.com/drivendataorg/cloudpathlib/issues/431)) ## v0.24.0 (2026-04-29) - Added support for S3 Multi-Region Access Point (MRAP) URLs in `S3Path` (Issue [#556](https://github.com/drivendataorg/cloudpathlib/issues/556), PR [#557](https://github.com/drivendataorg/cloudpathlib/pull/557)) diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py index 01306fa7..41492b24 100644 --- a/cloudpathlib/cloudpath.py +++ b/cloudpathlib/cloudpath.py @@ -488,16 +488,30 @@ def _glob_checks(self, pattern: Union[str, os.PathLike]) -> str: def _build_subtree(self, recursive): # build a tree structure for all files out of default dicts Tree: Callable = lambda: defaultdict(Tree) + _MISSING = object() def _build_tree(trunk, branch, nodes, is_dir): """Utility to build a tree from nested defaultdicts with a generator of nodes (parts) of a path.""" next_branch = next(nodes, None) + existing = trunk.get(branch) if branch in trunk else _MISSING + if next_branch is None: - trunk[branch] = Tree() if is_dir else None # leaf node + # Leaf node. Cloud object stores can have a blob and a + # "directory" with the same name; if we've already recorded a + # subtree here, keep it (it represents descendants we still + # need to walk). Otherwise set the marker for file/dir. + if existing is _MISSING or not isinstance(existing, defaultdict): + trunk[branch] = Tree() if is_dir else None else: + # Recursing into a deeper level: if we previously recorded a + # file leaf (None) at this branch, promote it to a subtree so + # descendants are reachable. If a subtree already exists, + # reuse it rather than overwriting. + if existing is None: + trunk[branch] = Tree() _build_tree(trunk[branch], next_branch, nodes, is_dir) file_tree = Tree() diff --git a/tests/test_cloudpath_file_io.py b/tests/test_cloudpath_file_io.py index 04519557..e8d649ba 100644 --- a/tests/test_cloudpath_file_io.py +++ b/tests/test_cloudpath_file_io.py @@ -472,6 +472,31 @@ def test_glob_many_open_files(rig): assert next(it) == p +def test_rglob_file_and_dir_same_name(rig, monkeypatch): + """Regression test for #431: a blob and a "directory" can share the same + name in cloud storage. Recursive globbing must not error in that case.""" + base = rig.create_cloud_path("") + + # Simulate _list_dir returning both a blob and a "directory" with the + # same name "output". Cloud object stores allow this; local filesystems + # do not, which is why we patch _list_dir instead of writing files. + fake_entries = [ + (base / "output", False), + (base / "output" / "13655" / "0" / "file1.json", False), + (base / "output" / "13655" / "0", True), + (base / "output" / "13655", True), + ] + + def fake_list_dir(self, cloud_path, recursive=False): + return iter(fake_entries) + + monkeypatch.setattr(rig.client_class, "_list_dir", fake_list_dir) + + results = list(base.rglob("*")) + paths = {str(p) for p in results} + assert str(base / "output" / "13655" / "0" / "file1.json") in paths + + def test_glob_exceptions(rig): cp = rig.create_cloud_path("dir_0/")