From 15e4c618b4beed38aa18c4e02a3cb3ab952d80a9 Mon Sep 17 00:00:00 2001
From: Peter Bull <1799186+pjbull@users.noreply.github.com>
Date: Mon, 29 Jun 2026 08:43:40 -0700
Subject: [PATCH] Fix rglob TypeError when blob and directory share a name
 (#431)

When a cloud bucket contains both a blob and a "directory" prefix with
the same name, _build_subtree could either crash with
'TypeError: NoneType object is not subscriptable' or silently overwrite
already-discovered descendants. Cloud object stores allow this layout
even though local filesystems do not.

The tree builder now treats a file leaf and a directory subtree as
mergeable at the same path: the leaf is promoted to a subtree when a
deeper entry arrives, and an existing subtree is preserved when a same
named file or directory entry is processed later.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 HISTORY.md                      |  1 +
 cloudpathlib/cloudpath.py       | 16 +++++++++++++++-
 tests/test_cloudpath_file_io.py | 25 +++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/HISTORY.md b/HISTORY.md
index 64e08d5b..3d417ebc 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -9,6 +9,7 @@
 - Changed `S3Client._get_metadata` to read object metadata with `HeadObject` instead of `GetObject`, so `stat`, `etag`, and `size` no longer open the object body. Also fixes a `KeyError` on `ContentLength` against S3-compatible gateways that drop `Content-Length` from `GetObject` responses. (Issue [#564](https://github.com/drivendataorg/cloudpathlib/issues/564), PR [#565](https://github.com/drivendataorg/cloudpathlib/pull/565))
 - Added a `lazy` keyword argument to `CloudPath.walk`. By default (`lazy=False`) the existing fast behavior is preserved: the whole subtree is fetched up front with a single recursive listing. Passing `lazy=True` lists each directory on demand so that, when `top_down=True`, callers can prune subdirectories by modifying `dirnames` in-place (à la `os.walk` / `Path.walk`) to skip fetching the contents of those subtrees entirely — dramatically reducing API calls for large, sparsely-traversed trees. (Issue [#518](https://github.com/drivendataorg/cloudpathlib/issues/518))
 - Fixed S3 copy and move operations to forward copy-specific extra args such as `CopySourceSSECustomerKey`, and added `addressing_style="virtual"` support for `S3Client` (Issues [#500](https://github.com/drivendataorg/cloudpathlib/issues/500), [#527](https://github.com/drivendataorg/cloudpathlib/issues/527))
+- Fixed `CloudPath.rglob` raising `TypeError: 'NoneType' object is not subscriptable` when a bucket contains both a blob and a "directory" prefix sharing the same name (a layout cloud object stores allow but local filesystems do not). The internal `_build_subtree` now merges file and directory entries at the same path instead of overwriting one with the other. (Issue [#431](https://github.com/drivendataorg/cloudpathlib/issues/431))
 
 ## v0.24.0 (2026-04-29)
 - Added support for S3 Multi-Region Access Point (MRAP) URLs in `S3Path` (Issue [#556](https://github.com/drivendataorg/cloudpathlib/issues/556), PR [#557](https://github.com/drivendataorg/cloudpathlib/pull/557))
diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py
index 01306fa7..41492b24 100644
--- a/cloudpathlib/cloudpath.py
+++ b/cloudpathlib/cloudpath.py
@@ -488,16 +488,30 @@ def _glob_checks(self, pattern: Union[str, os.PathLike]) -> str:
     def _build_subtree(self, recursive):
         # build a tree structure for all files out of default dicts
         Tree: Callable = lambda: defaultdict(Tree)
+        _MISSING = object()
 
         def _build_tree(trunk, branch, nodes, is_dir):
             """Utility to build a tree from nested defaultdicts with a generator
             of nodes (parts) of a path."""
             next_branch = next(nodes, None)
 
+            existing = trunk.get(branch) if branch in trunk else _MISSING
+
             if next_branch is None:
-                trunk[branch] = Tree() if is_dir else None  # leaf node
+                # Leaf node. Cloud object stores can have a blob and a
+                # "directory" with the same name; if we've already recorded a
+                # subtree here, keep it (it represents descendants we still
+                # need to walk). Otherwise set the marker for file/dir.
+                if existing is _MISSING or not isinstance(existing, defaultdict):
+                    trunk[branch] = Tree() if is_dir else None
 
             else:
+                # Recursing into a deeper level: if we previously recorded a
+                # file leaf (None) at this branch, promote it to a subtree so
+                # descendants are reachable. If a subtree already exists,
+                # reuse it rather than overwriting.
+                if existing is None:
+                    trunk[branch] = Tree()
                 _build_tree(trunk[branch], next_branch, nodes, is_dir)
 
         file_tree = Tree()
diff --git a/tests/test_cloudpath_file_io.py b/tests/test_cloudpath_file_io.py
index 04519557..e8d649ba 100644
--- a/tests/test_cloudpath_file_io.py
+++ b/tests/test_cloudpath_file_io.py
@@ -472,6 +472,31 @@ def test_glob_many_open_files(rig):
             assert next(it) == p
 
 
+def test_rglob_file_and_dir_same_name(rig, monkeypatch):
+    """Regression test for #431: a blob and a "directory" can share the same
+    name in cloud storage. Recursive globbing must not error in that case."""
+    base = rig.create_cloud_path("")
+
+    # Simulate _list_dir returning both a blob and a "directory" with the
+    # same name "output". Cloud object stores allow this; local filesystems
+    # do not, which is why we patch _list_dir instead of writing files.
+    fake_entries = [
+        (base / "output", False),
+        (base / "output" / "13655" / "0" / "file1.json", False),
+        (base / "output" / "13655" / "0", True),
+        (base / "output" / "13655", True),
+    ]
+
+    def fake_list_dir(self, cloud_path, recursive=False):
+        return iter(fake_entries)
+
+    monkeypatch.setattr(rig.client_class, "_list_dir", fake_list_dir)
+
+    results = list(base.rglob("*"))
+    paths = {str(p) for p in results}
+    assert str(base / "output" / "13655" / "0" / "file1.json") in paths
+
+
 def test_glob_exceptions(rig):
     cp = rig.create_cloud_path("dir_0/")