From dd966b31c56624b0e918a3a96a2719db47cf6dea Mon Sep 17 00:00:00 2001 From: Federico Mengozzi <19249682+fedemengo@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:57:35 +0200 Subject: [PATCH 1/4] Add support for S3 Multi-Region Access Point (MRAP) URLs (#557) * Add MRAP URL support * test MRAP * update history * use walrus operator * cache bucket parsing * address pr review * fix linter issues * url escape `:` on win and cache * handle Windows drive in test assertion * make lint --- HISTORY.md | 1 + cloudpathlib/s3/s3path.py | 31 ++++++++- tests/mock_clients/mock_s3.py | 11 ++-- tests/test_s3_specific.py | 117 ++++++++++++++++++++++++++++++++++ 4 files changed, 152 insertions(+), 8 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9f0afac4..fa945f76 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,7 @@ # cloudpathlib Changelog ## UNRELEASED +- Added support for S3 Multi-Region Access Point (MRAP) URLs in `S3Path` (Issue [#556](https://github.com/drivendataorg/cloudpathlib/issues/556), PR [#557](https://github.com/drivendataorg/cloudpathlib/pull/557)) - Added support for Pydantic serialization (Issue [#537](https://github.com/drivendataorg/cloudpathlib/issues/537), PR [#538](https://github.com/drivendataorg/cloudpathlib/pull/538)) ## v0.23.0 (2025-10-07) diff --git a/cloudpathlib/s3/s3path.py b/cloudpathlib/s3/s3path.py index d01b63aa..118d08ae 100644 --- a/cloudpathlib/s3/s3path.py +++ b/cloudpathlib/s3/s3path.py @@ -1,4 +1,6 @@ import os +import re +import sys from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Optional, TYPE_CHECKING @@ -8,6 +10,10 @@ if TYPE_CHECKING: from .s3client import S3Client +_MRAP_PATTERN = re.compile( + r"^s3://(?P<arn>arn:aws:s3::\d{12}:accesspoint/[^/]+\.mrap)(?:/(?P<key>.*))?$" +) + @register_path_class("s3") class S3Path(CloudPath): @@ -26,6 +32,8 @@ class S3Path(CloudPath): cloud_prefix: str = "s3://" client: "S3Client" + _bucket: str + _local_path: Path @property def drive(self) -> str: @@ 
-74,7 +82,17 @@ def stat(self, follow_symlinks=True): @property def bucket(self) -> str: - return self._no_prefix.split("/", 1)[0] + """The bucket name, or the full MRAP ARN for MRAP paths. + + :type: :class:`str` + """ + if hasattr(self, "_bucket"): + return self._bucket + if match := _MRAP_PATTERN.match(str(self)): + self._bucket = match.group("arn") + else: + self._bucket = self._no_prefix.split("/", 1)[0] + return self._bucket @property def key(self) -> str: @@ -90,3 +108,14 @@ def key(self) -> str: @property def etag(self): return self.client._get_metadata(self).get("etag") + + @property + def _local(self) -> Path: + if hasattr(self, "_local_path"): + return self._local_path + no_prefix = self._no_prefix + # `:` is invalid in Windows paths; percent-encode it for MRAP ARNs + if sys.platform == "win32": + no_prefix = no_prefix.replace(":", "%3A") + self._local_path = self.client._local_cache_dir / no_prefix + return self._local_path diff --git a/tests/mock_clients/mock_s3.py b/tests/mock_clients/mock_s3.py index 9f75f950..7f2a7aaa 100644 --- a/tests/mock_clients/mock_s3.py +++ b/tests/mock_clients/mock_s3.py @@ -220,14 +220,11 @@ def list_buckets(self): return {"Buckets": [{"Name": DEFAULT_S3_BUCKET_NAME}]} def head_object(self, Bucket, Key, **kwargs): - if ( - not (self.root / Key).exists() - or (self.root / Key).is_dir() - or Bucket != DEFAULT_S3_BUCKET_NAME - ): + if not (self.root / Key).exists() or (self.root / Key).is_dir(): raise ClientError({}, {}) - else: - return {"key": Key} + if Bucket != DEFAULT_S3_BUCKET_NAME and ".mrap" not in Bucket: + raise ClientError({}, {}) + return {"key": Key} def generate_presigned_url(self, op: str, Params: dict, ExpiresIn: int): mock_presigned_url = f"https://{Params['Bucket']}.s3.amazonaws.com/{Params['Key']}?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=TEST%2FTEST%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240131T194721Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=TEST" diff --git 
a/tests/test_s3_specific.py b/tests/test_s3_specific.py index 58b2e21a..6cd3fbb3 100644 --- a/tests/test_s3_specific.py +++ b/tests/test_s3_specific.py @@ -290,3 +290,120 @@ def test_as_url_presign(s3_rig): assert "Signature" in query_params else: assert False, "Unknown presigned URL format" + + +_MRAP_ARN = "arn:aws:s3::123456789012:accesspoint/my-mrap.mrap" + + +def test_mrap_bucket_and_key(): + """MRAP paths return the full ARN as bucket and the path suffix as key.""" + # MRAP path without key + p = S3Path(f"s3://{_MRAP_ARN}") + assert p.bucket == _MRAP_ARN + assert p.key == "" + + # MRAP path with trailing slash + p2 = S3Path(f"s3://{_MRAP_ARN}/") + assert p2.bucket == _MRAP_ARN + assert p2.key == "" + + # MRAP path with a single key segment + p3 = S3Path(f"s3://{_MRAP_ARN}/file.txt") + assert p3.bucket == _MRAP_ARN + assert p3.key == "file.txt" + + # MRAP path with a nested key + p4 = S3Path(f"s3://{_MRAP_ARN}/folder/sub/file.txt") + assert p4.bucket == _MRAP_ARN + assert p4.key == "folder/sub/file.txt" + + # Regular S3 path is unaffected + p5 = S3Path("s3://my-bucket/folder/file.txt") + assert p5.bucket == "my-bucket" + assert p5.key == "folder/file.txt" + + # ARN-like strings that are NOT valid MRAPs fall back to normal bucket parsing + # (wrong account ID length, missing .mrap suffix) + p6 = S3Path("s3://arn:aws:s3::12345:accesspoint/x.mrap/key") + assert p6.bucket == "arn:aws:s3::12345:accesspoint" # treated as normal bucket + + p7 = S3Path("s3://arn:aws:s3::123456789012:accesspoint/notmrap/key") + assert p7.bucket == "arn:aws:s3::123456789012:accesspoint" # treated as normal bucket + + +def test_mrap_path_manipulation(): + """MRAP paths support standard path manipulation operations.""" + base = S3Path(f"s3://{_MRAP_ARN}") + + # Joining via / + child = base / "folder" / "file.txt" + assert str(child) == f"s3://{_MRAP_ARN}/folder/file.txt" + assert child.bucket == _MRAP_ARN + assert child.key == "folder/file.txt" + + # name, stem, suffix + assert child.name 
== "file.txt" + assert child.stem == "file" + assert child.suffix == ".txt" + + # parent preserves the MRAP ARN as bucket + parent = child.parent + assert str(parent) == f"s3://{_MRAP_ARN}/folder" + assert parent.bucket == _MRAP_ARN + assert parent.key == "folder" + + # with_name and with_suffix + assert str(child.with_name("other.csv")) == f"s3://{_MRAP_ARN}/folder/other.csv" + assert str(child.with_suffix(".csv")) == f"s3://{_MRAP_ARN}/folder/file.csv" + + # str / repr round-trip + url = f"s3://{_MRAP_ARN}/folder/file.txt" + assert str(S3Path(url)) == url + assert repr(S3Path(url)) == f"S3Path('{url}')" + + +def test_mrap_file_operations(s3_rig): + """MRAP paths work end-to-end with the mock S3 backend.""" + client = s3_rig.client_class() + base = f"s3://{_MRAP_ARN}/{s3_rig.test_dir}" + + # seeded file from test assets + existing = client.CloudPath(f"{base}/dir_0/file0_0.txt") + assert existing.exists() + assert existing.is_file() + assert not existing.is_dir() + assert client.CloudPath(f"{base}/dir_0").is_dir() + + # iterdir on the test_dir level: expects dir_0 and dir_1 + top_level = list(client.CloudPath(base).iterdir()) + assert len(top_level) == 2 + assert all(p.is_dir() for p in top_level) + assert {p.name for p in top_level} == {"dir_0", "dir_1"} + + # iterdir on dir_0: expects 3 files + dir0_contents = list(client.CloudPath(f"{base}/dir_0").iterdir()) + assert len(dir0_contents) == 3 + assert all(p.is_file() for p in dir0_contents) + + # write / read / delete + new_file = client.CloudPath(f"{base}/mrap_write_test.txt") + assert not new_file.exists() + new_file.write_text("hello from mrap") + assert new_file.exists() + assert new_file.read_text() == "hello from mrap" + assert new_file.bucket == _MRAP_ARN + new_file.unlink() + assert not new_file.exists() + + +def test_mrap_local_path_windows_encoding(monkeypatch, s3_rig): + """On Windows, colons in MRAP ARNs must be percent-encoded in the local cache path.""" + import cloudpathlib.s3.s3path as 
s3path_module + + monkeypatch.setattr(s3path_module.sys, "platform", "win32") + client = s3_rig.client_class() + p = client.CloudPath(f"s3://{_MRAP_ARN}/some/key.txt") + # strip drive (e.g. "C:") since it legitimately contains a colon on Windows + local_no_drive = str(p._local)[len(p._local.drive) :] + assert ":" not in local_no_drive, f"Colon found in local path on simulated Windows: {p._local}" + assert "%3A" in local_no_drive From f5e8d5d539edacfd03df5caacd5e161de9e7d5eb Mon Sep 17 00:00:00 2001 From: Peter Bull <1799186+pjbull@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:55:42 -0700 Subject: [PATCH 2/4] fix live tests with live mrap --- .env.example | 2 ++ tests/test_s3_specific.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index c081e8bc..5f76f0bd 100644 --- a/.env.example +++ b/.env.example @@ -22,6 +22,8 @@ CUSTOM_S3_SECRET_KEY=your_custom_s3_secret_key ## BUCKETS # Used when running live tests; you will need read/write access to these buckets to run those test LIVE_S3_BUCKET=a-bucket-you-can-access +LIVE_S3_MRAP_ARN=arn:aws:s3::ACCOUNT:accesspoint/MRAP + LIVE_AZURE_CONTAINER=a-container-you-can-access diff --git a/tests/test_s3_specific.py b/tests/test_s3_specific.py index 6cd3fbb3..b32ff538 100644 --- a/tests/test_s3_specific.py +++ b/tests/test_s3_specific.py @@ -1,5 +1,6 @@ from concurrent.futures import ProcessPoolExecutor from itertools import islice +import os from time import sleep import time @@ -363,9 +364,19 @@ def test_mrap_path_manipulation(): def test_mrap_file_operations(s3_rig): - """MRAP paths work end-to-end with the mock S3 backend.""" + """MRAP paths work end-to-end with the mock S3 backend (or a real MRAP in live mode).""" + if s3_rig.live_server: + mrap_arn = os.getenv("LIVE_S3_MRAP_ARN") + if not mrap_arn: + pytest.skip( + "LIVE_S3_MRAP_ARN is not set; set it to the ARN of an MRAP " + "fronting LIVE_S3_BUCKET to run this test live." 
+ ) + else: + mrap_arn = _MRAP_ARN + client = s3_rig.client_class() - base = f"s3://{_MRAP_ARN}/{s3_rig.test_dir}" + base = f"s3://{mrap_arn}/{s3_rig.test_dir}" # seeded file from test assets existing = client.CloudPath(f"{base}/dir_0/file0_0.txt") @@ -391,7 +402,7 @@ def test_mrap_file_operations(s3_rig): new_file.write_text("hello from mrap") assert new_file.exists() assert new_file.read_text() == "hello from mrap" - assert new_file.bucket == _MRAP_ARN + assert new_file.bucket == mrap_arn new_file.unlink() assert not new_file.exists() From a74e865258eeb5818108994ba872aed363484307 Mon Sep 17 00:00:00 2001 From: Peter Bull <1799186+pjbull@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:56:01 -0700 Subject: [PATCH 3/4] prep for release --- HISTORY.md | 2 +- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index fa945f76..2c986f45 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,6 @@ # cloudpathlib Changelog -## UNRELEASED +## v0.24.0 (2026-04-29) - Added support for S3 Multi-Region Access Point (MRAP) URLs in `S3Path` (Issue [#556](https://github.com/drivendataorg/cloudpathlib/issues/556), PR [#557](https://github.com/drivendataorg/cloudpathlib/pull/557)) - Added support for Pydantic serialization (Issue [#537](https://github.com/drivendataorg/cloudpathlib/issues/537), PR [#538](https://github.com/drivendataorg/cloudpathlib/pull/538)) diff --git a/pyproject.toml b/pyproject.toml index 81f3d433..f1e6a503 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi" [project] name = "cloudpathlib" -version = "0.23.0" +version = "0.24.0" description = "pathlib-style classes for cloud storage services." 
readme = "README.md" authors = [{ name = "DrivenData", email = "info@drivendata.org" }] @@ -36,7 +36,7 @@ dependencies = [ [project.optional-dependencies] azure = ["azure-storage-blob>=12", "azure-storage-file-datalake>=12"] gs = ["google-cloud-storage"] -s3 = ["boto3>=1.34.0"] +s3 = ["boto3[crt]>=1.34.0"] all = ["cloudpathlib[azure]", "cloudpathlib[gs]", "cloudpathlib[s3]"] From b74ea34b0cb79ad818ec797381fb062e6f56b636 Mon Sep 17 00:00:00 2001 From: Peter Bull <1799186+pjbull@users.noreply.github.com> Date: Wed, 29 Apr 2026 14:10:39 -0700 Subject: [PATCH 4/4] Keep CRT optional --- pyproject.toml | 2 +- requirements-dev.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f1e6a503..6cf6995a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ [project.optional-dependencies] azure = ["azure-storage-blob>=12", "azure-storage-file-datalake>=12"] gs = ["google-cloud-storage"] -s3 = ["boto3[crt]>=1.34.0"] +s3 = ["boto3>=1.34.0"] all = ["cloudpathlib[azure]", "cloudpathlib[gs]", "cloudpathlib[s3]"] diff --git a/requirements-dev.txt b/requirements-dev.txt index a21e4bd6..4882a770 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,6 +2,7 @@ azure-identity black[jupyter]>=24.1.0;python_version>='3.8' +botocore[crt] # required for S3 Multi-Region Access Point (MRAP) tests build flake8 ipytest