diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 670ccaaf245..41308cd60c1 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -82,6 +82,32 @@ def __getattr__(name): ) +def _is_likely_uri(path): + """ + Check if a string looks like a URI (has a valid scheme followed by ':'). + + This is a Python port of the C++ ``IsLikelyUri()`` heuristic in + ``cpp/src/arrow/filesystem/path_util.cc``. It is intentionally + conservative: single-letter schemes are excluded (could be a Windows + drive letter) and the scheme must conform to RFC 3986. + """ + if not path or path[0] == '/': + return False + colon_pos = path.find(':') + if colon_pos < 0: + return False + # One-letter URI schemes don't officially exist; may be a Windows drive + if colon_pos < 2: + return False + # The largest IANA-registered scheme is 36 characters + if colon_pos > 36: + return False + scheme = path[:colon_pos] + if not scheme[0].isalpha(): + return False + return all(c.isalnum() or c in ('+', '-', '.') for c in scheme[1:]) + + def _ensure_filesystem(filesystem, *, use_mmap=False): if isinstance(filesystem, FileSystem): return filesystem @@ -174,13 +200,17 @@ def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False): filesystem, path = FileSystem.from_uri(path) except ValueError as e: msg = str(e) - if "empty scheme" in msg or "Cannot parse URI" in msg: - # neither an URI nor a locally existing path, so assume that - # local path was given and propagate a nicer file not found - # error instead of a more confusing scheme parsing error + if "empty scheme" in msg: + # No scheme at all — treat as a local path and propagate + # a nicer "file not found" error later. + pass + elif "Cannot parse URI" in msg and not _is_likely_uri(path): + # Path doesn't look like a URI (no valid scheme prefix), + # so treat it as a local path rather than surfacing a + # confusing URI-parsing error. pass else: - raise e + raise else: path = filesystem.normalize_path(path) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 376398baa07..537ceb99997 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1695,6 +1695,67 @@ def test_filesystem_from_path_object(path): assert path == p.resolve().absolute().as_posix() +def test_is_likely_uri(): + """Unit tests for the _is_likely_uri() heuristic.""" + from pyarrow.fs import _is_likely_uri + + # Valid URI schemes + assert _is_likely_uri("s3://bucket/key") + assert _is_likely_uri("gs://bucket/key") + assert _is_likely_uri("hdfs://host/path") + assert _is_likely_uri("file:///local/path") + assert _is_likely_uri("abfss://container@account/path") + assert _is_likely_uri("grpc+https://host:443") + + # Not URIs — local paths, Windows drives, empty, etc. + assert not _is_likely_uri("") + assert not _is_likely_uri("/absolute/path") + assert not _is_likely_uri("relative/path") + assert not _is_likely_uri("C:\\Users\\foo") # single-letter → drive + assert not _is_likely_uri("C:/Users/foo") + assert not _is_likely_uri("3bucket://key") # scheme starts with digit + assert not _is_likely_uri("-scheme://key") # scheme starts with dash + + +def test_resolve_filesystem_and_path_uri_with_spaces(): + """ + URIs with a recognised scheme but un-encoded spaces must raise + ValueError — NOT silently fall back to LocalFileSystem. + (GH-41365) + """ + from pyarrow.fs import _resolve_filesystem_and_path + + # S3 URI with spaces should raise, not return LocalFileSystem + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path("s3://bucket/path with space/file.parquet") + + # GCS URI with spaces should also raise + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path("gs://bucket/path with space/file.csv") + + # abfss URI with spaces + with pytest.raises(ValueError, match="Cannot parse URI"): + _resolve_filesystem_and_path( + "abfss://container@account/dir with space/file" + ) + + +def test_resolve_filesystem_and_path_local_with_spaces(): + """ + Local paths (no scheme) with spaces should still resolve to + LocalFileSystem — they must NOT be confused with malformed URIs. + """ + from pyarrow.fs import _resolve_filesystem_and_path + + # Absolute local path with spaces → LocalFileSystem + fs, path = _resolve_filesystem_and_path("/tmp/path with spaces/data") + assert isinstance(fs, LocalFileSystem) + + # Non-existent absolute path → still LocalFileSystem + fs, path = _resolve_filesystem_and_path("/nonexistent/path") + assert isinstance(fs, LocalFileSystem) + + @pytest.mark.s3 def test_filesystem_from_uri_s3(s3_server): from pyarrow.fs import S3FileSystem