|
60 | 60 | from pyarrow._s3fs import S3RetryStrategy |
61 | 61 | from pyarrow.fs import ( |
62 | 62 | FileInfo, |
| 63 | + FileSelector, |
63 | 64 | FileSystem, |
64 | 65 | FileType, |
65 | 66 | ) |
|
114 | 115 | S3_ROLE_SESSION_NAME, |
115 | 116 | S3_SECRET_ACCESS_KEY, |
116 | 117 | S3_SESSION_TOKEN, |
| 118 | + FileEntry, |
117 | 119 | FileIO, |
118 | 120 | InputFile, |
119 | 121 | InputStream, |
@@ -674,6 +676,36 @@ def delete(self, location: str | InputFile | OutputFile) -> None: |
674 | 676 | raise PermissionError(f"Cannot delete file, access denied: {location}") from e |
675 | 677 | raise # pragma: no cover - If some other kind of OSError, raise the raw error |
676 | 678 |
|
def list_prefix(self, location: str) -> Iterator[FileEntry]:
    """Yield a FileEntry for each regular file found beneath ``location``.

    The location is resolved to a filesystem and a filesystem-local path via
    ``parse_location`` and ``fs_by_scheme``, and that path is walked
    recursively. Entries whose type is not ``FileType.File`` are skipped.

    Args:
        location: Prefix to list, either a URI with a scheme or a plain path.

    Yields:
        One ``FileEntry`` per file, carrying:
          * ``location`` - the file path with the scheme of the input
            location put back in front of it (nothing is prepended when the
            input had no scheme);
          * ``size`` - the size reported by the filesystem, or 0 when that
            value is falsy;
          * ``last_modified`` - the ``mtime`` reported by the filesystem.
    """
    requested_scheme = urlparse(location).scheme
    scheme, netloc, path = self.parse_location(location, self.properties)
    filesystem = self.fs_by_scheme(scheme, netloc)
    # allow_not_found=True: presumably a missing prefix produces an empty
    # listing instead of an error - confirm against the pyarrow docs.
    selector = FileSelector(path, recursive=True, allow_not_found=True)

    # Only hdfs/viewfs need the authority re-attached and an absolute path.
    is_hadoop = requested_scheme in ("hdfs", "viewfs")
    if is_hadoop:
        prefix = f"{requested_scheme}://{netloc}"
    else:
        # Cloud filesystem paths from pyarrow already start with the
        # bucket/container, so only the scheme has to be restored; a
        # scheme-less location is returned exactly as pyarrow reports it.
        prefix = f"{requested_scheme}://" if requested_scheme else ""

    for entry in filesystem.get_file_info(selector):
        if entry.type == FileType.File:
            file_path = entry.path
            if is_hadoop and not file_path.startswith("/"):
                file_path = "/" + file_path
            yield FileEntry(
                location=f"{prefix}{file_path}",
                size=entry.size or 0,
                last_modified=entry.mtime,
            )
| 708 | + |
677 | 709 | def __getstate__(self) -> dict[str, Any]: |
678 | 710 | """Create a dictionary of the PyArrowFileIO fields used when pickling.""" |
679 | 711 | fileio_copy = copy(self.__dict__) |
|
0 commit comments