# Storage classes accepted when no explicit `allowed_storage_classes` argument
# is supplied; blobs whose storage class is not in this list are filtered out.
DEFAULT_ALLOWED_CLASS = ["STANDARD"]
3030
3131
def remove_prefix(text: str, prefix: str) -> str:
    """Helper function that removes prefix from a string.

    Args:
        text: String of text to trim a prefix from.
        prefix: String of text that will be trimmed from text.

    Returns:
        Text value with the specified prefix removed. When text does not
        start with prefix (or prefix is empty), text is returned unchanged.
    """
    # Note that as of python 3.9 removeprefix is built into string; the manual
    # slice is kept so the helper also works on older interpreters.
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
46+
47+
3248class ListWorker (object ):
3349 """Worker that lists a range of objects from a GCS bucket.
3450
@@ -75,7 +91,7 @@ def __init__(
7591 client : storage .Client = None ,
7692 skip_compose : bool = True ,
7793 list_directory_objects : bool = False ,
78- prefix : str = None ,
94+ prefix : str = "" ,
7995 allowed_storage_classes : list [str ] = DEFAULT_ALLOWED_CLASS ,
8096 max_retries : int = 5 ,
8197 ):
@@ -98,7 +114,7 @@ def __init__(
98114 self .default_alph = "a"
99115 self .skip_compose = skip_compose
100116 self .list_directory_objects = list_directory_objects
101- self .prefix = prefix
117+ self .prefix = prefix if prefix else ""
102118 self .allowed_storage_classes = allowed_storage_classes
103119 self .api_call_count = 0
104120 self .max_retries = max_retries
@@ -163,8 +179,10 @@ def run(self) -> None:
163179 try :
164180 list_blob_args = {
165181 "max_results" : self .max_results ,
166- "start_offset" : self .start_range ,
167- "end_offset" : self .end_range ,
182+ "start_offset" : self .prefix + self .start_range ,
183+ "end_offset" : (
184+ "" if not self .end_range else self .prefix + self .end_range
185+ ),
168186 }
169187 if self .prefix :
170188 list_blob_args ["prefix" ] = self .prefix
@@ -184,7 +202,10 @@ def run(self) -> None:
184202 and blob .storage_class in self .allowed_storage_classes
185203 ):
186204 self .results .add ((blob .name , blob .size ))
187- self .start_range = blob .name
205+ # Remove the prefix from the name so that range calculations remain prefix-agnostic.
206+ # This is necessary due to the unbounded end-range when splitting string namespaces
207+ # of unknown size.
208+ self .start_range = remove_prefix (blob .name , self .prefix )
188209 if i == self .max_results :
189210 # Only allow work stealing when paging.
190211 has_results = True
@@ -237,7 +258,7 @@ def run_list_worker(
237258 end_range : str ,
238259 client : storage .Client = None ,
239260 skip_compose : bool = True ,
240- prefix : str = None ,
261+ prefix : str = "" ,
241262 allowed_storage_classes : list [str ] = DEFAULT_ALLOWED_CLASS ,
242263) -> None :
243264 """Helper function to execute a ListWorker.
@@ -253,7 +274,7 @@ def run_list_worker(
253274 unidle_queue: Multiprocessing queue pushed to when the worker has successfully stolen work.
254275 results_queue: Multiprocessing queue on which the worker pushes its listing results onto.
255276 metadata_queue: Multiprocessing queue on which the worker pushes tracking metadata.
256- start_range: Stirng start range worker will begin listing from.
277+ start_range: String start range worker will begin listing from.
257278 end_range: String end range worker will list until.
258279 client: The GCS storage client. When not provided, will be derived from background auth.
259280 skip_compose: When true, skip listing files with the composed object prefix.
@@ -303,7 +324,7 @@ def __init__(
303324 bucket : str ,
304325 sort_results : bool = False ,
305326 skip_compose : bool = True ,
306- prefix : str = None ,
327+ prefix : str = "" ,
307328 allowed_storage_classes : list [str ] = DEFAULT_ALLOWED_CLASS ,
308329 ):
309330 # The maximum number of threads utilized in the fast list operation.
0 commit comments