
Commit 6f4d940

ordering
1 parent 3fbbdd8 commit 6f4d940

2 files changed

Lines changed: 112 additions & 109 deletions


src/crawlee/_utils/robots.py

Lines changed: 12 additions & 12 deletions
@@ -45,18 +45,6 @@ async def from_content(cls, url: str, content: str) -> Self:
        robots = Protego.parse(content)
        return cls(url, robots)

-    @classmethod
-    async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
-        """Determine the location of a robots.txt file for a URL and fetch it.
-
-        Args:
-            url: The URL whose domain will be used to find the corresponding robots.txt file.
-            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
-            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
-        """
-        robots_url = URL(url).with_path('/robots.txt')
-        return await cls.load(str(robots_url), http_client, proxy_info)
-
    @classmethod
    async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Load the robots.txt file for a given URL.
@@ -83,6 +71,18 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N

        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

+    @classmethod
+    async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
+        """Determine the location of a robots.txt file for a URL and fetch it.
+
+        Args:
+            url: The URL whose domain will be used to find the corresponding robots.txt file.
+            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
+            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
+        """
+        robots_url = URL(url).with_path('/robots.txt')
+        return await cls.load(str(robots_url), http_client, proxy_info)
+
    def is_allowed(self, url: str, user_agent: str = '*') -> bool:
        """Check if the given URL is allowed for the given user agent.
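In robots.py the only change is the move: `find` now sits after `load`, and its behavior is unchanged; it rewrites the URL's path to `/robots.txt` and delegates to `load`. A minimal usage sketch, assuming the class is exported as `RobotsTxtFile` from `crawlee._utils.robots` and that `HttpxHttpClient` from `crawlee.http_clients` is an acceptable `HttpClient` implementation (both import paths are assumptions, not shown in this diff):

import asyncio

from crawlee._utils.robots import RobotsTxtFile  # assumed class name and module path
from crawlee.http_clients import HttpxHttpClient  # assumed concrete HttpClient


async def main() -> None:
    # find() rewrites the page URL's path to '/robots.txt' and delegates to load(),
    # which fetches the file and parses it with Protego.
    robots = await RobotsTxtFile.find('https://example.com/some/page', HttpxHttpClient())
    print(robots.is_allowed('https://example.com/some/page'))


asyncio.run(main())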

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 100 additions & 97 deletions
@@ -168,6 +168,106 @@ def __init__(
        # Start background loading
        self._loading_task = asyncio.create_task(self._load_sitemaps())

+    async def __aenter__(self) -> SitemapRequestLoader:
+        """Enter the context manager."""
+        await self.start()
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        """Exit the context manager."""
+        await self.close()
+
+    @override
+    async def get_total_count(self) -> int:
+        """Return the total number of URLs found so far."""
+        state = await self._get_state()
+        return state.total_count
+
+    @override
+    async def get_handled_count(self) -> int:
+        """Return the number of URLs that have been handled."""
+        state = await self._get_state()
+        return state.handled_count
+
+    @override
+    async def is_empty(self) -> bool:
+        """Check if there are no more URLs to process."""
+        state = await self._get_state()
+        return not state.url_queue
+
+    @override
+    async def is_finished(self) -> bool:
+        """Check if all URLs have been processed."""
+        state = await self._get_state()
+        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()
+
+    @override
+    async def fetch_next_request(self) -> Request | None:
+        """Fetch the next request to process."""
+        while not (await self.is_finished()):
+            state = await self._get_state()
+            if not state.url_queue:
+                await asyncio.sleep(0.1)
+                continue
+
+            async with self._queue_lock:
+                # Double-check if the queue is still not empty after acquiring the lock
+                if not state.url_queue:
+                    continue
+
+                url = state.url_queue.popleft()
+                request_option = RequestOptions(url=url, enqueue_strategy=self._enqueue_strategy)
+
+                if len(state.url_queue) < self._max_buffer_size:
+                    self._queue_has_capacity.set()
+
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+
+                request = Request.from_url(**request_option)
+                state.in_progress.add(request.url)
+
+                return request
+
+        return None
+
+    @override
+    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
+        """Mark a request as successfully handled."""
+        state = await self._get_state()
+        if request.url in state.in_progress:
+            state.in_progress.remove(request.url)
+            state.handled_count += 1
+        return None
+
+    async def start(self) -> None:
+        """Start the sitemap loading process."""
+        if self._loading_task and not self._loading_task.done():
+            return
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
+
+    async def abort_loading(self) -> None:
+        """Abort the sitemap loading process."""
+        if self._loading_task and not self._loading_task.done():
+            self._loading_task.cancel()
+            with suppress(asyncio.CancelledError):
+                await self._loading_task
+
+    async def close(self) -> None:
+        """Close the request loader."""
+        await self.abort_loading()
+        await self._state.teardown()
+
    async def _get_state(self) -> SitemapRequestLoaderState:
        """Initialize and return the current state."""
@@ -310,100 +410,3 @@ async def _load_sitemaps(self) -> None:
        except Exception:
            logger.exception('Error loading sitemaps')
            raise
-
-    @override
-    async def get_total_count(self) -> int:
-        """Return the total number of URLs found so far."""
-        state = await self._get_state()
-        return state.total_count
-
-    @override
-    async def get_handled_count(self) -> int:
-        """Return the number of URLs that have been handled."""
-        state = await self._get_state()
-        return state.handled_count
-
-    @override
-    async def is_empty(self) -> bool:
-        """Check if there are no more URLs to process."""
-        state = await self._get_state()
-        return not state.url_queue
-
-    @override
-    async def is_finished(self) -> bool:
-        """Check if all URLs have been processed."""
-        state = await self._get_state()
-        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()
-
-    @override
-    async def fetch_next_request(self) -> Request | None:
-        """Fetch the next request to process."""
-        while not (await self.is_finished()):
-            state = await self._get_state()
-            if not state.url_queue:
-                await asyncio.sleep(0.1)
-                continue
-
-            async with self._queue_lock:
-                # Double-check if the queue is still not empty after acquiring the lock
-                if not state.url_queue:
-                    continue
-
-                url = state.url_queue.popleft()
-                request_option = RequestOptions(url=url, enqueue_strategy=self._enqueue_strategy)
-
-                if len(state.url_queue) < self._max_buffer_size:
-                    self._queue_has_capacity.set()
-
-                if self._transform_request_function:
-                    transform_request_option = self._transform_request_function(request_option)
-                    if transform_request_option == 'skip':
-                        state.total_count -= 1
-                        continue
-                    if transform_request_option != 'unchanged':
-                        request_option = transform_request_option
-
-                request = Request.from_url(**request_option)
-                state.in_progress.add(request.url)
-
-                return request
-
-        return None
-
-    @override
-    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
-        """Mark a request as successfully handled."""
-        state = await self._get_state()
-        if request.url in state.in_progress:
-            state.in_progress.remove(request.url)
-            state.handled_count += 1
-        return None
-
-    async def abort_loading(self) -> None:
-        """Abort the sitemap loading process."""
-        if self._loading_task and not self._loading_task.done():
-            self._loading_task.cancel()
-            with suppress(asyncio.CancelledError):
-                await self._loading_task
-
-    async def start(self) -> None:
-        """Start the sitemap loading process."""
-        if self._loading_task and not self._loading_task.done():
-            return
-        self._loading_task = asyncio.create_task(self._load_sitemaps())
-
-    async def close(self) -> None:
-        """Close the request loader."""
-        await self.abort_loading()
-        await self._state.teardown()
-
-    async def __aenter__(self) -> SitemapRequestLoader:
-        """Enter the context manager."""
-        await self.start()
-        return self
-
-    async def __aexit__(
-        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
-    ) -> None:
-        """Exit the context manager."""
-        await self.close()
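The moved methods are otherwise unchanged; the commit only reorders them so the public lifecycle and request-loader API sits above the private helpers `_get_state` and `_load_sitemaps`. For orientation, a minimal consumer sketch of that lifecycle, assuming `SitemapRequestLoader` is importable from `crawlee.request_loaders` and that its constructor accepts a list of sitemap URLs plus an `http_client` (both are assumptions, not shown in this diff):

import asyncio

from crawlee.http_clients import HttpxHttpClient  # assumed concrete HttpClient
from crawlee.request_loaders import SitemapRequestLoader  # assumed public import path


async def main() -> None:
    # __aenter__ calls start() and __aexit__ calls close(), so the background
    # _load_sitemaps() task is always cleaned up, even if the consumer loop raises.
    async with SitemapRequestLoader(
        ['https://example.com/sitemap.xml'],  # hypothetical sitemap URL
        http_client=HttpxHttpClient(),
    ) as loader:
        # fetch_next_request() polls until a URL is available and returns None
        # once the queue is drained and loading has finished.
        while (request := await loader.fetch_next_request()) is not None:
            print(request.url)  # stand-in for real request handling
            await loader.mark_request_as_handled(request)


asyncio.run(main())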
