From 3bfba4032aa2a14aa6a324bbb4850f1322dddf23 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 30 Jun 2026 13:05:01 +0200 Subject: [PATCH 1/4] feat: provide Request instances in skipped request callbacks --- .../respect_robots_on_skipped_request.py | 6 +- src/crawlee/_utils/requests.py | 32 +++++++++ .../_abstract_http/_abstract_http_crawler.py | 53 ++++++++------- src/crawlee/crawlers/_basic/_basic_crawler.py | 67 +++++++++++++++---- .../_playwright/_playwright_crawler.py | 53 ++++++++------- tests/unit/_utils/test_requests.py | 26 ++++++- .../crawlers/_basic/test_basic_crawler.py | 48 ++++++++++++- .../test_beautifulsoup_crawler.py | 22 +++--- .../crawlers/_parsel/test_parsel_crawler.py | 22 +++--- .../_playwright/test_playwright_crawler.py | 22 +++--- 10 files changed, 254 insertions(+), 97 deletions(-) diff --git a/docs/examples/code_examples/respect_robots_on_skipped_request.py b/docs/examples/code_examples/respect_robots_on_skipped_request.py index 5c7eca173f..8b9cf7b18e 100644 --- a/docs/examples/code_examples/respect_robots_on_skipped_request.py +++ b/docs/examples/code_examples/respect_robots_on_skipped_request.py @@ -1,6 +1,6 @@ import asyncio -from crawlee import SkippedReason +from crawlee import Request, SkippedReason from crawlee.crawlers import ( BeautifulSoupCrawler, BeautifulSoupCrawlingContext, @@ -18,7 +18,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: # highlight-start # This handler is called when a request is skipped @crawler.on_skipped_request - async def skipped_request_handler(url: str, reason: SkippedReason) -> None: + async def skipped_request_handler(request: Request, reason: SkippedReason) -> None: + url = request.url + # Check if the request was skipped due to robots.txt rules if reason == 'robots_txt': crawler.log.info(f'Skipped {url} due to robots.txt rules.') diff --git a/src/crawlee/_utils/requests.py b/src/crawlee/_utils/requests.py index fa31d4621d..cb9a41b490 100644 --- a/src/crawlee/_utils/requests.py +++ b/src/crawlee/_utils/requests.py @@ -3,11 +3,15 @@ from logging import getLogger from typing import TYPE_CHECKING +from pydantic import ValidationError from yarl import URL from crawlee._utils.crypto import compute_short_hash if TYPE_CHECKING: + from logging import Logger + + from crawlee._request import Request, RequestOptions from crawlee._types import HttpHeaders, HttpMethod, HttpPayload logger = getLogger(__name__) @@ -110,6 +114,34 @@ def compute_unique_key( return normalized_url +def create_request_from_options(request_options: RequestOptions, logger: Logger | None = None) -> Request | None: + """Build a `Request` from `RequestOptions`, returning `None` if the URL is invalid. + + Shared by the crawlers' `extract_links` implementations to turn extracted URLs into `Request` + objects. A URL that fails validation (a malformed URL or an unsupported, non-`http(s)` scheme) is + logged at the debug level and skipped by returning `None`, rather than raising. + + Args: + request_options: The options passed to `Request.from_url`. + logger: An optional logger used to report a skipped, invalid URL. + + Returns: + The created `Request`, or `None` if the URL was invalid. + """ + # Imported lazily to avoid a circular import (`crawlee._request` imports from this module). + from crawlee._request import Request # noqa: PLC0415 + + try: + return Request.from_url(**request_options) + except ValidationError as exc: + if logger is not None: + logger.debug( + f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. ' + 'This may be caused by a malformed URL or an unsupported URL scheme.' + ) + return None + + def _get_payload_hash(payload: HttpPayload | None) -> str: payload_in_bytes = b'' if payload is None else payload return compute_short_hash(payload_in_bytes) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 8d15a1d801..6a67b851ef 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -7,11 +7,11 @@ from typing import TYPE_CHECKING, Generic from more_itertools import partition -from pydantic import ValidationError from typing_extensions import NotRequired, TypeVar from crawlee._request import Request, RequestOptions, RequestState from crawlee._utils.docs import docs_group +from crawlee._utils.requests import create_request_from_options from crawlee._utils.time import SharedTimeout from crawlee._utils.urls import to_absolute_url_iterator from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline @@ -206,6 +206,7 @@ async def extract_links( **kwargs: Unpack[EnqueueLinksKwargs], ) -> list[Request]: requests = list[Request]() + skipped = list[Request]() base_user_data = user_data or {} @@ -214,6 +215,21 @@ async def extract_links( kwargs.setdefault('strategy', 'same-hostname') strategy = kwargs.get('strategy', 'same-hostname') + def to_request(url: str) -> Request | None: + """Build a `Request` from a single extracted URL, applying the user-provided transform.""" + request_options = RequestOptions( + url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy + ) + + if transform_request_function: + transform_request_options = transform_request_function(request_options) + if transform_request_options == 'skip': + return None + if transform_request_options != 'unchanged': + request_options = transform_request_options + + return create_request_from_options(request_options, context.log) + links_iterator: Iterator[str] = iter( self._parser.find_links(parsed_content, selector=selector, attribute=attribute) ) @@ -227,34 +243,19 @@ async def extract_links( ) links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) + # Requests disallowed by robots.txt are reported to the skipped-request callback; the rest + # continue to the enqueue filter. Both paths go through `to_request` for consistent building. if robots_txt_file: - skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) - else: - skipped = iter([]) + skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) + for url in skipped_iterator: + request = to_request(url) + if request is not None: + skipped.append(request) for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs): - request_options = RequestOptions( - url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy - ) - - if transform_request_function: - transform_request_options = transform_request_function(request_options) - if transform_request_options == 'skip': - continue - if transform_request_options != 'unchanged': - request_options = transform_request_options - - try: - request = Request.from_url(**request_options) - except ValidationError as exc: - context.log.debug( - f'Skipping URL "{url}" due to invalid format: {exc}. ' - 'This may be caused by a malformed URL or unsupported URL scheme. ' - 'Please ensure the URL is correct and retry.' - ) - continue - - requests.append(request) + request = to_request(url) + if request is not None: + requests.append(request) skipped_tasks = [ asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index be3da6dd27..78e02ad82d 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -4,6 +4,7 @@ import asyncio import functools +import inspect import logging import signal import sys @@ -17,7 +18,7 @@ from http import HTTPStatus from io import StringIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast +from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast, get_type_hints from weakref import WeakKeyDictionary from cachetools import LRUCache @@ -110,7 +111,35 @@ ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]] FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]] -SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]] +SkippedRequestCallback = ( + Callable[[str, SkippedReason], Awaitable[None]] | Callable[[Request, SkippedReason], Awaitable[None]] +) +"""A skipped-request callback receives either the URL `str` or the full `Request`. + +For backward compatibility, callbacks whose first parameter is annotated as `str` (or is unannotated) +receive `request.url`; callbacks that annotate it as `Request` receive the `Request` object. See +`_skipped_request_callback_expects_request`. +""" + + +def _skipped_request_callback_expects_request(callback: Callable[..., Awaitable[None]]) -> bool: + """Whether a skipped-request callback wants the full `Request` rather than the URL string. + + The first parameter's resolved type annotation decides: a callback annotating it as `Request` + receives the `Request` object, while a callback annotating it as `str` (or leaving it unannotated) + receives `request.url`, preserving the original `(url: str, reason)` signature. Anything that + cannot be introspected falls back to the backward-compatible `str` form. + """ + try: + parameters = list(inspect.signature(callback).parameters.values()) + type_hints = get_type_hints(callback) + except Exception: # Any introspection failure falls back to the backward-compatible `str` form. + return False + + if not parameters: + return False + + return type_hints.get(parameters[0].name) is Request class _BasicCrawlerOptions(TypedDict): @@ -417,6 +446,7 @@ def __init__( self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None self._on_skipped_request: SkippedRequestCallback | None = None + self._on_skipped_request_expects_request = False self._abort_on_error = abort_on_error # Crawler callbacks @@ -678,8 +708,13 @@ def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequest """Register a function to handle skipped requests. The skipped request handler is invoked when a request is skipped due to a collision or other reasons. + + The callback receives either the request URL as a `str` or the full `Request` object, depending on + how its first parameter is annotated. Annotate it as `Request` to access request metadata such as + `user_data`; a `str` annotation (or no annotation) keeps the original URL-only behavior. """ self._on_skipped_request = callback + self._on_skipped_request_expects_request = _skipped_request_callback_expects_request(callback) return callback async def run( @@ -826,12 +861,14 @@ async def add_requests( wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning. wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added. """ - allowed_requests = [] - skipped = [] - - for request in requests: - check_url = request.url if isinstance(request, Request) else request - if await self._is_allowed_based_on_robots_txt_file(check_url): + allowed_requests: list[Request] = [] + skipped: list[Request] = [] + + for original in requests: + # Normalize `str` URLs to `Request` once, so robots-skipped items always reach the + # skipped-request callback as a `Request` (see `_handle_skipped_request`). + request = original if isinstance(original, Request) else Request.from_url(original) + if await self._is_allowed_based_on_robots_txt_file(request.url): allowed_requests.append(request) else: skipped.append(request) @@ -1210,17 +1247,19 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e async def _handle_skipped_request( - self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False + self, request: Request, reason: SkippedReason, *, need_mark: bool = False ) -> None: - if need_mark and isinstance(request, Request): + if need_mark: request.state = RequestState.SKIPPED await self._mark_request_as_handled(request) - url = request.url if isinstance(request, Request) else request - - if self._on_skipped_request: + if self._on_skipped_request is not None: + # Pass the full `Request` or just its URL, depending on how the callback annotated its first + # parameter (see `on_skipped_request`). The cast reflects that dual-dispatch contract. + callback = cast('Callable[[str | Request, SkippedReason], Awaitable[None]]', self._on_skipped_request) + argument: str | Request = request if self._on_skipped_request_expects_request else request.url try: - await self._on_skipped_request(url, reason) + await callback(argument, reason) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index bbbc03b1fa..5eee40c25d 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -9,13 +9,13 @@ import playwright.async_api from more_itertools import partition -from pydantic import ValidationError from typing_extensions import NotRequired, TypedDict, TypeVar from crawlee._request import Request, RequestOptions, RequestState from crawlee._types import BasicCrawlingContext, ConcurrencySettings from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee._utils.docs import docs_group +from crawlee._utils.requests import create_request_from_options from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.time import SharedTimeout from crawlee._utils.urls import to_absolute_url_iterator @@ -462,6 +462,7 @@ async def extract_links( The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function. """ requests = list[Request]() + skipped = list[Request]() base_user_data = user_data or {} @@ -470,6 +471,21 @@ async def extract_links( kwargs.setdefault('strategy', 'same-hostname') strategy = kwargs.get('strategy', 'same-hostname') + def to_request(url: str) -> Request | None: + """Build a `Request` from a single extracted URL, applying the user-provided transform.""" + request_options = RequestOptions( + url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy + ) + + if transform_request_function: + transform_request_options = transform_request_function(request_options) + if transform_request_options == 'skip': + return None + if transform_request_options != 'unchanged': + request_options = transform_request_options + + return create_request_from_options(request_options, context.log) + elements = await context.page.query_selector_all(selector) links_iterator: Iterator[str] = iter( [url for element in elements if (url := await element.get_attribute(attribute)) is not None] @@ -481,34 +497,19 @@ async def extract_links( links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) + # Requests disallowed by robots.txt are reported to the skipped-request callback; the rest + # continue to the enqueue filter. Both paths go through `to_request` for consistent building. if robots_txt_file: - skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) - else: - skipped = iter([]) + skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) + for url in skipped_iterator: + request = to_request(url) + if request is not None: + skipped.append(request) for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs): - request_options = RequestOptions( - url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy - ) - - if transform_request_function: - transform_request_options = transform_request_function(request_options) - if transform_request_options == 'skip': - continue - if transform_request_options != 'unchanged': - request_options = transform_request_options - - try: - request = Request.from_url(**request_options) - except ValidationError as exc: - context.log.debug( - f'Skipping URL "{url}" due to invalid format: {exc}. ' - 'This may be caused by a malformed URL or unsupported URL scheme. ' - 'Please ensure the URL is correct and retry.' - ) - continue - - requests.append(request) + request = to_request(url) + if request is not None: + requests.append(request) skipped_tasks = [ asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped diff --git a/tests/unit/_utils/test_requests.py b/tests/unit/_utils/test_requests.py index 8198909592..537fccd361 100644 --- a/tests/unit/_utils/test_requests.py +++ b/tests/unit/_utils/test_requests.py @@ -1,9 +1,12 @@ from __future__ import annotations +import logging + import pytest +from crawlee._request import RequestOptions from crawlee._types import HttpHeaders -from crawlee._utils.requests import compute_unique_key, normalize_url +from crawlee._utils.requests import compute_unique_key, create_request_from_options, normalize_url @pytest.mark.parametrize( @@ -132,3 +135,24 @@ def test_compute_unique_key_with_whitespace_in_headers() -> None: uk_2 = compute_unique_key(url, headers=headers_with_whitespaces, use_extended_unique_key=True) assert uk_2 == expected_output + + +def test_create_request_from_options_valid_url() -> None: + """A valid `http(s)` URL yields a `Request` carrying that URL.""" + request = create_request_from_options(RequestOptions(url='https://crawlee.dev')) + assert request is not None + assert request.url == 'https://crawlee.dev' + + +def test_create_request_from_options_invalid_url_returns_none() -> None: + """An unsupported, non-`http(s)` URL is dropped (returns `None`) instead of raising.""" + assert create_request_from_options(RequestOptions(url='ftp://crawlee.dev')) is None + + +def test_create_request_from_options_logs_invalid_url(caplog: pytest.LogCaptureFixture) -> None: + """When a logger is provided, an invalid URL is reported at the debug level.""" + logger = logging.getLogger('test_create_request_from_options') + with caplog.at_level(logging.DEBUG, logger=logger.name): + assert create_request_from_options(RequestOptions(url='ftp://crawlee.dev'), logger) is None + + assert any('ftp://crawlee.dev' in record.message for record in caplog.records) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 0391d65843..bacc8863a5 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -19,12 +19,13 @@ import pytest -from crawlee import ConcurrencySettings, Glob, service_locator +from crawlee import ConcurrencySettings, Glob, SkippedReason, service_locator from crawlee._request import Request, RequestState from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod from crawlee._utils.robots import RobotsTxtFile from crawlee.configuration import Configuration from crawlee.crawlers import BasicCrawler +from crawlee.crawlers._basic._basic_crawler import _skipped_request_callback_expects_request from crawlee.errors import RequestCollisionError, SessionError, UserDefinedErrorHandlerError from crawlee.events import Event, EventCrawlerStatusData, LocalEventManager from crawlee.request_loaders import RequestList, RequestManagerTandem, ThrottlingRequestManager @@ -1647,6 +1648,51 @@ async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None: assert spy.call_count == 1 +def test_skipped_request_callback_dispatch_by_annotation() -> None: + """The skipped-request callback receives the full `Request` only when its first parameter annotates it.""" + + async def expects_request(_request: Request, _reason: SkippedReason) -> None: ... + async def expects_url(_url: str, _reason: SkippedReason) -> None: ... + async def expects_url_unannotated(_url, _reason) -> None: ... # noqa: ANN001 + + assert _skipped_request_callback_expects_request(expects_request) is True + assert _skipped_request_callback_expects_request(expects_url) is False + assert _skipped_request_callback_expects_request(expects_url_unannotated) is False + + +async def test_add_requests_reports_disallowed_url_to_skipped_callback(server_url: URL) -> None: + """A bare `str` URL disallowed by robots.txt reaches the `add_requests` skipped callback as a `Request`.""" + crawler = BasicCrawler(respect_robots_txt_file=True) + skip = Mock() + + @crawler.on_skipped_request + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request) + + # `page_1` is disallowed by the test server's robots.txt; pass it as a plain string. + await crawler.add_requests([str(server_url / 'page_1')]) + + skipped_requests = [call.args[0] for call in skip.call_args_list] + assert all(isinstance(request, Request) for request in skipped_requests) + assert {request.url for request in skipped_requests} == {str(server_url / 'page_1')} + + +async def test_skipped_request_callback_receives_url_for_str_signature(server_url: URL) -> None: + """A callback whose first parameter is annotated as `str` keeps receiving the URL string (backward compatible).""" + crawler = BasicCrawler(respect_robots_txt_file=True) + skip = Mock() + + @crawler.on_skipped_request + async def skipped_hook(url: str, _reason: SkippedReason) -> None: + skip(url) + + await crawler.add_requests([str(server_url / 'page_1')]) + + skipped_urls = [call.args[0] for call in skip.call_args_list] + assert all(isinstance(url, str) for url in skipped_urls) + assert set(skipped_urls) == {str(server_url / 'page_1')} + + async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCaptureFixture) -> None: caplog.set_level(logging.INFO) crawler = BasicCrawler( diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 9a828b0078..4d8f603910 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -246,18 +246,22 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await context.enqueue_links() @crawler.on_skipped_request - async def skipped_hook(url: str, _reason: SkippedReason) -> None: - skip(url) + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request) await crawler.run([str(server_url / 'start_enqueue')]) - expected_skip_calls = [ - mock.call(str(server_url / 'page_1')), - mock.call(str(server_url / 'page_2')), - mock.call(str(server_url / 'page_3')), - mock.call(str(server_url / 'page_4')), - ] - skip.assert_has_calls(expected_skip_calls, any_order=True) + expected_skip_urls = { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), + str(server_url / 'page_4'), + } + + requests = [call.args[0] for call in skip.call_args_list] + + assert all(isinstance(request, Request) for request in requests) + assert {request.url for request in requests} == expected_skip_urls async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 02f5b61a86..56e70acdc6 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -330,18 +330,22 @@ async def request_handler(context: ParselCrawlingContext) -> None: await context.enqueue_links() @crawler.on_skipped_request - async def skipped_hook(url: str, _reason: SkippedReason) -> None: - skip(url) + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request) await crawler.run([str(server_url / 'start_enqueue')]) - expected_skip_calls = [ - mock.call(str(server_url / 'page_1')), - mock.call(str(server_url / 'page_2')), - mock.call(str(server_url / 'page_3')), - mock.call(str(server_url / 'page_4')), - ] - skip.assert_has_calls(expected_skip_calls, any_order=True) + expected_skip_urls = { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), + str(server_url / 'page_4'), + } + + requests = [call.args[0] for call in skip.call_args_list] + + assert all(isinstance(request, Request) for request in requests) + assert {request.url for request in requests} == expected_skip_urls async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 78d1789f99..f4208aa339 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -765,18 +765,22 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await context.enqueue_links() @crawler.on_skipped_request - async def skipped_hook(url: str, _reason: SkippedReason) -> None: - skip(url) + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request) await crawler.run([str(server_url / 'start_enqueue')]) - expected_skip_calls = [ - mock.call(str(server_url / 'page_1')), - mock.call(str(server_url / 'page_2')), - mock.call(str(server_url / 'page_3')), - mock.call(str(server_url / 'page_4')), - ] - skip.assert_has_calls(expected_skip_calls, any_order=True) + expected_skip_urls = { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), + str(server_url / 'page_4'), + } + + requests = [call.args[0] for call in skip.call_args_list] + + assert all(isinstance(request, Request) for request in requests) + assert {request.url for request in requests} == expected_skip_urls async def test_send_request(server_url: URL) -> None: From af58657b180f8da9dfba524b5e423ed1792bad73 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 30 Jun 2026 14:00:38 +0200 Subject: [PATCH 2/4] docs: remove stray blank line in robots.txt skipped request example --- docs/examples/code_examples/respect_robots_on_skipped_request.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/examples/code_examples/respect_robots_on_skipped_request.py b/docs/examples/code_examples/respect_robots_on_skipped_request.py index 8b9cf7b18e..277e6d90f3 100644 --- a/docs/examples/code_examples/respect_robots_on_skipped_request.py +++ b/docs/examples/code_examples/respect_robots_on_skipped_request.py @@ -20,7 +20,6 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: @crawler.on_skipped_request async def skipped_request_handler(request: Request, reason: SkippedReason) -> None: url = request.url - # Check if the request was skipped due to robots.txt rules if reason == 'robots_txt': crawler.log.info(f'Skipped {url} due to robots.txt rules.') From 564183cf4c4b743e6dd108eef40fba30f993f89c Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 1 Jul 2026 09:22:08 +0200 Subject: [PATCH 3/4] fix(crawlers): robust skipped-request callback dispatch and keep robots skips out of transform --- .../_abstract_http/_abstract_http_crawler.py | 17 +++++-- src/crawlee/crawlers/_basic/_basic_crawler.py | 47 +++++++++++++++---- .../_playwright/_playwright_crawler.py | 17 +++++-- .../_basic/_deferred_skipped_request_hook.py | 27 +++++++++++ .../crawlers/_basic/test_basic_crawler.py | 25 +++++++++- .../test_beautifulsoup_crawler.py | 32 +++++++++++++ .../_playwright/test_playwright_crawler.py | 30 ++++++++++++ 7 files changed, 176 insertions(+), 19 deletions(-) create mode 100644 tests/unit/crawlers/_basic/_deferred_skipped_request_hook.py diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 6a67b851ef..2cfe08721b 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -215,13 +215,19 @@ async def extract_links( kwargs.setdefault('strategy', 'same-hostname') strategy = kwargs.get('strategy', 'same-hostname') - def to_request(url: str) -> Request | None: - """Build a `Request` from a single extracted URL, applying the user-provided transform.""" + def to_request(url: str, *, apply_transform: bool = True) -> Request | None: + """Build a `Request` from a single extracted URL. + + `transform_request_function` is applied only to links that will actually be enqueued + (`apply_transform=True`); robots.txt-skipped links are reported to the skipped-request + callback verbatim, so a transform returning `'skip'` cannot hide a robots-blocked URL + from that audit. + """ request_options = RequestOptions( url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy ) - if transform_request_function: + if apply_transform and transform_request_function: transform_request_options = transform_request_function(request_options) if transform_request_options == 'skip': return None @@ -244,11 +250,12 @@ def to_request(url: str) -> Request | None: links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) # Requests disallowed by robots.txt are reported to the skipped-request callback; the rest - # continue to the enqueue filter. Both paths go through `to_request` for consistent building. + # continue to the enqueue filter. The transform shapes only the enqueued set, so it is not + # applied to skipped links (see `to_request`). if robots_txt_file: skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) for url in skipped_iterator: - request = to_request(url) + request = to_request(url, apply_transform=False) if request is not None: skipped.append(request) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 78e02ad82d..e7743e2cba 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -2,6 +2,7 @@ from __future__ import annotations +import ast import asyncio import functools import inspect @@ -18,7 +19,7 @@ from http import HTTPStatus from io import StringIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast, get_type_hints +from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast, get_args from weakref import WeakKeyDictionary from cachetools import LRUCache @@ -125,21 +126,51 @@ def _skipped_request_callback_expects_request(callback: Callable[..., Awaitable[None]]) -> bool: """Whether a skipped-request callback wants the full `Request` rather than the URL string. - The first parameter's resolved type annotation decides: a callback annotating it as `Request` - receives the `Request` object, while a callback annotating it as `str` (or leaving it unannotated) - receives `request.url`, preserving the original `(url: str, reason)` signature. Anything that - cannot be introspected falls back to the backward-compatible `str` form. + The first parameter's type annotation decides: a callback annotating it as `Request` (including a + union such as `Request | None`) receives the `Request` object, while a callback annotating it as + `str` (or leaving it unannotated) receives `request.url`, preserving the original `(url, reason)` + signature. + + Annotations are matched leniently. Under `from __future__ import annotations` the annotation is a + string, and a hook whose module imports `Request` only under `TYPE_CHECKING` (the idiomatic style) + cannot be resolved to the class at runtime. Rather than silently degrading such hooks to the `str` + form, string annotations are matched by inspecting their syntax for a bare `Request` name. """ try: parameters = list(inspect.signature(callback).parameters.values()) - type_hints = get_type_hints(callback) - except Exception: # Any introspection failure falls back to the backward-compatible `str` form. + except (TypeError, ValueError): # Uninspectable callable falls back to the backward-compatible `str` form. return False if not parameters: return False - return type_hints.get(parameters[0].name) is Request + annotation = parameters[0].annotation + + if annotation is inspect.Parameter.empty: + return False + + # A string annotation (PEP 563, or an explicitly quoted forward reference) may not resolve to the + # class when `Request` is a `TYPE_CHECKING`-only import, so match it by name instead. This handles + # unions like `Request | None` and `Optional[Request]` without misfiring on names like `RequestOptions`. + if isinstance(annotation, str): + return _annotation_names_request(annotation) + + # An already-resolved annotation (a class or a typing construct): match `Request` directly or inside a union. + return annotation is Request or Request in get_args(annotation) + + +def _annotation_names_request(annotation: str) -> bool: + """Whether a string type annotation references `Request` as a bare name (e.g. `Request`, `Request | None`).""" + try: + tree = ast.parse(annotation, mode='eval') + except SyntaxError: + return False + + return any( + (isinstance(node, ast.Name) and node.id == Request.__name__) + or (isinstance(node, ast.Attribute) and node.attr == Request.__name__) + for node in ast.walk(tree) + ) class _BasicCrawlerOptions(TypedDict): diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 5eee40c25d..33eb3bbad3 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -471,13 +471,19 @@ async def extract_links( kwargs.setdefault('strategy', 'same-hostname') strategy = kwargs.get('strategy', 'same-hostname') - def to_request(url: str) -> Request | None: - """Build a `Request` from a single extracted URL, applying the user-provided transform.""" + def to_request(url: str, *, apply_transform: bool = True) -> Request | None: + """Build a `Request` from a single extracted URL. + + `transform_request_function` is applied only to links that will actually be enqueued + (`apply_transform=True`); robots.txt-skipped links are reported to the skipped-request + callback verbatim, so a transform returning `'skip'` cannot hide a robots-blocked URL + from that audit. + """ request_options = RequestOptions( url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy ) - if transform_request_function: + if apply_transform and transform_request_function: transform_request_options = transform_request_function(request_options) if transform_request_options == 'skip': return None @@ -498,11 +504,12 @@ def to_request(url: str) -> Request | None: links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) # Requests disallowed by robots.txt are reported to the skipped-request callback; the rest - # continue to the enqueue filter. Both paths go through `to_request` for consistent building. + # continue to the enqueue filter. The transform shapes only the enqueued set, so it is not + # applied to skipped links (see `to_request`). if robots_txt_file: skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) for url in skipped_iterator: - request = to_request(url) + request = to_request(url, apply_transform=False) if request is not None: skipped.append(request) diff --git a/tests/unit/crawlers/_basic/_deferred_skipped_request_hook.py b/tests/unit/crawlers/_basic/_deferred_skipped_request_hook.py new file mode 100644 index 0000000000..ae32400677 --- /dev/null +++ b/tests/unit/crawlers/_basic/_deferred_skipped_request_hook.py @@ -0,0 +1,27 @@ +"""Skipped-request hooks written in the idiomatic deferred-annotation style. + +Used by `test_basic_crawler.py` to check that `_skipped_request_callback_expects_request` still +recognizes a `Request` annotation when the hook's module uses `from __future__ import annotations` +(PEP 563) and imports `Request` only under `TYPE_CHECKING`, so the name is not available at runtime. +The module is loaded by file path so it is never collected by pytest and does not rely on the test +package layout. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from crawlee import Request, SkippedReason + + +async def expects_request(_request: Request, _reason: SkippedReason) -> None: + """First parameter annotated `Request`, resolvable only under `TYPE_CHECKING`.""" + + +async def expects_optional_request(_request: Request | None, _reason: SkippedReason) -> None: + """First parameter annotated with a `Request | None` union.""" + + +async def expects_url(_url: str, _reason: SkippedReason) -> None: + """First parameter annotated `str` keeps the legacy URL-only behavior.""" diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index bacc8863a5..996a4cdac4 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -2,6 +2,7 @@ from __future__ import annotations import asyncio +import importlib.util import json import logging import os @@ -14,6 +15,7 @@ from dataclasses import dataclass from datetime import timedelta from itertools import product +from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, cast from unittest.mock import AsyncMock, Mock, call, patch @@ -37,7 +39,6 @@ if TYPE_CHECKING: from collections.abc import Callable, Sequence - from pathlib import Path from yarl import URL @@ -1660,6 +1661,28 @@ async def expects_url_unannotated(_url, _reason) -> None: ... # noqa: ANN001 assert _skipped_request_callback_expects_request(expects_url_unannotated) is False +def test_skipped_request_callback_dispatch_accepts_optional_and_deferred_request_annotation() -> None: + """Detection recognizes a `Request` annotation given as a union or resolvable only under `TYPE_CHECKING`.""" + + # A `Request | None` union still means "give me the Request". + async def expects_optional_request(_request: Request | None, _reason: SkippedReason) -> None: ... + + assert _skipped_request_callback_expects_request(expects_optional_request) is True + + # Hooks living in a module that imports `Request` only under `TYPE_CHECKING` (with PEP 563 deferred + # annotations, the style Crawlee itself uses) must not silently degrade to the URL-only form. + hook_path = Path(__file__).parent / '_deferred_skipped_request_hook.py' + spec = importlib.util.spec_from_file_location('_deferred_skipped_request_hook', hook_path) + assert spec is not None + assert spec.loader is not None + hooks = importlib.util.module_from_spec(spec) + spec.loader.exec_module(hooks) + + assert _skipped_request_callback_expects_request(hooks.expects_request) is True + assert _skipped_request_callback_expects_request(hooks.expects_optional_request) is True + assert _skipped_request_callback_expects_request(hooks.expects_url) is False + + async def test_add_requests_reports_disallowed_url_to_skipped_callback(server_url: URL) -> None: """A bare `str` URL disallowed by robots.txt reaches the `add_requests` skipped callback as a `Request`.""" crawler = BasicCrawler(respect_robots_txt_file=True) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 4d8f603910..f8186db45b 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -264,6 +264,38 @@ async def skipped_hook(request: Request, _reason: SkippedReason) -> None: assert {request.url for request in requests} == expected_skip_urls +async def test_on_skipped_request_reports_robots_url_despite_transform_skip( + server_url: URL, http_client: HttpClient +) -> None: + """A `transform_request_function` returning `'skip'` must not hide a robots-disallowed URL from the callback.""" + crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True) + skip = mock.Mock() + + def transform_request_function(request_options: RequestOptions) -> RequestOptions | RequestTransformAction: + # The user only means "do not enqueue page_3"; this must not affect robots.txt reporting. + if 'page_3' in request_options['url']: + return 'skip' + return 'unchanged' + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + await context.enqueue_links(transform_request_function=transform_request_function) + + @crawler.on_skipped_request + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request.url) + + await crawler.run([str(server_url / 'start_enqueue')]) + + reported = {call.args[0] for call in skip.call_args_list} + assert reported == { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), # robots-blocked; the transform skip must not hide it here + str(server_url / 'page_4'), + } + + async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) extracted_links: list[str] = [] diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index f4208aa339..d25e59a1e7 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -783,6 +783,36 @@ async def skipped_hook(request: Request, _reason: SkippedReason) -> None: assert {request.url for request in requests} == expected_skip_urls +async def test_on_skipped_request_reports_robots_url_despite_transform_skip(server_url: URL) -> None: + """A `transform_request_function` returning `'skip'` must not hide a robots-disallowed URL from the callback.""" + crawler = PlaywrightCrawler(respect_robots_txt_file=True) + skip = mock.Mock() + + def transform_request_function(request_options: RequestOptions) -> RequestOptions | RequestTransformAction: + # The user only means "do not enqueue page_3"; this must not affect robots.txt reporting. + if 'page_3' in request_options['url']: + return 'skip' + return 'unchanged' + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + await context.enqueue_links(transform_request_function=transform_request_function) + + @crawler.on_skipped_request + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request.url) + + await crawler.run([str(server_url / 'start_enqueue')]) + + reported = {call.args[0] for call in skip.call_args_list} + assert reported == { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), # robots-blocked; the transform skip must not hide it here + str(server_url / 'page_4'), + } + + async def test_send_request(server_url: URL) -> None: check_data: dict[str, Any] = {} From 608fbb6ff70c192520dde7da6ac850a4e7fec5ce Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 1 Jul 2026 09:31:33 +0200 Subject: [PATCH 4/4] docs: shorten skipped-request callback comments and drop helper module docstring --- .../_abstract_http/_abstract_http_crawler.py | 11 ++++----- src/crawlee/crawlers/_basic/_basic_crawler.py | 23 +++++++------------ .../_playwright/_playwright_crawler.py | 11 ++++----- .../_basic/_deferred_skipped_request_hook.py | 9 -------- .../crawlers/_basic/test_basic_crawler.py | 3 +-- 5 files changed, 17 insertions(+), 40 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 2cfe08721b..5dcd18d0be 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -218,10 +218,8 @@ async def extract_links( def to_request(url: str, *, apply_transform: bool = True) -> Request | None: """Build a `Request` from a single extracted URL. - `transform_request_function` is applied only to links that will actually be enqueued - (`apply_transform=True`); robots.txt-skipped links are reported to the skipped-request - callback verbatim, so a transform returning `'skip'` cannot hide a robots-blocked URL - from that audit. + The transform is applied only to enqueued links (`apply_transform=True`), so a + transform returning `'skip'` cannot hide a robots-skipped URL from the callback. """ request_options = RequestOptions( url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy @@ -249,9 +247,8 @@ def to_request(url: str, *, apply_transform: bool = True) -> Request | None: ) links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) - # Requests disallowed by robots.txt are reported to the skipped-request callback; the rest - # continue to the enqueue filter. The transform shapes only the enqueued set, so it is not - # applied to skipped links (see `to_request`). + # Robots-disallowed requests go to the skipped-request callback (without the transform, see + # `to_request`); the rest continue to the enqueue filter. if robots_txt_file: skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) for url in skipped_iterator: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index e7743e2cba..660f210817 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -126,19 +126,14 @@ def _skipped_request_callback_expects_request(callback: Callable[..., Awaitable[None]]) -> bool: """Whether a skipped-request callback wants the full `Request` rather than the URL string. - The first parameter's type annotation decides: a callback annotating it as `Request` (including a - union such as `Request | None`) receives the `Request` object, while a callback annotating it as - `str` (or leaving it unannotated) receives `request.url`, preserving the original `(url, reason)` - signature. - - Annotations are matched leniently. Under `from __future__ import annotations` the annotation is a - string, and a hook whose module imports `Request` only under `TYPE_CHECKING` (the idiomatic style) - cannot be resolved to the class at runtime. Rather than silently degrading such hooks to the `str` - form, string annotations are matched by inspecting their syntax for a bare `Request` name. + The first parameter's annotation decides: `Request` (or a union such as `Request | None`) gets the + `Request` object; `str` or no annotation keeps the legacy `(url, reason)` signature. String + annotations (PEP 563, or a `TYPE_CHECKING`-only `Request` import) are matched by name so such hooks + don't silently degrade to the `str` form. """ try: parameters = list(inspect.signature(callback).parameters.values()) - except (TypeError, ValueError): # Uninspectable callable falls back to the backward-compatible `str` form. + except (TypeError, ValueError): # Uninspectable callable falls back to the `str` form. return False if not parameters: @@ -149,18 +144,16 @@ def _skipped_request_callback_expects_request(callback: Callable[..., Awaitable[ if annotation is inspect.Parameter.empty: return False - # A string annotation (PEP 563, or an explicitly quoted forward reference) may not resolve to the - # class when `Request` is a `TYPE_CHECKING`-only import, so match it by name instead. This handles - # unions like `Request | None` and `Optional[Request]` without misfiring on names like `RequestOptions`. + # A string annotation may not resolve to the class (e.g. a `TYPE_CHECKING`-only import), so match by name. if isinstance(annotation, str): return _annotation_names_request(annotation) - # An already-resolved annotation (a class or a typing construct): match `Request` directly or inside a union. + # A resolved annotation: match `Request` directly or inside a union. return annotation is Request or Request in get_args(annotation) def _annotation_names_request(annotation: str) -> bool: - """Whether a string type annotation references `Request` as a bare name (e.g. `Request`, `Request | None`).""" + """Whether a string annotation names `Request` (e.g. `Request`, `Request | None`), not `RequestOptions`.""" try: tree = ast.parse(annotation, mode='eval') except SyntaxError: diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 33eb3bbad3..5bf2232661 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -474,10 +474,8 @@ async def extract_links( def to_request(url: str, *, apply_transform: bool = True) -> Request | None: """Build a `Request` from a single extracted URL. - `transform_request_function` is applied only to links that will actually be enqueued - (`apply_transform=True`); robots.txt-skipped links are reported to the skipped-request - callback verbatim, so a transform returning `'skip'` cannot hide a robots-blocked URL - from that audit. + The transform is applied only to enqueued links (`apply_transform=True`), so a + transform returning `'skip'` cannot hide a robots-skipped URL from the callback. """ request_options = RequestOptions( url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy @@ -503,9 +501,8 @@ def to_request(url: str, *, apply_transform: bool = True) -> Request | None: links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) - # Requests disallowed by robots.txt are reported to the skipped-request callback; the rest - # continue to the enqueue filter. The transform shapes only the enqueued set, so it is not - # applied to skipped links (see `to_request`). + # Robots-disallowed requests go to the skipped-request callback (without the transform, see + # `to_request`); the rest continue to the enqueue filter. if robots_txt_file: skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) for url in skipped_iterator: diff --git a/tests/unit/crawlers/_basic/_deferred_skipped_request_hook.py b/tests/unit/crawlers/_basic/_deferred_skipped_request_hook.py index ae32400677..448f749be5 100644 --- a/tests/unit/crawlers/_basic/_deferred_skipped_request_hook.py +++ b/tests/unit/crawlers/_basic/_deferred_skipped_request_hook.py @@ -1,12 +1,3 @@ -"""Skipped-request hooks written in the idiomatic deferred-annotation style. - -Used by `test_basic_crawler.py` to check that `_skipped_request_callback_expects_request` still -recognizes a `Request` annotation when the hook's module uses `from __future__ import annotations` -(PEP 563) and imports `Request` only under `TYPE_CHECKING`, so the name is not available at runtime. -The module is loaded by file path so it is never collected by pytest and does not rely on the test -package layout. -""" - from __future__ import annotations from typing import TYPE_CHECKING diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 996a4cdac4..a762a7c392 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1669,8 +1669,7 @@ async def expects_optional_request(_request: Request | None, _reason: SkippedRea assert _skipped_request_callback_expects_request(expects_optional_request) is True - # Hooks living in a module that imports `Request` only under `TYPE_CHECKING` (with PEP 563 deferred - # annotations, the style Crawlee itself uses) must not silently degrade to the URL-only form. + # Hooks in a module that imports `Request` only under `TYPE_CHECKING` must not degrade to the URL form. hook_path = Path(__file__).parent / '_deferred_skipped_request_hook.py' spec = importlib.util.spec_from_file_location('_deferred_skipped_request_hook', hook_path) assert spec is not None