apify · vdusek · Jun 30, 2026 · Jun 30, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/docs/examples/code_examples/respect_robots_on_skipped_request.py b/docs/examples/code_examples/respect_robots_on_skipped_request.py
@@ -1,6 +1,6 @@
 import asyncio
 
-from crawlee import SkippedReason
+from crawlee import Request, SkippedReason
 from crawlee.crawlers import (
     BeautifulSoupCrawler,
     BeautifulSoupCrawlingContext,
@@ -18,7 +18,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     # highlight-start
     # This handler is called when a request is skipped
     @crawler.on_skipped_request
-    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
+    async def skipped_request_handler(request: Request, reason: SkippedReason) -> None:
+        url = request.url
         # Check if the request was skipped due to robots.txt rules
         if reason == 'robots_txt':
             crawler.log.info(f'Skipped {url} due to robots.txt rules.')

diff --git a/src/crawlee/_utils/requests.py b/src/crawlee/_utils/requests.py
@@ -3,11 +3,15 @@
 from logging import getLogger
 from typing import TYPE_CHECKING
 
+from pydantic import ValidationError
 from yarl import URL
 
 from crawlee._utils.crypto import compute_short_hash
 
 if TYPE_CHECKING:
+    from logging import Logger
+
+    from crawlee._request import Request, RequestOptions
     from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
 
 logger = getLogger(__name__)
@@ -110,6 +114,34 @@ def compute_unique_key(
     return normalized_url
 
 
+def create_request_from_options(request_options: RequestOptions, logger: Logger | None = None) -> Request | None:
+    """Build a `Request` from `RequestOptions`, returning `None` if validation fails.
+
+    Calls `Request.from_url(**request_options)` and turns a pydantic `ValidationError` (the log message
+    attributes it to a malformed URL or an unsupported URL scheme) into a `None` result instead of
+    raising. The failure is reported at the debug level only when `logger` is given; otherwise silently.
+
+    Args:
+        request_options: The keyword arguments forwarded to `Request.from_url`.
+        logger: An optional logger for the skipped URL; when `None`, nothing is logged.
+
+    Returns:
+        The created `Request`, or `None` if `Request.from_url` raised a `ValidationError`.
+    """
+    # Imported lazily to avoid a circular import (`crawlee._request` imports from this module).
+    from crawlee._request import Request  # noqa: PLC0415
+
+    try:
+        return Request.from_url(**request_options)
+    except ValidationError as exc:
+        if logger is not None:
+            logger.debug(
+                f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. '
+                'This may be caused by a malformed URL or an unsupported URL scheme.'
+            )
+        return None
+
+
 def _get_payload_hash(payload: HttpPayload | None) -> str:
     payload_in_bytes = b'' if payload is None else payload
     return compute_short_hash(payload_in_bytes)

diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -7,11 +7,11 @@
 from typing import TYPE_CHECKING, Generic
 
 from more_itertools import partition
-from pydantic import ValidationError
 from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.requests import create_request_from_options
 from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -206,6 +206,7 @@ async def extract_links(
             **kwargs: Unpack[EnqueueLinksKwargs],
         ) -> list[Request]:
             requests = list[Request]()
+            skipped = list[Request]()
 
             base_user_data = user_data or {}
 
@@ -214,6 +215,25 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
             strategy = kwargs.get('strategy', 'same-hostname')
 
+            def to_request(url: str, *, apply_transform: bool = True) -> Request | None:
+                """Build a `Request` from one extracted URL; `None` when skipped by the transform or invalid.
+
+                Robots-skipped URLs are built with `apply_transform=False`, so a transform returning
+                `'skip'` cannot hide them from the skipped-request callback.
+                """
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )
+
+                if apply_transform and transform_request_function:
+                    transform_request_options = transform_request_function(request_options)
+                    if transform_request_options == 'skip':
+                        return None
+                    if transform_request_options != 'unchanged':
+                        request_options = transform_request_options
+
+                return create_request_from_options(request_options, context.log)
+
             links_iterator: Iterator[str] = iter(
                 self._parser.find_links(parsed_content, selector=selector, attribute=attribute)
             )
@@ -227,34 +247,19 @@ async def extract_links(
             )
             links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
+            # Robots-disallowed requests go to the skipped-request callback (without the transform, see
+            # `to_request`); the rest continue to the enqueue filter.
             if robots_txt_file:
-                skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
-            else:
-                skipped = iter([])
+                skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
+                for url in skipped_iterator:
+                    request = to_request(url, apply_transform=False)
+                    if request is not None:
+                        skipped.append(request)
 
             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_options = RequestOptions(
-                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
-                )
-
-                if transform_request_function:
-                    transform_request_options = transform_request_function(request_options)
-                    if transform_request_options == 'skip':
-                        continue
-                    if transform_request_options != 'unchanged':
-                        request_options = transform_request_options
-
-                try:
-                    request = Request.from_url(**request_options)
-                except ValidationError as exc:
-                    context.log.debug(
-                        f'Skipping URL "{url}" due to invalid format: {exc}. '
-                        'This may be caused by a malformed URL or unsupported URL scheme. '
-                        'Please ensure the URL is correct and retry.'
-                    )
-                    continue
-
-                requests.append(request)
+                request = to_request(url)
+                if request is not None:
+                    requests.append(request)
 
             skipped_tasks = [
                 asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -2,8 +2,10 @@
 
 from __future__ import annotations
 
+import ast
 import asyncio
 import functools
+import inspect
 import logging
 import signal
 import sys
@@ -17,7 +19,7 @@
 from http import HTTPStatus
 from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast, get_args
 from weakref import WeakKeyDictionary
 
 from cachetools import LRUCache
@@ -110,7 +112,58 @@
 
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
-SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
+SkippedRequestCallback = (
+    Callable[[str, SkippedReason], Awaitable[None]] | Callable[[Request, SkippedReason], Awaitable[None]]
+)
+"""A skipped-request callback receives either the URL `str` or the full `Request`.
+
+For backward compatibility, callbacks whose first parameter is annotated as `str` (or is unannotated)
+receive `request.url`; callbacks that annotate it as `Request` receive the `Request` object. See
+`_skipped_request_callback_expects_request`.
+"""
+
+
+def _skipped_request_callback_expects_request(callback: Callable[..., Awaitable[None]]) -> bool:
+    """Whether a skipped-request callback wants the full `Request` rather than the URL string.
+
+    Only the first parameter's annotation is inspected. `Request`, or a type whose `typing.get_args`
+    include it (e.g. `Request | None`), selects the `Request` object; `str`, no annotation, no parameters
+    or an uninspectable callable keep the legacy `(url, reason)` form. String annotations (PEP 563, or a
+    `TYPE_CHECKING`-only `Request` import) are matched by name via `_annotation_names_request`.
+    """
+    try:
+        parameters = list(inspect.signature(callback).parameters.values())
+    except (TypeError, ValueError):  # Uninspectable callable falls back to the `str` form.
+        return False
+
+    if not parameters:
+        return False
+
+    annotation = parameters[0].annotation
+
+    if annotation is inspect.Parameter.empty:
+        return False
+
+    # A string annotation may not resolve to the class (e.g. a `TYPE_CHECKING`-only import), so match by name.
+    if isinstance(annotation, str):
+        return _annotation_names_request(annotation)
+
+    # A resolved annotation: match `Request` itself or any of its type arguments (e.g. a union member).
+    return annotation is Request or Request in get_args(annotation)
+
+
+def _annotation_names_request(annotation: str) -> bool:
+    """Whether a string annotation contains the exact name `Request` (e.g. `Request | None`, `crawlee.Request`)."""
+    try:
+        tree = ast.parse(annotation, mode='eval')
+    except SyntaxError:
+        return False
+
+    return any(
+        (isinstance(node, ast.Name) and node.id == Request.__name__)
+        or (isinstance(node, ast.Attribute) and node.attr == Request.__name__)
+        for node in ast.walk(tree)
+    )
 
 
 class _BasicCrawlerOptions(TypedDict):
@@ -417,6 +470,7 @@ def __init__(
         self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
         self._on_skipped_request: SkippedRequestCallback | None = None
+        self._on_skipped_request_expects_request = False
         self._abort_on_error = abort_on_error
 
         # Crawler callbacks
@@ -678,8 +732,13 @@ def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequest
         """Register a function to handle skipped requests.
 
         The skipped request handler is invoked when a request is skipped due to a collision or other reasons.
+
+        The callback receives either the request URL as a `str` or the full `Request` object, depending on
+        how its first parameter is annotated. Annotate it as `Request` to access request metadata such as
+        `user_data`; a `str` annotation (or no annotation) keeps the original URL-only behavior.
         """
         self._on_skipped_request = callback
+        self._on_skipped_request_expects_request = _skipped_request_callback_expects_request(callback)
         return callback
 
     async def run(
@@ -826,12 +885,14 @@ async def add_requests(
             wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
             wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
         """
-        allowed_requests = []
-        skipped = []
-
-        for request in requests:
-            check_url = request.url if isinstance(request, Request) else request
-            if await self._is_allowed_based_on_robots_txt_file(check_url):
+        allowed_requests: list[Request] = []
+        skipped: list[Request] = []
+
+        for original in requests:
+            # Normalize `str` URLs to `Request` once, so robots-skipped items always reach the
+            # skipped-request callback as a `Request` (see `_handle_skipped_request`).
+            request = original if isinstance(original, Request) else Request.from_url(original)
+            if await self._is_allowed_based_on_robots_txt_file(request.url):
                 allowed_requests.append(request)
             else:
                 skipped.append(request)
@@ -1210,17 +1271,19 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
                 raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e
 
     async def _handle_skipped_request(
-        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
+        self, request: Request, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
-        if need_mark and isinstance(request, Request):
+        if need_mark:
             request.state = RequestState.SKIPPED
             await self._mark_request_as_handled(request)
 
-        url = request.url if isinstance(request, Request) else request
-
-        if self._on_skipped_request:
+        if self._on_skipped_request is not None:
+            # Dispatch on the flag computed in `on_skipped_request`: hand over the full `Request` or only its
+            # URL. The cast widens the first parameter because the stored callback may be of either form.
+            callback = cast('Callable[[str | Request, SkippedReason], Awaitable[None]]', self._on_skipped_request)
+            argument: str | Request = request if self._on_skipped_request_expects_request else request.url
             try:
-                await self._on_skipped_request(url, reason)
+                await callback(argument, reason)
             except Exception as e:
                 raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e