Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import asyncio

from crawlee import SkippedReason
from crawlee import Request, SkippedReason
from crawlee.crawlers import (
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
Expand All @@ -18,7 +18,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
# highlight-start
# This handler is called when a request is skipped
@crawler.on_skipped_request
async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
async def skipped_request_handler(request: Request, reason: SkippedReason) -> None:
url = request.url
# Check if the request was skipped due to robots.txt rules
if reason == 'robots_txt':
crawler.log.info(f'Skipped {url} due to robots.txt rules.')
Expand Down
32 changes: 32 additions & 0 deletions src/crawlee/_utils/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
from logging import getLogger
from typing import TYPE_CHECKING

from pydantic import ValidationError
from yarl import URL

from crawlee._utils.crypto import compute_short_hash

if TYPE_CHECKING:
from logging import Logger

from crawlee._request import Request, RequestOptions
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload

logger = getLogger(__name__)
Expand Down Expand Up @@ -110,6 +114,34 @@ def compute_unique_key(
return normalized_url


def create_request_from_options(request_options: RequestOptions, logger: Logger | None = None) -> Request | None:
    """Turn `RequestOptions` into a `Request`, yielding `None` instead of raising on an invalid URL.

    The options are forwarded as keyword arguments to `Request.from_url`. If that call raises a
    pydantic `ValidationError` (for example because the URL is malformed or uses an unsupported
    scheme), the error is reported at debug level on `logger`, when one is given, and `None` is
    returned so the caller can simply skip the URL.

    Args:
        request_options: Keyword options forwarded to `Request.from_url`.
        logger: Optional logger that receives the debug message about a skipped URL.

    Returns:
        The constructed `Request`, or `None` when validation of the options failed.
    """
    # Deferred import: `crawlee._request` imports from this module, so importing it at module
    # level would create a circular import.
    from crawlee._request import Request  # noqa: PLC0415

    request: Request | None
    try:
        request = Request.from_url(**request_options)
    except ValidationError as exc:
        # Validation failures are expected for scraped links; report and skip rather than raise.
        request = None
        if logger is not None:
            logger.debug(
                f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. '
                'This may be caused by a malformed URL or an unsupported URL scheme.'
            )

    return request


def _get_payload_hash(payload: HttpPayload | None) -> str:
    """Return the short hash of `payload`, hashing an empty byte string when no payload is given."""
    if payload is None:
        return compute_short_hash(b'')
    return compute_short_hash(payload)
Expand Down
57 changes: 31 additions & 26 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
from typing import TYPE_CHECKING, Generic

from more_itertools import partition
from pydantic import ValidationError
from typing_extensions import NotRequired, TypeVar

from crawlee._request import Request, RequestOptions, RequestState
from crawlee._utils.docs import docs_group
from crawlee._utils.requests import create_request_from_options
from crawlee._utils.time import SharedTimeout
from crawlee._utils.urls import to_absolute_url_iterator
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
Expand Down Expand Up @@ -206,6 +206,7 @@ async def extract_links(
**kwargs: Unpack[EnqueueLinksKwargs],
) -> list[Request]:
requests = list[Request]()
skipped = list[Request]()

base_user_data = user_data or {}

Expand All @@ -214,6 +215,25 @@ async def extract_links(
kwargs.setdefault('strategy', 'same-hostname')
strategy = kwargs.get('strategy', 'same-hostname')

def to_request(url: str, *, apply_transform: bool = True) -> Request | None:
    """Convert one extracted URL into a `Request`, or `None` when it should be dropped.

    The user-supplied transform runs only when `apply_transform` is true. Callers that pass
    `apply_transform=False` therefore always get a request built from the untransformed options,
    and a transform answering `'skip'` cannot suppress those URLs.
    """
    options = RequestOptions(
        url=url,
        user_data={**base_user_data},
        label=label,
        enqueue_strategy=strategy,
    )

    # No transform requested or none configured: build the request from the plain options.
    if not (apply_transform and transform_request_function):
        return create_request_from_options(options, context.log)

    outcome = transform_request_function(options)
    if outcome == 'skip':
        return None
    if outcome != 'unchanged':
        # Anything other than the two sentinel strings is a replacement set of options.
        options = outcome

    return create_request_from_options(options, context.log)

links_iterator: Iterator[str] = iter(
self._parser.find_links(parsed_content, selector=selector, attribute=attribute)
)
Expand All @@ -227,34 +247,19 @@ async def extract_links(
)
links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

# Robots-disallowed requests go to the skipped-request callback (without the transform, see
# `to_request`); the rest continue to the enqueue filter.
if robots_txt_file:
skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
else:
skipped = iter([])
skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
for url in skipped_iterator:
request = to_request(url, apply_transform=False)
if request is not None:
skipped.append(request)

for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
request_options = RequestOptions(
url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
)

if transform_request_function:
transform_request_options = transform_request_function(request_options)
if transform_request_options == 'skip':
continue
if transform_request_options != 'unchanged':
request_options = transform_request_options

try:
request = Request.from_url(**request_options)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue

requests.append(request)
request = to_request(url)
if request is not None:
requests.append(request)

skipped_tasks = [
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
Expand Down
91 changes: 77 additions & 14 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

from __future__ import annotations

import ast
import asyncio
import functools
import inspect
import logging
import signal
import sys
Expand All @@ -17,7 +19,7 @@
from http import HTTPStatus
from io import StringIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast, get_args
from weakref import WeakKeyDictionary

from cachetools import LRUCache
Expand Down Expand Up @@ -110,7 +112,58 @@

ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
SkippedRequestCallback = (
Callable[[str, SkippedReason], Awaitable[None]] | Callable[[Request, SkippedReason], Awaitable[None]]
)
"""A skipped-request callback receives either the URL `str` or the full `Request`.

For backward compatibility, callbacks whose first parameter is annotated as `str` (or is unannotated)
receive `request.url`; callbacks that annotate it as `Request` receive the `Request` object. See
`_skipped_request_callback_expects_request`.
"""


def _skipped_request_callback_expects_request(callback: Callable[..., Awaitable[None]]) -> bool:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can wait for v2 release and make a breaking change with a clear signature to avoid this kind of fragile runtime inspection.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed

"""Whether a skipped-request callback wants the full `Request` rather than the URL string.

The first parameter's annotation decides: `Request` (or a union such as `Request | None`) gets the
`Request` object; `str` or no annotation keeps the legacy `(url, reason)` signature. String
annotations (PEP 563, or a `TYPE_CHECKING`-only `Request` import) are matched by name so such hooks
don't silently degrade to the `str` form.
"""
try:
parameters = list(inspect.signature(callback).parameters.values())
except (TypeError, ValueError): # Uninspectable callable falls back to the `str` form.
return False

if not parameters:
return False

annotation = parameters[0].annotation

if annotation is inspect.Parameter.empty:
return False

# A string annotation may not resolve to the class (e.g. a `TYPE_CHECKING`-only import), so match by name.
if isinstance(annotation, str):
return _annotation_names_request(annotation)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm not mistaken, this won't match when the annotation uses an import alias under TYPE_CHECKING:

if TYPE_CHECKING:
    from crawlee import Request as CrawleeRequest

async def skipped_hook(request: CrawleeRequest, _reason: SkippedReason) -> None:
    pass


# A resolved annotation: match `Request` directly or inside a union.
return annotation is Request or Request in get_args(annotation)


def _annotation_names_request(annotation: str) -> bool:
    """Tell whether a string annotation mentions the `Request` class by name.

    The annotation is parsed as an expression and every node of the resulting tree is inspected.
    A bare name (such as `Request` or the `Request` in `Request | None`) or an attribute access whose
    final component equals the name of the `Request` class counts as a match. The comparison is an
    exact identifier match, so a longer name such as `RequestOptions` does not match. An annotation
    that cannot be parsed as an expression yields `False`.
    """
    try:
        parsed = ast.parse(annotation, mode='eval')
    except SyntaxError:
        return False

    target = Request.__name__
    for node in ast.walk(parsed):
        if isinstance(node, ast.Name) and node.id == target:
            return True
        if isinstance(node, ast.Attribute) and node.attr == target:
            return True

    return False


class _BasicCrawlerOptions(TypedDict):
Expand Down Expand Up @@ -417,6 +470,7 @@ def __init__(
self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
self._on_skipped_request: SkippedRequestCallback | None = None
self._on_skipped_request_expects_request = False
self._abort_on_error = abort_on_error

# Crawler callbacks
Expand Down Expand Up @@ -678,8 +732,13 @@ def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequest
"""Register a function to handle skipped requests.

The skipped request handler is invoked when a request is skipped due to a collision or other reasons.

The callback receives either the request URL as a `str` or the full `Request` object, depending on
how its first parameter is annotated. Annotate it as `Request` to access request metadata such as
`user_data`; a `str` annotation (or no annotation) keeps the original URL-only behavior.
"""
self._on_skipped_request = callback
self._on_skipped_request_expects_request = _skipped_request_callback_expects_request(callback)
return callback

async def run(
Expand Down Expand Up @@ -826,12 +885,14 @@ async def add_requests(
wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
"""
allowed_requests = []
skipped = []

for request in requests:
check_url = request.url if isinstance(request, Request) else request
if await self._is_allowed_based_on_robots_txt_file(check_url):
allowed_requests: list[Request] = []
skipped: list[Request] = []

for original in requests:
# Normalize `str` URLs to `Request` once, so robots-skipped items always reach the
# skipped-request callback as a `Request` (see `_handle_skipped_request`).
request = original if isinstance(original, Request) else Request.from_url(original)
if await self._is_allowed_based_on_robots_txt_file(request.url):
allowed_requests.append(request)
else:
skipped.append(request)
Expand Down Expand Up @@ -1210,17 +1271,19 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e

async def _handle_skipped_request(
self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
self, request: Request, reason: SkippedReason, *, need_mark: bool = False
) -> None:
if need_mark and isinstance(request, Request):
if need_mark:
request.state = RequestState.SKIPPED
await self._mark_request_as_handled(request)

url = request.url if isinstance(request, Request) else request

if self._on_skipped_request:
if self._on_skipped_request is not None:
# Pass the full `Request` or just its URL, depending on how the callback annotated its first
# parameter (see `on_skipped_request`). The cast reflects that dual-dispatch contract.
callback = cast('Callable[[str | Request, SkippedReason], Awaitable[None]]', self._on_skipped_request)
argument: str | Request = request if self._on_skipped_request_expects_request else request.url
try:
await self._on_skipped_request(url, reason)
await callback(argument, reason)
except Exception as e:
raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e

Expand Down
Loading
Loading