2 changes: 1 addition & 1 deletion docs/guides/request_loaders.mdx
@@ -136,7 +136,7 @@ The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> is
The `SitemapRequestLoader` is designed specifically for sitemaps that follow the standard Sitemaps protocol. HTML pages containing links are not supported by this loader - those should be handled by regular crawlers using the `enqueue_links` functionality.
:::

The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory.
The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. By default, the loader also keeps only URLs whose host matches that of the parent sitemap (`enqueue_strategy='same-hostname'`), matching the `enqueue_links` default. Pass `enqueue_strategy='all'` to disable this filter, or `'same-domain'` / `'same-origin'` for other scopes. The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory.

<RunnableCodeBlock className="language-python" language="python">
{SitemapExample}
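Below is a minimal sketch of how the documented filter might be wired up. Only the `enqueue_strategy` values are confirmed by this change; the constructor shape (positional list of sitemap URLs, `http_client` keyword), the `HttpxHttpClient`, and the request-loader method names are assumptions based on the existing `SitemapRequestLoader` API:

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient  # assumed concrete HttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # Default behaviour: keep only URLs on the same host as the sitemap itself.
    # Pass 'all' to disable host filtering, or 'same-domain' / 'same-origin' for other scopes.
    loader = SitemapRequestLoader(
        ['https://crawlee.dev/sitemap.xml'],
        http_client=HttpxHttpClient(),
        enqueue_strategy='same-hostname',
    )

    # Drain the loader request by request.
    while request := await loader.fetch_next_request():
        print(request.url)
        await loader.mark_request_as_handled(request)


asyncio.run(main())
```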
74 changes: 52 additions & 22 deletions src/crawlee/_utils/robots.py
@@ -7,11 +7,13 @@
from yarl import URL

from crawlee._utils.sitemap import Sitemap
from crawlee._utils.urls import filter_url
from crawlee._utils.web import is_status_code_client_error

if TYPE_CHECKING:
from typing_extensions import Self

from crawlee._types import EnqueueStrategy
from crawlee.http_clients import HttpClient
from crawlee.proxy_configuration import ProxyInfo

@@ -21,7 +23,11 @@

class RobotsTxtFile:
def __init__(
self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
self,
url: str,
robots: Protego,
http_client: HttpClient | None = None,
proxy_info: ProxyInfo | None = None,
) -> None:
self._robots = robots
self._original_url = URL(url).origin()
@@ -39,18 +45,6 @@ async def from_content(cls, url: str, content: str) -> Self:
robots = Protego.parse(content)
return cls(url, robots)

@classmethod
async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
"""Determine the location of a robots.txt file for a URL and fetch it.

Args:
url: The URL whose domain will be used to find the corresponding robots.txt file.
http_client: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
proxy_info: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
"""
robots_url = URL(url).with_path('/robots.txt')
return await cls.load(str(robots_url), http_client, proxy_info)

@classmethod
async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
"""Load the robots.txt file for a given URL.
@@ -77,6 +71,18 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N

return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

@classmethod
async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
"""Determine the location of a robots.txt file for a URL and fetch it.

Args:
url: The URL whose domain will be used to find the corresponding robots.txt file.
http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
"""
robots_url = URL(url).with_path('/robots.txt')
return await cls.load(str(robots_url), http_client, proxy_info)

def is_allowed(self, url: str, user_agent: str = '*') -> bool:
"""Check if the given URL is allowed for the given user agent.

@@ -89,9 +95,25 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool:
return True
return bool(self._robots.can_fetch(str(check_url), user_agent))

def get_sitemaps(self) -> list[str]:
"""Get the list of sitemaps urls from the robots.txt file."""
return list(self._robots.sitemaps)
def get_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
"""Get the list of sitemap URLs from the robots.txt file, filtered by enqueue strategy.

Args:
enqueue_strategy: Strategy used to filter sitemap entries relative to the robots.txt URL's host.
Pass `'same-hostname'` to match the sitemap protocol's same-host expectation, or `'all'` to
disable host filtering. Regardless of the strategy, entries with non-`http(s)` schemes are
always filtered out.
"""
sitemaps: list[str] = []
for sitemap_url in self._robots.sitemaps:
ok, reason = filter_url(target=sitemap_url, strategy=enqueue_strategy, origin=self._original_url)
if not ok:
logger.warning(
f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: {reason}.'
)
continue
sitemaps.append(sitemap_url)
return sitemaps

def get_crawl_delay(self, user_agent: str = '*') -> int | None:
"""Get the crawl delay for the given user agent.
@@ -103,15 +125,23 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None:
crawl_delay = self._robots.crawl_delay(user_agent)
return int(crawl_delay) if crawl_delay is not None else None

async def parse_sitemaps(self) -> Sitemap:
"""Parse the sitemaps from the robots.txt file and return a `Sitemap` instance."""
sitemaps = self.get_sitemaps()
async def parse_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> Sitemap:
"""Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.

Args:
enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
"""
sitemaps = self.get_sitemaps(enqueue_strategy=enqueue_strategy)
if not self._http_client:
raise ValueError('HTTP client is required to parse sitemaps.')

return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)

async def parse_urls_from_sitemaps(self) -> list[str]:
"""Parse the sitemaps in the robots.txt file and return a list URLs."""
sitemap = await self.parse_sitemaps()
async def parse_urls_from_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
"""Parse the sitemaps in the robots.txt file and return a list URLs.

Args:
enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
"""
sitemap = await self.parse_sitemaps(enqueue_strategy=enqueue_strategy)
return sitemap.urls
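Taken together, the updated `RobotsTxtFile` API is used roughly as follows. The signatures come straight from the diff above; the concrete `HttpxHttpClient` and the example URL are assumptions for illustration:

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient  # assumed concrete HttpClient implementation


async def main() -> None:
    http_client = HttpxHttpClient()

    # Resolves and fetches https://crawlee.dev/robots.txt.
    robots = await RobotsTxtFile.find('https://crawlee.dev/docs/', http_client)

    # Sitemap entries on other hosts are skipped (with a warning) under 'same-hostname';
    # pass 'all' to keep every http(s) entry listed in robots.txt.
    sitemap_urls = robots.get_sitemaps(enqueue_strategy='same-hostname')
    page_urls = await robots.parse_urls_from_sitemaps(enqueue_strategy='same-hostname')
    print(len(sitemap_urls), len(page_urls))


asyncio.run(main())
```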
2 changes: 1 addition & 1 deletion src/crawlee/_utils/sitemap.py
@@ -546,7 +546,7 @@ def _check_and_add(url: str) -> bool:

# Try getting sitemaps from robots.txt first
robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)
for sitemap_url in robots.get_sitemaps():
for sitemap_url in robots.get_sitemaps(enqueue_strategy='same-hostname'):
if _check_and_add(sitemap_url):
yield sitemap_url

112 changes: 107 additions & 5 deletions src/crawlee/_utils/urls.py
@@ -1,14 +1,30 @@
from __future__ import annotations

import tempfile
from functools import lru_cache
from typing import TYPE_CHECKING

from pydantic import AnyHttpUrl, TypeAdapter
from tldextract import TLDExtract
from typing_extensions import assert_never
from yarl import URL

if TYPE_CHECKING:
from collections.abc import Iterator
from logging import Logger

from crawlee._types import EnqueueStrategy


_ALLOWED_SCHEMES: frozenset[str] = frozenset({'http', 'https'})
"""URL schemes Crawlee accepts for fetching and enqueuing."""

UNSUPPORTED_SCHEME_MESSAGE = 'unsupported URL scheme (only http and https are allowed).'
"""Reusable suffix for log messages explaining why a non-`http(s)` URL was rejected."""

_HTTP_URL_ADAPTER: TypeAdapter[AnyHttpUrl] = TypeAdapter(AnyHttpUrl)
"""Pydantic validator for HTTP and HTTPS URLs."""


def is_url_absolute(url: str) -> bool:
"""Check if a URL is absolute."""
@@ -38,16 +54,102 @@ def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger
yield converted_url


_http_url_adapter = TypeAdapter(AnyHttpUrl)


def validate_http_url(value: str | None) -> str | None:
"""Validate the given HTTP URL.

Args:
value: The URL to validate, or `None` to skip validation.

Raises:
pydantic.ValidationError: If the URL is not valid.
pydantic.ValidationError: If the URL is malformed or its scheme is not `http`/`https`.
"""
if value is not None:
_http_url_adapter.validate_python(value)
_HTTP_URL_ADAPTER.validate_python(value)

return value


def filter_url(
*,
target: str | URL,
strategy: EnqueueStrategy,
origin: str | URL,
) -> tuple[bool, str | None]:
"""Check whether `target` is eligible to be enqueued under `strategy` relative to `origin`.

Combines the two checks every enqueue site needs: the URL must use a supported scheme
(`http` or `https`), and it must match `strategy` relative to `origin`. Callers that need to
distinguish a scheme rejection from a strategy mismatch (for different log levels or dedup)
can compare the returned reason against `UNSUPPORTED_SCHEME_MESSAGE`.

Args:
target: The URL being evaluated.
strategy: The enqueue strategy to apply.
origin: The reference URL the target is compared against.

Returns:
`(True, None)` if `target` is eligible. Otherwise `(False, reason)` where `reason` is
a human-readable rejection message suitable for log output.
"""
target_url = _to_url(target)

if not _is_supported_url_scheme(target_url):
return False, UNSUPPORTED_SCHEME_MESSAGE

if not _matches_enqueue_strategy(strategy, target_url=target_url, origin_url=_to_url(origin)):
return False, f'does not match enqueue strategy {strategy!r}'

return True, None


def _is_supported_url_scheme(url: str | URL) -> bool:
"""Return whether `url` uses a scheme Crawlee accepts (http or https)."""
return _to_url(url).scheme in _ALLOWED_SCHEMES


def _matches_enqueue_strategy(
strategy: EnqueueStrategy,
*,
target_url: URL,
origin_url: URL,
) -> bool:
"""Check whether `target_url` matches `origin_url` under `strategy`. Scheme is not considered."""
if strategy == 'all':
return True

if origin_url.host is None or target_url.host is None:
return False

if strategy == 'same-hostname':
return target_url.host == origin_url.host

if strategy == 'same-domain':
return _domain_under_public_suffix(origin_url.host) == _domain_under_public_suffix(target_url.host)

if strategy == 'same-origin':
return (
target_url.host == origin_url.host
and target_url.scheme == origin_url.scheme
and target_url.port == origin_url.port
)

assert_never(strategy)


def _to_url(value: str | URL) -> URL:
return URL(value) if isinstance(value, str) else value


@lru_cache(maxsize=1)
def _get_tld_extractor() -> TLDExtract:
"""Return a lazily-initialized `TLDExtract` instance shared across the module."""
# `mkdtemp` (unlike `TemporaryDirectory`) creates a directory that is never cleaned up automatically.
# A `TemporaryDirectory` object removes its directory once it is garbage collected, so discarding the
# return value here would delete the cache directory out from under tldextract.
return TLDExtract(cache_dir=tempfile.mkdtemp())


@lru_cache(maxsize=2048)
def _domain_under_public_suffix(host: str) -> str:
"""Return the registrable domain for `host`, cached to avoid re-running the PSL lookup."""
return _get_tld_extractor().extract_str(host).top_domain_under_public_suffix
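A few worked calls showing the strategy semantics that `filter_url` implements; the expected results in the comments follow directly from the logic above (the `example.com` URLs are placeholders):

```python
from crawlee._utils.urls import filter_url

# Subdomain vs. apex host: rejected under 'same-hostname', accepted under 'same-domain'
# because both hosts share the registrable domain example.com.
print(filter_url(target='https://docs.example.com/a', strategy='same-hostname', origin='https://example.com/'))
# (False, "does not match enqueue strategy 'same-hostname'")
print(filter_url(target='https://docs.example.com/a', strategy='same-domain', origin='https://example.com/'))
# (True, None)

# 'same-origin' additionally requires an identical scheme and port.
print(filter_url(target='http://example.com/a', strategy='same-origin', origin='https://example.com/'))
# (False, "does not match enqueue strategy 'same-origin'")

# Non-http(s) schemes are rejected regardless of the strategy.
print(filter_url(target='ftp://example.com/file.xml', strategy='all', origin='https://example.com/'))
# (False, 'unsupported URL scheme (only http and https are allowed).')
```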