2 changes: 1 addition & 1 deletion docs/guides/request_loaders.mdx
@@ -136,7 +136,7 @@ The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> is
The `SitemapRequestLoader` is designed specifically for sitemaps that follow the standard Sitemaps protocol. HTML pages containing links are not supported by this loader - those should be handled by regular crawlers using the `enqueue_links` functionality.
:::

The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory.
The loader supports filtering URLs using glob patterns and regular expressions, allowing you to include or exclude specific types of URLs. By default, the loader also keeps only URLs whose host matches that of the parent sitemap (`enqueue_strategy='same-hostname'`), matching the `enqueue_links` default. Pass `enqueue_strategy='all'` to disable this filter, or `'same-domain'` / `'same-origin'` for other scopes. The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> provides streaming processing of sitemaps, ensuring efficient memory usage without loading the entire sitemap into memory.

<RunnableCodeBlock className="language-python" language="python">
{SitemapExample}
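Below is a minimal sketch of how the documented filter might be wired up. Only the `enqueue_strategy` values are confirmed by this change; the constructor shape (positional list of sitemap URLs, `http_client` keyword), the `HttpxHttpClient`, and the request-loader method names are assumptions based on the existing `SitemapRequestLoader` API:

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient  # assumed concrete HttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # Default behaviour: keep only URLs on the same host as the sitemap itself.
    # Pass 'all' to disable host filtering, or 'same-domain' / 'same-origin' for other scopes.
    loader = SitemapRequestLoader(
        ['https://crawlee.dev/sitemap.xml'],
        http_client=HttpxHttpClient(),
        enqueue_strategy='same-hostname',
    )

    # Drain the loader request by request.
    while request := await loader.fetch_next_request():
        print(request.url)
        await loader.mark_request_as_handled(request)


asyncio.run(main())
```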
74 changes: 52 additions & 22 deletions src/crawlee/_utils/robots.py
@@ -7,11 +7,13 @@
from yarl import URL

from crawlee._utils.sitemap import Sitemap
from crawlee._utils.urls import filter_url
from crawlee._utils.web import is_status_code_client_error

if TYPE_CHECKING:
from typing_extensions import Self

from crawlee._types import EnqueueStrategy
from crawlee.http_clients import HttpClient
from crawlee.proxy_configuration import ProxyInfo

@@ -21,7 +23,11 @@

class RobotsTxtFile:
def __init__(
self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
self,
url: str,
robots: Protego,
http_client: HttpClient | None = None,
proxy_info: ProxyInfo | None = None,
) -> None:
self._robots = robots
self._original_url = URL(url).origin()
@@ -39,18 +45,6 @@ async def from_content(cls, url: str, content: str) -> Self:
robots = Protego.parse(content)
return cls(url, robots)

@classmethod
async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
"""Determine the location of a robots.txt file for a URL and fetch it.

Args:
url: The URL whose domain will be used to find the corresponding robots.txt file.
http_client: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
proxy_info: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
"""
robots_url = URL(url).with_path('/robots.txt')
return await cls.load(str(robots_url), http_client, proxy_info)

@classmethod
async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
"""Load the robots.txt file for a given URL.
@@ -77,6 +71,18 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N

return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

@classmethod
async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
"""Determine the location of a robots.txt file for a URL and fetch it.

Args:
url: The URL whose domain will be used to find the corresponding robots.txt file.
http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
"""
robots_url = URL(url).with_path('/robots.txt')
return await cls.load(str(robots_url), http_client, proxy_info)

def is_allowed(self, url: str, user_agent: str = '*') -> bool:
"""Check if the given URL is allowed for the given user agent.

@@ -89,9 +95,25 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool:
return True
return bool(self._robots.can_fetch(str(check_url), user_agent))

def get_sitemaps(self) -> list[str]:
"""Get the list of sitemaps urls from the robots.txt file."""
return list(self._robots.sitemaps)
def get_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
"""Get the list of sitemap URLs from the robots.txt file, filtered by enqueue strategy.

Args:
enqueue_strategy: Strategy used to filter sitemap entries relative to the robots.txt URL's host.
Pass `'same-hostname'` to match the sitemap protocol's same-host expectation, or `'all'` to
disable host filtering. Regardless of the strategy, entries with non-`http(s)` schemes are
always filtered out.
"""
sitemaps: list[str] = []
for sitemap_url in self._robots.sitemaps:
ok, reason = filter_url(target=sitemap_url, strategy=enqueue_strategy, origin=self._original_url)
if not ok:
logger.warning(
f'Skipping sitemap {sitemap_url!r} listed in robots.txt at {str(self._original_url)!r}: {reason}.'
)
continue
sitemaps.append(sitemap_url)
return sitemaps

def get_crawl_delay(self, user_agent: str = '*') -> int | None:
"""Get the crawl delay for the given user agent.
@@ -103,15 +125,23 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None:
crawl_delay = self._robots.crawl_delay(user_agent)
return int(crawl_delay) if crawl_delay is not None else None

async def parse_sitemaps(self) -> Sitemap:
"""Parse the sitemaps from the robots.txt file and return a `Sitemap` instance."""
sitemaps = self.get_sitemaps()
async def parse_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> Sitemap:
"""Parse the sitemaps from the robots.txt file and return a `Sitemap` instance.

Args:
enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
"""
sitemaps = self.get_sitemaps(enqueue_strategy=enqueue_strategy)
if not self._http_client:
raise ValueError('HTTP client is required to parse sitemaps.')

return await Sitemap.load(sitemaps, self._http_client, self._proxy_info)

async def parse_urls_from_sitemaps(self) -> list[str]:
"""Parse the sitemaps in the robots.txt file and return a list URLs."""
sitemap = await self.parse_sitemaps()
async def parse_urls_from_sitemaps(self, *, enqueue_strategy: EnqueueStrategy) -> list[str]:
"""Parse the sitemaps in the robots.txt file and return a list URLs.

Args:
enqueue_strategy: Forwarded to `get_sitemaps`; see that method for details.
"""
sitemap = await self.parse_sitemaps(enqueue_strategy=enqueue_strategy)
return sitemap.urls
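Taken together, the updated `RobotsTxtFile` API is used roughly as follows. The signatures come straight from the diff above; the concrete `HttpxHttpClient` and the example URL are assumptions for illustration:

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient  # assumed concrete HttpClient implementation


async def main() -> None:
    http_client = HttpxHttpClient()

    # Resolves and fetches https://crawlee.dev/robots.txt.
    robots = await RobotsTxtFile.find('https://crawlee.dev/docs/', http_client)

    # Sitemap entries on other hosts are skipped (with a warning) under 'same-hostname';
    # pass 'all' to keep every http(s) entry listed in robots.txt.
    sitemap_urls = robots.get_sitemaps(enqueue_strategy='same-hostname')
    page_urls = await robots.parse_urls_from_sitemaps(enqueue_strategy='same-hostname')
    print(len(sitemap_urls), len(page_urls))


asyncio.run(main())
```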
2 changes: 1 addition & 1 deletion src/crawlee/_utils/sitemap.py
@@ -546,7 +546,7 @@ def _check_and_add(url: str) -> bool:

# Try getting sitemaps from robots.txt first
robots = await RobotsTxtFile.find(url=hostname_urls[0], http_client=http_client, proxy_info=proxy_info)
for sitemap_url in robots.get_sitemaps():
for sitemap_url in robots.get_sitemaps(enqueue_strategy='same-hostname'):
if _check_and_add(sitemap_url):
yield sitemap_url

112 changes: 107 additions & 5 deletions src/crawlee/_utils/urls.py
@@ -1,14 +1,30 @@
from __future__ import annotations

import tempfile
from functools import lru_cache
from typing import TYPE_CHECKING

from pydantic import AnyHttpUrl, TypeAdapter
from tldextract import TLDExtract
from typing_extensions import assert_never
from yarl import URL

if TYPE_CHECKING:
from collections.abc import Iterator
from logging import Logger

from crawlee._types import EnqueueStrategy


_ALLOWED_SCHEMES: frozenset[str] = frozenset({'http', 'https'})
"""URL schemes Crawlee accepts for fetching and enqueuing."""

UNSUPPORTED_SCHEME_MESSAGE = 'unsupported URL scheme (only http and https are allowed).'
"""Reusable suffix for log messages explaining why a non-`http(s)` URL was rejected."""

_HTTP_URL_ADAPTER: TypeAdapter[AnyHttpUrl] = TypeAdapter(AnyHttpUrl)
"""Pydantic validator for HTTP and HTTPS URLs."""


def is_url_absolute(url: str) -> bool:
"""Check if a URL is absolute."""
@@ -38,16 +54,102 @@ def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger
yield converted_url


_http_url_adapter = TypeAdapter(AnyHttpUrl)


def validate_http_url(value: str | None) -> str | None:
"""Validate the given HTTP URL.

Args:
value: The URL to validate, or `None` to skip validation.

Raises:
pydantic.ValidationError: If the URL is not valid.
pydantic.ValidationError: If the URL is malformed or its scheme is not `http`/`https`.
"""
if value is not None:
_http_url_adapter.validate_python(value)
_HTTP_URL_ADAPTER.validate_python(value)

return value


def filter_url(
*,
target: str | URL,
strategy: EnqueueStrategy,
origin: str | URL,
) -> tuple[bool, str | None]:
"""Check whether `target` is eligible to be enqueued under `strategy` relative to `origin`.

Combines the two checks every enqueue site needs: the URL must use a supported scheme
(`http` or `https`), and it must match `strategy` relative to `origin`. Callers that need to
distinguish a scheme rejection from a strategy mismatch (for different log levels or dedup)
can compare the returned reason against `UNSUPPORTED_SCHEME_MESSAGE`.

Args:
target: The URL being evaluated.
strategy: The enqueue strategy to apply.
origin: The reference URL the target is compared against.

Returns:
`(True, None)` if `target` is eligible. Otherwise `(False, reason)` where `reason` is
a human-readable rejection message suitable for log output.
"""
target_url = _to_url(target)

if not _is_supported_url_scheme(target_url):
return False, UNSUPPORTED_SCHEME_MESSAGE

if not _matches_enqueue_strategy(strategy, target_url=target_url, origin_url=_to_url(origin)):
return False, f'does not match enqueue strategy {strategy!r}'

return True, None


def _is_supported_url_scheme(url: str | URL) -> bool:
"""Return whether `url` uses a scheme Crawlee accepts (http or https)."""
return _to_url(url).scheme in _ALLOWED_SCHEMES


def _matches_enqueue_strategy(
strategy: EnqueueStrategy,
*,
target_url: URL,
origin_url: URL,
) -> bool:
"""Check whether `target_url` matches `origin_url` under `strategy`. Scheme is not considered."""
if strategy == 'all':
return True

if origin_url.host is None or target_url.host is None:
return False

if strategy == 'same-hostname':
return target_url.host == origin_url.host

if strategy == 'same-domain':
return _domain_under_public_suffix(origin_url.host) == _domain_under_public_suffix(target_url.host)

if strategy == 'same-origin':
return (
target_url.host == origin_url.host
and target_url.scheme == origin_url.scheme
and target_url.port == origin_url.port
)

assert_never(strategy)


def _to_url(value: str | URL) -> URL:
return URL(value) if isinstance(value, str) else value


@lru_cache(maxsize=1)
def _get_tld_extractor() -> TLDExtract:
"""Return a lazily-initialized `TLDExtract` instance shared across the module."""
# `mkdtemp` (unlike `TemporaryDirectory`) creates a directory that is never cleaned up automatically.
# A `TemporaryDirectory` object removes its directory once it is garbage collected, so discarding the
# return value here would delete the cache directory out from under tldextract.
return TLDExtract(cache_dir=tempfile.mkdtemp())


@lru_cache(maxsize=2048)
def _domain_under_public_suffix(host: str) -> str:
"""Return the registrable domain for `host`, cached to avoid re-running the PSL lookup."""
return _get_tld_extractor().extract_str(host).top_domain_under_public_suffix
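A few worked calls showing the strategy semantics that `filter_url` implements; the expected results in the comments follow directly from the logic above (the `example.com` URLs are placeholders):

```python
from crawlee._utils.urls import filter_url

# Subdomain vs. apex host: rejected under 'same-hostname', accepted under 'same-domain'
# because both hosts share the registrable domain example.com.
print(filter_url(target='https://docs.example.com/a', strategy='same-hostname', origin='https://example.com/'))
# (False, "does not match enqueue strategy 'same-hostname'")
print(filter_url(target='https://docs.example.com/a', strategy='same-domain', origin='https://example.com/'))
# (True, None)

# 'same-origin' additionally requires an identical scheme and port.
print(filter_url(target='http://example.com/a', strategy='same-origin', origin='https://example.com/'))
# (False, "does not match enqueue strategy 'same-origin'")

# Non-http(s) schemes are rejected regardless of the strategy.
print(filter_url(target='ftp://example.com/file.xml', strategy='all', origin='https://example.com/'))
# (False, 'unsupported URL scheme (only http and https are allowed).')
```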