From 6ff22559820ee06ddd4b9d97f4a616ad5aae7af7 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 18 Mar 2026 15:20:00 +0100 Subject: [PATCH 01/39] feat: implementing the first tool --- README.md | 4 + pyproject.toml | 7 +- src/strands_tools/apify.py | 377 +++++++++++++++++++++++++++++++++++++ tests/test_apify.py | 317 +++++++++++++++++++++++++++++++ 4 files changed, 703 insertions(+), 2 deletions(-) create mode 100644 src/strands_tools/apify.py create mode 100644 tests/test_apify.py diff --git a/README.md b/README.md index e945edf4..8ec23e98 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,10 @@ Below is a comprehensive table of all available tools, how to use them with an a | Tool | Agent Usage | Use Case | |------|-------------|----------| | a2a_client | `provider = A2AClientToolProvider(known_agent_urls=["http://localhost:9000"]); agent = Agent(tools=provider.tools)` | Discover and communicate with A2A-compliant agents, send messages between agents | +| apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor by ID with arbitrary input | +| apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify Dataset | +| apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its Dataset results in one step | +| apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | | editor | `agent.tool.editor(command="view", path="path/to/file.py")` | Advanced file operations like syntax highlighting, pattern replacement, and multi-file edits | diff --git a/pyproject.toml b/pyproject.toml index bf00325f..de75e0be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,9 @@ Homepage = "https://github.com/strands-agents/tools" Documentation = "https://strandsagents.com/" [project.optional-dependencies] +apify = [ + "apify-client>=1.0.0", +] build = [ "hatch>=1.16.5", ] @@ -122,7 +125,7 @@ mongodb-memory = [ ] [tool.hatch.envs.hatch-static-analysis] -features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory"] +features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory", "apify"] dependencies = [ "strands-agents>=1.0.0", "mypy>=0.981,<1.0.0", @@ -141,7 +144,7 @@ lint-check = [ lint-fix = ["ruff check --fix"] [tool.hatch.envs.hatch-test] -features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", "mongodb-memory"] +features = ["mem0-memory", "local-chromium-browser", "agent-core-browser", "agent-core-code-interpreter", "a2a-client", "diagram", "rss", "use-computer", "twelvelabs", "elasticsearch-memory", 
"mongodb-memory", "apify"] extra-dependencies = [ "moto>=5.1.0,<6.0.0", "pytest>=8.0.0,<10.0.0", diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py new file mode 100644 index 00000000..96e5f9b8 --- /dev/null +++ b/src/strands_tools/apify.py @@ -0,0 +1,377 @@ +"""Apify platform integration tool for Strands Agents. + +Provides capabilities to run Apify Actors, retrieve Datasets, and scrape URLs +using the Apify platform programmatically. + +Available tools: +- apify_run_actor: Run any Apify Actor by ID with arbitrary input +- apify_get_dataset_items: Fetch items from an Apify Dataset +- apify_run_actor_and_get_dataset: Run an Actor and fetch its Dataset results in one step +- apify_scrape_url: Scrape a URL and return its content as markdown + +Setup Requirements: +------------------ +1. Create an Apify account at https://apify.com +2. Obtain your API token: Apify Console β†’ Settings β†’ API & Integrations β†’ Personal API tokens +3. Install the optional dependency: pip install -e ".[apify]" +4. Set the environment variable: + APIFY_API_TOKEN=your_api_token_here + +Usage with Strands Agent: +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_get_dataset_items, + apify.apify_run_actor_and_get_dataset, + apify.apify_scrape_url, +]) + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") +``` + +!!!!!!!!!!!!! IMPORTANT: !!!!!!!!!!!!! + +Environment Variables: +- APIFY_API_TOKEN: Your Apify API token (required) + Obtain from https://console.apify.com/account/integrations + +Example .env configuration: + APIFY_API_TOKEN=apify_api_1a2B3cD4eF5gH6iJ7kL8m + +!!!!!!!!!!!!! IMPORTANT: !!!!!!!!!!!!! + +See the function docstrings for complete parameter documentation. +""" + +import json +import logging +import os +from typing import Any, Dict, List, Optional + +from rich.panel import Panel +from rich.text import Text +from strands import tool + +from strands_tools.utils import console_util + +logger = logging.getLogger(__name__) +console = console_util.create() + +try: + from apify_client import ApifyClient + + HAS_APIFY_CLIENT = True +except ImportError: + HAS_APIFY_CLIENT = False + +WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" +TRACKING_HEADER = {"x-apify-integration-platform": "strands"} + + +def _check_dependency() -> None: + """Raise ImportError if apify-client is not installed.""" + if not HAS_APIFY_CLIENT: + raise ImportError("apify-client package is required. Install with: pip install strands-agents-tools[apify]") + + +class ApifyToolClient: + """Helper class encapsulating Apify API interactions via apify-client.""" + + def __init__(self) -> None: + token = os.getenv("APIFY_API_TOKEN", "") + if not token: + raise ValueError( + "APIFY_API_TOKEN environment variable is not set. 
" + "Get your token at https://console.apify.com/account/integrations" + ) + self.client: "ApifyClient" = ApifyClient(token, headers=TRACKING_HEADER) + + def run_actor( + self, + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + ) -> Dict[str, Any]: + """Run an Apify Actor synchronously and return run metadata.""" + call_kwargs: Dict[str, Any] = { + "run_input": run_input or {}, + "timeout_secs": timeout_secs, + } + if memory_mbytes is not None: + call_kwargs["memory_mbytes"] = memory_mbytes + + actor_run = self.client.actor(actor_id).call(**call_kwargs) + + status = actor_run.get("status", "UNKNOWN") + if status not in ("SUCCEEDED",): + raise RuntimeError(f"Actor {actor_id} finished with status {status}. Run ID: {actor_run.get('id', 'N/A')}") + + return { + "run_id": actor_run.get("id"), + "status": status, + "dataset_id": actor_run.get("defaultDatasetId"), + "started_at": actor_run.get("startedAt"), + "finished_at": actor_run.get("finishedAt"), + } + + def get_dataset_items( + self, + dataset_id: str, + limit: int = 100, + offset: int = 0, + ) -> List[Dict[str, Any]]: + """Fetch items from an Apify Dataset.""" + result = self.client.dataset(dataset_id).list_items(limit=limit, offset=offset) + return list(result.items) + + def run_actor_and_get_dataset( + self, + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = 100, + ) -> Dict[str, Any]: + """Run an Actor synchronously, then fetch its default Dataset items.""" + run_metadata = self.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + dataset_id = run_metadata["dataset_id"] + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) + return {**run_metadata, "items": items} + + def scrape_url(self, url: str, timeout_secs: int = 120) -> str: + """Scrape a single URL using Website Content Crawler and return markdown.""" + run_input = { + "startUrls": [{"url": url}], + "maxCrawlPages": 1, + } + actor_run = self.client.actor(WEBSITE_CONTENT_CRAWLER).call( + run_input=run_input, + timeout_secs=timeout_secs, + ) + + status = actor_run.get("status", "UNKNOWN") + if status not in ("SUCCEEDED",): + raise RuntimeError( + f"Website Content Crawler finished with status {status}. Run ID: {actor_run.get('id', 'N/A')}" + ) + + dataset_id = actor_run.get("defaultDatasetId") + result = self.client.dataset(dataset_id).list_items(limit=1) + items = list(result.items) + + if not items: + raise RuntimeError(f"No content returned for URL: {url}") + + return str(items[0].get("markdown") or items[0].get("text", "")) + + +# --- Tool functions --- + + +@tool +def apify_run_actor( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, +) -> str: + """Run any Apify Actor by its ID or name and return the run metadata as JSON. + + Executes the Actor synchronously - blocks until the Actor Run finishes or the timeout + is reached. Use this when you need to run a specific Actor and then inspect or process + the results separately. + + Common Actors: + - "apify/website-content-crawler" - scrape websites and extract content + - "apify/web-scraper" - general-purpose web scraper + - "apify/google-search-scraper" - scrape Google search results + + Args: + actor_id: Actor identifier, e.g. 
"apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. + timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor Run. Uses Actor default if not set. + + Returns: + JSON string with run metadata: run_id, status, dataset_id, started_at, finished_at. + """ + _check_dependency() + try: + client = ApifyToolClient() + result = client.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + panel = Panel( + f"[green]Actor Run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}", + title="[bold cyan]Apify: Run Actor[/bold cyan]", + border_style="green", + ) + console.print(panel) + return json.dumps(result, indent=2, default=str) + except Exception as e: + error_panel = Panel( + Text(str(e), style="red"), + title="[bold red]Apify Error[/bold red]", + border_style="red", + ) + console.print(error_panel) + raise + + +@tool +def apify_get_dataset_items( + dataset_id: str, + limit: int = 100, + offset: int = 0, +) -> str: + """Fetch items from an existing Apify Dataset and return them as JSON. + + Use this after running an Actor to retrieve the structured results from its + default Dataset, or to access any Dataset by ID. + + Args: + dataset_id: The Apify Dataset ID to fetch items from. + limit: Maximum number of items to return. Defaults to 100. + offset: Number of items to skip for pagination. Defaults to 0. + + Returns: + JSON string containing an array of Dataset items. + """ + _check_dependency() + try: + client = ApifyToolClient() + items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) + panel = Panel( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}", + title="[bold cyan]Apify: Dataset Items[/bold cyan]", + border_style="green", + ) + console.print(panel) + return json.dumps(items, indent=2, default=str) + except Exception as e: + error_panel = Panel( + Text(str(e), style="red"), + title="[bold red]Apify Error[/bold red]", + border_style="red", + ) + console.print(error_panel) + raise + + +@tool +def apify_run_actor_and_get_dataset( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = 100, +) -> str: + """Run an Apify Actor and fetch its Dataset results in one step. + + Convenience tool that combines running an Actor and fetching its default Dataset + items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. + timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor Run. + dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. + + Returns: + JSON string with run metadata (run_id, status, dataset_id, started_at, finished_at) + plus an "items" array containing the Dataset results. 
+ """ + _check_dependency() + try: + client = ApifyToolClient() + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + ) + panel = Panel( + f"[green]Actor Run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}", + title="[bold cyan]Apify: Run Actor + Dataset[/bold cyan]", + border_style="green", + ) + console.print(panel) + return json.dumps(result, indent=2, default=str) + except Exception as e: + error_panel = Panel( + Text(str(e), style="red"), + title="[bold red]Apify Error[/bold red]", + border_style="red", + ) + console.print(error_panel) + raise + + +@tool +def apify_scrape_url( + url: str, + timeout_secs: int = 120, +) -> str: + """Scrape a single URL and return its content as markdown. + + Uses the Apify Website Content Crawler Actor under the hood, pre-configured for + fast single-page scraping. This is the simplest way to extract readable content + from any web page. + + Args: + url: The URL to scrape, e.g. "https://example.com". + timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + + Returns: + Markdown content of the scraped page as a plain string. + """ + _check_dependency() + try: + client = ApifyToolClient() + content = client.scrape_url(url=url, timeout_secs=timeout_secs) + panel = Panel( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters", + title="[bold cyan]Apify: Scrape URL[/bold cyan]", + border_style="green", + ) + console.print(panel) + return content + except Exception as e: + error_panel = Panel( + Text(str(e), style="red"), + title="[bold red]Apify Error[/bold red]", + border_style="red", + ) + console.print(error_panel) + raise diff --git a/tests/test_apify.py b/tests/test_apify.py new file mode 100644 index 00000000..d8a3c5e4 --- /dev/null +++ b/tests/test_apify.py @@ -0,0 +1,317 @@ +"""Tests for the Apify tools.""" + +import json +from unittest.mock import MagicMock, patch + +import pytest + +from strands_tools import apify +from strands_tools.apify import ( + ApifyToolClient, + apify_get_dataset_items, + apify_run_actor, + apify_run_actor_and_get_dataset, + apify_scrape_url, +) + +MOCK_ACTOR_RUN = { + "id": "run-HG7ml5fB1hCp8YEBA", + "actId": "janedoe~my-scraper", + "userId": "user-abc123", + "startedAt": "2026-03-15T14:30:00.000Z", + "finishedAt": "2026-03-15T14:35:22.000Z", + "status": "SUCCEEDED", + "statusMessage": "Actor finished successfully", + "defaultDatasetId": "dataset-WkC9gct8rq1uR5vDZ", + "defaultKeyValueStoreId": "kvs-Xb3A8gct8rq1uR5vD", + "buildNumber": "1.2.3", +} + +MOCK_FAILED_RUN = { + **MOCK_ACTOR_RUN, + "status": "FAILED", + "statusMessage": "Actor failed with an error", +} + +MOCK_TIMED_OUT_RUN = { + **MOCK_ACTOR_RUN, + "status": "TIMED-OUT", + "statusMessage": "Actor run timed out", +} + +MOCK_DATASET_ITEMS = [ + {"url": "https://example.com/product/1", "title": "Widget A", "price": 19.99, "currency": "USD"}, + {"url": "https://example.com/product/2", "title": "Widget B", "price": 29.99, "currency": "USD"}, + {"url": "https://example.com/product/3", "title": "Widget C", "price": 39.99, "currency": "EUR"}, +] + +MOCK_SCRAPED_ITEM = { + "url": "https://example.com", + "markdown": "# Example Domain\n\nThis domain is for use in illustrative examples.", + "text": 
"Example Domain. This domain is for use in illustrative examples.", +} + + +@pytest.fixture +def mock_apify_client(): + """Create a mock ApifyClient with pre-configured responses.""" + client = MagicMock() + + mock_actor = MagicMock() + mock_actor.call.return_value = MOCK_ACTOR_RUN + client.actor.return_value = mock_actor + + mock_dataset = MagicMock() + mock_list_result = MagicMock() + mock_list_result.items = MOCK_DATASET_ITEMS + mock_dataset.list_items.return_value = mock_list_result + client.dataset.return_value = mock_dataset + + return client + + +@pytest.fixture +def mock_apify_env(monkeypatch): + """Set required Apify environment variables.""" + monkeypatch.setenv("APIFY_API_TOKEN", "test-token-12345") + + +# --- Module import --- + + +def test_apify_module_is_importable(): + """Verify that the apify tool module can be imported from strands_tools.""" + assert apify is not None + assert apify.__name__ == "strands_tools.apify" + + +# --- ApifyToolClient --- + + +def test_client_missing_token(monkeypatch): + """ApifyToolClient raises ValueError when APIFY_API_TOKEN is not set.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + with pytest.raises(ValueError, match="APIFY_API_TOKEN"): + ApifyToolClient() + + +def test_client_uses_env_token(mock_apify_env): + """ApifyToolClient passes the env token to ApifyClient.""" + with patch("strands_tools.apify.ApifyClient") as MockClient: + ApifyToolClient() + MockClient.assert_called_once_with("test-token-12345") + + +# --- apify_run_actor --- + + +def test_run_actor_success(mock_apify_env, mock_apify_client): + """Successful Actor Run returns JSON with run metadata.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="janedoe/my-scraper", run_input={"url": "https://example.com"}) + + data = json.loads(result) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + assert "started_at" in data + assert "finished_at" in data + mock_apify_client.actor.assert_called_once_with("janedoe/my-scraper") + + +def test_run_actor_with_memory(mock_apify_env, mock_apify_client): + """Actor Run passes memory_mbytes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_run_actor(actor_id="janedoe/my-scraper", memory_mbytes=512) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["memory_mbytes"] == 512 + + +def test_run_actor_failure(mock_apify_env, mock_apify_client): + """Actor Run raises RuntimeError when Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="FAILED"): + apify_run_actor(actor_id="janedoe/my-scraper") + + +def test_run_actor_timeout(mock_apify_env, mock_apify_client): + """Actor Run raises RuntimeError when Actor times out.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="TIMED-OUT"): + apify_run_actor(actor_id="janedoe/my-scraper") + + +def test_run_actor_api_exception(mock_apify_env, mock_apify_client): + """Actor Run re-raises exceptions from the Apify client.""" + mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") + + with 
patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(Exception, match="Connection failed"): + apify_run_actor(actor_id="janedoe/my-scraper") + + +# --- apify_get_dataset_items --- + + +def test_get_dataset_items_success(mock_apify_env, mock_apify_client): + """Successful dataset retrieval returns JSON array of items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_get_dataset_items(dataset_id="dataset-WkC9gct8rq1uR5vDZ") + + items = json.loads(result) + assert len(items) == 3 + assert items[0]["title"] == "Widget A" + assert items[2]["currency"] == "EUR" + mock_apify_client.dataset.assert_called_once_with("dataset-WkC9gct8rq1uR5vDZ") + + +def test_get_dataset_items_with_pagination(mock_apify_env, mock_apify_client): + """Dataset retrieval passes limit and offset.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_get_dataset_items(dataset_id="dataset-xyz", limit=50, offset=10) + + mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10) + + +def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): + """Empty dataset returns an empty JSON array.""" + mock_list_result = MagicMock() + mock_list_result.items = [] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_get_dataset_items(dataset_id="dataset-empty") + + items = json.loads(result) + assert items == [] + + +# --- apify_run_actor_and_get_dataset --- + + +def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): + """Combined run + dataset fetch returns run metadata and items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor_and_get_dataset( + actor_id="janedoe/my-scraper", + run_input={"url": "https://example.com"}, + dataset_items_limit=50, + ) + + data = json.loads(result) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + assert len(data["items"]) == 3 + assert data["items"][0]["title"] == "Widget A" + + +def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_client): + """Combined tool raises when the Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="FAILED"): + apify_run_actor_and_get_dataset(actor_id="janedoe/my-scraper") + + +# --- apify_scrape_url --- + + +def test_scrape_url_success(mock_apify_env, mock_apify_client): + """Scrape URL returns markdown content from the crawled page.""" + mock_list_result = MagicMock() + mock_list_result.items = [MOCK_SCRAPED_ITEM] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert "Example Domain" in result + mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") + + +def test_scrape_url_no_content(mock_apify_env, mock_apify_client): + """Scrape URL raises when no content is returned.""" + mock_list_result = MagicMock() + mock_list_result.items = [] + mock_apify_client.dataset.return_value.list_items.return_value = 
mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="No content returned"): + apify_scrape_url(url="https://example.com") + + +def test_scrape_url_crawler_failure(mock_apify_env, mock_apify_client): + """Scrape URL raises when the crawler Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + with pytest.raises(RuntimeError, match="FAILED"): + apify_scrape_url(url="https://example.com") + + +def test_scrape_url_falls_back_to_text(mock_apify_env, mock_apify_client): + """Scrape URL falls back to text field when markdown is absent.""" + item_without_markdown = {"url": "https://example.com", "text": "Plain text content"} + mock_list_result = MagicMock() + mock_list_result.items = [item_without_markdown] + mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result == "Plain text content" + + +# --- Dependency guard --- + + +def test_missing_apify_client_run_actor(mock_apify_env): + """apify_run_actor raises ImportError when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + with pytest.raises(ImportError, match="apify-client"): + apify_run_actor(actor_id="test/actor") + + +def test_missing_apify_client_get_dataset(mock_apify_env): + """apify_get_dataset_items raises ImportError when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + with pytest.raises(ImportError, match="apify-client"): + apify_get_dataset_items(dataset_id="dataset-123") + + +def test_missing_apify_client_run_and_get(mock_apify_env): + """apify_run_actor_and_get_dataset raises ImportError when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + with pytest.raises(ImportError, match="apify-client"): + apify_run_actor_and_get_dataset(actor_id="test/actor") + + +def test_missing_apify_client_scrape_url(mock_apify_env): + """apify_scrape_url raises ImportError when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + with pytest.raises(ImportError, match="apify-client"): + apify_scrape_url(url="https://example.com") + + +# --- Missing token from tool entry points --- + + +def test_run_actor_missing_token(monkeypatch): + """apify_run_actor raises ValueError when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + with pytest.raises(ValueError, match="APIFY_API_TOKEN"): + apify_run_actor(actor_id="test/actor") + + +def test_scrape_url_missing_token(monkeypatch): + """apify_scrape_url raises ValueError when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + with pytest.raises(ValueError, match="APIFY_API_TOKEN"): + apify_scrape_url(url="https://example.com") From 5f01fa3d82966e63a83c4d7fcd15042ba03edc9d Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 18 Mar 2026 16:19:55 +0100 Subject: [PATCH 02/39] chore: renamed platform integration tag --- src/strands_tools/apify.py | 2 +- tests/test_apify.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 96e5f9b8..a43a7c85 100644 --- a/src/strands_tools/apify.py +++ 
b/src/strands_tools/apify.py @@ -75,7 +75,7 @@ HAS_APIFY_CLIENT = False WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" -TRACKING_HEADER = {"x-apify-integration-platform": "strands"} +TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} def _check_dependency() -> None: diff --git a/tests/test_apify.py b/tests/test_apify.py index d8a3c5e4..31644288 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -99,7 +99,10 @@ def test_client_uses_env_token(mock_apify_env): """ApifyToolClient passes the env token to ApifyClient.""" with patch("strands_tools.apify.ApifyClient") as MockClient: ApifyToolClient() - MockClient.assert_called_once_with("test-token-12345") + MockClient.assert_called_once_with( + "test-token-12345", + headers={"x-apify-integration-platform": "strands-agents"}, + ) # --- apify_run_actor --- From c685127f3f1019f7e2f03837c654ca7c13233f90 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 19 Mar 2026 14:34:23 +0100 Subject: [PATCH 03/39] feat: updading apify tool docs --- README.md | 43 +++++++++++ docs/apify_tool.md | 145 +++++++++++++++++++++++++++++++++++++ src/strands_tools/apify.py | 66 ++++------------- 3 files changed, 201 insertions(+), 53 deletions(-) create mode 100644 docs/apify_tool.md diff --git a/README.md b/README.md index 8ec23e98..457bf57b 100644 --- a/README.md +++ b/README.md @@ -964,6 +964,43 @@ result = agent.tool.mongodb_memory( ) ``` +### Apify Core Tools + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_get_dataset_items, + apify.apify_run_actor_and_get_dataset, + apify.apify_scrape_url, +]) + +# Scrape a single URL and get markdown content +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor and get results in one step +result = agent.tool.apify_run_actor_and_get_dataset( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + dataset_items_limit=50, +) + +# Run an Actor (get metadata only) +run_info = agent.tool.apify_run_actor( + actor_id="apify/google-search-scraper", + run_input={"queries": "AI agent frameworks"}, +) + +# Fetch Dataset items separately +items = agent.tool.apify_get_dataset_items( + dataset_id="abc123", + limit=100, +) +``` + + ## 🌍 Environment Variables Configuration Agents Tools provides extensive customization through environment variables. This allows you to configure tool behavior without modifying code, making it ideal for different environments (development, testing, production). @@ -1072,6 +1109,12 @@ The Mem0 Memory Tool supports three different backend configurations: - If `NEPTUNE_ANALYTICS_GRAPH_IDENTIFIER` is set, the tool will configure Neptune Analytics as graph store to enhance memory search - LLM configuration applies to all backend modes and allows customization of the language model used for memory processing +#### Apify Tool + +| Environment Variable | Description | Default | +|----------------------|-------------|---------| +| APIFY_API_TOKEN | Apify API token for authentication (required) | None | + #### Bright Data Tool | Environment Variable | Description | Default | diff --git a/docs/apify_tool.md b/docs/apify_tool.md new file mode 100644 index 00000000..9c930e1c --- /dev/null +++ b/docs/apify_tool.md @@ -0,0 +1,145 @@ +# Apify Core Tools + +The Apify core tools (`apify.py`) provide the foundational building blocks for interacting with the [Apify](https://apify.com) platform from Strands Agents. 
These generic tools let you run any [Actor](https://apify.com/store) by ID, fetch Dataset results, and scrape individual URLs. + +For higher-level, domain-specific tools see: +- [Apify Social Media Tools](apify_social_media_tool.md) β€” simplified wrappers for Instagram, LinkedIn, Twitter/X, TikTok, and Facebook scrapers +- [Apify Search Tools](apify_search_tool.md) β€” simplified wrappers for Google Search, Google Maps, YouTube, web crawling, and e-commerce scrapers + +## Installation + +```bash +pip install strands-agents-tools[apify] +``` + +## Configuration + +Set your Apify API token as an environment variable: + +```bash +export APIFY_API_TOKEN=apify_api_your_token_here +``` + +Get your token from the [Apify Console](https://console.apify.com/account/integrations) β†’ Settings β†’ API & Integrations β†’ Personal API tokens. + +## Usage + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_scrape_url, + apify.apify_get_dataset_items, + apify.apify_run_actor_and_get_dataset, +]) +``` + +### Scrape a URL + +The simplest way to extract content from any web page. Uses the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor under the hood and returns the page content as Markdown: + +```python +content = agent.tool.apify_scrape_url(url="https://example.com") +``` + +### Run an Actor + +Execute any Actor from the [Apify Store](https://apify.com/store) by its ID. The call blocks until the Actor Run finishes or the timeout is reached: + +```python +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + timeout_secs=300, +) +``` + +The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. + +### Run an Actor and Get Results + +Combine running an Actor and fetching its Dataset results in a single call: + +```python +result = agent.tool.apify_run_actor_and_get_dataset( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, + dataset_items_limit=50, +) +``` + +### Fetch Dataset Items + +Retrieve results from a Dataset by its ID. Useful after running an Actor to get the structured results separately, or to access any existing Dataset: + +```python +items = agent.tool.apify_get_dataset_items( + dataset_id="abc123", + limit=100, + offset=0, +) +``` + +## Tool Parameters + +### apify_scrape_url + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `url` | string | Yes | β€” | The URL to scrape | +| `timeout_secs` | int | No | 120 | Maximum time in seconds to wait for scraping to finish | + +**Returns:** Markdown content of the scraped page as a plain string. + +### apify_run_actor + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | +| `run_input` | dict | No | `{}` | JSON-serializable input for the Actor | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | +| `memory_mbytes` | int | No | Actor default | Memory allocation in MB for the Actor Run | + +**Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. 
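The run metadata comes back as a JSON string, so it can be parsed and chained into `apify_get_dataset_items`. A minimal sketch, assuming the tool call returns that JSON string directly as described above and that `agent` is configured as in the Usage section:

```python
import json

# Run the Actor and capture its run metadata (run_id, status, dataset_id, ...)
run_metadata = json.loads(
    agent.tool.apify_run_actor(
        actor_id="apify/website-content-crawler",
        run_input={"startUrls": [{"url": "https://example.com"}]},
    )
)

# Use the default Dataset ID from the run to fetch the scraped items
items = json.loads(
    agent.tool.apify_get_dataset_items(dataset_id=run_metadata["dataset_id"], limit=25)
)
```

If you only need the items and not the intermediate metadata, `apify_run_actor_and_get_dataset` (below) does both steps in one call.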
+ +### apify_get_dataset_items + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `dataset_id` | string | Yes | β€” | The Apify Dataset ID to fetch items from | +| `limit` | int | No | 100 | Maximum number of items to return | +| `offset` | int | No | 0 | Number of items to skip for pagination | + +**Returns:** JSON string containing an array of Dataset items. + +### apify_run_actor_and_get_dataset + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | +| `run_input` | dict | No | `{}` | JSON-serializable input for the Actor | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | +| `memory_mbytes` | int | No | Actor default | Memory allocation in MB for the Actor Run | +| `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | + +**Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. + +## Troubleshooting + +| Error | Cause | Fix | +|-------|-------|-----| +| `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | +| `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | +| `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in the [Apify Console](https://console.apify.com) | +| `Actor ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | + +## References + +- [Strands Agents Tools](https://strandsagents.com/latest/user-guide/concepts/tools/tools_overview/) +- [Apify Platform](https://apify.com) +- [Apify API Documentation](https://docs.apify.com/api/v2) +- [Apify Store](https://apify.com/store) +- [Apify Python Client](https://docs.apify.com/api/client/python/docs) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index a43a7c85..a272a8bf 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,56 +1,16 @@ -"""Apify platform integration tool for Strands Agents. - -Provides capabilities to run Apify Actors, retrieve Datasets, and scrape URLs -using the Apify platform programmatically. - -Available tools: -- apify_run_actor: Run any Apify Actor by ID with arbitrary input -- apify_get_dataset_items: Fetch items from an Apify Dataset -- apify_run_actor_and_get_dataset: Run an Actor and fetch its Dataset results in one step -- apify_scrape_url: Scrape a URL and return its content as markdown - -Setup Requirements: ------------------- -1. Create an Apify account at https://apify.com -2. Obtain your API token: Apify Console β†’ Settings β†’ API & Integrations β†’ Personal API tokens -3. Install the optional dependency: pip install -e ".[apify]" -4. 
Set the environment variable: - APIFY_API_TOKEN=your_api_token_here - -Usage with Strands Agent: -```python -from strands import Agent -from strands_tools import apify - -agent = Agent(tools=[ - apify.apify_run_actor, - apify.apify_get_dataset_items, - apify.apify_run_actor_and_get_dataset, - apify.apify_scrape_url, -]) - -# Run an Actor -result = agent.tool.apify_run_actor( - actor_id="apify/website-content-crawler", - run_input={"startUrls": [{"url": "https://example.com"}]}, -) - -# Scrape a single URL -content = agent.tool.apify_scrape_url(url="https://example.com") -``` - -!!!!!!!!!!!!! IMPORTANT: !!!!!!!!!!!!! - -Environment Variables: -- APIFY_API_TOKEN: Your Apify API token (required) - Obtain from https://console.apify.com/account/integrations - -Example .env configuration: - APIFY_API_TOKEN=apify_api_1a2B3cD4eF5gH6iJ7kL8m - -!!!!!!!!!!!!! IMPORTANT: !!!!!!!!!!!!! - -See the function docstrings for complete parameter documentation. +"""Core Apify platform tools for Strands Agents. + +Provides the foundational building blocks for interacting with the Apify platform: +run any Actor by ID, fetch Dataset results, and scrape individual URLs. +For domain-specific wrappers see apify_social_media.py and apify_search.py. + +Setup: + 1. Create an Apify account at https://apify.com + 2. Get your API token: Console > Settings > API & Integrations + 3. export APIFY_API_TOKEN=your_token + 4. pip install strands-agents-tools[apify] + +See docs/apify_tool.md for usage examples, parameter reference, and troubleshooting. """ import json From d1a896e46d8596c0dd9bb5d27b6eb80dd55827c3 Mon Sep 17 00:00:00 2001 From: Murat Kaan Meral Date: Thu, 19 Mar 2026 11:36:09 -0400 Subject: [PATCH 04/39] chore: add strands command (#416) --- .github/workflows/strands-command.yml | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 .github/workflows/strands-command.yml diff --git a/.github/workflows/strands-command.yml b/.github/workflows/strands-command.yml new file mode 100644 index 00000000..33874e24 --- /dev/null +++ b/.github/workflows/strands-command.yml @@ -0,0 +1,92 @@ +name: Strands Command Handler + +on: + issue_comment: + types: [created] + workflow_dispatch: + inputs: + issue_id: + description: 'Issue ID to process (can be issue or PR number)' + required: true + type: string + command: + description: 'Strands command to execute' + required: false + type: string + default: '' + session_id: + description: 'Optional session ID to use' + required: false + type: string + default: '' + +jobs: + authorization-check: + if: startsWith(github.event.comment.body, '/strands') || github.event_name == 'workflow_dispatch' + name: Check access + permissions: read-all + runs-on: ubuntu-latest + outputs: + approval-env: ${{ steps.auth.outputs.approval-env }} + steps: + - name: Check Authorization + id: auth + uses: strands-agents/devtools/authorization-check@main + with: + skip-check: ${{ github.event_name == 'workflow_dispatch' }} + username: ${{ github.event.comment.user.login || 'invalid' }} + allowed-roles: 'maintain,triage,write,admin' + + setup-and-process: + needs: [authorization-check] + environment: ${{ needs.authorization-check.outputs.approval-env }} + permissions: + # Needed to create a branch for the Implementer Agent + contents: write + # These both are needed to add the `strands-running` label to issues and prs + issues: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: Parse input + id: parse + uses: 
strands-agents/devtools/strands-command/actions/strands-input-parser@main + with: + issue_id: ${{ inputs.issue_id }} + command: ${{ inputs.command }} + session_id: ${{ inputs.session_id }} + + execute-readonly-agent: + needs: [setup-and-process] + permissions: + contents: read + issues: read + pull-requests: read + id-token: write # Required for OIDC + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + + # Add any steps here to set up the environment for the Agent in your repo + # setup node, setup python, or any other dependencies + + - name: Run Strands Agent + id: agent-runner + uses: strands-agents/devtools/strands-command/actions/strands-agent-runner@main + with: + aws_role_arn: ${{ secrets.AWS_ROLE_ARN }} + sessions_bucket: ${{ secrets.AGENT_SESSIONS_BUCKET }} + write_permission: 'false' + + finalize: + if: always() && (startsWith(github.event.comment.body, '/strands') || github.event_name == 'workflow_dispatch') + needs: [setup-and-process, execute-readonly-agent] + permissions: + contents: write + issues: write + pull-requests: write + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Execute write operations + uses: strands-agents/devtools/strands-command/actions/strands-finalize@main From 4deefa82415691095b87df706c070afc0b451805 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 20 Mar 2026 13:44:40 +0100 Subject: [PATCH 05/39] feat: update Apify tool dependencies and enhance documentation Made-with: Cursor --- README.md | 3 +- docs/apify_tool.md | 16 +-- pyproject.toml | 2 +- src/strands_tools/apify.py | 263 +++++++++++++++++++++++-------------- tests/test_apify.py | 213 ++++++++++++++++++++++-------- 5 files changed, 333 insertions(+), 164 deletions(-) diff --git a/README.md b/README.md index 457bf57b..516b83ac 100644 --- a/README.md +++ b/README.md @@ -964,7 +964,7 @@ result = agent.tool.mongodb_memory( ) ``` -### Apify Core Tools +### Apify ```python from strands import Agent @@ -1000,7 +1000,6 @@ items = agent.tool.apify_get_dataset_items( ) ``` - ## 🌍 Environment Variables Configuration Agents Tools provides extensive customization through environment variables. This allows you to configure tool behavior without modifying code, making it ideal for different environments (development, testing, production). diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 9c930e1c..d4cf3bfd 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,10 +1,6 @@ -# Apify Core Tools +# Apify -The Apify core tools (`apify.py`) provide the foundational building blocks for interacting with the [Apify](https://apify.com) platform from Strands Agents. These generic tools let you run any [Actor](https://apify.com/store) by ID, fetch Dataset results, and scrape individual URLs. - -For higher-level, domain-specific tools see: -- [Apify Social Media Tools](apify_social_media_tool.md) β€” simplified wrappers for Instagram, LinkedIn, Twitter/X, TikTok, and Facebook scrapers -- [Apify Search Tools](apify_search_tool.md) β€” simplified wrappers for Google Search, Google Maps, YouTube, web crawling, and e-commerce scrapers +The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) by ID, fetching Dataset results, and scraping individual URLs. 
## Installation @@ -98,9 +94,9 @@ items = agent.tool.apify_get_dataset_items( | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| | `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | -| `run_input` | dict | No | `{}` | JSON-serializable input for the Actor | +| `run_input` | dict | No | None | JSON-serializable input for the Actor | | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | -| `memory_mbytes` | int | No | Actor default | Memory allocation in MB for the Actor Run | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor Run (uses Actor default if not set) | **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. @@ -119,9 +115,9 @@ items = agent.tool.apify_get_dataset_items( | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| | `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | -| `run_input` | dict | No | `{}` | JSON-serializable input for the Actor | +| `run_input` | dict | No | None | JSON-serializable input for the Actor | | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | -| `memory_mbytes` | int | No | Actor default | Memory allocation in MB for the Actor Run | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor Run (uses Actor default if not set) | | `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | **Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. diff --git a/pyproject.toml b/pyproject.toml index de75e0be..93e05c6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ Documentation = "https://strandsagents.com/" [project.optional-dependencies] apify = [ - "apify-client>=1.0.0", + "apify-client>=2.5.0,<3.0.0", ] build = [ "hatch>=1.16.5", diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index a272a8bf..c85f3cc7 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,22 +1,64 @@ -"""Core Apify platform tools for Strands Agents. - -Provides the foundational building blocks for interacting with the Apify platform: -run any Actor by ID, fetch Dataset results, and scrape individual URLs. -For domain-specific wrappers see apify_social_media.py and apify_search.py. - -Setup: - 1. Create an Apify account at https://apify.com - 2. Get your API token: Console > Settings > API & Integrations - 3. export APIFY_API_TOKEN=your_token - 4. pip install strands-agents-tools[apify] - -See docs/apify_tool.md for usage examples, parameter reference, and troubleshooting. +"""Apify platform tools for Strands Agents. + +This module provides web scraping, data extraction, and automation capabilities +using the Apify platform. It lets you run any Actor by ID, fetch Dataset results, +and scrape individual URLs. + +Key Features: +------------ +1. Actor Execution: + β€’ apify_run_actor: Run any Apify Actor by ID with custom input + β€’ apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step + +2. Data Retrieval: + β€’ apify_get_dataset_items: Fetch items from an Apify Dataset with pagination + β€’ apify_scrape_url: Scrape a single URL and return content as Markdown + +3. 
Error Handling: + β€’ Graceful API error handling with descriptive messages + β€’ Dependency checking (apify-client optional install) + β€’ Timeout management for Actor Runs + +Setup Requirements: +------------------ +1. Create an Apify account at https://apify.com +2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens +3. Install the optional dependency: pip install strands-agents-tools[apify] +4. Set the environment variable: + APIFY_API_TOKEN=your_api_token_here + +Example .env configuration: + APIFY_API_TOKEN=apify_api_1a2B3cD4eF5gH6iJ7kL8m + +Usage Examples: +-------------- +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_run_actor, + apify.apify_get_dataset_items, + apify.apify_run_actor_and_get_dataset, + apify.apify_scrape_url, +]) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) +``` """ import json import logging import os from typing import Any, Dict, List, Optional +from urllib.parse import urlparse from rich.panel import Panel from rich.text import Text @@ -29,6 +71,7 @@ try: from apify_client import ApifyClient + from apify_client.errors import ApifyApiError HAS_APIFY_CLIENT = True except ImportError: @@ -36,6 +79,7 @@ WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} +ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" def _check_dependency() -> None: @@ -44,6 +88,49 @@ def _check_dependency() -> None: raise ImportError("apify-client package is required. Install with: pip install strands-agents-tools[apify]") +def _validate_url(url: str) -> None: + """Raise ValueError if the URL does not have a valid HTTP(S) scheme and domain.""" + parsed = urlparse(url) + if parsed.scheme not in ("http", "https"): + raise ValueError(f"Invalid URL scheme '{parsed.scheme}'. Only http and https URLs are supported.") + if not parsed.netloc: + raise ValueError(f"Invalid URL '{url}'. A domain is required.") + + +def _format_error(e: Exception) -> str: + """Map exceptions to user-friendly error messages, with special handling for ApifyApiError.""" + if HAS_APIFY_CLIENT and isinstance(e, ApifyApiError): + status_code = getattr(e, "status_code", None) + msg = getattr(e, "message", str(e)) + match status_code: + case 401: + return "Authentication failed. Verify your APIFY_API_TOKEN is valid." + case 404: + return f"Resource not found: {msg}" + case 429: + return ( + "Rate limit exceeded. The Apify client retries automatically; " + "if this persists, reduce request frequency." 
+ ) + case _: + return f"Apify API error ({status_code}): {msg}" + return str(e) + + +def _error_result(e: Exception, tool_name: str) -> Dict[str, Any]: + """Build a structured error response and display an error panel.""" + message = _format_error(e) + logger.error("%s failed: %s", tool_name, message) + console.print(Panel(Text(message, style="red"), title=ERROR_PANEL_TITLE, border_style="red")) + return {"status": "error", "content": [{"text": message}]} + + +def _success_result(text: str, panel_body: str, panel_title: str) -> Dict[str, Any]: + """Build a structured success response and display a success panel.""" + console.print(Panel(panel_body, title=f"[bold cyan]{panel_title}[/bold cyan]", border_style="green")) + return {"status": "success", "content": [{"text": text}]} + + class ApifyToolClient: """Helper class encapsulating Apify API interactions via apify-client.""" @@ -56,6 +143,14 @@ def __init__(self) -> None: ) self.client: "ApifyClient" = ApifyClient(token, headers=TRACKING_HEADER) + @staticmethod + def _check_run_status(actor_run: Dict[str, Any], label: str) -> None: + """Raise RuntimeError if the Actor Run did not succeed.""" + status = actor_run.get("status", "UNKNOWN") + if status != "SUCCEEDED": + run_id = actor_run.get("id", "N/A") + raise RuntimeError(f"{label} finished with status {status}. Run ID: {run_id}") + def run_actor( self, actor_id: str, @@ -72,14 +167,11 @@ def run_actor( call_kwargs["memory_mbytes"] = memory_mbytes actor_run = self.client.actor(actor_id).call(**call_kwargs) - - status = actor_run.get("status", "UNKNOWN") - if status not in ("SUCCEEDED",): - raise RuntimeError(f"Actor {actor_id} finished with status {status}. Run ID: {actor_run.get('id', 'N/A')}") + self._check_run_status(actor_run, f"Actor {actor_id}") return { "run_id": actor_run.get("id"), - "status": status, + "status": actor_run.get("status"), "dataset_id": actor_run.get("defaultDatasetId"), "started_at": actor_run.get("startedAt"), "finished_at": actor_run.get("finishedAt"), @@ -116,7 +208,7 @@ def run_actor_and_get_dataset( def scrape_url(self, url: str, timeout_secs: int = 120) -> str: """Scrape a single URL using Website Content Crawler and return markdown.""" - run_input = { + run_input: Dict[str, Any] = { "startUrls": [{"url": url}], "maxCrawlPages": 1, } @@ -124,12 +216,7 @@ def scrape_url(self, url: str, timeout_secs: int = 120) -> str: run_input=run_input, timeout_secs=timeout_secs, ) - - status = actor_run.get("status", "UNKNOWN") - if status not in ("SUCCEEDED",): - raise RuntimeError( - f"Website Content Crawler finished with status {status}. Run ID: {actor_run.get('id', 'N/A')}" - ) + self._check_run_status(actor_run, "Website Content Crawler") dataset_id = actor_run.get("defaultDatasetId") result = self.client.dataset(dataset_id).list_items(limit=1) @@ -150,7 +237,7 @@ def apify_run_actor( run_input: Optional[Dict[str, Any]] = None, timeout_secs: int = 300, memory_mbytes: Optional[int] = None, -) -> str: +) -> Dict[str, Any]: """Run any Apify Actor by its ID or name and return the run metadata as JSON. Executes the Actor synchronously - blocks until the Actor Run finishes or the timeout @@ -169,10 +256,11 @@ def apify_run_actor( memory_mbytes: Memory allocation in MB for the Actor Run. Uses Actor default if not set. Returns: - JSON string with run metadata: run_id, status, dataset_id, started_at, finished_at. + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. 
""" - _check_dependency() try: + _check_dependency() client = ApifyToolClient() result = client.run_actor( actor_id=actor_id, @@ -180,25 +268,19 @@ def apify_run_actor( timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, ) - panel = Panel( - f"[green]Actor Run completed[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}", - title="[bold cyan]Apify: Run Actor[/bold cyan]", - border_style="green", + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor Run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Actor", ) - console.print(panel) - return json.dumps(result, indent=2, default=str) except Exception as e: - error_panel = Panel( - Text(str(e), style="red"), - title="[bold red]Apify Error[/bold red]", - border_style="red", - ) - console.print(error_panel) - raise + return _error_result(e, "apify_run_actor") @tool @@ -206,7 +288,7 @@ def apify_get_dataset_items( dataset_id: str, limit: int = 100, offset: int = 0, -) -> str: +) -> Dict[str, Any]: """Fetch items from an existing Apify Dataset and return them as JSON. Use this after running an Actor to retrieve the structured results from its @@ -218,27 +300,21 @@ def apify_get_dataset_items( offset: Number of items to skip for pagination. Defaults to 0. Returns: - JSON string containing an array of Dataset items. + Dict with status and content containing an array of Dataset items. """ - _check_dependency() try: + _check_dependency() client = ApifyToolClient() items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) - panel = Panel( - f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}", - title="[bold cyan]Apify: Dataset Items[/bold cyan]", - border_style="green", + return _success_result( + text=json.dumps(items, indent=2, default=str), + panel_body=( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" + ), + panel_title="Apify: Dataset Items", ) - console.print(panel) - return json.dumps(items, indent=2, default=str) except Exception as e: - error_panel = Panel( - Text(str(e), style="red"), - title="[bold red]Apify Error[/bold red]", - border_style="red", - ) - console.print(error_panel) - raise + return _error_result(e, "apify_get_dataset_items") @tool @@ -248,7 +324,7 @@ def apify_run_actor_and_get_dataset( timeout_secs: int = 300, memory_mbytes: Optional[int] = None, dataset_items_limit: int = 100, -) -> str: +) -> Dict[str, Any]: """Run an Apify Actor and fetch its Dataset results in one step. Convenience tool that combines running an Actor and fetching its default Dataset @@ -263,11 +339,11 @@ def apify_run_actor_and_get_dataset( dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. Returns: - JSON string with run metadata (run_id, status, dataset_id, started_at, finished_at) - plus an "items" array containing the Dataset results. + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the Dataset results. 
""" - _check_dependency() try: + _check_dependency() client = ApifyToolClient() result = client.run_actor_and_get_dataset( actor_id=actor_id, @@ -276,33 +352,27 @@ def apify_run_actor_and_get_dataset( memory_mbytes=memory_mbytes, dataset_items_limit=dataset_items_limit, ) - panel = Panel( - f"[green]Actor Run completed with dataset[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}", - title="[bold cyan]Apify: Run Actor + Dataset[/bold cyan]", - border_style="green", + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor Run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Actor + Dataset", ) - console.print(panel) - return json.dumps(result, indent=2, default=str) except Exception as e: - error_panel = Panel( - Text(str(e), style="red"), - title="[bold red]Apify Error[/bold red]", - border_style="red", - ) - console.print(error_panel) - raise + return _error_result(e, "apify_run_actor_and_get_dataset") @tool def apify_scrape_url( url: str, timeout_secs: int = 120, -) -> str: +) -> Dict[str, Any]: """Scrape a single URL and return its content as markdown. Uses the Apify Website Content Crawler Actor under the hood, pre-configured for @@ -314,24 +384,19 @@ def apify_scrape_url( timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. Returns: - Markdown content of the scraped page as a plain string. + Dict with status and content containing the markdown content of the scraped page. 
""" - _check_dependency() try: + _validate_url(url) + _check_dependency() client = ApifyToolClient() content = client.scrape_url(url=url, timeout_secs=timeout_secs) - panel = Panel( - f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters", - title="[bold cyan]Apify: Scrape URL[/bold cyan]", - border_style="green", + return _success_result( + text=content, + panel_body=( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" + ), + panel_title="Apify: Scrape URL", ) - console.print(panel) - return content except Exception as e: - error_panel = Panel( - Text(str(e), style="red"), - title="[bold red]Apify Error[/bold red]", - border_style="red", - ) - console.print(error_panel) - raise + return _error_result(e, "apify_scrape_url") diff --git a/tests/test_apify.py b/tests/test_apify.py index 31644288..19ae534b 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -16,7 +16,7 @@ MOCK_ACTOR_RUN = { "id": "run-HG7ml5fB1hCp8YEBA", - "actId": "janedoe~my-scraper", + "actId": "aimee~my-scraper", "userId": "user-abc123", "startedAt": "2026-03-15T14:30:00.000Z", "finishedAt": "2026-03-15T14:35:22.000Z", @@ -52,6 +52,17 @@ } +def _make_apify_api_error(status_code: int, message: str) -> Exception: + """Create an ApifyApiError instance for testing without calling its real __init__.""" + from apify_client.errors import ApifyApiError + + error = ApifyApiError.__new__(ApifyApiError) + Exception.__init__(error, message) + error.status_code = status_code + error.message = message + return error + + @pytest.fixture def mock_apify_client(): """Create a mock ApifyClient with pre-configured responses.""" @@ -109,64 +120,106 @@ def test_client_uses_env_token(mock_apify_env): def test_run_actor_success(mock_apify_env, mock_apify_client): - """Successful Actor Run returns JSON with run metadata.""" + """Successful Actor Run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="janedoe/my-scraper", run_input={"url": "https://example.com"}) + result = apify_run_actor(actor_id="aimee/my-scraper", run_input={"url": "https://example.com"}) - data = json.loads(result) + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" assert data["status"] == "SUCCEEDED" assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" assert "started_at" in data assert "finished_at" in data - mock_apify_client.actor.assert_called_once_with("janedoe/my-scraper") + mock_apify_client.actor.assert_called_once_with("aimee/my-scraper") + + +def test_run_actor_default_input(mock_apify_env, mock_apify_client): + """Actor Run defaults run_input to empty dict when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "success" + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["run_input"] == {} def test_run_actor_with_memory(mock_apify_env, mock_apify_client): """Actor Run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - apify_run_actor(actor_id="janedoe/my-scraper", memory_mbytes=512) + apify_run_actor(actor_id="aimee/my-scraper", memory_mbytes=512) call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs 
assert call_kwargs["memory_mbytes"] == 512 def test_run_actor_failure(mock_apify_env, mock_apify_client): - """Actor Run raises RuntimeError when Actor fails.""" + """Actor Run returns error dict when Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="FAILED"): - apify_run_actor(actor_id="janedoe/my-scraper") + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] def test_run_actor_timeout(mock_apify_env, mock_apify_client): - """Actor Run raises RuntimeError when Actor times out.""" + """Actor Run returns error dict when Actor times out.""" mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="TIMED-OUT"): - apify_run_actor(actor_id="janedoe/my-scraper") + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "TIMED-OUT" in result["content"][0]["text"] def test_run_actor_api_exception(mock_apify_env, mock_apify_client): - """Actor Run re-raises exceptions from the Apify client.""" + """Actor Run returns error dict on API exceptions.""" mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(Exception, match="Connection failed"): - apify_run_actor(actor_id="janedoe/my-scraper") + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "Connection failed" in result["content"][0]["text"] + + +def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): + """Actor Run returns friendly message for 401 authentication errors.""" + error = _make_apify_api_error(401, "Unauthorized") + mock_apify_client.actor.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "Authentication failed" in result["content"][0]["text"] + + +def test_run_actor_apify_api_error_404(mock_apify_env, mock_apify_client): + """Actor Run returns friendly message for 404 not-found errors.""" + error = _make_apify_api_error(404, "Actor not found") + mock_apify_client.actor.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="aimee/nonexistent") + + assert result["status"] == "error" + assert "Resource not found" in result["content"][0]["text"] # --- apify_get_dataset_items --- def test_get_dataset_items_success(mock_apify_env, mock_apify_client): - """Successful dataset retrieval returns JSON array of items.""" + """Successful dataset retrieval returns structured result with items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_get_dataset_items(dataset_id="dataset-WkC9gct8rq1uR5vDZ") - items = json.loads(result) + assert result["status"] == "success" + items = json.loads(result["content"][0]["text"]) assert len(items) == 3 assert items[0]["title"] == "Widget A" assert items[2]["currency"] == "EUR" @@ -182,7 +235,7 @@ def test_get_dataset_items_with_pagination(mock_apify_env, 
mock_apify_client): def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): - """Empty dataset returns an empty JSON array.""" + """Empty dataset returns a structured result with empty JSON array.""" mock_list_result = MagicMock() mock_list_result.items = [] mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result @@ -190,7 +243,8 @@ def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_get_dataset_items(dataset_id="dataset-empty") - items = json.loads(result) + assert result["status"] == "success" + items = json.loads(result["content"][0]["text"]) assert items == [] @@ -198,15 +252,16 @@ def test_get_dataset_items_empty(mock_apify_env, mock_apify_client): def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): - """Combined run + dataset fetch returns run metadata and items.""" + """Combined run + dataset fetch returns structured result with metadata and items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_actor_and_get_dataset( - actor_id="janedoe/my-scraper", + actor_id="aimee/my-scraper", run_input={"url": "https://example.com"}, dataset_items_limit=50, ) - data = json.loads(result) + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" assert data["status"] == "SUCCEEDED" assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" @@ -215,19 +270,21 @@ def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_client): - """Combined tool raises when the Actor fails.""" + """Combined tool returns error dict when the Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="FAILED"): - apify_run_actor_and_get_dataset(actor_id="janedoe/my-scraper") + result = apify_run_actor_and_get_dataset(actor_id="aimee/my-scraper") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] # --- apify_scrape_url --- def test_scrape_url_success(mock_apify_env, mock_apify_client): - """Scrape URL returns markdown content from the crawled page.""" + """Scrape URL returns structured result with markdown content.""" mock_list_result = MagicMock() mock_list_result.items = [MOCK_SCRAPED_ITEM] mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result @@ -235,28 +292,33 @@ def test_scrape_url_success(mock_apify_env, mock_apify_client): with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_scrape_url(url="https://example.com") - assert "Example Domain" in result + assert result["status"] == "success" + assert "Example Domain" in result["content"][0]["text"] mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") def test_scrape_url_no_content(mock_apify_env, mock_apify_client): - """Scrape URL raises when no content is returned.""" + """Scrape URL returns error dict when no content is returned.""" mock_list_result = MagicMock() mock_list_result.items = [] mock_apify_client.dataset.return_value.list_items.return_value = mock_list_result with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, 
match="No content returned"): - apify_scrape_url(url="https://example.com") + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "No content returned" in result["content"][0]["text"] def test_scrape_url_crawler_failure(mock_apify_env, mock_apify_client): - """Scrape URL raises when the crawler Actor fails.""" + """Scrape URL returns error dict when the crawler Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - with pytest.raises(RuntimeError, match="FAILED"): - apify_scrape_url(url="https://example.com") + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] def test_scrape_url_falls_back_to_text(mock_apify_env, mock_apify_client): @@ -269,52 +331,99 @@ def test_scrape_url_falls_back_to_text(mock_apify_env, mock_apify_client): with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_scrape_url(url="https://example.com") - assert result == "Plain text content" + assert result["status"] == "success" + assert result["content"][0]["text"] == "Plain text content" + + +def test_scrape_url_invalid_url_scheme(mock_apify_env): + """apify_scrape_url returns error for invalid URL scheme.""" + result = apify_scrape_url(url="ftp://example.com") + + assert result["status"] == "error" + assert "Invalid URL scheme" in result["content"][0]["text"] + + +def test_scrape_url_missing_scheme(mock_apify_env): + """apify_scrape_url returns error for URL without http/https scheme.""" + result = apify_scrape_url(url="example.com") + + assert result["status"] == "error" + assert "Invalid URL scheme" in result["content"][0]["text"] # --- Dependency guard --- def test_missing_apify_client_run_actor(mock_apify_env): - """apify_run_actor raises ImportError when apify-client is not installed.""" + """apify_run_actor returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - with pytest.raises(ImportError, match="apify-client"): - apify_run_actor(actor_id="test/actor") + result = apify_run_actor(actor_id="test/actor") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] def test_missing_apify_client_get_dataset(mock_apify_env): - """apify_get_dataset_items raises ImportError when apify-client is not installed.""" + """apify_get_dataset_items returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - with pytest.raises(ImportError, match="apify-client"): - apify_get_dataset_items(dataset_id="dataset-123") + result = apify_get_dataset_items(dataset_id="dataset-123") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] def test_missing_apify_client_run_and_get(mock_apify_env): - """apify_run_actor_and_get_dataset raises ImportError when apify-client is not installed.""" + """apify_run_actor_and_get_dataset returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - with pytest.raises(ImportError, match="apify-client"): - apify_run_actor_and_get_dataset(actor_id="test/actor") + result = apify_run_actor_and_get_dataset(actor_id="test/actor") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] def 
test_missing_apify_client_scrape_url(mock_apify_env): - """apify_scrape_url raises ImportError when apify-client is not installed.""" + """apify_scrape_url returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - with pytest.raises(ImportError, match="apify-client"): - apify_scrape_url(url="https://example.com") + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] # --- Missing token from tool entry points --- def test_run_actor_missing_token(monkeypatch): - """apify_run_actor raises ValueError when APIFY_API_TOKEN is missing.""" + """apify_run_actor returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) - with pytest.raises(ValueError, match="APIFY_API_TOKEN"): - apify_run_actor(actor_id="test/actor") + result = apify_run_actor(actor_id="test/actor") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_get_dataset_items_missing_token(monkeypatch): + """apify_get_dataset_items returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_get_dataset_items(dataset_id="dataset-123") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_missing_token(monkeypatch): + """apify_run_actor_and_get_dataset returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_actor_and_get_dataset(actor_id="test/actor") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] def test_scrape_url_missing_token(monkeypatch): - """apify_scrape_url raises ValueError when APIFY_API_TOKEN is missing.""" + """apify_scrape_url returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) - with pytest.raises(ValueError, match="APIFY_API_TOKEN"): - apify_scrape_url(url="https://example.com") + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] From dd2d6fda98e6edb387a33bab4e7a7c74f6b287d6 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 20 Mar 2026 14:26:51 +0100 Subject: [PATCH 06/39] feat: add task execution tools to Apify integration and create unit tests for it --- src/strands_tools/apify.py | 160 ++++++++++++++++++++++++++++++++++++- tests/test_apify.py | 141 ++++++++++++++++++++++++++++++++ 2 files changed, 299 insertions(+), 2 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index c85f3cc7..9f707134 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -10,11 +10,15 @@ β€’ apify_run_actor: Run any Apify Actor by ID with custom input β€’ apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step -2. Data Retrieval: +2. Task Execution: + β€’ apify_run_task: Run a saved Actor Task by ID with optional input overrides + β€’ apify_run_task_and_get_dataset: Run a Task and fetch results in one step + +3. Data Retrieval: β€’ apify_get_dataset_items: Fetch items from an Apify Dataset with pagination β€’ apify_scrape_url: Scrape a single URL and return content as Markdown -3. Error Handling: +4. 
Error Handling: β€’ Graceful API error handling with descriptive messages β€’ Dependency checking (apify-client optional install) β€’ Timeout management for Actor Runs @@ -38,8 +42,10 @@ agent = Agent(tools=[ apify.apify_run_actor, + apify.apify_run_task, apify.apify_get_dataset_items, apify.apify_run_actor_and_get_dataset, + apify.apify_run_task_and_get_dataset, apify.apify_scrape_url, ]) @@ -162,6 +168,7 @@ def run_actor( call_kwargs: Dict[str, Any] = { "run_input": run_input or {}, "timeout_secs": timeout_secs, + "logger": None, } if memory_mbytes is not None: call_kwargs["memory_mbytes"] = memory_mbytes @@ -203,6 +210,56 @@ def run_actor_and_get_dataset( memory_mbytes=memory_mbytes, ) dataset_id = run_metadata["dataset_id"] + if not dataset_id: + raise RuntimeError(f"Actor {actor_id} run has no default Dataset.") + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) + return {**run_metadata, "items": items} + + def run_task( + self, + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + ) -> Dict[str, Any]: + """Run an Apify Task synchronously and return run metadata.""" + call_kwargs: Dict[str, Any] = {"timeout_secs": timeout_secs} + if task_input is not None: + call_kwargs["task_input"] = task_input + if memory_mbytes is not None: + call_kwargs["memory_mbytes"] = memory_mbytes + + task_run = self.client.task(task_id).call(**call_kwargs) + if task_run is None: + raise RuntimeError(f"Task {task_id} returned no run data (possible wait timeout).") + self._check_run_status(task_run, f"Task {task_id}") + + return { + "run_id": task_run.get("id"), + "status": task_run.get("status"), + "dataset_id": task_run.get("defaultDatasetId"), + "started_at": task_run.get("startedAt"), + "finished_at": task_run.get("finishedAt"), + } + + def run_task_and_get_dataset( + self, + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = 100, + ) -> Dict[str, Any]: + """Run a Task synchronously, then fetch its default Dataset items.""" + run_metadata = self.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + dataset_id = run_metadata["dataset_id"] + if not dataset_id: + raise RuntimeError(f"Task {task_id} run has no default Dataset.") items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) return {**run_metadata, "items": items} @@ -215,6 +272,7 @@ def scrape_url(self, url: str, timeout_secs: int = 120) -> str: actor_run = self.client.actor(WEBSITE_CONTENT_CRAWLER).call( run_input=run_input, timeout_secs=timeout_secs, + logger=None, ) self._check_run_status(actor_run, "Website Content Crawler") @@ -368,6 +426,104 @@ def apify_run_actor_and_get_dataset( return _error_result(e, "apify_run_actor_and_get_dataset") +@tool +def apify_run_task( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, +) -> Dict[str, Any]: + """Run an Apify Task by its ID or name and return the run metadata as JSON. + + Tasks are saved Actor configurations with preset inputs. Use this when a Task + has already been configured in the Apify Console, so you don't need to specify + the full Actor input every time. + + Args: + task_id: Task identifier, e.g. "janedoe~my-task" or a Task ID string. 
+ task_input: Optional JSON-serializable input to override the Task's default input. + timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Task Run. Uses Task default if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task Run completed[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Task", + ) + except Exception as e: + return _error_result(e, "apify_run_task") + + +@tool +def apify_run_task_and_get_dataset( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = 300, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = 100, +) -> Dict[str, Any]: + """Run an Apify Task and fetch its Dataset results in one step. + + Convenience tool that combines running a Task and fetching its default Dataset + items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + task_id: Task identifier, e.g. "janedoe~my-task" or a Task ID string. + task_input: Optional JSON-serializable input to override the Task's default input. + timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Task Run. + dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the Dataset results. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task_and_get_dataset( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task Run completed with dataset[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Task + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_task_and_get_dataset") + + @tool def apify_scrape_url( url: str, diff --git a/tests/test_apify.py b/tests/test_apify.py index 19ae534b..f4ed99f0 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -11,6 +11,8 @@ apify_get_dataset_items, apify_run_actor, apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, apify_scrape_url, ) @@ -72,6 +74,10 @@ def mock_apify_client(): mock_actor.call.return_value = MOCK_ACTOR_RUN client.actor.return_value = mock_actor + mock_task = MagicMock() + mock_task.call.return_value = MOCK_ACTOR_RUN + client.task.return_value = mock_task + mock_dataset = MagicMock() mock_list_result = MagicMock() mock_list_result.items = MOCK_DATASET_ITEMS @@ -280,6 +286,105 @@ def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_clie assert "FAILED" in result["content"][0]["text"] +# --- apify_run_task --- + + +def test_run_task_success(mock_apify_env, mock_apify_client): + """Successful Task Run returns structured result with run metadata.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task", task_input={"query": "test"}) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert data["status"] == "SUCCEEDED" + assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" + mock_apify_client.task.assert_called_once_with("janedoe~my-task") + + +def test_run_task_no_input(mock_apify_env, mock_apify_client): + """Task Run omits task_input kwarg when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "success" + call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs + assert "task_input" not in call_kwargs + + +def test_run_task_with_memory(mock_apify_env, mock_apify_client): + """Task Run passes memory_mbytes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_run_task(task_id="janedoe~my-task", memory_mbytes=1024) + + call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs + assert call_kwargs["memory_mbytes"] == 1024 + + +def test_run_task_failure(mock_apify_env, mock_apify_client): + """Task Run returns error dict when Task fails.""" + mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +def test_run_task_none_response(mock_apify_env, mock_apify_client): + """Task Run returns error dict when TaskClient.call() 
returns None.""" + mock_apify_client.task.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + +def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): + """Task Run returns friendly message for 401 authentication errors.""" + error = _make_apify_api_error(401, "Unauthorized") + mock_apify_client.task.return_value.call.side_effect = error + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "Authentication failed" in result["content"][0]["text"] + + +# --- apify_run_task_and_get_dataset --- + + +def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): + """Combined Task run + dataset fetch returns structured result with metadata and items.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset( + task_id="janedoe~my-task", + task_input={"query": "test"}, + dataset_items_limit=50, + ) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert len(data["items"]) == 3 + assert data["items"][0]["title"] == "Widget A" + + +def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client): + """Combined Task tool returns error dict when the Task fails.""" + mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + # --- apify_scrape_url --- @@ -381,6 +486,24 @@ def test_missing_apify_client_run_and_get(mock_apify_env): assert "apify-client" in result["content"][0]["text"] +def test_missing_apify_client_run_task(mock_apify_env): + """apify_run_task returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +def test_missing_apify_client_run_task_and_get(mock_apify_env): + """apify_run_task_and_get_dataset returns error dict when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + def test_missing_apify_client_scrape_url(mock_apify_env): """apify_scrape_url returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): @@ -420,6 +543,24 @@ def test_run_actor_and_get_dataset_missing_token(monkeypatch): assert "APIFY_API_TOKEN" in result["content"][0]["text"] +def test_run_task_missing_token(monkeypatch): + """apify_run_task returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_task(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def 
test_run_task_and_get_dataset_missing_token(monkeypatch): + """apify_run_task_and_get_dataset returns error dict when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + def test_scrape_url_missing_token(monkeypatch): """apify_scrape_url returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) From f823eaef9680e359b4a6884c2854f9bbfae3041e Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 20 Mar 2026 14:58:27 +0100 Subject: [PATCH 07/39] feat: edit docs for apify tools --- README.md | 14 ++++++++++ docs/apify_tool.md | 56 ++++++++++++++++++++++++++++++++++++-- src/strands_tools/apify.py | 4 +-- tests/test_apify.py | 50 +++++++++++++++++----------------- 4 files changed, 95 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 516b83ac..45a3d6e8 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,8 @@ Below is a comprehensive table of all available tools, how to use them with an a | apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor by ID with arbitrary input | | apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify Dataset | | apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its Dataset results in one step | +| apify_run_task | `agent.tool.apify_run_task(task_id="user~my-task")` | Run a saved Apify Task by ID with optional input overrides | +| apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=50)` | Run a Task and fetch its Dataset results in one step | | apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | @@ -972,8 +974,10 @@ from strands_tools import apify agent = Agent(tools=[ apify.apify_run_actor, + apify.apify_run_task, apify.apify_get_dataset_items, apify.apify_run_actor_and_get_dataset, + apify.apify_run_task_and_get_dataset, apify.apify_scrape_url, ]) @@ -987,6 +991,16 @@ result = agent.tool.apify_run_actor_and_get_dataset( dataset_items_limit=50, ) +# Run a saved Task (pre-configured Actor with default inputs) +run_info = agent.tool.apify_run_task(task_id="user~my-task") + +# Run a Task and get results in one step +result = agent.tool.apify_run_task_and_get_dataset( + task_id="user~my-task", + task_input={"query": "override default input"}, + dataset_items_limit=50, +) + # Run an Actor (get metadata only) run_info = agent.tool.apify_run_actor( actor_id="apify/google-search-scraper", diff --git a/docs/apify_tool.md b/docs/apify_tool.md index d4cf3bfd..2a436246 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,6 +1,6 @@ # Apify -The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the 
[Apify](https://apify.com) platform — running any [Actor](https://apify.com/store) by ID, fetching Dataset results, and scraping individual URLs.
+The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform — running any [Actor](https://apify.com/store) or [Task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching Dataset results, and scraping individual URLs.
 
 ## Installation
 
@@ -26,9 +26,11 @@ from strands_tools import apify
 
 agent = Agent(tools=[
     apify.apify_run_actor,
+    apify.apify_run_task,
     apify.apify_scrape_url,
     apify.apify_get_dataset_items,
     apify.apify_run_actor_and_get_dataset,
+    apify.apify_run_task_and_get_dataset,
 ])
 ```
 
@@ -66,6 +68,31 @@ result = agent.tool.apify_run_actor_and_get_dataset(
 )
 ```
 
+### Run a Task
+
+Execute a saved [Actor Task](https://docs.apify.com/platform/actors/running/tasks) — a pre-configured Actor with preset inputs. Use this when a Task has already been set up in the Apify Console:
+
+```python
+result = agent.tool.apify_run_task(
+    task_id="user~my-task",
+    task_input={"query": "override input"},
+    timeout_secs=300,
+)
+```
+
+The call returns a structured tool result; its content text is a JSON document with the run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`.
+
+### Run a Task and Get Results
+
+Combine running a Task and fetching its Dataset results in a single call:
+
+```python
+result = agent.tool.apify_run_task_and_get_dataset(
+    task_id="user~my-task",
+    dataset_items_limit=50,
+)
+```
+
 ### Fetch Dataset Items
 
 Retrieve results from a Dataset by its ID. Useful after running an Actor to get the structured results separately, or to access any existing Dataset:
@@ -100,6 +127,29 @@ items = agent.tool.apify_get_dataset_items(
 
 **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`.
 
+### apify_run_task
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `task_id` | string | Yes | — | Task identifier (e.g., `user~my-task` or a Task ID) |
+| `task_input` | dict | No | None | JSON-serializable input to override the Task's default input |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Task Run to finish |
+| `memory_mbytes` | int | No | None | Memory allocation in MB for the Task Run (uses Task default if not set) |
+
+**Returns:** Structured tool result whose content text is JSON with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`.
+
+### apify_run_task_and_get_dataset
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `task_id` | string | Yes | — | Task identifier (e.g., `user~my-task` or a Task ID) |
+| `task_input` | dict | No | None | JSON-serializable input to override the Task's default input |
+| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Task Run to finish |
+| `memory_mbytes` | int | No | None | Memory allocation in MB for the Task Run (uses Task default if not set) |
+| `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return |
+
+**Returns:** Structured tool result whose content text is JSON with run metadata plus an `items` array containing the Dataset results.
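+
+As a rough sketch of consuming this result (assuming the tool's structured return is passed through to the caller unchanged; the Task ID below is only a placeholder), the run metadata and items can be unpacked like this:
+
+```python
+import json
+
+result = agent.tool.apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=50)
+if result["status"] == "success":
+    data = json.loads(result["content"][0]["text"])  # run metadata serialized as JSON
+    print(data["run_id"], data["dataset_id"])
+    for item in data["items"]:  # one dict per Dataset item
+        print(item)
+else:
+    print("Apify error:", result["content"][0]["text"])  # human-readable error message
+```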
+ ### apify_get_dataset_items | Parameter | Type | Required | Default | Description | @@ -129,7 +179,9 @@ items = agent.tool.apify_get_dataset_items( | `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | | `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | | `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in the [Apify Console](https://console.apify.com) | -| `Actor ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `Task ... finished with status FAILED` | Task execution error | Check Task configuration and run logs in the [Apify Console](https://console.apify.com) | +| `Actor/Task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `Task ... returned no run data` | Task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | | `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | ## References diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 9f707134..6fc61eba 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -440,7 +440,7 @@ def apify_run_task( the full Actor input every time. Args: - task_id: Task identifier, e.g. "janedoe~my-task" or a Task ID string. + task_id: Task identifier, e.g. "user~my-task" or a Task ID string. task_input: Optional JSON-serializable input to override the Task's default input. timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Task Run. Uses Task default if not set. @@ -488,7 +488,7 @@ def apify_run_task_and_get_dataset( result data without making two separate tool calls. Args: - task_id: Task identifier, e.g. "janedoe~my-task" or a Task ID string. + task_id: Task identifier, e.g. "user~my-task" or a Task ID string. task_input: Optional JSON-serializable input to override the Task's default input. timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Task Run. 
diff --git a/tests/test_apify.py b/tests/test_apify.py index f4ed99f0..a88a085b 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -18,7 +18,7 @@ MOCK_ACTOR_RUN = { "id": "run-HG7ml5fB1hCp8YEBA", - "actId": "aimee~my-scraper", + "actId": "actor~my-scraper", "userId": "user-abc123", "startedAt": "2026-03-15T14:30:00.000Z", "finishedAt": "2026-03-15T14:35:22.000Z", @@ -128,7 +128,7 @@ def test_client_uses_env_token(mock_apify_env): def test_run_actor_success(mock_apify_env, mock_apify_client): """Successful Actor Run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper", run_input={"url": "https://example.com"}) + result = apify_run_actor(actor_id="actor/my-scraper", run_input={"url": "https://example.com"}) assert result["status"] == "success" data = json.loads(result["content"][0]["text"]) @@ -137,13 +137,13 @@ def test_run_actor_success(mock_apify_env, mock_apify_client): assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" assert "started_at" in data assert "finished_at" in data - mock_apify_client.actor.assert_called_once_with("aimee/my-scraper") + mock_apify_client.actor.assert_called_once_with("actor/my-scraper") def test_run_actor_default_input(mock_apify_env, mock_apify_client): """Actor Run defaults run_input to empty dict when not provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "success" call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs @@ -153,7 +153,7 @@ def test_run_actor_default_input(mock_apify_env, mock_apify_client): def test_run_actor_with_memory(mock_apify_env, mock_apify_client): """Actor Run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - apify_run_actor(actor_id="aimee/my-scraper", memory_mbytes=512) + apify_run_actor(actor_id="actor/my-scraper", memory_mbytes=512) call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs assert call_kwargs["memory_mbytes"] == 512 @@ -164,7 +164,7 @@ def test_run_actor_failure(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "error" assert "FAILED" in result["content"][0]["text"] @@ -175,7 +175,7 @@ def test_run_actor_timeout(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "error" assert "TIMED-OUT" in result["content"][0]["text"] @@ -186,7 +186,7 @@ def test_run_actor_api_exception(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "error" assert "Connection failed" in 
result["content"][0]["text"] @@ -198,7 +198,7 @@ def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.side_effect = error with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/my-scraper") + result = apify_run_actor(actor_id="actor/my-scraper") assert result["status"] == "error" assert "Authentication failed" in result["content"][0]["text"] @@ -210,7 +210,7 @@ def test_run_actor_apify_api_error_404(mock_apify_env, mock_apify_client): mock_apify_client.actor.return_value.call.side_effect = error with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor(actor_id="aimee/nonexistent") + result = apify_run_actor(actor_id="actor/nonexistent") assert result["status"] == "error" assert "Resource not found" in result["content"][0]["text"] @@ -261,7 +261,7 @@ def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): """Combined run + dataset fetch returns structured result with metadata and items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_actor_and_get_dataset( - actor_id="aimee/my-scraper", + actor_id="actor/my-scraper", run_input={"url": "https://example.com"}, dataset_items_limit=50, ) @@ -280,7 +280,7 @@ def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_clie mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_actor_and_get_dataset(actor_id="aimee/my-scraper") + result = apify_run_actor_and_get_dataset(actor_id="actor/my-scraper") assert result["status"] == "error" assert "FAILED" in result["content"][0]["text"] @@ -292,20 +292,20 @@ def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_clie def test_run_task_success(mock_apify_env, mock_apify_client): """Successful Task Run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task", task_input={"query": "test"}) + result = apify_run_task(task_id="user~my-task", task_input={"query": "test"}) assert result["status"] == "success" data = json.loads(result["content"][0]["text"]) assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" assert data["status"] == "SUCCEEDED" assert data["dataset_id"] == "dataset-WkC9gct8rq1uR5vDZ" - mock_apify_client.task.assert_called_once_with("janedoe~my-task") + mock_apify_client.task.assert_called_once_with("user~my-task") def test_run_task_no_input(mock_apify_env, mock_apify_client): """Task Run omits task_input kwarg when not provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "success" call_kwargs = mock_apify_client.task.return_value.call.call_args.kwargs @@ -315,7 +315,7 @@ def test_run_task_no_input(mock_apify_env, mock_apify_client): def test_run_task_with_memory(mock_apify_env, mock_apify_client): """Task Run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - apify_run_task(task_id="janedoe~my-task", memory_mbytes=1024) + apify_run_task(task_id="user~my-task", memory_mbytes=1024) call_kwargs = 
mock_apify_client.task.return_value.call.call_args.kwargs assert call_kwargs["memory_mbytes"] == 1024 @@ -326,7 +326,7 @@ def test_run_task_failure(mock_apify_env, mock_apify_client): mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "FAILED" in result["content"][0]["text"] @@ -337,7 +337,7 @@ def test_run_task_none_response(mock_apify_env, mock_apify_client): mock_apify_client.task.return_value.call.return_value = None with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "no run data" in result["content"][0]["text"] @@ -349,7 +349,7 @@ def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): mock_apify_client.task.return_value.call.side_effect = error with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "Authentication failed" in result["content"][0]["text"] @@ -362,7 +362,7 @@ def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): """Combined Task run + dataset fetch returns structured result with metadata and items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_task_and_get_dataset( - task_id="janedoe~my-task", + task_id="user~my-task", task_input={"query": "test"}, dataset_items_limit=50, ) @@ -379,7 +379,7 @@ def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): - result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + result = apify_run_task_and_get_dataset(task_id="user~my-task") assert result["status"] == "error" assert "FAILED" in result["content"][0]["text"] @@ -489,7 +489,7 @@ def test_missing_apify_client_run_and_get(mock_apify_env): def test_missing_apify_client_run_task(mock_apify_env): """apify_run_task returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - result = apify_run_task(task_id="janedoe~my-task") + result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "apify-client" in result["content"][0]["text"] @@ -498,7 +498,7 @@ def test_missing_apify_client_run_task(mock_apify_env): def test_missing_apify_client_run_task_and_get(mock_apify_env): """apify_run_task_and_get_dataset returns error dict when apify-client is not installed.""" with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): - result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + result = apify_run_task_and_get_dataset(task_id="user~my-task") assert result["status"] == "error" assert "apify-client" in result["content"][0]["text"] @@ -546,7 +546,7 @@ def test_run_actor_and_get_dataset_missing_token(monkeypatch): def test_run_task_missing_token(monkeypatch): """apify_run_task returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) - result = apify_run_task(task_id="janedoe~my-task") + 
result = apify_run_task(task_id="user~my-task") assert result["status"] == "error" assert "APIFY_API_TOKEN" in result["content"][0]["text"] @@ -555,7 +555,7 @@ def test_run_task_missing_token(monkeypatch): def test_run_task_and_get_dataset_missing_token(monkeypatch): """apify_run_task_and_get_dataset returns error dict when APIFY_API_TOKEN is missing.""" monkeypatch.delenv("APIFY_API_TOKEN", raising=False) - result = apify_run_task_and_get_dataset(task_id="janedoe~my-task") + result = apify_run_task_and_get_dataset(task_id="user~my-task") assert result["status"] == "error" assert "APIFY_API_TOKEN" in result["content"][0]["text"] From bcf09508f52cccc40aad003bb7158c23768e0c11 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 23 Mar 2026 18:13:27 +0100 Subject: [PATCH 08/39] feat: enhance Apify tool with validation methods and default parameters --- src/strands_tools/apify.py | 150 +++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 31 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 6fc61eba..bdac5124 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -86,6 +86,13 @@ WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" +DEFAULT_TIMEOUT_SECS = 300 +DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +DEFAULT_DATASET_ITEMS_LIMIT = 100 +VALID_CRAWLER_TYPES = ("playwright:adaptive", "playwright:firefox", "cheerio") + + +# --- Helper functions --- def _check_dependency() -> None: @@ -94,25 +101,22 @@ def _check_dependency() -> None: raise ImportError("apify-client package is required. Install with: pip install strands-agents-tools[apify]") -def _validate_url(url: str) -> None: - """Raise ValueError if the URL does not have a valid HTTP(S) scheme and domain.""" - parsed = urlparse(url) - if parsed.scheme not in ("http", "https"): - raise ValueError(f"Invalid URL scheme '{parsed.scheme}'. Only http and https URLs are supported.") - if not parsed.netloc: - raise ValueError(f"Invalid URL '{url}'. A domain is required.") - - def _format_error(e: Exception) -> str: """Map exceptions to user-friendly error messages, with special handling for ApifyApiError.""" if HAS_APIFY_CLIENT and isinstance(e, ApifyApiError): status_code = getattr(e, "status_code", None) msg = getattr(e, "message", str(e)) match status_code: + case 400: + return f"Invalid request: {msg}" case 401: return "Authentication failed. Verify your APIFY_API_TOKEN is valid." + case 402: + return "Insufficient Apify plan credits or subscription limits exceeded." case 404: return f"Resource not found: {msg}" + case 408: + return f"Actor Run timed out: {msg}" case 429: return ( "Rate limit exceeded. The Apify client retries automatically; " @@ -157,23 +161,60 @@ def _check_run_status(actor_run: Dict[str, Any], label: str) -> None: run_id = actor_run.get("id", "N/A") raise RuntimeError(f"{label} finished with status {status}. Run ID: {run_id}") + @staticmethod + def _validate_url(url: str) -> None: + """Raise ValueError if the URL does not have a valid HTTP(S) scheme and domain.""" + parsed = urlparse(url) + if parsed.scheme not in ("http", "https"): + raise ValueError(f"Invalid URL scheme '{parsed.scheme}'. Only http and https URLs are supported.") + if not parsed.netloc: + raise ValueError(f"Invalid URL '{url}'. 
A domain is required.") + + @staticmethod + def _validate_identifier(value: str, name: str) -> None: + """Raise ValueError if a required string identifier is empty or whitespace-only.""" + if not value.strip(): + raise ValueError(f"'{name}' must be a non-empty string.") + + @staticmethod + def _validate_positive(value: int, name: str) -> None: + """Raise ValueError if the value is not a positive integer (> 0).""" + if value <= 0: + raise ValueError(f"'{name}' must be a positive integer, got {value}.") + + @staticmethod + def _validate_non_negative(value: int, name: str) -> None: + """Raise ValueError if the value is negative.""" + if value < 0: + raise ValueError(f"'{name}' must be a non-negative integer, got {value}.") + def run_actor( self, actor_id: str, run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, + build: Optional[str] = None, ) -> Dict[str, Any]: """Run an Apify Actor synchronously and return run metadata.""" + self._validate_identifier(actor_id, "actor_id") + self._validate_positive(timeout_secs, "timeout_secs") + if memory_mbytes is not None: + self._validate_positive(memory_mbytes, "memory_mbytes") + call_kwargs: Dict[str, Any] = { "run_input": run_input or {}, "timeout_secs": timeout_secs, - "logger": None, + "logger": None, # Suppress verbose apify-client logging not useful to end users } if memory_mbytes is not None: call_kwargs["memory_mbytes"] = memory_mbytes + if build is not None: + call_kwargs["build"] = build actor_run = self.client.actor(actor_id).call(**call_kwargs) + if actor_run is None: + raise RuntimeError(f"Actor {actor_id} returned no run data (possible wait timeout).") self._check_run_status(actor_run, f"Actor {actor_id}") return { @@ -187,10 +228,14 @@ def run_actor( def get_dataset_items( self, dataset_id: str, - limit: int = 100, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0, ) -> List[Dict[str, Any]]: """Fetch items from an Apify Dataset.""" + self._validate_identifier(dataset_id, "dataset_id") + self._validate_positive(limit, "limit") + self._validate_non_negative(offset, "offset") + result = self.client.dataset(dataset_id).list_items(limit=limit, offset=offset) return list(result.items) @@ -198,31 +243,42 @@ def run_actor_and_get_dataset( self, actor_id: str, run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, - dataset_items_limit: int = 100, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, ) -> Dict[str, Any]: """Run an Actor synchronously, then fetch its default Dataset items.""" + self._validate_positive(dataset_items_limit, "dataset_items_limit") + self._validate_non_negative(dataset_items_offset, "dataset_items_offset") + run_metadata = self.run_actor( actor_id=actor_id, run_input=run_input, timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, + build=build, ) dataset_id = run_metadata["dataset_id"] if not dataset_id: raise RuntimeError(f"Actor {actor_id} run has no default Dataset.") - items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) return {**run_metadata, "items": items} def run_task( self, task_id: str, task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = 
DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: """Run an Apify Task synchronously and return run metadata.""" + self._validate_identifier(task_id, "task_id") + self._validate_positive(timeout_secs, "timeout_secs") + if memory_mbytes is not None: + self._validate_positive(memory_mbytes, "memory_mbytes") + call_kwargs: Dict[str, Any] = {"timeout_secs": timeout_secs} if task_input is not None: call_kwargs["task_input"] = task_input @@ -246,11 +302,15 @@ def run_task_and_get_dataset( self, task_id: str, task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, - dataset_items_limit: int = 100, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, ) -> Dict[str, Any]: """Run a Task synchronously, then fetch its default Dataset items.""" + self._validate_positive(dataset_items_limit, "dataset_items_limit") + self._validate_non_negative(dataset_items_offset, "dataset_items_offset") + run_metadata = self.run_task( task_id=task_id, task_input=task_input, @@ -260,19 +320,32 @@ def run_task_and_get_dataset( dataset_id = run_metadata["dataset_id"] if not dataset_id: raise RuntimeError(f"Task {task_id} run has no default Dataset.") - items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit) + items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) return {**run_metadata, "items": items} - def scrape_url(self, url: str, timeout_secs: int = 120) -> str: + def scrape_url( + self, + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: str = "cheerio", + ) -> str: """Scrape a single URL using Website Content Crawler and return markdown.""" + self._validate_url(url) + self._validate_positive(timeout_secs, "timeout_secs") + if crawler_type not in VALID_CRAWLER_TYPES: + raise ValueError( + f"Invalid crawler_type '{crawler_type}'. Must be one of: {', '.join(VALID_CRAWLER_TYPES)}." + ) + run_input: Dict[str, Any] = { "startUrls": [{"url": url}], "maxCrawlPages": 1, + "crawlerType": crawler_type, } actor_run = self.client.actor(WEBSITE_CONTENT_CRAWLER).call( run_input=run_input, timeout_secs=timeout_secs, - logger=None, + logger=None, # Suppress verbose apify-client logging not useful to end users ) self._check_run_status(actor_run, "Website Content Crawler") @@ -293,8 +366,9 @@ def scrape_url(self, url: str, timeout_secs: int = 120) -> str: def apify_run_actor( actor_id: str, run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, + build: Optional[str] = None, ) -> Dict[str, Any]: """Run any Apify Actor by its ID or name and return the run metadata as JSON. @@ -312,6 +386,7 @@ def apify_run_actor( run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor Run. Uses Actor default if not set. + build: Actor Build tag or number to run a specific version. Uses latest Build if not set. 
Returns: Dict with status and content containing run metadata: run_id, status, dataset_id, @@ -325,6 +400,7 @@ def apify_run_actor( run_input=run_input, timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, + build=build, ) return _success_result( text=json.dumps(result, indent=2, default=str), @@ -344,7 +420,7 @@ def apify_run_actor( @tool def apify_get_dataset_items( dataset_id: str, - limit: int = 100, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0, ) -> Dict[str, Any]: """Fetch items from an existing Apify Dataset and return them as JSON. @@ -379,9 +455,11 @@ def apify_get_dataset_items( def apify_run_actor_and_get_dataset( actor_id: str, run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, - dataset_items_limit: int = 100, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, ) -> Dict[str, Any]: """Run an Apify Actor and fetch its Dataset results in one step. @@ -394,7 +472,9 @@ def apify_run_actor_and_get_dataset( run_input: JSON-serializable input for the Actor. timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor Run. + build: Actor Build tag or number to run a specific version. Uses latest Build if not set. dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. + dataset_items_offset: Number of Dataset items to skip for pagination. Defaults to 0. Returns: Dict with status and content containing run metadata (run_id, status, dataset_id, @@ -408,7 +488,9 @@ def apify_run_actor_and_get_dataset( run_input=run_input, timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, + build=build, dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, ) return _success_result( text=json.dumps(result, indent=2, default=str), @@ -430,7 +512,7 @@ def apify_run_actor_and_get_dataset( def apify_run_task( task_id: str, task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: """Run an Apify Task by its ID or name and return the run metadata as JSON. @@ -477,9 +559,10 @@ def apify_run_task( def apify_run_task_and_get_dataset( task_id: str, task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = 300, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, - dataset_items_limit: int = 100, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, ) -> Dict[str, Any]: """Run an Apify Task and fetch its Dataset results in one step. @@ -493,6 +576,7 @@ def apify_run_task_and_get_dataset( timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Task Run. dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. + dataset_items_offset: Number of Dataset items to skip for pagination. Defaults to 0. 
Returns: Dict with status and content containing run metadata (run_id, status, dataset_id, @@ -507,6 +591,7 @@ def apify_run_task_and_get_dataset( timeout_secs=timeout_secs, memory_mbytes=memory_mbytes, dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, ) return _success_result( text=json.dumps(result, indent=2, default=str), @@ -527,7 +612,8 @@ def apify_run_task_and_get_dataset( @tool def apify_scrape_url( url: str, - timeout_secs: int = 120, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: str = "cheerio", ) -> Dict[str, Any]: """Scrape a single URL and return its content as markdown. @@ -538,15 +624,17 @@ def apify_scrape_url( Args: url: The URL to scrape, e.g. "https://example.com". timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + crawler_type: Crawler engine to use. One of "playwright:adaptive" (fast, renders JS if + present, recommended default), "playwright:firefox" (reliable, renders JS, best at + avoiding blocking but slower), or "cheerio" (fastest, no JS rendering). Returns: Dict with status and content containing the markdown content of the scraped page. """ try: - _validate_url(url) _check_dependency() client = ApifyToolClient() - content = client.scrape_url(url=url, timeout_secs=timeout_secs) + content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) return _success_result( text=content, panel_body=( From d15a3371927b5bbff7230ac12eaba338040599d3 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 23 Mar 2026 18:31:50 +0100 Subject: [PATCH 09/39] feat: create validation tests --- tests/test_apify.py | 139 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/tests/test_apify.py b/tests/test_apify.py index a88a085b..70b3aca5 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -456,6 +456,145 @@ def test_scrape_url_missing_scheme(mock_apify_env): assert "Invalid URL scheme" in result["content"][0]["text"] +# --- Parameter validation --- + + +def test_run_actor_empty_actor_id(mock_apify_env): + """apify_run_actor returns error for whitespace-only actor_id.""" + result = apify_run_actor(actor_id=" ") + + assert result["status"] == "error" + assert "actor_id" in result["content"][0]["text"] + + +def test_run_actor_zero_timeout(mock_apify_env): + """apify_run_actor returns error for non-positive timeout_secs.""" + result = apify_run_actor(actor_id="actor/valid", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_actor_negative_timeout(mock_apify_env): + """apify_run_actor returns error for negative timeout_secs.""" + result = apify_run_actor(actor_id="actor/valid", timeout_secs=-5) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_actor_zero_memory(mock_apify_env): + """apify_run_actor returns error for non-positive memory_mbytes.""" + result = apify_run_actor(actor_id="actor/valid", memory_mbytes=0) + + assert result["status"] == "error" + assert "memory_mbytes" in result["content"][0]["text"] + + +def test_run_task_empty_task_id(mock_apify_env): + """apify_run_task returns error for whitespace-only task_id.""" + result = apify_run_task(task_id=" ") + + assert result["status"] == "error" + assert "task_id" in result["content"][0]["text"] + + +def test_run_task_zero_timeout(mock_apify_env): + """apify_run_task returns error for non-positive timeout_secs.""" + result = 
apify_run_task(task_id="user~my-task", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_run_task_zero_memory(mock_apify_env): + """apify_run_task returns error for non-positive memory_mbytes.""" + result = apify_run_task(task_id="user~my-task", memory_mbytes=0) + + assert result["status"] == "error" + assert "memory_mbytes" in result["content"][0]["text"] + + +def test_get_dataset_items_empty_dataset_id(mock_apify_env): + """apify_get_dataset_items returns error for whitespace-only dataset_id.""" + result = apify_get_dataset_items(dataset_id=" ") + + assert result["status"] == "error" + assert "dataset_id" in result["content"][0]["text"] + + +def test_get_dataset_items_zero_limit(mock_apify_env): + """apify_get_dataset_items returns error for non-positive limit.""" + result = apify_get_dataset_items(dataset_id="dataset-abc", limit=0) + + assert result["status"] == "error" + assert "limit" in result["content"][0]["text"] + + +def test_get_dataset_items_negative_offset(mock_apify_env): + """apify_get_dataset_items returns error for negative offset.""" + result = apify_get_dataset_items(dataset_id="dataset-abc", offset=-1) + + assert result["status"] == "error" + assert "offset" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_zero_dataset_limit(mock_apify_env): + """apify_run_actor_and_get_dataset returns error for non-positive dataset_items_limit.""" + result = apify_run_actor_and_get_dataset(actor_id="actor/valid", dataset_items_limit=0) + + assert result["status"] == "error" + assert "dataset_items_limit" in result["content"][0]["text"] + + +def test_run_actor_and_get_dataset_negative_dataset_offset(mock_apify_env): + """apify_run_actor_and_get_dataset returns error for negative dataset_items_offset.""" + result = apify_run_actor_and_get_dataset(actor_id="actor/valid", dataset_items_offset=-1) + + assert result["status"] == "error" + assert "dataset_items_offset" in result["content"][0]["text"] + + +def test_run_task_and_get_dataset_zero_dataset_limit(mock_apify_env): + """apify_run_task_and_get_dataset returns error for non-positive dataset_items_limit.""" + result = apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=0) + + assert result["status"] == "error" + assert "dataset_items_limit" in result["content"][0]["text"] + + +def test_run_task_and_get_dataset_negative_dataset_offset(mock_apify_env): + """apify_run_task_and_get_dataset returns error for negative dataset_items_offset.""" + result = apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_offset=-1) + + assert result["status"] == "error" + assert "dataset_items_offset" in result["content"][0]["text"] + + +def test_scrape_url_zero_timeout(mock_apify_env): + """apify_scrape_url returns error for non-positive timeout_secs.""" + result = apify_scrape_url(url="https://example.com", timeout_secs=0) + + assert result["status"] == "error" + assert "timeout_secs" in result["content"][0]["text"] + + +def test_scrape_url_invalid_crawler_type(mock_apify_env): + """apify_scrape_url returns error for unsupported crawler_type.""" + result = apify_scrape_url(url="https://example.com", crawler_type="invalid") + + assert result["status"] == "error" + assert "crawler_type" in result["content"][0]["text"] + + +def test_scrape_url_missing_domain(mock_apify_env): + """apify_scrape_url returns error for URL with no domain.""" + result = apify_scrape_url(url="https://") + + assert result["status"] == "error" + 
assert "domain" in result["content"][0]["text"].lower() + + # --- Dependency guard --- From 2d8bfbeca34823e1ed99ea0c485df9322ebaf80e Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 24 Mar 2026 12:56:55 +0100 Subject: [PATCH 10/39] feat: standardize terminology in apify tool documentation and code --- README.md | 18 +++--- docs/apify_tool.md | 64 ++++++++++---------- src/strands_tools/apify.py | 116 ++++++++++++++++++------------------- tests/test_apify.py | 34 +++++------ 4 files changed, 116 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index 45a3d6e8..0ed290cf 100644 --- a/README.md +++ b/README.md @@ -100,10 +100,10 @@ Below is a comprehensive table of all available tools, how to use them with an a |------|-------------|----------| | a2a_client | `provider = A2AClientToolProvider(known_agent_urls=["http://localhost:9000"]); agent = Agent(tools=provider.tools)` | Discover and communicate with A2A-compliant agents, send messages between agents | | apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor by ID with arbitrary input | -| apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify Dataset | -| apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its Dataset results in one step | -| apify_run_task | `agent.tool.apify_run_task(task_id="user~my-task")` | Run a saved Apify Task by ID with optional input overrides | -| apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user~my-task", dataset_items_limit=50)` | Run a Task and fetch its Dataset results in one step | +| apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify dataset | +| apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its dataset results in one step | +| apify_run_task | `agent.tool.apify_run_task(task_id="user/my-task")` | Run a saved Apify task by ID with optional input overrides | +| apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user/my-task", dataset_items_limit=50)` | Run a task and fetch its dataset results in one step | | apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | @@ -991,12 +991,12 @@ result = agent.tool.apify_run_actor_and_get_dataset( dataset_items_limit=50, ) -# Run a saved Task (pre-configured Actor with default inputs) -run_info = agent.tool.apify_run_task(task_id="user~my-task") +# Run a saved task (pre-configured Actor with default inputs) +run_info = agent.tool.apify_run_task(task_id="user/my-task") -# Run a Task and get results in one step +# Run a task and get results in one step result = agent.tool.apify_run_task_and_get_dataset( - task_id="user~my-task", + task_id="user/my-task", 
task_input={"query": "override default input"}, dataset_items_limit=50, ) @@ -1007,7 +1007,7 @@ run_info = agent.tool.apify_run_actor( run_input={"queries": "AI agent frameworks"}, ) -# Fetch Dataset items separately +# Fetch dataset items separately items = agent.tool.apify_get_dataset_items( dataset_id="abc123", limit=100, diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 2a436246..58803bd9 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,6 +1,6 @@ # Apify -The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [Task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching Dataset results, and scraping individual URLs. +The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. ## Installation @@ -16,7 +16,7 @@ Set your Apify API token as an environment variable: export APIFY_API_TOKEN=apify_api_your_token_here ``` -Get your token from the [Apify Console](https://console.apify.com/account/integrations) β†’ Settings β†’ API & Integrations β†’ Personal API tokens. +Get your token from [Apify Console](https://console.apify.com/account/integrations) β†’ Settings β†’ API & Integrations β†’ Personal API tokens. ## Usage @@ -44,7 +44,7 @@ content = agent.tool.apify_scrape_url(url="https://example.com") ### Run an Actor -Execute any Actor from the [Apify Store](https://apify.com/store) by its ID. The call blocks until the Actor Run finishes or the timeout is reached: +Execute any Actor from [Apify Store](https://apify.com/store) by its ID. The call blocks until the Actor run finishes or the timeout is reached: ```python result = agent.tool.apify_run_actor( @@ -58,7 +58,7 @@ The result is a JSON string containing run metadata: `run_id`, `status`, `datase ### Run an Actor and Get Results -Combine running an Actor and fetching its Dataset results in a single call: +Combine running an Actor and fetching its dataset results in a single call: ```python result = agent.tool.apify_run_actor_and_get_dataset( @@ -68,9 +68,9 @@ result = agent.tool.apify_run_actor_and_get_dataset( ) ``` -### Run a Task +### Run a task -Execute a saved [Actor Task](https://docs.apify.com/platform/actors/running/tasks) β€” a pre-configured Actor with preset inputs. Use this when a Task has already been set up in the Apify Console: +Execute a saved [Actor task](https://docs.apify.com/platform/actors/running/tasks) β€” a pre-configured Actor with preset inputs. Use this when a task has already been set up in Apify Console: ```python result = agent.tool.apify_run_task( @@ -82,9 +82,9 @@ result = agent.tool.apify_run_task( The result is a JSON string containing run metadata: `run_id`, `status`, `dataset_id`, `started_at`, and `finished_at`. -### Run a Task and Get Results +### Run a task and get results -Combine running a Task and fetching its Dataset results in a single call: +Combine running a task and fetching its dataset results in a single call: ```python result = agent.tool.apify_run_task_and_get_dataset( @@ -93,9 +93,9 @@ result = agent.tool.apify_run_task_and_get_dataset( ) ``` -### Fetch Dataset Items +### Fetch dataset items -Retrieve results from a Dataset by its ID. 
Useful after running an Actor to get the structured results separately, or to access any existing Dataset: +Retrieve results from a dataset by its ID. Useful after running an Actor to get the structured results separately, or to access any existing dataset: ```python items = agent.tool.apify_get_dataset_items( @@ -122,8 +122,8 @@ items = agent.tool.apify_get_dataset_items( |-----------|------|----------|---------|-------------| | `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | | `run_input` | dict | No | None | JSON-serializable input for the Actor | -| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | -| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor Run (uses Actor default if not set) | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. @@ -131,10 +131,10 @@ items = agent.tool.apify_get_dataset_items( | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| -| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a Task ID) | -| `task_input` | dict | No | None | JSON-serializable input to override the Task's default input | -| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Task Run to finish | -| `memory_mbytes` | int | No | None | Memory allocation in MB for the Task Run (uses Task default if not set) | +| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a task ID) | +| `task_input` | dict | No | None | JSON-serializable input to override the task's default input | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) | **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. @@ -142,23 +142,23 @@ items = agent.tool.apify_get_dataset_items( | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| -| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a Task ID) | -| `task_input` | dict | No | None | JSON-serializable input to override the Task's default input | -| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Task Run to finish | -| `memory_mbytes` | int | No | None | Memory allocation in MB for the Task Run (uses Task default if not set) | -| `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | +| `task_id` | string | Yes | β€” | Task identifier (e.g., `user~my-task` or a task ID) | +| `task_input` | dict | No | None | JSON-serializable input to override the task's default input | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) | +| `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | -**Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. 
+**Returns:** JSON string with run metadata plus an `items` array containing the dataset results. ### apify_get_dataset_items | Parameter | Type | Required | Default | Description | |-----------|------|----------|---------|-------------| -| `dataset_id` | string | Yes | β€” | The Apify Dataset ID to fetch items from | +| `dataset_id` | string | Yes | β€” | The Apify dataset ID to fetch items from | | `limit` | int | No | 100 | Maximum number of items to return | | `offset` | int | No | 0 | Number of items to skip for pagination | -**Returns:** JSON string containing an array of Dataset items. +**Returns:** JSON string containing an array of dataset items. ### apify_run_actor_and_get_dataset @@ -166,11 +166,11 @@ items = agent.tool.apify_get_dataset_items( |-----------|------|----------|---------|-------------| | `actor_id` | string | Yes | β€” | Actor identifier (e.g., `apify/website-content-crawler`) | | `run_input` | dict | No | None | JSON-serializable input for the Actor | -| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor Run to finish | -| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor Run (uses Actor default if not set) | -| `dataset_items_limit` | int | No | 100 | Maximum number of Dataset items to return | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | +| `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | -**Returns:** JSON string with run metadata plus an `items` array containing the Dataset results. +**Returns:** JSON string with run metadata plus an `items` array containing the dataset results. ## Troubleshooting @@ -178,10 +178,10 @@ items = agent.tool.apify_get_dataset_items( |-------|-------|-----| | `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | | `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | -| `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in the [Apify Console](https://console.apify.com) | -| `Task ... finished with status FAILED` | Task execution error | Check Task configuration and run logs in the [Apify Console](https://console.apify.com) | -| `Actor/Task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | -| `Task ... returned no run data` | Task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | +| `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in [Apify Console](https://console.apify.com) | +| `Task ... finished with status FAILED` | task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) | +| `Actor/task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `Task ... 
returned no run data` | task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | | `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | ## References diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index bdac5124..12176eae 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,27 +1,27 @@ """Apify platform tools for Strands Agents. This module provides web scraping, data extraction, and automation capabilities -using the Apify platform. It lets you run any Actor by ID, fetch Dataset results, -and scrape individual URLs. +using the Apify platform. It lets you run any Actor, task, fetch dataset +results, and scrape individual URLs. Key Features: ------------ 1. Actor Execution: - β€’ apify_run_actor: Run any Apify Actor by ID with custom input + β€’ apify_run_actor: Run any Apify Actor with custom input β€’ apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step 2. Task Execution: - β€’ apify_run_task: Run a saved Actor Task by ID with optional input overrides - β€’ apify_run_task_and_get_dataset: Run a Task and fetch results in one step + β€’ apify_run_task: Run a saved Actor task with optional input overrides + β€’ apify_run_task_and_get_dataset: Run a task and fetch results in one step 3. Data Retrieval: - β€’ apify_get_dataset_items: Fetch items from an Apify Dataset with pagination + β€’ apify_get_dataset_items: Fetch items from an Apify dataset with pagination β€’ apify_scrape_url: Scrape a single URL and return content as Markdown 4. Error Handling: β€’ Graceful API error handling with descriptive messages β€’ Dependency checking (apify-client optional install) - β€’ Timeout management for Actor Runs + β€’ Timeout management for Actor runs Setup Requirements: ------------------ @@ -116,7 +116,7 @@ def _format_error(e: Exception) -> str: case 404: return f"Resource not found: {msg}" case 408: - return f"Actor Run timed out: {msg}" + return f"Actor run timed out: {msg}" case 429: return ( "Rate limit exceeded. 
The Apify client retries automatically; " @@ -155,7 +155,7 @@ def __init__(self) -> None: @staticmethod def _check_run_status(actor_run: Dict[str, Any], label: str) -> None: - """Raise RuntimeError if the Actor Run did not succeed.""" + """Raise RuntimeError if the Actor run did not succeed.""" status = actor_run.get("status", "UNKNOWN") if status != "SUCCEEDED": run_id = actor_run.get("id", "N/A") @@ -231,7 +231,7 @@ def get_dataset_items( limit: int = DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0, ) -> List[Dict[str, Any]]: - """Fetch items from an Apify Dataset.""" + """Fetch items from an Apify dataset.""" self._validate_identifier(dataset_id, "dataset_id") self._validate_positive(limit, "limit") self._validate_non_negative(offset, "offset") @@ -249,7 +249,7 @@ def run_actor_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run an Actor synchronously, then fetch its default Dataset items.""" + """Run an Actor synchronously, then fetch its default dataset items.""" self._validate_positive(dataset_items_limit, "dataset_items_limit") self._validate_non_negative(dataset_items_offset, "dataset_items_offset") @@ -262,7 +262,7 @@ def run_actor_and_get_dataset( ) dataset_id = run_metadata["dataset_id"] if not dataset_id: - raise RuntimeError(f"Actor {actor_id} run has no default Dataset.") + raise RuntimeError(f"Actor {actor_id} run has no default dataset.") items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) return {**run_metadata, "items": items} @@ -273,7 +273,7 @@ def run_task( timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: - """Run an Apify Task synchronously and return run metadata.""" + """Run an Apify task synchronously and return run metadata.""" self._validate_identifier(task_id, "task_id") self._validate_positive(timeout_secs, "timeout_secs") if memory_mbytes is not None: @@ -307,7 +307,7 @@ def run_task_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run a Task synchronously, then fetch its default Dataset items.""" + """Run a task synchronously, then fetch its default dataset items.""" self._validate_positive(dataset_items_limit, "dataset_items_limit") self._validate_non_negative(dataset_items_offset, "dataset_items_offset") @@ -319,7 +319,7 @@ def run_task_and_get_dataset( ) dataset_id = run_metadata["dataset_id"] if not dataset_id: - raise RuntimeError(f"Task {task_id} run has no default Dataset.") + raise RuntimeError(f"Task {task_id} run has no default dataset.") items = self.get_dataset_items(dataset_id=dataset_id, limit=dataset_items_limit, offset=dataset_items_offset) return {**run_metadata, "items": items} @@ -370,9 +370,9 @@ def apify_run_actor( memory_mbytes: Optional[int] = None, build: Optional[str] = None, ) -> Dict[str, Any]: - """Run any Apify Actor by its ID or name and return the run metadata as JSON. + """Run any Apify Actor and return the run metadata as JSON. - Executes the Actor synchronously - blocks until the Actor Run finishes or the timeout + Executes the Actor synchronously - blocks until the Actor run finishes or the timeout is reached. Use this when you need to run a specific Actor and then inspect or process the results separately. @@ -384,9 +384,9 @@ def apify_run_actor( Args: actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". 
run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. - timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor Run. Uses Actor default if not set. - build: Actor Build tag or number to run a specific version. Uses latest Build if not set. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + build: Actor build tag or number to run a specific version. Uses latest build if not set. Returns: Dict with status and content containing run metadata: run_id, status, dataset_id, @@ -405,7 +405,7 @@ def apify_run_actor( return _success_result( text=json.dumps(result, indent=2, default=str), panel_body=( - f"[green]Actor Run completed[/green]\n" + f"[green]Actor run completed[/green]\n" f"Actor: {actor_id}\n" f"Run ID: {result['run_id']}\n" f"Status: {result['status']}\n" @@ -423,18 +423,18 @@ def apify_get_dataset_items( limit: int = DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0, ) -> Dict[str, Any]: - """Fetch items from an existing Apify Dataset and return them as JSON. + """Fetch items from an existing Apify dataset and return them as JSON. Use this after running an Actor to retrieve the structured results from its - default Dataset, or to access any Dataset by ID. + default dataset, or to access any dataset by ID. Args: - dataset_id: The Apify Dataset ID to fetch items from. + dataset_id: The Apify dataset ID to fetch items from. limit: Maximum number of items to return. Defaults to 100. offset: Number of items to skip for pagination. Defaults to 0. Returns: - Dict with status and content containing an array of Dataset items. + Dict with status and content containing an array of dataset items. """ try: _check_dependency() @@ -461,24 +461,24 @@ def apify_run_actor_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run an Apify Actor and fetch its Dataset results in one step. + """Run an Apify Actor and fetch its dataset results in one step. - Convenience tool that combines running an Actor and fetching its default Dataset - items into a single call. Use this when you want both the run metadata and the + Convenience tool that combines running an Actor and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the result data without making two separate tool calls. Args: actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". run_input: JSON-serializable input for the Actor. - timeout_secs: Maximum time in seconds to wait for the Actor Run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor Run. - build: Actor Build tag or number to run a specific version. Uses latest Build if not set. - dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. - dataset_items_offset: Number of Dataset items to skip for pagination. Defaults to 0. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. 
Defaults to 0. Returns: Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the Dataset results. + started_at, finished_at) plus an "items" array containing the dataset results. """ try: _check_dependency() @@ -495,7 +495,7 @@ def apify_run_actor_and_get_dataset( return _success_result( text=json.dumps(result, indent=2, default=str), panel_body=( - f"[green]Actor Run completed with dataset[/green]\n" + f"[green]Actor run completed with dataset[/green]\n" f"Actor: {actor_id}\n" f"Run ID: {result['run_id']}\n" f"Status: {result['status']}\n" @@ -515,17 +515,17 @@ def apify_run_task( timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: - """Run an Apify Task by its ID or name and return the run metadata as JSON. + """Run an Apify task and return the run metadata as JSON. - Tasks are saved Actor configurations with preset inputs. Use this when a Task - has already been configured in the Apify Console, so you don't need to specify + Tasks are saved Actor configurations with preset inputs. Use this when a task + has already been configured in Apify Console, so you don't need to specify the full Actor input every time. Args: - task_id: Task identifier, e.g. "user~my-task" or a Task ID string. - task_input: Optional JSON-serializable input to override the Task's default input. - timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Task Run. Uses Task default if not set. + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. Returns: Dict with status and content containing run metadata: run_id, status, dataset_id, @@ -543,7 +543,7 @@ def apify_run_task( return _success_result( text=json.dumps(result, indent=2, default=str), panel_body=( - f"[green]Task Run completed[/green]\n" + f"[green]Task run completed[/green]\n" f"Task: {task_id}\n" f"Run ID: {result['run_id']}\n" f"Status: {result['status']}\n" @@ -564,23 +564,23 @@ def apify_run_task_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run an Apify Task and fetch its Dataset results in one step. + """Run an Apify task and fetch its dataset results in one step. - Convenience tool that combines running a Task and fetching its default Dataset - items into a single call. Use this when you want both the run metadata and the + Convenience tool that combines running a task and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the result data without making two separate tool calls. Args: - task_id: Task identifier, e.g. "user~my-task" or a Task ID string. - task_input: Optional JSON-serializable input to override the Task's default input. - timeout_secs: Maximum time in seconds to wait for the Task Run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Task Run. - dataset_items_limit: Maximum number of Dataset items to return. Defaults to 100. - dataset_items_offset: Number of Dataset items to skip for pagination. Defaults to 0. + task_id: Task identifier, e.g. "user/my-task" or a task ID string. 
+ task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. Returns: Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the Dataset results. + started_at, finished_at) plus an "items" array containing the dataset results. """ try: _check_dependency() @@ -596,7 +596,7 @@ def apify_run_task_and_get_dataset( return _success_result( text=json.dumps(result, indent=2, default=str), panel_body=( - f"[green]Task Run completed with dataset[/green]\n" + f"[green]Task run completed with dataset[/green]\n" f"Task: {task_id}\n" f"Run ID: {result['run_id']}\n" f"Status: {result['status']}\n" @@ -617,16 +617,16 @@ def apify_scrape_url( ) -> Dict[str, Any]: """Scrape a single URL and return its content as markdown. - Uses the Apify Website Content Crawler Actor under the hood, pre-configured for + Uses the Website Content Crawler Actor under the hood, pre-configured for fast single-page scraping. This is the simplest way to extract readable content from any web page. Args: url: The URL to scrape, e.g. "https://example.com". timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. - crawler_type: Crawler engine to use. One of "playwright:adaptive" (fast, renders JS if - present, recommended default), "playwright:firefox" (reliable, renders JS, best at - avoiding blocking but slower), or "cheerio" (fastest, no JS rendering). + crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, + default), "playwright:adaptive" (fast, renders JS if present), or + "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). Returns: Dict with status and content containing the markdown content of the scraped page. 
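A minimal usage sketch of the `crawler_type` and `timeout_secs` options described above (illustrative only; it assumes `APIFY_API_TOKEN` is exported, the `apify` extra is installed, the tools are registered on an `Agent` as in the README, and the second URL is a hypothetical JavaScript-rendered page):

```python
from strands import Agent
from strands_tools import apify

agent = Agent(tools=[apify.apify_scrape_url])

# Default engine: "cheerio" is fastest but does not render JavaScript.
static_page = agent.tool.apify_scrape_url(url="https://example.com")

# For a JS-heavy page, switch engines and allow more time; the result is a
# dict whose content carries the page converted to markdown.
dynamic_page = agent.tool.apify_scrape_url(
    url="https://example.com/app",  # hypothetical JavaScript-rendered page
    crawler_type="playwright:firefox",
    timeout_secs=300,
)
```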
diff --git a/tests/test_apify.py b/tests/test_apify.py index 70b3aca5..3c9ec899 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -126,7 +126,7 @@ def test_client_uses_env_token(mock_apify_env): def test_run_actor_success(mock_apify_env, mock_apify_client): - """Successful Actor Run returns structured result with run metadata.""" + """Successful Actor run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_actor(actor_id="actor/my-scraper", run_input={"url": "https://example.com"}) @@ -141,7 +141,7 @@ def test_run_actor_success(mock_apify_env, mock_apify_client): def test_run_actor_default_input(mock_apify_env, mock_apify_client): - """Actor Run defaults run_input to empty dict when not provided.""" + """Actor run defaults run_input to empty dict when not provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_actor(actor_id="actor/my-scraper") @@ -151,7 +151,7 @@ def test_run_actor_default_input(mock_apify_env, mock_apify_client): def test_run_actor_with_memory(mock_apify_env, mock_apify_client): - """Actor Run passes memory_mbytes when provided.""" + """Actor run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): apify_run_actor(actor_id="actor/my-scraper", memory_mbytes=512) @@ -160,7 +160,7 @@ def test_run_actor_with_memory(mock_apify_env, mock_apify_client): def test_run_actor_failure(mock_apify_env, mock_apify_client): - """Actor Run returns error dict when Actor fails.""" + """Actor run returns error dict when Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -171,7 +171,7 @@ def test_run_actor_failure(mock_apify_env, mock_apify_client): def test_run_actor_timeout(mock_apify_env, mock_apify_client): - """Actor Run returns error dict when Actor times out.""" + """Actor run returns error dict when Actor times out.""" mock_apify_client.actor.return_value.call.return_value = MOCK_TIMED_OUT_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -182,7 +182,7 @@ def test_run_actor_timeout(mock_apify_env, mock_apify_client): def test_run_actor_api_exception(mock_apify_env, mock_apify_client): - """Actor Run returns error dict on API exceptions.""" + """Actor run returns error dict on API exceptions.""" mock_apify_client.actor.return_value.call.side_effect = Exception("Connection failed") with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -193,7 +193,7 @@ def test_run_actor_api_exception(mock_apify_env, mock_apify_client): def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): - """Actor Run returns friendly message for 401 authentication errors.""" + """Actor run returns friendly message for 401 authentication errors.""" error = _make_apify_api_error(401, "Unauthorized") mock_apify_client.actor.return_value.call.side_effect = error @@ -205,7 +205,7 @@ def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): def test_run_actor_apify_api_error_404(mock_apify_env, mock_apify_client): - """Actor Run returns friendly message for 404 not-found errors.""" + """Actor run returns friendly message for 404 not-found errors.""" error = _make_apify_api_error(404, "Actor not found") mock_apify_client.actor.return_value.call.side_effect = error @@ -233,7 +233,7 @@ 
def test_get_dataset_items_success(mock_apify_env, mock_apify_client): def test_get_dataset_items_with_pagination(mock_apify_env, mock_apify_client): - """Dataset retrieval passes limit and offset.""" + """dataset retrieval passes limit and offset.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): apify_get_dataset_items(dataset_id="dataset-xyz", limit=50, offset=10) @@ -290,7 +290,7 @@ def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_clie def test_run_task_success(mock_apify_env, mock_apify_client): - """Successful Task Run returns structured result with run metadata.""" + """Successful task run returns structured result with run metadata.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_task(task_id="user~my-task", task_input={"query": "test"}) @@ -303,7 +303,7 @@ def test_run_task_success(mock_apify_env, mock_apify_client): def test_run_task_no_input(mock_apify_env, mock_apify_client): - """Task Run omits task_input kwarg when not provided.""" + """task run omits task_input kwarg when not provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_task(task_id="user~my-task") @@ -313,7 +313,7 @@ def test_run_task_no_input(mock_apify_env, mock_apify_client): def test_run_task_with_memory(mock_apify_env, mock_apify_client): - """Task Run passes memory_mbytes when provided.""" + """task run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): apify_run_task(task_id="user~my-task", memory_mbytes=1024) @@ -322,7 +322,7 @@ def test_run_task_with_memory(mock_apify_env, mock_apify_client): def test_run_task_failure(mock_apify_env, mock_apify_client): - """Task Run returns error dict when Task fails.""" + """task run returns error dict when task fails.""" mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -333,7 +333,7 @@ def test_run_task_failure(mock_apify_env, mock_apify_client): def test_run_task_none_response(mock_apify_env, mock_apify_client): - """Task Run returns error dict when TaskClient.call() returns None.""" + """task run returns error dict when TaskClient.call() returns None.""" mock_apify_client.task.return_value.call.return_value = None with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): @@ -344,7 +344,7 @@ def test_run_task_none_response(mock_apify_env, mock_apify_client): def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): - """Task Run returns friendly message for 401 authentication errors.""" + """task run returns friendly message for 401 authentication errors.""" error = _make_apify_api_error(401, "Unauthorized") mock_apify_client.task.return_value.call.side_effect = error @@ -359,7 +359,7 @@ def test_run_task_apify_api_error_401(mock_apify_env, mock_apify_client): def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): - """Combined Task run + dataset fetch returns structured result with metadata and items.""" + """Combined task run + dataset fetch returns structured result with metadata and items.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): result = apify_run_task_and_get_dataset( task_id="user~my-task", @@ -375,7 +375,7 @@ def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): def 
test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client): - """Combined Task tool returns error dict when the Task fails.""" + """Combined task tool returns error dict when the task fails.""" mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): From 1ef943d257e06bbd60b1080cba6e8c66756f9138 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 25 Mar 2026 13:08:01 +0100 Subject: [PATCH 11/39] feat: refactor Apify tools into core module and update docs --- README.md | 13 +- docs/apify_tool.md | 23 +- src/strands_tools/apify.py | 345 +----------------------------- src/strands_tools/apify_core.py | 366 ++++++++++++++++++++++++++++++++ tests/test_apify.py | 16 +- 5 files changed, 401 insertions(+), 362 deletions(-) create mode 100644 src/strands_tools/apify_core.py diff --git a/README.md b/README.md index 0ed290cf..d79af517 100644 --- a/README.md +++ b/README.md @@ -970,16 +970,9 @@ result = agent.tool.mongodb_memory( ```python from strands import Agent -from strands_tools import apify - -agent = Agent(tools=[ - apify.apify_run_actor, - apify.apify_run_task, - apify.apify_get_dataset_items, - apify.apify_run_actor_and_get_dataset, - apify.apify_run_task_and_get_dataset, - apify.apify_scrape_url, -]) +from strands_tools.apify_core import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) # Scrape a single URL and get markdown content content = agent.tool.apify_scrape_url(url="https://example.com") diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 58803bd9..36358192 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,6 +1,6 @@ # Apify -The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. +The Apify core tools (`apify_core.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. ## Installation @@ -20,17 +20,24 @@ Get your token from [Apify Console](https://console.apify.com/account/integratio ## Usage +Register all core tools at once: + +```python +from strands import Agent +from strands_tools.apify_core import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools: + ```python from strands import Agent -from strands_tools import apify +from strands_tools import apify_core agent = Agent(tools=[ - apify.apify_run_actor, - apify.apify_run_task, - apify.apify_scrape_url, - apify.apify_get_dataset_items, - apify.apify_run_actor_and_get_dataset, - apify.apify_run_task_and_get_dataset, + apify_core.apify_run_actor, + apify_core.apify_scrape_url, ]) ``` diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 12176eae..cb63ae70 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,27 +1,11 @@ -"""Apify platform tools for Strands Agents. +"""Shared base for Apify platform tools. -This module provides web scraping, data extraction, and automation capabilities -using the Apify platform. It lets you run any Actor, task, fetch dataset -results, and scrape individual URLs. 
+This module provides the shared infrastructure used by all Apify tool modules +(e.g. apify_core, apify_social). It contains the API client, error handling, +response helpers, and constants. It does NOT contain any @tool functions itself. -Key Features: ------------- -1. Actor Execution: - β€’ apify_run_actor: Run any Apify Actor with custom input - β€’ apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step - -2. Task Execution: - β€’ apify_run_task: Run a saved Actor task with optional input overrides - β€’ apify_run_task_and_get_dataset: Run a task and fetch results in one step - -3. Data Retrieval: - β€’ apify_get_dataset_items: Fetch items from an Apify dataset with pagination - β€’ apify_scrape_url: Scrape a single URL and return content as Markdown - -4. Error Handling: - β€’ Graceful API error handling with descriptive messages - β€’ Dependency checking (apify-client optional install) - β€’ Timeout management for Actor runs +Tool modules import from here: + from strands_tools.apify import ApifyToolClient, _check_dependency, ... Setup Requirements: ------------------ @@ -30,37 +14,8 @@ 3. Install the optional dependency: pip install strands-agents-tools[apify] 4. Set the environment variable: APIFY_API_TOKEN=your_api_token_here - -Example .env configuration: - APIFY_API_TOKEN=apify_api_1a2B3cD4eF5gH6iJ7kL8m - -Usage Examples: --------------- -```python -from strands import Agent -from strands_tools import apify - -agent = Agent(tools=[ - apify.apify_run_actor, - apify.apify_run_task, - apify.apify_get_dataset_items, - apify.apify_run_actor_and_get_dataset, - apify.apify_run_task_and_get_dataset, - apify.apify_scrape_url, -]) - -# Scrape a single URL -content = agent.tool.apify_scrape_url(url="https://example.com") - -# Run an Actor -result = agent.tool.apify_run_actor( - actor_id="apify/website-content-crawler", - run_input={"startUrls": [{"url": "https://example.com"}]}, -) -``` """ -import json import logging import os from typing import Any, Dict, List, Optional @@ -68,7 +23,6 @@ from rich.panel import Panel from rich.text import Text -from strands import tool from strands_tools.utils import console_util @@ -357,290 +311,3 @@ def scrape_url( raise RuntimeError(f"No content returned for URL: {url}") return str(items[0].get("markdown") or items[0].get("text", "")) - - -# --- Tool functions --- - - -@tool -def apify_run_actor( - actor_id: str, - run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - build: Optional[str] = None, -) -> Dict[str, Any]: - """Run any Apify Actor and return the run metadata as JSON. - - Executes the Actor synchronously - blocks until the Actor run finishes or the timeout - is reached. Use this when you need to run a specific Actor and then inspect or process - the results separately. - - Common Actors: - - "apify/website-content-crawler" - scrape websites and extract content - - "apify/web-scraper" - general-purpose web scraper - - "apify/google-search-scraper" - scrape Google search results - - Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. - timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. - build: Actor build tag or number to run a specific version. Uses latest build if not set. 
- - Returns: - Dict with status and content containing run metadata: run_id, status, dataset_id, - started_at, finished_at. - """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_actor( - actor_id=actor_id, - run_input=run_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - build=build, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Actor run completed[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}" - ), - panel_title="Apify: Run Actor", - ) - except Exception as e: - return _error_result(e, "apify_run_actor") - - -@tool -def apify_get_dataset_items( - dataset_id: str, - limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - offset: int = 0, -) -> Dict[str, Any]: - """Fetch items from an existing Apify dataset and return them as JSON. - - Use this after running an Actor to retrieve the structured results from its - default dataset, or to access any dataset by ID. - - Args: - dataset_id: The Apify dataset ID to fetch items from. - limit: Maximum number of items to return. Defaults to 100. - offset: Number of items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing an array of dataset items. - """ - try: - _check_dependency() - client = ApifyToolClient() - items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) - return _success_result( - text=json.dumps(items, indent=2, default=str), - panel_body=( - f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" - ), - panel_title="Apify: Dataset Items", - ) - except Exception as e: - return _error_result(e, "apify_get_dataset_items") - - -@tool -def apify_run_actor_and_get_dataset( - actor_id: str, - run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - build: Optional[str] = None, - dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - dataset_items_offset: int = 0, -) -> Dict[str, Any]: - """Run an Apify Actor and fetch its dataset results in one step. - - Convenience tool that combines running an Actor and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the - result data without making two separate tool calls. - - Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. - timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. - build: Actor build tag or number to run a specific version. Uses latest build if not set. - dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. - dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the dataset results. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_actor_and_get_dataset( - actor_id=actor_id, - run_input=run_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - build=build, - dataset_items_limit=dataset_items_limit, - dataset_items_offset=dataset_items_offset, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Actor run completed with dataset[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}" - ), - panel_title="Apify: Run Actor + Dataset", - ) - except Exception as e: - return _error_result(e, "apify_run_actor_and_get_dataset") - - -@tool -def apify_run_task( - task_id: str, - task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, -) -> Dict[str, Any]: - """Run an Apify task and return the run metadata as JSON. - - Tasks are saved Actor configurations with preset inputs. Use this when a task - has already been configured in Apify Console, so you don't need to specify - the full Actor input every time. - - Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. - timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. - - Returns: - Dict with status and content containing run metadata: run_id, status, dataset_id, - started_at, finished_at. - """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_task( - task_id=task_id, - task_input=task_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Task run completed[/green]\n" - f"Task: {task_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}" - ), - panel_title="Apify: Run Task", - ) - except Exception as e: - return _error_result(e, "apify_run_task") - - -@tool -def apify_run_task_and_get_dataset( - task_id: str, - task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - dataset_items_offset: int = 0, -) -> Dict[str, Any]: - """Run an Apify task and fetch its dataset results in one step. - - Convenience tool that combines running a task and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the - result data without making two separate tool calls. - - Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. - timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. - dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. - dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the dataset results. 
- """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_task_and_get_dataset( - task_id=task_id, - task_input=task_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - dataset_items_limit=dataset_items_limit, - dataset_items_offset=dataset_items_offset, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Task run completed with dataset[/green]\n" - f"Task: {task_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}" - ), - panel_title="Apify: Run Task + Dataset", - ) - except Exception as e: - return _error_result(e, "apify_run_task_and_get_dataset") - - -@tool -def apify_scrape_url( - url: str, - timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, - crawler_type: str = "cheerio", -) -> Dict[str, Any]: - """Scrape a single URL and return its content as markdown. - - Uses the Website Content Crawler Actor under the hood, pre-configured for - fast single-page scraping. This is the simplest way to extract readable content - from any web page. - - Args: - url: The URL to scrape, e.g. "https://example.com". - timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. - crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, - default), "playwright:adaptive" (fast, renders JS if present), or - "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). - - Returns: - Dict with status and content containing the markdown content of the scraped page. - """ - try: - _check_dependency() - client = ApifyToolClient() - content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) - return _success_result( - text=content, - panel_body=( - f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" - ), - panel_title="Apify: Scrape URL", - ) - except Exception as e: - return _error_result(e, "apify_scrape_url") diff --git a/src/strands_tools/apify_core.py b/src/strands_tools/apify_core.py new file mode 100644 index 00000000..69d330ae --- /dev/null +++ b/src/strands_tools/apify_core.py @@ -0,0 +1,366 @@ +"""Core Apify platform tools for Strands Agents. + +This module provides web scraping, data extraction, and automation capabilities +using the Apify platform. It lets you run any Actor, task, fetch dataset +results, and scrape individual URLs. + +Available Tools: +--------------- +- apify_run_actor: Run any Apify Actor with custom input +- apify_get_dataset_items: Fetch items from an Apify dataset with pagination +- apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step +- apify_run_task: Run a saved Actor task with optional input overrides +- apify_run_task_and_get_dataset: Run a task and fetch results in one step +- apify_scrape_url: Scrape a single URL and return content as Markdown + +Setup Requirements: +------------------ +1. Create an Apify account at https://apify.com +2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens +3. Install the optional dependency: pip install strands-agents-tools[apify] +4. 
Set the environment variable: + APIFY_API_TOKEN=your_api_token_here + +Usage Examples: +-------------- +Register all core tools at once via the preset list: + +```python +from strands import Agent +from strands_tools.apify_core import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools for a smaller LLM tool surface: + +```python +from strands import Agent +from strands_tools import apify_core + +agent = Agent(tools=[ + apify_core.apify_scrape_url, + apify_core.apify_run_actor, +]) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) +``` +""" + +import json +from typing import Any, Dict, Optional + +from strands import tool + +from strands_tools.apify import ( + DEFAULT_DATASET_ITEMS_LIMIT, + DEFAULT_SCRAPE_TIMEOUT_SECS, + DEFAULT_TIMEOUT_SECS, + ApifyToolClient, + _check_dependency, + _error_result, + _success_result, +) + + +@tool +def apify_run_actor( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, +) -> Dict[str, Any]: + """Run any Apify Actor and return the run metadata as JSON. + + Executes the Actor synchronously - blocks until the Actor run finishes or the timeout + is reached. Use this when you need to run a specific Actor and then inspect or process + the results separately. + + Common Actors: + - "apify/website-content-crawler" - scrape websites and extract content + - "apify/web-scraper" - general-purpose web scraper + - "apify/google-search-scraper" - scrape Google search results + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Actor", + ) + except Exception as e: + return _error_result(e, "apify_run_actor") + + +@tool +def apify_get_dataset_items( + dataset_id: str, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + offset: int = 0, +) -> Dict[str, Any]: + """Fetch items from an existing Apify dataset and return them as JSON. + + Use this after running an Actor to retrieve the structured results from its + default dataset, or to access any dataset by ID. + + Args: + dataset_id: The Apify dataset ID to fetch items from. + limit: Maximum number of items to return. Defaults to 100. + offset: Number of items to skip for pagination. Defaults to 0. 
+ + Returns: + Dict with status and content containing an array of dataset items. + """ + try: + _check_dependency() + client = ApifyToolClient() + items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) + return _success_result( + text=json.dumps(items, indent=2, default=str), + panel_body=( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" + ), + panel_title="Apify: Dataset Items", + ) + except Exception as e: + return _error_result(e, "apify_get_dataset_items") + + +@tool +def apify_run_actor_and_get_dataset( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify Actor and fetch its dataset results in one step. + + Convenience tool that combines running an Actor and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Actor + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_actor_and_get_dataset") + + +@tool +def apify_run_task( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, +) -> Dict[str, Any]: + """Run an Apify task and return the run metadata as JSON. + + Tasks are saved Actor configurations with preset inputs. Use this when a task + has already been configured in Apify Console, so you don't need to specify + the full Actor input every time. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. 
+ + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Task", + ) + except Exception as e: + return _error_result(e, "apify_run_task") + + +@tool +def apify_run_task_and_get_dataset( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify task and fetch its dataset results in one step. + + Convenience tool that combines running a task and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task_and_get_dataset( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed with dataset[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Task + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_task_and_get_dataset") + + +@tool +def apify_scrape_url( + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: str = "cheerio", +) -> Dict[str, Any]: + """Scrape a single URL and return its content as markdown. + + Uses the Website Content Crawler Actor under the hood, pre-configured for + fast single-page scraping. This is the simplest way to extract readable content + from any web page. + + Args: + url: The URL to scrape, e.g. "https://example.com". + timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, + default), "playwright:adaptive" (fast, renders JS if present), or + "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). 
+ + Returns: + Dict with status and content containing the markdown content of the scraped page. + """ + try: + _check_dependency() + client = ApifyToolClient() + content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) + return _success_result( + text=content, + panel_body=( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" + ), + panel_title="Apify: Scrape URL", + ) + except Exception as e: + return _error_result(e, "apify_scrape_url") + + +# Pre-built list of all core tools for convenient agent registration. +# Usage: Agent(tools=APIFY_CORE_TOOLS) +APIFY_CORE_TOOLS = [ + apify_run_actor, + apify_get_dataset_items, + apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, + apify_scrape_url, +] diff --git a/tests/test_apify.py b/tests/test_apify.py index 3c9ec899..963225fd 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -5,9 +5,9 @@ import pytest -from strands_tools import apify -from strands_tools.apify import ( - ApifyToolClient, +from strands_tools import apify, apify_core +from strands_tools.apify import ApifyToolClient +from strands_tools.apify_core import ( apify_get_dataset_items, apify_run_actor, apify_run_actor_and_get_dataset, @@ -96,12 +96,18 @@ def mock_apify_env(monkeypatch): # --- Module import --- -def test_apify_module_is_importable(): - """Verify that the apify tool module can be imported from strands_tools.""" +def test_apify_base_module_is_importable(): + """Verify that the apify base module can be imported from strands_tools.""" assert apify is not None assert apify.__name__ == "strands_tools.apify" +def test_apify_core_module_is_importable(): + """Verify that the apify_core tool module can be imported from strands_tools.""" + assert apify_core is not None + assert apify_core.__name__ == "strands_tools.apify_core" + + # --- ApifyToolClient --- From ff2494ef7fb4399bfe9dd8b90f1c0915ea4d78ce Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 25 Mar 2026 14:28:42 +0100 Subject: [PATCH 12/39] feat: refactor the tool, use one file only --- README.md | 2 +- docs/apify_tool.md | 10 +- src/strands_tools/apify.py | 349 +++++++++++++++++++++++++++++- src/strands_tools/apify_core.py | 366 -------------------------------- tests/test_apify.py | 16 +- 5 files changed, 354 insertions(+), 389 deletions(-) delete mode 100644 src/strands_tools/apify_core.py diff --git a/README.md b/README.md index d79af517..67d9833a 100644 --- a/README.md +++ b/README.md @@ -970,7 +970,7 @@ result = agent.tool.mongodb_memory( ```python from strands import Agent -from strands_tools.apify_core import APIFY_CORE_TOOLS +from strands_tools.apify import APIFY_CORE_TOOLS agent = Agent(tools=APIFY_CORE_TOOLS) diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 36358192..f1455cdb 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -1,6 +1,6 @@ # Apify -The Apify core tools (`apify_core.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. 
+The Apify tools (`apify.py`) enable [Strands Agents](https://strandsagents.com/) to interact with the [Apify](https://apify.com) platform β€” running any [Actor](https://apify.com/store) or [task](https://docs.apify.com/platform/actors/running/tasks) by ID, fetching dataset results, and scraping individual URLs. ## Installation @@ -24,7 +24,7 @@ Register all core tools at once: ```python from strands import Agent -from strands_tools.apify_core import APIFY_CORE_TOOLS +from strands_tools.apify import APIFY_CORE_TOOLS agent = Agent(tools=APIFY_CORE_TOOLS) ``` @@ -33,11 +33,11 @@ Or pick individual tools: ```python from strands import Agent -from strands_tools import apify_core +from strands_tools import apify agent = Agent(tools=[ - apify_core.apify_run_actor, - apify_core.apify_scrape_url, + apify.apify_run_actor, + apify.apify_scrape_url, ]) ``` diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index cb63ae70..5855ab83 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,11 +1,17 @@ -"""Shared base for Apify platform tools. +"""Apify platform tools for Strands Agents. -This module provides the shared infrastructure used by all Apify tool modules -(e.g. apify_core, apify_social). It contains the API client, error handling, -response helpers, and constants. It does NOT contain any @tool functions itself. +This module provides web scraping, data extraction, and automation capabilities +using the Apify platform. It lets you run any Actor, task, fetch dataset +results, and scrape individual URLs. -Tool modules import from here: - from strands_tools.apify import ApifyToolClient, _check_dependency, ... +Available Tools: +--------------- +- apify_run_actor: Run any Apify Actor with custom input +- apify_get_dataset_items: Fetch items from an Apify dataset with pagination +- apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step +- apify_run_task: Run a saved Actor task with optional input overrides +- apify_run_task_and_get_dataset: Run a task and fetch results in one step +- apify_scrape_url: Scrape a single URL and return content as Markdown Setup Requirements: ------------------ @@ -14,8 +20,41 @@ 3. Install the optional dependency: pip install strands-agents-tools[apify] 4. 
Set the environment variable: APIFY_API_TOKEN=your_api_token_here + +Usage Examples: +-------------- +Register all core tools at once via the preset list: + +```python +from strands import Agent +from strands_tools.apify import APIFY_CORE_TOOLS + +agent = Agent(tools=APIFY_CORE_TOOLS) +``` + +Or pick individual tools for a smaller LLM tool surface: + +```python +from strands import Agent +from strands_tools import apify + +agent = Agent(tools=[ + apify.apify_scrape_url, + apify.apify_run_actor, +]) + +# Scrape a single URL +content = agent.tool.apify_scrape_url(url="https://example.com") + +# Run an Actor +result = agent.tool.apify_run_actor( + actor_id="apify/website-content-crawler", + run_input={"startUrls": [{"url": "https://example.com"}]}, +) +``` """ +import json import logging import os from typing import Any, Dict, List, Optional @@ -23,6 +62,7 @@ from rich.panel import Panel from rich.text import Text +from strands import tool from strands_tools.utils import console_util @@ -311,3 +351,300 @@ def scrape_url( raise RuntimeError(f"No content returned for URL: {url}") return str(items[0].get("markdown") or items[0].get("text", "")) + + +# --- Tool functions --- + + +@tool +def apify_run_actor( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, +) -> Dict[str, Any]: + """Run any Apify Actor and return the run metadata as JSON. + + Executes the Actor synchronously - blocks until the Actor run finishes or the timeout + is reached. Use this when you need to run a specific Actor and then inspect or process + the results separately. + + Common Actors: + - "apify/website-content-crawler" - scrape websites and extract content + - "apify/web-scraper" - general-purpose web scraper + - "apify/google-search-scraper" - scrape Google search results + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Actor", + ) + except Exception as e: + return _error_result(e, "apify_run_actor") + + +@tool +def apify_get_dataset_items( + dataset_id: str, + limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + offset: int = 0, +) -> Dict[str, Any]: + """Fetch items from an existing Apify dataset and return them as JSON. + + Use this after running an Actor to retrieve the structured results from its + default dataset, or to access any dataset by ID. + + Args: + dataset_id: The Apify dataset ID to fetch items from. + limit: Maximum number of items to return. 
Defaults to 100. + offset: Number of items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing an array of dataset items. + """ + try: + _check_dependency() + client = ApifyToolClient() + items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) + return _success_result( + text=json.dumps(items, indent=2, default=str), + panel_body=( + f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" + ), + panel_title="Apify: Dataset Items", + ) + except Exception as e: + return _error_result(e, "apify_get_dataset_items") + + +@tool +def apify_run_actor_and_get_dataset( + actor_id: str, + run_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + build: Optional[str] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify Actor and fetch its dataset results in one step. + + Convenience tool that combines running an Actor and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". + run_input: JSON-serializable input for the Actor. + timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the Actor run. + build: Actor build tag or number to run a specific version. Uses latest build if not set. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + build=build, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Actor run completed with dataset[/green]\n" + f"Actor: {actor_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Actor + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_actor_and_get_dataset") + + +@tool +def apify_run_task( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, +) -> Dict[str, Any]: + """Run an Apify task and return the run metadata as JSON. + + Tasks are saved Actor configurations with preset inputs. Use this when a task + has already been configured in Apify Console, so you don't need to specify + the full Actor input every time. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. 
+ memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. + + Returns: + Dict with status and content containing run metadata: run_id, status, dataset_id, + started_at, finished_at. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}" + ), + panel_title="Apify: Run Task", + ) + except Exception as e: + return _error_result(e, "apify_run_task") + + +@tool +def apify_run_task_and_get_dataset( + task_id: str, + task_input: Optional[Dict[str, Any]] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, + memory_mbytes: Optional[int] = None, + dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, + dataset_items_offset: int = 0, +) -> Dict[str, Any]: + """Run an Apify task and fetch its dataset results in one step. + + Convenience tool that combines running a task and fetching its default + dataset items into a single call. Use this when you want both the run metadata and the + result data without making two separate tool calls. + + Args: + task_id: Task identifier, e.g. "user/my-task" or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input. + timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. + memory_mbytes: Memory allocation in MB for the task run. + dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. + dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. + + Returns: + Dict with status and content containing run metadata (run_id, status, dataset_id, + started_at, finished_at) plus an "items" array containing the dataset results. + """ + try: + _check_dependency() + client = ApifyToolClient() + result = client.run_task_and_get_dataset( + task_id=task_id, + task_input=task_input, + timeout_secs=timeout_secs, + memory_mbytes=memory_mbytes, + dataset_items_limit=dataset_items_limit, + dataset_items_offset=dataset_items_offset, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]Task run completed with dataset[/green]\n" + f"Task: {task_id}\n" + f"Run ID: {result['run_id']}\n" + f"Status: {result['status']}\n" + f"Dataset ID: {result['dataset_id']}\n" + f"Items returned: {len(result['items'])}" + ), + panel_title="Apify: Run Task + Dataset", + ) + except Exception as e: + return _error_result(e, "apify_run_task_and_get_dataset") + + +@tool +def apify_scrape_url( + url: str, + timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, + crawler_type: str = "cheerio", +) -> Dict[str, Any]: + """Scrape a single URL and return its content as markdown. + + Uses the Website Content Crawler Actor under the hood, pre-configured for + fast single-page scraping. This is the simplest way to extract readable content + from any web page. + + Args: + url: The URL to scrape, e.g. "https://example.com". + timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. + crawler_type: Crawler engine to use. 
One of "cheerio" (fastest, no JS rendering, + default), "playwright:adaptive" (fast, renders JS if present), or + "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). + + Returns: + Dict with status and content containing the markdown content of the scraped page. + """ + try: + _check_dependency() + client = ApifyToolClient() + content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) + return _success_result( + text=content, + panel_body=( + f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" + ), + panel_title="Apify: Scrape URL", + ) + except Exception as e: + return _error_result(e, "apify_scrape_url") + + +APIFY_CORE_TOOLS = [ + apify_run_actor, + apify_get_dataset_items, + apify_run_actor_and_get_dataset, + apify_run_task, + apify_run_task_and_get_dataset, + apify_scrape_url, +] diff --git a/src/strands_tools/apify_core.py b/src/strands_tools/apify_core.py deleted file mode 100644 index 69d330ae..00000000 --- a/src/strands_tools/apify_core.py +++ /dev/null @@ -1,366 +0,0 @@ -"""Core Apify platform tools for Strands Agents. - -This module provides web scraping, data extraction, and automation capabilities -using the Apify platform. It lets you run any Actor, task, fetch dataset -results, and scrape individual URLs. - -Available Tools: ---------------- -- apify_run_actor: Run any Apify Actor with custom input -- apify_get_dataset_items: Fetch items from an Apify dataset with pagination -- apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step -- apify_run_task: Run a saved Actor task with optional input overrides -- apify_run_task_and_get_dataset: Run a task and fetch results in one step -- apify_scrape_url: Scrape a single URL and return content as Markdown - -Setup Requirements: ------------------- -1. Create an Apify account at https://apify.com -2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens -3. Install the optional dependency: pip install strands-agents-tools[apify] -4. Set the environment variable: - APIFY_API_TOKEN=your_api_token_here - -Usage Examples: --------------- -Register all core tools at once via the preset list: - -```python -from strands import Agent -from strands_tools.apify_core import APIFY_CORE_TOOLS - -agent = Agent(tools=APIFY_CORE_TOOLS) -``` - -Or pick individual tools for a smaller LLM tool surface: - -```python -from strands import Agent -from strands_tools import apify_core - -agent = Agent(tools=[ - apify_core.apify_scrape_url, - apify_core.apify_run_actor, -]) - -# Scrape a single URL -content = agent.tool.apify_scrape_url(url="https://example.com") - -# Run an Actor -result = agent.tool.apify_run_actor( - actor_id="apify/website-content-crawler", - run_input={"startUrls": [{"url": "https://example.com"}]}, -) -``` -""" - -import json -from typing import Any, Dict, Optional - -from strands import tool - -from strands_tools.apify import ( - DEFAULT_DATASET_ITEMS_LIMIT, - DEFAULT_SCRAPE_TIMEOUT_SECS, - DEFAULT_TIMEOUT_SECS, - ApifyToolClient, - _check_dependency, - _error_result, - _success_result, -) - - -@tool -def apify_run_actor( - actor_id: str, - run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - build: Optional[str] = None, -) -> Dict[str, Any]: - """Run any Apify Actor and return the run metadata as JSON. 
- - Executes the Actor synchronously - blocks until the Actor run finishes or the timeout - is reached. Use this when you need to run a specific Actor and then inspect or process - the results separately. - - Common Actors: - - "apify/website-content-crawler" - scrape websites and extract content - - "apify/web-scraper" - general-purpose web scraper - - "apify/google-search-scraper" - scrape Google search results - - Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. - timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. - build: Actor build tag or number to run a specific version. Uses latest build if not set. - - Returns: - Dict with status and content containing run metadata: run_id, status, dataset_id, - started_at, finished_at. - """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_actor( - actor_id=actor_id, - run_input=run_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - build=build, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Actor run completed[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}" - ), - panel_title="Apify: Run Actor", - ) - except Exception as e: - return _error_result(e, "apify_run_actor") - - -@tool -def apify_get_dataset_items( - dataset_id: str, - limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - offset: int = 0, -) -> Dict[str, Any]: - """Fetch items from an existing Apify dataset and return them as JSON. - - Use this after running an Actor to retrieve the structured results from its - default dataset, or to access any dataset by ID. - - Args: - dataset_id: The Apify dataset ID to fetch items from. - limit: Maximum number of items to return. Defaults to 100. - offset: Number of items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing an array of dataset items. - """ - try: - _check_dependency() - client = ApifyToolClient() - items = client.get_dataset_items(dataset_id=dataset_id, limit=limit, offset=offset) - return _success_result( - text=json.dumps(items, indent=2, default=str), - panel_body=( - f"[green]Dataset items retrieved[/green]\nDataset ID: {dataset_id}\nItems returned: {len(items)}" - ), - panel_title="Apify: Dataset Items", - ) - except Exception as e: - return _error_result(e, "apify_get_dataset_items") - - -@tool -def apify_run_actor_and_get_dataset( - actor_id: str, - run_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - build: Optional[str] = None, - dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - dataset_items_offset: int = 0, -) -> Dict[str, Any]: - """Run an Apify Actor and fetch its dataset results in one step. - - Convenience tool that combines running an Actor and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the - result data without making two separate tool calls. - - Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. - timeout_secs: Maximum time in seconds to wait for the Actor run to finish. 
Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. - build: Actor build tag or number to run a specific version. Uses latest build if not set. - dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. - dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the dataset results. - """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_actor_and_get_dataset( - actor_id=actor_id, - run_input=run_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - build=build, - dataset_items_limit=dataset_items_limit, - dataset_items_offset=dataset_items_offset, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Actor run completed with dataset[/green]\n" - f"Actor: {actor_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}" - ), - panel_title="Apify: Run Actor + Dataset", - ) - except Exception as e: - return _error_result(e, "apify_run_actor_and_get_dataset") - - -@tool -def apify_run_task( - task_id: str, - task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, -) -> Dict[str, Any]: - """Run an Apify task and return the run metadata as JSON. - - Tasks are saved Actor configurations with preset inputs. Use this when a task - has already been configured in Apify Console, so you don't need to specify - the full Actor input every time. - - Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. - timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. - - Returns: - Dict with status and content containing run metadata: run_id, status, dataset_id, - started_at, finished_at. - """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_task( - task_id=task_id, - task_input=task_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Task run completed[/green]\n" - f"Task: {task_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}" - ), - panel_title="Apify: Run Task", - ) - except Exception as e: - return _error_result(e, "apify_run_task") - - -@tool -def apify_run_task_and_get_dataset( - task_id: str, - task_input: Optional[Dict[str, Any]] = None, - timeout_secs: int = DEFAULT_TIMEOUT_SECS, - memory_mbytes: Optional[int] = None, - dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, - dataset_items_offset: int = 0, -) -> Dict[str, Any]: - """Run an Apify task and fetch its dataset results in one step. - - Convenience tool that combines running a task and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the - result data without making two separate tool calls. - - Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. 
- timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. - dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. - dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. - - Returns: - Dict with status and content containing run metadata (run_id, status, dataset_id, - started_at, finished_at) plus an "items" array containing the dataset results. - """ - try: - _check_dependency() - client = ApifyToolClient() - result = client.run_task_and_get_dataset( - task_id=task_id, - task_input=task_input, - timeout_secs=timeout_secs, - memory_mbytes=memory_mbytes, - dataset_items_limit=dataset_items_limit, - dataset_items_offset=dataset_items_offset, - ) - return _success_result( - text=json.dumps(result, indent=2, default=str), - panel_body=( - f"[green]Task run completed with dataset[/green]\n" - f"Task: {task_id}\n" - f"Run ID: {result['run_id']}\n" - f"Status: {result['status']}\n" - f"Dataset ID: {result['dataset_id']}\n" - f"Items returned: {len(result['items'])}" - ), - panel_title="Apify: Run Task + Dataset", - ) - except Exception as e: - return _error_result(e, "apify_run_task_and_get_dataset") - - -@tool -def apify_scrape_url( - url: str, - timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, - crawler_type: str = "cheerio", -) -> Dict[str, Any]: - """Scrape a single URL and return its content as markdown. - - Uses the Website Content Crawler Actor under the hood, pre-configured for - fast single-page scraping. This is the simplest way to extract readable content - from any web page. - - Args: - url: The URL to scrape, e.g. "https://example.com". - timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. - crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, - default), "playwright:adaptive" (fast, renders JS if present), or - "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). - - Returns: - Dict with status and content containing the markdown content of the scraped page. - """ - try: - _check_dependency() - client = ApifyToolClient() - content = client.scrape_url(url=url, timeout_secs=timeout_secs, crawler_type=crawler_type) - return _success_result( - text=content, - panel_body=( - f"[green]URL scraped successfully[/green]\nURL: {url}\nContent length: {len(content)} characters" - ), - panel_title="Apify: Scrape URL", - ) - except Exception as e: - return _error_result(e, "apify_scrape_url") - - -# Pre-built list of all core tools for convenient agent registration. 
-# Usage: Agent(tools=APIFY_CORE_TOOLS) -APIFY_CORE_TOOLS = [ - apify_run_actor, - apify_get_dataset_items, - apify_run_actor_and_get_dataset, - apify_run_task, - apify_run_task_and_get_dataset, - apify_scrape_url, -] diff --git a/tests/test_apify.py b/tests/test_apify.py index 963225fd..038f8211 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -5,9 +5,9 @@ import pytest -from strands_tools import apify, apify_core -from strands_tools.apify import ApifyToolClient -from strands_tools.apify_core import ( +from strands_tools import apify +from strands_tools.apify import ( + ApifyToolClient, apify_get_dataset_items, apify_run_actor, apify_run_actor_and_get_dataset, @@ -96,18 +96,12 @@ def mock_apify_env(monkeypatch): # --- Module import --- -def test_apify_base_module_is_importable(): - """Verify that the apify base module can be imported from strands_tools.""" +def test_apify_module_is_importable(): + """Verify that the apify module can be imported from strands_tools.""" assert apify is not None assert apify.__name__ == "strands_tools.apify" -def test_apify_core_module_is_importable(): - """Verify that the apify_core tool module can be imported from strands_tools.""" - assert apify_core is not None - assert apify_core.__name__ == "strands_tools.apify_core" - - # --- ApifyToolClient --- From cb7f9e569391242963e774e3a25fc1adb7d8290f Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 25 Mar 2026 15:41:00 +0100 Subject: [PATCH 13/39] feat: add new search and crawling tool to Apify integration --- README.md | 35 +++- docs/apify_tool.md | 161 ++++++++++++++++++- src/strands_tools/apify.py | 321 ++++++++++++++++++++++++++++++++++++- tests/test_apify.py | 272 +++++++++++++++++++++++++++++++ 4 files changed, 785 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 67d9833a..8cd1ffc5 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,11 @@ Below is a comprehensive table of all available tools, how to use them with an a | apify_run_task | `agent.tool.apify_run_task(task_id="user/my-task")` | Run a saved Apify task by ID with optional input overrides | | apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user/my-task", dataset_items_limit=50)` | Run a task and fetch its dataset results in one step | | apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | +| apify_google_search_scraper | `agent.tool.apify_google_search_scraper(search_query="best AI frameworks")` | Search Google and return structured results | +| apify_google_places_scraper | `agent.tool.apify_google_places_scraper(search_query="restaurants in Prague")` | Search Google Maps for businesses and places | +| apify_youtube_scraper | `agent.tool.apify_youtube_scraper(search_query="python tutorial")` | Scrape YouTube videos, channels, or search results | +| apify_website_content_crawler | `agent.tool.apify_website_content_crawler(start_url="https://docs.example.com")` | Crawl a website and extract content from multiple pages | +| apify_ecommerce_scraper | `agent.tool.apify_ecommerce_scraper(url="https://www.amazon.com/dp/B0TEST")` | Scrape product data from e-commerce websites | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | | editor | 
`agent.tool.editor(command="view", path="path/to/file.py")` | Advanced file operations like syntax highlighting, pattern replacement, and multi-file edits | @@ -970,9 +975,9 @@ result = agent.tool.mongodb_memory( ```python from strands import Agent -from strands_tools.apify import APIFY_CORE_TOOLS +from strands_tools.apify import APIFY_ALL_TOOLS -agent = Agent(tools=APIFY_CORE_TOOLS) +agent = Agent(tools=APIFY_ALL_TOOLS) # Scrape a single URL and get markdown content content = agent.tool.apify_scrape_url(url="https://example.com") @@ -1005,6 +1010,32 @@ items = agent.tool.apify_get_dataset_items( dataset_id="abc123", limit=100, ) + +# Search Google +results = agent.tool.apify_google_search_scraper( + search_query="best AI frameworks 2025", + results_limit=10, +) + +# Search Google Maps for places +places = agent.tool.apify_google_places_scraper( + search_query="restaurants in Prague", + include_reviews=True, +) + +# Scrape YouTube +videos = agent.tool.apify_youtube_scraper(search_query="python tutorial") + +# Crawl a website (multi-page) +pages = agent.tool.apify_website_content_crawler( + start_url="https://docs.example.com", + max_pages=20, +) + +# Scrape e-commerce product data +products = agent.tool.apify_ecommerce_scraper( + url="https://www.amazon.com/dp/B0TEST", +) ``` ## 🌍 Environment Variables Configuration diff --git a/docs/apify_tool.md b/docs/apify_tool.md index f1455cdb..bc57b6dc 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -179,6 +179,159 @@ items = agent.tool.apify_get_dataset_items( **Returns:** JSON string with run metadata plus an `items` array containing the dataset results. +## Search & Crawling + +Specialized tools for common search and crawling use cases. Register all search tools at once: + +```python +from strands import Agent +from strands_tools.apify import APIFY_SEARCH_TOOLS + +agent = Agent(tools=APIFY_SEARCH_TOOLS) +``` + +Or register all Apify tools (core + search): + +```python +from strands_tools.apify import APIFY_ALL_TOOLS + +agent = Agent(tools=APIFY_ALL_TOOLS) +``` + +### Search Google + +Search Google and return structured results using the [Google Search Scraper](https://apify.com/apify/google-search-scraper) Actor: + +```python +result = agent.tool.apify_google_search_scraper( + search_query="best AI frameworks 2025", + results_limit=10, + country_code="us", +) +``` + +### Search Google Maps + +Search Google Maps for businesses and places using the [Google Maps Scraper](https://apify.com/compass/crawler-google-places) Actor: + +```python +result = agent.tool.apify_google_places_scraper( + search_query="restaurants in Prague", + results_limit=20, + include_reviews=True, + max_reviews=5, +) +``` + +### Scrape YouTube + +Scrape YouTube videos, channels, or search results using the [YouTube Scraper](https://apify.com/streamers/youtube-scraper) Actor: + +```python +# Search YouTube +result = agent.tool.apify_youtube_scraper( + search_query="python tutorial", + results_limit=10, +) + +# Scrape specific videos +result = agent.tool.apify_youtube_scraper( + urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"], +) +``` + +### Crawl a website + +Crawl a website and extract content from multiple pages using the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor. 
This is the multi-page version β€” distinct from `apify_scrape_url` which is limited to a single page: + +```python +result = agent.tool.apify_website_content_crawler( + start_url="https://docs.example.com", + max_pages=20, + max_depth=3, +) +``` + +### Scrape e-commerce products + +Scrape product data from e-commerce websites using the [E-commerce Scraping Tool](https://apify.com/apify/e-commerce-scraping-tool) Actor. Supports Amazon, eBay, Walmart, and other platforms: + +```python +# Scrape a single product page +result = agent.tool.apify_ecommerce_scraper( + url="https://www.amazon.com/dp/B0TEST", +) + +# Scrape a category or search results page +result = agent.tool.apify_ecommerce_scraper( + url="https://www.amazon.com/s?k=headphones", + url_type="listing", + results_limit=20, +) +``` + +## Search & Crawling Tool Parameters + +### apify_google_search_scraper + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `search_query` | string | Yes | β€” | The search query string. Supports advanced Google operators like `"site:example.com"` | +| `results_limit` | int | No | 10 | Maximum number of results to return. Google returns ~10 per page, so requesting more triggers additional page scraping | +| `country_code` | string | No | None | Two-letter country code for localized results (e.g., `"us"`, `"de"`) | +| `language_code` | string | No | None | Two-letter language code (e.g., `"en"`, `"de"`) | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait | + +**Returns:** JSON string with run metadata and an `items` array containing structured search results (organic results, ads, People Also Ask). + +### apify_google_places_scraper + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `search_query` | string | Yes | β€” | Search query for Google Maps (e.g., `"restaurants in Prague"`) | +| `results_limit` | int | No | 20 | Maximum number of places to return | +| `language` | string | No | None | Language for results (e.g., `"en"`, `"de"`) | +| `include_reviews` | bool | No | False | Whether to include user reviews | +| `max_reviews` | int | No | 5 | Maximum reviews per place when `include_reviews` is True | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait | + +**Returns:** JSON string with run metadata and an `items` array containing place data (name, address, rating, phone, website). + +### apify_youtube_scraper + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `search_query` | string | No | None | YouTube search query | +| `urls` | list[str] | No | None | Specific YouTube video or channel URLs | +| `results_limit` | int | No | 20 | Maximum number of results to return | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait | + +At least one of `search_query` or `urls` must be provided. + +**Returns:** JSON string with run metadata and an `items` array containing video/channel data. 
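+
+All of the search & crawling tools return the same wrapped result as the core tools: a dict with `status` and `content`, where the text content is the JSON-serialized run metadata plus the `items` array. The sketch below shows one way a caller could unwrap those items. It is a minimal, hedged example: it assumes the agent registered above, assumes the standard Strands tool result shape (`{"status": ..., "content": [{"text": ...}]}`), and the `title`/`url` item fields are purely illustrative, since each Actor defines its own output schema.
+
+```python
+import json
+
+
+def extract_items(result: dict) -> list[dict]:
+    """Unwrap the items array from an Apify tool result (assumed result shape)."""
+    if result.get("status") != "success":
+        raise RuntimeError(f"Apify tool call failed: {result}")
+    # The tool serializes run metadata and items as JSON text inside the first content block.
+    payload = json.loads(result["content"][0]["text"])
+    return payload.get("items", [])
+
+
+videos = extract_items(
+    agent.tool.apify_youtube_scraper(search_query="python tutorial", results_limit=5)
+)
+for video in videos:
+    # Field names are illustrative; inspect the returned items for the Actor's actual schema.
+    print(video.get("title"), video.get("url"))
+```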
+ +### apify_website_content_crawler + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `start_url` | string | Yes | β€” | The starting URL to crawl | +| `max_pages` | int | No | 10 | Maximum number of pages to crawl | +| `max_depth` | int | No | 2 | Maximum crawl depth from the start URL | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait | + +**Returns:** JSON string with run metadata and an `items` array containing crawled page data with markdown content. + +### apify_ecommerce_scraper + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `url` | string | Yes | β€” | The URL to scrape | +| `url_type` | string | No | `"product"` | Type of URL: `"product"` for a product detail page, `"listing"` for a category or search results page | +| `results_limit` | int | No | 20 | Maximum number of products to return | +| `timeout_secs` | int | No | 300 | Maximum time in seconds to wait | + +**Returns:** JSON string with run metadata and an `items` array containing structured product data. + ## Troubleshooting | Error | Cause | Fix | @@ -187,9 +340,10 @@ items = agent.tool.apify_get_dataset_items( | `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | | `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in [Apify Console](https://console.apify.com) | | `Task ... finished with status FAILED` | task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) | -| `Actor/task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | +| `Actor/task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter; `apify_website_content_crawler` with large `max_pages` may need 600+ seconds | | `Task ... returned no run data` | task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | | `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | +| `At least one of 'search_query' or 'urls' must be provided` | YouTube Scraper called without input | Provide a `search_query`, `urls`, or both | ## References @@ -198,3 +352,8 @@ items = agent.tool.apify_get_dataset_items( - [Apify API Documentation](https://docs.apify.com/api/v2) - [Apify Store](https://apify.com/store) - [Apify Python Client](https://docs.apify.com/api/client/python/docs) +- [Google Search Scraper Actor](https://apify.com/apify/google-search-scraper) +- [Google Maps Scraper Actor](https://apify.com/compass/crawler-google-places) +- [YouTube Scraper Actor](https://apify.com/streamers/youtube-scraper) +- [Website Content Crawler Actor](https://apify.com/apify/website-content-crawler) +- [E-commerce Scraping Tool Actor](https://apify.com/apify/e-commerce-scraping-tool) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 5855ab83..07171e2a 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -2,10 +2,11 @@ This module provides web scraping, data extraction, and automation capabilities using the Apify platform. It lets you run any Actor, task, fetch dataset -results, and scrape individual URLs. +results, scrape individual URLs, and perform specialized search and crawling. 
Available Tools: --------------- +Core: - apify_run_actor: Run any Apify Actor with custom input - apify_get_dataset_items: Fetch items from an Apify dataset with pagination - apify_run_actor_and_get_dataset: Run an Actor and fetch results in one step @@ -13,6 +14,13 @@ - apify_run_task_and_get_dataset: Run a task and fetch results in one step - apify_scrape_url: Scrape a single URL and return content as Markdown +Search & Crawling: +- apify_google_search_scraper: Search Google and return structured results +- apify_google_places_scraper: Search Google Maps for businesses and places +- apify_youtube_scraper: Scrape YouTube videos, channels, or search results +- apify_website_content_crawler: Crawl a website and extract content from multiple pages +- apify_ecommerce_scraper: Scrape product data from e-commerce websites + Setup Requirements: ------------------ 1. Create an Apify account at https://apify.com @@ -32,6 +40,24 @@ agent = Agent(tools=APIFY_CORE_TOOLS) ``` +Register all search & crawling tools: + +```python +from strands import Agent +from strands_tools.apify import APIFY_SEARCH_TOOLS + +agent = Agent(tools=APIFY_SEARCH_TOOLS) +``` + +Register all Apify tools (core + search): + +```python +from strands import Agent +from strands_tools.apify import APIFY_ALL_TOOLS + +agent = Agent(tools=APIFY_ALL_TOOLS) +``` + Or pick individual tools for a smaller LLM tool surface: ```python @@ -41,6 +67,7 @@ agent = Agent(tools=[ apify.apify_scrape_url, apify.apify_run_actor, + apify.apify_google_search_scraper, ]) # Scrape a single URL @@ -51,6 +78,12 @@ actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]}, ) + +# Search Google +results = agent.tool.apify_google_search_scraper( + search_query="best AI frameworks 2025", + results_limit=10, +) ``` """ @@ -648,3 +681,289 @@ def apify_scrape_url( apify_run_task_and_get_dataset, apify_scrape_url, ] + + +# --- Search & crawling tool constants --- + +GOOGLE_SEARCH_SCRAPER_ID = "apify/google-search-scraper" +GOOGLE_PLACES_SCRAPER_ID = "compass/crawler-google-places" +YOUTUBE_SCRAPER_ID = "streamers/youtube-scraper" +ECOMMERCE_SCRAPER_ID = "apify/e-commerce-scraping-tool" +DEFAULT_SEARCH_RESULTS_LIMIT = 20 + + +# --- Search & crawling helpers --- + + +def _search_crawl_result( + actor_name: str, + client: ApifyToolClient, + run_input: Dict[str, Any], + actor_id: str, + timeout_secs: int, + results_limit: int, +) -> Dict[str, Any]: + """Run a search/crawling Actor and return formatted results.""" + result = client.run_actor_and_get_dataset( + actor_id=actor_id, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=results_limit, + ) + return _success_result( + text=json.dumps(result, indent=2, default=str), + panel_body=( + f"[green]{actor_name} completed[/green]\nRun ID: {result['run_id']}\nItems returned: {len(result['items'])}" + ), + panel_title=f"Apify: {actor_name}", + ) + + +# --- Search & crawling tool functions --- + + +@tool +def apify_google_search_scraper( + search_query: str, + results_limit: int = 10, + country_code: Optional[str] = None, + language_code: Optional[str] = None, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Search Google and return structured search results. + + Uses the Google Search Scraper Actor to perform a Google search and return + organic results, ads, People Also Ask, and related queries in a structured format. + + Args: + search_query: The search query string, e.g. "best AI frameworks 2025". 
+ Supports advanced Google operators like "site:example.com" or "AI OR ML". + results_limit: Maximum number of results to return. Google returns ~10 results + per page, so requesting more triggers additional page scraping. Defaults to 10. + country_code: Two-letter country code for localized results, e.g. "us", "de". + language_code: Two-letter language code for the interface, e.g. "en", "de". + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing structured Google search results including + organic results, ads, and People Also Ask data. + """ + try: + _check_dependency() + client = ApifyToolClient() + max_pages = max(1, (results_limit + 9) // 10) + run_input: Dict[str, Any] = { + "queries": search_query, + "maxPagesPerQuery": max_pages, + } + if country_code is not None: + run_input["countryCode"] = country_code + if language_code is not None: + run_input["languageCode"] = language_code + return _search_crawl_result( + actor_name="Google Search Scraper", + client=client, + run_input=run_input, + actor_id=GOOGLE_SEARCH_SCRAPER_ID, + timeout_secs=timeout_secs, + results_limit=results_limit, + ) + except Exception as e: + return _error_result(e, "apify_google_search_scraper") + + +@tool +def apify_google_places_scraper( + search_query: str, + results_limit: int = DEFAULT_SEARCH_RESULTS_LIMIT, + language: Optional[str] = None, + include_reviews: bool = False, + max_reviews: int = 5, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Search Google Maps for businesses and places, optionally including reviews. + + Uses the Google Maps Scraper Actor to find places matching a search query + and return structured data including name, address, rating, phone, and website. + + Args: + search_query: Search query for Google Maps, e.g. "restaurants in Prague". + results_limit: Maximum number of places to return. Defaults to 20. + language: Language for results, e.g. "en", "de". Defaults to English. + include_reviews: Whether to include user reviews for each place. Defaults to False. + max_reviews: Maximum reviews per place when include_reviews is True. Defaults to 5. + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing structured Google Maps place data. + """ + try: + _check_dependency() + client = ApifyToolClient() + run_input: Dict[str, Any] = { + "searchStringsArray": [search_query], + "maxCrawledPlacesPerSearch": results_limit, + "maxReviews": max_reviews if include_reviews else 0, + } + if language is not None: + run_input["language"] = language + return _search_crawl_result( + actor_name="Google Places Scraper", + client=client, + run_input=run_input, + actor_id=GOOGLE_PLACES_SCRAPER_ID, + timeout_secs=timeout_secs, + results_limit=results_limit, + ) + except Exception as e: + return _error_result(e, "apify_google_places_scraper") + + +@tool +def apify_youtube_scraper( + search_query: Optional[str] = None, + urls: Optional[List[str]] = None, + results_limit: int = DEFAULT_SEARCH_RESULTS_LIMIT, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Scrape YouTube videos, channels, or search results. + + Uses the YouTube Scraper Actor to search YouTube or scrape specific video/channel + URLs. Provide either a search query, specific URLs, or both. + + Args: + search_query: YouTube search query, e.g. "python tutorial". + urls: Specific YouTube video or channel URLs to scrape. 
+ results_limit: Maximum number of results to return. Defaults to 20. + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing structured YouTube video/channel data. + """ + try: + _check_dependency() + if not search_query and not urls: + raise ValueError("At least one of 'search_query' or 'urls' must be provided.") + client = ApifyToolClient() + run_input: Dict[str, Any] = { + "maxResults": results_limit, + } + if search_query is not None: + run_input["searchQueries"] = [search_query] + if urls is not None: + run_input["startUrls"] = [{"url": u} for u in urls] + return _search_crawl_result( + actor_name="YouTube Scraper", + client=client, + run_input=run_input, + actor_id=YOUTUBE_SCRAPER_ID, + timeout_secs=timeout_secs, + results_limit=results_limit, + ) + except Exception as e: + return _error_result(e, "apify_youtube_scraper") + + +@tool +def apify_website_content_crawler( + start_url: str, + max_pages: int = 10, + max_depth: int = 2, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Crawl a website and extract content from multiple pages. + + Uses the Website Content Crawler Actor to perform a multi-page crawl starting + from the given URL. Returns page content as markdown. This is the extended + multi-page version β€” distinct from apify_scrape_url which scrapes a single page. + + Args: + start_url: The starting URL to crawl, e.g. "https://docs.example.com". + max_pages: Maximum number of pages to crawl. Defaults to 10. + max_depth: Maximum crawl depth from the start URL. Defaults to 2. + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing crawled page data with markdown content. + """ + try: + _check_dependency() + client = ApifyToolClient() + client._validate_url(start_url) + run_input: Dict[str, Any] = { + "startUrls": [{"url": start_url}], + "maxCrawlPages": max_pages, + "maxCrawlDepth": max_depth, + "proxyConfiguration": {"useApifyProxy": True}, + } + return _search_crawl_result( + actor_name="Website Content Crawler", + client=client, + run_input=run_input, + actor_id=WEBSITE_CONTENT_CRAWLER, + timeout_secs=timeout_secs, + results_limit=max_pages, + ) + except Exception as e: + return _error_result(e, "apify_website_content_crawler") + + +VALID_ECOMMERCE_URL_TYPES = ("product", "listing") + + +@tool +def apify_ecommerce_scraper( + url: str, + url_type: str = "product", + results_limit: int = DEFAULT_SEARCH_RESULTS_LIMIT, + timeout_secs: int = DEFAULT_TIMEOUT_SECS, +) -> Dict[str, Any]: + """Scrape product data from e-commerce websites. + + Uses the E-commerce Scraping Tool Actor to extract structured product data + (title, price, description, images, etc.) from supported e-commerce platforms + including Amazon, eBay, Walmart, and others. The Actor auto-detects the site. + + Args: + url: The URL to scrape. + url_type: Type of URL being scraped. Use "product" (default) for a direct product + detail page, or "listing" for a category page or search results page containing + multiple products. + results_limit: Maximum number of products to return. Defaults to 20. + timeout_secs: Maximum time in seconds to wait for the run to finish. Defaults to 300. + + Returns: + Dict with status and content containing structured product data. 
+ """ + try: + _check_dependency() + client = ApifyToolClient() + client._validate_url(url) + if url_type not in VALID_ECOMMERCE_URL_TYPES: + raise ValueError(f"Invalid url_type '{url_type}'. Must be one of: {', '.join(VALID_ECOMMERCE_URL_TYPES)}.") + url_field = "listingUrls" if url_type == "listing" else "detailsUrls" + run_input: Dict[str, Any] = { + url_field: [{"url": url}], + "maxProductResults": results_limit, + } + return _search_crawl_result( + actor_name="E-commerce Scraper", + client=client, + run_input=run_input, + actor_id=ECOMMERCE_SCRAPER_ID, + timeout_secs=timeout_secs, + results_limit=results_limit, + ) + except Exception as e: + return _error_result(e, "apify_ecommerce_scraper") + + +APIFY_SEARCH_TOOLS = [ + apify_google_search_scraper, + apify_google_places_scraper, + apify_youtube_scraper, + apify_website_content_crawler, + apify_ecommerce_scraper, +] + +APIFY_ALL_TOOLS = APIFY_CORE_TOOLS + APIFY_SEARCH_TOOLS diff --git a/tests/test_apify.py b/tests/test_apify.py index 038f8211..c788eaea 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -8,12 +8,17 @@ from strands_tools import apify from strands_tools.apify import ( ApifyToolClient, + apify_ecommerce_scraper, apify_get_dataset_items, + apify_google_places_scraper, + apify_google_search_scraper, apify_run_actor, apify_run_actor_and_get_dataset, apify_run_task, apify_run_task_and_get_dataset, apify_scrape_url, + apify_website_content_crawler, + apify_youtube_scraper, ) MOCK_ACTOR_RUN = { @@ -707,3 +712,270 @@ def test_scrape_url_missing_token(monkeypatch): assert result["status"] == "error" assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +# --- apify_google_search_scraper --- + + +def test_google_search_scraper_success(mock_apify_env, mock_apify_client): + """Google Search Scraper returns structured results with correct input mapping.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_google_search_scraper(search_query="best AI frameworks", results_limit=5) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + assert len(data["items"]) == 3 + + mock_apify_client.actor.assert_called_once_with("apify/google-search-scraper") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["queries"] == "best AI frameworks" + assert run_input["maxPagesPerQuery"] == 1 + assert "resultsPerPage" not in run_input + + +def test_google_search_scraper_multi_page(mock_apify_env, mock_apify_client): + """Google Search Scraper calculates correct page count when results_limit exceeds 10.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_search_scraper(search_query="AI", results_limit=25) + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["maxPagesPerQuery"] == 3 + assert "resultsPerPage" not in run_input + + +def test_google_search_scraper_optional_params(mock_apify_env, mock_apify_client): + """Google Search Scraper includes optional country and language codes when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_search_scraper(search_query="AI", results_limit=10, country_code="de", language_code="de") + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["countryCode"] == "de" + assert run_input["languageCode"] == "de" 
+ + +def test_google_search_scraper_optional_params_omitted(mock_apify_env, mock_apify_client): + """Google Search Scraper omits optional fields when not provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_search_scraper(search_query="AI") + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert "countryCode" not in run_input + assert "languageCode" not in run_input + + +def test_google_search_scraper_missing_dependency(mock_apify_env): + """Google Search Scraper returns error when apify-client is not installed.""" + with patch("strands_tools.apify.HAS_APIFY_CLIENT", False): + result = apify_google_search_scraper(search_query="test") + + assert result["status"] == "error" + assert "apify-client" in result["content"][0]["text"] + + +def test_google_search_scraper_missing_token(monkeypatch): + """Google Search Scraper returns error when APIFY_API_TOKEN is missing.""" + monkeypatch.delenv("APIFY_API_TOKEN", raising=False) + result = apify_google_search_scraper(search_query="test") + + assert result["status"] == "error" + assert "APIFY_API_TOKEN" in result["content"][0]["text"] + + +def test_google_search_scraper_actor_failure(mock_apify_env, mock_apify_client): + """Google Search Scraper returns error when Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_google_search_scraper(search_query="test") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] + + +# --- apify_google_places_scraper --- + + +def test_google_places_scraper_success(mock_apify_env, mock_apify_client): + """Google Places Scraper returns structured results with correct input mapping.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_google_places_scraper(search_query="restaurants in Prague", results_limit=10) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + + mock_apify_client.actor.assert_called_once_with("compass/crawler-google-places") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["searchStringsArray"] == ["restaurants in Prague"] + assert run_input["maxCrawledPlacesPerSearch"] == 10 + assert run_input["maxReviews"] == 0 + + +def test_google_places_scraper_with_reviews(mock_apify_env, mock_apify_client): + """Google Places Scraper sets maxReviews when include_reviews is True.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_places_scraper(search_query="hotels in Berlin", include_reviews=True, max_reviews=10) + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["maxReviews"] == 10 + + +def test_google_places_scraper_reviews_disabled(mock_apify_env, mock_apify_client): + """Google Places Scraper sets maxReviews to 0 when include_reviews is False.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_places_scraper(search_query="cafes", include_reviews=False, max_reviews=10) + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["maxReviews"] == 0 + + +def test_google_places_scraper_optional_language(mock_apify_env, mock_apify_client): + """Google Places 
Scraper includes language when provided.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + apify_google_places_scraper(search_query="cafes", language="de") + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["language"] == "de" + + +# --- apify_youtube_scraper --- + + +def test_youtube_scraper_search_query(mock_apify_env, mock_apify_client): + """YouTube Scraper returns results when given a search query.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_youtube_scraper(search_query="python tutorial", results_limit=5) + + assert result["status"] == "success" + mock_apify_client.actor.assert_called_once_with("streamers/youtube-scraper") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["searchQueries"] == ["python tutorial"] + assert run_input["maxResults"] == 5 + assert "startUrls" not in run_input + + +def test_youtube_scraper_urls(mock_apify_env, mock_apify_client): + """YouTube Scraper returns results when given specific URLs.""" + urls = ["https://www.youtube.com/watch?v=abc123"] + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_youtube_scraper(urls=urls) + + assert result["status"] == "success" + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["startUrls"] == [{"url": "https://www.youtube.com/watch?v=abc123"}] + assert "searchQueries" not in run_input + + +def test_youtube_scraper_both_query_and_urls(mock_apify_env, mock_apify_client): + """YouTube Scraper accepts both search_query and urls simultaneously.""" + urls = ["https://www.youtube.com/watch?v=abc123"] + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_youtube_scraper(search_query="python", urls=urls) + + assert result["status"] == "success" + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["searchQueries"] == ["python"] + assert run_input["startUrls"] == [{"url": "https://www.youtube.com/watch?v=abc123"}] + + +def test_youtube_scraper_no_input(mock_apify_env): + """YouTube Scraper returns error when neither search_query nor urls is provided.""" + result = apify_youtube_scraper() + + assert result["status"] == "error" + assert "search_query" in result["content"][0]["text"] + + +# --- apify_website_content_crawler --- + + +def test_website_content_crawler_success(mock_apify_env, mock_apify_client): + """Website Content Crawler returns results with correct input mapping.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_website_content_crawler(start_url="https://docs.example.com", max_pages=5, max_depth=3) + + assert result["status"] == "success" + mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["startUrls"] == [{"url": "https://docs.example.com"}] + assert run_input["maxCrawlPages"] == 5 + assert run_input["maxCrawlDepth"] == 3 + assert run_input["proxyConfiguration"] == {"useApifyProxy": True} + + +def test_website_content_crawler_defaults(mock_apify_env, mock_apify_client): + """Website Content Crawler uses correct defaults for max_pages and max_depth.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + 
apify_website_content_crawler(start_url="https://example.com") + + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["maxCrawlPages"] == 10 + assert run_input["maxCrawlDepth"] == 2 + + +def test_website_content_crawler_invalid_url(mock_apify_env): + """Website Content Crawler returns error for invalid URL.""" + result = apify_website_content_crawler(start_url="not-a-url") + + assert result["status"] == "error" + assert "Invalid URL" in result["content"][0]["text"] + + +# --- apify_ecommerce_scraper --- + + +def test_ecommerce_scraper_success(mock_apify_env, mock_apify_client): + """E-commerce Scraper returns results with correct input mapping for product URL.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_ecommerce_scraper(url="https://www.amazon.com/dp/B0TEST", results_limit=10) + + assert result["status"] == "success" + data = json.loads(result["content"][0]["text"]) + assert data["run_id"] == "run-HG7ml5fB1hCp8YEBA" + + mock_apify_client.actor.assert_called_once_with("apify/e-commerce-scraping-tool") + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["detailsUrls"] == [{"url": "https://www.amazon.com/dp/B0TEST"}] + assert "listingUrls" not in run_input + assert run_input["maxProductResults"] == 10 + + +def test_ecommerce_scraper_listing_url(mock_apify_env, mock_apify_client): + """E-commerce Scraper uses listingUrls when url_type is 'listing'.""" + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_ecommerce_scraper( + url="https://www.amazon.com/s?k=headphones", url_type="listing", results_limit=10 + ) + + assert result["status"] == "success" + run_input = mock_apify_client.actor.return_value.call.call_args.kwargs["run_input"] + assert run_input["listingUrls"] == [{"url": "https://www.amazon.com/s?k=headphones"}] + assert "detailsUrls" not in run_input + + +def test_ecommerce_scraper_invalid_url_type(mock_apify_env): + """E-commerce Scraper returns error for invalid url_type.""" + result = apify_ecommerce_scraper(url="https://www.amazon.com/dp/B0TEST", url_type="invalid") + + assert result["status"] == "error" + assert "url_type" in result["content"][0]["text"] + + +def test_ecommerce_scraper_invalid_url(mock_apify_env): + """E-commerce Scraper returns error for invalid URL.""" + result = apify_ecommerce_scraper(url="not-a-url") + + assert result["status"] == "error" + assert "Invalid URL" in result["content"][0]["text"] + + +def test_ecommerce_scraper_actor_failure(mock_apify_env, mock_apify_client): + """E-commerce Scraper returns error when Actor fails.""" + mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_ecommerce_scraper(url="https://www.amazon.com/dp/B0TEST") + + assert result["status"] == "error" + assert "FAILED" in result["content"][0]["text"] From 849925fa89e41dd6289dd4a79e5620c10d43571d Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow <3211021+zastrowm@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:08:22 -0400 Subject: [PATCH 14/39] feat: make verify_ssl=False opt-in via environment variable (#425) Co-authored-by: Mackenzie Zastrow --- README.md | 6 ++++ src/strands_tools/http_request.py | 14 ++++++-- tests/test_http_request.py | 56 ++++++++++++++++++++++++------- 3 files changed, 62 insertions(+), 14 deletions(-) diff --git a/README.md 
b/README.md index e945edf4..0e71be16 100644 --- a/README.md +++ b/README.md @@ -1139,6 +1139,12 @@ The Mem0 Memory Tool supports three different backend configurations: |----------------------|-------------|---------| | ENV_VARS_MASKED_DEFAULT | Default setting for masking sensitive values | true | +#### HTTP Request Tool + +| Environment Variable | Description | Default | +|----------------------|-------------|---------| +| STRANDS_HTTP_ALLOW_INSECURE_SSL | Allow disabling SSL certificate verification via verify_ssl parameter | false | + #### Dynamic MCP Client Tool | Environment Variable | Description | Default | diff --git a/src/strands_tools/http_request.py b/src/strands_tools/http_request.py index ba107d71..71b9a928 100644 --- a/src/strands_tools/http_request.py +++ b/src/strands_tools/http_request.py @@ -98,7 +98,7 @@ }, "verify_ssl": { "type": "boolean", - "description": "Whether to verify SSL certificates", + "description": "Whether to verify SSL certificates. Disabling may be restricted.", }, "cookie": { "type": "string", @@ -643,7 +643,17 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult: url = tool_input["url"] headers = process_auth_headers(tool_input.get("headers", {}), tool_input) body = tool_input.get("body") - verify = tool_input.get("verify_ssl", True) + + # verify_ssl=False is opt-in via STRANDS_HTTP_ALLOW_INSECURE_SSL env var + verify_ssl_input = tool_input.get("verify_ssl", True) + if verify_ssl_input is False: + if os.environ.get("STRANDS_HTTP_ALLOW_INSECURE_SSL", "").lower() != "true": + raise ValueError( + "SSL verification cannot be disabled unless the STRANDS_HTTP_ALLOW_INSECURE_SSL " + "environment variable is set to 'true'." + ) + verify = verify_ssl_input + cookie = tool_input.get("cookie") cookie_jar = tool_input.get("cookie_jar") diff --git a/tests/test_http_request.py b/tests/test_http_request.py index bd8e3fee..5a782fb3 100644 --- a/tests/test_http_request.py +++ b/tests/test_http_request.py @@ -605,23 +605,55 @@ def test_verify_ssl_option(): }, } - # Call http_request with verify_ssl=False - with patch("strands_tools.http_request.get_user_input") as mock_input: - mock_input.return_value = "y" - # Use a real request but don't actually send it over the network - with responses.RequestsMock() as rsps: - rsps.add( - responses.GET, - "https://example.com/api/insecure", - json={"status": "insecure"}, - status=200, - ) - result = http_request.http_request(tool=tool_use) + # Call http_request with verify_ssl=False (requires STRANDS_HTTP_ALLOW_INSECURE_SSL) + original_env = os.environ.copy() + os.environ["STRANDS_HTTP_ALLOW_INSECURE_SSL"] = "true" + try: + with patch("strands_tools.http_request.get_user_input") as mock_input: + mock_input.return_value = "y" + # Use a real request but don't actually send it over the network + with responses.RequestsMock() as rsps: + rsps.add( + responses.GET, + "https://example.com/api/insecure", + json={"status": "insecure"}, + status=200, + ) + result = http_request.http_request(tool=tool_use) + finally: + os.environ.clear() + os.environ.update(original_env) # Verify the result assert result["status"] == "success" +def test_verify_ssl_blocked_without_env_var(): + """Test that verify_ssl=False is blocked without STRANDS_HTTP_ALLOW_INSECURE_SSL.""" + tool_use = { + "toolUseId": "test-ssl-blocked-id", + "input": { + "method": "GET", + "url": "https://example.com/api/insecure", + "verify_ssl": False, + }, + } + + # Ensure the env var is NOT set + original_env = os.environ.copy() + 
os.environ.pop("STRANDS_HTTP_ALLOW_INSECURE_SSL", None) + try: + with patch("strands_tools.http_request.get_user_input") as mock_input: + mock_input.return_value = "y" + result = http_request.http_request(tool=tool_use) + finally: + os.environ.clear() + os.environ.update(original_env) + + assert result["status"] == "error" + assert "STRANDS_HTTP_ALLOW_INSECURE_SSL" in result["content"][0]["text"] + + @responses.activate def test_dev_mode_no_confirmation(): """Test that in BYPASS_TOOL_CONSENT mode, no confirmation is requested for modifying requests.""" From 9eef60b2222f8f8caef18c937c15a30c3f203009 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow <3211021+zastrowm@users.noreply.github.com> Date: Wed, 25 Mar 2026 15:18:23 -0400 Subject: [PATCH 15/39] fix: add allowlist for auth_env_var parameter for http_request tool (#424) Co-authored-by: Mackenzie Zastrow --- src/strands_tools/http_request.py | 124 ++++++++++++++++++++++-------- tests/test_http_request.py | 114 ++++++++++++++++++++------- 2 files changed, 175 insertions(+), 63 deletions(-) diff --git a/src/strands_tools/http_request.py b/src/strands_tools/http_request.py index 71b9a928..d5ff1767 100644 --- a/src/strands_tools/http_request.py +++ b/src/strands_tools/http_request.py @@ -2,18 +2,26 @@ Make HTTP requests with comprehensive authentication, session management, and metrics. Supports all major authentication types and enterprise patterns. -Environment Variable Support: -1. Authentication tokens: - - Uses auth_env_var parameter to read tokens from environment (e.g., GITHUB_TOKEN, GITLAB_TOKEN) - - Example: http_request(method="GET", url="...", auth_type="token", auth_env_var="GITHUB_TOKEN") - - Supported variables: GITHUB_TOKEN, GITLAB_TOKEN, SLACK_BOT_TOKEN, AWS_ACCESS_KEY_ID, etc. -2. AWS credentials: - - Reads AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, AWS_REGION automatically - - Example: http_request(method="GET", url="...", auth_type="aws_sig_v4", aws_auth={"service": "s3"}) -Use the environment tool (agent.tool.environment) to view available environment variables: -- List all: environment(action="list") -- Get specific: environment(action="get", name="GITHUB_TOKEN") -- Set new: environment(action="set", name="CUSTOM_TOKEN", value="your-token") +Authentication Support: +1. Direct tokens: Pass auth_token directly for Bearer, token, custom, or api_key auth types +2. Basic auth: Provide username/password via basic_auth parameter +3. Digest auth: Provide credentials via digest_auth parameter +4. JWT: Provide secret/algorithm/expiry via jwt_config parameter +5. AWS SigV4: Uses boto3 credential chain automatically via aws_auth parameter + +Environment Variable Token Config: + Import and populate HTTP_REQUEST_TOKEN_CONFIG to allow specific environment variables + to be used as auth tokens for requests to matching domains. + + Format: {"ENV_VAR_NAME": ["allowed.domain.com", "*.other.com"]} + + Example: + from strands_tools.http_request import HTTP_REQUEST_TOKEN_CONFIG + HTTP_REQUEST_TOKEN_CONFIG["GITHUB_TOKEN"] = ["api.github.com"] + HTTP_REQUEST_TOKEN_CONFIG["GITLAB_TOKEN"] = ["gitlab.com"] + + When auth_env_var is passed to the tool, the token is only injected if the request + domain matches one of the allowed domains for that variable. """ import base64 @@ -48,9 +56,8 @@ "name": "http_request", "description": ( "Make HTTP requests to any API with comprehensive authentication including Bearer tokens, Basic auth, " - "JWT, AWS SigV4, Digest auth, and enterprise authentication patterns. 
Automatically reads tokens from " - "environment variables (GITHUB_TOKEN, GITLAB_TOKEN, AWS credentials, etc.) when auth_env_var is specified. " - "Use environment(action='list') to view available variables. Includes session management, metrics, " + "JWT, AWS SigV4, Digest auth, and enterprise authentication patterns. " + "Includes session management, metrics, " "streaming support, cookie handling, redirect control, proxy support, and optional HTML to markdown conversion." ), "inputSchema": { @@ -82,11 +89,15 @@ }, "auth_token": { "type": "string", - "description": "Authentication token (if not provided, will check environment variables)", + "description": "Authentication token (if not provided, will check auth_env_var if configured)", }, "auth_env_var": { "type": "string", - "description": "Name of environment variable containing the auth token", + "description": ( + "Name of an environment variable containing the auth token. " + "The variable must be listed in HTTP_REQUEST_TOKEN_CONFIG " + "with an allowed domain that matches the request URL." + ), }, "headers": { "type": "object", @@ -198,6 +209,12 @@ # Metrics storage REQUEST_METRICS = collections.defaultdict(list) +# Token config: maps env var names to lists of allowed domains. +# Import and populate this dict to enable auth_env_var support: +# from strands_tools.http_request import HTTP_REQUEST_TOKEN_CONFIG +# HTTP_REQUEST_TOKEN_CONFIG["GITHUB_TOKEN"] = ["api.github.com"] +HTTP_REQUEST_TOKEN_CONFIG: Dict[str, list] = {} + def extract_content_from_html(html: str) -> str: """Convert HTML content to Markdown format. @@ -372,12 +389,14 @@ def format_headers_table(headers: Dict) -> Table: def process_auth_headers(headers: Dict[str, Any], tool_input: Dict[str, Any]) -> Dict[str, Any]: - """ - Process authentication headers based on input parameters. + """Process authentication headers based on input parameters. Supports multiple authentication methods: - 1. Environment variables: Uses auth_env_var to read tokens - 2. Direct token: Uses auth_token parameter + 1. Direct token: Uses auth_token parameter + 2. Env var token: Uses auth_env_var parameter, validated against HTTP_REQUEST_TOKEN_CONFIG + 3. Basic auth: Handled separately via handle_basic_auth + 4. JWT: Handled separately via handle_jwt + 5. 
AWS SigV4: Handled separately via handle_aws_sigv4 Special handling for different APIs: - GitHub: Uses "token" prefix (auth_type="token") @@ -385,23 +404,39 @@ def process_auth_headers(headers: Dict[str, Any], tool_input: Dict[str, Any]) -> - AWS: Uses SigV4 signing (auth_type="aws_sig_v4") Examples: - # GitHub API with environment variable - process_auth_headers({}, {"auth_type": "token", "auth_env_var": "GITHUB_TOKEN"}) + # GitHub API with env var (requires HTTP_REQUEST_TOKEN_CONFIG["GITHUB_TOKEN"] = ["api.github.com"]) + process_auth_headers({}, {"auth_type": "token", "auth_env_var": "GITHUB_TOKEN", "url": "https://api.github.com/user"}) - # GitLab API with environment variable - process_auth_headers({}, {"auth_type": "Bearer", "auth_env_var": "GITLAB_TOKEN"}) + # Direct token + process_auth_headers({}, {"auth_type": "Bearer", "auth_token": "my-token"}) """ headers = headers or {} - # Get auth token from input or environment auth_token = tool_input.get("auth_token") + + # Resolve token from environment variable if auth_env_var is provided if not auth_token and "auth_env_var" in tool_input: env_var_name = tool_input["auth_env_var"] - auth_token = os.getenv(env_var_name) + allowed_domains = HTTP_REQUEST_TOKEN_CONFIG.get(env_var_name) + + if allowed_domains is None: + raise ValueError( + f"Environment variable '{env_var_name}' is not listed in STRANDS_HTTP_REQUEST_TOKEN_CONFIG. " + f"Add it with an explicit list of allowed domains before using it as an auth token." + ) + + # Validate the request URL against the allowed domains using URL parsing + request_url = tool_input.get("url", "") + request_host = urlparse(request_url).hostname or "" + if request_host not in allowed_domains: + raise ValueError( + f"Request to '{request_host}' is not in the allowed domains for '{env_var_name}': {allowed_domains}" + ) + + auth_token = os.environ.get(env_var_name) if not auth_token: raise ValueError( - f"Environment variable '{env_var_name}' not found or empty. " - f"Use environment(action='list') to see available variables." + f"Environment variable '{env_var_name}' is not set or is empty." ) auth_type = tool_input.get("auth_type") @@ -554,6 +589,16 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult: Common API Examples: 1. GitHub API (uses "token" auth_type): + ```python + http_request( + method="GET", + url="https://api.github.com/user", + auth_type="token", + auth_token="", + ) + ``` + + Or with env var (requires HTTP_REQUEST_TOKEN_CONFIG["GITHUB_TOKEN"] = ["api.github.com"]): ```python http_request( method="GET", @@ -564,6 +609,16 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult: ``` 2. 
GitLab API (uses "Bearer" auth_type): + ```python + http_request( + method="GET", + url="https://gitlab.com/api/v4/user", + auth_type="Bearer", + auth_token="", + ) + ``` + + Or with env var (requires HTTP_REQUEST_TOKEN_CONFIG["GITLAB_TOKEN"] = ["gitlab.com"]): ```python http_request( method="GET", @@ -622,9 +677,10 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult: ``` Environment Variables: - - Authentication tokens are read from environment when auth_env_var is specified - AWS credentials are automatically loaded from environment variables or credentials file - - Use environment(action='list') to view all available environment variables + + Token Config: + - Use HTTP_REQUEST_TOKEN_CONFIG to allow specific env vars as auth tokens for permitted domains """ console = console_util.create() @@ -947,9 +1003,9 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult: if "auth" in error_str or "token" in error_str or "credential" in error_str or "unauthorized" in error_str: suggestion = ( "\n\nSuggestion: Check your authentication setup. Common solutions:\n" - "- For GitHub API: Use auth_type='token' with auth_env_var='GITHUB_TOKEN'\n" - "- For GitLab API: Use auth_type='Bearer' with auth_env_var='GITLAB_TOKEN'\n" - "- Use environment(action='list') to view available environment variables" + "- For GitHub API: Use auth_type='token' with auth_token=''\n" + "- For GitLab API: Use auth_type='Bearer' with auth_token=''\n" + "- For AWS APIs: Use auth_type='aws_sig_v4' with aws_auth configuration" ) # Special handling for ImportError to help with test assertions diff --git a/tests/test_http_request.py b/tests/test_http_request.py index 5a782fb3..8a791ba3 100644 --- a/tests/test_http_request.py +++ b/tests/test_http_request.py @@ -25,7 +25,6 @@ def mock_request_state(): """Create a mock request state dictionary.""" return {} - @pytest.fixture def mock_env_vars(): """Set up mock environment variables for testing.""" @@ -45,7 +44,6 @@ def extract_result_text(result): return "\n".join([item["text"] for item in result["content"]]) return str(result) - @responses.activate def test_basic_get_request(): """Test a basic GET request with direct invocation.""" @@ -232,7 +230,7 @@ def test_disable_redirects(): @responses.activate -def test_auth_token_direct(mock_env_vars): +def test_auth_token_direct(): """Test using auth_token parameter directly.""" responses.add( responses.GET, @@ -262,8 +260,8 @@ def test_auth_token_direct(mock_env_vars): @responses.activate -def test_auth_token_from_env(mock_env_vars): - """Test getting auth token from environment variable.""" +def test_auth_token_bearer(): + """Test Bearer auth with direct auth_token.""" responses.add( responses.GET, "https://api.example.com/protected", @@ -278,7 +276,7 @@ def test_auth_token_from_env(mock_env_vars): "method": "GET", "url": "https://api.example.com/protected", "auth_type": "Bearer", - "auth_env_var": "TEST_TOKEN", + "auth_token": "test-token-value", }, } @@ -292,7 +290,7 @@ def test_auth_token_from_env(mock_env_vars): @responses.activate -def test_github_api_auth(mock_env_vars): +def test_github_api_auth(): """Test GitHub API authentication with token prefix.""" responses.add( responses.GET, @@ -315,7 +313,7 @@ def test_github_api_auth(mock_env_vars): "method": "GET", "url": "https://api.github.com/user", "auth_type": "token", - "auth_env_var": "GITHUB_TOKEN", + "auth_token": "github-token-1234", }, } @@ -330,6 +328,85 @@ def test_github_api_auth(mock_env_vars): assert responses.calls[0].request.headers["Accept"] == 
"application/vnd.github.v3+json" +@responses.activate +def test_auth_env_var_allowed_domain(mock_env_vars): + """Test auth_env_var resolves token when domain is in the allowlist.""" + responses.add( + responses.GET, + "https://api.github.com/user", + json={"login": "testuser"}, + status=200, + ) + + tool_use = { + "toolUseId": "test-env-var-allowed-id", + "input": { + "method": "GET", + "url": "https://api.github.com/user", + "auth_type": "token", + "auth_env_var": "GITHUB_TOKEN", + }, + } + + token_config = {"GITHUB_TOKEN": ["api.github.com"]} + with ( + patch("strands_tools.http_request.HTTP_REQUEST_TOKEN_CONFIG", token_config), + patch("strands_tools.http_request.get_user_input") as mock_input, + ): + mock_input.return_value = "y" + result = http_request.http_request(tool=tool_use) + + assert result["status"] == "success" + assert responses.calls[0].request.headers["Authorization"] == "token github-token-1234" + + +def test_auth_env_var_domain_not_allowed(mock_env_vars): + """Test auth_env_var raises error when domain is not in the allowlist.""" + tool_use = { + "toolUseId": "test-env-var-denied-id", + "input": { + "method": "GET", + "url": "https://evil.example.com/steal", + "auth_type": "token", + "auth_env_var": "GITHUB_TOKEN", + }, + } + + token_config = {"GITHUB_TOKEN": ["api.github.com"]} + with ( + patch("strands_tools.http_request.HTTP_REQUEST_TOKEN_CONFIG", token_config), + patch("strands_tools.http_request.get_user_input") as mock_input, + ): + mock_input.return_value = "y" + result = http_request.http_request(tool=tool_use) + + assert result["status"] == "error" + assert "not in the allowed domains" in result["content"][0]["text"] + + +def test_auth_env_var_not_in_config(): + """Test auth_env_var raises error when env var is not in token config at all.""" + tool_use = { + "toolUseId": "test-env-var-no-config-id", + "input": { + "method": "GET", + "url": "https://api.github.com/user", + "auth_type": "token", + "auth_env_var": "SOME_UNKNOWN_TOKEN", + }, + } + + with ( + patch("strands_tools.http_request.HTTP_REQUEST_TOKEN_CONFIG", {}), + patch("strands_tools.http_request.get_user_input") as mock_input, + ): + mock_input.return_value = "y" + result = http_request.http_request(tool=tool_use) + + assert result["status"] == "error" + assert "STRANDS_HTTP_REQUEST_TOKEN_CONFIG" in result["content"][0]["text"] + + @responses.activate def test_basic_auth(): """Test basic authentication.""" @@ -441,27 +518,6 @@ def test_cancellation(monkeypatch): monkeypatch.delenv("BYPASS_TOOL_CONSENT", raising=False) -@responses.activate -def test_missing_env_var(): - """Test error when environment variable doesn't exist.""" - tool_use = { - "toolUseId": "test-missing-env-id", - "input": { - "method": "GET", - "url": "https://api.example.com/", - "auth_type": "Bearer", - "auth_env_var": "NON_EXISTENT_TOKEN", - }, - } - - with patch("strands_tools.http_request.get_user_input") as mock_input: - mock_input.return_value = "y" - result = http_request.http_request(tool=tool_use) - - assert result["status"] == "error" - assert "Environment variable 'NON_EXISTENT_TOKEN' not found" in result["content"][0]["text"] - - def test_aws_sigv4_auth(): """Test AWS SigV4 authentication.""" tool_use = { From 488a168d0cf3c7b48ee932ad0b2433a690688614 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 26 Mar 2026 09:57:56 +0100 Subject: [PATCH 16/39] docs: add missing tools parameters --- docs/apify_tool.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 
f1455cdb..46e9e800 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -120,6 +120,7 @@ items = agent.tool.apify_get_dataset_items( |-----------|------|----------|---------|-------------| | `url` | string | Yes | β€” | The URL to scrape | | `timeout_secs` | int | No | 120 | Maximum time in seconds to wait for scraping to finish | +| `crawler_type` | string | No | `"cheerio"` | Crawler engine to use. One of `"cheerio"` (fastest, no JS rendering), `"playwright:adaptive"` (fast, renders JS if present), or `"playwright:firefox"` (reliable, renders JS, best at avoiding blocking but slower) | **Returns:** Markdown content of the scraped page as a plain string. @@ -131,6 +132,7 @@ items = agent.tool.apify_get_dataset_items( | `run_input` | dict | No | None | JSON-serializable input for the Actor | | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | | `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `build` | string | No | None | Actor build tag or number to run a specific version (uses latest build if not set) | **Returns:** JSON string with run metadata: `run_id`, `status`, `dataset_id`, `started_at`, `finished_at`. @@ -154,6 +156,7 @@ items = agent.tool.apify_get_dataset_items( | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the task run to finish | | `memory_mbytes` | int | No | None | Memory allocation in MB for the task run (uses task default if not set) | | `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | +| `dataset_items_offset` | int | No | 0 | Number of dataset items to skip for pagination | **Returns:** JSON string with run metadata plus an `items` array containing the dataset results. @@ -175,7 +178,9 @@ items = agent.tool.apify_get_dataset_items( | `run_input` | dict | No | None | JSON-serializable input for the Actor | | `timeout_secs` | int | No | 300 | Maximum time in seconds to wait for the Actor run to finish | | `memory_mbytes` | int | No | None | Memory allocation in MB for the Actor run (uses Actor default if not set) | +| `build` | string | No | None | Actor build tag or number to run a specific version (uses latest build if not set) | | `dataset_items_limit` | int | No | 100 | Maximum number of dataset items to return | +| `dataset_items_offset` | int | No | 0 | Number of dataset items to skip for pagination | **Returns:** JSON string with run metadata plus an `items` array containing the dataset results. 
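A brief sketch of the newly documented parameters in use; the Actor ID, URLs, and build tag are illustrative placeholders:

```python
# Render JavaScript-heavy pages with an alternative crawler engine
content = agent.tool.apify_scrape_url(
    url="https://example.com",
    crawler_type="playwright:adaptive",
)

# Pin a specific Actor build and page through dataset results
result = agent.tool.apify_run_actor_and_get_dataset(
    actor_id="apify/website-content-crawler",
    run_input={"startUrls": [{"url": "https://example.com"}]},
    build="latest",
    dataset_items_limit=100,
    dataset_items_offset=100,
)
```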
From 642f0db74d4539a71028d1be8e38524409903a6a Mon Sep 17 00:00:00 2001 From: Raju Ansari Date: Thu, 26 Mar 2026 07:21:01 -0700 Subject: [PATCH 17/39] feat: add payment required header support (#423) Co-authored-by: Raju Ansari --- src/strands_tools/http_request.py | 6 +- tests/test_http_request.py | 144 ++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/src/strands_tools/http_request.py b/src/strands_tools/http_request.py index d5ff1767..116f9797 100644 --- a/src/strands_tools/http_request.py +++ b/src/strands_tools/http_request.py @@ -971,8 +971,10 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult: result_text.append(f"Redirects: {redirect_count} redirects followed ({redirect_chain})") # Add minimal headers to text response - important_headers = ["Content-Type", "Content-Length", "Date", "Server"] - headers_text = {k: v for k, v in response.headers.items() if k in important_headers} + important_headers_lower = { + h.lower() for h in ["Content-Type", "Content-Length", "Date", "Server", "Payment-Required"] + } + headers_text = {k: v for k, v in response.headers.items() if k.lower() in important_headers_lower} result_text.append(f"Headers: {headers_text}") # Add body to text response diff --git a/tests/test_http_request.py b/tests/test_http_request.py index 8a791ba3..54c10c30 100644 --- a/tests/test_http_request.py +++ b/tests/test_http_request.py @@ -1180,3 +1180,147 @@ def test_proxy_support(): assert result["status"] == "success" result_text = extract_result_text(result) assert "Status Code: 200" in result_text + + +@responses.activate +def test_payment_required_header_in_response(): + """Test that Payment-Required header is captured in response.""" + # Set up mock response with Payment-Required header + responses.add( + responses.GET, + "https://api.example.com/premium-feature", + json={"error": "payment required"}, + status=402, + headers={"Payment-Required": "true"}, + content_type="application/json", + ) + + tool_use = { + "toolUseId": "test-payment-required-id", + "input": { + "method": "GET", + "url": "https://api.example.com/premium-feature", + }, + } + + with patch("strands_tools.http_request.get_user_input") as mock_input: + mock_input.return_value = "y" + result = http_request.http_request(tool=tool_use) + + assert result["status"] == "success" + result_text = extract_result_text(result) + + # Verify Payment-Required header is in the response + assert "Payment-Required" in result_text + assert "true" in result_text + assert "Status Code: 402" in result_text + + +@responses.activate +def test_payment_required_header_with_other_headers(): + """Test Payment-Required header is captured alongside other important headers.""" + # Set up mock response with multiple important headers + responses.add( + responses.GET, + "https://api.example.com/data", + json={"data": "test"}, + status=200, + headers={ + "Date": "Mon, 24 Mar 2026 12:00:00 GMT", + "Server": "nginx/1.20.0", + "Payment-Required": "false", + "X-Custom-Header": "should-not-appear", + }, + content_type="application/json", + ) + + tool_use = { + "toolUseId": "test-multiple-headers-id", + "input": { + "method": "GET", + "url": "https://api.example.com/data", + }, + } + + with patch("strands_tools.http_request.get_user_input") as mock_input: + mock_input.return_value = "y" + result = http_request.http_request(tool=tool_use) + + assert result["status"] == "success" + result_text = extract_result_text(result) + + # Verify important headers are present + assert "Content-Type" 
in result_text + assert "Server" in result_text + assert "Payment-Required" in result_text + + # Verify custom headers are not included + assert "X-Custom-Header" not in result_text + + +@responses.activate +def test_payment_required_header_case_insensitive(): + """Test that Payment-Required header is matched case-insensitively.""" + # Set up mock response with lowercase payment-required header + responses.add( + responses.GET, + "https://api.example.com/check", + json={"status": "ok"}, + status=200, + headers={"payment-required": "false"}, + content_type="application/json", + ) + + tool_use = { + "toolUseId": "test-case-insensitive-id", + "input": { + "method": "GET", + "url": "https://api.example.com/check", + }, + } + + with patch("strands_tools.http_request.get_user_input") as mock_input: + mock_input.return_value = "y" + result = http_request.http_request(tool=tool_use) + + assert result["status"] == "success" + result_text = extract_result_text(result) + + # Verify the header is captured regardless of case + assert "payment-required" in result_text.lower() + + +@responses.activate +def test_payment_required_header_missing(): + """Test response when Payment-Required header is not present.""" + # Set up mock response without Payment-Required header + responses.add( + responses.GET, + "https://api.example.com/free-feature", + json={"data": "free content"}, + status=200, + headers={ + "Server": "nginx", + }, + content_type="application/json", + ) + + tool_use = { + "toolUseId": "test-no-payment-header-id", + "input": { + "method": "GET", + "url": "https://api.example.com/free-feature", + }, + } + + with patch("strands_tools.http_request.get_user_input") as mock_input: + mock_input.return_value = "y" + result = http_request.http_request(tool=tool_use) + + assert result["status"] == "success" + result_text = extract_result_text(result) + + # Verify response is successful even without Payment-Required header + assert "Status Code: 200" in result_text + # The headers dict should still be present but without Payment-Required + assert "Headers:" in result_text From 22e8d4400989db54c607ecd62acdd3a73df9a94e Mon Sep 17 00:00:00 2001 From: Abdallah Moussawi Date: Thu, 26 Mar 2026 14:31:07 +0000 Subject: [PATCH 18/39] feat(code-interpreter): add session_timeout_seconds parameter to AgentCoreCodeInterpreter (#418) --- .../agent_core_code_interpreter.py | 13 +++- .../test_agent_core_code_interpreter.py | 76 +++++++++++++++++-- 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/src/strands_tools/code_interpreter/agent_core_code_interpreter.py b/src/strands_tools/code_interpreter/agent_core_code_interpreter.py index 7fe637b6..4e5ebf42 100644 --- a/src/strands_tools/code_interpreter/agent_core_code_interpreter.py +++ b/src/strands_tools/code_interpreter/agent_core_code_interpreter.py @@ -53,6 +53,7 @@ def __init__( session_name: Optional[str] = None, auto_create: bool = True, persist_sessions: bool = True, + session_timeout_seconds: int = 900, ) -> None: """ Initialize the Bedrock AgentCore code interpreter with session persistence support. @@ -100,6 +101,10 @@ def __init__( sessions to survive across invocations and be reconnected by subsequent instances via module-level cache. + session_timeout_seconds (int): Timeout in seconds for sessions created + by this instance. Sessions automatically terminate after the timeout period. + Default: 900 (15 minutes). + Session Lifecycle: Invocation 1 (Instance #1): 1. 
Create new instance with session_name="user-abc-123" @@ -180,6 +185,7 @@ def invoke(payload, context): self.identifier = identifier or "aws.codeinterpreter.v1" self.auto_create = auto_create self.persist_sessions = persist_sessions + self.session_timeout_seconds = session_timeout_seconds if session_name is None: self.default_session = f"session-{uuid.uuid4().hex[:12]}" @@ -262,8 +268,11 @@ def init_session(self, action: InitSessionAction) -> Dict[str, Any]: # Create new sandbox client client = BedrockAgentCoreCodeInterpreterClient(region=self.region) - # Start session with identifier and name - client.start(identifier=self.identifier, name=session_name) + client.start( + identifier=self.identifier, + name=session_name, + session_timeout_seconds=self.session_timeout_seconds, + ) aws_session_id = client.session_id diff --git a/tests/code_interpreter/test_agent_core_code_interpreter.py b/tests/code_interpreter/test_agent_core_code_interpreter.py index 1fff0789..89a7bb4e 100644 --- a/tests/code_interpreter/test_agent_core_code_interpreter.py +++ b/tests/code_interpreter/test_agent_core_code_interpreter.py @@ -67,6 +67,7 @@ def test_initialization(interpreter): assert interpreter.default_session.startswith("session-") assert interpreter.auto_create is True assert interpreter.persist_sessions is True + assert interpreter.session_timeout_seconds == 900 def test_initialization_with_new_parameters(): @@ -77,6 +78,22 @@ def test_initialization_with_new_parameters(): assert interpreter.persist_sessions is False +def test_initialization_with_session_timeout(): + """Test initialization with custom session timeout.""" + with patch("strands_tools.code_interpreter.agent_core_code_interpreter.resolve_region") as mock_resolve: + mock_resolve.return_value = "us-west-2" + interpreter = AgentCoreCodeInterpreter(region="us-west-2", session_timeout_seconds=1800) + assert interpreter.session_timeout_seconds == 1800 + + +def test_initialization_without_session_timeout(): + """Test initialization without session timeout defaults to 900.""" + with patch("strands_tools.code_interpreter.agent_core_code_interpreter.resolve_region") as mock_resolve: + mock_resolve.return_value = "us-west-2" + interpreter = AgentCoreCodeInterpreter(region="us-west-2") + assert interpreter.session_timeout_seconds == 900 + + def test_session_name_no_cleaning(): """Test that session names are used as-is without cleaning.""" with patch("strands_tools.code_interpreter.agent_core_code_interpreter.resolve_region") as mock_resolve: @@ -396,7 +413,9 @@ def test_init_session_success(mock_client_class, interpreter, mock_client): assert result["content"][0]["json"]["sessionId"] == "test-session-id-123" mock_client_class.assert_called_once_with(region="us-west-2") - mock_client.start.assert_called_once_with(identifier="aws.codeinterpreter.v1", name="my-session") + mock_client.start.assert_called_once_with( + identifier="aws.codeinterpreter.v1", name="my-session", session_timeout_seconds=900 + ) assert "my-session" in interpreter._sessions session_info = interpreter._sessions["my-session"] @@ -429,7 +448,9 @@ def test_init_session_with_custom_identifier(mock_client_class, mock_client): assert result["content"][0]["json"]["sessionId"] == "test-session-id-123" mock_client_class.assert_called_once_with(region="us-west-2") - mock_client.start.assert_called_once_with(identifier=custom_id, name="custom-session") + mock_client.start.assert_called_once_with( + identifier=custom_id, name="custom-session", session_timeout_seconds=900 + ) assert "custom-session" 
in interpreter._sessions session_info = interpreter._sessions["custom-session"] @@ -458,7 +479,9 @@ def test_init_session_with_default_identifier(mock_client_class, mock_client): assert result["content"][0]["json"]["sessionId"] == "test-session-id-123" mock_client_class.assert_called_once_with(region="us-west-2") - mock_client.start.assert_called_once_with(identifier="aws.codeinterpreter.v1", name="default-session") + mock_client.start.assert_called_once_with( + identifier="aws.codeinterpreter.v1", name="default-session", session_timeout_seconds=900 + ) assert "default-session" in interpreter._sessions session_info = interpreter._sessions["default-session"] @@ -468,6 +491,44 @@ def test_init_session_with_default_identifier(mock_client_class, mock_client): assert session_info.client == mock_client +@patch("strands_tools.code_interpreter.agent_core_code_interpreter.BedrockAgentCoreCodeInterpreterClient") +def test_init_session_with_session_timeout(mock_client_class, mock_client): + """Test session initialization passes session_timeout_seconds to client.start() when set.""" + with patch("strands_tools.code_interpreter.agent_core_code_interpreter.resolve_region") as mock_resolve: + mock_resolve.return_value = "us-west-2" + mock_client_class.return_value = mock_client + + interpreter = AgentCoreCodeInterpreter(region="us-west-2", session_timeout_seconds=1800) + + action = InitSessionAction(type="initSession", description="Test session", session_name="timeout-session") + + result = interpreter.init_session(action) + + assert result["status"] == "success" + mock_client.start.assert_called_once_with( + identifier="aws.codeinterpreter.v1", name="timeout-session", session_timeout_seconds=1800 + ) + + +@patch("strands_tools.code_interpreter.agent_core_code_interpreter.BedrockAgentCoreCodeInterpreterClient") +def test_init_session_without_session_timeout(mock_client_class, mock_client): + """Test session initialization passes default session_timeout_seconds to client.start().""" + with patch("strands_tools.code_interpreter.agent_core_code_interpreter.resolve_region") as mock_resolve: + mock_resolve.return_value = "us-west-2" + mock_client_class.return_value = mock_client + + interpreter = AgentCoreCodeInterpreter(region="us-west-2") + + action = InitSessionAction(type="initSession", description="Test session", session_name="no-timeout-session") + + result = interpreter.init_session(action) + + assert result["status"] == "success" + mock_client.start.assert_called_once_with( + identifier="aws.codeinterpreter.v1", name="no-timeout-session", session_timeout_seconds=900 + ) + + @patch("strands_tools.code_interpreter.agent_core_code_interpreter.BedrockAgentCoreCodeInterpreterClient") def test_init_session_multiple_identifiers_verification(mock_client_class, mock_client): """Test that different interpreter instances with different identifiers work correctly.""" @@ -498,9 +559,12 @@ def test_init_session_multiple_identifiers_verification(mock_client_class, mock_ assert mock_client.start.call_count == 3 call_args_list = mock_client.start.call_args_list - assert call_args_list[0] == ((), {"identifier": custom_id1, "name": "session1"}) - assert call_args_list[1] == ((), {"identifier": custom_id2, "name": "session2"}) - assert call_args_list[2] == ((), {"identifier": "aws.codeinterpreter.v1", "name": "session3"}) + assert call_args_list[0] == ((), {"identifier": custom_id1, "name": "session1", "session_timeout_seconds": 900}) + assert call_args_list[1] == ((), {"identifier": custom_id2, "name": "session2", 
"session_timeout_seconds": 900}) + assert call_args_list[2] == ( + (), + {"identifier": "aws.codeinterpreter.v1", "name": "session3", "session_timeout_seconds": 900}, + ) @patch("strands_tools.code_interpreter.agent_core_code_interpreter.BedrockAgentCoreCodeInterpreterClient") From 0af4fd72101d9f601d08f6eff7d89109e56ce799 Mon Sep 17 00:00:00 2001 From: Mackenzie Zastrow <3211021+zastrowm@users.noreply.github.com> Date: Thu, 26 Mar 2026 14:49:25 -0400 Subject: [PATCH 19/39] fix: add info-level logging when auth token is resolved from environment variable (#428) Co-authored-by: Mackenzie Zastrow --- src/strands_tools/http_request.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/strands_tools/http_request.py b/src/strands_tools/http_request.py index 116f9797..95176184 100644 --- a/src/strands_tools/http_request.py +++ b/src/strands_tools/http_request.py @@ -29,6 +29,7 @@ import datetime import http.cookiejar import json +import logging import os import time from typing import Any, Dict, Optional, Union @@ -52,6 +53,8 @@ from strands_tools.utils import console_util from strands_tools.utils.user_input import get_user_input +logger = logging.getLogger(__name__) + TOOL_SPEC = { "name": "http_request", "description": ( @@ -435,9 +438,8 @@ def process_auth_headers(headers: Dict[str, Any], tool_input: Dict[str, Any]) -> auth_token = os.environ.get(env_var_name) if not auth_token: - raise ValueError( - f"Environment variable '{env_var_name}' is not set or is empty." - ) + raise ValueError(f"Environment variable '{env_var_name}' is not set or is empty.") + logger.info(f"Resolved auth token from environment variable '{env_var_name}' for domain '{request_host}'") auth_type = tool_input.get("auth_type") From b0c8f30aa602e015f6024ae28d1247efbab60111 Mon Sep 17 00:00:00 2001 From: Agent of mkmeral Date: Mon, 30 Mar 2026 10:54:11 -0400 Subject: [PATCH 20/39] docs: add use_agent, graph, and elasticsearch_memory to README (#431) Co-authored-by: Strands Agent Co-authored-by: Strands Agent --- README.md | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/README.md b/README.md index 0e71be16..cf111a21 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,8 @@ Strands Agents Tools is a community-driven project that provides a powerful set - ⏱️ **Task Scheduling** - Schedule and manage cron jobs - 🧠 **Advanced Reasoning** - Tools for complex thinking and reasoning capabilities - 🐝 **Swarm Intelligence** - Coordinate multiple AI agents for parallel problem solving with shared memory +- πŸ€– **Agent as Tool** - Create nested agent instances with model switching support for multi-model workflows and specialized sub-tasks +- πŸ”— **Multi-Agent Graph** - Create and manage deterministic DAG-based multi-agent pipelines with output propagation and per-node model configuration - πŸ”Œ **Dynamic MCP Client** - ⚠️ Dynamically connect to external MCP servers and load remote tools (use with caution - see security warnings) - πŸ”„ **Multiple tools in Parallel** - Call multiple other tools at the same time in parallel with Batch Tool - πŸ” **Browser Tool** - Tool giving an agent access to perform automated actions on a browser (chromium) @@ -131,12 +133,14 @@ Below is a comprehensive table of all available tools, how to use them with an a | current_time | `agent.tool.current_time(timezone="US/Pacific")` | Get the current time in ISO 8601 format for a specified timezone | | sleep | `agent.tool.sleep(seconds=5)` | Pause execution for the 
specified number of seconds, interruptible with SIGINT (Ctrl+C) | | agent_graph | `agent.tool.agent_graph(agents=["agent1", "agent2"], connections=[{"from": "agent1", "to": "agent2"}])` | Create and visualize agent relationship graphs for complex multi-agent systems | +| graph | `agent.tool.graph(action="create", graph_id="pipeline", topology={"nodes": [...], "edges": [...]})` | Create and manage deterministic DAG-based multi-agent graphs using Strands SDK Graph implementation with per-node model configuration | | cron* | `agent.tool.cron(action="schedule", name="task", schedule="0 * * * *", command="backup.sh")` | Schedule and manage recurring tasks with cron job syntax
**Does not work on Windows | | slack | `agent.tool.slack(action="post_message", channel="general", text="Hello team!")` | Interact with Slack workspace for messaging and monitoring | | speak | `agent.tool.speak(text="Operation completed successfully", style="green", mode="polly")` | Output status messages with rich formatting and optional text-to-speech | | stop | `agent.tool.stop(message="Process terminated by user request")` | Gracefully terminate agent execution with custom message | | handoff_to_user | `agent.tool.handoff_to_user(message="Please confirm action", breakout_of_loop=False)` | Hand off control to user for confirmation, input, or complete task handoff | | use_llm | `agent.tool.use_llm(prompt="Analyze this data", system_prompt="You are a data analyst")` | Create nested AI loops with customized system prompts for specialized tasks | +| use_agent | `agent.tool.use_agent(prompt="Analyze this code", system_prompt="You are a code analyst.", model_provider="bedrock")` | Create nested agent instances with model switching, multi-model workflows, cost optimization, and specialized sub-tasks | | workflow | `agent.tool.workflow(action="create", name="data_pipeline", steps=[{"tool": "file_read"}, {"tool": "python_repl"}])` | Define, execute, and manage multi-step automated workflows | | mcp_client | `agent.tool.mcp_client(action="connect", connection_id="my_server", transport="stdio", command="python", args=["server.py"])` | ⚠️ **SECURITY WARNING**: Dynamically connect to external MCP servers via stdio, sse, or streamable_http, list tools, and call remote tools. This can pose security risks as agents may connect to malicious servers. Use with caution in production. | | batch| `agent.tool.batch(invocations=[{"name": "current_time", "arguments": {"timezone": "Europe/London"}}, {"name": "stop", "arguments": {}}])` | Call multiple other tools in parallel. | @@ -147,6 +151,7 @@ Below is a comprehensive table of all available tools, how to use them with an a | search_video | `agent.tool.search_video(query="people discussing AI")` | Semantic video search using TwelveLabs' Marengo model | | chat_video | `agent.tool.chat_video(prompt="What are the main topics?", video_id="video_123")` | Interactive video analysis using TwelveLabs' Pegasus model | | mongodb_memory | `agent.tool.mongodb_memory(action="record", content="User prefers vegetarian pizza", connection_string="mongodb+srv://...", database_name="memories")` | Store and retrieve memories using MongoDB Atlas with semantic search via AWS Bedrock Titan embeddings | +| elasticsearch_memory | `agent.tool.elasticsearch_memory(action="record", content="User prefers dark mode", cloud_id="...", api_key="...")` | Store and retrieve memories using Elasticsearch with semantic search via AWS Bedrock Titan embeddings | \* *These tools do not work on windows* @@ -679,6 +684,53 @@ agent.tool.handoff_to_user( ) ``` +### Use Agent (Agent as Tool) + +```python +from strands import Agent +from strands_tools import use_agent + +agent = Agent(tools=[use_agent]) + +# Basic usage - inherits parent agent's model +result = agent.tool.use_agent( + prompt="Tell me about the advantages of tool-building in AI agents", + system_prompt="You are a helpful AI assistant specializing in AI development concepts." 
+) + +# Use a different model provider for specialized tasks +result = agent.tool.use_agent( + prompt="Calculate 2 + 2 and explain the result", + system_prompt="You are a helpful math assistant.", + model_provider="bedrock", + model_settings={ + "model_id": "us.anthropic.claude-sonnet-4-20250514-v1:0" + }, + tools=["calculator"] +) + +# Use environment variables to determine model +import os +os.environ["STRANDS_PROVIDER"] = "ollama" +os.environ["STRANDS_MODEL_ID"] = "qwen3:4b" +result = agent.tool.use_agent( + prompt="Analyze this code", + system_prompt="You are a code review assistant.", + model_provider="env" +) + +# Custom model configuration with specific parameters +result = agent.tool.use_agent( + prompt="Write a creative story", + system_prompt="You are a creative writing assistant.", + model_provider="github", + model_settings={ + "model_id": "openai/o4-mini", + "params": {"temperature": 1, "max_tokens": 4000} + } +) +``` + ### A2A Client ```python @@ -814,6 +866,68 @@ result = agent.tool.use_computer( ) ``` +### Graph (Multi-Agent DAG) + +Create deterministic DAG-based multi-agent pipelines where agents are nodes with dependency relationships. Unlike `agent_graph` (which uses persistent message-passing), `graph` uses task-based execution with output propagation. + +```python +from strands import Agent +from strands_tools.graph import graph + +agent = Agent(tools=[graph]) + +# Create a multi-agent research pipeline +result = agent.tool.graph( + action="create", + graph_id="research_pipeline", + topology={ + "nodes": [ + { + "id": "researcher", + "role": "researcher", + "system_prompt": "You research topics thoroughly.", + "model_provider": "bedrock", + "model_settings": {"model_id": "us.anthropic.claude-sonnet-4-20250514-v1:0"} + }, + { + "id": "analyst", + "role": "analyst", + "system_prompt": "You analyze research data.", + "model_provider": "bedrock", + "model_settings": {"model_id": "us.anthropic.claude-3-5-haiku-20241022-v1:0"} + }, + { + "id": "reporter", + "role": "reporter", + "system_prompt": "You create comprehensive reports.", + "tools": ["file_write", "editor"] + } + ], + "edges": [ + {"from": "researcher", "to": "analyst"}, + {"from": "analyst", "to": "reporter"} + ], + "entry_points": ["researcher"] + } +) + +# Execute a task through the graph +result = agent.tool.graph( + action="execute", + graph_id="research_pipeline", + task="Research and analyze the impact of AI on healthcare" +) + +# Get graph status +result = agent.tool.graph(action="status", graph_id="research_pipeline") + +# List all graphs +result = agent.tool.graph(action="list") + +# Delete a graph +result = agent.tool.graph(action="delete", graph_id="research_pipeline") +``` + ### Elasticsearch Memory **Note**: This tool requires AWS account credentials to generate embeddings using Amazon Bedrock Titan models. 
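
A hedged usage sketch for this memory tool, based only on the `elasticsearch_memory` row added to the tools table above and the action names that appear in the elasticsearch_memory changes later in this series; the connection values and stored content are placeholders, and the import/registration shown mirrors other tools in this package rather than a confirmed API:

```python
from strands import Agent
from strands_tools import elasticsearch_memory

# Assumes the elasticsearch_memory tool is registered on the agent.
agent = Agent(tools=[elasticsearch_memory])

# Record a memory (cloud_id / api_key values are placeholders; environment
# variables can supply the connection details instead).
agent.tool.elasticsearch_memory(
    action="record",
    content="User prefers dark mode",
    cloud_id="...",
    api_key="...",
)

# Retrieve semantically similar memories from the same namespace.
agent.tool.elasticsearch_memory(
    action="retrieve",
    query="What UI theme does the user prefer?",
    max_results=5,
)
```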
@@ -1190,6 +1304,34 @@ The Mem0 Memory Tool supports three different backend configurations: |----------------------|-------------|---------| | RETRIEVE_ENABLE_METADATA_DEFAULT | Default setting for enabling metadata in retrieve tool responses | false | +#### Use Agent Tool + +| Environment Variable | Description | Default | +|----------------------|-------------|---------| +| STRANDS_PROVIDER | Default model provider when using model_provider="env" | ollama | +| STRANDS_MODEL_ID | Default model identifier for environment-based model selection | None | +| STRANDS_MAX_TOKENS | Maximum tokens for the nested agent model | None | +| STRANDS_TEMPERATURE | Sampling temperature for the nested agent model | None | + + +#### Elasticsearch Memory Tool + +| Environment Variable | Description | Default | +|----------------------|-------------|---------| +| ELASTICSEARCH_CLOUD_ID | Elasticsearch Cloud ID for connection | None | +| ELASTICSEARCH_URL | Elasticsearch URL for serverless connection | None | +| ELASTICSEARCH_API_KEY | Elasticsearch API key for authentication | None | +| ELASTICSEARCH_INDEX_NAME | Elasticsearch index name for memory storage | strands_memory | +| ELASTICSEARCH_NAMESPACE | Namespace for memory isolation | default | +| ELASTICSEARCH_EMBEDDING_MODEL | Amazon Bedrock model for embeddings | amazon.titan-embed-text-v2:0 | +| AWS_REGION | AWS region for Bedrock embedding service | us-west-2 | + +**Note**: This tool requires AWS account credentials to generate embeddings using Amazon Bedrock Titan models. + +#### Graph Tool + +The `graph` tool uses the same model provider environment variables as `use_agent` for per-node model configuration. No additional environment variables are required. + #### Video Tools | Environment Variable | Description | Default | From a2b955365e4c8801744b892bb04213d57fb773df Mon Sep 17 00:00:00 2001 From: javierlarota Date: Tue, 31 Mar 2026 14:15:53 -0400 Subject: [PATCH 21/39] fix: mem0_memory - Replace direct Console initialization with console_util (#378) --- src/strands_tools/mem0_memory.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/strands_tools/mem0_memory.py b/src/strands_tools/mem0_memory.py index e815e9ef..3b55f79d 100644 --- a/src/strands_tools/mem0_memory.py +++ b/src/strands_tools/mem0_memory.py @@ -74,17 +74,18 @@ from mem0 import Memory as Mem0Memory from mem0 import MemoryClient from opensearchpy import AWSV4SignerAuth, RequestsHttpConnection -from rich.console import Console from rich.panel import Panel from rich.table import Table from rich.text import Text from strands.types.tools import ToolResult, ToolResultContent, ToolUse +from strands_tools.utils import console_util + # Set up logging logger = logging.getLogger(__name__) # Initialize Rich console -console = Console() +console = console_util.create() TOOL_SPEC = { "name": "mem0_memory", From 1b9675f339003ae2bc97f1c5fbc8fe6b9c43894d Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Thu, 2 Apr 2026 11:46:55 +0200 Subject: [PATCH 22/39] fix: Update Apify tools documentation for improved clarity and expanded details on input, usage, and examples. --- src/strands_tools/apify.py | 82 ++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 5855ab83..1505cb60 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,8 +1,10 @@ """Apify platform tools for Strands Agents. 
-This module provides web scraping, data extraction, and automation capabilities -using the Apify platform. It lets you run any Actor, task, fetch dataset -results, and scrape individual URLs. +Apify is a large marketplace of tools for web scraping, data extraction, +and web automation. These tools are called Actors β€” serverless cloud applications that +take JSON input and store results in a dataset (structured, tabular output) or key-value +store (files and unstructured data). Actors exist for social media, e-commerce, search +engines, maps, travel sites, and many other sources. Available Tools: --------------- @@ -16,7 +18,7 @@ Setup Requirements: ------------------ 1. Create an Apify account at https://apify.com -2. Obtain your API token: Apify Console > Settings > API & Integrations > Personal API tokens +2. Get your API token: Apify Console > Settings > API & Integrations > Personal API tokens 3. Install the optional dependency: pip install strands-agents-tools[apify] 4. Set the environment variable: APIFY_API_TOKEN=your_api_token_here @@ -366,18 +368,22 @@ def apify_run_actor( ) -> Dict[str, Any]: """Run any Apify Actor and return the run metadata as JSON. - Executes the Actor synchronously - blocks until the Actor run finishes or the timeout - is reached. Use this when you need to run a specific Actor and then inspect or process - the results separately. + An Actor is a serverless cloud app on the Apify platform β€” it takes JSON input, + runs the scraping or automation job, and writes results to a dataset. This tool + executes the Actor synchronously and returns run metadata only (run_id, status, + dataset_id, timestamps). Use apify_run_actor_and_get_dataset to also fetch the + output data in one call, or apify_scrape_url for quick single-URL extraction. Common Actors: - - "apify/website-content-crawler" - scrape websites and extract content - - "apify/web-scraper" - general-purpose web scraper - - "apify/google-search-scraper" - scrape Google search results + - "apify/website-content-crawler" β€” scrape websites and extract content as markdown + - "apify/web-scraper" β€” general-purpose web scraper with JS rendering + - "apify/google-search-scraper" β€” scrape Google search results Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema. + actor_id: Actor identifier in "username/actor-name" format, + e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. + run_input: JSON-serializable input for the Actor. Each Actor defines its own + input schema β€” check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. @@ -419,8 +425,9 @@ def apify_get_dataset_items( ) -> Dict[str, Any]: """Fetch items from an existing Apify dataset and return them as JSON. - Use this after running an Actor to retrieve the structured results from its - default dataset, or to access any dataset by ID. + Every Actor run writes its output to a dataset β€” a structured, append-only store + for tabular data. Use the dataset_id from the run metadata returned by apify_run_actor + or apify_run_task. Use offset for pagination through large datasets. 
Args: dataset_id: The Apify dataset ID to fetch items from. @@ -457,15 +464,17 @@ def apify_run_actor_and_get_dataset( ) -> Dict[str, Any]: """Run an Apify Actor and fetch its dataset results in one step. - Convenience tool that combines running an Actor and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the + Convenience tool that combines running an Actor and fetching its default dataset + items into a single call. Use this when you want both the run metadata and the result data without making two separate tool calls. Args: - actor_id: Actor identifier, e.g. "apify/website-content-crawler" or "username/actor-name". - run_input: JSON-serializable input for the Actor. + actor_id: Actor identifier in "username/actor-name" format, + e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. + run_input: JSON-serializable input for the Actor. Each Actor defines its own + input schema β€” check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. @@ -509,15 +518,16 @@ def apify_run_task( timeout_secs: int = DEFAULT_TIMEOUT_SECS, memory_mbytes: Optional[int] = None, ) -> Dict[str, Any]: - """Run an Apify task and return the run metadata as JSON. + """Run a saved Apify task and return the run metadata as JSON. - Tasks are saved Actor configurations with preset inputs. Use this when a task - has already been configured in Apify Console, so you don't need to specify - the full Actor input every time. + Tasks are saved Actor configurations with preset inputs, managed in Apify Console. + Use this when a task has already been configured, so you don't need to specify + the full Actor input every time. Use apify_run_task_and_get_dataset to also fetch + the output data in one call. Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. - task_input: Optional JSON-serializable input to override the task's default input. + task_id: Task identifier in "username~task-name" format or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. @@ -558,17 +568,17 @@ def apify_run_task_and_get_dataset( dataset_items_limit: int = DEFAULT_DATASET_ITEMS_LIMIT, dataset_items_offset: int = 0, ) -> Dict[str, Any]: - """Run an Apify task and fetch its dataset results in one step. + """Run a saved Apify task and fetch its dataset results in one step. - Convenience tool that combines running a task and fetching its default - dataset items into a single call. Use this when you want both the run metadata and the + Convenience tool that combines running a task and fetching its default dataset + items into a single call. Use this when you want both the run metadata and the result data without making two separate tool calls. Args: - task_id: Task identifier, e.g. "user/my-task" or a task ID string. 
- task_input: Optional JSON-serializable input to override the task's default input. + task_id: Task identifier in "username~task-name" format or a task ID string. + task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. + memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. @@ -613,14 +623,16 @@ def apify_scrape_url( Uses the Website Content Crawler Actor under the hood, pre-configured for fast single-page scraping. This is the simplest way to extract readable content - from any web page. + from any web page β€” no Actor input schema needed. For multi-page crawls, use + apify_run_actor_and_get_dataset with "apify/website-content-crawler" directly. Args: url: The URL to scrape, e.g. "https://example.com". timeout_secs: Maximum time in seconds to wait for scraping to finish. Defaults to 120. - crawler_type: Crawler engine to use. One of "cheerio" (fastest, no JS rendering, - default), "playwright:adaptive" (fast, renders JS if present), or - "playwright:firefox" (reliable, renders JS, best at avoiding blocking but slower). + crawler_type: Crawler engine to use. One of: + - "cheerio" (default): Fastest, no JavaScript rendering. Best for static HTML. + - "playwright:adaptive": Renders JS only when needed. Good general-purpose choice. + - "playwright:firefox": Full JS rendering, best at bypassing anti-bot protection but slowest. Returns: Dict with status and content containing the markdown content of the scraped page. 
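
The docstrings revised in the patch above describe both a two-step flow (run an Actor, then fetch its dataset by ID) and single-call helpers. A minimal sketch of both flows, assuming the tool names and parameters shown in those docstrings; the actor input, dataset ID, and limits are illustrative placeholders:

```python
from strands import Agent
from strands_tools import apify

agent = Agent(tools=[
    apify.apify_run_actor,
    apify.apify_get_dataset_items,
    apify.apify_run_actor_and_get_dataset,
])

# Two-step flow: run the Actor, then page through its default dataset.
run = agent.tool.apify_run_actor(
    actor_id="apify/website-content-crawler",
    run_input={"startUrls": [{"url": "https://example.com"}]},
)
# The run metadata includes the default dataset ID; "abc123" is a placeholder.
items = agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)

# Single-call flow: run the Actor and fetch its dataset items together.
combined = agent.tool.apify_run_actor_and_get_dataset(
    actor_id="apify/website-content-crawler",
    run_input={"startUrls": [{"url": "https://example.com"}]},
    dataset_items_limit=50,
    dataset_items_offset=0,
)
```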
From 46daa97100e2e2a67d1f371eec0a6155d4053118 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 14:57:12 +0200 Subject: [PATCH 23/39] docs: keep most important tools in readme --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 67d9833a..3838222b 100644 --- a/README.md +++ b/README.md @@ -99,11 +99,7 @@ Below is a comprehensive table of all available tools, how to use them with an a | Tool | Agent Usage | Use Case | |------|-------------|----------| | a2a_client | `provider = A2AClientToolProvider(known_agent_urls=["http://localhost:9000"]); agent = Agent(tools=provider.tools)` | Discover and communicate with A2A-compliant agents, send messages between agents | -| apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor by ID with arbitrary input | -| apify_get_dataset_items | `agent.tool.apify_get_dataset_items(dataset_id="abc123", limit=50)` | Fetch items from an Apify dataset | -| apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its dataset results in one step | -| apify_run_task | `agent.tool.apify_run_task(task_id="user/my-task")` | Run a saved Apify task by ID with optional input overrides | -| apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user/my-task", dataset_items_limit=50)` | Run a task and fetch its dataset results in one step | +| apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor with arbitrary input | | apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | From 19500c7dec491448eb4fcf320084a874a7738677 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:04:08 +0200 Subject: [PATCH 24/39] feat: update crawler type constants in Apify tool --- src/strands_tools/apify.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 5855ab83..adbe38b1 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -77,13 +77,14 @@ except ImportError: HAS_APIFY_CLIENT = False -WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" DEFAULT_TIMEOUT_SECS = 300 DEFAULT_SCRAPE_TIMEOUT_SECS = 120 DEFAULT_DATASET_ITEMS_LIMIT = 100 -VALID_CRAWLER_TYPES = ("playwright:adaptive", "playwright:firefox", "cheerio") + +WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" +WEBSITE_CONTENT_CRAWLER_TYPES = ("playwright:adaptive", "playwright:firefox", "cheerio") # --- Helper functions --- @@ -326,9 +327,9 @@ def scrape_url( """Scrape a single URL using Website Content Crawler and return markdown.""" self._validate_url(url) self._validate_positive(timeout_secs, "timeout_secs") - if crawler_type not in 
VALID_CRAWLER_TYPES: + if crawler_type not in WEBSITE_CONTENT_CRAWLER_TYPES: raise ValueError( - f"Invalid crawler_type '{crawler_type}'. Must be one of: {', '.join(VALID_CRAWLER_TYPES)}." + f"Invalid crawler_type '{crawler_type}'. Must be one of: {', '.join(WEBSITE_CONTENT_CRAWLER_TYPES)}." ) run_input: Dict[str, Any] = { From 4405ebe8554eece9257ec77050c0dddb7969a994 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:29:52 +0200 Subject: [PATCH 25/39] feat: use Literal for crawler types in Apify tool --- src/strands_tools/apify.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index adbe38b1..9f1eb080 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -57,7 +57,7 @@ import json import logging import os -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Literal, Optional, get_args from urllib.parse import urlparse from rich.panel import Panel @@ -84,7 +84,8 @@ DEFAULT_DATASET_ITEMS_LIMIT = 100 WEBSITE_CONTENT_CRAWLER = "apify/website-content-crawler" -WEBSITE_CONTENT_CRAWLER_TYPES = ("playwright:adaptive", "playwright:firefox", "cheerio") +CrawlerType = Literal["playwright:adaptive", "playwright:firefox", "cheerio"] +WEBSITE_CONTENT_CRAWLER_TYPES = get_args(CrawlerType) # --- Helper functions --- @@ -322,7 +323,7 @@ def scrape_url( self, url: str, timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, - crawler_type: str = "cheerio", + crawler_type: CrawlerType = "cheerio", ) -> str: """Scrape a single URL using Website Content Crawler and return markdown.""" self._validate_url(url) @@ -608,7 +609,7 @@ def apify_run_task_and_get_dataset( def apify_scrape_url( url: str, timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, - crawler_type: str = "cheerio", + crawler_type: CrawlerType = "cheerio", ) -> Dict[str, Any]: """Scrape a single URL and return its content as markdown. 
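
The change above derives the runtime validation tuple from a `Literal` type instead of maintaining it by hand. A small standalone sketch of that pattern, independent of the Apify client, showing why the runtime check and the type annotation can no longer drift apart (the validation helper here is illustrative, not the tool's actual code):

```python
from typing import Literal, get_args

CrawlerType = Literal["playwright:adaptive", "playwright:firefox", "cheerio"]
# The runtime tuple is derived from the annotation, so updating the Literal
# automatically updates the validation set as well.
WEBSITE_CONTENT_CRAWLER_TYPES = get_args(CrawlerType)


def check_crawler_type(crawler_type: str) -> None:
    """Raise ValueError for values outside the Literal's allowed set."""
    if crawler_type not in WEBSITE_CONTENT_CRAWLER_TYPES:
        raise ValueError(
            f"Invalid crawler_type '{crawler_type}'. "
            f"Must be one of: {', '.join(WEBSITE_CONTENT_CRAWLER_TYPES)}."
        )


check_crawler_type("cheerio")      # passes
# check_crawler_type("selenium")   # would raise ValueError
```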
From ab930ad36dcb4cbe94e5615f81d1748130862747 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:39:26 +0200 Subject: [PATCH 26/39] feat: add comment for tracking header --- src/strands_tools/apify.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 9f1eb080..17efac93 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -77,6 +77,7 @@ except ImportError: HAS_APIFY_CLIENT = False +# Attribution header - lets Apify track usage originating from strands-agents (analytics only) TRACKING_HEADER = {"x-apify-integration-platform": "strands-agents"} ERROR_PANEL_TITLE = "[bold red]Apify Error[/bold red]" DEFAULT_TIMEOUT_SECS = 300 From b07d7c195e997d401df58b6f2ff12425759a1931 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:55:09 +0200 Subject: [PATCH 27/39] feat: add error handling for missing actor run data and dataset in Apify tool --- src/strands_tools/apify.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 17efac93..06a03811 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -344,9 +344,13 @@ def scrape_url( timeout_secs=timeout_secs, logger=None, # Suppress verbose apify-client logging not useful to end users ) + if actor_run is None: + raise RuntimeError("Website Content Crawler returned no run data (possible wait timeout).") self._check_run_status(actor_run, "Website Content Crawler") dataset_id = actor_run.get("defaultDatasetId") + if not dataset_id: + raise RuntimeError("Website Content Crawler run has no default dataset.") result = self.client.dataset(dataset_id).list_items(limit=1) items = list(result.items) From 30412f7135777658d0950f7172eda6f7727758d7 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 15:57:59 +0200 Subject: [PATCH 28/39] feat: add unit tests for new tools guarding --- tests/test_apify.py | 58 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/test_apify.py b/tests/test_apify.py index 038f8211..a34ae6b7 100644 --- a/tests/test_apify.py +++ b/tests/test_apify.py @@ -192,6 +192,17 @@ def test_run_actor_api_exception(mock_apify_env, mock_apify_client): assert "Connection failed" in result["content"][0]["text"] +def test_run_actor_none_response(mock_apify_env, mock_apify_client): + """Actor run returns error dict when ActorClient.call() returns None.""" + mock_apify_client.actor.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + def test_run_actor_apify_api_error_401(mock_apify_env, mock_apify_client): """Actor run returns friendly message for 401 authentication errors.""" error = _make_apify_api_error(401, "Unauthorized") @@ -275,6 +286,18 @@ def test_run_actor_and_get_dataset_success(mock_apify_env, mock_apify_client): assert data["items"][0]["title"] == "Widget A" +def test_run_actor_and_get_dataset_no_dataset_id(mock_apify_env, mock_apify_client): + """Combined tool returns error when the Actor run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = 
apify_run_actor_and_get_dataset(actor_id="actor/my-scraper") + + assert result["status"] == "error" + assert "no default dataset" in result["content"][0]["text"] + + def test_run_actor_and_get_dataset_actor_failure(mock_apify_env, mock_apify_client): """Combined tool returns error dict when the Actor fails.""" mock_apify_client.actor.return_value.call.return_value = MOCK_FAILED_RUN @@ -374,6 +397,18 @@ def test_run_task_and_get_dataset_success(mock_apify_env, mock_apify_client): assert data["items"][0]["title"] == "Widget A" +def test_run_task_and_get_dataset_no_dataset_id(mock_apify_env, mock_apify_client): + """Combined task tool returns error when the task run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.task.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_task_and_get_dataset(task_id="user~my-task") + + assert result["status"] == "error" + assert "no default dataset" in result["content"][0]["text"] + + def test_run_task_and_get_dataset_task_failure(mock_apify_env, mock_apify_client): """Combined task tool returns error dict when the task fails.""" mock_apify_client.task.return_value.call.return_value = MOCK_FAILED_RUN @@ -402,6 +437,29 @@ def test_scrape_url_success(mock_apify_env, mock_apify_client): mock_apify_client.actor.assert_called_once_with("apify/website-content-crawler") +def test_scrape_url_none_response(mock_apify_env, mock_apify_client): + """Scrape URL returns error dict when ActorClient.call() returns None.""" + mock_apify_client.actor.return_value.call.return_value = None + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "no run data" in result["content"][0]["text"] + + +def test_scrape_url_no_dataset_id(mock_apify_env, mock_apify_client): + """Scrape URL returns error when the crawler run has no default dataset.""" + run_no_dataset = {**MOCK_ACTOR_RUN, "defaultDatasetId": None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_scrape_url(url="https://example.com") + + assert result["status"] == "error" + assert "no default dataset" in result["content"][0]["text"] + + def test_scrape_url_no_content(mock_apify_env, mock_apify_client): """Scrape URL returns error dict when no content is returned.""" mock_list_result = MagicMock() From 4732185ec6169edca0b398f2b9d69957ef037a01 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 16:03:24 +0200 Subject: [PATCH 29/39] fix: ensure explicit empty input is correctly passed to Apify actor --- src/strands_tools/apify.py | 2 +- tests/test_apify.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 06a03811..509a2052 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -200,7 +200,7 @@ def run_actor( self._validate_positive(memory_mbytes, "memory_mbytes") call_kwargs: Dict[str, Any] = { - "run_input": run_input or {}, + "run_input": run_input if run_input is not None else {}, "timeout_secs": timeout_secs, "logger": None, # Suppress verbose apify-client logging not useful to end users } diff --git a/tests/test_apify.py b/tests/test_apify.py index a34ae6b7..78f15694 100644 --- 
a/tests/test_apify.py +++ b/tests/test_apify.py @@ -150,6 +150,17 @@ def test_run_actor_default_input(mock_apify_env, mock_apify_client): assert call_kwargs["run_input"] == {} +def test_run_actor_explicit_empty_input(mock_apify_env, mock_apify_client): + """Actor run passes through an explicitly empty dict instead of treating it as falsy.""" + empty_input: dict = {} + with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): + result = apify_run_actor(actor_id="actor/my-scraper", run_input=empty_input) + + assert result["status"] == "success" + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs["run_input"] is empty_input + + def test_run_actor_with_memory(mock_apify_env, mock_apify_client): """Actor run passes memory_mbytes when provided.""" with patch("strands_tools.apify.ApifyClient", return_value=mock_apify_client): From b1a792cc556130d4053ac6f6bdfe14649bd5bc9e Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 16:10:11 +0200 Subject: [PATCH 30/39] fix: add error status message for None --- src/strands_tools/apify.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 509a2052..168613a6 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -119,6 +119,8 @@ def _format_error(e: Exception) -> str: "Rate limit exceeded. The Apify client retries automatically; " "if this persists, reduce request frequency." ) + case None: + return f"Apify API error: {msg}" case _: return f"Apify API error ({status_code}): {msg}" return str(e) From ab0d67567b17b67bfc41cc0f81032d4b2139be09 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 2 Apr 2026 16:20:44 +0200 Subject: [PATCH 31/39] docs: removed redundant tools mentioned from readme --- README.md | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/README.md b/README.md index 1a487d29..5cb37ce3 100644 --- a/README.md +++ b/README.md @@ -102,10 +102,6 @@ Below is a comprehensive table of all available tools, how to use them with an a | apify_run_actor | `agent.tool.apify_run_actor(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run any Apify Actor with arbitrary input | | apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | | apify_google_search_scraper | `agent.tool.apify_google_search_scraper(search_query="best AI frameworks")` | Search Google and return structured results | -| apify_google_places_scraper | `agent.tool.apify_google_places_scraper(search_query="restaurants in Prague")` | Search Google Maps for businesses and places | -| apify_youtube_scraper | `agent.tool.apify_youtube_scraper(search_query="python tutorial")` | Scrape YouTube videos, channels, or search results | -| apify_website_content_crawler | `agent.tool.apify_website_content_crawler(start_url="https://docs.example.com")` | Crawl a website and extract content from multiple pages | -| apify_ecommerce_scraper | `agent.tool.apify_ecommerce_scraper(url="https://www.amazon.com/dp/B0TEST")` | Scrape product data from e-commerce websites | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | | editor | `agent.tool.editor(command="view", 
path="path/to/file.py")` | Advanced file operations like syntax highlighting, pattern replacement, and multi-file edits | @@ -1013,25 +1009,6 @@ results = agent.tool.apify_google_search_scraper( results_limit=10, ) -# Search Google Maps for places -places = agent.tool.apify_google_places_scraper( - search_query="restaurants in Prague", - include_reviews=True, -) - -# Scrape YouTube -videos = agent.tool.apify_youtube_scraper(search_query="python tutorial") - -# Crawl a website (multi-page) -pages = agent.tool.apify_website_content_crawler( - start_url="https://docs.example.com", - max_pages=20, -) - -# Scrape e-commerce product data -products = agent.tool.apify_ecommerce_scraper( - url="https://www.amazon.com/dp/B0TEST", -) ``` ## 🌍 Environment Variables Configuration From cbb9010fd9df9836a847b185809ddcd8f508f04c Mon Sep 17 00:00:00 2001 From: Liz <91279165+lizradway@users.noreply.github.com> Date: Thu, 2 Apr 2026 15:17:04 -0400 Subject: [PATCH 32/39] fix: use console util to allow output suppression (#436) --- src/strands_tools/exa.py | 5 +++-- src/strands_tools/tavily.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/strands_tools/exa.py b/src/strands_tools/exa.py index db0da875..091699f0 100644 --- a/src/strands_tools/exa.py +++ b/src/strands_tools/exa.py @@ -47,10 +47,11 @@ from typing import Any, Dict, List, Literal, Optional, Union import aiohttp -from rich.console import Console from rich.panel import Panel from strands import tool +from strands_tools.utils import console_util + logger = logging.getLogger(__name__) # Exa API configuration @@ -59,7 +60,7 @@ EXA_CONTENTS_ENDPOINT = "/contents" # Initialize Rich console -console = Console() +console = console_util.create() def _get_api_key() -> str: diff --git a/src/strands_tools/tavily.py b/src/strands_tools/tavily.py index 437c3506..3915c800 100644 --- a/src/strands_tools/tavily.py +++ b/src/strands_tools/tavily.py @@ -52,10 +52,11 @@ from typing import Any, Dict, List, Literal, Optional, Union import aiohttp -from rich.console import Console from rich.panel import Panel from strands import tool +from strands_tools.utils import console_util + logger = logging.getLogger(__name__) # Tavily API configuration @@ -66,7 +67,7 @@ TAVILY_MAP_ENDPOINT = "/map" # Initialize Rich console -console = Console() +console = console_util.create() def _get_api_key() -> str: From 53851d882252fb44029ad6f067712476ec9d63a0 Mon Sep 17 00:00:00 2001 From: Tanishq <30299564+10ishq@users.noreply.github.com> Date: Wed, 8 Apr 2026 00:41:12 +0530 Subject: [PATCH 33/39] feat(exa): remove deprecated neural/keyword search types, add deep (#411) Co-authored-by: Tanishq Jaiswal <10ishq@users.noreply.github.com> --- README.md | 2 +- src/strands_tools/exa.py | 25 +++++++++++-------------- tests/test_exa.py | 7 +++---- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index cf111a21..2a188f3e 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ Below is a comprehensive table of all available tools, how to use them with an a | tavily_extract | `agent.tool.tavily_extract(urls=["www.tavily.com"], extract_depth="advanced")` | Extract clean, structured content from web pages with advanced processing and noise removal | | tavily_crawl | `agent.tool.tavily_crawl(url="www.tavily.com", max_depth=2, instructions="Find API docs")` | Crawl websites intelligently starting from a base URL with filtering and extraction | | tavily_map | `agent.tool.tavily_map(url="www.tavily.com", max_depth=2, instructions="Find all 
pages")` | Map website structure and discover URLs starting from a base URL without content extraction | -| exa_search | `agent.tool.exa_search(query="Best project management tools", text=True)` | Intelligent web search with auto mode (default) that combines neural and keyword search for optimal results | +| exa_search | `agent.tool.exa_search(query="Best project management tools", text=True)` | Intelligent web search with auto mode (default) for optimal results, plus fast and deep search modes | | exa_get_contents | `agent.tool.exa_get_contents(urls=["https://example.com/article"], text=True, summary={"query": "key points"})` | Extract full content and summaries from specific URLs with live crawling fallback | | python_repl* | `agent.tool.python_repl(code="import pandas as pd\ndf = pd.read_csv('data.csv')\nprint(df.head())")` | Running Python code snippets, data analysis, executing complex logic with user confirmation for security | | calculator | `agent.tool.calculator(expression="2 * sin(pi/4) + log(e**2)")` | Performing mathematical operations, symbolic math, equation solving | diff --git a/src/strands_tools/exa.py b/src/strands_tools/exa.py index 091699f0..39a0b81f 100644 --- a/src/strands_tools/exa.py +++ b/src/strands_tools/exa.py @@ -1,12 +1,12 @@ """ Exa Search and Contents tools for intelligent web search and content processing. -This module provides access to Exa's API, which offers neural search capabilities optimized for LLMs and AI agents. -The "auto" mode intelligently combines neural embeddings-based search with traditional keyword search for best results. +This module provides access to Exa's API, which offers advanced search capabilities optimized for LLMs and AI agents. +The "auto" mode intelligently selects the best search approach for optimal results. Key Features: - Auto mode that intelligently selects the best search approach (default) -- Neural and keyword search capabilities +- Deep search for thorough, comprehensive results - Advanced content filtering and domain management - Full page content extraction with summaries - Support for general web search, company info, news, PDFs, GitHub repos, and more @@ -192,7 +192,7 @@ def format_contents_response(data: Dict[str, Any]) -> Panel: @tool async def exa_search( query: str, - type: Optional[Literal["keyword", "neural", "fast", "auto"]] = "auto", + type: Optional[Literal["auto", "fast", "deep"]] = "auto", category: Optional[ Literal["company", "news", "pdf", "github", "personal site", "linkedin profile", "financial report"] ] = None, @@ -218,25 +218,22 @@ async def exa_search( extras: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """ - Search the web intelligently using Exa's neural and keyword search capabilities. + Search the web intelligently using Exa's advanced search capabilities. Exa provides advanced web search optimized for LLMs and AI agents. The "auto" mode (default) - intelligently combines neural embeddings-based search with traditional keyword search to find - the most relevant results for your query. + intelligently selects the best search approach to find the most relevant results for your query. 
Key Features: - Auto mode that intelligently selects the best search approach (default) - - Neural search using embeddings for semantic understanding - - Traditional keyword search for exact matches + - Deep search for thorough, comprehensive results - Advanced filtering by domain, date, and content - Live crawling with fallback options - Rich content extraction with summaries Search Types: - - auto: Intelligently combines neural and keyword approaches (recommended default) - - neural: Uses embeddings-based model for semantic search - - keyword: Google-like SERP search for exact matches - - fast: Streamlined versions of neural and keyword models + - auto: Intelligently selects the best search approach (recommended default) + - fast: Optimized for speed + - deep: Thorough search for comprehensive results Categories (optional - general web search works best): - company: Focus on company websites and information when specifically needed @@ -250,7 +247,7 @@ async def exa_search( Args: query: The search query string. Examples: "Latest developments in artificial intelligence", "Best project management tools" - type: Search type - "auto" (default, recommended), "neural", "keyword", or "fast" + type: Search type - "auto" (default, recommended), "fast", or "deep" category: Optional data category - use sparingly as general search works best. Use "company" when specifically looking for company information user_location: Two-letter ISO country code (e.g., "US", "UK") for geo-localized results diff --git a/tests/test_exa.py b/tests/test_exa.py index fdbb3e68..f555b84d 100644 --- a/tests/test_exa.py +++ b/tests/test_exa.py @@ -17,7 +17,7 @@ def mock_aiohttp_response(): mock_response = AsyncMock() mock_response.json.return_value = { "requestId": "b5947044c4b78efa9552a7c89b306d95", - "resolvedSearchType": "neural", + "resolvedSearchType": "auto", "searchType": "auto", "results": [ { @@ -40,8 +40,7 @@ def mock_aiohttp_response(): "search": 0.005, "contents": 0, "breakdown": { - "keywordSearch": 0, - "neuralSearch": 0.005, + "search": 0.005, "contentText": 0, "contentHighlight": 0, "contentSummary": 0, @@ -249,7 +248,7 @@ def test_format_search_response(): data = { "requestId": "test-request-123", "searchType": "auto", - "resolvedSearchType": "neural", + "resolvedSearchType": "auto", "results": [ { "title": "Test Result", From 07799a14e313eef296b3c1e610709c6c31dd4f91 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Wed, 8 Apr 2026 08:59:43 +0200 Subject: [PATCH 34/39] fix: Improve docs using apify-writing-style --- README.md | 10 +++++----- src/strands_tools/apify.py | 28 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 67d9833a..60062155 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Strands Agents Tools is a community-driven project that provides a powerful set - πŸ“ **File Operations** - Read, write, and edit files with syntax highlighting and intelligent modifications - πŸ–₯️ **Shell Integration** - Execute and interact with shell commands securely -- 🧠 **Memory** - Store user and agent memories across agent runs to provide personalized experiences with both Mem0, Amazon Bedrock Knowledge Bases, Elasticsearch, and MongoDB Atlas +- 🧠 **Memory** - Store user and agent memories across agent runs to provide personalized experiences with Mem0, Amazon Bedrock Knowledge Bases, Elasticsearch, and MongoDB Atlas - πŸ•ΈοΈ **Web Infrastructure** - Perform web searches, extract page content, and crawl websites with Tavily and 
Exa-powered tools - 🌐 **HTTP Client** - Make API requests with comprehensive authentication support - πŸ’¬ **Slack Client** - Real-time Slack events, message processing, and Slack API access @@ -104,7 +104,7 @@ Below is a comprehensive table of all available tools, how to use them with an a | apify_run_actor_and_get_dataset | `agent.tool.apify_run_actor_and_get_dataset(actor_id="apify/website-content-crawler", run_input={"startUrls": [{"url": "https://example.com"}]})` | Run an Actor and fetch its dataset results in one step | | apify_run_task | `agent.tool.apify_run_task(task_id="user/my-task")` | Run a saved Apify task by ID with optional input overrides | | apify_run_task_and_get_dataset | `agent.tool.apify_run_task_and_get_dataset(task_id="user/my-task", dataset_items_limit=50)` | Run a task and fetch its dataset results in one step | -| apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as markdown | +| apify_scrape_url | `agent.tool.apify_scrape_url(url="https://example.com")` | Scrape a URL and return its content as Markdown | | file_read | `agent.tool.file_read(path="path/to/file.txt")` | Reading configuration files, parsing code files, loading datasets | | file_write | `agent.tool.file_write(path="path/to/file.txt", content="file content")` | Writing results to files, creating new files, saving output data | | editor | `agent.tool.editor(command="view", path="path/to/file.py")` | Advanced file operations like syntax highlighting, pattern replacement, and multi-file edits | @@ -206,7 +206,7 @@ result = agent.tool.mcp_client( tool_args={"x": 10, "y": 20} ) -# Connect to a SSE-based server +# Connect to an SSE-based server agent.tool.mcp_client( action="connect", connection_id="web_server", @@ -277,7 +277,7 @@ response = agent.tool.http_request( auth_token="your_token_here" ) -# Convert HTML webpages to markdown for better readability +# Convert HTML webpages to Markdown for better readability response = agent.tool.http_request( method="GET", url="https://example.com/article", @@ -974,7 +974,7 @@ from strands_tools.apify import APIFY_CORE_TOOLS agent = Agent(tools=APIFY_CORE_TOOLS) -# Scrape a single URL and get markdown content +# Scrape a single URL and get Markdown content content = agent.tool.apify_scrape_url(url="https://example.com") # Run an Actor and get results in one step diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 1505cb60..ae4f6dee 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -1,10 +1,10 @@ """Apify platform tools for Strands Agents. -Apify is a large marketplace of tools for web scraping, data extraction, -and web automation. These tools are called Actors β€” serverless cloud applications that -take JSON input and store results in a dataset (structured, tabular output) or key-value -store (files and unstructured data). Actors exist for social media, e-commerce, search -engines, maps, travel sites, and many other sources. + +Apify is the world's largest marketplace of tools for web scraping, crawling, data extraction, and web automation. +These tools are called Actors, serverless cloud programs that take JSON input and store results +in a dataset (structured, tabular output) or key-value store (files and unstructured data). +Get structured data from social media, e-commerce, search engines, maps, travel sites, or any other website. 
Available Tools: --------------- @@ -325,7 +325,7 @@ def scrape_url( timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, crawler_type: str = "cheerio", ) -> str: - """Scrape a single URL using Website Content Crawler and return markdown.""" + """Scrape a single URL using Website Content Crawler and return Markdown.""" self._validate_url(url) self._validate_positive(timeout_secs, "timeout_secs") if crawler_type not in VALID_CRAWLER_TYPES: @@ -375,15 +375,15 @@ def apify_run_actor( output data in one call, or apify_scrape_url for quick single-URL extraction. Common Actors: - - "apify/website-content-crawler" β€” scrape websites and extract content as markdown - - "apify/web-scraper" β€” general-purpose web scraper with JS rendering + - "apify/website-content-crawler" - scrape websites and extract content as Markdown + - "apify/web-scraper" - general-purpose web scraper with JS rendering - "apify/google-search-scraper" β€” scrape Google search results Args: actor_id: Actor identifier in "username/actor-name" format, e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. run_input: JSON-serializable input for the Actor. Each Actor defines its own - input schema β€” check the Actor README on Apify Store for required fields. + input schema - check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. @@ -472,7 +472,7 @@ def apify_run_actor_and_get_dataset( actor_id: Actor identifier in "username/actor-name" format, e.g. "apify/website-content-crawler". Find Actors at https://apify.com/store. run_input: JSON-serializable input for the Actor. Each Actor defines its own - input schema β€” check the Actor README on Apify Store for required fields. + input schema - check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. @@ -526,7 +526,7 @@ def apify_run_task( the output data in one call. Args: - task_id: Task identifier in "username~task-name" format or a task ID string. + task_id: Task identifier in "username/task-name" format or a task ID string. task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. @@ -575,7 +575,7 @@ def apify_run_task_and_get_dataset( result data without making two separate tool calls. Args: - task_id: Task identifier in "username~task-name" format or a task ID string. + task_id: Task identifier in "username/task-name" format or a task ID string. task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. @@ -619,7 +619,7 @@ def apify_scrape_url( timeout_secs: int = DEFAULT_SCRAPE_TIMEOUT_SECS, crawler_type: str = "cheerio", ) -> Dict[str, Any]: - """Scrape a single URL and return its content as markdown. 
+ """Scrape a single URL and return its content as Markdown. Uses the Website Content Crawler Actor under the hood, pre-configured for fast single-page scraping. This is the simplest way to extract readable content @@ -635,7 +635,7 @@ def apify_scrape_url( - "playwright:firefox": Full JS rendering, best at bypassing anti-bot protection but slowest. Returns: - Dict with status and content containing the markdown content of the scraped page. + Dict with status and content containing the Markdown content of the scraped page. """ try: _check_dependency() From e172b1b0c32317618230dbd01866ae4c604b7094 Mon Sep 17 00:00:00 2001 From: Murat Kaan Meral Date: Thu, 9 Apr 2026 09:54:25 -0400 Subject: [PATCH 35/39] =?UTF-8?q?fix:=20add=20namespace=20validation=20and?= =?UTF-8?q?=20fix=20TOCTOU=20in=20elasticsearch=20memory=20=E2=80=A6=20(#4?= =?UTF-8?q?47)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/strands_tools/elasticsearch_memory.py | 128 +++++++++++--- tests/test_elasticsearch_memory.py | 198 ++++++++++++++++++---- tests/test_http_request.py | 2 + 3 files changed, 272 insertions(+), 56 deletions(-) diff --git a/src/strands_tools/elasticsearch_memory.py b/src/strands_tools/elasticsearch_memory.py index 6bf78930..5ad68f95 100644 --- a/src/strands_tools/elasticsearch_memory.py +++ b/src/strands_tools/elasticsearch_memory.py @@ -113,14 +113,15 @@ import json import logging import os +import re import time import uuid from datetime import datetime, timezone from enum import Enum -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import boto3 -from elasticsearch import Elasticsearch, NotFoundError +from elasticsearch import Elasticsearch from strands import tool # Set up logging @@ -183,6 +184,47 @@ class MemoryAction(str, Enum): DEFAULT_EMBEDDING_MODEL = "amazon.titan-embed-text-v2:0" DEFAULT_EMBEDDING_DIMS = 1024 # Titan v2 returns 1024 dimensions DEFAULT_MAX_RESULTS = 10 +DEFAULT_NAMESPACE = "default" + + +def _validate_namespace(namespace: Any) -> str: + """Validate and sanitize namespace parameter to prevent injection attacks. + + This function treats namespace as a trusted identifier by requiring it to be + a simple string matching the pattern ^[A-Za-z0-9_-]{1,64}$ before including + it in Elasticsearch queries. This prevents potential injection attacks and + ensures consistent namespace handling across all memory operations. + + Args: + namespace: The namespace value to validate (can be any type) + + Returns: + str: A validated string namespace (1-64 chars, alphanumeric + underscore + hyphen only) + + Raises: + ElasticsearchValidationError: If namespace cannot be converted to a safe string + """ + if namespace is None: + return DEFAULT_NAMESPACE + + if not isinstance(namespace, str): + raise ElasticsearchValidationError(f"Namespace must be a string, got {type(namespace).__name__}. ") + + clean_namespace = str(namespace).strip() + + if not clean_namespace: + raise ElasticsearchValidationError("Invalid namespace: Namespace cannot be empty.") + + if len(clean_namespace) > 64: + raise ElasticsearchValidationError("Invalid namespace: Namespace too long. Maximum 64 characters allowed.") + + if not re.match(r"^[A-Za-z0-9_-]{1,64}$", clean_namespace): + raise ElasticsearchValidationError( + f"Invalid namespace: Namespace '{clean_namespace}' contains invalid characters. 
" + "Must match pattern ^[A-Za-z0-9_-]{1,64}$" + ) + + return clean_namespace def _ensure_index_exists(es_client: Elasticsearch, index_name: str, es_url: Optional[str] = None): @@ -465,13 +507,27 @@ def _get_memory(es_client: Elasticsearch, index_name: str, namespace: str, memor Exception: If memory not found or not in correct namespace """ try: - response = es_client.get(index=index_name, id=memory_id) - source = response["_source"] + # Query with both memory_id and namespace to enforce tenant isolation server-side + search_body = { + "query": { + "bool": { + "must": [ + {"term": {"memory_id": memory_id}}, + {"term": {"namespace": namespace}}, + ] + } + }, + "size": 1, + "_source": ["memory_id", "content", "timestamp", "metadata", "namespace"], + } + + response = es_client.search(index=index_name, body=search_body) - # Verify namespace - if source.get("namespace") != namespace: + if not response["hits"]["hits"]: raise ElasticsearchMemoryNotFoundError(f"Memory {memory_id} not found in namespace {namespace}") + source = response["hits"]["hits"][0]["_source"] + return { "memory_id": source["memory_id"], "content": source["content"], @@ -480,8 +536,6 @@ def _get_memory(es_client: Elasticsearch, index_name: str, namespace: str, memor "namespace": source["namespace"], } - except NotFoundError: - raise ElasticsearchMemoryNotFoundError(f"Memory {memory_id} not found") from None except ElasticsearchMemoryNotFoundError: raise except Exception as e: @@ -492,6 +546,10 @@ def _delete_memory(es_client: Elasticsearch, index_name: str, namespace: str, me """ Delete a specific memory by ID. + Uses delete_by_query with both memory_id and namespace constraints to + atomically verify ownership and delete in a single operation, preventing + TOCTOU (Time-of-Check to Time-of-Use) race conditions. 
+ Args: es_client: Elasticsearch client index_name: Elasticsearch index name @@ -505,18 +563,29 @@ def _delete_memory(es_client: Elasticsearch, index_name: str, namespace: str, me Exception: If memory not found or deletion fails """ try: - # First verify the memory exists and is in correct namespace - _get_memory(es_client, index_name, namespace, memory_id) + # Atomically delete only if both memory_id and namespace match, + # preventing TOCTOU race conditions between check and delete + response = es_client.delete_by_query( + index=index_name, + body={ + "query": { + "bool": { + "must": [ + {"term": {"memory_id": memory_id}}, + {"term": {"namespace": namespace}}, + ] + } + } + }, + ) - # Delete the memory - response = es_client.delete(index=index_name, id=memory_id) + if response.get("deleted", 0) == 0: + raise ElasticsearchMemoryNotFoundError(f"Memory {memory_id} not found in namespace {namespace}") - return {"memory_id": memory_id, "result": response["result"]} + return {"memory_id": memory_id, "result": "deleted"} except ElasticsearchMemoryNotFoundError: raise - except NotFoundError: - raise ElasticsearchMemoryNotFoundError(f"Memory {memory_id} not found") from None except Exception as e: raise ElasticsearchMemoryError(f"Failed to delete memory {memory_id}: {str(e)}") from e @@ -603,11 +672,21 @@ def elasticsearch_memory( # Set defaults index_name = index_name or os.getenv("ELASTICSEARCH_INDEX_NAME", DEFAULT_INDEX_NAME) - namespace = namespace or os.getenv("ELASTICSEARCH_NAMESPACE", "default") + if namespace is None: + namespace = os.getenv("ELASTICSEARCH_NAMESPACE", DEFAULT_NAMESPACE) embedding_model = embedding_model or os.getenv("ELASTICSEARCH_EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL) region = region or os.getenv("AWS_REGION", "us-west-2") max_results = max_results or DEFAULT_MAX_RESULTS + # Validate namespace to prevent injection attacks + try: + safe_namespace = _validate_namespace(namespace) + except ElasticsearchValidationError as e: + return { + "status": "error", + "content": [{"text": f"Invalid namespace: {str(e)}"}], + } + # Initialize Elasticsearch client try: if es_url: @@ -685,7 +764,7 @@ def elasticsearch_memory( try: if action_enum == MemoryAction.RECORD: response = _record_memory( - es_client, bedrock_runtime, index_name, namespace, embedding_model, content, metadata + es_client, bedrock_runtime, index_name, safe_namespace, embedding_model, content, metadata ) return { "status": "success", @@ -694,7 +773,14 @@ def elasticsearch_memory( elif action_enum == MemoryAction.RETRIEVE: response = _retrieve_memories( - es_client, bedrock_runtime, index_name, namespace, embedding_model, query, max_results, next_token + es_client, + bedrock_runtime, + index_name, + safe_namespace, + embedding_model, + query, + max_results, + next_token, ) return { "status": "success", @@ -702,21 +788,21 @@ def elasticsearch_memory( } elif action_enum == MemoryAction.LIST: - response = _list_memories(es_client, index_name, namespace, max_results, next_token) + response = _list_memories(es_client, index_name, safe_namespace, max_results, next_token) return { "status": "success", "content": [{"text": f"Memories listed successfully: {json.dumps(response, default=str)}"}], } elif action_enum == MemoryAction.GET: - response = _get_memory(es_client, index_name, namespace, memory_id) + response = _get_memory(es_client, index_name, safe_namespace, memory_id) return { "status": "success", "content": [{"text": f"Memory retrieved successfully: {json.dumps(response, default=str)}"}], } elif action_enum == 
MemoryAction.DELETE: - response = _delete_memory(es_client, index_name, namespace, memory_id) + response = _delete_memory(es_client, index_name, safe_namespace, memory_id) return { "status": "success", "content": [{"text": f"Memory deleted successfully: {memory_id}"}], diff --git a/tests/test_elasticsearch_memory.py b/tests/test_elasticsearch_memory.py index 01f0ce51..b95b5bc8 100644 --- a/tests/test_elasticsearch_memory.py +++ b/tests/test_elasticsearch_memory.py @@ -260,14 +260,21 @@ def test_get_memory(mock_elasticsearch_client, mock_bedrock_client, config): """Test getting a specific memory by ID.""" agent = Agent(tools=[elasticsearch_memory]) - # Configure mock get response - mock_elasticsearch_client["client"].get.return_value = { - "_source": { - "memory_id": "mem_123", - "content": "Test content", - "timestamp": "2023-01-01T00:00:00Z", - "metadata": {"category": "test"}, - "namespace": "test_namespace", + # Configure mock search response (now uses search instead of get for namespace enforcement) + mock_elasticsearch_client["client"].search.return_value = { + "hits": { + "hits": [ + { + "_source": { + "memory_id": "mem_123", + "content": "Test content", + "timestamp": "2023-01-01T00:00:00Z", + "metadata": {"category": "test"}, + "namespace": "test_namespace", + } + } + ], + "total": {"value": 1}, } } @@ -278,25 +285,20 @@ def test_get_memory(mock_elasticsearch_client, mock_bedrock_client, config): assert result["status"] == "success" assert "Memory retrieved successfully" in result["content"][0]["text"] - # Verify get was called - mock_elasticsearch_client["client"].get.assert_called_once_with(index="test_index", id="mem_123") + # Verify search was called with both memory_id and namespace for security + mock_elasticsearch_client["client"].search.assert_called_once() + search_call = mock_elasticsearch_client["client"].search.call_args[1] + query = search_call["body"]["query"]["bool"]["must"] + assert {"term": {"memory_id": "mem_123"}} in query + assert {"term": {"namespace": "test_namespace"}} in query def test_delete_memory(mock_elasticsearch_client, mock_bedrock_client, config): """Test deleting a memory.""" agent = Agent(tools=[elasticsearch_memory]) - # Configure mock responses - mock_elasticsearch_client["client"].get.return_value = { - "_source": { - "memory_id": "mem_123", - "content": "Test content", - "timestamp": "2023-01-01T00:00:00Z", - "metadata": {}, - "namespace": "test_namespace", - } - } - mock_elasticsearch_client["client"].delete.return_value = {"result": "deleted"} + # Configure mock delete_by_query response (atomic delete with namespace constraint) + mock_elasticsearch_client["client"].delete_by_query.return_value = {"deleted": 1} # Call the tool result = agent.tool.elasticsearch_memory(action="delete", memory_id="mem_123", **config) @@ -305,8 +307,12 @@ def test_delete_memory(mock_elasticsearch_client, mock_bedrock_client, config): assert result["status"] == "success" assert "Memory deleted successfully: mem_123" in result["content"][0]["text"] - # Verify delete was called - mock_elasticsearch_client["client"].delete.assert_called_once_with(index="test_index", id="mem_123") + # Verify delete_by_query was called with both memory_id and namespace + mock_elasticsearch_client["client"].delete_by_query.assert_called_once() + call_args = mock_elasticsearch_client["client"].delete_by_query.call_args[1] + query = call_args["body"]["query"]["bool"]["must"] + assert {"term": {"memory_id": "mem_123"}} in query + assert {"term": {"namespace": "test_namespace"}} in query def 
test_unsupported_action(mock_elasticsearch_client, mock_bedrock_client, config): @@ -387,26 +393,32 @@ def test_memory_not_found(mock_elasticsearch_client, mock_bedrock_client, config """Test handling when memory is not found.""" agent = Agent(tools=[elasticsearch_memory]) - from elasticsearch import NotFoundError - - # Configure mock to raise NotFoundError - mock_elasticsearch_client["client"].get.side_effect = NotFoundError("404", "not_found_exception", {}) + # Configure mock search to return empty results (memory not found in namespace) + mock_elasticsearch_client["client"].search.return_value = { + "hits": { + "hits": [], + "total": {"value": 0}, + } + } # Call the tool result = agent.tool.elasticsearch_memory(action="get", memory_id="nonexistent", **config) # Verify error response assert result["status"] == "error" - assert "Memory nonexistent not found" in result["content"][0]["text"] + assert "Memory nonexistent not found in namespace test_namespace" in result["content"][0]["text"] def test_namespace_validation(mock_elasticsearch_client, mock_bedrock_client, config): """Test that memories are properly filtered by namespace.""" agent = Agent(tools=[elasticsearch_memory]) - # Configure mock get response with wrong namespace - mock_elasticsearch_client["client"].get.return_value = { - "_source": {"memory_id": "mem_123", "content": "Test content", "namespace": "wrong_namespace"} + # Configure mock search to return empty results (memory not in this namespace) + mock_elasticsearch_client["client"].search.return_value = { + "hits": { + "hits": [], + "total": {"value": 0}, + } } # Call the tool @@ -416,6 +428,13 @@ def test_namespace_validation(mock_elasticsearch_client, mock_bedrock_client, co assert result["status"] == "error" assert "not found in namespace test_namespace" in result["content"][0]["text"] + # Verify search was called with both memory_id and namespace + mock_elasticsearch_client["client"].search.assert_called_once() + search_call = mock_elasticsearch_client["client"].search.call_args[1] + query = search_call["body"]["query"]["bool"]["must"] + assert {"term": {"memory_id": "mem_123"}} in query + assert {"term": {"namespace": "test_namespace"}} in query + def test_pagination_support(mock_elasticsearch_client, mock_bedrock_client, config): """Test pagination support in list and retrieve operations.""" @@ -754,12 +773,15 @@ def test_security_scenarios(mock_elasticsearch_client, mock_bedrock_client): """Test security-related scenarios like namespace isolation.""" agent = Agent(tools=[elasticsearch_memory]) - # Configure mock get response with wrong namespace - mock_elasticsearch_client["client"].get.return_value = { - "_source": {"memory_id": "mem_123", "content": "Test content", "namespace": "wrong_namespace"} + # Configure mock search to return empty results (memory not in this namespace) + mock_elasticsearch_client["client"].search.return_value = { + "hits": { + "hits": [], + "total": {"value": 0}, + } } - # Test namespace validation + # Test namespace validation - memory exists but not in requested namespace result = agent.tool.elasticsearch_memory( action="get", memory_id="mem_123", @@ -791,3 +813,109 @@ def test_troubleshooting_scenarios(mock_elasticsearch_client, mock_bedrock_clien result = agent.tool.elasticsearch_memory(action="record", content="test", **config) assert result["status"] == "error" assert "Unable to connect to Elasticsearch cluster" in result["content"][0]["text"] + + +def test_injection_prevention(mock_elasticsearch_client, mock_bedrock_client, config): 
+ """Test that injection attempts via namespace are blocked.""" + agent = Agent(tools=[elasticsearch_memory]) + + # Remove namespace from config to avoid conflict + test_config = {k: v for k, v in config.items() if k != "namespace"} + + # Test dict-based injection (analogous to MongoDB {"$ne": ""} attack) + malicious_namespace = {"$ne": ""} + result = agent.tool.elasticsearch_memory(action="list", namespace=malicious_namespace, **test_config) + assert result["status"] == "error" + error_text = result["content"][0]["text"] + assert "Invalid namespace" in error_text or "Input should be a valid string" in error_text + + # Test other injection payloads + injection_attempts = [ + {"$gt": ""}, + {"$regex": ".*"}, + {"$exists": True}, + {"$in": ["tenant1", "tenant2"]}, + ] + + for injection_payload in injection_attempts: + result = agent.tool.elasticsearch_memory(action="list", namespace=injection_payload, **test_config) + assert result["status"] == "error", f"Injection {injection_payload} should be blocked" + error_text = result["content"][0]["text"] + assert "Invalid namespace" in error_text or "Input should be a valid string" in error_text + + +def test_namespace_validation_strict_rules(mock_elasticsearch_client, mock_bedrock_client, config): + """Test strict namespace validation rules.""" + agent = Agent(tools=[elasticsearch_memory]) + + # Remove namespace from config to avoid conflict + test_config = {k: v for k, v in config.items() if k != "namespace"} + + # Test invalid characters (should be rejected) + invalid_namespaces = [ + "user.name", # Dots not allowed + "user@domain", # @ symbol + "user$name", # $ symbol + "user name", # Space + "user/path", # Forward slash + "user:name", # Colon + "a" * 65, # Too long (over 64 chars) + "", # Empty + " ", # Whitespace only + ] + + for invalid_namespace in invalid_namespaces: + result = agent.tool.elasticsearch_memory(action="list", namespace=invalid_namespace, **test_config) + assert result["status"] == "error", f"Invalid namespace '{invalid_namespace}' should be rejected" + error_text = result["content"][0]["text"] + assert "Invalid namespace" in error_text + + +def test_valid_namespaces_accepted(mock_elasticsearch_client, mock_bedrock_client, config): + """Test that valid namespaces are accepted.""" + agent = Agent(tools=[elasticsearch_memory]) + + # Configure mock responses + mock_elasticsearch_client["client"].search.return_value = { + "hits": { + "hits": [], + "total": {"value": 0}, + } + } + + # Remove namespace from config + test_config = {k: v for k, v in config.items() if k != "namespace"} + + valid_namespaces = [ + "default", + "user_123", + "tenant-abc", + "MyNamespace", + "a", + "A" * 64, # Max length + ] + + for valid_namespace in valid_namespaces: + result = agent.tool.elasticsearch_memory(action="list", namespace=valid_namespace, **test_config) + assert result["status"] == "success", f"Valid namespace '{valid_namespace}' should be accepted" + + +def test_delete_memory_namespace_enforcement(mock_elasticsearch_client, mock_bedrock_client, config): + """Test that delete enforces namespace atomically (no TOCTOU).""" + agent = Agent(tools=[elasticsearch_memory]) + + # Configure delete_by_query to return 0 deleted (memory not in namespace) + mock_elasticsearch_client["client"].delete_by_query.return_value = {"deleted": 0} + + result = agent.tool.elasticsearch_memory(action="delete", memory_id="mem_123", **config) + + # Should fail because memory not found in the requested namespace + assert result["status"] == "error" + assert "not found 
in namespace test_namespace" in result["content"][0]["text"] + + # Verify delete_by_query was called with namespace constraint + mock_elasticsearch_client["client"].delete_by_query.assert_called_once() + call_args = mock_elasticsearch_client["client"].delete_by_query.call_args[1] + query = call_args["body"]["query"]["bool"]["must"] + assert {"term": {"memory_id": "mem_123"}} in query + assert {"term": {"namespace": "test_namespace"}} in query diff --git a/tests/test_http_request.py b/tests/test_http_request.py index 54c10c30..d3a084cc 100644 --- a/tests/test_http_request.py +++ b/tests/test_http_request.py @@ -25,6 +25,7 @@ def mock_request_state(): """Create a mock request state dictionary.""" return {} + @pytest.fixture def mock_env_vars(): """Set up mock environment variables for testing.""" @@ -44,6 +45,7 @@ def extract_result_text(result): return "\n".join([item["text"] for item in result["content"]]) return str(result) + @responses.activate def test_basic_get_request(): """Test a basic GET request with direct invocation.""" From 34146fe74027cfc815415eacc7b7e582c09e8b03 Mon Sep 17 00:00:00 2001 From: Agent of mkmeral Date: Fri, 10 Apr 2026 12:38:04 -0400 Subject: [PATCH 36/39] docs: update repository guidelines for new tools policy (#445) Co-authored-by: agent-of-mkmeral Co-authored-by: agent-of-mkmeral <265349452+agent-of-mkmeral@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/feature_request.yml | 4 ++++ .github/PULL_REQUEST_TEMPLATE.md | 3 ++- CONTRIBUTING.md | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index e5f36b2c..40442a01 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -8,6 +8,10 @@ body: attributes: value: | Thanks for suggesting a new feature for Strands Agents Tools! + + > **Note:** We are not accepting new tools into this repository. If you'd like to build a new tool, we recommend using our [extension template](https://github.com/strands-agents/extension-template-python) to publish it as a standalone package. You can then get it featured in our [community catalog](https://strandsagents.com/docs/community/get-featured/). + > + > We still welcome feature requests for **improvements to existing tools**. - type: textarea id: problem-statement attributes: diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index aec71af0..f0888b3e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -14,8 +14,9 @@ +> Please note that we are not accepting new tools into this repository. Instead, we recommend using our [extension template](https://github.com/strands-agents/extension-template-python) to publish your own tool package and get it featured in our [community catalog](https://strandsagents.com/docs/community/get-featured/). + Bug fix -New Tool Breaking change Documentation update Other (please describe): diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4e9ea627..e8f71776 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,6 +7,27 @@ Please read through this document before submitting any issues or pull requests information to effectively respond to your bug report or contribution. +## New Tools Policy + +**We are not accepting new tools into this repository.** Instead, we recommend publishing new tools as standalone community packages β€” this way you own your release cycle and can iterate independently. 
+ +**What we accept:** +- Bug fixes for existing tools +- Documentation improvements +- Performance enhancements to existing tools +- Test coverage improvements + +**What we don't accept:** +- New tool submissions (PRs adding new tools will be closed) +- New tool feature requests (issues requesting new tools will be closed) + +**Want to build a tool?** Use our [extension template](https://github.com/strands-agents/extension-template-python) to scaffold your own tool package and publish it to PyPI. Once published, you can get it featured in our docs and community catalog: + +- Extension template: https://github.com/strands-agents/extension-template-python +- Get featured in docs: https://strandsagents.com/docs/community/get-featured/ +- Contribution guide: https://strandsagents.com/docs/contribute/ + + ## Reporting Bugs/Feature Requests We welcome you to use the [Bug Reports](../../issues/new?template=bug_report.yml) file to report bugs or [Feature Requests](../../issues/new?template=feature_request.yml) to suggest features. From 81810c4af18f3020ceb5c912438aa45e068babc6 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Wed, 8 Apr 2026 09:04:09 +0200 Subject: [PATCH 37/39] fix: Improve docs using apify-writing-style --- docs/apify_tool.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/apify_tool.md b/docs/apify_tool.md index 46e9e800..ec0a9238 100644 --- a/docs/apify_tool.md +++ b/docs/apify_tool.md @@ -191,9 +191,9 @@ items = agent.tool.apify_get_dataset_items( | `APIFY_API_TOKEN environment variable is not set` | Token not configured | Set the `APIFY_API_TOKEN` environment variable | | `apify-client package is required` | Optional dependency not installed | Run `pip install strands-agents-tools[apify]` | | `Actor ... finished with status FAILED` | Actor execution error | Check Actor input parameters and run logs in [Apify Console](https://console.apify.com) | -| `Task ... finished with status FAILED` | task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) | +| `Task ... finished with status FAILED` | Task execution error | Check task configuration and run logs in [Apify Console](https://console.apify.com) | | `Actor/task ... finished with status TIMED-OUT` | Timeout too short for the workload | Increase the `timeout_secs` parameter | -| `Task ... returned no run data` | task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | +| `Task ... 
returned no run data` | Task `call()` returned `None` (wait timeout) | Increase the `timeout_secs` parameter | | `No content returned for URL` | Website Content Crawler returned empty results | Verify the URL is accessible and returns content | ## References From 2eab80c4ed6949e782489b216c092bd4a4d2b2d4 Mon Sep 17 00:00:00 2001 From: Jiri Spilka Date: Tue, 14 Apr 2026 14:08:06 +0200 Subject: [PATCH 38/39] fix: Improve docs using apify-writing-style --- README.md | 6 +++--- src/strands_tools/apify.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9e75e82a..81af8ed6 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Strands Agents Tools is a community-driven project that provides a powerful set - πŸ“ **File Operations** - Read, write, and edit files with syntax highlighting and intelligent modifications - πŸ–₯️ **Shell Integration** - Execute and interact with shell commands securely -- 🧠 **Memory** - Store user and agent memories across agent runs to provide personalized experiences with Mem0, Amazon Bedrock Knowledge Bases, Elasticsearch, and MongoDB Atlas +- 🧠 **Memory** - Store user and agent memories across agent runs to provide personalized experiences with both Mem0, Amazon Bedrock Knowledge Bases, Elasticsearch, and MongoDB Atlas - πŸ•ΈοΈ **Web Infrastructure** - Perform web searches, extract page content, and crawl websites with Tavily and Exa-powered tools - 🌐 **HTTP Client** - Make API requests with comprehensive authentication support - πŸ’¬ **Slack Client** - Real-time Slack events, message processing, and Slack API access @@ -202,7 +202,7 @@ result = agent.tool.mcp_client( tool_args={"x": 10, "y": 20} ) -# Connect to an SSE-based server +# Connect to a SSE-based server agent.tool.mcp_client( action="connect", connection_id="web_server", @@ -273,7 +273,7 @@ response = agent.tool.http_request( auth_token="your_token_here" ) -# Convert HTML webpages to Markdown for better readability +# Convert HTML webpages to markdown for better readability response = agent.tool.http_request( method="GET", url="https://example.com/article", diff --git a/src/strands_tools/apify.py b/src/strands_tools/apify.py index 3f054603..19f8696a 100644 --- a/src/strands_tools/apify.py +++ b/src/strands_tools/apify.py @@ -394,7 +394,7 @@ def apify_run_actor( run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema - check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default `memory` value if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. Returns: @@ -483,7 +483,7 @@ def apify_run_actor_and_get_dataset( run_input: JSON-serializable input for the Actor. Each Actor defines its own input schema - check the Actor README on Apify Store for required fields. timeout_secs: Maximum time in seconds to wait for the Actor run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default if not set. + memory_mbytes: Memory allocation in MB for the Actor run. Uses Actor default `memory` value if not set. build: Actor build tag or number to run a specific version. Uses latest build if not set. dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. 
dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. @@ -538,7 +538,7 @@ def apify_run_task( task_id: Task identifier in "username/task-name" format or a task ID string. task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. + memory_mbytes: Memory allocation in MB for the task run. Uses task default `memory` value if not set. Returns: Dict with status and content containing run metadata: run_id, status, dataset_id, @@ -587,7 +587,7 @@ def apify_run_task_and_get_dataset( task_id: Task identifier in "username/task-name" format or a task ID string. task_input: Optional JSON-serializable input to override the task's default input fields. timeout_secs: Maximum time in seconds to wait for the task run to finish. Defaults to 300. - memory_mbytes: Memory allocation in MB for the task run. Uses task default if not set. + memory_mbytes: Memory allocation in MB for the task run. Uses task default `memory` value if not set. dataset_items_limit: Maximum number of dataset items to return. Defaults to 100. dataset_items_offset: Number of dataset items to skip for pagination. Defaults to 0. From 4de42a09f8289c62a3d1f16aab117c7e44286347 Mon Sep 17 00:00:00 2001 From: poshinchen Date: Wed, 15 Apr 2026 17:23:28 -0400 Subject: [PATCH 39/39] fix(rss): prevent path traversal via unvalidated feed_id in get_feed_file_path (#451) --- src/strands_tools/rss.py | 6 +++++- tests/test_rss.py | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/strands_tools/rss.py b/src/strands_tools/rss.py index 9a91de0a..952501c4 100644 --- a/src/strands_tools/rss.py +++ b/src/strands_tools/rss.py @@ -34,7 +34,11 @@ def __init__(self): os.makedirs(self.storage_path, exist_ok=True) def get_feed_file_path(self, feed_id: str) -> str: - return os.path.join(self.storage_path, f"{feed_id}.json") + file_path = os.path.realpath(os.path.join(self.storage_path, f"{feed_id}.json")) + storage_real = os.path.realpath(self.storage_path) + if not file_path.startswith(storage_real + os.sep): + raise ValueError(f"Invalid feed_id: path traversal detected in '{feed_id}'") + return file_path def get_subscription_file_path(self) -> str: return os.path.join(self.storage_path, "subscriptions.json") diff --git a/tests/test_rss.py b/tests/test_rss.py index 5a31cbe0..50b1db41 100644 --- a/tests/test_rss.py +++ b/tests/test_rss.py @@ -1,6 +1,7 @@ """Comprehensive tests for RSS feed tool with improved organization.""" import json +import os from unittest.mock import MagicMock, call, mock_open, patch import pytest @@ -131,6 +132,28 @@ def test_content_processing(self): result = manager.format_entry(entry_no_content, include_content=True) assert result["content"] == "No content available" + @pytest.mark.parametrize( + "feed_id", + [ + "../outside", + "../../etc/config", + "subdir/../../../escape", + "/absolute/path", + ], + ) + def test_get_feed_file_path_rejects_traversal(self, feed_id): + """Test that path traversal sequences in feed_id are rejected.""" + manager = RSSManager() + with pytest.raises(ValueError, match="path traversal detected"): + manager.get_feed_file_path(feed_id) + + def test_get_feed_file_path_allows_valid_ids(self): + """Test that valid feed_ids are accepted.""" + manager = RSSManager() + path = manager.get_feed_file_path("my_valid_feed") + assert 
path.endswith("my_valid_feed.json") + assert os.path.realpath(manager.storage_path) in path + @pytest.mark.parametrize( "url,expected_id", [