diff --git a/README.md b/README.md index c5f288f..e486d5f 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ AI-friendly web content fetching tool designed for LLM consumption. Rust library - **Pluggable fetchers** - URL-aware dispatch to specialized handlers for repos, docs, feeds, videos, papers, and more - **HTML-to-Markdown** - Built-in conversion optimized for LLMs, with fetched relative links/images resolved to absolute URLs - **Agent content focus** - Optional low-noise extraction mode for AI agents +- **Crawl discovery** - Optional bounded same-origin page discovery for AI agents - **HTML-to-Text** - Plain text extraction with clean formatting - **Binary detection** - Returns metadata only for images, PDFs, etc. - **Timeout handling** - 1s first-byte, 30s body with partial content on timeout @@ -72,6 +73,9 @@ fetchkit fetch https://example.com --user-agent "MyBot/1.0" # Hardened outbound policy for cluster/data-plane use fetchkit fetch https://example.com --hardened +# Discover a small same-origin page map for an agent +fetchkit fetch https://example.com --content-focus agent --crawl --max-pages 5 + # Show full documentation fetchkit --llmtxt ``` @@ -216,6 +220,8 @@ response = tool.fetch("https://example.com") | `as_text` | bool? | Convert HTML to plain text | | `save_to_file` | string? | Save body to path (requires `FileSaver`) | | `content_focus` | string? | `"full"`/unset returns everything; `"main"` strips semantic boilerplate; `"readable"` selects article-like content; `"agent"` selects the best low-noise strategy for AI agents | +| `crawl` | bool? | Fetch the seed URL, then discover and fetch bounded same-origin pages | +| `max_pages` | int? | Maximum crawl pages, including the seed; default 5, max 20 | | `if_none_match` | string? | ETag for conditional `If-None-Match` | | `if_modified_since` | string? | Timestamp for conditional `If-Modified-Since` | @@ -239,6 +245,7 @@ response = tool.fetch("https://example.com") | `bytes_written` | int? | Bytes saved to file | | `metadata` | object? | Structured `PageMetadata` (title, description, links, headings, extraction method, …) | | `quality` | object? | Agent-facing `PageQuality` (score, warnings, link density, suggested next action) | +| `crawl` | object? | Bounded crawl discovery result with visited page summaries | | `word_count` | int? | Word count of returned content | | `redirect_chain` | string[] | URLs visited during redirects (empty if none) | | `is_paywall` | bool? | Heuristic paywall signal (soft, not guaranteed) | diff --git a/crates/fetchkit-cli/src/main.rs b/crates/fetchkit-cli/src/main.rs index d871e25..32be1e1 100644 --- a/crates/fetchkit-cli/src/main.rs +++ b/crates/fetchkit-cli/src/main.rs @@ -14,7 +14,7 @@ mod mcp; use clap::{Parser, Subcommand, ValueEnum}; -use fetchkit::{FetchRequest, Tool, TOOL_LLMTXT}; +use fetchkit::{CrawlPage, FetchRequest, Tool, TOOL_LLMTXT}; use std::io::{self, Write}; /// Output format for fetch subcommand @@ -88,6 +88,18 @@ enum Commands { /// Agent FQDN for Signature-Agent header (requires --bot-auth-key) #[arg(long)] bot_auth_agent: Option, + + /// Extraction focus: full, main, readable, or agent + #[arg(long)] + content_focus: Option, + + /// Discover and fetch a bounded set of same-origin pages + #[arg(long)] + crawl: bool, + + /// Maximum crawl pages, including the seed + #[arg(long, default_value_t = 5)] + max_pages: usize, }, } @@ -125,17 +137,22 @@ async fn main() { allow_env_proxy, bot_auth_key, bot_auth_agent, + content_focus, + crawl, + max_pages, }) => { - run_fetch( - &url, + let options = FetchCommandOptions { output, user_agent, hardened, allow_env_proxy, bot_auth_key, bot_auth_agent, - ) - .await; + content_focus, + crawl, + max_pages, + }; + run_fetch(&url, options).await; } None => { eprintln!("Usage: fetchkit fetch "); @@ -192,28 +209,38 @@ fn build_tool( builder.build() } -async fn run_fetch( - url: &str, +struct FetchCommandOptions { output: OutputFormat, user_agent: Option, hardened: bool, allow_env_proxy: bool, bot_auth_key: Option, bot_auth_agent: Option, -) { + content_focus: Option, + crawl: bool, + max_pages: usize, +} + +async fn run_fetch(url: &str, options: FetchCommandOptions) { // Build request with markdown conversion - let request = FetchRequest::new(url).as_markdown(); + let mut request = FetchRequest::new(url).as_markdown(); + if let Some(focus) = options.content_focus { + request = request.content_focus(focus); + } + if options.crawl { + request = request.crawl(true).max_pages(options.max_pages); + } let tool = build_tool( - user_agent, - hardened, - allow_env_proxy, - bot_auth_key, - bot_auth_agent, + options.user_agent, + options.hardened, + options.allow_env_proxy, + options.bot_auth_key, + options.bot_auth_agent, ); // Execute request match tool.execute(request).await { - Ok(response) => match output { + Ok(response) => match options.output { OutputFormat::Md => print_md_with_frontmatter(&response), OutputFormat::Json => { let json = serde_json::to_string_pretty(&response).unwrap_or_else(|e| { @@ -277,6 +304,12 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String { output.push_str(&format!("suggested_next_action: {}\n", yaml_quote(action))); } } + if let Some(ref crawl) = response.crawl { + output.push_str(&format!("crawl_pages: {}\n", crawl.pages.len())); + if crawl.truncated.unwrap_or(false) { + output.push_str("crawl_truncated: true\n"); + } + } output.push_str("---\n"); // Append content, or error as body for unsupported content @@ -285,10 +318,43 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String { } else if let Some(ref err) = response.error { output.push_str(err); } + append_crawl_summary(&mut output, response); output } +fn append_crawl_summary(output: &mut String, response: &fetchkit::FetchResponse) { + let Some(ref crawl) = response.crawl else { + return; + }; + if crawl.pages.is_empty() { + return; + } + + output.push_str("\n\n## Crawl Discovery\n\n"); + for page in &crawl.pages { + output.push_str(&format!("- {}\n", format_crawl_page(page))); + } +} + +fn format_crawl_page(page: &CrawlPage) -> String { + let title = page.title.as_deref().unwrap_or(page.url.as_str()); + let mut summary = format!("[{}]({})", title.replace(['[', ']'], ""), page.url); + if let Some(status_code) = page.status_code { + summary.push_str(&format!(" - status {status_code}")); + } + if let Some(score) = page.quality_score { + summary.push_str(&format!(", quality {score:.2}")); + } + if let Some(word_count) = page.word_count { + summary.push_str(&format!(", {word_count} words")); + } + if let Some(ref error) = page.error { + summary.push_str(&format!(", error: {error}")); + } + summary +} + /// Write to stdout, exit silently on broken pipe fn writeln_safe(s: &str) { let stdout = io::stdout(); @@ -305,7 +371,7 @@ fn writeln_safe(s: &str) { #[cfg(test)] mod tests { use super::*; - use fetchkit::{FetchResponse, PageQuality}; + use fetchkit::{CrawlPage, CrawlResult, FetchResponse, PageQuality}; #[test] fn test_format_md_basic() { @@ -408,4 +474,35 @@ mod tests { assert!(output.contains("filename: \"*alias\"\n")); assert!(!output.contains("\nforged: true\n")); } + + #[test] + fn test_format_md_includes_crawl_summary() { + let response = FetchResponse { + url: "https://example.com".to_string(), + status_code: 200, + content: Some("# Home".to_string()), + crawl: Some(CrawlResult { + seed_url: "https://example.com".to_string(), + max_pages: 2, + pages: vec![CrawlPage { + url: "https://example.com/docs".to_string(), + status_code: Some(200), + title: Some("Docs".to_string()), + word_count: Some(42), + quality_score: Some(0.91), + ..Default::default() + }], + truncated: Some(true), + }), + ..Default::default() + }; + + let output = format_md_with_frontmatter(&response); + + assert!(output.contains("crawl_pages: 1\n")); + assert!(output.contains("crawl_truncated: true\n")); + assert!(output.contains("## Crawl Discovery")); + assert!(output + .contains("[Docs](https://example.com/docs) - status 200, quality 0.91, 42 words")); + } } diff --git a/crates/fetchkit-cli/src/mcp.rs b/crates/fetchkit-cli/src/mcp.rs index 594a43c..8882f8b 100644 --- a/crates/fetchkit-cli/src/mcp.rs +++ b/crates/fetchkit-cli/src/mcp.rs @@ -1,6 +1,6 @@ //! MCP (Model Context Protocol) server implementation -use fetchkit::Tool; +use fetchkit::{CrawlPage, Tool}; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::io::{self, BufRead, Write}; @@ -230,6 +230,12 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String { output.push_str(&format!("suggested_next_action: {}\n", yaml_quote(action))); } } + if let Some(ref crawl) = response.crawl { + output.push_str(&format!("crawl_pages: {}\n", crawl.pages.len())); + if crawl.truncated.unwrap_or(false) { + output.push_str("crawl_truncated: true\n"); + } + } output.push_str("---\n"); // Append content, or error as body for unsupported content @@ -238,10 +244,43 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String { } else if let Some(ref err) = response.error { output.push_str(err); } + append_crawl_summary(&mut output, response); output } +fn append_crawl_summary(output: &mut String, response: &fetchkit::FetchResponse) { + let Some(ref crawl) = response.crawl else { + return; + }; + if crawl.pages.is_empty() { + return; + } + + output.push_str("\n\n## Crawl Discovery\n\n"); + for page in &crawl.pages { + output.push_str(&format!("- {}\n", format_crawl_page(page))); + } +} + +fn format_crawl_page(page: &CrawlPage) -> String { + let title = page.title.as_deref().unwrap_or(page.url.as_str()); + let mut summary = format!("[{}]({})", title.replace(['[', ']'], ""), page.url); + if let Some(status_code) = page.status_code { + summary.push_str(&format!(" - status {status_code}")); + } + if let Some(score) = page.quality_score { + summary.push_str(&format!(", quality {score:.2}")); + } + if let Some(word_count) = page.word_count { + summary.push_str(&format!(", {word_count} words")); + } + if let Some(ref error) = page.error { + summary.push_str(&format!(", error: {error}")); + } + summary +} + fn yaml_quote(value: &str) -> String { serde_json::to_string(value).unwrap_or_else(|_| "\"\"".to_string()) } @@ -315,4 +354,35 @@ mod tests { assert!(output.contains("extraction_method: \"agent_main\"\n")); assert!(output.contains("suggested_next_action: \"retry_with_agent_focus_or_crawl\"\n")); } + + #[test] + fn test_format_md_includes_crawl_summary() { + let response = fetchkit::FetchResponse { + url: "https://example.com".to_string(), + status_code: 200, + content: Some("# Home".to_string()), + crawl: Some(fetchkit::CrawlResult { + seed_url: "https://example.com".to_string(), + max_pages: 2, + pages: vec![fetchkit::CrawlPage { + url: "https://example.com/docs".to_string(), + status_code: Some(200), + title: Some("Docs".to_string()), + word_count: Some(42), + quality_score: Some(0.91), + ..Default::default() + }], + truncated: Some(true), + }), + ..Default::default() + }; + + let output = format_md_with_frontmatter(&response); + + assert!(output.contains("crawl_pages: 1\n")); + assert!(output.contains("crawl_truncated: true\n")); + assert!(output.contains("## Crawl Discovery")); + assert!(output + .contains("[Docs](https://example.com/docs) - status 200, quality 0.91, 42 words")); + } } diff --git a/crates/fetchkit-cli/tests/cli_integration.rs b/crates/fetchkit-cli/tests/cli_integration.rs index ffdb45d..e9c604c 100644 --- a/crates/fetchkit-cli/tests/cli_integration.rs +++ b/crates/fetchkit-cli/tests/cli_integration.rs @@ -163,6 +163,9 @@ fn test_fetch_help_lists_hardening_flags() { assert!(output.status.success()); assert!(stdout.contains("--hardened")); assert!(stdout.contains("--allow-env-proxy")); + assert!(stdout.contains("--content-focus")); + assert!(stdout.contains("--crawl")); + assert!(stdout.contains("--max-pages")); } #[test] diff --git a/crates/fetchkit-python/src/lib.rs b/crates/fetchkit-python/src/lib.rs index d869ff0..debca1e 100644 --- a/crates/fetchkit-python/src/lib.rs +++ b/crates/fetchkit-python/src/lib.rs @@ -32,12 +32,15 @@ pub struct PyFetchRequest { impl PyFetchRequest { /// Create a new request #[new] - #[pyo3(signature = (url, method=None, as_markdown=None, as_text=None))] + #[pyo3(signature = (url, method=None, as_markdown=None, as_text=None, content_focus=None, crawl=None, max_pages=None))] fn new( url: String, method: Option, as_markdown: Option, as_text: Option, + content_focus: Option, + crawl: Option, + max_pages: Option, ) -> PyResult { let mut req = FetchRequest::new(url); @@ -47,6 +50,9 @@ impl PyFetchRequest { req.as_markdown = as_markdown; req.as_text = as_text; + req.content_focus = content_focus; + req.crawl = crawl; + req.max_pages = max_pages; Ok(Self { inner: req }) } @@ -75,6 +81,24 @@ impl PyFetchRequest { self.inner.as_text } + /// Get content focus. + #[getter] + fn content_focus(&self) -> Option<&str> { + self.inner.content_focus.as_deref() + } + + /// Get crawl flag. + #[getter] + fn crawl(&self) -> Option { + self.inner.crawl + } + + /// Get max pages. + #[getter] + fn max_pages(&self) -> Option { + self.inner.max_pages + } + /// Convert to JSON string fn to_json(&self) -> PyResult { serde_json::to_string(&self.inner).map_err(|e| PyValueError::new_err(e.to_string())) @@ -301,32 +325,56 @@ impl PyFetchKitTool { } /// Fetch a URL directly (convenience method) - #[pyo3(signature = (url, method=None, as_markdown=None, as_text=None))] + #[pyo3(signature = (url, method=None, as_markdown=None, as_text=None, content_focus=None, crawl=None, max_pages=None))] + #[allow(clippy::too_many_arguments)] fn fetch( &self, url: String, method: Option, as_markdown: Option, as_text: Option, + content_focus: Option, + crawl: Option, + max_pages: Option, ) -> PyResult { - let request = PyFetchRequest::new(url, method, as_markdown, as_text)?; + let request = PyFetchRequest::new( + url, + method, + as_markdown, + as_text, + content_focus, + crawl, + max_pages, + )?; self.execute(request) } } /// Fetch a URL using default options (convenience function) #[pyfunction] -#[pyo3(signature = (url, method=None, as_markdown=None, as_text=None))] +#[pyo3(signature = (url, method=None, as_markdown=None, as_text=None, content_focus=None, crawl=None, max_pages=None))] +#[allow(clippy::too_many_arguments)] fn fetch( url: String, method: Option, as_markdown: Option, as_text: Option, + content_focus: Option, + crawl: Option, + max_pages: Option, ) -> PyResult { let tool = PyFetchKitTool::new( true, true, None, None, None, None, true, false, None, None, None, false, )?; - tool.fetch(url, method, as_markdown, as_text) + tool.fetch( + url, + method, + as_markdown, + as_text, + content_focus, + crawl, + max_pages, + ) } /// Python module definition diff --git a/crates/fetchkit/src/client.rs b/crates/fetchkit/src/client.rs index 73df494..d568483 100644 --- a/crates/fetchkit/src/client.rs +++ b/crates/fetchkit/src/client.rs @@ -201,6 +201,10 @@ pub async fn fetch_with_options( } req.normalize_url_for_fetch()?; + if req.wants_crawl() { + return crate::crawl::crawl_fetch_with_options(req, options).await; + } + // Use registry with default fetchers let registry = FetcherRegistry::with_defaults(); registry.fetch(req, options).await diff --git a/crates/fetchkit/src/crawl.rs b/crates/fetchkit/src/crawl.rs new file mode 100644 index 0000000..11410ae --- /dev/null +++ b/crates/fetchkit/src/crawl.rs @@ -0,0 +1,196 @@ +//! Bounded same-origin crawl discovery for agent workflows. +//! +//! DECISION: crawl is deliberately shallow and same-origin. Agents need a small +//! discovery map, not an unbounded spider that surprises operators. + +use crate::client::FetchOptions; +use crate::error::FetchError; +use crate::fetchers::FetcherRegistry; +use crate::types::{CrawlPage, CrawlResult, FetchRequest, FetchResponse}; +use std::collections::HashSet; +use url::Url; + +pub(crate) const DEFAULT_CRAWL_MAX_PAGES: usize = 5; +pub(crate) const MAX_CRAWL_MAX_PAGES: usize = 20; + +pub(crate) async fn crawl_fetch_with_options( + mut request: FetchRequest, + options: FetchOptions, +) -> Result { + request.normalize_url_for_fetch()?; + let max_pages = request + .max_pages + .unwrap_or(DEFAULT_CRAWL_MAX_PAGES) + .clamp(1, MAX_CRAWL_MAX_PAGES); + + let registry = FetcherRegistry::with_defaults(); + let mut seed_request = request.clone(); + seed_request.crawl = None; + seed_request.max_pages = None; + if seed_request.as_markdown.is_none() { + seed_request.as_markdown = Some(true); + } + if seed_request.content_focus.is_none() { + seed_request.content_focus = Some("agent".to_string()); + } + + let mut seed_response = registry.fetch(seed_request, options.clone()).await?; + let discovery_base_url = + Url::parse(&seed_response.url).map_err(|_| FetchError::InvalidUrlScheme)?; + let mut pages = vec![page_from_response(&seed_response)]; + let mut seen = HashSet::from([canonical_url_key(&seed_response.url)]); + let mut candidates = discover_same_origin_links(&seed_response, &discovery_base_url); + candidates.retain(|candidate| seen.insert(canonical_url_key(candidate.as_str()))); + + let truncated = candidates.len() + pages.len() > max_pages; + for url in candidates.into_iter().take(max_pages.saturating_sub(1)) { + let mut page_request = FetchRequest::new(url.as_str()).as_markdown(); + page_request.content_focus = Some("agent".to_string()); + + match registry.fetch(page_request, options.clone()).await { + Ok(response) => pages.push(page_from_response(&response)), + Err(err) => pages.push(CrawlPage { + url: url.to_string(), + error: Some(err.to_string()), + ..Default::default() + }), + } + } + + seed_response.crawl = Some(CrawlResult { + seed_url: request.url, + max_pages, + pages, + truncated: if truncated { Some(true) } else { None }, + }); + Ok(seed_response) +} + +fn page_from_response(response: &FetchResponse) -> CrawlPage { + let metadata = response.metadata.as_ref(); + CrawlPage { + url: response.url.clone(), + status_code: Some(response.status_code), + title: metadata.and_then(|meta| meta.title.clone()), + description: metadata.and_then(|meta| meta.description.clone()), + content_type: response.content_type.clone(), + word_count: response.word_count, + quality_score: response.quality.as_ref().map(|quality| quality.score), + error: response.error.clone(), + } +} + +fn discover_same_origin_links(response: &FetchResponse, seed_url: &Url) -> Vec { + let Some(metadata) = response.metadata.as_ref() else { + return Vec::new(); + }; + + let mut urls = Vec::new(); + for link in &metadata.links { + let Ok(url) = seed_url.join(&link.href) else { + continue; + }; + if is_same_origin(seed_url, &url) && is_fetchable_page_url(&url) { + urls.push(url); + } + } + urls +} + +fn is_same_origin(left: &Url, right: &Url) -> bool { + left.scheme() == right.scheme() + && normalized_host(left) == normalized_host(right) + && left.port_or_known_default() == right.port_or_known_default() +} + +fn normalized_host(url: &Url) -> Option { + url.host_str() + .map(|host| host.trim_end_matches('.').to_ascii_lowercase()) +} + +fn is_fetchable_page_url(url: &Url) -> bool { + if url.scheme() != "http" && url.scheme() != "https" { + return false; + } + + let path = url.path().to_ascii_lowercase(); + ![ + ".avif", ".css", ".gif", ".ico", ".jpeg", ".jpg", ".js", ".pdf", ".png", ".svg", ".webp", + ".zip", + ] + .iter() + .any(|suffix| path.ends_with(suffix)) +} + +fn canonical_url_key(raw_url: &str) -> String { + let Ok(mut url) = Url::parse(raw_url) else { + return raw_url.to_string(); + }; + url.set_fragment(None); + url.to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dns::DnsPolicy; + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + #[tokio::test] + async fn test_crawl_discovers_same_origin_pages() { + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/")) + .respond_with( + ResponseTemplate::new(200) + .insert_header("content-type", "text/html") + .set_body_string( + r#"Home + Docs + Outside + Image + "#, + ), + ) + .mount(&server) + .await; + Mock::given(method("GET")) + .and(path("/docs")) + .respond_with(ResponseTemplate::new(200).insert_header("content-type", "text/html").set_body_string( + r#"Docs

Useful documentation content for agents to inspect and summarize.

"#, + )) + .mount(&server) + .await; + + let options = FetchOptions { + enable_markdown: true, + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let response = crawl_fetch_with_options( + FetchRequest::new(server.uri()).crawl(true).max_pages(5), + options, + ) + .await + .unwrap(); + + let crawl = response.crawl.unwrap(); + assert_eq!(crawl.pages.len(), 2); + assert!(crawl + .pages + .iter() + .any(|page| page.url.ends_with("/docs") && page.title.as_deref() == Some("Docs"))); + assert_eq!(crawl.truncated, None); + } + + #[test] + fn test_is_fetchable_page_url_skips_assets() { + assert!(is_fetchable_page_url( + &Url::parse("https://example.com/docs").unwrap() + )); + assert!(!is_fetchable_page_url( + &Url::parse("https://example.com/app.js").unwrap() + )); + } +} diff --git a/crates/fetchkit/src/lib.rs b/crates/fetchkit/src/lib.rs index e803b8e..81c4325 100644 --- a/crates/fetchkit/src/lib.rs +++ b/crates/fetchkit/src/lib.rs @@ -77,6 +77,7 @@ pub mod bot_auth; pub mod client; mod convert; +mod crawl; mod dns; mod error; pub mod fetchers; @@ -106,7 +107,10 @@ pub use transport::{ BodyStream, HttpTransport, ReqwestTransport, TransportError, TransportMethod, TransportRequest, TransportResponse, }; -pub use types::{FetchRequest, FetchResponse, HttpMethod, PageLink, PageMetadata, PageQuality}; +pub use types::{ + CrawlPage, CrawlResult, FetchRequest, FetchResponse, HttpMethod, PageLink, PageMetadata, + PageQuality, +}; #[cfg(feature = "bot-auth")] pub use bot_auth::{BotAuthConfig, BotAuthError}; diff --git a/crates/fetchkit/src/tool.rs b/crates/fetchkit/src/tool.rs index b8298f1..dfe39e4 100644 --- a/crates/fetchkit/src/tool.rs +++ b/crates/fetchkit/src/tool.rs @@ -754,7 +754,7 @@ fn validate_args(tool: &Tool, args: &Value) -> Result<(), ToolError> { "as_markdown" => tool.enable_markdown, "as_text" => tool.enable_text, "save_to_file" => tool.enable_save_to_file, - "if_none_match" | "if_modified_since" => true, + "content_focus" | "if_none_match" | "if_modified_since" | "crawl" | "max_pages" => true, _ => false, }; @@ -830,6 +830,15 @@ fn build_input_schema( ); } + properties.insert( + "content_focus".to_string(), + json!({ + "type": "string", + "enum": ["full", "main", "readable", "agent"], + "default": "full", + "description": "Extraction focus. Use agent for low-noise AI-agent content." + }), + ); properties.insert( "if_none_match".to_string(), json!({ @@ -844,6 +853,24 @@ fn build_input_schema( "description": "Last-Modified value for conditional requests (If-Modified-Since header)" }), ); + properties.insert( + "crawl".to_string(), + json!({ + "type": "boolean", + "default": false, + "description": "Fetch the seed URL, then discover and fetch a bounded set of same-origin pages." + }), + ); + properties.insert( + "max_pages".to_string(), + json!({ + "type": "integer", + "minimum": 1, + "maximum": crate::crawl::MAX_CRAWL_MAX_PAGES, + "default": crate::crawl::DEFAULT_CRAWL_MAX_PAGES, + "description": "Maximum pages for crawl discovery, including the seed page." + }), + ); json!({ "type": "object", @@ -872,6 +899,8 @@ fn build_output_schema() -> Value { "saved_path": {"type": "string"}, "bytes_written": {"type": "integer", "minimum": 0}, "word_count": {"type": "integer", "minimum": 0}, + "quality": {"type": "object"}, + "crawl": {"type": "object"}, "redirect_chain": {"type": "array", "items": {"type": "string"}}, "is_paywall": {"type": "boolean"} }, @@ -1012,6 +1041,20 @@ fn build_help(tool: &Tool) -> String { "\"full\"", parameter_description(tool.locale(), "content_focus"), )); + rows.push(table_row( + "crawl", + "boolean", + "no", + "false", + parameter_description(tool.locale(), "crawl"), + )); + rows.push(table_row( + "max_pages", + "integer", + "no", + "5", + parameter_description(tool.locale(), "max_pages"), + )); let adapters = if tool.enable_save_to_file { if is_ukrainian(tool.locale()) { @@ -1100,12 +1143,16 @@ fn parameter_description(locale: &str, field: &str) -> &'static str { (true, "as_text") => "Перетворити HTML у plain text", (true, "save_to_file") => "Шлях призначення, визначений адаптером", (true, "content_focus") => "`full`, `main`, `readable`, або `agent`", + (true, "crawl") => "Обмежене same-origin виявлення сторінок для агентів", + (true, "max_pages") => "Максимум сторінок для crawl, включно з початковою", (false, "url") => "HTTP/HTTPS URL, or a bare domain URL normalized to `https://`", (false, "method") => "`GET` or `HEAD`", (false, "as_markdown") => "Convert HTML to markdown", (false, "as_text") => "Convert HTML to plain text", (false, "save_to_file") => "Adapter-defined destination path", (false, "content_focus") => "`full`, `main`, `readable`, or `agent`", + (false, "crawl") => "Bounded same-origin page discovery for agents", + (false, "max_pages") => "Maximum crawl pages, including the seed", _ => "", } } @@ -1302,11 +1349,16 @@ mod tests { .unwrap() .contains("bare domain URL")); assert_eq!(input_schema["properties"]["method"]["default"], "GET"); + assert!(input_schema["properties"]["content_focus"].is_object()); + assert!(input_schema["properties"]["crawl"].is_object()); + assert!(input_schema["properties"]["max_pages"].is_object()); assert!(input_schema["properties"]["if_none_match"].is_object()); assert!(input_schema["properties"]["if_modified_since"].is_object()); assert!(output_schema["properties"]["url"].is_object()); assert!(output_schema["properties"]["status_code"].is_object()); assert!(output_schema["properties"]["word_count"].is_object()); + assert!(output_schema["properties"]["quality"].is_object()); + assert!(output_schema["properties"]["crawl"].is_object()); assert!(output_schema["properties"]["redirect_chain"].is_object()); assert!(output_schema["properties"]["is_paywall"].is_object()); assert!(output_schema["properties"]["etag"].is_object()); @@ -1360,6 +1412,17 @@ mod tests { assert!(ok.is_ok()); } + #[test] + fn test_execution_accepts_agent_crawl_arguments() { + let ok = Tool::default().execution(json!({ + "url": "https://example.com", + "content_focus": "agent", + "crawl": true, + "max_pages": 3 + })); + assert!(ok.is_ok()); + } + #[test] fn test_execution_rejects_invalid_url_before_running() { let err = Tool::default().execution(json!({"url": "ftp://example.com"})); diff --git a/crates/fetchkit/src/types.rs b/crates/fetchkit/src/types.rs index 78d6967..78ed257 100644 --- a/crates/fetchkit/src/types.rs +++ b/crates/fetchkit/src/types.rs @@ -112,6 +112,14 @@ pub struct FetchRequest { /// When set, the server may return 304 Not Modified if content unchanged. #[serde(default, skip_serializing_if = "Option::is_none")] pub if_modified_since: Option, + + /// Discover same-origin links after fetching the seed URL. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub crawl: Option, + + /// Maximum pages to fetch when crawl discovery is enabled, including the seed. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_pages: Option, } impl FetchRequest { @@ -178,6 +186,18 @@ impl FetchRequest { self } + /// Enable bounded same-origin crawl discovery. + pub fn crawl(mut self, enable: bool) -> Self { + self.crawl = Some(enable); + self + } + + /// Set max pages for crawl discovery, including the seed page. + pub fn max_pages(mut self, max_pages: usize) -> Self { + self.max_pages = Some(max_pages); + self + } + /// Get the effective method (default to GET) pub fn effective_method(&self) -> HttpMethod { self.method.unwrap_or_default() @@ -216,6 +236,11 @@ impl FetchRequest { .map(|f| f.eq_ignore_ascii_case("agent")) .unwrap_or(false) } + + /// Check if bounded crawl discovery is requested. + pub fn wants_crawl(&self) -> bool { + self.crawl.unwrap_or(false) + } } fn canonical_fetch_url(raw_url: &str) -> Result { @@ -364,6 +389,59 @@ pub struct PageQuality { pub suggested_next_action: Option, } +/// Summary for one page visited by bounded crawl discovery. +#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] +pub struct CrawlPage { + /// Final page URL. + pub url: String, + + /// HTTP status code, when the page was fetched. + #[serde(skip_serializing_if = "Option::is_none")] + pub status_code: Option, + + /// Page title from metadata, when available. + #[serde(skip_serializing_if = "Option::is_none")] + pub title: Option, + + /// Page description from metadata, when available. + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + + /// Content-Type header, when available. + #[serde(skip_serializing_if = "Option::is_none")] + pub content_type: Option, + + /// Word count of returned content, when available. + #[serde(skip_serializing_if = "Option::is_none")] + pub word_count: Option, + + /// Agent quality score, when available. + #[serde(skip_serializing_if = "Option::is_none")] + pub quality_score: Option, + + /// Error for this crawl page, when fetching failed. + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, +} + +/// Bounded same-origin crawl discovery summary. +#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] +pub struct CrawlResult { + /// Seed URL requested by the caller. + pub seed_url: String, + + /// Maximum page budget used for this crawl, including the seed. + pub max_pages: usize, + + /// Pages visited or attempted, in discovery order. + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub pages: Vec, + + /// True when more same-origin candidates existed than the page budget allowed. + #[serde(skip_serializing_if = "Option::is_none")] + pub truncated: Option, +} + /// Response from a fetch operation /// /// Contains the fetched content along with metadata like status code, @@ -450,6 +528,10 @@ pub struct FetchResponse { #[serde(skip_serializing_if = "Option::is_none")] pub quality: Option, + /// Bounded same-origin crawl discovery result + #[serde(skip_serializing_if = "Option::is_none")] + pub crawl: Option, + /// Word count of the final content #[serde(skip_serializing_if = "Option::is_none")] pub word_count: Option, diff --git a/specs/initial.md b/specs/initial.md index 08ef64d..54bda3d 100644 --- a/specs/initial.md +++ b/specs/initial.md @@ -20,7 +20,8 @@ that expose the same tool contract. - Provide a reusable library API and a CLI wrapper. - Provide an MCP server exposing the tool. - Provide Python bindings that expose the same tool contract. -- No crawling, no JS execution, no cookies, no auth. +- No JS execution, no cookies, no auth. Crawl support is bounded same-origin + discovery only. ### Library @@ -77,6 +78,8 @@ Provide a builder to configure tool options, including: - `"agent"` selects the best low-noise strategy for AI agents, currently readable-first then `"main"` - `if_none_match: Option` (sets `If-None-Match` for conditional requests) - `if_modified_since: Option` (sets `If-Modified-Since` for conditional requests) + - `crawl: Option` enables bounded same-origin discovery after fetching the seed URL + - `max_pages: Option` caps crawl discovery, including the seed page (default 5, max 20) - `HttpMethod` enum: `Get`, `Head` - Case-insensitive parser accepts only GET/HEAD. - `FetchResponse` @@ -97,6 +100,7 @@ Provide a builder to configure tool options, including: - `metadata: Option` (structured page metadata; populated for HTML) - `quality: Option` (agent-facing quality score, warnings, link density, extraction method, and suggested next action) + - `crawl: Option` (bounded same-origin discovery result) - `word_count: Option` (word count of final content) - `redirect_chain: Vec` (URLs followed during redirects; empty if none) - `is_paywall: Option` (heuristic paywall signal; not guaranteed) @@ -124,6 +128,20 @@ Provide a builder to configure tool options, including: `retry_with_browser_rendering`, `authenticate_or_use_browser`, `try_alternate_source`, `retry_with_larger_limit_or_narrower_scope`, `retry_with_agent_focus_or_crawl`, `check_url_or_retry_later`, or `use_save_to_file`) +- `CrawlResult` + - `seed_url: String` + - `max_pages: usize` + - `pages: Vec` in discovery order + - `truncated: Option` when more candidates existed than the page budget allowed +- `CrawlPage` + - `url: String` + - `status_code: Option` + - `title: Option` + - `description: Option` + - `content_type: Option` + - `word_count: Option` + - `quality_score: Option` + - `error: Option` - `FetchError` enum - Missing url - Invalid url scheme @@ -157,6 +175,9 @@ Provide a builder to configure tool options, including: - `--user-agent ` (optional, overrides default User-Agent) - `--hardened` (optional, applies the hardened outbound policy profile) - `--allow-env-proxy` (optional, opt in to `HTTP_PROXY` / `HTTPS_PROXY` / `NO_PROXY`) + - `--content-focus ` (optional, extraction focus) + - `--crawl` (optional, bounded same-origin discovery) + - `--max-pages ` (optional, crawl page cap, default 5, max 20) - `--help` (standard help) - MCP subcommand options: - `--hardened` (optional, applies the hardened outbound policy profile) @@ -168,8 +189,9 @@ Provide a builder to configure tool options, including: - Markdown with YAML frontmatter containing metadata - Frontmatter fields: `url`, `status_code`, `source_content_type`, `source_size`, `last_modified`, `filename`, `truncated`, `quality_score`, `quality_warnings`, - `extraction_method`, `suggested_next_action` + `extraction_method`, `suggested_next_action`, `crawl_pages`, `crawl_truncated` - Content follows frontmatter (markdown-converted HTML or error message) + - When `crawl` is present, append a `Crawl Discovery` markdown list after content - Output format (`json`): - JSON-serialized `FetchResponse` to stdout - Exit code: non-zero for `FetchError`. @@ -203,6 +225,19 @@ Provide a builder to configure tool options, including: - Exact host and hostname suffix block rules (if configured) are applied before DNS resolution. - If one or more allowed ports are configured, the URL port must match one of them. +### Crawl Discovery + +- Crawl discovery is opt-in via `crawl: true` or CLI `--crawl`. +- FetchKit first fetches the seed URL normally, then inspects extracted page links. +- Only same-origin HTTP(S) links are eligible: same scheme, normalized host, and port. +- Obvious static assets (`.js`, `.css`, images, PDFs, archives) are skipped. +- `max_pages` includes the seed page, defaults to 5, and is clamped to 20. +- Each discovered page is fetched with markdown conversion and `content_focus="agent"`. +- Fetch errors for discovered pages are captured in that page's `error`; they do not fail + the whole crawl when the seed fetch succeeded. +- URL allow/block lists, host/port policy, DNS policy, redirect validation, body caps, and + timeout behavior apply to every discovered page. + ### SSRF Prevention (DNS Policy) By default, FetchKit blocks connections to private/reserved IP ranges: