Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ AI-friendly web content fetching tool designed for LLM consumption. Rust library
- **Pluggable fetchers** - URL-aware dispatch to specialized handlers for repos, docs, feeds, videos, papers, and more
- **HTML-to-Markdown** - Built-in conversion optimized for LLMs, with fetched relative links/images resolved to absolute URLs
- **Agent content focus** - Optional low-noise extraction mode for AI agents
- **Crawl discovery** - Optional bounded same-origin page discovery for AI agents
- **HTML-to-Text** - Plain text extraction with clean formatting
- **Binary detection** - Returns metadata only for images, PDFs, etc.
- **Timeout handling** - 1s first-byte, 30s body with partial content on timeout
Expand Down Expand Up @@ -72,6 +73,9 @@ fetchkit fetch https://example.com --user-agent "MyBot/1.0"
# Hardened outbound policy for cluster/data-plane use
fetchkit fetch https://example.com --hardened

# Discover a small same-origin page map for an agent
fetchkit fetch https://example.com --content-focus agent --crawl --max-pages 5

# Show full documentation
fetchkit --llmtxt
```
Expand Down Expand Up @@ -216,6 +220,8 @@ response = tool.fetch("https://example.com")
| `as_text` | bool? | Convert HTML to plain text |
| `save_to_file` | string? | Save body to path (requires `FileSaver`) |
| `content_focus` | string? | `"full"`/unset returns everything; `"main"` strips semantic boilerplate; `"readable"` selects article-like content; `"agent"` selects the best low-noise strategy for AI agents |
| `crawl` | bool? | Fetch the seed URL, then discover and fetch a bounded set of same-origin pages (limit set by `max_pages`) |
| `max_pages` | int? | Maximum crawl pages, including the seed; default 5, max 20 |
| `if_none_match` | string? | ETag for conditional `If-None-Match` |
| `if_modified_since` | string? | Timestamp for conditional `If-Modified-Since` |

Expand All @@ -239,6 +245,7 @@ response = tool.fetch("https://example.com")
| `bytes_written` | int? | Bytes saved to file |
| `metadata` | object? | Structured `PageMetadata` (title, description, links, headings, extraction method, …) |
| `quality` | object? | Agent-facing `PageQuality` (score, warnings, link density, suggested next action) |
| `crawl` | object? | Bounded crawl discovery result with visited page summaries |
| `word_count` | int? | Word count of returned content |
| `redirect_chain` | string[] | URLs visited during redirects (empty if none) |
| `is_paywall` | bool? | Heuristic paywall signal (soft, not guaranteed) |
Expand Down
129 changes: 113 additions & 16 deletions crates/fetchkit-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
mod mcp;

use clap::{Parser, Subcommand, ValueEnum};
use fetchkit::{FetchRequest, Tool, TOOL_LLMTXT};
use fetchkit::{CrawlPage, FetchRequest, Tool, TOOL_LLMTXT};
use std::io::{self, Write};

/// Output format for fetch subcommand
Expand Down Expand Up @@ -88,6 +88,18 @@ enum Commands {
/// Agent FQDN for Signature-Agent header (requires --bot-auth-key)
#[arg(long)]
bot_auth_agent: Option<String>,

/// Extraction focus: full, main, readable, or agent
#[arg(long)]
content_focus: Option<String>,

/// Discover and fetch a bounded set of same-origin pages
#[arg(long)]
crawl: bool,

/// Maximum crawl pages, including the seed
#[arg(long, default_value_t = 5)]
max_pages: usize,
},
}

Expand Down Expand Up @@ -125,17 +137,22 @@ async fn main() {
allow_env_proxy,
bot_auth_key,
bot_auth_agent,
content_focus,
crawl,
max_pages,
}) => {
run_fetch(
&url,
let options = FetchCommandOptions {
output,
user_agent,
hardened,
allow_env_proxy,
bot_auth_key,
bot_auth_agent,
)
.await;
content_focus,
crawl,
max_pages,
};
run_fetch(&url, options).await;
}
None => {
eprintln!("Usage: fetchkit fetch <URL>");
Expand Down Expand Up @@ -192,28 +209,38 @@ fn build_tool(
builder.build()
}

async fn run_fetch(
url: &str,
struct FetchCommandOptions {
output: OutputFormat,
user_agent: Option<String>,
hardened: bool,
allow_env_proxy: bool,
bot_auth_key: Option<String>,
bot_auth_agent: Option<String>,
) {
content_focus: Option<String>,
crawl: bool,
max_pages: usize,
}

async fn run_fetch(url: &str, options: FetchCommandOptions) {
// Build request with markdown conversion
let request = FetchRequest::new(url).as_markdown();
let mut request = FetchRequest::new(url).as_markdown();
if let Some(focus) = options.content_focus {
request = request.content_focus(focus);
}
if options.crawl {
request = request.crawl(true).max_pages(options.max_pages);
}
let tool = build_tool(
user_agent,
hardened,
allow_env_proxy,
bot_auth_key,
bot_auth_agent,
options.user_agent,
options.hardened,
options.allow_env_proxy,
options.bot_auth_key,
options.bot_auth_agent,
);

// Execute request
match tool.execute(request).await {
Ok(response) => match output {
Ok(response) => match options.output {
OutputFormat::Md => print_md_with_frontmatter(&response),
OutputFormat::Json => {
let json = serde_json::to_string_pretty(&response).unwrap_or_else(|e| {
Expand Down Expand Up @@ -277,6 +304,12 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String {
output.push_str(&format!("suggested_next_action: {}\n", yaml_quote(action)));
}
}
if let Some(ref crawl) = response.crawl {
output.push_str(&format!("crawl_pages: {}\n", crawl.pages.len()));
if crawl.truncated.unwrap_or(false) {
output.push_str("crawl_truncated: true\n");
}
}
output.push_str("---\n");

// Append content, or error as body for unsupported content
Expand All @@ -285,10 +318,43 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String {
} else if let Some(ref err) = response.error {
output.push_str(err);
}
append_crawl_summary(&mut output, response);

output
}

/// Appends a `## Crawl Discovery` Markdown section to `output`, with one
/// bullet per page in the response's crawl result.
///
/// Leaves `output` untouched when the response carries no crawl result or
/// the crawl result has no pages.
fn append_crawl_summary(output: &mut String, response: &fetchkit::FetchResponse) {
    let pages = match response.crawl.as_ref() {
        Some(crawl) if !crawl.pages.is_empty() => &crawl.pages,
        _ => return,
    };

    output.push_str("\n\n## Crawl Discovery\n\n");
    for entry in pages {
        output.push_str("- ");
        output.push_str(&format_crawl_page(entry));
        output.push('\n');
    }
}

/// Renders one crawled page as a single-line Markdown summary:
/// `[label](url) - status S, quality Q, N words, error: E`.
///
/// The label is the page title, or the URL when the title is missing or
/// blank. Each detail is optional; details are joined with `", "` and the
/// `" - "` separator is emitted only when at least one detail is present, so
/// a page without a status code no longer renders with a dangling comma.
fn format_crawl_page(page: &CrawlPage) -> String {
    /// Makes `text` safe to use as Markdown link text on a bullet line:
    /// drops square brackets and collapses all whitespace runs (including
    /// newlines) into single spaces.
    fn link_label(text: &str) -> String {
        text.replace(['[', ']'], "")
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
    }

    // A title that is empty after sanitizing would produce an invisible
    // link, so fall back to the URL in that case as well.
    let label = page
        .title
        .as_deref()
        .map(link_label)
        .filter(|clean| !clean.is_empty())
        .unwrap_or_else(|| link_label(page.url.as_str()));

    let mut details: Vec<String> = Vec::with_capacity(4);
    if let Some(status_code) = page.status_code {
        details.push(format!("status {status_code}"));
    }
    if let Some(score) = page.quality_score {
        details.push(format!("quality {score:.2}"));
    }
    if let Some(word_count) = page.word_count {
        details.push(format!("{word_count} words"));
    }
    if let Some(ref error) = page.error {
        details.push(format!("error: {error}"));
    }

    let mut summary = format!("[{label}]({})", page.url);
    if !details.is_empty() {
        summary.push_str(" - ");
        summary.push_str(&details.join(", "));
    }
    summary
}

/// Write to stdout, exit silently on broken pipe
fn writeln_safe(s: &str) {
let stdout = io::stdout();
Expand All @@ -305,7 +371,7 @@ fn writeln_safe(s: &str) {
#[cfg(test)]
mod tests {
use super::*;
use fetchkit::{FetchResponse, PageQuality};
use fetchkit::{CrawlPage, CrawlResult, FetchResponse, PageQuality};

#[test]
fn test_format_md_basic() {
Expand Down Expand Up @@ -408,4 +474,35 @@ mod tests {
assert!(output.contains("filename: \"*alias\"\n"));
assert!(!output.contains("\nforged: true\n"));
}

// A response carrying a crawl result must surface the page count and the
// truncation flag in the front matter, and list each page under a
// "Crawl Discovery" heading after the body.
#[test]
fn test_format_md_includes_crawl_summary() {
    let docs_page = CrawlPage {
        url: String::from("https://example.com/docs"),
        status_code: Some(200),
        title: Some(String::from("Docs")),
        word_count: Some(42),
        quality_score: Some(0.91),
        ..Default::default()
    };
    let crawl = CrawlResult {
        seed_url: String::from("https://example.com"),
        max_pages: 2,
        pages: vec![docs_page],
        truncated: Some(true),
    };
    let response = FetchResponse {
        url: String::from("https://example.com"),
        status_code: 200,
        content: Some(String::from("# Home")),
        crawl: Some(crawl),
        ..Default::default()
    };

    let rendered = format_md_with_frontmatter(&response);

    let expected_fragments = [
        "crawl_pages: 1\n",
        "crawl_truncated: true\n",
        "## Crawl Discovery",
        "[Docs](https://example.com/docs) - status 200, quality 0.91, 42 words",
    ];
    for fragment in expected_fragments {
        assert!(rendered.contains(fragment));
    }
}
}
72 changes: 71 additions & 1 deletion crates/fetchkit-cli/src/mcp.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! MCP (Model Context Protocol) server implementation

use fetchkit::Tool;
use fetchkit::{CrawlPage, Tool};
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::io::{self, BufRead, Write};
Expand Down Expand Up @@ -230,6 +230,12 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String {
output.push_str(&format!("suggested_next_action: {}\n", yaml_quote(action)));
}
}
if let Some(ref crawl) = response.crawl {
output.push_str(&format!("crawl_pages: {}\n", crawl.pages.len()));
if crawl.truncated.unwrap_or(false) {
output.push_str("crawl_truncated: true\n");
}
}
output.push_str("---\n");

// Append content, or error as body for unsupported content
Expand All @@ -238,10 +244,43 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String {
} else if let Some(ref err) = response.error {
output.push_str(err);
}
append_crawl_summary(&mut output, response);

output
}

/// Appends a `## Crawl Discovery` Markdown section to `output`, with one
/// bullet per page in the response's crawl result.
///
/// Leaves `output` untouched when the response carries no crawl result or
/// the crawl result has no pages.
fn append_crawl_summary(output: &mut String, response: &fetchkit::FetchResponse) {
    let pages = match response.crawl.as_ref() {
        Some(crawl) if !crawl.pages.is_empty() => &crawl.pages,
        _ => return,
    };

    output.push_str("\n\n## Crawl Discovery\n\n");
    for entry in pages {
        output.push_str("- ");
        output.push_str(&format_crawl_page(entry));
        output.push('\n');
    }
}

/// Renders one crawled page as a single-line Markdown summary:
/// `[label](url) - status S, quality Q, N words, error: E`.
///
/// The label is the page title, or the URL when the title is missing or
/// blank. Each detail is optional; details are joined with `", "` and the
/// `" - "` separator is emitted only when at least one detail is present, so
/// a page without a status code no longer renders with a dangling comma.
fn format_crawl_page(page: &CrawlPage) -> String {
    /// Makes `text` safe to use as Markdown link text on a bullet line:
    /// drops square brackets and collapses all whitespace runs (including
    /// newlines) into single spaces.
    fn link_label(text: &str) -> String {
        text.replace(['[', ']'], "")
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
    }

    // A title that is empty after sanitizing would produce an invisible
    // link, so fall back to the URL in that case as well.
    let label = page
        .title
        .as_deref()
        .map(link_label)
        .filter(|clean| !clean.is_empty())
        .unwrap_or_else(|| link_label(page.url.as_str()));

    let mut details: Vec<String> = Vec::with_capacity(4);
    if let Some(status_code) = page.status_code {
        details.push(format!("status {status_code}"));
    }
    if let Some(score) = page.quality_score {
        details.push(format!("quality {score:.2}"));
    }
    if let Some(word_count) = page.word_count {
        details.push(format!("{word_count} words"));
    }
    if let Some(ref error) = page.error {
        details.push(format!("error: {error}"));
    }

    let mut summary = format!("[{label}]({})", page.url);
    if !details.is_empty() {
        summary.push_str(" - ");
        summary.push_str(&details.join(", "));
    }
    summary
}

/// Encodes `value` as a double-quoted, escaped string by serializing it with
/// `serde_json::to_string`; the result is used as a scalar in the front matter.
///
/// Returns the empty quoted string `""` if serialization fails.
fn yaml_quote(value: &str) -> String {
    match serde_json::to_string(value) {
        Ok(quoted) => quoted,
        Err(_) => String::from("\"\""),
    }
}
Expand Down Expand Up @@ -315,4 +354,35 @@ mod tests {
assert!(output.contains("extraction_method: \"agent_main\"\n"));
assert!(output.contains("suggested_next_action: \"retry_with_agent_focus_or_crawl\"\n"));
}

// A response carrying a crawl result must surface the page count and the
// truncation flag in the front matter, and list each page under a
// "Crawl Discovery" heading after the body.
#[test]
fn test_format_md_includes_crawl_summary() {
    let docs_page = fetchkit::CrawlPage {
        url: String::from("https://example.com/docs"),
        status_code: Some(200),
        title: Some(String::from("Docs")),
        word_count: Some(42),
        quality_score: Some(0.91),
        ..Default::default()
    };
    let crawl = fetchkit::CrawlResult {
        seed_url: String::from("https://example.com"),
        max_pages: 2,
        pages: vec![docs_page],
        truncated: Some(true),
    };
    let response = fetchkit::FetchResponse {
        url: String::from("https://example.com"),
        status_code: 200,
        content: Some(String::from("# Home")),
        crawl: Some(crawl),
        ..Default::default()
    };

    let rendered = format_md_with_frontmatter(&response);

    let expected_fragments = [
        "crawl_pages: 1\n",
        "crawl_truncated: true\n",
        "## Crawl Discovery",
        "[Docs](https://example.com/docs) - status 200, quality 0.91, 42 words",
    ];
    for fragment in expected_fragments {
        assert!(rendered.contains(fragment));
    }
}
}
3 changes: 3 additions & 0 deletions crates/fetchkit-cli/tests/cli_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ fn test_fetch_help_lists_hardening_flags() {
assert!(output.status.success());
assert!(stdout.contains("--hardened"));
assert!(stdout.contains("--allow-env-proxy"));
assert!(stdout.contains("--content-focus"));
assert!(stdout.contains("--crawl"));
assert!(stdout.contains("--max-pages"));
}

#[test]
Expand Down
Loading
Loading