diff --git a/README.md b/README.md index 76cb0b9..c5f288f 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,8 @@ url: https://example.com status_code: 200 source_content_type: text/html; charset=UTF-8 source_size: 1256 +quality_score: 1.00 +extraction_method: "full" --- # Example Domain @@ -236,6 +238,7 @@ response = tool.fetch("https://example.com") | `saved_path` | string? | Filesystem path when `save_to_file` succeeded | | `bytes_written` | int? | Bytes saved to file | | `metadata` | object? | Structured `PageMetadata` (title, description, links, headings, extraction method, …) | +| `quality` | object? | Agent-facing `PageQuality` (score, warnings, link density, suggested next action) | | `word_count` | int? | Word count of returned content | | `redirect_chain` | string[] | URLs visited during redirects (empty if none) | | `is_paywall` | bool? | Heuristic paywall signal (soft, not guaranteed) | diff --git a/crates/fetchkit-cli/src/main.rs b/crates/fetchkit-cli/src/main.rs index 734339f..d871e25 100644 --- a/crates/fetchkit-cli/src/main.rs +++ b/crates/fetchkit-cli/src/main.rs @@ -263,6 +263,20 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String { output.push_str("truncated: true\n"); } } + if let Some(ref quality) = response.quality { + output.push_str(&format!("quality_score: {:.2}\n", quality.score)); + if !quality.warnings.is_empty() { + let warnings = + serde_json::to_string(&quality.warnings).unwrap_or_else(|_| "[]".to_string()); + output.push_str(&format!("quality_warnings: {}\n", warnings)); + } + if let Some(ref method) = quality.extraction_method { + output.push_str(&format!("extraction_method: {}\n", yaml_quote(method))); + } + if let Some(ref action) = quality.suggested_next_action { + output.push_str(&format!("suggested_next_action: {}\n", yaml_quote(action))); + } + } output.push_str("---\n"); // Append content, or error as body for unsupported content @@ -291,7 +305,7 @@ fn writeln_safe(s: &str) { #[cfg(test)] mod tests 
{ use super::*; - use fetchkit::FetchResponse; + use fetchkit::{FetchResponse, PageQuality}; #[test] fn test_format_md_basic() { @@ -322,6 +336,13 @@ mod tests { last_modified: Some("Wed, 01 Jan 2025 00:00:00 GMT".to_string()), filename: Some("page.html".to_string()), truncated: Some(true), + quality: Some(PageQuality { + score: 0.72, + warnings: vec!["low_content".to_string()], + extraction_method: Some("agent_main".to_string()), + suggested_next_action: Some("retry_with_agent_focus_or_crawl".to_string()), + ..Default::default() + }), content: Some("Content here".to_string()), ..Default::default() }; @@ -332,6 +353,10 @@ mod tests { assert!(output.contains("last_modified: \"Wed, 01 Jan 2025 00:00:00 GMT\"\n")); assert!(output.contains("filename: \"page.html\"\n")); assert!(output.contains("truncated: true\n")); + assert!(output.contains("quality_score: 0.72\n")); + assert!(output.contains("quality_warnings: [\"low_content\"]\n")); + assert!(output.contains("extraction_method: \"agent_main\"\n")); + assert!(output.contains("suggested_next_action: \"retry_with_agent_focus_or_crawl\"\n")); } #[test] diff --git a/crates/fetchkit-cli/src/mcp.rs b/crates/fetchkit-cli/src/mcp.rs index 783382c..594a43c 100644 --- a/crates/fetchkit-cli/src/mcp.rs +++ b/crates/fetchkit-cli/src/mcp.rs @@ -216,6 +216,20 @@ fn format_md_with_frontmatter(response: &fetchkit::FetchResponse) -> String { output.push_str("truncated: true\n"); } } + if let Some(ref quality) = response.quality { + output.push_str(&format!("quality_score: {:.2}\n", quality.score)); + if !quality.warnings.is_empty() { + let warnings = + serde_json::to_string(&quality.warnings).unwrap_or_else(|_| "[]".to_string()); + output.push_str(&format!("quality_warnings: {}\n", warnings)); + } + if let Some(ref method) = quality.extraction_method { + output.push_str(&format!("extraction_method: {}\n", yaml_quote(method))); + } + if let Some(ref action) = quality.suggested_next_action { + 
output.push_str(&format!("suggested_next_action: {}\n", yaml_quote(action))); + } + } output.push_str("---\n"); // Append content, or error as body for unsupported content @@ -273,3 +287,32 @@ pub async fn run_server(tool: Tool) { let _ = stdout.flush(); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_format_md_includes_quality_frontmatter() { + let response = fetchkit::FetchResponse { + url: "https://example.com".to_string(), + status_code: 200, + quality: Some(fetchkit::PageQuality { + score: 0.4, + warnings: vec!["low_content".to_string()], + extraction_method: Some("agent_main".to_string()), + suggested_next_action: Some("retry_with_agent_focus_or_crawl".to_string()), + ..Default::default() + }), + content: Some("short".to_string()), + ..Default::default() + }; + + let output = format_md_with_frontmatter(&response); + + assert!(output.contains("quality_score: 0.40\n")); + assert!(output.contains("quality_warnings: [\"low_content\"]\n")); + assert!(output.contains("extraction_method: \"agent_main\"\n")); + assert!(output.contains("suggested_next_action: \"retry_with_agent_focus_or_crawl\"\n")); + } +} diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs index d5b0fc0..bf6c409 100644 --- a/crates/fetchkit/src/fetchers/default.rs +++ b/crates/fetchkit/src/fetchers/default.rs @@ -17,7 +17,7 @@ use crate::error::FetchError; use crate::fetchers::Fetcher; use crate::file_saver::FileSaver; use crate::transport::{BodyStream, TransportMethod, TransportRequest, TransportResponse}; -use crate::types::{FetchRequest, FetchResponse, HttpMethod}; +use crate::types::{FetchRequest, FetchResponse, HttpMethod, PageQuality}; use crate::DEFAULT_USER_AGENT; use async_trait::async_trait; use bytes::Bytes; @@ -312,6 +312,7 @@ impl Fetcher for DefaultFetcher { "Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched." 
.to_string(), ), + quality: Some(binary_quality_signal()), ..Default::default() }); } @@ -404,6 +405,14 @@ impl Fetcher for DefaultFetcher { if let (Some(metadata), Some(method)) = (&mut page_metadata, extraction_method) { metadata.extraction_method = Some(method.to_string()); } + let quality = compute_quality_signal( + &final_content, + status_code, + truncated, + is_paywall, + extraction_method, + word_count, + ); Ok(FetchResponse { url: final_url, @@ -417,6 +426,7 @@ impl Fetcher for DefaultFetcher { content: Some(final_content), truncated: if truncated { Some(true) } else { None }, metadata: page_metadata, + quality: Some(quality), word_count: Some(word_count), redirect_chain, is_paywall: if is_paywall { Some(true) } else { None }, @@ -788,6 +798,145 @@ fn count_words(text: &str) -> u64 { text.split_whitespace().count() as u64 } +fn binary_quality_signal() -> PageQuality { + PageQuality { + score: 0.0, + warnings: vec!["binary_content".to_string()], + suggested_next_action: Some("use_save_to_file".to_string()), + ..Default::default() + } +} + +fn compute_quality_signal( + content: &str, + status_code: u16, + truncated: bool, + is_paywall: bool, + extraction_method: Option<&str>, + word_count: u64, +) -> PageQuality { + let mut warnings = Vec::new(); + let mut score = 1.0f32; + let link_count = count_markdown_links(content); + let link_density = if word_count == 0 { + 0.0 + } else { + link_count as f32 / word_count as f32 + }; + let lower = content.to_lowercase(); + + if status_code >= 400 { + push_warning(&mut warnings, "http_error"); + score -= 0.35; + } + if truncated { + push_warning(&mut warnings, "truncated"); + score -= 0.20; + } + if word_count < 30 { + push_warning(&mut warnings, "low_content"); + score -= 0.30; + } + if link_count >= 20 && link_density > 0.15 { + push_warning(&mut warnings, "too_many_links"); + score -= 0.20; + } + if is_paywall { + push_warning(&mut warnings, "possible_paywall"); + score -= 0.25; + } + if looks_like_login_wall(&lower) 
{ + push_warning(&mut warnings, "possible_login_wall"); + score -= 0.25; + } + if looks_like_consent_wall(&lower) { + push_warning(&mut warnings, "possible_consent_wall"); + score -= 0.20; + } + if looks_like_javascript_required(&lower) { + push_warning(&mut warnings, "javascript_required"); + score -= 0.30; + } + + PageQuality { + score: score.clamp(0.0, 1.0), + suggested_next_action: suggested_next_action(&warnings).map(str::to_string), + warnings, + link_density: Some(link_density), + extraction_method: extraction_method.map(str::to_string), + } +} + +fn count_markdown_links(content: &str) -> usize { + content.matches("](").count() +} + +fn push_warning(warnings: &mut Vec<String>, warning: &str) { + if !warnings.iter().any(|existing| existing == warning) { + warnings.push(warning.to_string()); + } +} + +fn suggested_next_action(warnings: &[String]) -> Option<&'static str> { + if has_warning(warnings, "javascript_required") { + Some("retry_with_browser_rendering") + } else if has_warning(warnings, "possible_login_wall") { + Some("authenticate_or_use_browser") + } else if has_warning(warnings, "possible_paywall") { + Some("try_alternate_source") + } else if has_warning(warnings, "truncated") { + Some("retry_with_larger_limit_or_narrower_scope") + } else if has_warning(warnings, "low_content") || has_warning(warnings, "too_many_links") { + Some("retry_with_agent_focus_or_crawl") + } else if has_warning(warnings, "http_error") { + Some("check_url_or_retry_later") + } else { + None + } +} + +fn has_warning(warnings: &[String], warning: &str) -> bool { + warnings.iter().any(|existing| existing == warning) +} + +fn looks_like_login_wall(lower_content: &str) -> bool { + [ + "sign in to continue", + "log in to continue", + "please sign in", + "please log in", + "login required", + "sign in required", + ] + .iter() + .any(|needle| lower_content.contains(needle)) +} + +fn looks_like_consent_wall(lower_content: &str) -> bool { + [ + "accept cookies", + "cookie consent", + "manage
cookies", + "privacy choices", + "we use cookies", + "consent preferences", + ] + .iter() + .any(|needle| lower_content.contains(needle)) +} + +fn looks_like_javascript_required(lower_content: &str) -> bool { + [ + "enable javascript", + "javascript is disabled", + "requires javascript", + "please enable js", + "enable js", + ] + .iter() + .any(|needle| lower_content.contains(needle)) +} + /// Common paywall indicators in raw HTML content. const PAYWALL_INDICATORS: &[&str] = &[ "paywall", @@ -1302,6 +1451,46 @@ mod tests { assert_eq!(count_words("word"), 1); } + #[test] + fn test_compute_quality_signal_clean_content() { + let content = "This page has enough useful words for an AI agent to answer with confidence. It includes actual content instead of just a menu, and it gives a short but complete explanation that should be useful for downstream reasoning."; + let quality = compute_quality_signal( + content, + 200, + false, + false, + Some("agent_readable"), + count_words(content), + ); + + assert!(quality.score > 0.9, "{quality:?}"); + assert!(quality.warnings.is_empty(), "{quality:?}"); + assert_eq!(quality.extraction_method.as_deref(), Some("agent_readable")); + assert!(quality.suggested_next_action.is_none()); + } + + #[test] + fn test_compute_quality_signal_low_js_content() { + let quality = compute_quality_signal( + "Please enable JavaScript to view this app.", + 200, + false, + false, + Some("full"), + 7, + ); + + assert!(quality.score < 0.5, "{quality:?}"); + assert!(quality.warnings.contains(&"low_content".to_string())); + assert!(quality + .warnings + .contains(&"javascript_required".to_string())); + assert_eq!( + quality.suggested_next_action.as_deref(), + Some("retry_with_browser_rendering") + ); + } + #[test] fn test_detect_paywall() { assert!(detect_paywall("
Subscribe
")); diff --git a/crates/fetchkit/src/lib.rs b/crates/fetchkit/src/lib.rs index 5fdb9d5..e803b8e 100644 --- a/crates/fetchkit/src/lib.rs +++ b/crates/fetchkit/src/lib.rs @@ -106,7 +106,7 @@ pub use transport::{ BodyStream, HttpTransport, ReqwestTransport, TransportError, TransportMethod, TransportRequest, TransportResponse, }; -pub use types::{FetchRequest, FetchResponse, HttpMethod, PageLink, PageMetadata}; +pub use types::{FetchRequest, FetchResponse, HttpMethod, PageLink, PageMetadata, PageQuality}; #[cfg(feature = "bot-auth")] pub use bot_auth::{BotAuthConfig, BotAuthError}; diff --git a/crates/fetchkit/src/types.rs b/crates/fetchkit/src/types.rs index ae53993..78d6967 100644 --- a/crates/fetchkit/src/types.rs +++ b/crates/fetchkit/src/types.rs @@ -338,6 +338,32 @@ impl PageMetadata { } } +/// Agent-facing content quality signals. +/// +/// These are heuristic hints for tool callers. They are intended to help agents +/// decide whether to trust, retry, narrow, or escalate a fetch result. +#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] +pub struct PageQuality { + /// Normalized quality score from 0.0 (poor) to 1.0 (good). + pub score: f32, + + /// Machine-readable warning labels, such as `low_content` or `truncated`. + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub warnings: Vec<String>, + + /// Approximate markdown link count divided by word count. + #[serde(skip_serializing_if = "Option::is_none")] + pub link_density: Option<f32>, + + /// Content extraction method used for the returned content. + #[serde(skip_serializing_if = "Option::is_none")] + pub extraction_method: Option<String>, + + /// Suggested next action for agents when warnings indicate a poor result.
+ #[serde(skip_serializing_if = "Option::is_none")] + pub suggested_next_action: Option<String>, +} + /// Response from a fetch operation /// /// Contains the fetched content along with metadata like status code, @@ -420,6 +446,10 @@ pub struct FetchResponse { #[serde(skip_serializing_if = "Option::is_none")] pub metadata: Option<PageMetadata>, + /// Agent-facing content quality signals + #[serde(skip_serializing_if = "Option::is_none")] + pub quality: Option<PageQuality>, + /// Word count of the final content #[serde(skip_serializing_if = "Option::is_none")] pub word_count: Option<u64>, diff --git a/crates/fetchkit/tests/integration.rs b/crates/fetchkit/tests/integration.rs index dc7337c..d8aadaa 100644 --- a/crates/fetchkit/tests/integration.rs +++ b/crates/fetchkit/tests/integration.rs @@ -220,6 +220,13 @@ async fn test_binary_content() { assert!(resp.content.is_none()); assert!(resp.error.is_some()); assert!(resp.error.unwrap().contains("Binary content")); + let quality = resp.quality.unwrap(); + assert_eq!(quality.score, 0.0); + assert!(quality.warnings.contains(&"binary_content".to_string())); + assert_eq!( + quality.suggested_next_action.as_deref(), + Some("use_save_to_file") + ); } #[tokio::test] diff --git a/specs/initial.md b/specs/initial.md index 3d7e301..08ef64d 100644 --- a/specs/initial.md +++ b/specs/initial.md @@ -95,6 +95,8 @@ Provide a builder to configure tool options, including: - `saved_path: Option` (set when save_to_file succeeds) - `bytes_written: Option` (set when save_to_file succeeds) - `metadata: Option<PageMetadata>` (structured page metadata; populated for HTML) + - `quality: Option<PageQuality>` (agent-facing quality score, warnings, link density, + extraction method, and suggested next action) - `word_count: Option<u64>` (word count of final content) - `redirect_chain: Vec<String>` (URLs followed during redirects; empty if none) - `is_paywall: Option<bool>` (heuristic paywall signal; not guaranteed) @@ -111,6 +113,17 @@ Provide a builder to configure tool options, including: - `extraction_method: Option<String>` (`"full"`,
`"main"`, `"readable"`, `"readable_fallback_main"`, `"agent_readable"`, `"agent_main"`, `"native_markdown"`, `"native_text"`, or `"raw"`) +- `PageQuality` + - `score: f32` (0.0 poor to 1.0 good) + - `warnings: Vec<String>` (machine-readable labels such as `low_content`, `truncated`, + `too_many_links`, `possible_login_wall`, `possible_consent_wall`, `possible_paywall`, + `javascript_required`, `http_error`, or `binary_content`) + - `link_density: Option<f32>` (approximate markdown link count divided by word count) + - `extraction_method: Option<String>` (mirrors returned extraction method for convenience) + - `suggested_next_action: Option<String>` (agent hint such as + `retry_with_browser_rendering`, `authenticate_or_use_browser`, + `try_alternate_source`, `retry_with_larger_limit_or_narrower_scope`, + `retry_with_agent_focus_or_crawl`, `check_url_or_retry_later`, or `use_save_to_file`) - `FetchError` enum - Missing url - Invalid url scheme @@ -154,7 +167,8 @@ Provide a builder to configure tool options, including: - Output format (default `md`): - Markdown with YAML frontmatter containing metadata - Frontmatter fields: `url`, `status_code`, `source_content_type`, `source_size`, - `last_modified`, `filename`, `truncated` + `last_modified`, `filename`, `truncated`, `quality_score`, `quality_warnings`, + `extraction_method`, `suggested_next_action` - Content follows frontmatter (markdown-converted HTML or error message) - Output format (`json`): - JSON-serialized `FetchResponse` to stdout