diff --git a/README.md b/README.md index 4a31426..76cb0b9 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ AI-friendly web content fetching tool designed for LLM consumption. Rust library - **HTTP fetching** - GET and HEAD methods with streaming support - **Pluggable fetchers** - URL-aware dispatch to specialized handlers for repos, docs, feeds, videos, papers, and more -- **HTML-to-Markdown** - Built-in conversion optimized for LLMs +- **HTML-to-Markdown** - Built-in conversion optimized for LLMs, with fetched relative links/images resolved to absolute URLs - **Agent content focus** - Optional low-noise extraction mode for AI agents - **HTML-to-Text** - Plain text extraction with clean formatting - **Binary detection** - Returns metadata only for images, PDFs, etc. diff --git a/crates/fetchkit/src/convert.rs b/crates/fetchkit/src/convert.rs index 792678a..97ed37a 100644 --- a/crates/fetchkit/src/convert.rs +++ b/crates/fetchkit/src/convert.rs @@ -1,6 +1,7 @@ //! HTML conversion utilities use crate::types::{PageLink, PageMetadata}; +use url::Url; /// Check if content-type indicates markdown (e.g. `text/markdown`). pub fn is_markdown_content_type(content_type: &Option) -> bool { @@ -55,6 +56,18 @@ pub fn is_html(content_type: &Option, body: &str) -> bool { /// assert!(md.contains("**Bold**")); /// ``` pub fn html_to_markdown(html: &str) -> String { + html_to_markdown_inner(html, None) +} + +/// Convert HTML to markdown while resolving relative links/images against a base URL. +/// +/// This is useful for fetched pages: agents receive markdown with links that remain +/// valid outside the source page's original browsing context. 
+pub fn html_to_markdown_with_base_url(html: &str, base_url: &str) -> String { + html_to_markdown_inner(html, Url::parse(base_url).ok().as_ref()) +} + +fn html_to_markdown_inner(html: &str, base_url: Option<&Url>) -> String { let mut output = String::new(); let mut in_skip_element = 0; let mut skip_elements: Vec = Vec::new(); @@ -188,7 +201,10 @@ pub fn html_to_markdown(html: &str) -> String { } "pre" => { if !is_closing { - output.push_str("\n```\n"); + let language = extract_code_language(&tag); + output.push_str("\n```"); + output.push_str(language.as_deref().unwrap_or_default()); + output.push('\n'); in_pre = true; } else { output.push_str("\n```\n"); @@ -211,7 +227,7 @@ pub fn html_to_markdown(html: &str) -> String { if !is_closing { if let Some(href) = extract_attribute(&tag, "href") { if !href.is_empty() { - link_href = Some(href); + link_href = Some(resolve_url(base_url, &href)); link_start = output.len(); } } @@ -228,7 +244,7 @@ pub fn html_to_markdown(html: &str) -> String { "img" if !is_closing => { let alt = extract_attribute(&tag, "alt").unwrap_or_default(); if let Some(src) = extract_attribute(&tag, "src") { - output.push_str(&format!("![{}]({})", alt, src)); + output.push_str(&format!("![{}]({})", alt, resolve_url(base_url, &src))); } } // Table handling @@ -316,6 +332,40 @@ pub fn html_to_markdown(html: &str) -> String { clean_whitespace(&output) } +fn resolve_url(base_url: Option<&Url>, candidate: &str) -> String { + let trimmed = candidate.trim(); + if trimmed.is_empty() + || trimmed.starts_with('#') + || trimmed.starts_with("mailto:") + || trimmed.starts_with("tel:") + || trimmed.starts_with("data:") + { + return trimmed.to_string(); + } + + base_url + .and_then(|base| base.join(trimmed).ok()) + .map(|url| url.to_string()) + .unwrap_or_else(|| trimmed.to_string()) +} + +fn extract_code_language(tag: &str) -> Option { + let class = extract_attribute(tag, "class")?; + class + .split_whitespace() + .find_map(|part| { + part.strip_prefix("language-") 
+ .or_else(|| part.strip_prefix("lang-")) + }) + .filter(|language| { + !language.is_empty() + && language + .chars() + .all(|ch| ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' || ch == '+') + }) + .map(ToString::to_string) +} + /// Render collected table rows as a markdown table. fn render_table(rows: &[Vec], output: &mut String) { if rows.is_empty() { @@ -1857,6 +1907,18 @@ mod tests { ); } + #[test] + fn test_html_to_markdown_with_base_url_resolves_relative_links() { + let html = r##"

Read docs and local.

"##; + let md = html_to_markdown_with_base_url(html, "https://example.com/base/index.html"); + assert!( + md.contains("[docs](https://example.com/docs/page)"), + "Got: {}", + md + ); + assert!(md.contains("[local](#local)"), "Got: {}", md); + } + #[test] fn test_html_to_markdown_link_no_text() { let html = r#""#; @@ -1871,6 +1933,25 @@ mod tests { assert!(md.contains("![A photo](photo.jpg)"), "Got: {}", md); } + #[test] + fn test_html_to_markdown_with_base_url_resolves_images() { + let html = r#"A photo"#; + let md = html_to_markdown_with_base_url(html, "https://example.com/docs/page/"); + assert!( + md.contains("![A photo](https://example.com/docs/assets/photo.jpg)"), + "Got: {}", + md + ); + } + + #[test] + fn test_html_to_markdown_preserves_pre_language() { + let html = r#"
fn main() {}
"#; + let md = html_to_markdown(html); + assert!(md.contains("```rust"), "Got: {}", md); + assert!(md.contains("fn main() {}"), "Got: {}", md); + } + #[test] fn test_html_to_markdown_image_no_alt() { let html = r#""#; diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs index bed7851..d5b0fc0 100644 --- a/crates/fetchkit/src/fetchers/default.rs +++ b/crates/fetchkit/src/fetchers/default.rs @@ -10,8 +10,8 @@ use crate::client::FetchOptions; use crate::convert::{ extract_headings, extract_metadata, extract_readable_content, filter_excessive_newlines, - html_to_markdown, html_to_text, is_html, is_markdown_content_type, is_plain_text_content_type, - strip_boilerplate, + html_to_markdown_with_base_url, html_to_text, is_html, is_markdown_content_type, + is_plain_text_content_type, strip_boilerplate, }; use crate::error::FetchError; use crate::fetchers::Fetcher; @@ -379,7 +379,7 @@ impl Fetcher for DefaultFetcher { if wants_markdown { ( "markdown".to_string(), - html_to_markdown(&html), + html_to_markdown_with_base_url(&html, &final_url), Some(method), ) } else if wants_text { @@ -1064,6 +1064,51 @@ mod tests { ); } + #[tokio::test] + async fn test_markdown_resolves_relative_links_against_final_url() { + let server = MockServer::start().await; + let html = r#" + + +
+

Read API docs.

+ Logo +
+ + + "#; + Mock::given(method("GET")) + .and(path("/guide/start")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string(html) + .insert_header("content-type", "text/html"), + ) + .mount(&server) + .await; + + let fetcher = DefaultFetcher::new(); + let options = FetchOptions { + enable_markdown: true, + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = FetchRequest::new(format!("{}/guide/start", server.uri())) + .as_markdown() + .content_focus("main"); + let response = fetcher.fetch(&request, &options).await.unwrap(); + let content = response.content.as_deref().unwrap(); + + assert!( + content.contains(&format!("[API docs]({}/docs/api)", server.uri())), + "{content}" + ); + assert!( + content.contains(&format!("![Logo]({}/assets/logo.png)", server.uri())), + "{content}" + ); + } + #[tokio::test] async fn test_markdown_content_type_without_markdown_request_returns_raw() { let server = MockServer::start().await; diff --git a/crates/fetchkit/src/lib.rs b/crates/fetchkit/src/lib.rs index 64a20cd..5fdb9d5 100644 --- a/crates/fetchkit/src/lib.rs +++ b/crates/fetchkit/src/lib.rs @@ -87,8 +87,8 @@ mod types; pub use client::{batch_fetch, batch_fetch_with_options, fetch, fetch_with_options, FetchOptions}; pub use convert::{ - extract_headings, extract_metadata, extract_readable_content, html_to_markdown, html_to_text, - strip_boilerplate, + extract_headings, extract_metadata, extract_readable_content, html_to_markdown, + html_to_markdown_with_base_url, html_to_text, strip_boilerplate, }; pub use dns::DnsPolicy; pub use error::{FetchError, ToolError}; diff --git a/specs/initial.md b/specs/initial.md index 1de16ce..3d7e301 100644 --- a/specs/initial.md +++ b/specs/initial.md @@ -239,6 +239,15 @@ By default, FetchKit blocks connections to private/reserved IP ranges: - Always return `status_code` when HTTP response received. - 4xx/5xx are success responses (not tool errors). 
+#### HTML-to-Markdown + +- Fetched HTML converted to markdown resolves relative anchor `href` and image `src` + values against the final response URL after redirects. +- Fragment-only links (`#section`) and non-HTTP navigation schemes such as `mailto:`, + `tel:`, and `data:` are left unchanged. +- Code block language hints from `<pre class="language-*">` or `<pre class="lang-*">`
+  are preserved in fenced markdown when present.
+
 #### Binary Content
 
 - Detect binary by Content-Type prefix: