Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ AI-friendly web content fetching tool designed for LLM consumption. Rust library

- **HTTP fetching** - GET and HEAD methods with streaming support
- **Pluggable fetchers** - URL-aware dispatch to specialized handlers for repos, docs, feeds, videos, papers, and more
- **HTML-to-Markdown** - Built-in conversion optimized for LLMs
- **HTML-to-Markdown** - Built-in conversion optimized for LLMs, with fetched relative links/images resolved to absolute URLs
- **Agent content focus** - Optional low-noise extraction mode for AI agents
- **HTML-to-Text** - Plain text extraction with clean formatting
- **Binary detection** - Returns metadata only for images, PDFs, etc.
Expand Down
87 changes: 84 additions & 3 deletions crates/fetchkit/src/convert.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! HTML conversion utilities

use crate::types::{PageLink, PageMetadata};
use url::Url;

/// Check if content-type indicates markdown (e.g. `text/markdown`).
pub fn is_markdown_content_type(content_type: &Option<String>) -> bool {
Expand Down Expand Up @@ -55,6 +56,18 @@ pub fn is_html(content_type: &Option<String>, body: &str) -> bool {
/// assert!(md.contains("**Bold**"));
/// ```
pub fn html_to_markdown(html: &str) -> String {
    // This entry point has no page address to resolve against, so the shared
    // converter is invoked without a base URL.
    let base_url = None;
    html_to_markdown_inner(html, base_url)
}

/// Convert HTML to markdown, resolving relative link and image targets against `base_url`.
///
/// Intended for fetched pages: the markdown handed to an agent keeps links that
/// still work outside the source page's original browsing context.
///
/// If `base_url` cannot be parsed as a URL, conversion proceeds without a base.
pub fn html_to_markdown_with_base_url(html: &str, base_url: &str) -> String {
    match Url::parse(base_url) {
        Ok(base) => html_to_markdown_inner(html, Some(&base)),
        // Best effort: an unusable base must not prevent conversion.
        Err(_) => html_to_markdown_inner(html, None),
    }
}

fn html_to_markdown_inner(html: &str, base_url: Option<&Url>) -> String {
let mut output = String::new();
let mut in_skip_element = 0;
let mut skip_elements: Vec<String> = Vec::new();
Expand Down Expand Up @@ -188,7 +201,10 @@ pub fn html_to_markdown(html: &str) -> String {
}
"pre" => {
if !is_closing {
output.push_str("\n```\n");
let language = extract_code_language(&tag);
output.push_str("\n```");
output.push_str(language.as_deref().unwrap_or_default());
output.push('\n');
in_pre = true;
} else {
output.push_str("\n```\n");
Expand All @@ -211,7 +227,7 @@ pub fn html_to_markdown(html: &str) -> String {
if !is_closing {
if let Some(href) = extract_attribute(&tag, "href") {
if !href.is_empty() {
link_href = Some(href);
link_href = Some(resolve_url(base_url, &href));
link_start = output.len();
}
}
Expand All @@ -228,7 +244,7 @@ pub fn html_to_markdown(html: &str) -> String {
"img" if !is_closing => {
let alt = extract_attribute(&tag, "alt").unwrap_or_default();
if let Some(src) = extract_attribute(&tag, "src") {
output.push_str(&format!("![{}]({})", alt, src));
output.push_str(&format!("![{}]({})", alt, resolve_url(base_url, &src)));
}
}
// Table handling
Expand Down Expand Up @@ -316,6 +332,40 @@ pub fn html_to_markdown(html: &str) -> String {
clean_whitespace(&output)
}

/// Turn a link or image target into the string written to the markdown output.
///
/// The candidate is trimmed first. Empty values, fragment-only references and
/// `mailto:` / `tel:` / `data:` targets are returned as trimmed, without being
/// joined. Anything else is joined onto `base_url` when one is available; if
/// there is no base, or the join fails, the trimmed candidate is returned.
fn resolve_url(base_url: Option<&Url>, candidate: &str) -> String {
    // Prefixes that mark a target as not resolvable against the page address.
    const PASSTHROUGH_PREFIXES: [&str; 4] = ["#", "mailto:", "tel:", "data:"];

    let target = candidate.trim();
    let passthrough = target.is_empty()
        || PASSTHROUGH_PREFIXES
            .iter()
            .any(|prefix| target.starts_with(*prefix));
    if passthrough {
        return target.to_string();
    }

    match base_url {
        Some(base) => match base.join(target) {
            Ok(absolute) => absolute.to_string(),
            Err(_) => target.to_string(),
        },
        None => target.to_string(),
    }
}

/// Extract a fenced-code language hint from a tag's `class` attribute.
///
/// The class value is split on whitespace and each token is checked for a
/// `language-` or `lang-` prefix. The first token whose remainder is non-empty
/// and made up solely of ASCII alphanumerics, `-`, `_` or `+` wins, and that
/// remainder is returned.
///
/// Returns `None` when the tag has no `class` attribute or no token qualifies.
fn extract_code_language(tag: &str) -> Option<String> {
    let class = extract_attribute(tag, "class")?;
    class
        .split_whitespace()
        .filter_map(|token| {
            token
                .strip_prefix("language-")
                .or_else(|| token.strip_prefix("lang-"))
        })
        // Validate while searching: an empty or malformed hint (for example a
        // bare `language-`) must not hide a later, usable token.
        .find(|language| {
            !language.is_empty()
                && language
                    .chars()
                    .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '+'))
        })
        .map(str::to_owned)
}

/// Render collected table rows as a markdown table.
fn render_table(rows: &[Vec<String>], output: &mut String) {
if rows.is_empty() {
Expand Down Expand Up @@ -1857,6 +1907,18 @@ mod tests {
);
}

#[test]
fn test_html_to_markdown_with_base_url_resolves_relative_links() {
    // A root-relative href should be made absolute against the base, while a
    // fragment-only href should be emitted as written.
    let base = "https://example.com/base/index.html";
    let source = r##"<p>Read <a href="/docs/page">docs</a> and <a href="#local">local</a>.</p>"##;
    let md = html_to_markdown_with_base_url(source, base);

    for expected in ["[docs](https://example.com/docs/page)", "[local](#local)"] {
        assert!(md.contains(expected), "Got: {}", md);
    }
}

#[test]
fn test_html_to_markdown_link_no_text() {
let html = r#"<a href="https://example.com"></a>"#;
Expand All @@ -1871,6 +1933,25 @@ mod tests {
assert!(md.contains("![A photo](photo.jpg)"), "Got: {}", md);
}

#[test]
fn test_html_to_markdown_with_base_url_resolves_images() {
    // The `..` segment climbs out of `/docs/page/`, so the image is expected
    // under `/docs/assets/`.
    let md = html_to_markdown_with_base_url(
        r#"<img src="../assets/photo.jpg" alt="A photo">"#,
        "https://example.com/docs/page/",
    );
    let expected = "![A photo](https://example.com/docs/assets/photo.jpg)";
    assert!(md.contains(expected), "Got: {}", md);
}

#[test]
fn test_html_to_markdown_preserves_pre_language() {
    // The `language-rust` class should surface as the fence info string, and
    // the code itself should be carried through.
    let md = html_to_markdown(r#"<pre class="language-rust">fn main() {}</pre>"#);
    for needle in ["```rust", "fn main() {}"] {
        assert!(md.contains(needle), "Got: {}", md);
    }
}

#[test]
fn test_html_to_markdown_image_no_alt() {
let html = r#"<img src="photo.jpg">"#;
Expand Down
51 changes: 48 additions & 3 deletions crates/fetchkit/src/fetchers/default.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
use crate::client::FetchOptions;
use crate::convert::{
extract_headings, extract_metadata, extract_readable_content, filter_excessive_newlines,
html_to_markdown, html_to_text, is_html, is_markdown_content_type, is_plain_text_content_type,
strip_boilerplate,
html_to_markdown_with_base_url, html_to_text, is_html, is_markdown_content_type,
is_plain_text_content_type, strip_boilerplate,
};
use crate::error::FetchError;
use crate::fetchers::Fetcher;
Expand Down Expand Up @@ -379,7 +379,7 @@ impl Fetcher for DefaultFetcher {
if wants_markdown {
(
"markdown".to_string(),
html_to_markdown(&html),
html_to_markdown_with_base_url(&html, &final_url),
Some(method),
)
} else if wants_text {
Expand Down Expand Up @@ -1064,6 +1064,51 @@ mod tests {
);
}

#[tokio::test]
async fn test_markdown_resolves_relative_links_against_final_url() {
    // Page under test: one root-relative link and one parent-relative image.
    let html = r#"
<html>
<body>
<main>
<p>Read <a href="/docs/api">API docs</a>.</p>
<img src="../assets/logo.png" alt="Logo">
</main>
</body>
</html>
"#;

    let server = MockServer::start().await;
    let page = ResponseTemplate::new(200)
        .set_body_string(html)
        .insert_header("content-type", "text/html");
    Mock::given(method("GET"))
        .and(path("/guide/start"))
        .respond_with(page)
        .mount(&server)
        .await;

    let options = FetchOptions {
        enable_markdown: true,
        dns_policy: DnsPolicy::allow_all(),
        ..Default::default()
    };
    let request = FetchRequest::new(format!("{}/guide/start", server.uri()))
        .as_markdown()
        .content_focus("main");

    let fetcher = DefaultFetcher::new();
    let response = fetcher.fetch(&request, &options).await.unwrap();
    let content = response.content.as_deref().unwrap();

    // Both targets are expected to be rewritten onto the mock server's address.
    let expected_link = format!("[API docs]({}/docs/api)", server.uri());
    assert!(content.contains(&expected_link), "{content}");

    let expected_image = format!("![Logo]({}/assets/logo.png)", server.uri());
    assert!(content.contains(&expected_image), "{content}");
}

#[tokio::test]
async fn test_markdown_content_type_without_markdown_request_returns_raw() {
let server = MockServer::start().await;
Expand Down
4 changes: 2 additions & 2 deletions crates/fetchkit/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ mod types;

pub use client::{batch_fetch, batch_fetch_with_options, fetch, fetch_with_options, FetchOptions};
pub use convert::{
extract_headings, extract_metadata, extract_readable_content, html_to_markdown, html_to_text,
strip_boilerplate,
extract_headings, extract_metadata, extract_readable_content, html_to_markdown,
html_to_markdown_with_base_url, html_to_text, strip_boilerplate,
};
pub use dns::DnsPolicy;
pub use error::{FetchError, ToolError};
Expand Down
9 changes: 9 additions & 0 deletions specs/initial.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,15 @@ By default, FetchKit blocks connections to private/reserved IP ranges:
- Always return `status_code` when HTTP response received.
- 4xx/5xx are success responses (not tool errors).

#### HTML-to-Markdown

- Fetched HTML converted to markdown resolves relative anchor `href` and image `src`
values against the final response URL after redirects.
- Fragment-only links (`#section`) and values using the `mailto:`, `tel:`, or `data:`
schemes are left unchanged.
- Code block language hints from `<pre class="language-...">` or `<pre class="lang-...">`
are preserved in fenced markdown when present.

#### Binary Content

- Detect binary by Content-Type prefix:
Expand Down
Loading