From ed3877aee7f774098f1bf1d2ea5f2199309d0148 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Fri, 3 Apr 2026 03:04:13 +0000 Subject: [PATCH] feat(fetchers): enhance RSSFeedFetcher with content-type detection and html_to_markdown - Add is_feed_content_type() for detecting feeds by HTTP content-type header - Use html_to_markdown for HTML content in RSS/Atom entry descriptions - Replace strip_html with html_to_markdown for richer content conversion - Add tests: content-type detection, HTML/plain content conversion, CDATA handling Closes #59 --- crates/fetchkit/src/fetchers/rss_feed.rs | 121 +++++++++++++++++------ 1 file changed, 93 insertions(+), 28 deletions(-) diff --git a/crates/fetchkit/src/fetchers/rss_feed.rs b/crates/fetchkit/src/fetchers/rss_feed.rs index b6ab43b..96afa8c 100644 --- a/crates/fetchkit/src/fetchers/rss_feed.rs +++ b/crates/fetchkit/src/fetchers/rss_feed.rs @@ -29,7 +29,10 @@ impl RSSFeedFetcher { Self } - /// Check if a URL looks like a feed URL + /// Check if a URL looks like a feed URL by path pattern. + /// + /// Content-type detection (application/rss+xml, application/atom+xml) + /// happens at fetch time since we can't know the content-type from the URL alone. fn is_feed_url(url: &Url) -> bool { let path = url.path().to_lowercase(); @@ -50,6 +53,15 @@ impl RSSFeedFetcher { || path == "/rss" || path == "/feed" } + + /// Check if a content-type indicates a feed format + fn is_feed_content_type(content_type: &str) -> bool { + let ct = content_type.to_lowercase(); + ct.contains("application/rss+xml") + || ct.contains("application/atom+xml") + || ct.contains("text/xml") + || ct.contains("application/xml") + } } impl Default for RSSFeedFetcher { @@ -113,12 +125,21 @@ impl Fetcher for RSSFeedFetcher { }); } + // Check content-type for feed detection (covers non-URL-pattern feeds) + let content_type = response + .headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + let body = response .text() .await .map_err(|e| FetchError::RequestError(e.to_string()))?; - // Detect feed type and parse + // Detect feed type: by XML structure first, then content-type + let is_feed_by_ct = Self::is_feed_content_type(&content_type); let content = if body.contains("") { parse_rss(&body) } else if body.contains(" String { out.push_str(&format!("- **Published:** {}\n", date)); } if let Some(desc) = description { - let cleaned = strip_html(&desc); - if !cleaned.is_empty() { - let truncated = if cleaned.len() > 500 { - format!("{}...", &cleaned[..500]) + let converted = convert_entry_content(&desc); + if !converted.is_empty() { + let truncated = if converted.len() > 500 { + format!("{}...", &converted[..500]) } else { - cleaned + converted }; out.push_str(&format!("\n{}\n", truncated)); } @@ -251,12 +281,12 @@ fn parse_atom(xml: &str) -> String { out.push_str(&format!("- **Published:** {}\n", date)); } if let Some(summary) = summary { - let cleaned = strip_html(&summary); - if !cleaned.is_empty() { - let truncated = if cleaned.len() > 500 { - format!("{}...", &cleaned[..500]) + let converted = convert_entry_content(&summary); + if !converted.is_empty() { + let truncated = if converted.len() > 500 { + format!("{}...", &converted[..500]) } else { - cleaned + converted }; out.push_str(&format!("\n{}\n", truncated)); } @@ -330,21 +360,14 @@ fn decode_entities(s: &str) -> String { .replace("'", "'") } -/// Simple HTML tag stripper -fn strip_html(html: &str) -> String { - let mut result = String::with_capacity(html.len()); - let mut in_tag = false; - - for c in html.chars() { - match c { - '<' => in_tag = true, - '>' => in_tag = false, - _ if !in_tag => result.push(c), - _ => {} - } +/// Convert entry content: use html_to_markdown for HTML, plain text for non-HTML +fn convert_entry_content(content: &str) -> String { + if content.contains('<') && content.contains('>') { + // Contains HTML tags — convert via html_to_markdown + crate::convert::html_to_markdown(content) + } else { + content.trim().to_string() } - - result.trim().to_string() } #[cfg(test)] @@ -445,7 +468,49 @@ mod tests { } #[test] - fn test_strip_html() { - assert_eq!(strip_html("

Hello world

"), "Hello world"); + fn test_is_feed_content_type() { + assert!(RSSFeedFetcher::is_feed_content_type("application/rss+xml")); + assert!(RSSFeedFetcher::is_feed_content_type( + "application/atom+xml; charset=utf-8" + )); + assert!(RSSFeedFetcher::is_feed_content_type("text/xml")); + assert!(RSSFeedFetcher::is_feed_content_type("application/xml")); + assert!(!RSSFeedFetcher::is_feed_content_type("text/html")); + assert!(!RSSFeedFetcher::is_feed_content_type("application/json")); + } + + #[test] + fn test_convert_entry_content_html() { + let html = "

Hello world

"; + let result = convert_entry_content(html); + assert!(result.contains("Hello")); + assert!(result.contains("world")); + } + + #[test] + fn test_convert_entry_content_plain() { + let plain = "Just plain text."; + let result = convert_entry_content(plain); + assert_eq!(result, "Just plain text."); + } + + #[test] + fn test_parse_rss_with_cdata() { + let xml = r#" + + +Test Feed + +CDATA Post +https://example.com/cdata +Rich HTML content

]]>
+
+
+
"#; + + let output = parse_rss(xml); + assert!(output.contains("# Test Feed")); + assert!(output.contains("### CDATA Post")); + assert!(output.contains("HTML")); } }