From fe8270b80117077798f07d4b29fefb31761e14fc Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Fri, 3 Apr 2026 02:55:20 +0000 Subject: [PATCH] feat(fetchers): enhance YouTubeFetcher with transcript extraction - Add transcript extraction via YouTube timedtext API - Parse timedtext XML format into joined transcript text - Truncate very long transcripts (>15k chars) with indicator - Show "No transcript available" when captions are unavailable - Add mobile URL support (m.youtube.com) - Add comprehensive tests: timedtext parsing, entity decoding, transcript truncation, formatting with/without all fields Closes #56 --- crates/fetchkit/src/fetchers/youtube.rs | 274 +++++++++++++++++++++--- 1 file changed, 249 insertions(+), 25 deletions(-) diff --git a/crates/fetchkit/src/fetchers/youtube.rs b/crates/fetchkit/src/fetchers/youtube.rs index e2cae9b..076d76b 100644 --- a/crates/fetchkit/src/fetchers/youtube.rs +++ b/crates/fetchkit/src/fetchers/youtube.rs @@ -1,7 +1,7 @@ //! YouTube video fetcher //! //! Handles youtube.com/watch and youtu.be URLs, returning video metadata -//! and transcript text via oEmbed and timedtext APIs. +//! and transcript text via oEmbed and noembed APIs. use crate::client::FetchOptions; use crate::error::FetchError; @@ -19,7 +19,7 @@ const API_TIMEOUT: Duration = Duration::from_secs(10); /// YouTube video fetcher /// /// Matches `youtube.com/watch?v={id}` and `youtu.be/{id}`, returning -/// video metadata via oEmbed. +/// video metadata via oEmbed and transcript when available. pub struct YouTubeFetcher; impl YouTubeFetcher { @@ -71,6 +71,12 @@ struct OEmbedResponse { author_url: Option, } +/// Transcript segment extracted from YouTube's timedtext XML +#[derive(Debug)] +struct TranscriptSegment { + text: String, +} + #[async_trait] impl Fetcher for YouTubeFetcher { fn name(&self) -> &'static str { @@ -112,15 +118,14 @@ impl Fetcher for YouTubeFetcher { // Fetch oEmbed metadata // The canonical URL only contains safe ASCII chars, so it can be passed directly - let mut oembed = Url::parse("https://www.youtube.com/oembed").unwrap(); - oembed + let mut oembed_url = Url::parse("https://www.youtube.com/oembed").unwrap(); + oembed_url .query_pairs_mut() .append_pair("url", &canonical_url) .append_pair("format", "json"); - let oembed_url = oembed.to_string(); let oembed = match client - .get(&oembed_url) + .get(oembed_url.as_str()) .header(USER_AGENT, ua_header.clone()) .send() .await @@ -135,39 +140,160 @@ impl Fetcher for YouTubeFetcher { .unwrap_or_else(|| format!("YouTube Video {}", video_id)); let author = oembed.as_ref().and_then(|o| o.author_name.clone()); - let author_url = oembed.as_ref().and_then(|o| o.author_url.clone()); - // Build response - let mut out = String::new(); - out.push_str(&format!("# {}\n\n", title)); + // Attempt transcript extraction via timedtext API + let transcript = fetch_transcript(&client, &ua_header, &video_id).await; - out.push_str("## Video Info\n\n"); - if let Some(author) = &author { - if let Some(author_url) = &author_url { - out.push_str(&format!("- **Channel:** [{}]({})\n", author, author_url)); - } else { - out.push_str(&format!("- **Channel:** {}\n", author)); - } - } - out.push_str(&format!("- **Video ID:** {}\n", video_id)); - out.push_str(&format!("- **URL:** {}\n", canonical_url)); - out.push_str(&format!( - "- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n", - video_id - )); + let content = format_youtube_response( + &title, + &video_id, + &canonical_url, + author.as_deref(), + author_url.as_deref(), + transcript.as_deref(), + ); Ok(FetchResponse { url: request.url.clone(), status_code: 200, content_type: Some("text/markdown".to_string()), format: Some("youtube_video".to_string()), - content: Some(out), + content: Some(content), ..Default::default() }) } } +/// Attempt to fetch transcript/captions via YouTube's timedtext XML API. +/// Returns None if transcript is unavailable. +async fn fetch_transcript( + client: &reqwest::Client, + ua: &HeaderValue, + video_id: &str, +) -> Option { + // Try the legacy timedtext API (auto-generated English captions) + let timedtext_url = format!( + "https://www.youtube.com/api/timedtext?v={}&lang=en&fmt=srv3", + video_id + ); + + let resp = client + .get(&timedtext_url) + .header(USER_AGENT, ua.clone()) + .send() + .await + .ok()?; + + if !resp.status().is_success() { + return None; + } + + let xml = resp.text().await.ok()?; + if xml.is_empty() || !xml.contains(">() + .join(" "); + + if transcript.is_empty() { + None + } else { + Some(transcript) + } +} + +/// Parse YouTube timedtext XML format into transcript segments +fn parse_timedtext_xml(xml: &str) -> Vec { + let mut segments = Vec::new(); + let mut search_from = 0; + + while let Some(start) = xml[search_from..].find("') { + Some(pos) => abs_start + pos + 1, + None => break, + }; + + let content_end = match xml[content_start..].find("") { + Some(pos) => content_start + pos, + None => break, + }; + + let text = decode_xml_entities(&xml[content_start..content_end]); + let text = text.trim().to_string(); + if !text.is_empty() { + segments.push(TranscriptSegment { text }); + } + + search_from = content_end + 7; // "".len() + } + + segments +} + +/// Decode XML/HTML entities commonly found in YouTube transcripts +fn decode_xml_entities(s: &str) -> String { + s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") + .replace("'", "'") +} + +fn format_youtube_response( + title: &str, + video_id: &str, + canonical_url: &str, + author: Option<&str>, + author_url: Option<&str>, + transcript: Option<&str>, +) -> String { + let mut out = String::new(); + out.push_str(&format!("# {}\n\n", title)); + + out.push_str("## Video Info\n\n"); + if let Some(author) = author { + if let Some(url) = author_url { + out.push_str(&format!("- **Channel:** [{}]({})\n", author, url)); + } else { + out.push_str(&format!("- **Channel:** {}\n", author)); + } + } + out.push_str(&format!("- **Video ID:** {}\n", video_id)); + out.push_str(&format!("- **URL:** {}\n", canonical_url)); + out.push_str(&format!( + "- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n", + video_id + )); + + if let Some(transcript) = transcript { + out.push_str("\n## Transcript\n\n"); + // Truncate very long transcripts + if transcript.len() > 15000 { + out.push_str(&transcript[..15000]); + out.push_str("\n\n*[Transcript truncated]*\n"); + } else { + out.push_str(transcript); + out.push('\n'); + } + } else { + out.push_str("\n*No transcript available for this video.*\n"); + } + + out +} + #[cfg(test)] mod tests { use super::*; @@ -199,6 +325,15 @@ mod tests { ); } + #[test] + fn test_parse_youtube_mobile() { + let url = Url::parse("https://m.youtube.com/watch?v=abc123").unwrap(); + assert_eq!( + YouTubeFetcher::parse_video_id(&url), + Some("abc123".to_string()) + ); + } + #[test] fn test_rejects_non_watch() { let url = Url::parse("https://www.youtube.com/channel/UC123").unwrap(); @@ -217,6 +352,12 @@ mod tests { assert_eq!(YouTubeFetcher::parse_video_id(&url), None); } + #[test] + fn test_rejects_empty_v_param() { + let url = Url::parse("https://www.youtube.com/watch?v=").unwrap(); + assert_eq!(YouTubeFetcher::parse_video_id(&url), None); + } + #[test] fn test_fetcher_matches() { let fetcher = YouTubeFetcher::new(); @@ -227,7 +368,90 @@ mod tests { let url = Url::parse("https://youtu.be/abc").unwrap(); assert!(fetcher.matches(&url)); + let url = Url::parse("https://m.youtube.com/watch?v=abc").unwrap(); + assert!(fetcher.matches(&url)); + let url = Url::parse("https://example.com/watch?v=abc").unwrap(); assert!(!fetcher.matches(&url)); } + + #[test] + fn test_format_youtube_response_with_all_fields() { + let output = format_youtube_response( + "Test Video", + "abc123", + "https://www.youtube.com/watch?v=abc123", + Some("Test Channel"), + Some("https://www.youtube.com/channel/UC123"), + Some("Hello world this is a transcript."), + ); + + assert!(output.contains("# Test Video")); + assert!(output.contains("[Test Channel](https://www.youtube.com/channel/UC123)")); + assert!(output.contains("**Video ID:** abc123")); + assert!(output.contains("## Transcript")); + assert!(output.contains("Hello world this is a transcript.")); + } + + #[test] + fn test_format_youtube_response_no_transcript() { + let output = format_youtube_response( + "Test Video", + "abc123", + "https://www.youtube.com/watch?v=abc123", + None, + None, + None, + ); + + assert!(output.contains("# Test Video")); + assert!(output.contains("No transcript available")); + assert!(!output.contains("## Transcript")); + } + + #[test] + fn test_format_youtube_response_truncates_long_transcript() { + let long_transcript = "a".repeat(20000); + let output = format_youtube_response( + "Long Video", + "abc", + "https://www.youtube.com/watch?v=abc", + None, + None, + Some(&long_transcript), + ); + + assert!(output.contains("[Transcript truncated]")); + assert!(output.len() < 20000); + } + + #[test] + fn test_parse_timedtext_xml() { + let xml = r#" + +Hello everyone +Welcome to this video +Let's get started +"#; + + let segments = parse_timedtext_xml(xml); + assert_eq!(segments.len(), 3); + assert_eq!(segments[0].text, "Hello everyone"); + assert_eq!(segments[1].text, "Welcome to this video"); + assert_eq!(segments[2].text, "Let's get started"); + } + + #[test] + fn test_parse_timedtext_xml_empty() { + let xml = r#""#; + let segments = parse_timedtext_xml(xml); + assert!(segments.is_empty()); + } + + #[test] + fn test_decode_xml_entities() { + assert_eq!(decode_xml_entities("a & b"), "a & b"); + assert_eq!(decode_xml_entities("<tag>"), ""); + assert_eq!(decode_xml_entities("it's"), "it's"); + } }