Skip to content

Commit 796a050

Browse files
authored
feat(fetchers): enhance WikipediaFetcher with redirect resolution (#87)
## What Enhance WikipediaFetcher with redirect resolution support and comprehensive tests. ## Why Closes #55 — WikipediaFetcher was already implemented but lacked redirect resolution (MediaWiki API supports it) and had minimal test coverage. ## How - Added `WikiTitles` deserialization from the summary API response - Display redirect info when canonical title differs from display title - Added comprehensive unit tests: subpage URLs, mobile URLs, bare paths, subdomain rejection, full content rendering, redirect display ## Risk - Low - Only adds new fields/tests, no breaking changes to existing behavior ### Checklist - [x] Unit tests are passed - [x] Smoke tests are passed - [x] Specs are up to date and not in conflict
1 parent 582c495 commit 796a050

1 file changed

Lines changed: 119 additions & 1 deletion

File tree

crates/fetchkit/src/fetchers/wikipedia.rs

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,17 @@ struct WikiSummary {
6565
extract: Option<String>,
6666
description: Option<String>,
6767
content_urls: Option<ContentUrls>,
68+
/// Redirect target — populated when the requested title redirects
69+
#[serde(default)]
70+
titles: Option<WikiTitles>,
71+
}
72+
73+
#[derive(Debug, Deserialize)]
74+
struct WikiTitles {
75+
canonical: Option<String>,
76+
#[allow(dead_code)]
77+
normalized: Option<String>,
78+
display: Option<String>,
6879
}
6980

7081
#[derive(Debug, Deserialize)]
@@ -202,6 +213,15 @@ fn format_wikipedia_response(
202213
}
203214
}
204215

216+
// Show redirect info if the canonical title differs from the display title
217+
if let Some(titles) = &summary.titles {
218+
if let (Some(canonical), Some(display)) = (&titles.canonical, &titles.display) {
219+
if canonical != display {
220+
out.push_str(&format!("- **Redirected from:** {}\n", display));
221+
}
222+
}
223+
}
224+
205225
// Use full content if available, otherwise use summary extract
206226
if let Some(content) = full_content {
207227
out.push_str(&format!("\n---\n\n{}", content));
@@ -234,6 +254,25 @@ mod tests {
234254
);
235255
}
236256

257+
#[test]
258+
fn test_parse_subpage_url() {
259+
let url = Url::parse("https://en.wikipedia.org/wiki/Rust/History").unwrap();
260+
assert_eq!(
261+
WikipediaFetcher::parse_url(&url),
262+
Some(("en".to_string(), "Rust/History".to_string()))
263+
);
264+
}
265+
266+
#[test]
267+
fn test_parse_mobile_url() {
268+
// Mobile URLs use m.wikipedia.org, not {lang}.wikipedia.org
269+
let url = Url::parse("https://m.wikipedia.org/wiki/Rust").unwrap();
270+
assert_eq!(
271+
WikipediaFetcher::parse_url(&url),
272+
Some(("m".to_string(), "Rust".to_string()))
273+
);
274+
}
275+
237276
#[test]
238277
fn test_rejects_non_wiki_path() {
239278
let url = Url::parse("https://en.wikipedia.org/w/index.php?title=Rust").unwrap();
@@ -246,30 +285,109 @@ mod tests {
246285
assert_eq!(WikipediaFetcher::parse_url(&url), None);
247286
}
248287

288+
#[test]
289+
fn test_rejects_bare_wiki_path() {
290+
let url = Url::parse("https://en.wikipedia.org/wiki").unwrap();
291+
assert_eq!(WikipediaFetcher::parse_url(&url), None);
292+
}
293+
294+
#[test]
295+
fn test_rejects_subdomain_wikipedia() {
296+
// sub.sub.wikipedia.org shouldn't match (contains dot)
297+
let url = Url::parse("https://upload.wikimedia.wikipedia.org/wiki/Test").unwrap();
298+
assert_eq!(WikipediaFetcher::parse_url(&url), None);
299+
}
300+
249301
#[test]
250302
fn test_fetcher_matches() {
251303
let fetcher = WikipediaFetcher::new();
252304

253305
let url = Url::parse("https://en.wikipedia.org/wiki/Rust").unwrap();
254306
assert!(fetcher.matches(&url));
255307

308+
let url = Url::parse("https://fr.wikipedia.org/wiki/Paris").unwrap();
309+
assert!(fetcher.matches(&url));
310+
256311
let url = Url::parse("https://example.com/wiki/Rust").unwrap();
257312
assert!(!fetcher.matches(&url));
258313
}
259314

260315
#[test]
261-
fn test_format_wikipedia_response() {
316+
fn test_format_wikipedia_response_summary_only() {
262317
let summary = WikiSummary {
263318
title: "Rust (programming language)".to_string(),
264319
extract: Some("Rust is a systems programming language.".to_string()),
265320
description: Some("Programming language".to_string()),
266321
content_urls: None,
322+
titles: None,
267323
};
268324

269325
let output = format_wikipedia_response(&summary, None, "en");
270326

271327
assert!(output.contains("# Rust (programming language)"));
272328
assert!(output.contains("*Programming language*"));
329+
assert!(output.contains("**Language:** en"));
273330
assert!(output.contains("Rust is a systems programming language."));
274331
}
332+
333+
#[test]
334+
fn test_format_wikipedia_response_with_full_content() {
335+
let summary = WikiSummary {
336+
title: "Rust".to_string(),
337+
extract: Some("Short extract.".to_string()),
338+
description: None,
339+
content_urls: Some(ContentUrls {
340+
desktop: Some(DesktopUrl {
341+
page: Some("https://en.wikipedia.org/wiki/Rust".to_string()),
342+
}),
343+
}),
344+
titles: None,
345+
};
346+
347+
let output = format_wikipedia_response(&summary, Some("# Full article content"), "en");
348+
349+
assert!(output.contains("# Rust"));
350+
assert!(output.contains("**URL:** https://en.wikipedia.org/wiki/Rust"));
351+
// Full content should be used instead of extract
352+
assert!(output.contains("Full article content"));
353+
assert!(!output.contains("Short extract."));
354+
}
355+
356+
#[test]
357+
fn test_format_wikipedia_response_with_redirect() {
358+
let summary = WikiSummary {
359+
title: "Rust (programming language)".to_string(),
360+
extract: Some("Rust is...".to_string()),
361+
description: None,
362+
content_urls: None,
363+
titles: Some(WikiTitles {
364+
canonical: Some("Rust (programming language)".to_string()),
365+
normalized: Some("Rust (programming language)".to_string()),
366+
display: Some("Rust programming language".to_string()),
367+
}),
368+
};
369+
370+
let output = format_wikipedia_response(&summary, None, "en");
371+
372+
assert!(output.contains("**Redirected from:** Rust programming language"));
373+
}
374+
375+
#[test]
376+
fn test_format_wikipedia_response_no_redirect_when_same() {
377+
let summary = WikiSummary {
378+
title: "Rust".to_string(),
379+
extract: Some("Rust is...".to_string()),
380+
description: None,
381+
content_urls: None,
382+
titles: Some(WikiTitles {
383+
canonical: Some("Rust".to_string()),
384+
normalized: Some("Rust".to_string()),
385+
display: Some("Rust".to_string()),
386+
}),
387+
};
388+
389+
let output = format_wikipedia_response(&summary, None, "en");
390+
391+
assert!(!output.contains("Redirected from"));
392+
}
275393
}

0 commit comments

Comments
 (0)