From 07e2fba1b1c1ac6392bcc751f8f4f445a51ea729 Mon Sep 17 00:00:00 2001 From: hexin Date: Wed, 27 May 2026 11:07:18 +0800 Subject: [PATCH] fix(web_search): decode HTML entities in Bing result URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bing wraps every SERP result URL in a `/ck/a?...&u=` click-tracking redirect, and in the raw HTML the separators are `&amp;` entities. normalize_bing_url parsed the href without decoding entities first, so extract_query_param looked for `u` while the actual key was `amp;u`. The base64 redirect target was never recovered: every result collapsed to a `bing.com` root domain, is_likely_spam_results rejected the whole batch, and Bing — the default backend — returned zero results. Decode HTML entities before extracting the redirect target. Adds a regression test. --- crates/tui/src/tools/web_search.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/crates/tui/src/tools/web_search.rs b/crates/tui/src/tools/web_search.rs index 16c7b632b..b5cc1e4e7 100644 --- a/crates/tui/src/tools/web_search.rs +++ b/crates/tui/src/tools/web_search.rs @@ -901,6 +901,14 @@ fn normalize_url(href: &str) -> String { } fn normalize_bing_url(href: &str) -> String { + // Bing wraps every SERP result URL in a `/ck/a?...&u=` click-tracking + // redirect, and in the raw HTML the separators are `&amp;` entities. Without + // decoding entities first, `extract_query_param` looks for `u` but the actual + // key is `amp;u`, so the real URL is never recovered: every result collapses to + // a `bing.com` root domain, which the spam heuristic then rejects — yielding + // zero results for the default Bing backend. Decode entities before parsing. 
+ let href = decode_html_entities(href); + let href = href.as_str(); if let Some(encoded) = extract_query_param(href, "u") { let decoded = percent_decode(&encoded); let token = decoded.strip_prefix("a1").unwrap_or(&decoded); @@ -1027,11 +1035,22 @@ fn extract_query_param(url: &str, key: &str) -> Option { mod tests { use super::{ ERROR_BODY_PREVIEW_BYTES, WebSearchEntry, WebSearchTool, decode_html_entities, - extract_search_query, is_likely_spam_results, optional_search_max_results, root_domain, - sanitize_error_body, truncate_error_body, + extract_search_query, is_likely_spam_results, normalize_bing_url, + optional_search_max_results, root_domain, sanitize_error_body, truncate_error_body, }; use serde_json::json; + // Regression guard: Bing /ck/a redirect hrefs are HTML-entity-encoded + // (`&amp;`). normalize_bing_url must decode entities before extracting the + // `u=` base64 payload, otherwise the real URL is never recovered and the + // result's root domain collapses to bing.com (then dropped as spam → 0 + // results for the default Bing backend). + #[test] + fn bing_ckurl_with_html_entities_decodes_real_url() { + let href = "https://www.bing.com/ck/a?!&amp;&amp;p=abc&amp;u=a1aHR0cHM6Ly9ydXN0LWxhbmcub3JnLw&amp;ntb=1"; + assert_eq!(normalize_bing_url(href), "https://rust-lang.org/"); + } + fn entry(url: &str) -> WebSearchEntry { WebSearchEntry { title: "x".into(),