diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 282d23f..8fad72f 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -61,6 +61,11 @@ name = "quality_bench" harness = false path = "benches/quality_bench.rs" +[[bench]] +name = "decode_entities_bench" +harness = false +path = "benches/decode_entities_bench.rs" + [[bin]] name = "do-wdr" path = "src/main.rs" diff --git a/cli/benches/decode_entities_bench.rs b/cli/benches/decode_entities_bench.rs new file mode 100644 index 0000000..bb82318 --- /dev/null +++ b/cli/benches/decode_entities_bench.rs @@ -0,0 +1,140 @@ +use criterion::{Criterion, black_box, criterion_group, criterion_main}; + +// Current implementation +fn decode_entities_old(text: &str) -> String { + text.replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") + .replace("'", "'") + .replace(" ", " ") + .replace("©", "©") + .replace("®", "®") + .replace("™", "™") + .replace("–", "–") + .replace("—", "—") + .replace("‘", "‘") + .replace("’", "’") + .replace("“", "“") + .replace("”", "”") + .replace("[", "[") + .replace("]", "]") + .replace("⁠", "") // word joiner + .replace("&", "&") // Ampersand last to avoid double-unescaping + .replace("\u{2060}", "") // Remove word joiner +} + +// Proposed optimized implementation +fn decode_entities_optimized(text: &str) -> String { + if !text.contains('&') && !text.contains('\u{2060}') { + return text.to_string(); + } + + let mut result = String::with_capacity(text.len()); + let mut chars = text.char_indices().peekable(); + + while let Some((_, ch)) = chars.next() { + if ch == '&' { + let mut found_semi = false; + let mut end_idx = 0; + let temp_chars = chars.clone(); + + for (idx, next_ch) in temp_chars.take(10) { + if next_ch == ';' { + found_semi = true; + end_idx = idx + 1; + break; + } + } + + if found_semi { + let start_idx = if let Some(&(idx, _)) = chars.peek() { + idx + } else { + end_idx + }; + + let entity = &text[start_idx..end_idx]; + let decoded = match entity { + "lt;" => Some("<"), + "gt;" => Some(">"), + "quot;" => Some("\""), + "#x27;" | "#39;" => Some("'"), + "nbsp;" => Some(" "), + "copy;" => Some("©"), + "reg;" => Some("®"), + "trade;" => Some("™"), + "ndash;" => Some("–"), + "mdash;" => Some("—"), + "lsquo;" => Some("‘"), + "rsquo;" => Some("’"), + "ldquo;" => Some("“"), + "rdquo;" => Some("”"), + "#91;" => Some("["), + "#93;" => Some("]"), + "#8288;" => Some(""), + "amp;" => Some("&"), + _ => None, + }; + + if let Some(d) = decoded { + result.push_str(d); + // Advance main iterator to after the semicolon + while let Some(&(idx, _)) = chars.peek() { + if idx < end_idx { + chars.next(); + } else { + break; + } + } + continue; + } + } + } + + if ch == '\u{2060}' { + continue; + } + + result.push(ch); + } + + result +} + +fn bench_decode_entities(c: &mut Criterion) { + let text_with_entities = "This <is> a "test" with many & various entities like ©, ®, ™, –, —, ‘, ’, “, ”, [, ], ⁠, \u{2060}, ', ',  ."; + let text_no_entities = "This is a test with no entities at all. Just some plain text to see the overhead of the replacement mechanism when nothing matches."; + let long_text = text_with_entities.repeat(10); + + let mut group = c.benchmark_group("decode_entities"); + + group.bench_function("old_with_entities", |b| { + b.iter(|| decode_entities_old(black_box(&text_with_entities))) + }); + + group.bench_function("optimized_with_entities", |b| { + b.iter(|| decode_entities_optimized(black_box(&text_with_entities))) + }); + + group.bench_function("old_no_entities", |b| { + b.iter(|| decode_entities_old(black_box(&text_no_entities))) + }); + + group.bench_function("optimized_no_entities", |b| { + b.iter(|| decode_entities_optimized(black_box(&text_no_entities))) + }); + + group.bench_function("old_long", |b| { + b.iter(|| decode_entities_old(black_box(&long_text))) + }); + + group.bench_function("optimized_long", |b| { + b.iter(|| decode_entities_optimized(black_box(&long_text))) + }); + + group.finish(); +} + +criterion_group!(benches, bench_decode_entities); +criterion_main!(benches); diff --git a/cli/src/providers/direct_fetch.rs b/cli/src/providers/direct_fetch.rs index 06ba057..612c90c 100644 --- a/cli/src/providers/direct_fetch.rs +++ b/cli/src/providers/direct_fetch.rs @@ -92,28 +92,83 @@ impl crate::providers::UrlProvider for DirectFetchProvider { } } -/// Decode basic HTML entities +/// Decode basic HTML entities using a single-pass scanner for performance fn decode_entities(text: &str) -> String { - text.replace("<", "<") - .replace(">", ">") - .replace(""", "\"") - .replace("'", "'") - .replace("'", "'") - .replace(" ", " ") - .replace("©", "©") - .replace("®", "®") - .replace("™", "™") - .replace("–", "–") - .replace("—", "—") - .replace("‘", "‘") - .replace("’", "’") - .replace("“", "“") - .replace("”", "”") - .replace("[", "[") - .replace("]", "]") - .replace("⁠", "") // word joiner - .replace("&", "&") // Ampersand last to avoid double-unescaping - .replace("\u{2060}", "") // Remove word joiner + if !text.contains('&') && !text.contains('\u{2060}') { + return text.to_string(); + } + + let mut result = String::with_capacity(text.len()); + let mut chars = text.char_indices().peekable(); + + while let Some((_, ch)) = chars.next() { + if ch == '&' { + let mut found_semi = false; + let mut end_idx = 0; + let temp_chars = chars.clone(); + + for (idx, next_ch) in temp_chars.take(10) { + if next_ch == ';' { + found_semi = true; + end_idx = idx + 1; + break; + } + } + + if found_semi { + let start_idx = if let Some(&(idx, _)) = chars.peek() { + idx + } else { + end_idx + }; + + let entity = &text[start_idx..end_idx]; + let decoded = match entity { + "lt;" => Some("<"), + "gt;" => Some(">"), + "quot;" => Some("\""), + "#x27;" | "#39;" => Some("'"), + "nbsp;" => Some(" "), + "copy;" => Some("©"), + "reg;" => Some("®"), + "trade;" => Some("™"), + "ndash;" => Some("–"), + "mdash;" => Some("—"), + "lsquo;" => Some("‘"), + "rsquo;" => Some("’"), + "ldquo;" => Some("“"), + "rdquo;" => Some("”"), + "#91;" => Some("["), + "#93;" => Some("]"), + "#8288;" => Some(""), + "amp;" => Some("&"), + _ => None, + }; + + if let Some(d) = decoded { + result.push_str(d); + // Advance main iterator to after the semicolon + while let Some(&(idx, _)) = chars.peek() { + if idx < end_idx { + chars.next(); + } else { + break; + } + } + continue; + } + } + } + + if ch == '\u{2060}' { + // Remove word joiner + continue; + } + + result.push(ch); + } + + result } /// Get an attribute value from a tag string diff --git a/web/app/components/MetadataBar.tsx b/web/app/components/MetadataBar.tsx index 04f3a28..c218f90 100644 --- a/web/app/components/MetadataBar.tsx +++ b/web/app/components/MetadataBar.tsx @@ -71,8 +71,7 @@ export function MetadataBar({ onClick={handleCopyResult} aria-label={copied ? "Copied to clipboard" : "Copy to clipboard"} aria-live="polite" - title="Copy full result as markdown" - className={`transition-colors min-h-[36px] px-2 ${copied ? "text-accent" : "text-text-muted hover:text-foreground"}`} + className="hover:text-foreground transition-colors min-h-[36px] px-2" > {copied ? "Copied" : "Copy"} diff --git a/web/app/components/ResultCard.tsx b/web/app/components/ResultCard.tsx index f8854b7..a32403f 100644 --- a/web/app/components/ResultCard.tsx +++ b/web/app/components/ResultCard.tsx @@ -14,23 +14,21 @@ interface ResultCardProps { const ResultHeader = ({ id, title, url, normalizedUrl }: { id: string; title: string; url?: string | null; normalizedUrl?: string | null }) => (
-

- {url ? ( - - {title} - - ) : ( - - {title} - - )} -

+ {url ? ( + + {title} + + ) : ( +

+ {title} +

+ )} {normalizedUrl && (
{normalizedUrl}
)} @@ -78,11 +76,8 @@ export default function ResultCard({ result, onCopy, onHelpfulToggle, helpful }: