Skip to content

Commit 1d8fb28

Browse files
committed
feat: Enhanced text normalization for search and audit
1 parent c356b7b commit 1d8fb28

7 files changed

Lines changed: 333 additions & 50 deletions

File tree

src/components/Features/SearchContent.spec.ts

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1456,6 +1456,66 @@ describe('SearchContent.vue', () => {
14561456
const markCount = (matchValue.html().match(/<mark>/g) || []).length
14571457
expect(markCount).toBe(3)
14581458
})
1459+
1460+
it('should highlight normalized character variants', async () => {
1461+
mockSearchContent.mockResolvedValue({
1462+
success: true,
1463+
results: [
1464+
{
1465+
title: 'Test',
1466+
slug: 'test',
1467+
sourceType: 'Blog',
1468+
matches: [
1469+
{ path: 'price', value: '£100 or £50', count: 2 },
1470+
{ path: 'quote', value: '"Hello" and "World"', count: 2 },
1471+
],
1472+
},
1473+
],
1474+
totalItems: 1,
1475+
failedScopes: [],
1476+
})
1477+
1478+
const wrapper = mountComponent()
1479+
const store = useStore()
1480+
store.token = 'test-token'
1481+
store.selectedScopes.blog = true
1482+
1483+
// Search for £ - should highlight both £ symbols
1484+
await wrapper.find('#search-content-search-term').setValue('£')
1485+
await wrapper
1486+
.findAll('button')
1487+
.find((btn) => btn.text() === 'Search')
1488+
?.trigger('click')
1489+
await flushPromises()
1490+
await nextTick()
1491+
1492+
const priceMatch = wrapper.findAll('.search-content__match-value')[0]
1493+
expect(priceMatch).toBeDefined()
1494+
const priceHtml = priceMatch!.html()
1495+
// Should highlight both £ symbols
1496+
expect(priceHtml).toContain('<mark>')
1497+
expect((priceHtml.match(/<mark>/g) || []).length).toBe(2)
1498+
1499+
// Reset and search for quotes
1500+
await wrapper.find('button[type="reset"]').trigger('click')
1501+
await nextTick()
1502+
1503+
// Search for " - should highlight both quote pairs
1504+
await wrapper.find('#search-content-search-term').setValue('"')
1505+
await wrapper
1506+
.findAll('button')
1507+
.find((btn) => btn.text() === 'Search')
1508+
?.trigger('click')
1509+
await flushPromises()
1510+
await nextTick()
1511+
1512+
const quoteMatch = wrapper.findAll('.search-content__match-value')[1]
1513+
expect(quoteMatch).toBeDefined()
1514+
const quoteHtml = quoteMatch!.html()
1515+
// Should highlight all 4 quote marks
1516+
expect(quoteHtml).toContain('<mark>')
1517+
expect((quoteHtml.match(/<mark>/g) || []).length).toBe(4)
1518+
})
14591519
})
14601520

14611521
describe('Status Messages - Info Banner', () => {

src/components/Features/SearchContent.vue

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ import InfoBanner from '../InfoBanner.vue'
121121
import Chip from '../Chip.vue'
122122
import Toggle from '../Toggle.vue'
123123
import { searchContent } from '@/features/searchContent'
124+
import { normalizeWhitespace } from '@/utils/textNormalization'
124125
import type { AsyncReturnType } from 'type-fest'
125126
126127
const Card = defineAsyncComponent(() => import('../Card.vue'))
@@ -151,10 +152,44 @@ function escapeHtml(str: string): string {
151152
return div.innerHTML
152153
}
153154
155+
/**
 * Canonical characters mapped to every spelling that should be treated as the
 * same character when highlighting: the literal character, its HTML entity
 * forms, and typographic Unicode look-alikes.
 *
 * A Map is used instead of a plain object so that inherited property names
 * (for example 'constructor') can never be mistaken for table entries.
 */
const CHAR_VARIANT_TABLE: ReadonlyMap<string, readonly string[]> = new Map<string, readonly string[]>([
  ["'", ["'", '&apos;', '&#39;', '\u2018', '\u2019']],
  ['"', ['"', '&quot;', '\u201C', '\u201D']],
  ['-', ['-', '&ndash;', '&mdash;', '\u2013', '\u2014']],
  ['£', ['£', '&pound;']],
  // Euro sign written as an escape so the entry survives encoding round-trips.
  ['\u20AC', ['\u20AC', '&euro;']],
  ['&', ['&', '&amp;']],
  ['<', ['<', '&lt;']],
  ['>', ['>', '&gt;']],
  [' ', [' ', '&nbsp;', '\u00A0']],
])

/**
 * Creates a regex pattern that matches a character and all its normalized variants.
 * For example, '£' should match both '£' and '&pound;'.
 *
 * @param char - A single character of the (already normalized) search term.
 * @returns A regex source fragment: a non-capturing alternation of every known
 *   variant, or the regex-escaped input when the character has no variants.
 */
function createVariantPattern(char: string): string {
  const escapeRegExp = (value: string): string => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

  const variants = CHAR_VARIANT_TABLE.get(char)

  // No known variants: just escape and match the character itself
  if (variants === undefined) {
    return escapeRegExp(char)
  }

  // Longest variants first: regex alternation takes the first branch that
  // matches, so '&' listed before '&amp;' would mark only the leading '&' of
  // an escaped entity and split it in two.
  const alternatives = [...variants].sort((a, b) => b.length - a.length).map(escapeRegExp)

  return `(?:${alternatives.join('|')})`
}
181+
154182
/**
 * Wraps every occurrence of the search term inside `text` in `<mark>` tags.
 *
 * The text is passed through `escapeHtml` first, so matching runs against the
 * escaped markup. The search term is trimmed and passed through
 * `normalizeWhitespace`, then expanded character by character with
 * `createVariantPattern` so each character also matches its variant forms.
 *
 * @param text - Raw match value to display.
 * @param searchTerm - The term the user searched for.
 * @returns The escaped text with matches wrapped in `<mark>…</mark>`; the
 *   escaped text unchanged when the search term is empty after trimming.
 */
function highlightMatches(text: string, searchTerm: string): string {
  const escapedText = escapeHtml(text)

  // Normalize the search term to get the canonical form
  const normalizedSearch = normalizeWhitespace(searchTerm.trim())

  // Build a regex pattern that matches the search term and all its variant forms
  const pattern = Array.from(normalizedSearch, (char) => createVariantPattern(char)).join('')

  // An empty pattern matches at every position and would inject an empty
  // <mark></mark> pair between all characters, so there is nothing to highlight.
  if (pattern.length === 0) {
    return escapedText
  }

  // 'g' replaces every occurrence, 'i' ignores letter case
  const regex = new RegExp(`(${pattern})`, 'gi')
  return escapedText.replace(regex, '<mark>$1</mark>')
}
160195

src/components/WhatsNew.vue

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ interface Feature {
4040
const showModal = ref<boolean>(false)
4141
4242
const features: Feature[] = [
43+
{
44+
id: 'enhanced-text-normalization',
45+
type: 'improvement',
46+
title: 'Enhanced text normalization for search and audit',
47+
description:
48+
'Search and Audit utilities now normalize many more character variations for better matching. This includes HTML entities (<code>&amp;quot;</code>, <code>&amp;pound;</code>, <code>&amp;euro;</code>, <code>&amp;amp;</code>), fancy Unicode quotes (<code>“</code> <code>”</code> <code>‘</code> <code>’</code>), and various dashes (<code>–</code> <code>—</code>). For example, searching for <code>"quote"</code> will now match <code>&amp;quot;quote&amp;quot;</code> and <code>“quote”</code>. The logic is also now shared between both utilities to reduce code duplication.',
49+
utcDatetimeAdded: new Date('2026-02-14T18:30:00Z'),
50+
},
4351
{
4452
id: 'audit-html-bloat',
4553
type: 'feature',

src/features/audit.ts

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { getAllPages } from '@/core/pages'
22
import { getAllPosts } from '@/core/posts'
33
import { getAllCollections } from '@/core/collections'
4+
import { normalizeWhitespace, createContextSnippet } from '@/utils/textNormalization'
45

56
/**
67
* Predefined patterns for detecting HTML bloat from various sources.
@@ -132,30 +133,6 @@ interface MatchAccumulator {
132133
positions?: number[] // Track match positions for deduplication
133134
}
134135

135-
function normalizeWhitespace(str: string): string {
136-
return str
137-
.replace(/&nbsp;/gi, ' ')
138-
.replace(/\u00A0/g, ' ')
139-
.replace(/\s+/g, ' ')
140-
}
141-
142-
function createContextSnippet(
143-
normalizedText: string,
144-
matchIndex: number,
145-
matchLength: number,
146-
contextSize = 100,
147-
): string {
148-
const contextStart = Math.max(0, matchIndex - contextSize)
149-
const contextEnd = Math.min(normalizedText.length, matchIndex + matchLength + contextSize)
150-
151-
let snippet = normalizedText.substring(contextStart, contextEnd)
152-
153-
if (contextStart > 0) snippet = '...' + snippet
154-
if (contextEnd < normalizedText.length) snippet = snippet + '...'
155-
156-
return snippet
157-
}
158-
159136
function searchObjectForPattern(
160137
obj: unknown,
161138
pattern: string,

src/features/searchContent.ts

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { getAllPages } from '@/core/pages'
22
import { getAllPosts } from '@/core/posts'
33
import { getAllCollections } from '@/core/collections'
4+
import { normalizeWhitespace, createContextSnippet } from '@/utils/textNormalization'
45

56
interface SearchResponse {
67
success: boolean
@@ -21,30 +22,6 @@ interface MatchAccumulator {
2122
count: number
2223
}
2324

24-
function normalizeWhitespace(str: string): string {
25-
return str
26-
.replace(/&nbsp;/gi, ' ')
27-
.replace(/\u00A0/g, ' ')
28-
.replace(/\s+/g, ' ')
29-
}
30-
31-
function createContextSnippet(
32-
normalizedText: string,
33-
matchIndex: number,
34-
matchLength: number,
35-
contextSize = 100,
36-
): string {
37-
const contextStart = Math.max(0, matchIndex - contextSize)
38-
const contextEnd = Math.min(normalizedText.length, matchIndex + matchLength + contextSize)
39-
40-
let snippet = normalizedText.substring(contextStart, contextEnd)
41-
42-
if (contextStart > 0) snippet = '...' + snippet
43-
if (contextEnd < normalizedText.length) snippet = snippet + '...'
44-
45-
return snippet
46-
}
47-
4825
function searchObject(
4926
obj: unknown,
5027
searchLower: string,
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
import { describe, it, expect } from 'vitest'
2+
import { normalizeWhitespace, createContextSnippet } from './textNormalization'
3+
4+
describe('normalizeWhitespace', () => {
5+
describe('HTML entity normalization', () => {
6+
it('should normalize &nbsp; to regular space', () => {
7+
expect(normalizeWhitespace('Hello&nbsp;World')).toBe('Hello World')
8+
expect(normalizeWhitespace('Hello&NBSP;World')).toBe('Hello World')
9+
})
10+
11+
it('should normalize Unicode non-breaking space to regular space', () => {
12+
expect(normalizeWhitespace('Hello\u00A0World')).toBe('Hello World')
13+
})
14+
15+
it('should normalize &quot; to double quotes', () => {
16+
expect(normalizeWhitespace('&quot;Hello&quot;')).toBe('"Hello"')
17+
expect(normalizeWhitespace('&QUOT;Hello&QUOT;')).toBe('"Hello"')
18+
})
19+
20+
it('should normalize &apos; and &#39; to single quotes', () => {
21+
expect(normalizeWhitespace('&apos;Hello&apos;')).toBe("'Hello'")
22+
expect(normalizeWhitespace('&#39;Hello&#39;')).toBe("'Hello'")
23+
})
24+
25+
it('should normalize &pound; to £', () => {
26+
expect(normalizeWhitespace('&pound;100')).toBe('£100')
27+
expect(normalizeWhitespace('&POUND;100')).toBe('£100')
28+
})
29+
30+
it('should normalize &euro; to €', () => {
31+
expect(normalizeWhitespace('&euro;50')).toBe('€50')
32+
expect(normalizeWhitespace('&EURO;50')).toBe('€50')
33+
})
34+
35+
it('should normalize &amp; to &', () => {
36+
expect(normalizeWhitespace('Fish &amp; Chips')).toBe('Fish & Chips')
37+
expect(normalizeWhitespace('Fish &AMP; Chips')).toBe('Fish & Chips')
38+
})
39+
40+
it('should normalize &lt; and &gt; to < and >', () => {
41+
expect(normalizeWhitespace('&lt;div&gt;')).toBe('<div>')
42+
expect(normalizeWhitespace('&LT;div&GT;')).toBe('<div>')
43+
})
44+
45+
it('should normalize &ndash; and &mdash; to regular dash', () => {
46+
expect(normalizeWhitespace('2020&ndash;2021')).toBe('2020-2021')
47+
expect(normalizeWhitespace('Hello&mdash;World')).toBe('Hello-World')
48+
})
49+
})
50+
51+
describe('Unicode character normalization', () => {
52+
it('should normalize fancy single quotes to regular apostrophe', () => {
53+
const input = '\u2018Hello\u2019'
54+
const result = normalizeWhitespace(input)
55+
expect(result).toBe("'Hello'")
56+
// Verify it's a regular apostrophe (U+0027), not fancy quotes
57+
expect(result.charCodeAt(0)).toBe(0x0027)
58+
expect(result.charCodeAt(result.length - 1)).toBe(0x0027)
59+
})
60+
61+
it('should normalize fancy double quotes to regular quotes', () => {
62+
const input = '\u201CHello\u201D'
63+
const result = normalizeWhitespace(input)
64+
expect(result).toBe('"Hello"')
65+
// Verify it's a regular quote (U+0022), not fancy quotes
66+
expect(result.charCodeAt(0)).toBe(0x0022)
67+
expect(result.charCodeAt(result.length - 1)).toBe(0x0022)
68+
})
69+
70+
it('should normalize en-dash and em-dash to regular dash', () => {
71+
expect(normalizeWhitespace('2020\u20132021')).toBe('2020-2021') // en-dash
72+
expect(normalizeWhitespace('Hello\u2014World')).toBe('Hello-World') // em-dash
73+
})
74+
})
75+
76+
describe('Whitespace collapsing', () => {
77+
it('should collapse multiple spaces to single space', () => {
78+
expect(normalizeWhitespace('Hello    World')).toBe('Hello World')
79+
})
80+
81+
it('should collapse multiple whitespace types to single space', () => {
82+
expect(normalizeWhitespace('Hello\n\t World')).toBe('Hello World')
83+
})
84+
85+
it('should handle mixed whitespace and HTML entities', () => {
86+
expect(normalizeWhitespace('Hello&nbsp;&nbsp; World')).toBe('Hello World')
87+
})
88+
})
89+
90+
describe('Combined normalizations', () => {
91+
it('should normalize complex mixed content', () => {
92+
const input = '&quot;Hello&nbsp;&nbsp;World&quot;&mdash;&pound;100'
93+
const expected = '"Hello World"-£100'
94+
expect(normalizeWhitespace(input)).toBe(expected)
95+
})
96+
97+
it('should handle empty string', () => {
98+
expect(normalizeWhitespace('')).toBe('')
99+
})
100+
101+
it('should handle string with no entities or special chars', () => {
102+
expect(normalizeWhitespace('Hello World')).toBe('Hello World')
103+
})
104+
})
105+
})
106+
107+
describe('createContextSnippet', () => {
108+
it('should create snippet with context around match', () => {
109+
const text = 'The quick brown fox jumps over the lazy dog'
110+
const matchIndex = text.indexOf('fox')
111+
const result = createContextSnippet(text, matchIndex, 3, 10)
112+
expect(result).toBe('...ick brown fox jumps ove...')
113+
})
114+
115+
it('should not add leading ellipsis when match is near start', () => {
116+
const text = 'The quick brown fox jumps over the lazy dog'
117+
const matchIndex = text.indexOf('quick')
118+
const result = createContextSnippet(text, matchIndex, 5, 10)
119+
expect(result).toBe('The quick brown fox...')
120+
})
121+
122+
it('should not add trailing ellipsis when match is near end', () => {
123+
const text = 'The quick brown fox jumps over the lazy dog'
124+
const matchIndex = text.indexOf('lazy')
125+
const result = createContextSnippet(text, matchIndex, 4, 10)
126+
expect(result).toBe('... over the lazy dog')
127+
})
128+
129+
it('should handle match at start of text', () => {
130+
const text = 'The quick brown fox'
131+
const matchIndex = 0
132+
const result = createContextSnippet(text, matchIndex, 3, 10)
133+
expect(result).toBe('The quick bro...')
134+
})
135+
136+
it('should handle match at end of text', () => {
137+
const text = 'The quick brown fox'
138+
const matchIndex = text.indexOf('fox')
139+
const result = createContextSnippet(text, matchIndex, 3, 10)
140+
expect(result).toBe('...ick brown fox')
141+
})
142+
143+
it('should include entire text if shorter than context size', () => {
144+
const text = 'Short text'
145+
const matchIndex = text.indexOf('text')
146+
const result = createContextSnippet(text, matchIndex, 4, 100)
147+
expect(result).toBe('Short text')
148+
})
149+
150+
it('should use default context size of 100', () => {
151+
const text = 'a'.repeat(300)
152+
const matchIndex = 150
153+
const result = createContextSnippet(text, matchIndex, 1)
154+
// Should be: 100 chars before + 1 match + 100 chars after = 201 chars
155+
// Plus ellipsis on both ends = 207 chars
156+
expect(result).toHaveLength(207)
157+
expect(result.startsWith('...')).toBe(true)
158+
expect(result.endsWith('...')).toBe(true)
159+
})
160+
161+
it('should handle custom context size', () => {
162+
const text = 'a'.repeat(100)
163+
const matchIndex = 50
164+
const result = createContextSnippet(text, matchIndex, 1, 20)
165+
// Should be: 20 chars before + 1 match + 20 chars after = 41 chars
166+
// Plus ellipsis on both ends = 47 chars
167+
expect(result).toHaveLength(47)
168+
})
169+
})

0 commit comments

Comments
 (0)