feat: Enhanced text normalization for search and audit

jackdomleo7 · jackdomleo7 · commit 1d8fb28676cc · 2026-02-14T18:27:54.000Z
diff --git a/src/components/Features/SearchContent.spec.ts b/src/components/Features/SearchContent.spec.ts
@@ -1456,6 +1456,66 @@ describe('SearchContent.vue', () => {
       const markCount = (matchValue.html().match(/<mark>/g) || []).length
       expect(markCount).toBe(3)
     })
+
+    it('should highlight normalized character variants', async () => {
+      mockSearchContent.mockResolvedValue({
+        success: true,
+        results: [
+          {
+            title: 'Test',
+            slug: 'test',
+            sourceType: 'Blog',
+            matches: [
+              { path: 'price', value: '£100 or £50', count: 2 },
+              { path: 'quote', value: '"Hello" and "World"', count: 2 },
+            ],
+          },
+        ],
+        totalItems: 1,
+        failedScopes: [],
+      })
+
+      const wrapper = mountComponent()
+      const store = useStore()
+      store.token = 'test-token'
+      store.selectedScopes.blog = true
+
+      // Search for £: the price fixture holds two literal £ characters (no entity spellings are exercised here)
+      await wrapper.find('#search-content-search-term').setValue('£')
+      await wrapper
+        .findAll('button')
+        .find((btn) => btn.text() === 'Search')
+        ?.trigger('click')
+      await flushPromises()
+      await nextTick()
+
+      const priceMatch = wrapper.findAll('.search-content__match-value')[0]
+      expect(priceMatch).toBeDefined()
+      const priceHtml = priceMatch!.html()
+      // Each £ in '£100 or £50' should be wrapped in its own <mark>
+      expect(priceHtml).toContain('<mark>')
+      expect((priceHtml.match(/<mark>/g) || []).length).toBe(2)
+
+      // Reset the form, then run a second search for the double-quote character
+      await wrapper.find('button[type="reset"]').trigger('click')
+      await nextTick()
+
+      // Search for ": the quote fixture holds four straight double quotes (two pairs), no curly or entity forms
+      await wrapper.find('#search-content-search-term').setValue('"')
+      await wrapper
+        .findAll('button')
+        .find((btn) => btn.text() === 'Search')
+        ?.trigger('click')
+      await flushPromises()
+      await nextTick()
+
+      const quoteMatch = wrapper.findAll('.search-content__match-value')[1]
+      expect(quoteMatch).toBeDefined()
+      const quoteHtml = quoteMatch!.html()
+      // All four quote characters should each be wrapped in <mark>
+      expect(quoteHtml).toContain('<mark>')
+      expect((quoteHtml.match(/<mark>/g) || []).length).toBe(4)
+    })
   })
 
   describe('Status Messages - Info Banner', () => {
diff --git a/src/components/Features/SearchContent.vue b/src/components/Features/SearchContent.vue
@@ -121,6 +121,7 @@ import InfoBanner from '../InfoBanner.vue'
 import Chip from '../Chip.vue'
 import Toggle from '../Toggle.vue'
 import { searchContent } from '@/features/searchContent'
+import { normalizeWhitespace } from '@/utils/textNormalization'
 import type { AsyncReturnType } from 'type-fest'
 
 const Card = defineAsyncComponent(() => import('../Card.vue'))
@@ -151,10 +152,44 @@ function escapeHtml(str: string): string {
   return div.innerHTML
 }
 
+/**
+ * Builds a regex source fragment that matches `char` or any of its known
+ * equivalent spellings (HTML entities and typographic Unicode forms).
+ * For example, '£' yields a group matching both '£' and '&pound;'.
+ *
+ * Alternatives are emitted longest-first: regex alternation commits to the
+ * first alternative that matches, so listing '&' ahead of '&amp;' would match
+ * only the leading '&' of an '&amp;' entity and leave 'amp;' outside the match.
+ *
+ * NOTE(review): highlightMatches applies this pattern to HTML-escaped text;
+ * presumably a literal '&pound;' in the source text is escaped to
+ * '&amp;pound;' there — confirm whether the entity variants are reachable.
+ *
+ * @param char - The character to build a pattern for.
+ * @returns A non-capturing alternation when `char` has known variants,
+ *   otherwise `char` with regex metacharacters escaped.
+ */
+function createVariantPattern(char: string): string {
+  const escapeForRegex = (value: string): string => value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
+
+  const variants: Record<string, string[]> = {
+    "'": ["'", '&apos;', '&#39;', '\u2018', '\u2019'],
+    '"': ['"', '&quot;', '\u201C', '\u201D'],
+    '-': ['-', '&ndash;', '&mdash;', '\u2013', '\u2014'],
+    '£': ['£', '&pound;'],
+    '€': ['€', '&euro;'],
+    '&': ['&', '&amp;'],
+    '<': ['<', '&lt;'],
+    '>': ['>', '&gt;'],
+    ' ': [' ', '&nbsp;', '\u00A0'],
+  }
+
+  // Own-property check so keys inherited from Object.prototype (e.g. 'constructor')
+  // are never mistaken for a variant list
+  const known = Object.prototype.hasOwnProperty.call(variants, char) ? variants[char] : undefined
+  if (known === undefined) {
+    // No known variants: match the character itself, escaped for regex use
+    return escapeForRegex(char)
+  }
+
+  // Copy before sorting so the table order is untouched; longest alternative first
+  const alternatives = [...known].sort((a, b) => b.length - a.length).map(escapeForRegex)
+  return `(?:${alternatives.join('|')})`
+}
+
 function highlightMatches(text: string, searchTerm: string): string {
   const escapedText = escapeHtml(text)
-  const escapedSearchTerm = searchTerm.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
-  const regex = new RegExp(`(${escapedSearchTerm})`, 'gi')
+
+  // Normalize the search term to get the canonical form
+  const normalizedSearch = normalizeWhitespace(searchTerm.trim())
+
+  // Build a regex pattern that matches the search term and all its variant forms
+  const patternParts = Array.from(normalizedSearch).map(createVariantPattern)
+  const pattern = patternParts.join('')
+
+  const regex = new RegExp(`(${pattern})`, 'gi')
   return escapedText.replace(regex, '<mark>$1</mark>')
 }
 
diff --git a/src/components/WhatsNew.vue b/src/components/WhatsNew.vue
@@ -40,6 +40,14 @@ interface Feature {
 const showModal = ref<boolean>(false)
 
 const features: Feature[] = [
+  {
+    id: 'enhanced-text-normalization',
+    type: 'improvement',
+    title: 'Enhanced text normalization for search and audit',
+    // The curly quotes in the description are intentional literal examples
+    // (U+201C, U+201D, U+2018, U+2019) — do not straighten them.
+    description:
+      'Search and Audit utilities now normalize many more character variations for better matching. This includes HTML entities (<code>&amp;quot;</code>, <code>&amp;pound;</code>, <code>&amp;euro;</code>, <code>&amp;amp;</code>), fancy Unicode quotes (<code>“</code> <code>”</code> <code>‘</code> <code>’</code>), and various dashes (<code>–</code> <code>—</code>). For example, searching for <code>"quote"</code> will now match <code>&amp;quot;quote&amp;quot;</code> and <code>“quote”</code>. The logic is also now shared between both utilities to reduce code duplication.',
+    utcDatetimeAdded: new Date('2026-02-14T18:30:00Z'),
+  },
   {
     id: 'audit-html-bloat',
     type: 'feature',
diff --git a/src/features/audit.ts b/src/features/audit.ts
@@ -1,6 +1,7 @@
 import { getAllPages } from '@/core/pages'
 import { getAllPosts } from '@/core/posts'
 import { getAllCollections } from '@/core/collections'
+import { normalizeWhitespace, createContextSnippet } from '@/utils/textNormalization'
 
 /**
  * Predefined patterns for detecting HTML bloat from various sources.
@@ -132,30 +133,6 @@ interface MatchAccumulator {
   positions?: number[] // Track match positions for deduplication
 }
 
-function normalizeWhitespace(str: string): string {
-  return str
-    .replace(/&nbsp;/gi, ' ')
-    .replace(/\u00A0/g, ' ')
-    .replace(/\s+/g, ' ')
-}
-
-function createContextSnippet(
-  normalizedText: string,
-  matchIndex: number,
-  matchLength: number,
-  contextSize = 100,
-): string {
-  const contextStart = Math.max(0, matchIndex - contextSize)
-  const contextEnd = Math.min(normalizedText.length, matchIndex + matchLength + contextSize)
-
-  let snippet = normalizedText.substring(contextStart, contextEnd)
-
-  if (contextStart > 0) snippet = '...' + snippet
-  if (contextEnd < normalizedText.length) snippet = snippet + '...'
-
-  return snippet
-}
-
 function searchObjectForPattern(
   obj: unknown,
   pattern: string,
diff --git a/src/features/searchContent.ts b/src/features/searchContent.ts
@@ -1,6 +1,7 @@
 import { getAllPages } from '@/core/pages'
 import { getAllPosts } from '@/core/posts'
 import { getAllCollections } from '@/core/collections'
+import { normalizeWhitespace, createContextSnippet } from '@/utils/textNormalization'
 
 interface SearchResponse {
   success: boolean
@@ -21,30 +22,6 @@ interface MatchAccumulator {
   count: number
 }
 
-function normalizeWhitespace(str: string): string {
-  return str
-    .replace(/&nbsp;/gi, ' ')
-    .replace(/\u00A0/g, ' ')
-    .replace(/\s+/g, ' ')
-}
-
-function createContextSnippet(
-  normalizedText: string,
-  matchIndex: number,
-  matchLength: number,
-  contextSize = 100,
-): string {
-  const contextStart = Math.max(0, matchIndex - contextSize)
-  const contextEnd = Math.min(normalizedText.length, matchIndex + matchLength + contextSize)
-
-  let snippet = normalizedText.substring(contextStart, contextEnd)
-
-  if (contextStart > 0) snippet = '...' + snippet
-  if (contextEnd < normalizedText.length) snippet = snippet + '...'
-
-  return snippet
-}
-
 function searchObject(
   obj: unknown,
   searchLower: string,
diff --git a/src/utils/textNormalization.spec.ts b/src/utils/textNormalization.spec.ts
@@ -0,0 +1,169 @@
+import { describe, it, expect } from 'vitest'
+import { normalizeWhitespace, createContextSnippet } from './textNormalization'
+
+describe('normalizeWhitespace', () => {
+  // Asserts, in order, that each [input, expected] pair normalizes as stated.
+  const expectAll = (cases: ReadonlyArray<readonly [string, string]>): void => {
+    for (const [raw, normalized] of cases) {
+      expect(normalizeWhitespace(raw)).toBe(normalized)
+    }
+  }
+
+  describe('HTML entity normalization', () => {
+    it('should normalize &nbsp; to regular space', () => {
+      expectAll([
+        ['Hello&nbsp;World', 'Hello World'],
+        ['Hello&NBSP;World', 'Hello World'],
+      ])
+    })
+
+    it('should normalize Unicode non-breaking space to regular space', () => {
+      expectAll([['Hello\u00A0World', 'Hello World']])
+    })
+
+    it('should normalize &quot; to double quotes', () => {
+      expectAll([
+        ['&quot;Hello&quot;', '"Hello"'],
+        ['&QUOT;Hello&QUOT;', '"Hello"'],
+      ])
+    })
+
+    it('should normalize &apos; and &#39; to single quotes', () => {
+      expectAll([
+        ['&apos;Hello&apos;', "'Hello'"],
+        ['&#39;Hello&#39;', "'Hello'"],
+      ])
+    })
+
+    it('should normalize &pound; to £', () => {
+      expectAll([
+        ['&pound;100', '£100'],
+        ['&POUND;100', '£100'],
+      ])
+    })
+
+    it('should normalize &euro; to €', () => {
+      expectAll([
+        ['&euro;50', '€50'],
+        ['&EURO;50', '€50'],
+      ])
+    })
+
+    it('should normalize &amp; to &', () => {
+      expectAll([
+        ['Fish &amp; Chips', 'Fish & Chips'],
+        ['Fish &AMP; Chips', 'Fish & Chips'],
+      ])
+    })
+
+    it('should normalize &lt; and &gt; to < and >', () => {
+      expectAll([
+        ['&lt;div&gt;', '<div>'],
+        ['&LT;div&GT;', '<div>'],
+      ])
+    })
+
+    it('should normalize &ndash; and &mdash; to regular dash', () => {
+      expectAll([
+        ['2020&ndash;2021', '2020-2021'],
+        ['Hello&mdash;World', 'Hello-World'],
+      ])
+    })
+  })
+
+  describe('Unicode character normalization', () => {
+    it('should normalize fancy single quotes to regular apostrophe', () => {
+      const normalized = normalizeWhitespace('\u2018Hello\u2019')
+      expect(normalized).toBe("'Hello'")
+      // Both ends must be the plain apostrophe (U+0027), not a typographic quote
+      expect(normalized.charCodeAt(0)).toBe(0x0027)
+      expect(normalized.charCodeAt(normalized.length - 1)).toBe(0x0027)
+    })
+
+    it('should normalize fancy double quotes to regular quotes', () => {
+      const normalized = normalizeWhitespace('\u201CHello\u201D')
+      expect(normalized).toBe('"Hello"')
+      // Both ends must be the plain double quote (U+0022), not a typographic quote
+      expect(normalized.charCodeAt(0)).toBe(0x0022)
+      expect(normalized.charCodeAt(normalized.length - 1)).toBe(0x0022)
+    })
+
+    it('should normalize en-dash and em-dash to regular dash', () => {
+      expectAll([
+        ['2020\u20132021', '2020-2021'], // en-dash
+        ['Hello\u2014World', 'Hello-World'], // em-dash
+      ])
+    })
+  })
+
+  describe('Whitespace collapsing', () => {
+    it('should collapse multiple spaces to single space', () => {
+      expectAll([['Hello    World', 'Hello World']])
+    })
+
+    it('should collapse multiple whitespace types to single space', () => {
+      expectAll([['Hello\n\t  World', 'Hello World']])
+    })
+
+    it('should handle mixed whitespace and HTML entities', () => {
+      expectAll([['Hello&nbsp;&nbsp;  World', 'Hello World']])
+    })
+  })
+
+  describe('Combined normalizations', () => {
+    it('should normalize complex mixed content', () => {
+      expectAll([['&quot;Hello&nbsp;&nbsp;World&quot;&mdash;&pound;100', '"Hello World"-£100']])
+    })
+
+    it('should handle empty string', () => {
+      expectAll([['', '']])
+    })
+
+    it('should handle string with no entities or special chars', () => {
+      expectAll([['Hello World', 'Hello World']])
+    })
+  })
+})
+
+describe('createContextSnippet', () => {
+  // Shared fixtures: the longer sentence leaves room for context on both sides of a match
+  const pangram = 'The quick brown fox jumps over the lazy dog'
+  const shortSentence = 'The quick brown fox'
+
+  it('should create snippet with context around match', () => {
+    const snippet = createContextSnippet(pangram, pangram.indexOf('fox'), 3, 10)
+    expect(snippet).toBe('...ick brown fox jumps ove...')
+  })
+
+  it('should not add leading ellipsis when match is near start', () => {
+    const snippet = createContextSnippet(pangram, pangram.indexOf('quick'), 5, 10)
+    expect(snippet).toBe('The quick brown fox...')
+  })
+
+  it('should not add trailing ellipsis when match is near end', () => {
+    const snippet = createContextSnippet(pangram, pangram.indexOf('lazy'), 4, 10)
+    expect(snippet).toBe('... over the lazy dog')
+  })
+
+  it('should handle match at start of text', () => {
+    expect(createContextSnippet(shortSentence, 0, 3, 10)).toBe('The quick bro...')
+  })
+
+  it('should handle match at end of text', () => {
+    const snippet = createContextSnippet(shortSentence, shortSentence.indexOf('fox'), 3, 10)
+    expect(snippet).toBe('...ick brown fox')
+  })
+
+  it('should include entire text if shorter than context size', () => {
+    const brief = 'Short text'
+    expect(createContextSnippet(brief, brief.indexOf('text'), 4, 100)).toBe('Short text')
+  })
+
+  it('should use default context size of 100', () => {
+    // 100 chars before + 1 matched char + 100 chars after = 201,
+    // plus a 3-char ellipsis on each end = 207
+    const snippet = createContextSnippet('a'.repeat(300), 150, 1)
+    expect(snippet).toHaveLength(207)
+    expect(snippet.startsWith('...')).toBe(true)
+    expect(snippet.endsWith('...')).toBe(true)
+  })
+
+  it('should handle custom context size', () => {
+    // 20 chars before + 1 matched char + 20 chars after = 41,
+    // plus a 3-char ellipsis on each end = 47
+    expect(createContextSnippet('a'.repeat(100), 50, 1, 20)).toHaveLength(47)
+  })
+})
diff --git a/src/utils/textNormalization.ts b/src/utils/textNormalization.ts