Skip to content

Commit 62636c7

Browse files
authored
improvement(gmail): replace custom html-to-text regex with library (#4613)
* improvement(gmail): replace custom html-to-text regex with html-to-text library

  Resolves 4 CodeQL alerts on htmlToPlainText (incomplete tag/entity handling, unsafe regex backtracking). Delegates to the html-to-text npm package already used by the outlook polling trigger and the mail/send route.

* improvement(gmail): match outlook selectors config, add nbsp/anchor tests

  Aligns html-to-text options with apps/sim/lib/webhooks/polling/outlook.ts: suppress anchor hrefs when identical to the link text, drop the hrefs of same-page `#fragment` anchors, skip img/script/style content. Adds tests for nbsp preservation and anchor behavior.
1 parent e23c20e commit 62636c7

2 files changed

Lines changed: 29 additions & 22 deletions

File tree

apps/sim/tools/gmail/utils.test.ts

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,9 @@ describe('plainTextToHtml', () => {
8181
})
8282

8383
describe('htmlToPlainText', () => {
84-
it('strips tags, decodes entities, and collapses whitespace', () => {
84+
it('strips tags and decodes entities', () => {
8585
const result = htmlToPlainText('<p>Hi &amp; bye</p><p>Line<br>break</p>')
86-
expect(result).toBe('Hi & bye\nLine\nbreak')
86+
expect(result).toBe('Hi & bye\n\nLine\nbreak')
8787
})
8888

8989
it('drops <style> and <script> contents', () => {
@@ -97,10 +97,21 @@ describe('htmlToPlainText', () => {
9797
})
9898

9999
it('decodes decimal and hexadecimal numeric entities', () => {
100-
expect(htmlToPlainText('<p>&#8220;hi&#8221; &#160;and&#x2019;s</p>')).toBe(
101-
'\u201chi\u201d \u00a0and\u2019s'
100+
expect(htmlToPlainText('<p>&#8220;hi&#8221; and&#x2019;s</p>')).toBe(
101+
'\u201chi\u201d and\u2019s'
102102
)
103103
})
104+
105+
it('preserves &#160; (non-breaking space) as U+00A0 for fidelity in plain-text output', () => {
106+
expect(htmlToPlainText('<p>a&#160;b</p>')).toBe('a\u00a0b')
107+
})
108+
109+
it('elides anchor URLs that exactly match link text, and drops bare # anchors', () => {
110+
expect(
111+
htmlToPlainText('<p>Visit <a href="https://example.com">https://example.com</a></p>')
112+
).toBe('Visit https://example.com')
113+
expect(htmlToPlainText('<p><a href="#section">Anchor</a></p>')).toBe('Anchor')
114+
})
104115
})
105116

106117
describe('buildSimpleEmailMessage', () => {

apps/sim/tools/gmail/utils.ts

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { convert } from 'html-to-text'
12
import type {
23
GmailAttachment,
34
GmailMessage,
@@ -344,26 +345,21 @@ export function plainTextToHtml(body: string): string {
344345
}
345346

346347
/**
347-
* Best-effort conversion of an HTML body to a plain-text fallback. Strips tags
348-
* and decodes the common entities. Used so we always include a plain-text part
349-
* alongside HTML for clients that don't render HTML.
348+
* Best-effort conversion of an HTML body to a plain-text fallback. Used so we
349+
* always include a plain-text part alongside HTML for clients that don't render
350+
* HTML. Delegates to the `html-to-text` library for robust tag stripping and
351+
* entity decoding (also used elsewhere in the repo for the same purpose).
350352
*/
351353
export function htmlToPlainText(html: string): string {
352-
return html
353-
.replace(/<style[\s\S]*?<\/style>/gi, '')
354-
.replace(/<script[\s\S]*?<\/script>/gi, '')
355-
.replace(/<br\s*\/?>/gi, '\n')
356-
.replace(/<\/(p|div|h[1-6]|li|tr)>/gi, '\n')
357-
.replace(/<[^>]+>/g, '')
358-
.replace(/&nbsp;/g, ' ')
359-
.replace(/&lt;/g, '<')
360-
.replace(/&gt;/g, '>')
361-
.replace(/&quot;/g, '"')
362-
.replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCodePoint(Number.parseInt(hex, 16)))
363-
.replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(Number.parseInt(dec, 10)))
364-
.replace(/&amp;/g, '&')
365-
.replace(/\n{3,}/g, '\n\n')
366-
.trim()
354+
return convert(html, {
355+
wordwrap: false,
356+
selectors: [
357+
{ selector: 'a', options: { hideLinkHrefIfSameAsText: true, noAnchorUrl: true } },
358+
{ selector: 'img', format: 'skip' },
359+
{ selector: 'script', format: 'skip' },
360+
{ selector: 'style', format: 'skip' },
361+
],
362+
})
367363
}
368364

369365
/**

0 commit comments

Comments
 (0)