Skip to content

Commit c39f388

Browse files
authored
fix: preserve line breaks when converting HTML to markdown (#79)
1 parent e762aca commit c39f388

2 files changed

Lines changed: 110 additions & 5 deletions

File tree

lib/confluence-client.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,12 +1293,12 @@ class ConfluenceClient {
12931293

12941294
// Convert Confluence code macros to markdown
12951295
markdown = markdown.replace(/<ac:structured-macro ac:name="code"[^>]*>[\s\S]*?<ac:parameter ac:name="language">([^<]*)<\/ac:parameter>[\s\S]*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body>[\s\S]*?<\/ac:structured-macro>/g, (_, lang, code) => {
1296-
return `\`\`\`${lang}\n${code}\n\`\`\``;
1296+
return `\n\`\`\`${lang}\n${code}\n\`\`\`\n`;
12971297
});
12981298

12991299
// Convert code macros without language parameter
13001300
markdown = markdown.replace(/<ac:structured-macro ac:name="code"[^>]*>[\s\S]*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body>[\s\S]*?<\/ac:structured-macro>/g, (_, code) => {
1301-
return `\`\`\`\n${code}\n\`\`\``;
1301+
return `\n\`\`\`\n${code}\n\`\`\`\n`;
13021302
});
13031303

13041304
// Convert info macro to admonition
@@ -1515,8 +1515,8 @@ class ConfluenceClient {
15151515
});
15161516

15171517
// Convert paragraphs (after lists and tables)
1518-
markdown = markdown.replace(/<p>(.*?)<\/p>/g, (_, content) => {
1519-
return content.trim() + '\n';
1518+
markdown = markdown.replace(/<p>(.*?)<\/p>/gs, (_, content) => {
1519+
return '\n' + content.trim() + '\n';
15201520
});
15211521

15221522
// Convert line breaks

tests/confluence-client.test.js

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,12 +379,59 @@ describe('ConfluenceClient', () => {
379379
test('should convert Confluence code macro to markdown', () => {
380380
const storage = '<ac:structured-macro ac:name="code"><ac:parameter ac:name="language">javascript</ac:parameter><ac:plain-text-body><![CDATA[console.log("Hello");]]></ac:plain-text-body></ac:structured-macro>';
381381
const result = client.storageToMarkdown(storage);
382-
382+
383383
expect(result).toContain('```javascript');
384384
expect(result).toContain('console.log("Hello");');
385385
expect(result).toContain('```');
386386
});
387387

388+
test('should separate code block (with language) from surrounding content with blank lines', () => {
389+
const storage = '<p>Intro</p><ac:structured-macro ac:name="code"><ac:parameter ac:name="language">python</ac:parameter><ac:plain-text-body><![CDATA[print("hi")]]></ac:plain-text-body></ac:structured-macro><p>Outro</p>';
390+
const result = client.storageToMarkdown(storage);
391+
expect(result).toMatch(/Intro\n\n/);
392+
expect(result).toMatch(/\n\n```python\n/);
393+
expect(result).toMatch(/\n```\n\n/);
394+
expect(result).toMatch(/\n\nOutro/);
395+
});
396+
397+
test('should separate code block (no language) from surrounding content with blank lines', () => {
398+
const storage = '<p>Before</p><ac:structured-macro ac:name="code"><ac:plain-text-body><![CDATA[raw code]]></ac:plain-text-body></ac:structured-macro><p>After</p>';
399+
const result = client.storageToMarkdown(storage);
400+
expect(result).toMatch(/Before\n\n/);
401+
expect(result).toMatch(/\n\n```\n/);
402+
expect(result).toMatch(/\n```\n\n/);
403+
expect(result).toMatch(/\n\nAfter/);
404+
});
405+
406+
test('should separate mermaid macro from surrounding content with blank lines', () => {
407+
const storage = '<p>Diagram:</p><ac:structured-macro ac:name="mermaid-macro"><ac:plain-text-body><![CDATA[graph TD; A-->B]]></ac:plain-text-body></ac:structured-macro><p>End</p>';
408+
const result = client.storageToMarkdown(storage);
409+
expect(result).toMatch(/Diagram:\n\n/);
410+
expect(result).toMatch(/\n\n```mermaid\n/);
411+
expect(result).toMatch(/\n```\n\n/);
412+
expect(result).toMatch(/\n\nEnd/);
413+
});
414+
415+
test('complex page: heading, multi-line paragraph, code block, ordered list', () => {
416+
const storage = [
417+
'<h1>Deployment Guide</h1>',
418+
'<p>Deploy using the following steps.\nEnsure prerequisites are met.</p>',
419+
'<ac:structured-macro ac:name="code"><ac:parameter ac:name="language">bash</ac:parameter><ac:plain-text-body><![CDATA[git pull origin main\nnpm run build]]></ac:plain-text-body></ac:structured-macro>',
420+
'<p>Then verify:</p>',
421+
'<ol><li>Check logs</li><li>Run smoke tests</li></ol>',
422+
'<p>Deployment complete.</p>'
423+
].join('');
424+
const result = client.storageToMarkdown(storage);
425+
expect(result).toBe(
426+
'# Deployment Guide\n\n' +
427+
'Deploy using the following steps.\nEnsure prerequisites are met.\n\n' +
428+
'```bash\ngit pull origin main\nnpm run build\n```\n\n' +
429+
'Then verify:\n\n' +
430+
'1. Check logs\n2. Run smoke tests\n\n' +
431+
'Deployment complete.'
432+
);
433+
});
434+
388435
test('should convert Confluence macros to admonitions', () => {
389436
const storage = '<ac:structured-macro ac:name="info"><ac:rich-text-body><p>This is info</p></ac:rich-text-body></ac:structured-macro>';
390437
const result = client.storageToMarkdown(storage);
@@ -428,6 +475,64 @@ describe('ConfluenceClient', () => {
428475
expect(result).toContain('| Cell |');
429476
});
430477

478+
test('should preserve content of multi-line paragraphs', () => {
479+
// Without the dotAll flag on the <p> regex, content with embedded newlines is silently dropped
480+
const html = '<p>First line\nSecond line</p>';
481+
const result = client.htmlToMarkdown(html);
482+
expect(result).toContain('First line');
483+
expect(result).toContain('Second line');
484+
});
485+
486+
test('should separate consecutive paragraphs with a blank line', () => {
487+
const html = '<p>Alpha</p><p>Beta</p>';
488+
const result = client.htmlToMarkdown(html);
489+
expect(result).toMatch(/Alpha\n\nBeta/);
490+
});
491+
492+
test('should separate lists from surrounding content with blank lines', () => {
493+
const html = '<p>Intro</p><ul><li>Item A</li><li>Item B</li></ul><p>Outro</p>';
494+
const result = client.htmlToMarkdown(html);
495+
expect(result).toMatch(/Intro\n\n/);
496+
expect(result).toMatch(/\n\n- Item A\n- Item B\n\n/);
497+
expect(result).toMatch(/\n\nOutro/);
498+
});
499+
500+
test('should separate ordered lists from surrounding content with blank lines', () => {
501+
const html = '<p>Steps:</p><ol><li>First</li><li>Second</li></ol><p>Done</p>';
502+
const result = client.htmlToMarkdown(html);
503+
expect(result).toMatch(/Steps:\n\n/);
504+
expect(result).toMatch(/\n\n1\. First\n2\. Second\n\n/);
505+
expect(result).toMatch(/\n\nDone/);
506+
});
507+
508+
test('should separate tables from surrounding content with blank lines', () => {
509+
const html = '<p>See table:</p><table><tr><th>Col</th></tr><tr><td>Val</td></tr></table><p>End</p>';
510+
const result = client.htmlToMarkdown(html);
511+
expect(result).toMatch(/See table:\n\n/);
512+
expect(result).toMatch(/\| Col \|/);
513+
expect(result).toMatch(/\n\nEnd/);
514+
});
515+
516+
test('complex page: heading, multi-line paragraph, table, list', () => {
517+
const html = [
518+
'<h2>API Reference</h2>',
519+
'<p>The following endpoints are available.\nAll requests require authentication.</p>',
520+
'<table><tr><th>Method</th><th>Path</th></tr><tr><td>GET</td><td>/users</td></tr><tr><td>POST</td><td>/users</td></tr></table>',
521+
'<p>Authentication options:</p>',
522+
'<ul><li>Bearer token</li><li>API key</li></ul>',
523+
'<p>See docs for details.</p>'
524+
].join('');
525+
const result = client.htmlToMarkdown(html);
526+
expect(result).toBe(
527+
'## API Reference\n\n' +
528+
'The following endpoints are available.\nAll requests require authentication.\n\n' +
529+
'| Method | Path |\n| --- | --- |\n| GET | /users |\n| POST | /users |\n\n' +
530+
'Authentication options:\n\n' +
531+
'- Bearer token\n- API key\n\n' +
532+
'See docs for details.'
533+
);
534+
});
535+
431536
test('should convert named characters correctly', () => {
432537
const NAMED_ENTITIES = ConfluenceClient.NAMED_ENTITIES;
433538

0 commit comments

Comments
 (0)