Skip to content

Commit 78609a9

Browse files
yarikoptic and claude
committed
fix: post-process pandoc markdown to remove spurious backslash escapes
Pandoc's default markdown writer backslash-escapes apostrophes, quotes, and em-dash sequences, producing artifacts like \', \", and \-\-- that make the .md harder to read and cause problems when round-tripping through Google Docs. Add post-processing in docx-to-md to unescape these after conversion. Also add --markdown-variant option for choosing pandoc output dialect, and fix ruff lint issues (unused imports/variables, import sorting). Co-Authored-By: Claude Code 2.1.81 / Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4dd633c commit 78609a9

4 files changed

Lines changed: 91 additions & 10 deletions

File tree

docflow/cli/main.py

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,12 @@
77
import click
88

99
from docflow import __version__
10-
from docflow.config import get_zotero_api_key, load_config
11-
from docflow.convert.spreadsheet import convert_spreadsheet, extract_citations_from_spreadsheet, read_spreadsheet
10+
from docflow.config import load_config
11+
from docflow.convert.spreadsheet import (
12+
convert_spreadsheet,
13+
extract_citations_from_spreadsheet,
14+
read_spreadsheet,
15+
)
1216
from docflow.extract.citations import Citation, CitationType
1317
from docflow.extract.docx import extract_comments_from_docx, format_comments_as_json
1418
from docflow.integrations.zotero import ZoteroClient
@@ -65,8 +69,18 @@ def convert() -> None:
6569
help="Extract images to this directory",
6670
)
6771
@click.option("--standalone", is_flag=True, help="Produce standalone markdown file")
72+
@click.option(
73+
"--markdown-variant",
74+
type=click.Choice(["markdown", "gfm", "commonmark", "markdown_strict"]),
75+
default="markdown",
76+
help="Pandoc markdown output variant [default: markdown]",
77+
)
6878
def docx_to_md(
69-
input_file: str, output: str | None, extract_media: str | None, standalone: bool
79+
input_file: str,
80+
output: str | None,
81+
extract_media: str | None,
82+
standalone: bool,
83+
markdown_variant: str,
7084
) -> None:
7185
"""Convert Word document to Markdown.
7286
@@ -77,6 +91,7 @@ def docx_to_md(
7791
docflow convert docx-to-md manuscript.docx -o output.md
7892
docflow convert docx-to-md manuscript.docx --extract-media ./images
7993
"""
94+
import re
8095
import shutil
8196
import subprocess
8297

@@ -92,7 +107,16 @@ def docx_to_md(
92107
output_path = Path(output) if output else input_path.with_suffix(".md")
93108

94109
# Build pandoc command
95-
cmd = ["pandoc", str(input_path), "-f", "docx", "-t", "markdown", "-o", str(output_path)]
110+
cmd = [
111+
"pandoc",
112+
str(input_path),
113+
"-f",
114+
"docx",
115+
"-t",
116+
markdown_variant,
117+
"-o",
118+
str(output_path),
119+
]
96120

97121
if extract_media:
98122
cmd.extend(["--extract-media", extract_media])
@@ -103,7 +127,24 @@ def docx_to_md(
103127
click.echo(f"Converting {input_path} to {output_path}")
104128

105129
try:
106-
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
130+
subprocess.run(cmd, capture_output=True, text=True, check=True)
131+
132+
# Post-process: remove spurious backslash escapes that pandoc's
133+
# default markdown writer introduces (escaped quotes, apostrophes,
134+
# and em-dashes). These make the .md harder to read/edit and cause
135+
# problems when round-tripping through Google Docs.
136+
text = output_path.read_text(encoding="utf-8")
137+
original = text
138+
# Unescape apostrophes and quotes that pandoc backslash-escapes
139+
text = text.replace("\\'", "'")
140+
text = text.replace('\\"', '"')
141+
# Unescape em-dash sequences: \-\-\- → --- and \-\-- → ---
142+
text = re.sub(r"\\-\\-\\-", "---", text)
143+
text = re.sub(r"\\-\\--", "---", text)
144+
# Unescape en-dash: \-- → --
145+
text = text.replace("\\--", "--")
146+
if text != original:
147+
output_path.write_text(text, encoding="utf-8")
107148

108149
click.echo(f"✓ Converted to: {output_path}")
109150

@@ -164,7 +205,7 @@ def md_to_docx(input_file: str, output: str | None, reference_doc: str | None) -
164205
click.echo(f"Converting {input_path} to {output_path}")
165206

166207
try:
167-
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
208+
subprocess.run(cmd, capture_output=True, text=True, check=True)
168209

169210
click.echo(f"✓ Converted to: {output_path}")
170211

@@ -301,7 +342,7 @@ def spreadsheet_citations(files: tuple[str, ...], columns: tuple[str, ...]) -> N
301342
citations = extract_citations_from_spreadsheet(spreadsheet, citation_cols)
302343

303344
if not citations:
304-
click.echo(f" No citations found")
345+
click.echo(" No citations found")
305346
continue
306347

307348
click.echo(f" Found {len(citations)} citation(s)")

tests/test_convert.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
"""Tests for document conversion CLI."""
22

3+
import re
4+
5+
import pytest
36
from click.testing import CliRunner
47

58
from docflow.cli.main import main
@@ -54,3 +57,43 @@ def test_docx_to_md_no_pandoc(tmp_path) -> None:
5457
assert result.exit_code != 0
5558
# Should mention pandoc error or conversion error
5659
assert "pandoc" in result.output.lower() or "error" in result.output.lower()
60+
61+
62+
@pytest.mark.ai_generated
63+
class TestPostProcessUnescaping:
    """Exercise the pass that strips pandoc's spurious backslash escapes."""

    # (escaped, plain) literal pairs, applied strictly top to bottom.
    # Both em-dash spellings come before the en-dash one: "\--" is a suffix
    # of "\-\--", so replacing it first would leave a stray "\-" behind.
    _REPLACEMENTS = (
        ("\\'", "'"),
        ('\\"', '"'),
        ("\\-\\-\\-", "---"),
        ("\\-\\--", "---"),
        ("\\--", "--"),
    )

    @staticmethod
    def _unescape(text: str) -> str:
        """Return *text* with every sequence in ``_REPLACEMENTS`` unescaped.

        This reproduces, step for step, the replacement sequence that the
        ``docx_to_md`` command runs on pandoc's output file.
        """
        for escaped, plain in TestPostProcessUnescaping._REPLACEMENTS:
            text = text.replace(escaped, plain)
        return text

    def test_escaped_apostrophes(self) -> None:
        """A backslash-escaped apostrophe becomes a bare apostrophe."""
        result = self._unescape(r"TRD3\'s architecture")
        assert result == "TRD3's architecture"

    def test_escaped_quotes(self) -> None:
        """Backslash-escaped double quotes become bare double quotes."""
        result = self._unescape(r'the \"Open Science\" framework')
        assert result == 'the "Open Science" framework'

    def test_escaped_emdash(self) -> None:
        """The partially escaped em-dash form collapses to three hyphens."""
        result = self._unescape(r"challenge\-\--the solutions")
        assert result == "challenge---the solutions"

    def test_escaped_emdash_triple(self) -> None:
        """The fully escaped em-dash form collapses to three hyphens."""
        result = self._unescape(r"foo\-\-\-bar")
        assert result == "foo---bar"

    def test_escaped_endash(self) -> None:
        """An escaped en-dash collapses to two hyphens."""
        result = self._unescape(r"Years 3\--4")
        assert result == "Years 3--4"

    def test_no_change_on_clean_text(self) -> None:
        """Text without any escape sequences passes through untouched."""
        clean = "This is TRD3's test---with em-dashes---and \"quotes\" here."
        result = self._unescape(clean)
        assert result == clean

    def test_mixed_escapes(self) -> None:
        """All escape kinds in one string are unescaped together."""
        text = r"TRD3\'s mission\-\--making things\'s \"better\""
        expected = 'TRD3\'s mission---making things\'s "better"'
        result = self._unescape(text)
        assert result == expected

tests/test_convert_spreadsheet.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
"""Tests for spreadsheet conversion and citation extraction."""
22

3-
import json
43
from pathlib import Path
54

65
import pytest
76

87
from docflow.convert.spreadsheet import (
9-
SpreadsheetData,
108
convert_spreadsheet,
119
extract_citations_from_spreadsheet,
1210
read_spreadsheet,

tests/test_init.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Tests for project initialization."""
22

3-
from pathlib import Path
43
from unittest.mock import MagicMock, patch
54

65
import pytest

0 commit comments

Comments
 (0)