fix: post-process pandoc markdown to remove spurious backslash escapes

yarikoptic · claude · yarikoptic · commit 78609a9d87a9 · 2026-03-21T17:57:56.000-04:00
Pandoc's default markdown writer backslash-escapes apostrophes, quotes,
and em-dash sequences, producing artifacts like \', \", and \-\-- that
make the .md harder to read and cause problems when round-tripping
through Google Docs.

Add post-processing in docx-to-md to unescape these after conversion.
Also add --markdown-variant option for choosing pandoc output dialect,
and fix ruff lint issues (unused imports/variables, import sorting).

Co-Authored-By: Claude Code 2.1.81 / Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/docflow/cli/main.py b/docflow/cli/main.py
@@ -7,8 +7,12 @@
 import click
 
 from docflow import __version__
-from docflow.config import get_zotero_api_key, load_config
-from docflow.convert.spreadsheet import convert_spreadsheet, extract_citations_from_spreadsheet, read_spreadsheet
+from docflow.config import load_config
+from docflow.convert.spreadsheet import (
+    convert_spreadsheet,
+    extract_citations_from_spreadsheet,
+    read_spreadsheet,
+)
 from docflow.extract.citations import Citation, CitationType
 from docflow.extract.docx import extract_comments_from_docx, format_comments_as_json
 from docflow.integrations.zotero import ZoteroClient
@@ -65,8 +69,18 @@ def convert() -> None:
     help="Extract images to this directory",
 )
 @click.option("--standalone", is_flag=True, help="Produce standalone markdown file")
+@click.option(
+    "--markdown-variant",
+    type=click.Choice(["markdown", "gfm", "commonmark", "markdown_strict"]),
+    default="markdown",
+    help="Pandoc markdown output variant [default: markdown]",
+)
 def docx_to_md(
-    input_file: str, output: str | None, extract_media: str | None, standalone: bool
+    input_file: str,
+    output: str | None,
+    extract_media: str | None,
+    standalone: bool,
+    markdown_variant: str,
 ) -> None:
     """Convert Word document to Markdown.
 
@@ -77,6 +91,7 @@ def docx_to_md(
         docflow convert docx-to-md manuscript.docx -o output.md
         docflow convert docx-to-md manuscript.docx --extract-media ./images
     """
+    import re
     import shutil
     import subprocess
 
@@ -92,7 +107,16 @@ def docx_to_md(
     output_path = Path(output) if output else input_path.with_suffix(".md")
 
     # Build pandoc command
-    cmd = ["pandoc", str(input_path), "-f", "docx", "-t", "markdown", "-o", str(output_path)]
+    cmd = [
+        "pandoc",
+        str(input_path),
+        "-f",
+        "docx",
+        "-t",
+        markdown_variant,
+        "-o",
+        str(output_path),
+    ]
 
     if extract_media:
         cmd.extend(["--extract-media", extract_media])
@@ -103,7 +127,24 @@ def docx_to_md(
     click.echo(f"Converting {input_path} to {output_path}")
 
     try:
-        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        subprocess.run(cmd, capture_output=True, text=True, check=True)
+
+        # Post-process: remove spurious backslash escapes that pandoc's
+        # default markdown writer introduces (escaped quotes, apostrophes,
+        # and em-dashes).  These make the .md harder to read/edit and cause
+        # problems when round-tripping through Google Docs.
+        text = output_path.read_text(encoding="utf-8")
+        original = text
+        # Unescape apostrophes and quotes that pandoc backslash-escapes
+        text = text.replace("\\'", "'")
+        text = text.replace('\\"', '"')
+        # Unescape em-dash sequences: \-\-\- → --- and \-\-- → ---
+        text = re.sub(r"\\-\\-\\-", "---", text)
+        text = re.sub(r"\\-\\--", "---", text)
+        # Unescape en-dash: \-- → --
+        text = text.replace("\\--", "--")
+        if text != original:
+            output_path.write_text(text, encoding="utf-8")
 
         click.echo(f"✓ Converted to: {output_path}")
 
@@ -164,7 +205,7 @@ def md_to_docx(input_file: str, output: str | None, reference_doc: str | None) -
     click.echo(f"Converting {input_path} to {output_path}")
 
     try:
-        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        subprocess.run(cmd, capture_output=True, text=True, check=True)
 
         click.echo(f"✓ Converted to: {output_path}")
 
@@ -301,7 +342,7 @@ def spreadsheet_citations(files: tuple[str, ...], columns: tuple[str, ...]) -> N
             citations = extract_citations_from_spreadsheet(spreadsheet, citation_cols)
 
             if not citations:
-                click.echo(f"  No citations found")
+                click.echo("  No citations found")
                 continue
 
             click.echo(f"  Found {len(citations)} citation(s)")
diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -1,5 +1,8 @@
 """Tests for document conversion CLI."""
 
+import re
+
+import pytest
 from click.testing import CliRunner
 
 from docflow.cli.main import main
@@ -54,3 +57,43 @@ def test_docx_to_md_no_pandoc(tmp_path) -> None:
     assert result.exit_code != 0
     # Should mention pandoc error or conversion error
     assert "pandoc" in result.output.lower() or "error" in result.output.lower()
+
+
+@pytest.mark.ai_generated
+class TestPostProcessUnescaping:
+    """Test the post-processing step that removes pandoc's spurious backslash escapes."""
+
+    @staticmethod
+    def _unescape(text: str) -> str:
+        """Apply the same unescape logic used in docx_to_md post-processing."""
+        text = text.replace("\\'", "'")
+        text = text.replace('\\"', '"')
+        # Order matters: longest pattern first
+        text = re.sub(r"\\-\\-\\-", "---", text)
+        text = re.sub(r"\\-\\--", "---", text)
+        text = text.replace("\\--", "--")
+        return text
+
+    def test_escaped_apostrophes(self) -> None:
+        assert self._unescape(r"TRD3\'s architecture") == "TRD3's architecture"
+
+    def test_escaped_quotes(self) -> None:
+        assert self._unescape(r'the \"Open Science\" framework') == 'the "Open Science" framework'
+
+    def test_escaped_emdash(self) -> None:
+        assert self._unescape(r"challenge\-\--the solutions") == "challenge---the solutions"
+
+    def test_escaped_emdash_triple(self) -> None:
+        assert self._unescape(r"foo\-\-\-bar") == "foo---bar"
+
+    def test_escaped_endash(self) -> None:
+        assert self._unescape(r"Years 3\--4") == "Years 3--4"
+
+    def test_no_change_on_clean_text(self) -> None:
+        clean = "This is TRD3's test---with em-dashes---and \"quotes\" here."
+        assert self._unescape(clean) == clean
+
+    def test_mixed_escapes(self) -> None:
+        text = r"TRD3\'s mission\-\--making things\'s \"better\""
+        expected = 'TRD3\'s mission---making things\'s "better"'
+        assert self._unescape(text) == expected
diff --git a/tests/test_convert_spreadsheet.py b/tests/test_convert_spreadsheet.py
@@ -1,12 +1,10 @@
 """Tests for spreadsheet conversion and citation extraction."""
 
-import json
 from pathlib import Path
 
 import pytest
 
 from docflow.convert.spreadsheet import (
-    SpreadsheetData,
     convert_spreadsheet,
     extract_citations_from_spreadsheet,
     read_spreadsheet,
diff --git a/tests/test_init.py b/tests/test_init.py
@@ -1,6 +1,5 @@
 """Tests for project initialization."""
 
-from pathlib import Path
 from unittest.mock import MagicMock, patch
 
 import pytest