77import click
88
99from docflow import __version__
10- from docflow .config import get_zotero_api_key , load_config
11- from docflow .convert .spreadsheet import convert_spreadsheet , extract_citations_from_spreadsheet , read_spreadsheet
10+ from docflow .config import load_config
11+ from docflow .convert .spreadsheet import (
12+ convert_spreadsheet ,
13+ extract_citations_from_spreadsheet ,
14+ read_spreadsheet ,
15+ )
1216from docflow .extract .citations import Citation , CitationType
1317from docflow .extract .docx import extract_comments_from_docx , format_comments_as_json
1418from docflow .integrations .zotero import ZoteroClient
@@ -65,8 +69,18 @@ def convert() -> None:
6569 help = "Extract images to this directory" ,
6670)
6771@click .option ("--standalone" , is_flag = True , help = "Produce standalone markdown file" )
72+ @click .option (
73+ "--markdown-variant" ,
74+ type = click .Choice (["markdown" , "gfm" , "commonmark" , "markdown_strict" ]),
75+ default = "markdown" ,
76+ help = "Pandoc markdown output variant [default: markdown]" ,
77+ )
6878def docx_to_md (
69- input_file : str , output : str | None , extract_media : str | None , standalone : bool
79+ input_file : str ,
80+ output : str | None ,
81+ extract_media : str | None ,
82+ standalone : bool ,
83+ markdown_variant : str ,
7084) -> None :
7185 """Convert Word document to Markdown.
7286
@@ -77,6 +91,7 @@ def docx_to_md(
7791 docflow convert docx-to-md manuscript.docx -o output.md
7892 docflow convert docx-to-md manuscript.docx --extract-media ./images
7993 """
94+ import re
8095 import shutil
8196 import subprocess
8297
@@ -92,7 +107,16 @@ def docx_to_md(
92107 output_path = Path (output ) if output else input_path .with_suffix (".md" )
93108
94109 # Build pandoc command
95- cmd = ["pandoc" , str (input_path ), "-f" , "docx" , "-t" , "markdown" , "-o" , str (output_path )]
110+ cmd = [
111+ "pandoc" ,
112+ str (input_path ),
113+ "-f" ,
114+ "docx" ,
115+ "-t" ,
116+ markdown_variant ,
117+ "-o" ,
118+ str (output_path ),
119+ ]
96120
97121 if extract_media :
98122 cmd .extend (["--extract-media" , extract_media ])
@@ -103,7 +127,24 @@ def docx_to_md(
103127 click .echo (f"Converting { input_path } to { output_path } " )
104128
105129 try :
106- result = subprocess .run (cmd , capture_output = True , text = True , check = True )
130+ subprocess .run (cmd , capture_output = True , text = True , check = True )
131+
132+ # Post-process: remove spurious backslash escapes that pandoc's
133+ # default markdown writer introduces (escaped quotes, apostrophes,
134+ # and em-/en-dash hyphen runs). These make the .md harder to read/edit
135+ # and cause problems when round-tripping through Google Docs.
136+ text = output_path .read_text (encoding = "utf-8" )
137+ original = text
138+ # Unescape apostrophes and quotes that pandoc backslash-escapes
139+ text = text .replace ("\\ '" , "'" )
140+ text = text .replace ('\\ "' , '"' )
141+ # Unescape em-dash sequences: \-\-\- → --- and \-\-- → ---
142+ text = re .sub (r"\\-\\-\\-" , "---" , text )
143+ text = re .sub (r"\\-\\--" , "---" , text )
144+ # Unescape en-dash: \-- → --
145+ text = text .replace ("\\ --" , "--" )
146+ if text != original :
147+ output_path .write_text (text , encoding = "utf-8" )
107148
108149 click .echo (f"✓ Converted to: { output_path } " )
109150
@@ -164,7 +205,7 @@ def md_to_docx(input_file: str, output: str | None, reference_doc: str | None) -
164205 click .echo (f"Converting { input_path } to { output_path } " )
165206
166207 try :
167- result = subprocess .run (cmd , capture_output = True , text = True , check = True )
208+ subprocess .run (cmd , capture_output = True , text = True , check = True )
168209
169210 click .echo (f"✓ Converted to: { output_path } " )
170211
@@ -301,7 +342,7 @@ def spreadsheet_citations(files: tuple[str, ...], columns: tuple[str, ...]) -> N
301342 citations = extract_citations_from_spreadsheet (spreadsheet , citation_cols )
302343
303344 if not citations :
304- click .echo (f " No citations found" )
345+ click .echo (" No citations found" )
305346 continue
306347
307348 click .echo (f" Found { len (citations )} citation(s)" )
0 commit comments