Skip to content

Commit ea26ed7

Browse files
jk-kim0 and claude committed
mdx: main 브랜치 병합 - CJK 공백 최적화 및 trailing blank line 변경 반영
3aa5b93(ko 문서 **/* delimiter CJK 공백 최적화) 및 e9933c9(하단 빈줄 변경)의 변경사항을 병합하여 PR #822의 교정 내용과 통합합니다. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2 parents 557884b + 3aa5b93 commit ea26ed7

File tree

508 files changed

+5151
-4022
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

508 files changed

+5151
-4022
lines changed

confluence-mdx/bin/converter/cli.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,9 @@ def main():
120120
help='Directory to save attachments (default: output file directory)')
121121
parser.add_argument('--skip-image-copy', action='store_true',
122122
help='이미지 파일 복사를 생략 (경로만 지정대로 생성)')
123+
parser.add_argument('--language',
124+
choices=['ko', 'ja', 'en'],
125+
help='언어 코드를 명시적으로 지정 (미지정 시 출력 경로에서 자동 감지)')
123126
parser.add_argument('--log-level',
124127
choices=['debug', 'info', 'warning', 'error', 'critical'],
125128
default='info',
@@ -144,21 +147,25 @@ def main():
144147
output_dir = os.path.join(os.path.dirname(args.output_file), output_file_stem)
145148
logging.info(f"Using default attachment directory: {output_dir}")
146149

147-
# Extract language code from the output file path
148-
path_parts = ctx.OUTPUT_FILE_PATH.split(os.sep)
149-
150-
# Look for 2-letter language code in the path
151-
detected_language = 'en' # Default to English
152-
for part in path_parts:
153-
if len(part) == 2 and part.isalpha():
154-
# Check if it's a known language code
155-
if part in ['ko', 'ja', 'en']:
156-
detected_language = part
157-
break
158-
159-
# Update shared LANGUAGE variable
160-
ctx.LANGUAGE = detected_language
161-
logging.info(f"Detected language from output path: {ctx.LANGUAGE}")
150+
# Determine language: explicit --language takes precedence over path detection
151+
if args.language:
152+
ctx.LANGUAGE = args.language
153+
logging.info(f"Language set explicitly: {ctx.LANGUAGE}")
154+
else:
155+
# Extract language code from the output file path
156+
path_parts = ctx.OUTPUT_FILE_PATH.split(os.sep)
157+
158+
# Look for 2-letter language code in the path
159+
detected_language = 'en' # Default to English
160+
for part in path_parts:
161+
if len(part) == 2 and part.isalpha():
162+
# Check if it's a known language code
163+
if part in ['ko', 'ja', 'en']:
164+
detected_language = part
165+
break
166+
167+
ctx.LANGUAGE = detected_language
168+
logging.info(f"Detected language from output path: {ctx.LANGUAGE}")
162169

163170
try:
164171
with open(args.input_file, 'r', encoding='utf-8') as f:

confluence-mdx/bin/converter/core.py

Lines changed: 69 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,18 @@ def as_markdown(self, caption: Optional[str] = None, width: Optional[str] = None
116116
return f'[{caption}]({self.output_dir}/{self.filename})'
117117

118118

119+
def _is_unicode_punctuation(ch: str) -> bool:
120+
"""CommonMark spec의 Unicode punctuation 판정.
121+
122+
Unicode general category가 P(punctuation) 또는 S(symbol)이면 True.
123+
ASCII punctuation도 포함된다.
124+
"""
125+
if not ch:
126+
return False
127+
cat = unicodedata.category(ch[0])
128+
return cat.startswith('P') or cat.startswith('S')
129+
130+
119131
class SingleLineParser:
120132
def __init__(self, node, collector: LostInfoCollector | None = None):
121133
self.node = node
@@ -202,13 +214,25 @@ def convert_recursively(self, node):
202214
for child in node.children:
203215
self.convert_recursively(child)
204216
else:
205-
self.markdown_lines.append(" **")
206-
self.markdown_lines.append(self.markdown_of_children(node).strip())
207-
self.markdown_lines.append("** ")
217+
inner = self.markdown_of_children(node).strip()
218+
open_sp = " " if inner and _is_unicode_punctuation(inner[0]) else ""
219+
close_sp = " " if inner and _is_unicode_punctuation(inner[-1]) else ""
220+
# 연속 emphasis delimiter 충돌 방지
221+
if not close_sp and isinstance(node.next_sibling, Tag) and node.next_sibling.name in ('strong', 'em'):
222+
close_sp = " "
223+
self.markdown_lines.append(f"{open_sp}**")
224+
self.markdown_lines.append(inner)
225+
self.markdown_lines.append(f"**{close_sp}")
208226
elif node.name in ['em']:
209-
self.markdown_lines.append(" *")
210-
self.markdown_lines.append(self.markdown_of_children(node).strip())
211-
self.markdown_lines.append("* ")
227+
inner = self.markdown_of_children(node).strip()
228+
open_sp = " " if inner and _is_unicode_punctuation(inner[0]) else ""
229+
close_sp = " " if inner and _is_unicode_punctuation(inner[-1]) else ""
230+
# 연속 emphasis delimiter 충돌 방지
231+
if not close_sp and isinstance(node.next_sibling, Tag) and node.next_sibling.name in ('strong', 'em'):
232+
close_sp = " "
233+
self.markdown_lines.append(f"{open_sp}*")
234+
self.markdown_lines.append(inner)
235+
self.markdown_lines.append(f"*{close_sp}")
212236
elif node.name in ['code']:
213237
self.markdown_lines.append("`")
214238
self.markdown_lines.append(self.markdown_of_children(node).strip())
@@ -617,6 +641,43 @@ def is_standalone_dash(self):
617641

618642
return True
619643

644+
@staticmethod
def _is_trailing_empty_p(node):
    """Tell whether *node* is a trailing empty <p>/<div> whose separator is skipped.

    Markdown requires a blank separator line between blocks, so keeping the
    separator in front of trailing empty paragraphs turns N trailing empty
    <p> elements into N+1 blank lines:

        empty <p> count in XHTML | blank lines when separator is kept
        0                        | 0
        1                        | 2   <- "1" cannot be produced
        2                        | 3
        N                        | N+1

    No XHTML state can then yield exactly one blank line, so a user edit
    that reduces the trailing blanks from 2 to 1 could not be reproduced on
    a roundtrip. Skipping the separator in front of trailing empty <p>
    elements gives an N -> N mapping, so every trailing blank-line count is
    representable in XHTML.

    The rule applies only in the top-level ``[document]`` context so that
    nested containers (expand macros and the like) are left untouched.

    Returns True only when *node* is a text-less ``p``/``div`` directly under
    ``[document]`` and none of its following siblings carries any text.
    """
    is_candidate = (
        node.name in ('p', 'div')
        and not node.get_text(strip=True)
        and node.parent.name == '[document]'
    )
    if not is_candidate:
        return False

    def _carries_text(sibling):
        # Bare strings are checked directly; elements via their text content.
        if isinstance(sibling, NavigableString):
            return bool(sibling.strip())
        return bool(sibling.get_text(strip=True))

    # "Trailing" means nothing with visible text comes after this node.
    return not any(_carries_text(later) for later in node.next_siblings)
680+
620681
def append_empty_line_unless_first_child(self, node):
621682
# Convert generator to list to check length
622683
children_list = list(node.parent.children)
@@ -708,7 +769,8 @@ def convert_recursively(self, node):
708769
self.append_empty_line_unless_first_child(node)
709770
self.markdown_lines.extend(TableToHtmlTable(node, collector=self.collector).as_markdown)
710771
elif node.name in ['p', 'div']:
711-
self.append_empty_line_unless_first_child(node)
772+
if not self._is_trailing_empty_p(node):
773+
self.append_empty_line_unless_first_child(node)
712774
child_markdown = []
713775
for child in node.children:
714776
if isinstance(child, NavigableString):

confluence-mdx/bin/reverse_sync/mdx_to_xhtml_inline.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import re
77
from typing import List
88

9+
from bs4 import BeautifulSoup, Tag
910
from mdx_to_storage.inline import convert_inline
1011

1112

@@ -23,10 +24,14 @@ def mdx_block_to_inner_xhtml(content: str, block_type: str) -> str:
2324
return _convert_heading(text)
2425
elif block_type == 'paragraph':
2526
return _convert_paragraph(text)
27+
elif block_type == 'callout':
28+
return _convert_callout_inner(text)
2629
elif block_type == 'list':
2730
return _convert_list_content(text)
2831
elif block_type == 'code_block':
2932
return _convert_code_block(text)
33+
elif block_type == 'html_block':
34+
return _convert_html_block_inner(text)
3035
else:
3136
return convert_inline(text)
3237

@@ -55,6 +60,17 @@ def _convert_paragraph(text: str) -> str:
5560
return ' '.join(converted)
5661

5762

63+
def _convert_callout_inner(text: str) -> str:
    """Convert a callout block by dropping its <Callout> wrapper lines.

    If the first line starts with ``<Callout`` it is removed, and if the
    (remaining) last line starts with ``</Callout`` it is removed as well.
    What is left is joined, stripped and converted as a paragraph.
    """
    body = text.splitlines()
    if body and body[0].strip().startswith('<Callout'):
        del body[0]
    # Re-check emptiness: removing the opening line may have emptied the list.
    if body and body[-1].strip().startswith('</Callout'):
        body.pop()
    return _convert_paragraph('\n'.join(body).strip())
72+
73+
5874
def _convert_code_block(text: str) -> str:
5975
"""code_block: 펜스 마커 제거, 코드 내용만 추출."""
6076
lines = text.split('\n')
@@ -66,6 +82,21 @@ def _convert_code_block(text: str) -> str:
6682
return '\n'.join(lines)
6783

6884

85+
def _convert_html_block_inner(text: str) -> str:
    """Convert an html_block inline and return only the root element's innerHTML.

    html_block content still carries its outer tag, e.g.
    ``<table>...**bold**...</table>``. After inline conversion the first
    element is unwrapped so that only its contents are returned; per the
    original author's note this avoids nesting in ``_replace_inner_html()``.

    When the converted markup contains no element at all, the converted
    string is returned unchanged.
    """
    inline_html = convert_inline(text)
    # find(True) matches the first tag element in document order.
    first_tag = BeautifulSoup(inline_html, 'html.parser').find(True)
    if not isinstance(first_tag, Tag):
        return inline_html
    return first_tag.decode_contents()
98+
99+
69100
def _convert_code_spans(text: str) -> str:
70101
"""code span만 변환 (`text` → <code>text</code>)."""
71102
return re.sub(r'`([^`]+)`', r'<code>\1</code>', text)

0 commit comments

Comments
 (0)