Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions markdown_it/rules_core/text_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,33 @@ def text_join(state: StateCore) -> None:

# convert text_special to text and join all adjacent text nodes
new_tokens: list[Token] = []
for child_token in inline_token.children or []:
children = inline_token.children or []
i = 0
while i < len(children):
child_token = children[i]
if child_token.type == "text_special":
child_token.type = "text"
if (
child_token.type == "text"
and new_tokens
and new_tokens[-1].type == "text"
):
new_tokens[-1].content += child_token.content
# Collapse a run of adjacent text nodes in a single join, instead
# of pairwise `a + b` concatenation. The pairwise form is O(L*k)
# in the size of the run because each step rebuilds the growing
# prefix; "".join is O(L).
parts = [new_tokens[-1].content, child_token.content]
i += 1
while i < len(children):
next_token = children[i]
if next_token.type == "text_special":
next_token.type = "text"
if next_token.type != "text":
break
parts.append(next_token.content)
i += 1
new_tokens[-1].content = "".join(parts)
else:
new_tokens.append(child_token)
i += 1
inline_token.children = new_tokens
25 changes: 18 additions & 7 deletions markdown_it/rules_inline/fragments_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,25 @@ def fragments_join(state: StateInline) -> None:
and curr + 1 < maximum
and state.tokens[curr + 1].type == "text"
):
# collapse two adjacent text nodes
state.tokens[curr + 1].content = (
state.tokens[curr].content + state.tokens[curr + 1].content
)
else:
if curr != last:
state.tokens[last] = state.tokens[curr]
# Collapse a run of adjacent text nodes in a single join, instead
# of pairwise `a + b` concatenation. The pairwise form is O(L*k)
# in the size of the run because each step rebuilds the growing
# prefix; "".join is O(L).
parts = [state.tokens[curr].content]
curr += 1
while curr < maximum and state.tokens[curr].type == "text":
parts.append(state.tokens[curr].content)
curr += 1
merged = state.tokens[curr - 1]
merged.content = "".join(parts)
merged.level = level
state.tokens[last] = merged
last += 1
continue

if curr != last:
state.tokens[last] = state.tokens[curr]
last += 1
curr += 1

if curr != last:
Expand Down
63 changes: 63 additions & 0 deletions tests/test_api/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,66 @@ def test_table_tokens(data_regression):
"""
)
data_regression.check([t.as_dict() for t in tokens])


def test_fragments_join_merges_adjacent_text_tokens():
    """fragments_join must collapse a run of adjacent text tokens into one.

    In CommonMark an underscore flanked by word characters (``a_b``) cannot
    open or close emphasis, so the emphasis rule emits each ``_`` as an
    ordinary text token sitting next to the surrounding text tokens. Parsing
    ``a_b c_d`` therefore yields a run of five text tokens: "a", "_", "b c",
    "_", "d".

    The core-level ``text_join`` rule would also merge adjacent text tokens
    as a fallback, so it is disabled throughout this test to make sure the
    observed merging comes from ``fragments_join`` alone.
    """
    src = "a_b c_d"

    # Baseline: with both joining rules off, the run of separate text
    # tokens must survive untouched.
    parser_no_joins = MarkdownIt()
    parser_no_joins.disable(["text_join", "fragments_join"])
    unmerged = parser_no_joins.parseInline(src)[0].children
    assert unmerged is not None
    assert len(unmerged) > 1, "expected multiple text tokens with no merging"
    assert all(tok.type == "text" for tok in unmerged)

    # With only fragments_join active (text_join still off), the whole run
    # must collapse into a single text token carrying the full content.
    parser_fragments_only = MarkdownIt()
    parser_fragments_only.disable("text_join")
    merged = parser_fragments_only.parseInline(src)[0].children
    assert merged is not None
    assert len(merged) == 1
    only_token = merged[0]
    assert only_token.type == "text"
    assert only_token.content == "a_b c_d"


def test_text_join_merges_adjacent_text_special_tokens():
    """text_join must convert text_special tokens and collapse the run.

    Every backslash escape produces its own ``text_special`` token, and the
    inline-level ``fragments_join`` rule only merges plain ``text`` tokens,
    so a run of ``text_special`` tokens reaches ``text_join`` intact. That
    rule is then responsible for retyping them to ``text`` and joining the
    whole run in one pass.
    """
    # Three backslash escapes in a row -> three text_special tokens before
    # text_join gets a chance to run.
    src = r"\*\*\*"

    # With text_join disabled, the three text_special tokens must survive.
    parser_disabled = MarkdownIt()
    parser_disabled.disable("text_join")
    raw_children = parser_disabled.parseInline(src)[0].children
    assert raw_children is not None
    assert (
        len(raw_children) > 1
    ), "expected multiple text_special tokens before merging"
    assert all(tok.type == "text_special" for tok in raw_children)

    # Default configuration (text_join on): one merged text token remains.
    parser_default = MarkdownIt()
    joined_children = parser_default.parseInline(src)[0].children
    assert joined_children is not None
    assert len(joined_children) == 1
    assert joined_children[0].type == "text"
    assert joined_children[0].content == "***"
Loading