From c6a7430ddef3625a8b101538179db7a4820ac596 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 4 May 2026 21:13:06 +0000 Subject: [PATCH 1/6] Initial plan From 7b870ef5f046aca572b4ff50df0bae8781b1bfac Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 4 May 2026 21:15:55 +0000 Subject: [PATCH 2/6] =?UTF-8?q?=E2=9C=A8=20NEW:=20Allow=20plugins=20to=20r?= =?UTF-8?q?egister=20inline=20terminator=20characters?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agent-Logs-Url: https://github.com/executablebooks/markdown-it-py/sessions/e9a49254-6b3a-4ecc-9b57-84f6df4e6ccd Co-authored-by: chrisjsewell <2997570+chrisjsewell@users.noreply.github.com> --- docs/contributing.md | 11 +++-- markdown_it/parser_inline.py | 56 ++++++++++++++++++++++++++ markdown_it/rules_inline/text.py | 41 +------------------ tests/test_api/test_plugin_creation.py | 43 ++++++++++++++++++++ 4 files changed, 108 insertions(+), 43 deletions(-) diff --git a/docs/contributing.md b/docs/contributing.md index 3a6d6aeb..4e3e3149 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -118,7 +118,12 @@ __Note:__ Don't try to replace text with HTML markup! That's not secure. ### Why is my inline rule not executed? -The inline parser skips pieces of texts to optimize speed. It stops only on [a small set of chars](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_inline/text.mjs), which can be tokens. We did not made this list extensible for performance reasons too. +The inline parser skips pieces of text to optimize speed. It stops only on [a small set of chars](https://github.com/executablebooks/markdown-it-py/blob/master/markdown_it/parser_inline.py), which can be tokens. -If you are absolutely sure that something important is missing there - create a -ticket and we will consider adding it as a new charcode. 
+If your inline rule needs to trigger on a character that is not in the default terminator set, you can register it via `md.inline.add_terminator_char`: + +```python +def my_plugin(md: MarkdownIt) -> None: + md.inline.add_terminator_char("w") # stop text rule on 'w' + md.inline.ruler.push("my_rule", my_inline_rule) +``` diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py index 26ec2e63..2cabaf67 100644 --- a/markdown_it/parser_inline.py +++ b/markdown_it/parser_inline.py @@ -2,6 +2,7 @@ from __future__ import annotations +import re from collections.abc import Callable from typing import TYPE_CHECKING @@ -15,6 +16,39 @@ from markdown_it import MarkdownIt +# Default set of characters that terminate a text token and allow inline rules to fire. +# '{}$%@~+=:' reserved for extensions. +# Note: Don't confuse with "Markdown ASCII Punctuation" chars. +# http://spec.commonmark.org/0.15/#ascii-punctuation-character +_DEFAULT_TERMINATORS: frozenset[str] = frozenset( + { + "\n", + "!", + "#", + "$", + "%", + "&", + "*", + "+", + "-", + ":", + "<", + "=", + ">", + "@", + "[", + "\\", + "]", + "^", + "_", + "`", + "{", + "}", + "~", + } +) + + # Parser rules RuleFuncInlineType = Callable[[StateInline, bool], bool] """(state: StateInline, silent: bool) -> matched: bool) @@ -61,6 +95,28 @@ def __init__(self) -> None: self.ruler2 = Ruler[RuleFuncInline2Type]() for name, rule2 in _rules2: self.ruler2.push(name, rule2) + # Characters that stop the text rule, allowing other inline rules to fire. + self._terminator_chars: set[str] = set(_DEFAULT_TERMINATORS) + # Pre-compiled regex is kept in sync with _terminator_chars (updated eagerly in + # add_terminator_char) so there is no per-call None-check overhead in the hot path. 
+ self.terminator_re: re.Pattern[str] = self._build_terminator_re() + + def _build_terminator_re(self) -> re.Pattern[str]: + return re.compile( + "[" + re.escape("".join(sorted(self._terminator_chars))) + "]" + ) + + def add_terminator_char(self, ch: str) -> None: + """Register a character that stops the ``text`` rule, allowing inline rules to fire. + + This lets plugins declare which characters their inline rules react to, + mirroring the ``MARKER`` mechanism in the Rust markdown-it implementation. + + :param ch: A single character to add to the terminator set. + """ + if ch not in self._terminator_chars: + self._terminator_chars.add(ch) + self.terminator_re = self._build_terminator_re() def skipToken(self, state: StateInline) -> None: """Skip single token by running all rules in validation mode; diff --git a/markdown_it/rules_inline/text.py b/markdown_it/rules_inline/text.py index 18b2fcc7..ef0cc9ce 100644 --- a/markdown_it/rules_inline/text.py +++ b/markdown_it/rules_inline/text.py @@ -1,54 +1,15 @@ -import functools -import re - # Skip text characters for text token, place those to pending buffer # and increment current pos from .state_inline import StateInline # Rule to skip pure text -# '{}$%@~+=:' reserved for extensions - -# !!!! 
Don't confuse with "Markdown ASCII Punctuation" chars -# http://spec.commonmark.org/0.15/#ascii-punctuation-character - - -_TerminatorChars = { - "\n", - "!", - "#", - "$", - "%", - "&", - "*", - "+", - "-", - ":", - "<", - "=", - ">", - "@", - "[", - "\\", - "]", - "^", - "_", - "`", - "{", - "}", - "~", -} - - -@functools.cache -def _terminator_char_regex() -> re.Pattern[str]: - return re.compile("[" + re.escape("".join(_TerminatorChars)) + "]") def text(state: StateInline, silent: bool) -> bool: pos = state.pos posMax = state.posMax - terminator_char = _terminator_char_regex().search(state.src, pos) + terminator_char = state.md.inline.terminator_re.search(state.src, pos) pos = terminator_char.start() if terminator_char else posMax if pos == state.pos: diff --git a/tests/test_api/test_plugin_creation.py b/tests/test_api/test_plugin_creation.py index d555be18..611ca4c1 100644 --- a/tests/test_api/test_plugin_creation.py +++ b/tests/test_api/test_plugin_creation.py @@ -89,3 +89,46 @@ def _plugin(_md: MarkdownIt) -> None: MarkdownIt().use(_plugin).parse("a") assert "plugin called" in capsys.readouterr().out + + +def test_add_terminator_char(): + """Test that add_terminator_char stops the text rule on a new character.""" + hit_positions = [] + + def w_rule(state, silent): + if state.src[state.pos] != "w": + return False + hit_positions.append(state.pos) + state.pos += 1 + return True + + def _plugin(_md: MarkdownIt) -> None: + _md.inline.add_terminator_char("w") + _md.inline.ruler.before("text", "w_rule", w_rule) + + md = MarkdownIt().use(_plugin) + + # Without the terminator 'w' would be consumed as plain text; + # with it the rule fires for each 'w'. 
+ md.render("awb") + assert len(hit_positions) > 0 + + +def test_add_terminator_char_idempotent(): + """add_terminator_char with an already-present char should not rebuild the regex.""" + md = MarkdownIt() + original_re = md.inline.terminator_re + + # '\n' is already in the default set – adding it again must not rebuild + md.inline.add_terminator_char("\n") + assert md.inline.terminator_re is original_re + + +def test_add_terminator_char_rebuilds(): + """add_terminator_char with a new char should rebuild the regex.""" + md = MarkdownIt() + original_re = md.inline.terminator_re + + md.inline.add_terminator_char("w") + assert md.inline.terminator_re is not original_re + assert "w" in md.inline._terminator_chars From 3a1fb69b75fdbbf67635f8bb1e601d667e87c4ee Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 4 May 2026 21:16:57 +0000 Subject: [PATCH 3/6] Address code review: remove sorted() overhead, tighten test assertion Agent-Logs-Url: https://github.com/executablebooks/markdown-it-py/sessions/e9a49254-6b3a-4ecc-9b57-84f6df4e6ccd Co-authored-by: chrisjsewell <2997570+chrisjsewell@users.noreply.github.com> --- markdown_it/parser_inline.py | 2 +- tests/test_api/test_plugin_creation.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py index 2cabaf67..0f0e5029 100644 --- a/markdown_it/parser_inline.py +++ b/markdown_it/parser_inline.py @@ -103,7 +103,7 @@ def __init__(self) -> None: def _build_terminator_re(self) -> re.Pattern[str]: return re.compile( - "[" + re.escape("".join(sorted(self._terminator_chars))) + "]" + "[" + re.escape("".join(self._terminator_chars)) + "]" ) def add_terminator_char(self, ch: str) -> None: diff --git a/tests/test_api/test_plugin_creation.py b/tests/test_api/test_plugin_creation.py index 611ca4c1..41e5015e 100644 --- a/tests/test_api/test_plugin_creation.py +++ b/tests/test_api/test_plugin_creation.py 
@@ -109,9 +109,9 @@ def _plugin(_md: MarkdownIt) -> None: md = MarkdownIt().use(_plugin) # Without the terminator 'w' would be consumed as plain text; - # with it the rule fires for each 'w'. + # with it the rule fires exactly for the 'w' at position 1 in "awb". md.render("awb") - assert len(hit_positions) > 0 + assert hit_positions == [1] def test_add_terminator_char_idempotent(): From 6100aeb9fa02367a9ea23f30e9de3349b751c39d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 5 May 2026 03:16:10 +0000 Subject: [PATCH 4/6] Avoid per-instance set copy; share module-level default terminator regex Agent-Logs-Url: https://github.com/executablebooks/markdown-it-py/sessions/913c6f4f-7b8b-496d-9626-f0e5b7c8bc0a Co-authored-by: chrisjsewell <2997570+chrisjsewell@users.noreply.github.com> --- markdown_it/parser_inline.py | 30 +++++++++++++++----------- tests/test_api/test_plugin_creation.py | 2 +- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py index 0f0e5029..938c5f93 100644 --- a/markdown_it/parser_inline.py +++ b/markdown_it/parser_inline.py @@ -48,6 +48,12 @@ } ) +# Pre-compiled regex for the default terminator set. Shared across all ParserInline +# instances that have not had extra chars added, so __init__ pays no allocation cost. +_default_terminator_re: re.Pattern[str] = re.compile( + "[" + re.escape("".join(_DEFAULT_TERMINATORS)) + "]" +) + # Parser rules RuleFuncInlineType = Callable[[StateInline, bool], bool] @@ -96,15 +102,11 @@ def __init__(self) -> None: for name, rule2 in _rules2: self.ruler2.push(name, rule2) # Characters that stop the text rule, allowing other inline rules to fire. - self._terminator_chars: set[str] = set(_DEFAULT_TERMINATORS) - # Pre-compiled regex is kept in sync with _terminator_chars (updated eagerly in - # add_terminator_char) so there is no per-call None-check overhead in the hot path. 
- self.terminator_re: re.Pattern[str] = self._build_terminator_re() - - def _build_terminator_re(self) -> re.Pattern[str]: - return re.compile( - "[" + re.escape("".join(self._terminator_chars)) + "]" - ) + # _extra_terminator_chars is only allocated when add_terminator_char() is called + # with a char outside the defaults, keeping __init__ allocation-free. + self._extra_terminator_chars: set[str] = set() + # Pre-compiled regex shared with all default instances (no copy in the common path). + self.terminator_re: re.Pattern[str] = _default_terminator_re def add_terminator_char(self, ch: str) -> None: """Register a character that stops the ``text`` rule, allowing inline rules to fire. @@ -114,9 +116,13 @@ def add_terminator_char(self, ch: str) -> None: :param ch: A single character to add to the terminator set. """ - if ch not in self._terminator_chars: - self._terminator_chars.add(ch) - self.terminator_re = self._build_terminator_re() + if ch not in _DEFAULT_TERMINATORS and ch not in self._extra_terminator_chars: + self._extra_terminator_chars.add(ch) + self.terminator_re = re.compile( + "[" + + re.escape("".join(_DEFAULT_TERMINATORS | self._extra_terminator_chars)) + + "]" + ) def skipToken(self, state: StateInline) -> None: """Skip single token by running all rules in validation mode; diff --git a/tests/test_api/test_plugin_creation.py b/tests/test_api/test_plugin_creation.py index 41e5015e..1970ced2 100644 --- a/tests/test_api/test_plugin_creation.py +++ b/tests/test_api/test_plugin_creation.py @@ -131,4 +131,4 @@ def test_add_terminator_char_rebuilds(): md.inline.add_terminator_char("w") assert md.inline.terminator_re is not original_re - assert "w" in md.inline._terminator_chars + assert "w" in md.inline._extra_terminator_chars From d0d8d464edc3143b8081f8df0291d189b9c0b16a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 5 May 2026 03:40:12 +0000 Subject: [PATCH 5/6] Make default terminator regex a 
lazily-computed cached function Agent-Logs-Url: https://github.com/executablebooks/markdown-it-py/sessions/04e84ce1-b921-4475-9f26-c0b296e29e33 Co-authored-by: chrisjsewell <2997570+chrisjsewell@users.noreply.github.com> --- markdown_it/parser_inline.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py index 938c5f93..632a5442 100644 --- a/markdown_it/parser_inline.py +++ b/markdown_it/parser_inline.py @@ -2,6 +2,7 @@ from __future__ import annotations +import functools import re from collections.abc import Callable from typing import TYPE_CHECKING @@ -48,11 +49,14 @@ } ) -# Pre-compiled regex for the default terminator set. Shared across all ParserInline -# instances that have not had extra chars added, so __init__ pays no allocation cost. -_default_terminator_re: re.Pattern[str] = re.compile( - "[" + re.escape("".join(_DEFAULT_TERMINATORS)) + "]" -) +# Lazily compiled regex for the default terminator set. The @cache ensures it is +# compiled at most once (on first ParserInline instantiation) and shared across all +# instances that have not added extra chars, keeping __init__ cost near zero. +@functools.cache +def _default_terminator_re() -> re.Pattern[str]: + return re.compile( + "[" + re.escape("".join(_DEFAULT_TERMINATORS)) + "]" + ) # Parser rules @@ -106,7 +110,7 @@ def __init__(self) -> None: # with a char outside the defaults, keeping __init__ allocation-free. self._extra_terminator_chars: set[str] = set() # Pre-compiled regex shared with all default instances (no copy in the common path). - self.terminator_re: re.Pattern[str] = _default_terminator_re + self.terminator_re: re.Pattern[str] = _default_terminator_re() def add_terminator_char(self, ch: str) -> None: """Register a character that stops the ``text`` rule, allowing inline rules to fire. 
From d5cf7ff5a663d23dac72bd68706a9671ac9e0542 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 5 May 2026 06:56:43 +0000 Subject: [PATCH 6/6] Fix pre-commit: ruff import ordering and formatting Agent-Logs-Url: https://github.com/executablebooks/markdown-it-py/sessions/1c71001c-d7bd-4b35-8682-9c0afb71b1a9 Co-authored-by: chrisjsewell <2997570+chrisjsewell@users.noreply.github.com> --- markdown_it/parser_inline.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py index 632a5442..8fabb988 100644 --- a/markdown_it/parser_inline.py +++ b/markdown_it/parser_inline.py @@ -2,9 +2,9 @@ from __future__ import annotations +from collections.abc import Callable import functools import re -from collections.abc import Callable from typing import TYPE_CHECKING from . import rules_inline @@ -49,14 +49,13 @@ } ) + # Lazily compiled regex for the default terminator set. The @cache ensures it is # compiled at most once (on first ParserInline instantiation) and shared across all # instances that have not added extra chars, keeping __init__ cost near zero. @functools.cache def _default_terminator_re() -> re.Pattern[str]: - return re.compile( - "[" + re.escape("".join(_DEFAULT_TERMINATORS)) + "]" - ) + return re.compile("[" + re.escape("".join(_DEFAULT_TERMINATORS)) + "]") # Parser rules @@ -124,7 +123,9 @@ def add_terminator_char(self, ch: str) -> None: self._extra_terminator_chars.add(ch) self.terminator_re = re.compile( "[" - + re.escape("".join(_DEFAULT_TERMINATORS | self._extra_terminator_chars)) + + re.escape( + "".join(_DEFAULT_TERMINATORS | self._extra_terminator_chars) + ) + "]" )