diff --git a/docs/contributing.md b/docs/contributing.md index 3a6d6aeb..4e3e3149 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -118,7 +118,12 @@ __Note:__ Don't try to replace text with HTML markup! That's not secure. ### Why is my inline rule not executed? -The inline parser skips pieces of texts to optimize speed. It stops only on [a small set of chars](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_inline/text.mjs), which can be tokens. We did not made this list extensible for performance reasons too. +The inline parser skips pieces of text to optimize speed. It stops only on [a small set of chars](https://github.com/executablebooks/markdown-it-py/blob/master/markdown_it/parser_inline.py), which can be tokens. -If you are absolutely sure that something important is missing there - create a -ticket and we will consider adding it as a new charcode. +If your inline rule needs to trigger on a character that is not in the default terminator set, you can register it via `md.inline.add_terminator_char`: + +```python +def my_plugin(md: MarkdownIt) -> None: + md.inline.add_terminator_char("w") # stop text rule on 'w' + md.inline.ruler.push("my_rule", my_inline_rule) +``` diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py index 26ec2e63..8fabb988 100644 --- a/markdown_it/parser_inline.py +++ b/markdown_it/parser_inline.py @@ -3,6 +3,8 @@ from __future__ import annotations from collections.abc import Callable +import functools +import re from typing import TYPE_CHECKING from . import rules_inline @@ -15,6 +17,47 @@ from markdown_it import MarkdownIt +# Default set of characters that terminate a text token and allow inline rules to fire. +# '{}$%@~+=:' reserved for extensions. +# Note: Don't confuse with "Markdown ASCII Punctuation" chars. 
+# http://spec.commonmark.org/0.15/#ascii-punctuation-character +_DEFAULT_TERMINATORS: frozenset[str] = frozenset( + { + "\n", + "!", + "#", + "$", + "%", + "&", + "*", + "+", + "-", + ":", + "<", + "=", + ">", + "@", + "[", + "\\", + "]", + "^", + "_", + "`", + "{", + "}", + "~", + } +) + + +# Lazily compiled regex for the default terminator set. The @cache ensures it is +# compiled at most once (on first ParserInline instantiation) and shared across all +# instances that have not added extra chars, keeping __init__ cost near zero. +@functools.cache +def _default_terminator_re() -> re.Pattern[str]: + return re.compile("[" + re.escape("".join(_DEFAULT_TERMINATORS)) + "]") + + # Parser rules RuleFuncInlineType = Callable[[StateInline, bool], bool] """(state: StateInline, silent: bool) -> matched: bool) @@ -61,6 +104,30 @@ def __init__(self) -> None: self.ruler2 = Ruler[RuleFuncInline2Type]() for name, rule2 in _rules2: self.ruler2.push(name, rule2) + # Characters that stop the text rule, allowing other inline rules to fire. + # _extra_terminator_chars holds only the chars registered via add_terminator_char() + # that are outside the defaults; it starts empty, so the shared default regex is reused. + self._extra_terminator_chars: set[str] = set() + # Pre-compiled regex shared with all default instances (no copy in the common path). + self.terminator_re: re.Pattern[str] = _default_terminator_re() + + def add_terminator_char(self, ch: str) -> None: + """Register a character that stops the ``text`` rule, allowing inline rules to fire. + + This lets plugins declare which characters their inline rules react to, + mirroring the ``MARKER`` mechanism in the Rust markdown-it implementation. + + :param ch: A single character to add to the terminator set (the length is not checked). 
+ """ + if ch not in _DEFAULT_TERMINATORS and ch not in self._extra_terminator_chars: + self._extra_terminator_chars.add(ch) + self.terminator_re = re.compile( + "[" + + re.escape( + "".join(_DEFAULT_TERMINATORS | self._extra_terminator_chars) + ) + + "]" + ) def skipToken(self, state: StateInline) -> None: """Skip single token by running all rules in validation mode; diff --git a/markdown_it/rules_inline/text.py b/markdown_it/rules_inline/text.py index 18b2fcc7..ef0cc9ce 100644 --- a/markdown_it/rules_inline/text.py +++ b/markdown_it/rules_inline/text.py @@ -1,54 +1,15 @@ -import functools -import re - # Skip text characters for text token, place those to pending buffer # and increment current pos from .state_inline import StateInline # Rule to skip pure text -# '{}$%@~+=:' reserved for extensions - -# !!!! Don't confuse with "Markdown ASCII Punctuation" chars -# http://spec.commonmark.org/0.15/#ascii-punctuation-character - - -_TerminatorChars = { - "\n", - "!", - "#", - "$", - "%", - "&", - "*", - "+", - "-", - ":", - "<", - "=", - ">", - "@", - "[", - "\\", - "]", - "^", - "_", - "`", - "{", - "}", - "~", -} - - -@functools.cache -def _terminator_char_regex() -> re.Pattern[str]: - return re.compile("[" + re.escape("".join(_TerminatorChars)) + "]") def text(state: StateInline, silent: bool) -> bool: pos = state.pos posMax = state.posMax - terminator_char = _terminator_char_regex().search(state.src, pos) + terminator_char = state.md.inline.terminator_re.search(state.src, pos) pos = terminator_char.start() if terminator_char else posMax if pos == state.pos: diff --git a/tests/test_api/test_plugin_creation.py b/tests/test_api/test_plugin_creation.py index d555be18..1970ced2 100644 --- a/tests/test_api/test_plugin_creation.py +++ b/tests/test_api/test_plugin_creation.py @@ -89,3 +89,46 @@ def _plugin(_md: MarkdownIt) -> None: MarkdownIt().use(_plugin).parse("a") assert "plugin called" in capsys.readouterr().out + + +def test_add_terminator_char(): + """Test that 
add_terminator_char stops the text rule on a new character.""" + hit_positions = [] + + def w_rule(state, silent): + if state.src[state.pos] != "w": + return False + hit_positions.append(state.pos) + state.pos += 1 + return True + + def _plugin(_md: MarkdownIt) -> None: + _md.inline.add_terminator_char("w") + _md.inline.ruler.before("text", "w_rule", w_rule) + + md = MarkdownIt().use(_plugin) + + # Without the terminator 'w' would be consumed as plain text; + # with it the rule fires exactly for the 'w' at position 1 in "awb". + md.render("awb") + assert hit_positions == [1] + + +def test_add_terminator_char_idempotent(): + """add_terminator_char with an already-present char should not rebuild the regex.""" + md = MarkdownIt() + original_re = md.inline.terminator_re + + # '\n' is already in the default set – adding it again must not rebuild + md.inline.add_terminator_char("\n") + assert md.inline.terminator_re is original_re + + +def test_add_terminator_char_rebuilds(): + """add_terminator_char with a new char should rebuild the regex.""" + md = MarkdownIt() + original_re = md.inline.terminator_re + + md.inline.add_terminator_char("w") + assert md.inline.terminator_re is not original_re + assert "w" in md.inline._extra_terminator_chars