Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions docs/contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,12 @@ __Note:__ Don't try to replace text with HTML markup! That's not secure.

### Why is my inline rule not executed?

The inline parser skips pieces of texts to optimize speed. It stops only on [a small set of chars](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_inline/text.mjs), which can be tokens. We did not made this list extensible for performance reasons too.
The inline parser skips pieces of texts to optimize speed. It stops only on [a small set of chars](https://github.com/executablebooks/markdown-it-py/blob/master/markdown_it/parser_inline.py), which can be tokens.

If you are absolutely sure that something important is missing there - create a
ticket and we will consider adding it as a new charcode.
If your inline rule needs to trigger on a character that is not in the default terminator set, you can register it via `md.inline.add_terminator_char`:

```python
def my_plugin(md: MarkdownIt) -> None:
md.inline.add_terminator_char("w") # stop text rule on 'w'
md.inline.ruler.push("my_rule", my_inline_rule)
```
67 changes: 67 additions & 0 deletions markdown_it/parser_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from __future__ import annotations

from collections.abc import Callable
import functools
import re
from typing import TYPE_CHECKING

from . import rules_inline
Expand All @@ -15,6 +17,47 @@
from markdown_it import MarkdownIt


# Default set of characters that terminate a text token and allow inline rules to fire.
# '{}$%@~+=:' reserved for extensions.
# Note: Don't confuse with "Markdown ASCII Punctuation" chars.
# http://spec.commonmark.org/0.15/#ascii-punctuation-character
# (frozenset(str) yields one entry per character of the literal.)
_DEFAULT_TERMINATORS: frozenset[str] = frozenset("\n!#$%&*+-:<=>@[\\]^_`{}~")


# Lazily compiled regex for the default terminator set. The @cache ensures it is
# compiled at most once (on first ParserInline instantiation) and shared across all
# instances that have not added extra chars, keeping __init__ cost near zero.
@functools.cache
def _default_terminator_re() -> re.Pattern[str]:
    """Return the (singleton) character-class regex matching any default terminator."""
    escaped = re.escape("".join(_DEFAULT_TERMINATORS))
    return re.compile(f"[{escaped}]")


# Parser rules
RuleFuncInlineType = Callable[[StateInline, bool], bool]
"""(state: StateInline, silent: bool) -> matched: bool)
Expand Down Expand Up @@ -61,6 +104,30 @@ def __init__(self) -> None:
self.ruler2 = Ruler[RuleFuncInline2Type]()
for name, rule2 in _rules2:
self.ruler2.push(name, rule2)
# Characters that stop the text rule, allowing other inline rules to fire.
# _extra_terminator_chars is only allocated when add_terminator_char() is called
# with a char outside the defaults, keeping __init__ allocation-free.
self._extra_terminator_chars: set[str] = set()
# Pre-compiled regex shared with all default instances (no copy in the common path).
self.terminator_re: re.Pattern[str] = _default_terminator_re()

def add_terminator_char(self, ch: str) -> None:
    """Register a character that stops the ``text`` rule, allowing inline rules to fire.

    This lets plugins declare which characters their inline rules react to,
    mirroring the ``MARKER`` mechanism in the Rust markdown-it implementation.
    Registering a character that is already a terminator is a cheap no-op:
    the shared pre-compiled regex is kept as-is.

    :param ch: A single character to add to the terminator set.
    :raises ValueError: If ``ch`` is not exactly one character. A longer
        string would silently add *each* of its characters to the set,
        which is almost certainly not what the caller intended.
    """
    if len(ch) != 1:
        raise ValueError(f"terminator must be a single character, got {ch!r}")
    # Already covered (either by the defaults or a previous call): skip the
    # regex rebuild so instances on the default set keep sharing one pattern.
    if ch in _DEFAULT_TERMINATORS or ch in self._extra_terminator_chars:
        return
    self._extra_terminator_chars.add(ch)
    chars = "".join(_DEFAULT_TERMINATORS | self._extra_terminator_chars)
    self.terminator_re = re.compile("[" + re.escape(chars) + "]")

def skipToken(self, state: StateInline) -> None:
"""Skip single token by running all rules in validation mode;
Expand Down
41 changes: 1 addition & 40 deletions markdown_it/rules_inline/text.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,15 @@
import functools
import re

# Skip text characters for text token, place those to pending buffer
# and increment current pos
from .state_inline import StateInline

# Rule to skip pure text
# '{}$%@~+=:' reserved for extensions

# !!!! Don't confuse with "Markdown ASCII Punctuation" chars
# http://spec.commonmark.org/0.15/#ascii-punctuation-character


_TerminatorChars = {
"\n",
"!",
"#",
"$",
"%",
"&",
"*",
"+",
"-",
":",
"<",
"=",
">",
"@",
"[",
"\\",
"]",
"^",
"_",
"`",
"{",
"}",
"~",
}


@functools.cache
def _terminator_char_regex() -> re.Pattern[str]:
return re.compile("[" + re.escape("".join(_TerminatorChars)) + "]")


def text(state: StateInline, silent: bool) -> bool:
pos = state.pos
posMax = state.posMax

terminator_char = _terminator_char_regex().search(state.src, pos)
terminator_char = state.md.inline.terminator_re.search(state.src, pos)
pos = terminator_char.start() if terminator_char else posMax

if pos == state.pos:
Expand Down
43 changes: 43 additions & 0 deletions tests/test_api/test_plugin_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,46 @@ def _plugin(_md: MarkdownIt) -> None:

MarkdownIt().use(_plugin).parse("a")
assert "plugin called" in capsys.readouterr().out


def test_add_terminator_char():
    """Test that add_terminator_char stops the text rule on a new character."""
    seen = []

    def rule_w(state, silent):
        # Fires only when the parser stops exactly on a 'w'.
        if state.src[state.pos] != "w":
            return False
        seen.append(state.pos)
        state.pos += 1
        return True

    def plugin(md_: MarkdownIt) -> None:
        md_.inline.add_terminator_char("w")
        md_.inline.ruler.before("text", "w_rule", rule_w)

    md = MarkdownIt().use(plugin)

    # Without the terminator 'w' would be consumed as plain text;
    # with it the rule fires exactly for the 'w' at position 1 in "awb".
    md.render("awb")
    assert seen == [1]


def test_add_terminator_char_idempotent():
    """add_terminator_char with an already-present char should not rebuild the regex."""
    parser = MarkdownIt()
    regex_before = parser.inline.terminator_re

    # '\n' is already in the default set – adding it again must not rebuild
    parser.inline.add_terminator_char("\n")
    assert parser.inline.terminator_re is regex_before


def test_add_terminator_char_rebuilds():
    """add_terminator_char with a new char should rebuild the regex."""
    parser = MarkdownIt()
    regex_before = parser.inline.terminator_re

    parser.inline.add_terminator_char("w")
    # A genuinely new char must produce a fresh pattern and be recorded.
    assert parser.inline.terminator_re is not regex_before
    assert "w" in parser.inline._extra_terminator_chars
Loading