Skip to content

Commit d630820

Browse files
committed
use compiled regex for escaping patterns
1 parent 24977fd commit d630820

File tree

1 file changed

+27
-14
lines changed

1 file changed

+27
-14
lines changed

markdownify/__init__.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,26 @@
44
import six
55

66

7+
# General-purpose regex patterns
78
re_convert_heading = re.compile(r'convert_h(\d+)')
89
re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
910
re_whitespace = re.compile(r'[\t ]+')
1011
re_all_whitespace = re.compile(r'[\t \r\n]+')
1112
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
1213
re_html_heading = re.compile(r'h[1-6]')
1314

14-
# extract (leading_nl, content, trailing_nl) from a string
15+
# Pattern for creating convert_<tag> function names from tag names
16+
re_make_convert_fn_name = re.compile(r'[\[\]:-]')
17+
18+
# Extract (leading_nl, content, trailing_nl) from a string
1519
# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy matching is faster than non-greedy here)
1620
re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)
1721

22+
# Patterns for escaping
23+
re_chars_to_escape = re.compile(r'([]\\&<`[>~=+|])')
24+
re_dash_sequences_to_escape = re.compile(r'(\s|^)(-+(?:\s|$))')
25+
re_hashes_to_escape = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')
26+
re_list_items_to_escape = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
1827

1928
# Heading styles
2029
ATX = 'atx'
@@ -266,7 +275,7 @@ def _can_ignore(el):
266275
text = ''.join(child_strings)
267276

268277
# apply this tag's final conversion function
269-
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
278+
convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub('_', node.name)
270279
convert_fn = getattr(self, convert_fn_name, None)
271280
if convert_fn and self.should_convert_tag(node.name):
272281
text = convert_fn(node, text, parent_tags=parent_tags)
@@ -351,20 +360,24 @@ def escape(self, text, parent_tags):
351360
if not text:
352361
return ''
353362
if self.options['escape_misc']:
354-
text = re.sub(r'([]\\&<`[>~=+|])', r'\\\1', text)
355-
# A sequence of one or more consecutive '-', preceded and
356-
# followed by whitespace or start/end of fragment, might
357-
# be confused with an underline of a header, or with a
363+
# Escape miscellaneous special Markdown characters.
364+
text = re_chars_to_escape.sub(r'\\\1', text)
365+
366+
# Escape a sequence of one or more consecutive '-', preceded
367+
# and followed by whitespace or start/end of fragment, as it
368+
# might be confused with an underline of a header, or with a
358369
# list marker.
359-
text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
360-
# A sequence of up to six consecutive '#', preceded and
361-
# followed by whitespace or start/end of fragment, might
362-
# be confused with an ATX heading.
363-
text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
364-
# '.' or ')' preceded by up to nine digits might be
370+
text = re_dash_sequences_to_escape.sub(r'\1\\\2', text)
371+
372+
# Escape a sequence of up to six consecutive '#', preceded
373+
# and followed by whitespace or start/end of fragment, as
374+
# it might be confused with an ATX heading.
375+
text = re_hashes_to_escape.sub(r'\1\\\2', text)
376+
377+
# Escape '.' or ')' preceded by up to nine digits, as it might be
365378
# confused with a list item.
366-
text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
367-
text)
379+
text = re_list_items_to_escape.sub(r'\1\\\2', text)
380+
368381
if self.options['escape_asterisks']:
369382
text = text.replace('*', r'\*')
370383
if self.options['escape_underscores']:

0 commit comments

Comments
 (0)