Skip to content

Commit 9848a04

Browse files
committed
propagate a set of parent tag names downward to improve runtime
Signed-off-by: chrispy <chrispy@synopsys.com>
1 parent 3026602 commit 9848a04

2 files changed

Lines changed: 71 additions & 66 deletions

File tree

markdownify/__init__.py

Lines changed: 68 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,13 @@ def abstract_inline_conversion(markup_fn):
5353
the text if it looks like an HTML tag. markup_fn is necessary to allow for
5454
references to self.strong_em_symbol etc.
5555
"""
56-
def implementation(self, el, text, convert_as_inline):
56+
def implementation(self, el, text, parent_tags):
5757
markup_prefix = markup_fn(self)
5858
if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
5959
markup_suffix = '</' + markup_prefix[1:]
6060
else:
6161
markup_suffix = markup_prefix
62-
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
62+
if '_noformat' in parent_tags:
6363
return text
6464
prefix, suffix, text = chomp(text)
6565
if not text:
@@ -166,18 +166,13 @@ def convert(self, html):
166166
return self.convert_soup(soup)
167167

168168
def convert_soup(self, soup):
169-
return self.process_tag(soup, convert_as_inline=False)
169+
return self.process_tag(soup, parent_tags=set())
170170

171-
def process_tag(self, node, convert_as_inline):
172-
text = ''
171+
def process_tag(self, node, parent_tags=None):
172+
if parent_tags is None:
173+
parent_tags = set()
173174

174-
# For Markdown headings and table cells, convert children as inline
175-
# (so that block element children do not produce newlines).
176-
convert_children_as_inline = (
177-
convert_as_inline # propagated from parent
178-
or html_heading_re.match(node.name) is not None # headings
179-
or node.name in ['td', 'th'] # table cells
180-
)
175+
text = ''
181176

182177
# Collect child elements to process, ignoring whitespace-only text elements
183178
# adjacent to the inner/outer boundaries of block elements.
@@ -208,14 +203,30 @@ def _can_ignore(el):
208203

209204
children_to_convert = [child for child in node.children if not _can_ignore(child)]
210205

206+
# create a copy of this tag's parent context, then update it to include this tag
207+
# to propagate down into the children
208+
parent_tags_for_children = set(parent_tags)
209+
parent_tags_for_children.add(node.name)
210+
211+
# if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
212+
if (
213+
html_heading_re.match(node.name) is not None # headings
214+
or node.name in {'td', 'th'} # table cells
215+
):
216+
parent_tags_for_children.add('_inline')
217+
218+
# if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
219+
if node.name in {'pre', 'code', 'kbd', 'samp'}:
220+
parent_tags_for_children.add('_noformat')
221+
211222
# Convert the children first
212223
for el in children_to_convert:
213224
if isinstance(el, NavigableString):
214-
text += self.process_text(el)
225+
text += self.process_text(el, parent_tags=parent_tags_for_children)
215226
else:
216227
text_strip = text.rstrip('\n')
217228
newlines_left = len(text) - len(text_strip)
218-
next_text = self.process_tag(el, convert_children_as_inline)
229+
next_text = self.process_tag(el, parent_tags=parent_tags_for_children)
219230
next_text_strip = next_text.lstrip('\n')
220231
newlines_right = len(next_text) - len(next_text_strip)
221232
newlines = '\n' * max(newlines_left, newlines_right)
@@ -225,11 +236,11 @@ def _can_ignore(el):
225236
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
226237
convert_fn = getattr(self, convert_fn_name, None)
227238
if convert_fn and self.should_convert_tag(node.name):
228-
text = convert_fn(node, text, convert_as_inline)
239+
text = convert_fn(node, text, parent_tags=parent_tags)
229240

230241
return text
231242

232-
def convert__document_(self, el, text, convert_as_inline):
243+
def convert__document_(self, el, text, parent_tags):
233244
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
234245
if self.options['strip_document'] == LSTRIP:
235246
text = text.lstrip('\n') # remove leading separation newlines
@@ -244,19 +255,19 @@ def convert__document_(self, el, text, convert_as_inline):
244255

245256
return text
246257

247-
def process_text(self, el):
258+
def process_text(self, el, parent_tags):
248259
text = six.text_type(el) or ''
249260

250261
# normalize whitespace if we're not inside a preformatted element
251-
if not el.find_parent('pre'):
262+
if 'pre' not in parent_tags:
252263
if self.options['wrap']:
253264
text = all_whitespace_re.sub(' ', text)
254265
else:
255266
text = newline_whitespace_re.sub('\n', text)
256267
text = whitespace_re.sub(' ', text)
257268

258269
# escape special characters if we're not inside a preformatted or code element
259-
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
270+
if '_noformat' not in parent_tags:
260271
text = self.escape(text)
261272

262273
# remove leading whitespace at the start or just after a
@@ -279,8 +290,8 @@ def __getattr__(self, attr):
279290
if m:
280291
n = int(m.group(1))
281292

282-
def convert_tag(el, text, convert_as_inline):
283-
return self._convert_hn(n, el, text, convert_as_inline)
293+
def convert_tag(el, text, parent_tags):
294+
return self._convert_hn(n, el, text, parent_tags)
284295

285296
convert_tag.__name__ = 'convert_h%s' % n
286297
setattr(self, convert_tag.__name__, convert_tag)
@@ -327,8 +338,8 @@ def underline(self, text, pad_char):
327338
text = (text or '').rstrip()
328339
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
329340

330-
def convert_a(self, el, text, convert_as_inline):
331-
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
341+
def convert_a(self, el, text, parent_tags):
342+
if '_noformat' in parent_tags:
332343
return text
333344
prefix, suffix, text = chomp(text)
334345
if not text:
@@ -349,10 +360,10 @@ def convert_a(self, el, text, convert_as_inline):
349360

350361
convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
351362

352-
def convert_blockquote(self, el, text, convert_as_inline):
363+
def convert_blockquote(self, el, text, parent_tags):
353364
# handle some early-exit scenarios
354365
text = (text or '').strip()
355-
if convert_as_inline:
366+
if '_inline' in parent_tags:
356367
return ' ' + text + ' '
357368
if not text:
358369
return "\n"
@@ -365,25 +376,25 @@ def _indent_for_blockquote(match):
365376

366377
return '\n' + text + '\n\n'
367378

368-
def convert_br(self, el, text, convert_as_inline):
369-
if convert_as_inline:
379+
def convert_br(self, el, text, parent_tags):
380+
if '_inline' in parent_tags:
370381
return ""
371382

372383
if self.options['newline_style'].lower() == BACKSLASH:
373384
return '\\\n'
374385
else:
375386
return ' \n'
376387

377-
def convert_code(self, el, text, convert_as_inline):
378-
if el.parent.name == 'pre':
388+
def convert_code(self, el, text, parent_tags):
389+
if 'pre' in parent_tags:
379390
return text
380391
converter = abstract_inline_conversion(lambda self: '`')
381-
return converter(self, el, text, convert_as_inline)
392+
return converter(self, el, text, parent_tags)
382393

383394
convert_del = abstract_inline_conversion(lambda self: '~~')
384395

385-
def convert_div(self, el, text, convert_as_inline):
386-
if convert_as_inline:
396+
def convert_div(self, el, text, parent_tags):
397+
if '_inline' in parent_tags:
387398
return ' ' + text.strip() + ' '
388399
text = text.strip()
389400
return '\n\n%s\n\n' % text if text else ''
@@ -396,9 +407,9 @@ def convert_div(self, el, text, convert_as_inline):
396407

397408
convert_kbd = convert_code
398409

399-
def convert_dd(self, el, text, convert_as_inline):
410+
def convert_dd(self, el, text, parent_tags):
400411
text = (text or '').strip()
401-
if convert_as_inline:
412+
if '_inline' in parent_tags:
402413
return ' ' + text + ' '
403414
if not text:
404415
return '\n'
@@ -414,11 +425,11 @@ def _indent_for_dd(match):
414425

415426
return '%s\n' % text
416427

417-
def convert_dt(self, el, text, convert_as_inline):
428+
def convert_dt(self, el, text, parent_tags):
418429
# remove newlines from term text
419430
text = (text or '').strip()
420431
text = all_whitespace_re.sub(' ', text)
421-
if convert_as_inline:
432+
if '_inline' in parent_tags:
422433
return ' ' + text + ' '
423434
if not text:
424435
return '\n'
@@ -428,9 +439,9 @@ def convert_dt(self, el, text, convert_as_inline):
428439

429440
return '\n%s\n' % text
430441

431-
def _convert_hn(self, n, el, text, convert_as_inline):
442+
def _convert_hn(self, n, el, text, parent_tags):
432443
""" Method name prefixed with _ to prevent <hn> from calling this """
433-
if convert_as_inline:
444+
if '_inline' in parent_tags:
434445
return text
435446

436447
# prevent MemoryErrors in case of very large n
@@ -447,46 +458,40 @@ def _convert_hn(self, n, el, text, convert_as_inline):
447458
return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
448459
return '\n\n%s %s\n\n' % (hashes, text)
449460

450-
def convert_hr(self, el, text, convert_as_inline):
461+
def convert_hr(self, el, text, parent_tags):
451462
return '\n\n---\n\n'
452463

453464
convert_i = convert_em
454465

455-
def convert_img(self, el, text, convert_as_inline):
466+
def convert_img(self, el, text, parent_tags):
456467
alt = el.attrs.get('alt', None) or ''
457468
src = el.attrs.get('src', None) or ''
458469
title = el.attrs.get('title', None) or ''
459470
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
460-
if (convert_as_inline
471+
if ('_inline' in parent_tags
461472
and el.parent.name not in self.options['keep_inline_images_in']):
462473
return alt
463474

464475
return '![%s](%s%s)' % (alt, src, title_part)
465476

466-
def convert_list(self, el, text, convert_as_inline):
477+
def convert_list(self, el, text, parent_tags):
467478

468479
# Converting a list to inline is undefined.
469-
# Ignoring convert_to_inline for list.
480+
# Ignoring inline conversion parents for list.
470481

471-
nested = False
472482
before_paragraph = False
473483
next_sibling = _next_block_content_sibling(el)
474484
if next_sibling and next_sibling.name not in ['ul', 'ol']:
475485
before_paragraph = True
476-
while el:
477-
if el.name == 'li':
478-
nested = True
479-
break
480-
el = el.parent
481-
if nested:
482-
# remove trailing newline if nested
486+
if 'li' in parent_tags:
487+
# remove trailing newline if we're in a nested list
483488
return '\n' + text.rstrip()
484489
return '\n\n' + text + ('\n' if before_paragraph else '')
485490

486491
convert_ul = convert_list
487492
convert_ol = convert_list
488493

489-
def convert_li(self, el, text, convert_as_inline):
494+
def convert_li(self, el, text, parent_tags):
490495
# handle some early-exit scenarios
491496
text = (text or '').strip()
492497
if not text:
@@ -523,8 +528,8 @@ def _indent_for_li(match):
523528

524529
return '%s\n' % text
525530

526-
def convert_p(self, el, text, convert_as_inline):
527-
if convert_as_inline:
531+
def convert_p(self, el, text, parent_tags):
532+
if '_inline' in parent_tags:
528533
return ' ' + text.strip() + ' '
529534
text = text.strip()
530535
if self.options['wrap']:
@@ -546,7 +551,7 @@ def convert_p(self, el, text, convert_as_inline):
546551
text = '\n'.join(new_lines)
547552
return '\n\n%s\n\n' % text if text else ''
548553

549-
def convert_pre(self, el, text, convert_as_inline):
554+
def convert_pre(self, el, text, parent_tags):
550555
if not text:
551556
return ''
552557
code_language = self.options['code_language']
@@ -556,10 +561,10 @@ def convert_pre(self, el, text, convert_as_inline):
556561

557562
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
558563

559-
def convert_script(self, el, text, convert_as_inline):
564+
def convert_script(self, el, text, parent_tags):
560565
return ''
561566

562-
def convert_style(self, el, text, convert_as_inline):
567+
def convert_style(self, el, text, parent_tags):
563568
return ''
564569

565570
convert_s = convert_del
@@ -572,28 +577,28 @@ def convert_style(self, el, text, convert_as_inline):
572577

573578
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
574579

575-
def convert_table(self, el, text, convert_as_inline):
580+
def convert_table(self, el, text, parent_tags):
576581
return '\n\n' + text.strip() + '\n\n'
577582

578-
def convert_caption(self, el, text, convert_as_inline):
583+
def convert_caption(self, el, text, parent_tags):
579584
return text.strip() + '\n\n'
580585

581-
def convert_figcaption(self, el, text, convert_as_inline):
586+
def convert_figcaption(self, el, text, parent_tags):
582587
return '\n\n' + text.strip() + '\n\n'
583588

584-
def convert_td(self, el, text, convert_as_inline):
589+
def convert_td(self, el, text, parent_tags):
585590
colspan = 1
586591
if 'colspan' in el.attrs and el['colspan'].isdigit():
587592
colspan = int(el['colspan'])
588593
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
589594

590-
def convert_th(self, el, text, convert_as_inline):
595+
def convert_th(self, el, text, parent_tags):
591596
colspan = 1
592597
if 'colspan' in el.attrs and el['colspan'].isdigit():
593598
colspan = int(el['colspan'])
594599
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
595600

596-
def convert_tr(self, el, text, convert_as_inline):
601+
def convert_tr(self, el, text, parent_tags):
597602
cells = el.find_all(['td', 'th'])
598603
is_first_row = el.find_previous_sibling() is None
599604
is_headrow = (

tests/test_custom_converter.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ class UnitTestConverter(MarkdownConverter):
66
"""
77
Create a custom MarkdownConverter for unit tests
88
"""
9-
def convert_img(self, el, text, convert_as_inline):
9+
def convert_img(self, el, text, parent_tags):
1010
"""Add two newlines after an image"""
11-
return super().convert_img(el, text, convert_as_inline) + '\n\n'
11+
return super().convert_img(el, text, parent_tags) + '\n\n'
1212

13-
def convert_custom_tag(self, el, text, convert_as_inline):
13+
def convert_custom_tag(self, el, text, parent_tags):
1414
"""Ensure conversion function is found for tags with special characters in name"""
1515
return "FUNCTION USED: %s" % text
1616

0 commit comments

Comments
 (0)