@@ -53,13 +53,13 @@ def abstract_inline_conversion(markup_fn):
5353 the text if it looks like an HTML tag. markup_fn is necessary to allow for
5454 references to self.strong_em_symbol etc.
5555 """
56- def implementation (self , el , text , convert_as_inline ):
56+ def implementation (self , el , text , parent_tags ):
5757 markup_prefix = markup_fn (self )
5858 if markup_prefix .startswith ('<' ) and markup_prefix .endswith ('>' ):
5959 markup_suffix = '</' + markup_prefix [1 :]
6060 else :
6161 markup_suffix = markup_prefix
62- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
62+ if '_noformat' in parent_tags :
6363 return text
6464 prefix , suffix , text = chomp (text )
6565 if not text :
@@ -166,18 +166,13 @@ def convert(self, html):
166166 return self .convert_soup (soup )
167167
168168 def convert_soup (self , soup ):
169- return self .process_tag (soup , convert_as_inline = False )
169+ return self .process_tag (soup , parent_tags = set () )
170170
171- def process_tag (self , node , convert_as_inline ):
172- text = ''
171+ def process_tag (self , node , parent_tags = None ):
172+ if parent_tags is None :
173+ parent_tags = set ()
173174
174- # For Markdown headings and table cells, convert children as inline
175- # (so that block element children do not produce newlines).
176- convert_children_as_inline = (
177- convert_as_inline # propagated from parent
178- or html_heading_re .match (node .name ) is not None # headings
179- or node .name in ['td' , 'th' ] # table cells
180- )
175+ text = ''
181176
182177 # Collect child elements to process, ignoring whitespace-only text elements
183178 # adjacent to the inner/outer boundaries of block elements.
@@ -208,14 +203,30 @@ def _can_ignore(el):
208203
209204 children_to_convert = [child for child in node .children if not _can_ignore (child )]
210205
206+ # create a copy of this tag's parent context, then update it to include this tag
207+ # to propagate down into the children
208+ parent_tags_for_children = set (parent_tags )
209+ parent_tags_for_children .add (node .name )
210+
211+ # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
212+ if (
213+ html_heading_re .match (node .name ) is not None # headings
214+ or node .name in {'td' , 'th' } # table cells
215+ ):
216+ parent_tags_for_children .add ('_inline' )
217+
218+ # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
219+ if node .name in {'pre' , 'code' , 'kbd' , 'samp' }:
220+ parent_tags_for_children .add ('_noformat' )
221+
211222 # Convert the children first
212223 for el in children_to_convert :
213224 if isinstance (el , NavigableString ):
214- text += self .process_text (el )
225+ text += self .process_text (el , parent_tags = parent_tags_for_children )
215226 else :
216227 text_strip = text .rstrip ('\n ' )
217228 newlines_left = len (text ) - len (text_strip )
218- next_text = self .process_tag (el , convert_children_as_inline )
229+ next_text = self .process_tag (el , parent_tags = parent_tags_for_children )
219230 next_text_strip = next_text .lstrip ('\n ' )
220231 newlines_right = len (next_text ) - len (next_text_strip )
221232 newlines = '\n ' * max (newlines_left , newlines_right )
@@ -225,11 +236,11 @@ def _can_ignore(el):
225236 convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node .name )
226237 convert_fn = getattr (self , convert_fn_name , None )
227238 if convert_fn and self .should_convert_tag (node .name ):
228- text = convert_fn (node , text , convert_as_inline )
239+ text = convert_fn (node , text , parent_tags = parent_tags )
229240
230241 return text
231242
232- def convert__document_ (self , el , text , convert_as_inline ):
243+ def convert__document_ (self , el , text , parent_tags ):
233244 """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
234245 if self .options ['strip_document' ] == LSTRIP :
235246 text = text .lstrip ('\n ' ) # remove leading separation newlines
@@ -244,19 +255,19 @@ def convert__document_(self, el, text, convert_as_inline):
244255
245256 return text
246257
247- def process_text (self , el ):
258+ def process_text (self , el , parent_tags ):
248259 text = six .text_type (el ) or ''
249260
250261 # normalize whitespace if we're not inside a preformatted element
251- if not el . find_parent ( 'pre' ) :
262+ if 'pre' not in parent_tags :
252263 if self .options ['wrap' ]:
253264 text = all_whitespace_re .sub (' ' , text )
254265 else :
255266 text = newline_whitespace_re .sub ('\n ' , text )
256267 text = whitespace_re .sub (' ' , text )
257268
258269 # escape special characters if we're not inside a preformatted or code element
259- if not el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
270+ if '_noformat' not in parent_tags :
260271 text = self .escape (text )
261272
262273 # remove leading whitespace at the start or just after a
@@ -279,8 +290,8 @@ def __getattr__(self, attr):
279290 if m :
280291 n = int (m .group (1 ))
281292
282- def convert_tag (el , text , convert_as_inline ):
283- return self ._convert_hn (n , el , text , convert_as_inline )
293+ def convert_tag (el , text , parent_tags ):
294+ return self ._convert_hn (n , el , text , parent_tags )
284295
285296 convert_tag .__name__ = 'convert_h%s' % n
286297 setattr (self , convert_tag .__name__ , convert_tag )
@@ -327,8 +338,8 @@ def underline(self, text, pad_char):
327338 text = (text or '' ).rstrip ()
328339 return '\n \n %s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
329340
330- def convert_a (self , el , text , convert_as_inline ):
331- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
341+ def convert_a (self , el , text , parent_tags ):
342+ if '_noformat' in parent_tags :
332343 return text
333344 prefix , suffix , text = chomp (text )
334345 if not text :
@@ -349,10 +360,10 @@ def convert_a(self, el, text, convert_as_inline):
349360
350361 convert_b = abstract_inline_conversion (lambda self : 2 * self .options ['strong_em_symbol' ])
351362
352- def convert_blockquote (self , el , text , convert_as_inline ):
363+ def convert_blockquote (self , el , text , parent_tags ):
353364 # handle some early-exit scenarios
354365 text = (text or '' ).strip ()
355- if convert_as_inline :
366+ if '_inline' in parent_tags :
356367 return ' ' + text + ' '
357368 if not text :
358369 return "\n "
@@ -365,25 +376,25 @@ def _indent_for_blockquote(match):
365376
366377 return '\n ' + text + '\n \n '
367378
368- def convert_br (self , el , text , convert_as_inline ):
369- if convert_as_inline :
379+ def convert_br (self , el , text , parent_tags ):
380+ if '_inline' in parent_tags :
370381 return ""
371382
372383 if self .options ['newline_style' ].lower () == BACKSLASH :
373384 return '\\ \n '
374385 else :
375386 return ' \n '
376387
377- def convert_code (self , el , text , convert_as_inline ):
378- if el . parent . name == 'pre' :
388+ def convert_code (self , el , text , parent_tags ):
389+ if 'pre' in parent_tags :
379390 return text
380391 converter = abstract_inline_conversion (lambda self : '`' )
381- return converter (self , el , text , convert_as_inline )
392+ return converter (self , el , text , parent_tags )
382393
383394 convert_del = abstract_inline_conversion (lambda self : '~~' )
384395
385- def convert_div (self , el , text , convert_as_inline ):
386- if convert_as_inline :
396+ def convert_div (self , el , text , parent_tags ):
397+ if '_inline' in parent_tags :
387398 return ' ' + text .strip () + ' '
388399 text = text .strip ()
389400 return '\n \n %s\n \n ' % text if text else ''
@@ -396,9 +407,9 @@ def convert_div(self, el, text, convert_as_inline):
396407
397408 convert_kbd = convert_code
398409
399- def convert_dd (self , el , text , convert_as_inline ):
410+ def convert_dd (self , el , text , parent_tags ):
400411 text = (text or '' ).strip ()
401- if convert_as_inline :
412+ if '_inline' in parent_tags :
402413 return ' ' + text + ' '
403414 if not text :
404415 return '\n '
@@ -414,11 +425,11 @@ def _indent_for_dd(match):
414425
415426 return '%s\n ' % text
416427
417- def convert_dt (self , el , text , convert_as_inline ):
428+ def convert_dt (self , el , text , parent_tags ):
418429 # remove newlines from term text
419430 text = (text or '' ).strip ()
420431 text = all_whitespace_re .sub (' ' , text )
421- if convert_as_inline :
432+ if '_inline' in parent_tags :
422433 return ' ' + text + ' '
423434 if not text :
424435 return '\n '
@@ -428,9 +439,9 @@ def convert_dt(self, el, text, convert_as_inline):
428439
429440 return '\n %s\n ' % text
430441
431- def _convert_hn (self , n , el , text , convert_as_inline ):
442+ def _convert_hn (self , n , el , text , parent_tags ):
432443 """ Method name prefixed with _ to prevent <hn> to call this """
433- if convert_as_inline :
444+ if '_inline' in parent_tags :
434445 return text
435446
436447 # prevent MemoryErrors in case of very large n
@@ -447,46 +458,40 @@ def _convert_hn(self, n, el, text, convert_as_inline):
447458 return '\n \n %s %s %s\n \n ' % (hashes , text , hashes )
448459 return '\n \n %s %s\n \n ' % (hashes , text )
449460
450- def convert_hr (self , el , text , convert_as_inline ):
461+ def convert_hr (self , el , text , parent_tags ):
451462 return '\n \n ---\n \n '
452463
453464 convert_i = convert_em
454465
455- def convert_img (self , el , text , convert_as_inline ):
466+ def convert_img (self , el , text , parent_tags ):
456467 alt = el .attrs .get ('alt' , None ) or ''
457468 src = el .attrs .get ('src' , None ) or ''
458469 title = el .attrs .get ('title' , None ) or ''
459470 title_part = ' "%s"' % title .replace ('"' , r'\"' ) if title else ''
460- if (convert_as_inline
471+ if ('_inline' in parent_tags
461472 and el .parent .name not in self .options ['keep_inline_images_in' ]):
462473 return alt
463474
464475 return '' % (alt , src , title_part )
465476
466- def convert_list (self , el , text , convert_as_inline ):
477+ def convert_list (self , el , text , parent_tags ):
467478
468479 # Converting a list to inline is undefined.
469- # Ignoring convert_to_inline for list.
469+ # The '_inline' parent pseudo-tag is deliberately ignored for lists.
470481
471- nested = False
472482 before_paragraph = False
473483 next_sibling = _next_block_content_sibling (el )
474484 if next_sibling and next_sibling .name not in ['ul' , 'ol' ]:
475485 before_paragraph = True
476- while el :
477- if el .name == 'li' :
478- nested = True
479- break
480- el = el .parent
481- if nested :
482- # remove trailing newline if nested
486+ if 'li' in parent_tags :
487+ # remove trailing newline if we're in a nested list
483488 return '\n ' + text .rstrip ()
484489 return '\n \n ' + text + ('\n ' if before_paragraph else '' )
485490
486491 convert_ul = convert_list
487492 convert_ol = convert_list
488493
489- def convert_li (self , el , text , convert_as_inline ):
494+ def convert_li (self , el , text , parent_tags ):
490495 # handle some early-exit scenarios
491496 text = (text or '' ).strip ()
492497 if not text :
@@ -523,8 +528,8 @@ def _indent_for_li(match):
523528
524529 return '%s\n ' % text
525530
526- def convert_p (self , el , text , convert_as_inline ):
527- if convert_as_inline :
531+ def convert_p (self , el , text , parent_tags ):
532+ if '_inline' in parent_tags :
528533 return ' ' + text .strip () + ' '
529534 text = text .strip ()
530535 if self .options ['wrap' ]:
@@ -546,7 +551,7 @@ def convert_p(self, el, text, convert_as_inline):
546551 text = '\n ' .join (new_lines )
547552 return '\n \n %s\n \n ' % text if text else ''
548553
549- def convert_pre (self , el , text , convert_as_inline ):
554+ def convert_pre (self , el , text , parent_tags ):
550555 if not text :
551556 return ''
552557 code_language = self .options ['code_language' ]
@@ -556,10 +561,10 @@ def convert_pre(self, el, text, convert_as_inline):
556561
557562 return '\n \n ```%s\n %s\n ```\n \n ' % (code_language , text )
558563
559- def convert_script (self , el , text , convert_as_inline ):
564+ def convert_script (self , el , text , parent_tags ):
560565 return ''
561566
562- def convert_style (self , el , text , convert_as_inline ):
567+ def convert_style (self , el , text , parent_tags ):
563568 return ''
564569
565570 convert_s = convert_del
@@ -572,28 +577,28 @@ def convert_style(self, el, text, convert_as_inline):
572577
573578 convert_sup = abstract_inline_conversion (lambda self : self .options ['sup_symbol' ])
574579
575- def convert_table (self , el , text , convert_as_inline ):
580+ def convert_table (self , el , text , parent_tags ):
576581 return '\n \n ' + text .strip () + '\n \n '
577582
578- def convert_caption (self , el , text , convert_as_inline ):
583+ def convert_caption (self , el , text , parent_tags ):
579584 return text .strip () + '\n \n '
580585
581- def convert_figcaption (self , el , text , convert_as_inline ):
586+ def convert_figcaption (self , el , text , parent_tags ):
582587 return '\n \n ' + text .strip () + '\n \n '
583588
584- def convert_td (self , el , text , convert_as_inline ):
589+ def convert_td (self , el , text , parent_tags ):
585590 colspan = 1
586591 if 'colspan' in el .attrs and el ['colspan' ].isdigit ():
587592 colspan = int (el ['colspan' ])
588593 return ' ' + text .strip ().replace ("\n " , " " ) + ' |' * colspan
589594
590- def convert_th (self , el , text , convert_as_inline ):
595+ def convert_th (self , el , text , parent_tags ):
591596 colspan = 1
592597 if 'colspan' in el .attrs and el ['colspan' ].isdigit ():
593598 colspan = int (el ['colspan' ])
594599 return ' ' + text .strip ().replace ("\n " , " " ) + ' |' * colspan
595600
596- def convert_tr (self , el , text , convert_as_inline ):
601+ def convert_tr (self , el , text , parent_tags ):
597602 cells = el .find_all (['td' , 'th' ])
598603 is_first_row = el .find_previous_sibling () is None
599604 is_headrow = (
0 commit comments