newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
html_heading_re = re.compile(r'h[1-6]')

# Split a string into (leading_newlines, content, trailing_newlines).
# Functionally equivalent to r'^(\n*)(.*?)(\n*)$', but written with greedy
# quantifiers because greedy matching is faster than reluctant matching here.
# re.DOTALL lets '.' span newlines that are embedded inside the content.
extract_newlines_re = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)


# Heading styles
ATX = 'atx'
@@ -168,12 +172,17 @@ def convert(self, html):
def convert_soup(self, soup):
    """Convert an already-parsed tree by processing it as the top-level tag.

    The root is handed to process_tag with an empty parent-tag set, since the
    top-level node has no enclosing tags.
    """
    return self.process_tag(soup, parent_tags=set())
170174
def process_element(self, node, parent_tags=None):
    """Convert a single parse-tree node, dispatching on its type.

    NavigableString nodes are passed to process_text; every other node is
    passed to process_tag. parent_tags is forwarded unchanged to whichever
    handler is chosen.
    """
    if isinstance(node, NavigableString):
        return self.process_text(node, parent_tags=parent_tags)
    return self.process_tag(node, parent_tags=parent_tags)
180+
171181 def process_tag (self , node , parent_tags = None ):
182+ # For the top-level element, initialize the parent context with an empty set.
172183 if parent_tags is None :
173184 parent_tags = set ()
174185
175- text = ''
176-
177186 # Collect child elements to process, ignoring whitespace-only text elements
178187 # adjacent to the inner/outer boundaries of block elements.
179188 should_remove_inside = should_remove_whitespace_inside (node )
@@ -198,13 +207,15 @@ def _can_ignore(el):
198207 return True
199208 else :
200209 return False
210+ elif el is None :
211+ return True
201212 else :
202213 raise ValueError ('Unexpected element type: %s' % type (el ))
203214
204- children_to_convert = [child for child in node .children if not _can_ignore (child )]
215+ children_to_convert = [el for el in node .children if not _can_ignore (el )]
205216
206- # create a copy of this tag's parent context, the update it to include this tag
207- # to propagate down into the children
217+ # Create a copy of this tag's parent context, then update it to include this tag
218+ # to propagate down into the children.
208219 parent_tags_for_children = set (parent_tags )
209220 parent_tags_for_children .add (node .name )
210221
@@ -219,18 +230,40 @@ def _can_ignore(el):
219230 if node .name in {'pre' , 'code' , 'kbd' , 'samp' }:
220231 parent_tags_for_children .add ('_noformat' )
221232
222- # Convert the children first
223- for el in children_to_convert :
224- if isinstance (el , NavigableString ):
225- text += self .process_text (el , parent_tags = parent_tags_for_children )
226- else :
227- text_strip = text .rstrip ('\n ' )
228- newlines_left = len (text ) - len (text_strip )
229- next_text = self .process_tag (el , parent_tags = parent_tags_for_children )
230- next_text_strip = next_text .lstrip ('\n ' )
231- newlines_right = len (next_text ) - len (next_text_strip )
232- newlines = '\n ' * max (newlines_left , newlines_right )
233- text = text_strip + newlines + next_text_strip
233+ # Convert the children elements into a list of result strings.
234+ child_strings = [
235+ self .process_element (el , parent_tags = parent_tags_for_children )
236+ for el in children_to_convert
237+ ]
238+
239+ # Remove empty string values.
240+ child_strings = [s for s in child_strings if s ]
241+
242+ # Collapse newlines at child element boundaries, if needed.
243+ if node .name == 'pre' or node .find_parent ('pre' ):
244+ # Inside <pre> blocks, do not collapse newlines.
245+ pass
246+ else :
247+ # Collapse newlines at child element boundaries.
248+ updated_child_strings = ['' ] # so the first lookback works
249+ for child_string in child_strings :
250+ # Separate the leading/trailing newlines from the content.
251+ leading_nl , content , trailing_nl = extract_newlines_re .match (child_string ).groups ()
252+
253+ # If the last child had trailing newlines and this child has leading newlines,
254+ # use the larger newline count, limited to 2.
255+ if updated_child_strings [- 1 ] and leading_nl :
256+ prev_trailing_nl = updated_child_strings .pop () # will be replaced by the collapsed value
257+ num_newlines = min (2 , max (len (prev_trailing_nl ), len (leading_nl )))
258+ leading_nl = '\n ' * num_newlines
259+
260+ # Add the results to the updated child string list.
261+ updated_child_strings .extend ([leading_nl , content , trailing_nl ])
262+
263+ child_strings = updated_child_strings
264+
265+ # Join all child text strings into a single string.
266+ text = '' .join (child_strings )
234267
235268 # apply this tag's final conversion function
236269 convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node .name )
@@ -255,7 +288,11 @@ def convert__document_(self, el, text, parent_tags):
255288
256289 return text
257290
258- def process_text (self , el , parent_tags ):
291+ def process_text (self , el , parent_tags = None ):
292+ # For the top-level element, initialize the parent context with an empty set.
293+ if parent_tags is None :
294+ parent_tags = set ()
295+
259296 text = six .text_type (el ) or ''
260297
261298 # normalize whitespace if we're not inside a preformatted element
0 commit comments