Merge branch 'develop' into chrispy/propagate-contexts-downward

chrispy-snps · chrispy-snps · commit 02a2bcac3629 · 2025-02-17T06:10:48.000-08:00
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -11,6 +11,10 @@
 newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
 html_heading_re = re.compile(r'h[1-6]')
 
+# extract (leading_nl, content, trailing_nl) from a string
+# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
+extract_newlines_re = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)
+
 
 # Heading styles
 ATX = 'atx'
@@ -168,12 +172,17 @@ def convert(self, html):
     def convert_soup(self, soup):
         return self.process_tag(soup, parent_tags=set())
 
+    def process_element(self, node, parent_tags=None):
+        if isinstance(node, NavigableString):
+            return self.process_text(node, parent_tags=parent_tags)
+        else:
+            return self.process_tag(node, parent_tags=parent_tags)
+
     def process_tag(self, node, parent_tags=None):
+        # For the top-level element, initialize the parent context with an empty set.
         if parent_tags is None:
             parent_tags = set()
 
-        text = ''
-
         # Collect child elements to process, ignoring whitespace-only text elements
         # adjacent to the inner/outer boundaries of block elements.
         should_remove_inside = should_remove_whitespace_inside(node)
@@ -198,13 +207,15 @@ def _can_ignore(el):
                     return True
                 else:
                     return False
+            elif el is None:
+                return True
             else:
                 raise ValueError('Unexpected element type: %s' % type(el))
 
-        children_to_convert = [child for child in node.children if not _can_ignore(child)]
+        children_to_convert = [el for el in node.children if not _can_ignore(el)]
 
-        # create a copy of this tag's parent context, the update it to include this tag
-        # to propagate down into the children
+        # Create a copy of this tag's parent context, then update it to include this tag
+        # to propagate down into the children.
         parent_tags_for_children = set(parent_tags)
         parent_tags_for_children.add(node.name)
 
@@ -219,18 +230,40 @@ def _can_ignore(el):
         if node.name in {'pre', 'code', 'kbd', 'samp'}:
             parent_tags_for_children.add('_noformat')
 
-        # Convert the children first
-        for el in children_to_convert:
-            if isinstance(el, NavigableString):
-                text += self.process_text(el, parent_tags=parent_tags_for_children)
-            else:
-                text_strip = text.rstrip('\n')
-                newlines_left = len(text) - len(text_strip)
-                next_text = self.process_tag(el, parent_tags=parent_tags_for_children)
-                next_text_strip = next_text.lstrip('\n')
-                newlines_right = len(next_text) - len(next_text_strip)
-                newlines = '\n' * max(newlines_left, newlines_right)
-                text = text_strip + newlines + next_text_strip
+        # Convert the child elements into a list of result strings.
+        child_strings = [
+            self.process_element(el, parent_tags=parent_tags_for_children)
+            for el in children_to_convert
+        ]
+
+        # Remove empty string values.
+        child_strings = [s for s in child_strings if s]
+
+        # Collapse newlines at child element boundaries, if needed.
+        if node.name == 'pre' or node.find_parent('pre'):
+            # Inside <pre> blocks, do not collapse newlines.
+            pass
+        else:
+            # Collapse newlines at child element boundaries.
+            updated_child_strings = ['']  # so the first lookback works
+            for child_string in child_strings:
+                # Separate the leading/trailing newlines from the content.
+                leading_nl, content, trailing_nl = extract_newlines_re.match(child_string).groups()
+
+                # If the last child had trailing newlines and this child has leading newlines,
+                # use the larger newline count, limited to 2.
+                if updated_child_strings[-1] and leading_nl:
+                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
+                    num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
+                    leading_nl = '\n' * num_newlines
+
+                # Add the results to the updated child string list.
+                updated_child_strings.extend([leading_nl, content, trailing_nl])
+
+            child_strings = updated_child_strings
+
+        # Join all child text strings into a single string.
+        text = ''.join(child_strings)
 
         # apply this tag's final conversion function
         convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
@@ -255,7 +288,11 @@ def convert__document_(self, el, text, parent_tags):
 
         return text
 
-    def process_text(self, el, parent_tags):
+    def process_text(self, el, parent_tags=None):
+        # For the top-level element, initialize the parent context with an empty set.
+        if parent_tags is None:
+            parent_tags = set()
+
         text = six.text_type(el) or ''
 
         # normalize whitespace if we're not inside a preformatted element