grobidOrg
diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
Lines changed: 5 additions & 1 deletion
diff --git a/grobid_client/format/TEI2LossyJSON.py b/grobid_client/format/TEI2LossyJSON.py
Lines changed: 80 additions & 22 deletions
diff --git a/grobid_client/format/TEI2Markdown.py b/grobid_client/format/TEI2Markdown.py
Lines changed: 82 additions & 25 deletions
@@ -22,7 +22,11 @@ jobs:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
       - name: Cleanup more disk space
-        run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc 
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
 
@@ -9,6 +9,8 @@
 import uuid
 from collections import OrderedDict
 from concurrent.futures import ProcessPoolExecutor, as_completed
+import html
+import re
 from pathlib import Path
 from typing import Dict, Union, BinaryIO, Iterator
 
@@ -41,8 +43,15 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
         If stream=False returns the full document dict (same shape as original function).
         """
         # Load with BeautifulSoup but avoid building huge structures when streaming
-        with open(tei_file, 'r') as f:
-            content = f.read()
+        if hasattr(tei_file, 'read'):
+            # File-like object (BinaryIO/StringIO)
+            content = tei_file.read()
+            if isinstance(content, bytes):
+                content = content.decode('utf-8')
+        else:
+            # Path-like object
+            with open(tei_file, 'r', encoding='utf-8') as f:
+                content = f.read()
         soup = BeautifulSoup(content, 'xml')
 
         if soup.TEI is None:
@@ -222,7 +231,6 @@ def _extract_comprehensive_reference_data(self, bibl_struct: Tag, index: int) ->
         Extract detailed bibliographic information from TEI biblStruct elements.
         Implements comprehensive parsing for all standard TEI bibliographic components.
         """
-        import re
 
         citation_data = OrderedDict()
         citation_data['id'] = f"b{index}"
@@ -430,7 +438,6 @@ def _process_pointer_element(self, pointer_element: Tag, link_references: list):
 
     def _process_imprint_details(self, imprint_element: Tag, publication_metadata: Dict):
         """Extract and process imprint information including publisher, dates, and page ranges."""
-        import re
 
         # Extract publisher information
         publisher_elements = imprint_element.find_all("publisher")
@@ -557,7 +564,6 @@ def _extract_person_data(self, person_element: Tag) -> Dict:
         Extract person data (author/editor) from TEI persName or author elements.
         Handles various name formats and affiliations.
         """
-        import re
 
         person_data = {}
 
@@ -628,11 +634,9 @@ def _clean_text(self, text: str) -> str:
                     text = text.decode('utf-8', errors='ignore')
 
         # Normalize whitespace and strip
-        import re
         text = re.sub(r'\s+', ' ', text.strip())
 
         # Remove any potential XML/HTML entities
-        import html
         text = html.unescape(text)
 
         return text
@@ -665,14 +669,33 @@ def _iter_passages_from_soup_for_text(self, text_node: Tag, passage_level: str)
 
                 div_type = div.get("type")
 
+                # Check if this is a header-only div (no content, no nested divs)
+                # If so, capture its header as context for subsequent divs
+                head = div.find("head")
+                direct_p_nodes = [c for c in div.children if hasattr(c, 'name') and c.name == "p"]
+                direct_formula_nodes = [c for c in div.children if hasattr(c, 'name') and c.name == "formula"]
+                nested_divs = [c for c in div.children if hasattr(c, 'name') and (c.name == "div" or (c.name and c.name.endswith(":div")))]
+                has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0
+                
+                if head and not has_direct_content and len(nested_divs) == 0:
+                    # This is a header-only div with no nested content
+                    # Capture the header for the next div
+                    head_paragraph = self._clean_text(head.get_text())
+                    continue  # Skip to next div, the header will be used by subsequent sibling
+
                 # Process this div and potentially nested divs
                 for passage in self._process_div_with_nested_content(div, passage_level, head_paragraph):
                     yield passage
+                
+                # Reset head_paragraph after it's been used by a content-bearing div
+                head_paragraph = None
+
 
     def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_paragraph: str = None) -> Iterator[Dict[str, Union[str, Dict[str, str]]]]:
         """
         Process a div and its nested content, handling various back section types.
         Supports nested divs for complex back sections like annex with multiple subsections.
+        Also handles formula elements that are direct children of divs.
         """
         head = div.find("head")
         p_nodes = div.find_all("p")
@@ -687,10 +710,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
                 if child.name == "div" or child.name.endswith(":div"):
                     nested_divs.append(child)
 
-        # Count only direct child paragraphs, not those in nested divs
+        # Count only direct child paragraphs and formulas, not those in nested divs
         direct_p_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "p"]
+        direct_formula_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "formula"]
+        has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0
 
-        if len(nested_divs) > 0 and len(direct_p_nodes) == 0:
+        if len(nested_divs) > 0 and not has_direct_content:
             # This is a container div - process each nested div independently
             for nested_div in nested_divs:
                 # Skip references divs
@@ -703,11 +728,11 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
 
         # Determine the section header and content type for divs with content
         if head:
-            if len(direct_p_nodes) == 0:
-                # This div has only a head, no paragraphs (standalone head)
+            if not has_direct_content:
+                # This div has only a head, no paragraphs or formulas (standalone head)
                 current_head_paragraph = self._clean_text(head.get_text())
             else:
-                # This div has both head and paragraphs - head is the section header
+                # This div has both head and content - head is the section header
                 head_section = self._clean_text(head.get_text())
         else:
             # If no head element, try to use the type attribute as head_section
@@ -722,35 +747,68 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
                     head_section = "Author Contributions"
                 elif div_type == "availability":
                     # Only set as default if this div has its own content
-                    if len(direct_p_nodes) > 0:
+                    if has_direct_content:
                         head_section = "Data Availability"
                 elif div_type == "annex":
                     head_section = "Annex"
                 else:
                     # Generic handling - capitalize and format
                     head_section = div_type.replace("_", " ").title()
 
-        # Process paragraphs in this div
-        if len(direct_p_nodes) > 0:
-            for id_p, p in enumerate(direct_p_nodes):
+        # Process direct children (paragraphs and formulas) in document order
+        for child in div.children:
+            if not hasattr(child, 'name') or not child.name:
+                continue
+
+            if child.name == "p":
                 paragraph_id = get_random_id(prefix="p_")
 
                 if passage_level == "sentence":
-                    for id_s, sentence in enumerate(p.find_all("s")):
+                    for id_s, sentence in enumerate(child.find_all("s")):
                         struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
                         if self.validate_refs:
                             for ref in struct['refs']:
-                                assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
-                                assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
+                                assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
+                                assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
                         yield struct
                 else:
-                    struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
+                    struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child)
                     if self.validate_refs:
                         for ref in struct['refs']:
-                            assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
-                            assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
+                            assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
+                            assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
                     yield struct
 
+            elif child.name == "formula":
+                # Process formula elements as passages
+                formula_id = get_random_id(prefix="f_")
+                formula_text = self._clean_text(child.get_text())
+                
+                if formula_text:
+                    # Create a passage structure for the formula
+                    formula_passage = {
+                        "id": formula_id,
+                        "text": formula_text,
+                        "coords": [
+                            box_to_dict(coord.split(","))
+                            for coord in child.get("coords", "").split(";")
+                        ] if child.has_attr("coords") else [],
+                        "refs": [],
+                        "type": "formula"
+                    }
+                    
+                    if current_head_paragraph or head_paragraph:
+                        formula_passage["head_paragraph"] = current_head_paragraph or head_paragraph
+                    if head_section:
+                        formula_passage["head_section"] = head_section
+                    
+                    # Extract formula label if present
+                    label = child.find("label")
+                    if label:
+                        formula_passage["label"] = self._clean_text(label.get_text())
+                    
+                    yield formula_passage
+
         # Update head_paragraph for potential next div
         if current_head_paragraph is not None:
             head_paragraph = current_head_paragraph
 
@@ -11,8 +11,7 @@
 - Annex
 - References
 """
-import os
-import uuid
+import re
 from pathlib import Path
 from typing import List, Dict, Union, Optional, BinaryIO
 from bs4 import BeautifulSoup, NavigableString, Tag
@@ -44,9 +43,12 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
         try:
             # Load with BeautifulSoup
             if isinstance(tei_file, (str, Path)):
-                content = open(tei_file, 'r', encoding='utf-8').read()
+                with open(tei_file, 'r', encoding='utf-8') as f:
+                    content = f.read()
             else:
                 content = tei_file.read()
+                if isinstance(content, bytes):
+                    content = content.decode('utf-8')
 
             soup = BeautifulSoup(content, 'xml')
 
@@ -77,7 +79,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
             # Extract publication date
             pub_date = self._extract_publication_date(soup)
             if pub_date:
-                markdown_sections.append(f"Publishd on {pub_date}\n\n")
+                markdown_sections.append(f"Published on {pub_date}\n\n")
 
             # Extract abstract
             abstract = self._extract_abstract(soup)
@@ -211,14 +213,24 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str:
             head = div.find("head")
             if head:
                 section_title = head.get_text().strip()
-                fulltext_sections.append(f"### {section_title}\n")
-
-            # Get paragraphs
-            paragraphs = div.find_all("p")
-            for p in paragraphs:
-                paragraph_text = self._process_paragraph(p)
-                if paragraph_text.strip():
-                    fulltext_sections.append(f"{paragraph_text}\n\n")
+                if section_title:
+                    fulltext_sections.append(f"### {section_title}\n")
+
+            # Process direct children of the div in document order
+            # This captures paragraphs, formulas, and other elements as they appear
+            for child in div.children:
+                if not hasattr(child, 'name') or not child.name:
+                    continue
+                    
+                if child.name == "p":
+                    paragraph_text = self._process_paragraph(child)
+                    if paragraph_text.strip():
+                        fulltext_sections.append(f"{paragraph_text}\n\n")
+                elif child.name == "formula":
+                    # Handle formula elements - extract text and optional label
+                    formula_text = self._process_formula(child)
+                    if formula_text.strip():
+                        fulltext_sections.append(f"{formula_text}\n\n")
 
         return "".join(fulltext_sections)
 
@@ -270,16 +282,23 @@ def _process_div_and_nested_divs(self, div: Tag, annex_sections: list) -> None:
             if header_text not in annex_sections:
                 annex_sections.append(header_text)
 
-        # Process paragraphs that are direct children of this div (not in nested divs)
+        # Process direct children of this div in document order
+        # This captures paragraphs, formulas, and other elements as they appear
         for child in div.children:
-            if hasattr(child, 'name') and child.name == "p":
+            if not hasattr(child, 'name') or not child.name:
+                continue
+                
+            if child.name == "p":
                 paragraph_text = self._process_paragraph(child)
                 if paragraph_text.strip():
                     annex_sections.append(f"{paragraph_text}\n\n")
-
-        # Process nested div elements
-        for child in div.children:
-            if hasattr(child, 'name') and child.name == "div":
+            elif child.name == "formula":
+                # Handle formula elements
+                formula_text = self._process_formula(child)
+                if formula_text.strip():
+                    annex_sections.append(f"{formula_text}\n\n")
+            elif child.name == "div":
+                # Process nested div elements
                 self._process_div_and_nested_divs(child, annex_sections)
 
     def _extract_references(self, soup: BeautifulSoup) -> str:
@@ -336,6 +355,34 @@ def _process_paragraph(self, p_element: Tag) -> str:
 
         return "".join(text_parts).strip()
 
+    def _process_formula(self, formula_element: Tag) -> str:
+        """Render a TEI ``formula`` element as a markdown snippet.
+
+        The text of every direct child except ``label`` is concatenated and
+        wrapped in ``*...*``; a non-empty label is appended after a space.
+        A formula with no text left after stripping yields an empty string.
+        """
+        equation_label = ""
+        pieces = []
+
+        for node in formula_element.children:
+            if getattr(node, 'name', None) == "label":
+                # When several labels exist, the last one wins
+                equation_label = node.get_text().strip()
+                continue
+            # Bare strings are copied verbatim; other tags give their text
+            is_string = isinstance(node, NavigableString)
+            pieces.append(str(node) if is_string else node.get_text())
+
+        body = "".join(pieces).strip()
+        if not body:
+            # A label on its own is never emitted
+            return ""
+
+        if equation_label:
+            return f"*{body}* {equation_label}"
+        return f"*{body}*"
+
     def _table_to_markdown(self, table_element: Tag) -> str:
         """Convert a table element to simple markdown."""
         markdown_lines = []
@@ -511,17 +558,25 @@ def _process_imprint_section(self, imprint: Tag, bib_data: dict) -> None:
             unit = bibl_scope.get("unit", "").lower()
             text = bibl_scope.get_text().strip()
 
-            if unit == "vol" and text:
+            if unit in ["vol", "volume"] and text:
                 bib_data['volume'] = text
             elif unit == "issue" and text:
                 bib_data['issue'] = text
             elif unit == "page" and text:
                 # Handle page ranges
-                if "from" in bibl_scope.attrs:
-                    bib_data['pages'] = f"{text}-"
-                elif "to" in bibl_scope.attrs and bib_data.get('pages'):
-                    bib_data['pages'] += text
-                else:
+                from_val = bibl_scope.get("from")
+                to_val = bibl_scope.get("to")
+                if from_val and to_val:
+                    # Both from and to in same element
+                    bib_data['pages'] = f"{from_val}-{to_val}"
+                elif from_val:
+                    # Only from specified, may get combined with another element
+                    bib_data['pages'] = f"{from_val}-"
+                elif to_val and bib_data.get('pages'):
+                    # Only to specified, append to existing from
+                    bib_data['pages'] = bib_data['pages'] + to_val
+                elif text and not bib_data.get('pages'):
+                    # Plain text, no from/to attributes
                     bib_data['pages'] = text
 
     def _extract_author_info(self, author: Tag) -> dict:
@@ -629,6 +684,9 @@ def _build_publication_details(self, ref_data: dict) -> str:
         """Build publication details string from extracted data."""
         details = []
 
+        if ref_data.get('year'):
+            details.append(f"({ref_data['year']})")
+
         if ref_data.get('volume'):
             details.append(ref_data['volume'])
 
@@ -684,7 +742,6 @@ def _extract_raw_reference(self, bibl_struct: Tag) -> str:
         raw_text = bibl_struct.get_text().strip()
 
         # Remove reference number if present
-        import re
         raw_text = re.sub(r'^\[\d+\]\s*', '', raw_text)
 
         # Clean up excessive whitespace