99import uuid
1010from collections import OrderedDict
1111from concurrent .futures import ProcessPoolExecutor , as_completed
12+ import html
13+ import re
1214from pathlib import Path
1315from typing import Dict , Union , BinaryIO , Iterator
1416
@@ -41,8 +43,15 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
4143 If stream=False returns the full document dict (same shape as original function).
4244 """
4345 # Load with BeautifulSoup but avoid building huge structures when streaming
44- with open (tei_file , 'r' ) as f :
45- content = f .read ()
46+ if hasattr (tei_file , 'read' ):
47+ # File-like object (BinaryIO/StringIO)
48+ content = tei_file .read ()
49+ if isinstance (content , bytes ):
50+ content = content .decode ('utf-8' )
51+ else :
52+ # Path-like object
53+ with open (tei_file , 'r' , encoding = 'utf-8' ) as f :
54+ content = f .read ()
4655 soup = BeautifulSoup (content , 'xml' )
4756
4857 if soup .TEI is None :
@@ -222,7 +231,6 @@ def _extract_comprehensive_reference_data(self, bibl_struct: Tag, index: int) ->
222231 Extract detailed bibliographic information from TEI biblStruct elements.
223232 Implements comprehensive parsing for all standard TEI bibliographic components.
224233 """
225- import re
226234
227235 citation_data = OrderedDict ()
228236 citation_data ['id' ] = f"b{ index } "
@@ -430,7 +438,6 @@ def _process_pointer_element(self, pointer_element: Tag, link_references: list):
430438
431439 def _process_imprint_details (self , imprint_element : Tag , publication_metadata : Dict ):
432440 """Extract and process imprint information including publisher, dates, and page ranges."""
433- import re
434441
435442 # Extract publisher information
436443 publisher_elements = imprint_element .find_all ("publisher" )
@@ -557,7 +564,6 @@ def _extract_person_data(self, person_element: Tag) -> Dict:
557564 Extract person data (author/editor) from TEI persName or author elements.
558565 Handles various name formats and affiliations.
559566 """
560- import re
561567
562568 person_data = {}
563569
@@ -628,11 +634,9 @@ def _clean_text(self, text: str) -> str:
628634 text = text .decode ('utf-8' , errors = 'ignore' )
629635
630636 # Normalize whitespace and strip
631- import re
632637 text = re .sub (r'\s+' , ' ' , text .strip ())
633638
634639 # Decode any XML/HTML character references into their literal characters
635- import html
636640 text = html .unescape (text )
637641
638642 return text
@@ -665,14 +669,33 @@ def _iter_passages_from_soup_for_text(self, text_node: Tag, passage_level: str)
665669
666670 div_type = div .get ("type" )
667671
672+ # Check if this is a header-only div (no content, no nested divs)
673+ # If so, capture its header as context for subsequent divs
674+ head = div .find ("head" )
675+ direct_p_nodes = [c for c in div .children if hasattr (c , 'name' ) and c .name == "p" ]
676+ direct_formula_nodes = [c for c in div .children if hasattr (c , 'name' ) and c .name == "formula" ]
677+ nested_divs = [c for c in div .children if hasattr (c , 'name' ) and (c .name == "div" or (c .name and c .name .endswith (":div" )))]
678+ has_direct_content = len (direct_p_nodes ) > 0 or len (direct_formula_nodes ) > 0
679+
680+ if head and not has_direct_content and len (nested_divs ) == 0 :
681+ # This is a header-only div with no nested content
682+ # Capture the header for the next div
683+ head_paragraph = self ._clean_text (head .get_text ())
684+ continue # Skip to next div, the header will be used by subsequent sibling
685+
668686 # Process this div and potentially nested divs
669687 for passage in self ._process_div_with_nested_content (div , passage_level , head_paragraph ):
670688 yield passage
689+
690+ # Reset head_paragraph after it's been used by a content-bearing div
691+ head_paragraph = None
692+
671693
672694 def _process_div_with_nested_content (self , div : Tag , passage_level : str , head_paragraph : str = None ) -> Iterator [Dict [str , Union [str , Dict [str , str ]]]]:
673695 """
674696 Process a div and its nested content, handling various back section types.
675697 Supports nested divs for complex back sections like annex with multiple subsections.
698+ Also handles formula elements that are direct children of divs.
676699 """
677700 head = div .find ("head" )
678701 p_nodes = div .find_all ("p" )
@@ -687,10 +710,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
687710 if child .name == "div" or child .name .endswith (":div" ):
688711 nested_divs .append (child )
689712
690- # Count only direct child paragraphs, not those in nested divs
713+ # Count only direct child paragraphs and formulas, not those in nested divs
691714 direct_p_nodes = [child for child in div .children if hasattr (child , 'name' ) and child .name == "p" ]
715+ direct_formula_nodes = [child for child in div .children if hasattr (child , 'name' ) and child .name == "formula" ]
716+ has_direct_content = len (direct_p_nodes ) > 0 or len (direct_formula_nodes ) > 0
692717
693- if len (nested_divs ) > 0 and len ( direct_p_nodes ) == 0 :
718+ if len (nested_divs ) > 0 and not has_direct_content :
694719 # This is a container div - process each nested div independently
695720 for nested_div in nested_divs :
696721 # Skip references divs
@@ -703,11 +728,11 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
703728
704729 # Determine the section header and content type for divs with content
705730 if head :
706- if len ( direct_p_nodes ) == 0 :
707- # This div has only a head, no paragraphs (standalone head)
731+ if not has_direct_content :
732+ # This div has only a head, no paragraphs or formulas (standalone head)
708733 current_head_paragraph = self ._clean_text (head .get_text ())
709734 else :
710- # This div has both head and paragraphs - head is the section header
735+ # This div has both head and content - head is the section header
711736 head_section = self ._clean_text (head .get_text ())
712737 else :
713738 # If no head element, try to use the type attribute as head_section
@@ -722,35 +747,68 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
722747 head_section = "Author Contributions"
723748 elif div_type == "availability" :
724749 # Only set as default if this div has its own content
725- if len ( direct_p_nodes ) > 0 :
750+ if has_direct_content :
726751 head_section = "Data Availability"
727752 elif div_type == "annex" :
728753 head_section = "Annex"
729754 else :
730755 # Generic handling - capitalize and format
731756 head_section = div_type .replace ("_" , " " ).title ()
732757
733- # Process paragraphs in this div
734- if len (direct_p_nodes ) > 0 :
735- for id_p , p in enumerate (direct_p_nodes ):
758+ # Process direct children (paragraphs and formulas) in document order
759+ for child in div .children :
760+ if not hasattr (child , 'name' ) or not child .name :
761+ continue
762+
763+ if child .name == "p" :
736764 paragraph_id = get_random_id (prefix = "p_" )
737765
738766 if passage_level == "sentence" :
739- for id_s , sentence in enumerate (p .find_all ("s" )):
767+ for id_s , sentence in enumerate (child .find_all ("s" )):
740768 struct = get_formatted_passage (current_head_paragraph or head_paragraph , head_section , paragraph_id , sentence )
741769 if self .validate_refs :
742770 for ref in struct ['refs' ]:
743- assert "Wrong offsets" , ref ['offset_start' ] < ref ['offset_end' ]
744- assert "Cannot apply offsets" , struct ['text' ][ref ['offset_start' ]:ref ['offset_end' ]] == ref ['text' ]
771+ assert ref ['offset_start' ] < ref ['offset_end' ], "Wrong offsets"
772+ assert struct ['text' ][ref ['offset_start' ]:ref ['offset_end' ]] == ref ['text' ], "Cannot apply offsets"
745773 yield struct
746774 else :
747- struct = get_formatted_passage (current_head_paragraph or head_paragraph , head_section , paragraph_id , p )
775+ struct = get_formatted_passage (current_head_paragraph or head_paragraph , head_section , paragraph_id , child )
748776 if self .validate_refs :
749777 for ref in struct ['refs' ]:
750- assert "Wrong offsets" , ref ['offset_start' ] < ref ['offset_end' ]
751- assert "Cannot apply offsets" , struct ['text' ][ref ['offset_start' ]:ref ['offset_end' ]] == ref ['text' ]
778+ assert ref ['offset_start' ] < ref ['offset_end' ], "Wrong offsets"
779+ assert struct ['text' ][ref ['offset_start' ]:ref ['offset_end' ]] == ref ['text' ], "Cannot apply offsets"
752780 yield struct
753781
782+ elif child .name == "formula" :
783+ # Process formula elements as passages
784+ formula_id = get_random_id (prefix = "f_" )
785+ formula_text = self ._clean_text (child .get_text ())
786+
787+ if formula_text :
788+ # Create a passage structure for the formula
789+ formula_passage = {
790+ "id" : formula_id ,
791+ "text" : formula_text ,
792+ "coords" : [
793+ box_to_dict (coord .split ("," ))
794+ for coord in child .get ("coords" , "" ).split (";" )
795+ ] if child .has_attr ("coords" ) else [],
796+ "refs" : [],
797+ "type" : "formula"
798+ }
799+
800+ if current_head_paragraph or head_paragraph :
801+ formula_passage ["head_paragraph" ] = current_head_paragraph or head_paragraph
802+ if head_section :
803+ formula_passage ["head_section" ] = head_section
804+
805+ # Extract formula label if present
806+ label = child .find ("label" )
807+ if label :
808+ formula_passage ["label" ] = self ._clean_text (label .get_text ())
809+
810+ yield formula_passage
811+
754812 # Update head_paragraph for potential next div
755813 if current_head_paragraph is not None :
756814 head_paragraph = current_head_paragraph
0 commit comments