Skip to content

Commit cb18d20

Browse files
committed
Merge branch 'master' into feature/cli-converters
2 parents 3a62975 + 29f3b54 commit cb18d20

8 files changed

Lines changed: 1353 additions & 54 deletions

File tree

.github/workflows/ci-build.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ jobs:
2222
python-version: ${{ matrix.python-version }}
2323
cache: 'pip'
2424
- name: Cleanup more disk space
25-
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
25+
run: |
26+
sudo rm -rf /usr/share/dotnet
27+
sudo rm -rf /opt/ghc
28+
sudo rm -rf "/usr/local/share/boost"
29+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
2630
- name: Install dependencies
2731
run: |
2832
python -m pip install --upgrade pip

grobid_client/format/TEI2LossyJSON.py

Lines changed: 80 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import uuid
1010
from collections import OrderedDict
1111
from concurrent.futures import ProcessPoolExecutor, as_completed
12+
import html
13+
import re
1214
from pathlib import Path
1315
from typing import Dict, Union, BinaryIO, Iterator
1416

@@ -41,8 +43,15 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO], stream: bool = False
4143
If stream=False returns the full document dict (same shape as original function).
4244
"""
4345
# Load with BeautifulSoup but avoid building huge structures when streaming
44-
with open(tei_file, 'r') as f:
45-
content = f.read()
46+
if hasattr(tei_file, 'read'):
47+
# File-like object (BinaryIO/StringIO)
48+
content = tei_file.read()
49+
if isinstance(content, bytes):
50+
content = content.decode('utf-8')
51+
else:
52+
# Path-like object
53+
with open(tei_file, 'r', encoding='utf-8') as f:
54+
content = f.read()
4655
soup = BeautifulSoup(content, 'xml')
4756

4857
if soup.TEI is None:
@@ -222,7 +231,6 @@ def _extract_comprehensive_reference_data(self, bibl_struct: Tag, index: int) ->
222231
Extract detailed bibliographic information from TEI biblStruct elements.
223232
Implements comprehensive parsing for all standard TEI bibliographic components.
224233
"""
225-
import re
226234

227235
citation_data = OrderedDict()
228236
citation_data['id'] = f"b{index}"
@@ -430,7 +438,6 @@ def _process_pointer_element(self, pointer_element: Tag, link_references: list):
430438

431439
def _process_imprint_details(self, imprint_element: Tag, publication_metadata: Dict):
432440
"""Extract and process imprint information including publisher, dates, and page ranges."""
433-
import re
434441

435442
# Extract publisher information
436443
publisher_elements = imprint_element.find_all("publisher")
@@ -557,7 +564,6 @@ def _extract_person_data(self, person_element: Tag) -> Dict:
557564
Extract person data (author/editor) from TEI persName or author elements.
558565
Handles various name formats and affiliations.
559566
"""
560-
import re
561567

562568
person_data = {}
563569

@@ -628,11 +634,9 @@ def _clean_text(self, text: str) -> str:
628634
text = text.decode('utf-8', errors='ignore')
629635

630636
# Normalize whitespace and strip
631-
import re
632637
text = re.sub(r'\s+', ' ', text.strip())
633638

634639
# Remove any potential XML/HTML entities
635-
import html
636640
text = html.unescape(text)
637641

638642
return text
@@ -665,14 +669,33 @@ def _iter_passages_from_soup_for_text(self, text_node: Tag, passage_level: str)
665669

666670
div_type = div.get("type")
667671

672+
# Check if this is a header-only div (no content, no nested divs)
673+
# If so, capture its header as context for subsequent divs
674+
head = div.find("head")
675+
direct_p_nodes = [c for c in div.children if hasattr(c, 'name') and c.name == "p"]
676+
direct_formula_nodes = [c for c in div.children if hasattr(c, 'name') and c.name == "formula"]
677+
nested_divs = [c for c in div.children if hasattr(c, 'name') and (c.name == "div" or (c.name and c.name.endswith(":div")))]
678+
has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0
679+
680+
if head and not has_direct_content and len(nested_divs) == 0:
681+
# This is a header-only div with no nested content
682+
# Capture the header for the next div
683+
head_paragraph = self._clean_text(head.get_text())
684+
continue # Skip to next div, the header will be used by subsequent sibling
685+
668686
# Process this div and potentially nested divs
669687
for passage in self._process_div_with_nested_content(div, passage_level, head_paragraph):
670688
yield passage
689+
690+
# Reset head_paragraph after it's been used by a content-bearing div
691+
head_paragraph = None
692+
671693

672694
def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_paragraph: str = None) -> Iterator[Dict[str, Union[str, Dict[str, str]]]]:
673695
"""
674696
Process a div and its nested content, handling various back section types.
675697
Supports nested divs for complex back sections like annex with multiple subsections.
698+
Also handles formula elements that are direct children of divs.
676699
"""
677700
head = div.find("head")
678701
p_nodes = div.find_all("p")
@@ -687,10 +710,12 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
687710
if child.name == "div" or child.name.endswith(":div"):
688711
nested_divs.append(child)
689712

690-
# Count only direct child paragraphs, not those in nested divs
713+
# Count only direct child paragraphs and formulas, not those in nested divs
691714
direct_p_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "p"]
715+
direct_formula_nodes = [child for child in div.children if hasattr(child, 'name') and child.name == "formula"]
716+
has_direct_content = len(direct_p_nodes) > 0 or len(direct_formula_nodes) > 0
692717

693-
if len(nested_divs) > 0 and len(direct_p_nodes) == 0:
718+
if len(nested_divs) > 0 and not has_direct_content:
694719
# This is a container div - process each nested div independently
695720
for nested_div in nested_divs:
696721
# Skip references divs
@@ -703,11 +728,11 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
703728

704729
# Determine the section header and content type for divs with content
705730
if head:
706-
if len(direct_p_nodes) == 0:
707-
# This div has only a head, no paragraphs (standalone head)
731+
if not has_direct_content:
732+
# This div has only a head, no paragraphs or formulas (standalone head)
708733
current_head_paragraph = self._clean_text(head.get_text())
709734
else:
710-
# This div has both head and paragraphs - head is the section header
735+
# This div has both head and content - head is the section header
711736
head_section = self._clean_text(head.get_text())
712737
else:
713738
# If no head element, try to use the type attribute as head_section
@@ -722,35 +747,68 @@ def _process_div_with_nested_content(self, div: Tag, passage_level: str, head_pa
722747
head_section = "Author Contributions"
723748
elif div_type == "availability":
724749
# Only set as default if this div has its own content
725-
if len(direct_p_nodes) > 0:
750+
if has_direct_content:
726751
head_section = "Data Availability"
727752
elif div_type == "annex":
728753
head_section = "Annex"
729754
else:
730755
# Generic handling - capitalize and format
731756
head_section = div_type.replace("_", " ").title()
732757

733-
# Process paragraphs in this div
734-
if len(direct_p_nodes) > 0:
735-
for id_p, p in enumerate(direct_p_nodes):
758+
# Process direct children (paragraphs and formulas) in document order
759+
for child in div.children:
760+
if not hasattr(child, 'name') or not child.name:
761+
continue
762+
763+
if child.name == "p":
736764
paragraph_id = get_random_id(prefix="p_")
737765

738766
if passage_level == "sentence":
739-
for id_s, sentence in enumerate(p.find_all("s")):
767+
for id_s, sentence in enumerate(child.find_all("s")):
740768
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, sentence)
741769
if self.validate_refs:
742770
for ref in struct['refs']:
743-
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
744-
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
771+
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
772+
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
745773
yield struct
746774
else:
747-
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, p)
775+
struct = get_formatted_passage(current_head_paragraph or head_paragraph, head_section, paragraph_id, child)
748776
if self.validate_refs:
749777
for ref in struct['refs']:
750-
assert "Wrong offsets", ref['offset_start'] < ref['offset_end']
751-
assert "Cannot apply offsets", struct['text'][ref['offset_start']:ref['offset_end']] == ref['text']
778+
assert ref['offset_start'] < ref['offset_end'], "Wrong offsets"
779+
assert struct['text'][ref['offset_start']:ref['offset_end']] == ref['text'], "Cannot apply offsets"
752780
yield struct
753781

782+
elif child.name == "formula":
783+
# Process formula elements as passages
784+
formula_id = get_random_id(prefix="f_")
785+
formula_text = self._clean_text(child.get_text())
786+
787+
if formula_text:
788+
# Create a passage structure for the formula
789+
formula_passage = {
790+
"id": formula_id,
791+
"text": formula_text,
792+
"coords": [
793+
box_to_dict(coord.split(","))
794+
for coord in child.get("coords", "").split(";")
795+
] if child.has_attr("coords") else [],
796+
"refs": [],
797+
"type": "formula"
798+
}
799+
800+
if current_head_paragraph or head_paragraph:
801+
formula_passage["head_paragraph"] = current_head_paragraph or head_paragraph
802+
if head_section:
803+
formula_passage["head_section"] = head_section
804+
805+
# Extract formula label if present
806+
label = child.find("label")
807+
if label:
808+
formula_passage["label"] = self._clean_text(label.get_text())
809+
810+
yield formula_passage
811+
754812
# Update head_paragraph for potential next div
755813
if current_head_paragraph is not None:
756814
head_paragraph = current_head_paragraph

grobid_client/format/TEI2Markdown.py

Lines changed: 82 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,7 @@
1111
- Annex
1212
- References
1313
"""
14-
import os
15-
import uuid
14+
import re
1615
from pathlib import Path
1716
from typing import List, Dict, Union, Optional, BinaryIO
1817
from bs4 import BeautifulSoup, NavigableString, Tag
@@ -44,9 +43,12 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
4443
try:
4544
# Load with BeautifulSoup
4645
if isinstance(tei_file, (str, Path)):
47-
content = open(tei_file, 'r', encoding='utf-8').read()
46+
with open(tei_file, 'r', encoding='utf-8') as f:
47+
content = f.read()
4848
else:
4949
content = tei_file.read()
50+
if isinstance(content, bytes):
51+
content = content.decode('utf-8')
5052

5153
soup = BeautifulSoup(content, 'xml')
5254

@@ -77,7 +79,7 @@ def convert_tei_file(self, tei_file: Union[Path, BinaryIO]) -> Optional[str]:
7779
# Extract publication date
7880
pub_date = self._extract_publication_date(soup)
7981
if pub_date:
80-
markdown_sections.append(f"Publishd on {pub_date}\n\n")
82+
markdown_sections.append(f"Published on {pub_date}\n\n")
8183

8284
# Extract abstract
8385
abstract = self._extract_abstract(soup)
@@ -211,14 +213,24 @@ def _extract_fulltext(self, soup: BeautifulSoup) -> str:
211213
head = div.find("head")
212214
if head:
213215
section_title = head.get_text().strip()
214-
fulltext_sections.append(f"### {section_title}\n")
215-
216-
# Get paragraphs
217-
paragraphs = div.find_all("p")
218-
for p in paragraphs:
219-
paragraph_text = self._process_paragraph(p)
220-
if paragraph_text.strip():
221-
fulltext_sections.append(f"{paragraph_text}\n\n")
216+
if section_title:
217+
fulltext_sections.append(f"### {section_title}\n")
218+
219+
# Process direct children of the div in document order
220+
# This captures paragraphs, formulas, and other elements as they appear
221+
for child in div.children:
222+
if not hasattr(child, 'name') or not child.name:
223+
continue
224+
225+
if child.name == "p":
226+
paragraph_text = self._process_paragraph(child)
227+
if paragraph_text.strip():
228+
fulltext_sections.append(f"{paragraph_text}\n\n")
229+
elif child.name == "formula":
230+
# Handle formula elements - extract text and optional label
231+
formula_text = self._process_formula(child)
232+
if formula_text.strip():
233+
fulltext_sections.append(f"{formula_text}\n\n")
222234

223235
return "".join(fulltext_sections)
224236

@@ -270,16 +282,23 @@ def _process_div_and_nested_divs(self, div: Tag, annex_sections: list) -> None:
270282
if header_text not in annex_sections:
271283
annex_sections.append(header_text)
272284

273-
# Process paragraphs that are direct children of this div (not in nested divs)
285+
# Process direct children of this div in document order
286+
# This captures paragraphs, formulas, and other elements as they appear
274287
for child in div.children:
275-
if hasattr(child, 'name') and child.name == "p":
288+
if not hasattr(child, 'name') or not child.name:
289+
continue
290+
291+
if child.name == "p":
276292
paragraph_text = self._process_paragraph(child)
277293
if paragraph_text.strip():
278294
annex_sections.append(f"{paragraph_text}\n\n")
279-
280-
# Process nested div elements
281-
for child in div.children:
282-
if hasattr(child, 'name') and child.name == "div":
295+
elif child.name == "formula":
296+
# Handle formula elements
297+
formula_text = self._process_formula(child)
298+
if formula_text.strip():
299+
annex_sections.append(f"{formula_text}\n\n")
300+
elif child.name == "div":
301+
# Process nested div elements
283302
self._process_div_and_nested_divs(child, annex_sections)
284303

285304
def _extract_references(self, soup: BeautifulSoup) -> str:
@@ -336,6 +355,34 @@ def _process_paragraph(self, p_element: Tag) -> str:
336355

337356
return "".join(text_parts).strip()
338357

358+
def _process_formula(self, formula_element: Tag) -> str:
359+
"""Process a formula element and convert to markdown.
360+
361+
Formulas are rendered as italicized text with optional equation label.
362+
"""
363+
# Get the main formula text (excluding the label)
364+
formula_text_parts = []
365+
label_text = ""
366+
367+
for child in formula_element.children:
368+
if hasattr(child, 'name') and child.name == "label":
369+
# Extract equation label (e.g., "(1)", "(2)")
370+
label_text = child.get_text().strip()
371+
elif isinstance(child, NavigableString):
372+
formula_text_parts.append(str(child))
373+
else:
374+
# Other elements within formula - get their text
375+
formula_text_parts.append(child.get_text())
376+
377+
formula_text = "".join(formula_text_parts).strip()
378+
379+
if formula_text:
380+
# Format as: *formula text* (label) if label exists
381+
if label_text:
382+
return f"*{formula_text}* {label_text}"
383+
return f"*{formula_text}*"
384+
return ""
385+
339386
def _table_to_markdown(self, table_element: Tag) -> str:
340387
"""Convert a table element to simple markdown."""
341388
markdown_lines = []
@@ -511,17 +558,25 @@ def _process_imprint_section(self, imprint: Tag, bib_data: dict) -> None:
511558
unit = bibl_scope.get("unit", "").lower()
512559
text = bibl_scope.get_text().strip()
513560

514-
if unit == "vol" and text:
561+
if unit in ["vol", "volume"] and text:
515562
bib_data['volume'] = text
516563
elif unit == "issue" and text:
517564
bib_data['issue'] = text
518565
elif unit == "page" and text:
519566
# Handle page ranges
520-
if "from" in bibl_scope.attrs:
521-
bib_data['pages'] = f"{text}-"
522-
elif "to" in bibl_scope.attrs and bib_data.get('pages'):
523-
bib_data['pages'] += text
524-
else:
567+
from_val = bibl_scope.get("from")
568+
to_val = bibl_scope.get("to")
569+
if from_val and to_val:
570+
# Both from and to in same element
571+
bib_data['pages'] = f"{from_val}-{to_val}"
572+
elif from_val:
573+
# Only from specified, may get combined with another element
574+
bib_data['pages'] = f"{from_val}-"
575+
elif to_val and bib_data.get('pages'):
576+
# Only to specified, append to existing from
577+
bib_data['pages'] = bib_data['pages'] + to_val
578+
elif text and not bib_data.get('pages'):
579+
# Plain text, no from/to attributes
525580
bib_data['pages'] = text
526581

527582
def _extract_author_info(self, author: Tag) -> dict:
@@ -629,6 +684,9 @@ def _build_publication_details(self, ref_data: dict) -> str:
629684
"""Build publication details string from extracted data."""
630685
details = []
631686

687+
if ref_data.get('year'):
688+
details.append(f"({ref_data['year']})")
689+
632690
if ref_data.get('volume'):
633691
details.append(ref_data['volume'])
634692

@@ -684,7 +742,6 @@ def _extract_raw_reference(self, bibl_struct: Tag) -> str:
684742
raw_text = bibl_struct.get_text().strip()
685743

686744
# Remove reference number if present
687-
import re
688745
raw_text = re.sub(r'^\[\d+\]\s*', '', raw_text)
689746

690747
# Clean up excessive whitespace

0 commit comments

Comments (0)