Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions packtools/sps/models/ext_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""
Model for extracting ext-link elements from XML documents.

<ext-link> is used for marking external links/hyperlinks in scientific articles.
It ensures accessibility and interoperability of external references.

Example:
<ext-link ext-link-type="uri"
xlink:href="https://www.scielo.br/"
xlink:title="SciELO Scientific Library">
SciELO Brasil
</ext-link>
"""
from packtools.sps.models.article_and_subarticles import Fulltext
from packtools.sps.utils.xml_utils import (
node_plain_text,
tostring,
remove_namespaces,
)


class ExtLink(Fulltext):
"""
Extracts ext-link elements from article or sub-article nodes.

Processes article/sub-article and provides methods to extract
ext-link information including attributes and text content.
"""

@property
def parent_data(self):
"""Returns parent context data for validation."""
return self.attribs_parent_prefixed

@property
def ext_links(self):
"""
Extract all ext-link elements with their attributes and context.

Returns
-------
list
List of dictionaries containing ext-link information:
- ext_link_type: Value of @ext-link-type attribute
- xlink_href: Value of @xlink:href attribute
- xlink_title: Value of @xlink:title attribute (optional)
- text: Plain text content of the element
- parent: Parent element tag
- parent_id: Parent element id
- parent_lang: Parent element language
- parent_article_type: Parent article type

Example:
[
{
'ext_link_type': 'uri',
'xlink_href': 'https://www.scielo.br/',
'xlink_title': 'SciELO Platform',
'text': 'SciELO Brasil',
'parent': 'article',
'parent_id': None,
'parent_lang': 'en',
'parent_article_type': 'research-article'
}
]
"""
# Search in front, body and back sections
nodes = self.node.xpath("./front | ./front-stub | ./body | ./back")

for node in nodes:
for ext_link in node.xpath(".//ext-link"):
ext_link_data = {}

# Extract attributes, handling namespace prefixes
for key, value in ext_link.attrib.items():
if "}" in key:
# Remove namespace prefix (e.g., {http://www.w3.org/1999/xlink}href -> href)
key = key.split("}")[-1]

# Map xlink attributes to snake_case
if key == "href":
ext_link_data["xlink_href"] = value
elif key == "title":
ext_link_data["xlink_title"] = value
elif key == "ext-link-type":
ext_link_data["ext_link_type"] = value
else:
ext_link_data[key] = value

# Set None for missing attributes
ext_link_data.setdefault("ext_link_type", None)
ext_link_data.setdefault("xlink_href", None)
ext_link_data.setdefault("xlink_title", None)

# Extract text content
ext_link_data["text"] = node_plain_text(ext_link) or ""

# Add parent context
ext_link_data.update(self.parent_data)

yield ext_link_data

@property
def fulltexts(self):
"""Recursively process sub-articles."""
for node in self.node.xpath("sub-article"):
yield ExtLink(node)


class ArticleExtLinks:
"""
Main class for extracting all ext-links from an XML document.

Processes main article and all sub-articles to extract ext-link information.
"""

def __init__(self, xmltree):
"""
Initialize with XML tree.

Parameters
----------
xmltree : lxml.etree._Element
The root element of the XML document
"""
self.xmltree = xmltree
self._ext_link = ExtLink(xmltree)

@property
def ext_links(self):
"""
Extract all ext-links from main article and sub-articles.

Returns
-------
list
List of all ext-link dictionaries from article and sub-articles
"""
ext_links = []

# Main article ext-links
ext_links.extend(list(self._ext_link.ext_links))

# Sub-articles ext-links (recursively)
for fulltext in self._ext_link.fulltexts:
ext_links.extend(list(fulltext.ext_links))
# Handle nested sub-articles
for nested_fulltext in fulltext.fulltexts:
ext_links.extend(list(nested_fulltext.ext_links))

return ext_links
Loading