-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathobo_extractor.py
More file actions
113 lines (88 loc) · 3.53 KB
/
obo_extractor.py
File metadata and controls
113 lines (88 loc) · 3.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""Extract ChEBI ontology data using fastobo and build a networkx graph."""
from __future__ import annotations
from pathlib import Path
import fastobo
import networkx as nx
def _chebi_id_to_str(chebi_id: str) -> str:
"""Convert 'CHEBI:123' to '123' (string)."""
return chebi_id.split(":")[1]
def _term_data(doc: "fastobo.term.TermFrame") -> dict | None:
    """Collect the fields of interest from one fastobo term frame.

    The clauses of the frame are visited in order. ``is_a`` targets and
    ``has_part`` relationship targets are gathered (with the ID prefix
    stripped), together with the term name, a SMILES string and the subset.
    When several name or subset clauses occur, the last one wins.

    A SMILES string found in a property-value clause always overrides the
    current value; a SMILES string found in a synonym clause is only used
    while no SMILES has been recorded yet.

    Returns
    -------
    dict or None
        Mapping with the keys ``id``, ``parents``, ``has_part``, ``name``,
        ``smiles`` and ``subset``, or ``None`` as soon as a clause marks the
        term as obsolete.
    """
    term_mod = fastobo.term
    smiles_relations = (
        "chemrof:smiles_string",
        "http://purl.obolibrary.org/obo/chebi/smiles",
    )

    is_a_targets: list[str] = []
    part_targets: set[str] = set()
    label: str | None = None
    smiles_string: str | None = None
    subset_tag: str | None = None

    for entry in doc:
        if isinstance(entry, term_mod.IsObsoleteClause):
            if entry.obsolete:
                # Obsolete terms are dropped entirely.
                return None
            continue

        if isinstance(entry, term_mod.PropertyValueClause):
            prop = entry.property_value
            if str(prop.relation) in smiles_relations:
                smiles_string = prop.value
            continue

        if isinstance(entry, term_mod.SynonymClause):
            raw = entry.raw_value()
            if "SMILES" in raw and smiles_string is None:
                # The SMILES text is the first double-quoted segment.
                smiles_string = raw.split('"')[1]
            continue

        if isinstance(entry, term_mod.RelationshipClause):
            if str(entry.typedef) == "has_part":
                part_targets.add(_chebi_id_to_str(str(entry.term)))
            continue

        if isinstance(entry, term_mod.IsAClause):
            is_a_targets.append(_chebi_id_to_str(str(entry.term)))
            continue

        if isinstance(entry, term_mod.NameClause):
            label = str(entry.name)
            continue

        if isinstance(entry, term_mod.SubsetClause):
            subset_tag = str(entry.subset)

    result = {
        "id": _chebi_id_to_str(str(doc.id)),
        "parents": is_a_targets,
        "has_part": part_targets,
        "name": label,
        "smiles": smiles_string,
        "subset": subset_tag,
    }
    return result
def build_chebi_graph(filepath: str | Path) -> nx.DiGraph:
    """Parse a ChEBI OBO file and build a directed graph of ontology terms.

    ``xref:`` lines are stripped before parsing as they can cause fastobo
    errors on some ChEBI releases. Only non-obsolete CHEBI-prefixed terms
    are included.

    **Nodes** are string CHEBI IDs (e.g. ``"1"`` for ``CHEBI:1``) with
    attributes ``name``, ``smiles``, and ``subset``.

    **Edges** carry a ``relation`` attribute and represent:

    - ``is_a`` — directed from child to parent
    - ``has_part`` — directed from whole to part

    Parameters
    ----------
    filepath : str or Path
        Path to the ChEBI OBO file. It is read as UTF-8 text.

    Returns
    -------
    nx.DiGraph
        Directed graph of ChEBI ontology terms and their relationships.

    Notes
    -----
    Edge targets that never appear as an included term of their own are
    still present in the graph, because networkx creates missing end points
    when an edge is added; such nodes carry no attributes.
    """
    with open(filepath, encoding="utf-8") as f:
        # Lines yielded by the file object keep their trailing newline, so
        # they are concatenated with an empty separator. Joining on "\n"
        # would add a second line break after every line.
        content = "".join(line for line in f if not line.startswith("xref:"))

    graph: nx.DiGraph = nx.DiGraph()

    for frame in fastobo.loads(content):
        # Skip empty frames and anything whose ID is not CHEBI-prefixed.
        if not (
            frame
            and isinstance(frame.id, fastobo.id.PrefixedIdent)
            and frame.id.prefix == "CHEBI"
        ):
            continue

        term = _term_data(frame)
        if term is None:
            # _term_data returns None for obsolete terms.
            continue

        node_id = term["id"]
        graph.add_node(
            node_id,
            name=term["name"],
            smiles=term["smiles"],
            subset=term["subset"],
        )

        for parent_id in term["parents"]:
            graph.add_edge(node_id, parent_id, relation="is_a")

        for part_id in term["has_part"]:
            graph.add_edge(node_id, part_id, relation="has_part")

    return graph