Generate the `fda_approved_drugs_v1.0.pickle` file using Sep. 2025 Babel and the latest DrugBank

I used the following script, ran on `kg2103build.rtx.ai` to produce the `fda_approved_drugs_v1.0_KG2.10.3c.pickle` file:

```python 
#!/usr/bin/env python3

import xml.etree.ElementTree as ET
import stitch_proj.local_babel as lb
import pickle
import argparse

__author__ = 'Frankie Hodges'
__copyright__ = 'Oregon State University'
__credits__ = ['Stephen Ramsey', 'Frankie Hodges']
__license__ = 'MIT'
__version__ = '0.1.0'
__maintainer__ = ''
__email__ = ''
__status__ = 'Prototype'


def parse_args():
    parser = argparse.ArgumentParser(
        description="Generate fda_approved_drugs_v1.0_KG2.10.3c.pickle "
                    "from DrugBank XML and local Babel DB"
    )

    parser.add_argument(
        "--drugbank_xml",
        type=str,
        default="/home/ubuntu/kg2-build/drugbank.xml",
        help="Path to drugbank.xml"
    )

    parser.add_argument(
        "--babel_db",
        type=str,
        default="/data/babel-20250901-p1.sqlite",
        help="Path to local Babel sqlite DB"
    )

    parser.add_argument(
        "--output_pickle",
        type=str,
        default="/home/ubuntu/kg2-build/fda_approved_drugs_v1.0_KG2.10.3c.pickle",
        help="Output pickle filename"
    )

    return parser.parse_args()


def extract_approved_drugbank_ids(xml_path):
    ns = {"db": "http://www.drugbank.ca"}

    tree = ET.parse(xml_path)
    root = tree.getroot()

    approved_curie_ids = []

    for drug in root.findall("db:drug", ns):
        groups = drug.find("db:groups", ns)
        if groups is None:
            continue

        is_approved = any(
            g.text == "approved"
            for g in groups.findall("db:group", ns)
        )

        if not is_approved:
            continue

        primary_id = drug.find("db:drugbank-id[@primary='true']", ns)
        if primary_id is not None and primary_id.text:
            approved_curie_ids.append("DRUGBANK:" + primary_id.text.strip())

    return approved_curie_ids


def canonicalize_ids(curie_ids, babel_db_path):
    canonical_ids = set()

    with lb.connect_to_db_read_only(babel_db_path) as conn:
        for curie in curie_ids:
            cliques = lb.map_any_curie_to_cliques(conn, curie)
            if not cliques:
                continue

            for clique in cliques:
                canonical_ids.add(clique["id"]["identifier"])

    return canonical_ids


def main():
    args = parse_args()

    print("Extracting approved DrugBank IDs...")
    approved_curie_ids = extract_approved_drugbank_ids(args.drugbank_xml)
    print(f"Found {len(approved_curie_ids)} approved DrugBank IDs.")

    print("Canonicalizing via local Babel...")
    canonical_ids = canonicalize_ids(approved_curie_ids, args.babel_db)
    print(f"Canonicalized to {len(canonical_ids)} unique canonical IDs.")

    print("Writing pickle...")
    with open(args.output_pickle, "wb") as out:
        pickle.dump(canonical_ids, out)

    print("Done.")


if __name__ == "__main__":
    main()
```
and the final output file, on kg2103build.rtx.ai is `fda_approved_drugs_v1.0_KG2.10.3c.pickle`


See [this commit](https://github.com/RTXteam/RTX-KG2/commit/5660192dc9a3bc3ff6829eb5089eff44156c43c5). It is at `RTX-KG2/process/generate_fda_pickle.py` and can be ran by: `python generate_fda_pickle` with the option to pass in the drugbank file, babel, and where to output the file. If those arguments are not provided, the defaults are:

```python 
default="/home/ubuntu/kg2-build/drugbank.xml"
babel="/data/babel-20250901-p1.sqlite"
output="/home/ubuntu/kg2-build/fda_approved_drugs_v1.0_KG2.10.3c.pickle"
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Generate the `fda_approved_drugs_v1.0.pickle` file using Sep. 2025 Babel and the latest DrugBank #2670

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Generate the fda_approved_drugs_v1.0.pickle file using Sep. 2025 Babel and the latest DrugBank #2670

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

Generate the `fda_approved_drugs_v1.0.pickle` file using Sep. 2025 Babel and the latest DrugBank #2670