Skip to content

Generate the fda_approved_drugs_v1.0.pickle file using Sep. 2025 Babel and the latest DrugBank #2670

@hodgesf

Description

@hodgesf

I used the following script, run on kg2103build.rtx.ai, to produce the fda_approved_drugs_v1.0_KG2.10.3c.pickle file:

#!/usr/bin/env python3

import xml.etree.ElementTree as ET
import stitch_proj.local_babel as lb
import pickle
import argparse

# Module-level authorship, licensing, and version metadata for this script.
__author__ = 'Frankie Hodges'
__copyright__ = 'Oregon State University'
__credits__ = ['Stephen Ramsey', 'Frankie Hodges']
__license__ = 'MIT'
__version__ = '0.1.0'
__maintainer__ = ''
__email__ = ''
__status__ = 'Prototype'


def parse_args():
    """Parse the command-line options for the pickle generator.

    All three options are optional strings; when one is omitted, the
    hard-coded default path listed below is used instead.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``, carrying the
        attributes ``drugbank_xml``, ``babel_db`` and ``output_pickle``.
    """
    # (flag, default value, help text) for every supported option.
    option_specs = (
        (
            "--drugbank_xml",
            "/home/ubuntu/kg2-build/drugbank.xml",
            "Path to drugbank.xml",
        ),
        (
            "--babel_db",
            "/data/babel-20250901-p1.sqlite",
            "Path to local Babel sqlite DB",
        ),
        (
            "--output_pickle",
            "/home/ubuntu/kg2-build/fda_approved_drugs_v1.0_KG2.10.3c.pickle",
            "Output pickle filename",
        ),
    )

    cli = argparse.ArgumentParser(
        description=(
            "Generate fda_approved_drugs_v1.0_KG2.10.3c.pickle "
            "from DrugBank XML and local Babel DB"
        ),
    )
    for flag, fallback, help_text in option_specs:
        cli.add_argument(flag, type=str, default=fallback, help=help_text)

    return cli.parse_args()


def extract_approved_drugbank_ids(xml_path):
    """Return CURIEs for every DrugBank entry whose groups include "approved".

    The whole XML document at *xml_path* is parsed into memory, then each
    ``drug`` element that is a direct child of the root is inspected. A drug
    is kept when any ``group`` under its ``groups`` element has the text
    ``approved``, compared after trimming surrounding whitespace.

    Args:
        xml_path: Path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        A list of strings of the form ``"DRUGBANK:<primary id>"``, in
        document order. Drugs without a ``groups`` element, without an
        "approved" group, or without a non-empty primary ``drugbank-id``
        are skipped.
    """
    ns = {"db": "http://www.drugbank.ca"}

    root = ET.parse(xml_path).getroot()

    approved_curie_ids = []

    # Only direct children of the root are top-level drug records.
    for drug in root.findall("db:drug", ns):
        groups = drug.find("db:groups", ns)
        if groups is None:
            continue

        # Trim the group text before comparing, matching how the primary ID
        # is stripped below; ``text`` is None for an empty <group/> element.
        is_approved = any(
            (group.text or "").strip() == "approved"
            for group in groups.findall("db:group", ns)
        )
        if not is_approved:
            continue

        primary_id = drug.find("db:drugbank-id[@primary='true']", ns)
        if primary_id is not None and primary_id.text:
            approved_curie_ids.append("DRUGBANK:" + primary_id.text.strip())

    return approved_curie_ids


def canonicalize_ids(curie_ids, babel_db_path):
    """Map each input CURIE to the identifiers of its Babel cliques.

    A single connection obtained from ``lb.connect_to_db_read_only`` is used
    for all lookups. CURIEs for which ``lb.map_any_curie_to_cliques`` returns
    nothing are silently dropped.

    Args:
        curie_ids: Iterable of CURIE strings to look up.
        babel_db_path: Path handed to ``lb.connect_to_db_read_only``.

    Returns:
        A set holding ``clique["id"]["identifier"]`` for every clique
        returned across all input CURIEs.
    """
    resolved = set()

    with lb.connect_to_db_read_only(babel_db_path) as db_conn:
        for source_curie in curie_ids:
            matches = lb.map_any_curie_to_cliques(db_conn, source_curie)
            if matches:
                # One CURIE may belong to several cliques; keep them all.
                resolved.update(
                    match["id"]["identifier"] for match in matches
                )

    return resolved


def main():
    """Run the full pipeline: extract, canonicalize, and pickle.

    Reads the paths from the command line, collects the approved DrugBank
    CURIEs from the XML file, maps them to canonical identifiers through the
    local Babel database, and writes the resulting set to the output pickle.
    Progress messages are printed to standard output along the way.
    """
    opts = parse_args()

    print("Extracting approved DrugBank IDs...")
    drugbank_curies = extract_approved_drugbank_ids(opts.drugbank_xml)
    n_approved = len(drugbank_curies)
    print(f"Found {n_approved} approved DrugBank IDs.")

    print("Canonicalizing via local Babel...")
    canonical_set = canonicalize_ids(drugbank_curies, opts.babel_db)
    n_canonical = len(canonical_set)
    print(f"Canonicalized to {n_canonical} unique canonical IDs.")

    print("Writing pickle...")
    with open(opts.output_pickle, "wb") as pickle_file:
        # The pickled payload is the set of canonical identifier strings.
        pickle.dump(canonical_set, pickle_file)

    print("Done.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

The final output file, on kg2103build.rtx.ai, is fda_approved_drugs_v1.0_KG2.10.3c.pickle.

See this commit. The script is at RTX-KG2/process/generate_fda_pickle.py and can be run with `python generate_fda_pickle.py`, optionally passing the DrugBank XML file, the Babel database, and the output path as arguments. If those arguments are not provided, the defaults are:

--drugbank_xml="/home/ubuntu/kg2-build/drugbank.xml"
--babel_db="/data/babel-20250901-p1.sqlite"
--output_pickle="/home/ubuntu/kg2-build/fda_approved_drugs_v1.0_KG2.10.3c.pickle"

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions