From 775c8ee29ba1fe36bfe3b30f7f063e1d99aa6b6b Mon Sep 17 00:00:00 2001 From: Jason Fok Date: Wed, 15 Apr 2026 00:34:01 +0800 Subject: [PATCH] Add KEEP model with frequency-aware regularization and MIMIC-IV ablation example --- docs/api/models.rst | 1 + docs/api/models/pyhealth.models.keep.rst | 47 ++++ examples/mimic4_readmission_keep.py | 298 +++++++++++++++++++++++ pyhealth/models/__init__.py | 1 + pyhealth/models/keep.py | 298 +++++++++++++++++++++++ tests/core/test_keep.py | 94 +++++++ 6 files changed, 739 insertions(+) create mode 100644 docs/api/models/pyhealth.models.keep.rst create mode 100644 examples/mimic4_readmission_keep.py create mode 100644 pyhealth/models/keep.py create mode 100644 tests/core/test_keep.py diff --git a/docs/api/models.rst b/docs/api/models.rst index 7368dec94..7beba8537 100644 --- a/docs/api/models.rst +++ b/docs/api/models.rst @@ -186,6 +186,7 @@ API Reference models/pyhealth.models.Deepr models/pyhealth.models.EHRMamba models/pyhealth.models.JambaEHR + models/pyhealth.models.keep models/pyhealth.models.ContraWR models/pyhealth.models.SparcNet models/pyhealth.models.StageNet diff --git a/docs/api/models/pyhealth.models.keep.rst b/docs/api/models/pyhealth.models.keep.rst new file mode 100644 index 000000000..a4c8ced00 --- /dev/null +++ b/docs/api/models/pyhealth.models.keep.rst @@ -0,0 +1,47 @@ +pyhealth.models.keep +==================== + +Overview +-------- + +This module implements **KEEP** (Knowledge-Preserving and Empirically +Refined Embedding Process), a method for learning robust medical code +embeddings by integrating structured medical ontologies with empirical +co-occurrence patterns from electronic health records (EHR). 
+ +KEEP addresses the trade-off between: + +- Knowledge-graph-based embeddings (which preserve ontology structure) +- Data-driven embeddings (which capture empirical associations) + +Our implementation provides: + +- Lightweight co-occurrence-based embedding pretraining +- Optional frequency-aware ontology regularization +- Supervised readmission prediction via mean pooling +- Compatibility with the PyHealth Trainer API + +This implementation is adapted for coursework-scale experiments +using MIMIC-IV. + +Paper Reference +--------------- + +Ahmed Elhussein, Paul Meddeb, Abigail Newbury, Jeanne Mirone, +Martin Stoll, and Gamze Gursoy. + +**"KEEP: Integrating Medical Ontologies with Clinical Data for Robust Code Embeddings."** + +Proceedings of Machine Learning Research (PMLR), vol. 287, +pp. 1–19, 2025. + +arXiv: https://arxiv.org/abs/2510.05049 +DOI: https://doi.org/10.48550/arXiv.2510.05049 + +API Reference +------------- + +.. autoclass:: pyhealth.models.KEEP + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/examples/mimic4_readmission_keep.py b/examples/mimic4_readmission_keep.py new file mode 100644 index 000000000..ca5cfea7a --- /dev/null +++ b/examples/mimic4_readmission_keep.py @@ -0,0 +1,298 @@ +""" +examples/mimic4_readmission_keep.py + +Ablation Study for KEEP on MIMIC-IV Readmission Prediction +=========================================================== + +This script evaluates the KEEP model and performs a structured +ablation study to analyze the impact of ontology regularization +and frequency-aware regularization on readmission prediction. + +---------------------------------------------------------------------- +RESEARCH QUESTION +---------------------------------------------------------------------- + +Does ontology-based regularization improve readmission prediction +performance, and does frequency-aware regularization further improve +robustness compared to uniform regularization? 
+ +---------------------------------------------------------------------- +EXPERIMENTAL VARIABLES +---------------------------------------------------------------------- + +We systematically vary two factors: + +1) Regularization Strength (lambda_base) + - 0.0 → No ontology regularization + - 0.1 → Standard KEEP regularization + +2) Frequency-Aware Regularization + - False → Uniform λ for all codes + - True → λ_i = lambda_base / sqrt(freq_i + 1) + +Additionally, we vary embedding dimensionality: + - 64 + - 128 + +---------------------------------------------------------------------- +DATASET +---------------------------------------------------------------------- + +This script uses the official MIMIC-IV demo dataset +(mimic-iv-clinical-database-demo-2.2). + +The demo dataset is sufficient for: + - Verifying model integration + - Running structured ablation + - Demonstrating performance comparison + +---------------------------------------------------------------------- +EVALUATION METRICS +---------------------------------------------------------------------- + +We report: + - AUROC + - AUPRC + - F1 Score + - Accuracy + +---------------------------------------------------------------------- +USAGE +---------------------------------------------------------------------- + +Run with MIMIC-IV demo data: + + python examples/mimic4_readmission_keep.py \ + --mimic_root /path/to/mimic-iv-demo +""" + +import os +import random +import argparse +import numpy as np +import torch + +from pyhealth.datasets import get_dataloader +from pyhealth.datasets import MIMIC4Dataset +from pyhealth.datasets.splitter import split_by_patient +from pyhealth.tasks import ReadmissionPredictionMIMIC4 +from pyhealth.trainer import Trainer +from pyhealth.models import KEEP + + +# --------------------------------------------------------------------- +# Reproducibility +# --------------------------------------------------------------------- +def set_seed(seed: int = 42) -> None: + """Set random seeds 
for reproducibility.""" + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + +# --------------------------------------------------------------------- +# Single Experiment Runner +# --------------------------------------------------------------------- +def run_experiment( + train_dataset, + val_dataset, + test_dataset, + lambda_base: float, + use_frequency_regularization: bool, + embedding_dim: int, +): + """ + Train and evaluate KEEP under a specific configuration. + + Each experiment includes: + 1) Unsupervised embedding pretraining + 2) Supervised readmission training + 3) Evaluation on held-out test set + """ + + print("=" * 80) + print( + f"Config | lambda_base={lambda_base} | " + f"use_freq_reg={use_frequency_regularization} | " + f"embedding_dim={embedding_dim}" + ) + + model = KEEP( + dataset=train_dataset, + embedding_dim=embedding_dim, + lambda_base=lambda_base, + use_frequency_regularization=use_frequency_regularization, + ) + + # ---------------------------------------------------------- + # Stage 1: Co-occurrence Pretraining + # ---------------------------------------------------------- + samples = [train_dataset[i] for i in range(len(train_dataset))] + model.pretrain_embeddings( + samples=samples, + epochs=1, # Reduced for demo speed + lr=1e-3, + ) + + # ---------------------------------------------------------- + # Stage 2: Supervised Fine-tuning + # ---------------------------------------------------------- + trainer = Trainer( + model=model, + metrics=["roc_auc", "pr_auc", "f1", "accuracy"], + ) + + train_loader = get_dataloader( + train_dataset, + batch_size=16, + shuffle=True, + ) + + val_loader = get_dataloader( + val_dataset, + batch_size=16, + shuffle=False, + ) + + test_loader = get_dataloader( + test_dataset, + batch_size=16, + shuffle=False, + ) + + trainer.train( + train_dataloader=train_loader, + val_dataloader=val_loader, + epochs=2, + ) + + results = trainer.evaluate(test_loader) + + return { + "auroc": 
results.get("roc_auc", float("nan")), + "auprc": results.get("pr_auc", float("nan")), + "f1": results.get("f1", float("nan")), + "accuracy": results.get("accuracy", float("nan")), + } + + +# --------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------- +def main() -> None: + + set_seed(42) + + # ---------------------------------------------------------- + # Argument Parsing + # ---------------------------------------------------------- + parser = argparse.ArgumentParser() + parser.add_argument( + "--mimic_root", + type=str, + required=True, + help="Path to MIMIC-IV demo dataset.", + ) + args = parser.parse_args() + + print(f"Using MIMIC-IV demo dataset at: {args.mimic_root}") + + # ---------------------------------------------------------- + # Dataset Loading + # ---------------------------------------------------------- + dataset = MIMIC4Dataset( + ehr_root=args.mimic_root, + ehr_tables=[ + "admissions", + "diagnoses_icd", + "procedures_icd", + "prescriptions", + ], + dev=False, + ) + + print("Setting up readmission prediction task...") + task_dataset = dataset.set_task( + ReadmissionPredictionMIMIC4() + ) + + # ---------------------------------------------------------- + # Train / Validation / Test Split + # ---------------------------------------------------------- + print("Splitting dataset by patient...") + train_dataset, val_dataset, test_dataset = split_by_patient( + task_dataset, + ratios=[0.7, 0.1, 0.2], + ) + + print(f"Train size: {len(train_dataset)}") + print(f"Val size: {len(val_dataset)}") + print(f"Test size: {len(test_dataset)}") + + # ---------------------------------------------------------- + # Ablation Configurations + # ---------------------------------------------------------- + configs = [ + (0.0, False), # No regularization + (0.1, False), # Standard KEEP + (0.1, True), # Frequency-aware KEEP + ] + + embedding_dims = [64, 128] + all_results = [] + + # 
---------------------------------------------------------- + # Run Experiments + # ---------------------------------------------------------- + for embedding_dim in embedding_dims: + for lambda_base, use_freq_reg in configs: + + metrics = run_experiment( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + lambda_base=lambda_base, + use_frequency_regularization=use_freq_reg, + embedding_dim=embedding_dim, + ) + + all_results.append( + { + "embedding_dim": embedding_dim, + "lambda_base": lambda_base, + "use_freq_reg": use_freq_reg, + **metrics, + } + ) + + # ---------------------------------------------------------- + # Print Results Table + # ---------------------------------------------------------- + print("\n" + "=" * 80) + print("FINAL ABLATION RESULTS") + print("Comparison across regularization strategies and embedding sizes") + print("=" * 80) + + header = ( + f"{'emb_dim':<8} | {'lambda':<8} | {'freq_reg':<8} | " + f"{'AUROC':<8} | {'AUPRC':<8} | {'F1':<8} | {'Accuracy':<8}" + ) + print(header) + print("-" * len(header)) + + for result in all_results: + print( + f"{result['embedding_dim']:<8} | " + f"{result['lambda_base']:<8} | " + f"{str(result['use_freq_reg']):<8} | " + f"{result['auroc']:<8.4f} | " + f"{result['auprc']:<8.4f} | " + f"{result['f1']:<8.4f} | " + f"{result['accuracy']:<8.4f}" + ) + + print("=" * 80) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pyhealth/models/__init__.py b/pyhealth/models/__init__.py index 5233b1726..ab0f73390 100644 --- a/pyhealth/models/__init__.py +++ b/pyhealth/models/__init__.py @@ -9,6 +9,7 @@ from .embedding import EmbeddingModel from .gamenet import GAMENet, GAMENetLayer from .jamba_ehr import JambaEHR, JambaLayer +from .keep import KEEP from .logistic_regression import LogisticRegression from .gan import GAN from .gnn import GAT, GCN diff --git a/pyhealth/models/keep.py b/pyhealth/models/keep.py new file mode 100644 index 000000000..914859aad 
"""
KEEP-lite: A minimal co-occurrence + embedding model for
readmission prediction.

Extended with:
- Optional Frequency-Aware Regularization
- Lightweight GloVe-style embedding pretraining
- Mean pooled supervised readmission prediction
"""

import math
from collections import defaultdict
from itertools import combinations
from typing import Any, Dict, List

import torch
import torch.nn as nn

from pyhealth.models import BaseModel


class KEEP(BaseModel):
    """
    KEEP-lite implementation.

    This model learns medical code embeddings using a lightweight
    co-occurrence objective and supports optional frequency-aware
    regularization. The learned embeddings are used for supervised
    readmission prediction via mean pooling.

    Args:
        dataset: PyHealth dataset instance. Must contain the
            "conditions" feature in `input_processors`.
        embedding_dim: Dimension of embedding vectors.
        lambda_base: Base regularization strength.
        use_frequency_regularization: Whether to apply
            frequency-aware regularization during pretraining.

    Example:
        >>> model = KEEP(dataset, embedding_dim=128)
        >>> output = model(conditions=batch["conditions"],
        ...                label=batch["label"])
    """

    def __init__(
        self,
        dataset,
        embedding_dim: int = 128,
        lambda_base: float = 0.1,
        use_frequency_regularization: bool = True,
    ) -> None:
        super().__init__(dataset=dataset)

        if "conditions" not in dataset.input_processors:
            raise ValueError(
                "KEEP requires 'conditions' feature in dataset."
            )

        self.dataset = dataset
        self.embedding_dim = embedding_dim
        self.lambda_base = lambda_base
        self.use_frequency_regularization = use_frequency_regularization

        processor = dataset.input_processors["conditions"]

        # ----------------------------------------------------------
        # Robust vocabulary size detection. Different PyHealth
        # processor versions expose the size differently:
        # - legacy `code_vocab_size` attribute
        # - `vocab_size` attribute OR method
        # - `get_vocab_size()` method
        # - a sized `vocab` object
        # ----------------------------------------------------------
        if hasattr(processor, "code_vocab_size"):
            vocab_size = processor.code_vocab_size
        elif hasattr(processor, "vocab_size"):
            attr = processor.vocab_size
            vocab_size = attr() if callable(attr) else attr
        elif hasattr(processor, "get_vocab_size"):
            vocab_size = processor.get_vocab_size()
        elif hasattr(processor, "vocab"):
            vocab_size = len(processor.vocab)
        else:
            raise AttributeError(
                "Cannot determine vocabulary size from processor."
            )

        self.vocab_size = int(vocab_size)

        # Index 0 is reserved for padding and is excluded from both
        # pooling and co-occurrence statistics.
        self.embedding = nn.Embedding(
            self.vocab_size,
            embedding_dim,
            padding_idx=0,
        )

        self.classifier = nn.Linear(embedding_dim, 1)

        # Fix: the original used BCELoss on sigmoid(logits), which is
        # numerically unstable for saturated logits (log(0) -> inf).
        # BCEWithLogitsLoss computes the same loss from raw logits via
        # the log-sum-exp trick.
        self.loss_fn = nn.BCEWithLogitsLoss()

        # Frequency-aware components, populated by
        # compute_code_frequencies() during pretraining.
        self.code_frequencies: torch.Tensor | None = None
        self.lambda_vector: torch.Tensor | None = None

    # ==========================================================
    # PART A — Sparse Co-occurrence Builder
    # ==========================================================
    def build_cooccurrence(
        self, samples: List[Dict[str, Any]]
    ) -> Dict:
        """Build sparse visit-level co-occurrence counts.

        Args:
            samples: List of dataset samples containing "conditions".
                Assumes each sample's "conditions" is a flat sequence
                of integer code ids (0 = padding) — TODO confirm for
                nested/visit-grouped processors.

        Returns:
            Dictionary mapping (code_i, code_j) with code_i < code_j
            to co-occurrence count.
        """
        cooccur = defaultdict(int)

        for sample in samples:
            codes = sample["conditions"]

            if isinstance(codes, torch.Tensor):
                codes = codes.detach().cpu().tolist()

            # Drop padding; deduplicate within the sample so repeated
            # codes in one visit count once.
            unique_codes = {int(c) for c in codes if c != 0}
            if len(unique_codes) < 2:
                continue

            # Sorted pairs give a canonical (i, j) key with i < j.
            for i, j in combinations(sorted(unique_codes), 2):
                cooccur[(i, j)] += 1

        return cooccur

    # ==========================================================
    # PART B — Frequency Computation
    # ==========================================================
    def compute_code_frequencies(
        self, samples: List[Dict[str, Any]]
    ) -> None:
        """Compute per-code document frequency across samples.

        Each code counts at most once per sample.

        Args:
            samples: List of dataset samples.

        Sets:
            self.code_frequencies: per-code counts, shape [vocab_size].
            self.lambda_vector: lambda_base / sqrt(freq + 1).
        """
        freq = torch.zeros(self.vocab_size)

        for sample in samples:
            codes = sample["conditions"]

            if isinstance(codes, torch.Tensor):
                codes = codes.detach().cpu().tolist()

            for c in {int(c) for c in codes if c != 0}:
                # Guard against out-of-vocabulary ids, which would
                # otherwise raise an IndexError here.
                if 0 <= c < self.vocab_size:
                    freq[c] += 1

        self.code_frequencies = freq
        # Rare codes get a larger lambda, pulling their embeddings
        # toward the origin more strongly.
        self.lambda_vector = self.lambda_base / torch.sqrt(freq + 1.0)

    # ==========================================================
    # PART C — Lightweight GloVe-style Pretraining
    # ==========================================================
    def pretrain_embeddings(
        self,
        samples: List[Dict[str, Any]],
        epochs: int = 3,
        lr: float = 1e-3,
    ) -> None:
        """Pretrain embeddings using a co-occurrence objective.

        Minimizes, per co-occurring pair (i, j):

            ( dot(w_i, w_j) - log(count + 1) )^2

        and, if frequency regularization is enabled, adds

            lambda_i ||w_i||^2 + lambda_j ||w_j||^2

        Note: optimization is one SGD step per pair (demo scale);
        large corpora would want a batched formulation.

        Args:
            samples: Training samples.
            epochs: Number of pretraining epochs.
            lr: Learning rate.
        """
        print("Building co-occurrence matrix...")
        cooccur = self.build_cooccurrence(samples)

        if len(cooccur) == 0:
            print("No co-occurring condition pairs found.")
            return

        if self.use_frequency_regularization:
            print("Computing code frequencies...")
            self.compute_code_frequencies(samples)

        optimizer = torch.optim.Adam(self.embedding.parameters(), lr=lr)

        self.embedding.train()

        for epoch in range(epochs):
            total_loss = 0.0

            for (i, j), count in cooccur.items():
                wi = self.embedding.weight[i]
                wj = self.embedding.weight[j]

                dot = torch.dot(wi, wj)
                # new_tensor keeps the target on the same device/dtype
                # as the embedding weights.
                target = dot.new_tensor(math.log(count + 1.0))
                glove_loss = (dot - target) ** 2

                if self.use_frequency_regularization:
                    # 0-dim CPU tensors broadcast as scalars even when
                    # the embeddings live on GPU.
                    lambda_i = self.lambda_vector[i]
                    lambda_j = self.lambda_vector[j]
                    reg_loss = (
                        lambda_i * torch.norm(wi, p=2) ** 2
                        + lambda_j * torch.norm(wj, p=2) ** 2
                    )
                    loss = glove_loss + reg_loss
                else:
                    loss = glove_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            print(
                f"[Pretrain] Epoch {epoch+1}/{epochs} "
                f"| Loss: {total_loss:.4f}"
            )

        print("Pretraining complete.\n")

    # ==========================================================
    # PART D — Supervised Forward
    # ==========================================================
    def forward(
        self,
        conditions: torch.Tensor,
        x: torch.Tensor = None,
        label: torch.Tensor | None = None,
        y: torch.Tensor | None = None,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Forward pass for readmission prediction.

        Args:
            conditions: [batch, seq] integer code ids (0 = padding).
            x: Alias for `conditions` (PyHealth compatibility).
            label: Binary targets, any shape flattenable to [batch].
            y: Alias for `label`.
            **kwargs: May carry the label under "readmission"/"label".

        Returns:
            Dict with "y_prob" (and "loss", "y_true" when a label is
            available).
        """
        if conditions is None and x is not None:
            conditions = x

        # Resolve label key dynamically (PyHealth compatibility).
        if label is None:
            if y is not None:
                label = y
            elif "readmission" in kwargs:
                label = kwargs["readmission"]
            elif "label" in kwargs:
                label = kwargs["label"]

        # Fix: do not reuse the parameter name `x` for the embedding
        # output; use a distinct local instead.
        emb = self.embedding(conditions)

        # Masked mean pooling over non-padding positions.
        mask = (conditions != 0).unsqueeze(-1).float()
        summed = (emb * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1.0)  # avoid divide-by-zero
        pooled = summed / counts

        logits = self.classifier(pooled).squeeze(-1)
        y_prob = torch.sigmoid(logits)

        if label is not None:
            label = label.view(-1)  # flatten to [batch_size]
            # Loss is computed on raw logits (BCEWithLogitsLoss) for
            # numerical stability; probabilities are still returned.
            loss = self.loss_fn(logits, label.float())
            return {
                "loss": loss,
                "y_prob": y_prob,
                "y_true": label,
            }

        return {"y_prob": y_prob}
self.assertEqual(output["y_prob"].shape, (2,)) + + def test_gradient_computation(self): + model = KEEP( + dataset=self.dataset, + embedding_dim=8, + lambda_base=0.1, + ) + + conditions = torch.randint(0, 20, (2, 5)) + labels = torch.tensor([1.0, 0.0]) + + output = model( + conditions=conditions, + label=labels, + ) + + loss = output["loss"] + loss.backward() + + # Check at least one parameter received gradient + grads = [ + param.grad + for param in model.parameters() + if param.requires_grad and param.grad is not None + ] + + self.assertTrue(len(grads) > 0) \ No newline at end of file