From 9ca079802db5774eb451b7285cf904304d8faa70 Mon Sep 17 00:00:00 2001 From: cgigliob Date: Sun, 12 Apr 2026 19:25:24 -0700 Subject: [PATCH] Add CaliForest model, data loader, task, and calibration metrics --- docs/api/datasets.rst | 1 + ...datasets.CaliForestMIMICExtractDataset.rst | 16 + docs/api/models.rst | 1 + .../api/models/pyhealth.models.CaliForest.rst | 7 + docs/api/tasks.rst | 1 + ...ealth.tasks.MIMICExtractCaliForestTask.rst | 7 + ...rest_mimic_extract_mortality_califorest.py | 192 ++++++++++ pyhealth/datasets/califorest_mimic_extract.py | 353 +++++++++++++++++ .../configs/califorest_mimic_extract.yaml | 32 ++ pyhealth/metrics/califorest_calibration.py | 176 +++++++++ pyhealth/models/califorest.py | 357 ++++++++++++++++++ pyhealth/tasks/mimic_extract_califorest.py | 88 +++++ tests/test_calibration_metrics.py | 91 +++++ tests/test_califorest.py | 159 ++++++++ .../test_califorest_mimic_extract_dataset.py | 131 +++++++ tests/test_mimic_extract_califorest_task.py | 69 ++++ 16 files changed, 1681 insertions(+) create mode 100644 docs/api/datasets/pyhealth.datasets.CaliForestMIMICExtractDataset.rst create mode 100644 docs/api/models/pyhealth.models.CaliForest.rst create mode 100644 docs/api/tasks/pyhealth.tasks.MIMICExtractCaliForestTask.rst create mode 100644 examples/califorest_mimic_extract_mortality_califorest.py create mode 100644 pyhealth/datasets/califorest_mimic_extract.py create mode 100644 pyhealth/datasets/configs/califorest_mimic_extract.yaml create mode 100644 pyhealth/metrics/califorest_calibration.py create mode 100644 pyhealth/models/califorest.py create mode 100644 pyhealth/tasks/mimic_extract_califorest.py create mode 100644 tests/test_calibration_metrics.py create mode 100644 tests/test_califorest.py create mode 100644 tests/test_califorest_mimic_extract_dataset.py create mode 100644 tests/test_mimic_extract_califorest_task.py diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst index b02439d26..6259bcca1 100644 --- 
a/docs/api/datasets.rst +++ b/docs/api/datasets.rst @@ -243,5 +243,6 @@ Available Datasets datasets/pyhealth.datasets.ClinVarDataset datasets/pyhealth.datasets.COSMICDataset datasets/pyhealth.datasets.TCGAPRADDataset + datasets/pyhealth.datasets.CaliForestMIMICExtractDataset datasets/pyhealth.datasets.splitter datasets/pyhealth.datasets.utils diff --git a/docs/api/datasets/pyhealth.datasets.CaliForestMIMICExtractDataset.rst b/docs/api/datasets/pyhealth.datasets.CaliForestMIMICExtractDataset.rst new file mode 100644 index 000000000..52d8d01b0 --- /dev/null +++ b/docs/api/datasets/pyhealth.datasets.CaliForestMIMICExtractDataset.rst @@ -0,0 +1,16 @@ +pyhealth.datasets.CaliForestMIMICExtractDataset +================================================= + +Overview +-------- + +MIMIC-Extract dataset loader for CaliForest experiments. Reads the +``all_hourly_data.h5`` file produced by the MIMIC-Extract pipeline and +produces a flat feature matrix suitable for tree-based models. + +.. autoclass:: pyhealth.datasets.califorest_mimic_extract.CaliForestMIMICExtractDataset + :members: + :undoc-members: + :show-inheritance: + +.. autofunction:: pyhealth.datasets.califorest_mimic_extract.load_califorest_data diff --git a/docs/api/models.rst b/docs/api/models.rst index 7368dec94..6ce96c93b 100644 --- a/docs/api/models.rst +++ b/docs/api/models.rst @@ -198,6 +198,7 @@ API Reference models/pyhealth.models.TCN models/pyhealth.models.TFMTokenizer models/pyhealth.models.GAN + models/pyhealth.models.CaliForest models/pyhealth.models.VAE models/pyhealth.models.SDOH models/pyhealth.models.VisionEmbeddingModel diff --git a/docs/api/models/pyhealth.models.CaliForest.rst b/docs/api/models/pyhealth.models.CaliForest.rst new file mode 100644 index 000000000..1c44bc715 --- /dev/null +++ b/docs/api/models/pyhealth.models.CaliForest.rst @@ -0,0 +1,7 @@ +pyhealth.models.CaliForest +=================================== + +.. 
autoclass:: pyhealth.models.califorest.CaliForest + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/tasks.rst b/docs/api/tasks.rst index 399b8f1aa..0b4c9804d 100644 --- a/docs/api/tasks.rst +++ b/docs/api/tasks.rst @@ -229,3 +229,4 @@ Available Tasks Mutation Pathogenicity (COSMIC) Cancer Survival Prediction (TCGA) Cancer Mutation Burden (TCGA) + MIMIC-Extract CaliForest diff --git a/docs/api/tasks/pyhealth.tasks.MIMICExtractCaliForestTask.rst b/docs/api/tasks/pyhealth.tasks.MIMICExtractCaliForestTask.rst new file mode 100644 index 000000000..12d5ebe67 --- /dev/null +++ b/docs/api/tasks/pyhealth.tasks.MIMICExtractCaliForestTask.rst @@ -0,0 +1,7 @@ +pyhealth.tasks.MIMICExtractCaliForestTask +========================================== + +.. autoclass:: pyhealth.tasks.mimic_extract_califorest.MIMICExtractCaliForestTask + :members: + :undoc-members: + :show-inheritance: diff --git a/examples/califorest_mimic_extract_mortality_califorest.py b/examples/califorest_mimic_extract_mortality_califorest.py new file mode 100644 index 000000000..e77d64585 --- /dev/null +++ b/examples/califorest_mimic_extract_mortality_califorest.py @@ -0,0 +1,192 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Ablation study for CaliForest hyperparameters +"""MIMIC-Extract CaliForest ablation study. + +This script demonstrates the full CaliForest pipeline and runs +ablation experiments on: + +1. **Calibration type**: CF-Iso vs CF-Logit vs RF-NoCal +2. **Prior sensitivity**: varying alpha0/beta0 +3. **Ensemble size**: 50, 100, 200, 300 trees +4. 
**Prediction target**: mort_hosp, mort_icu, los_3, los_7 + +Usage with real MIMIC-Extract data:: + + python mimic_extract_califorest_ablation.py \\ + --hdf5_path /path/to/all_hourly_data.h5 + +Usage with synthetic data (for testing):: + + python mimic_extract_califorest_ablation.py --synthetic + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random Forest +for Health Data." ACM CHIL, 2020. +https://doi.org/10.1145/3368555.3384461 +""" + +import argparse +import numpy as np +from sklearn.metrics import roc_auc_score + + +def make_synthetic_data(n=500, d=50, seed=42): + """Generate synthetic binary classification data.""" + np.random.seed(seed) + X = np.random.randn(n, d) + logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + y = (logits > 0).astype(int) + split = int(0.7 * n) + return X[:split], X[split:], y[:split], y[split:] + + +def evaluate(y_true, y_pred): + """Compute AUROC and all six calibration metrics.""" + from pyhealth.metrics.califorest_calibration import ( + hosmer_lemeshow, + reliability, + scaled_brier_score, + spiegelhalter, + ) + + auc = roc_auc_score(y_true, y_pred) + brier, brier_scaled = scaled_brier_score(y_true, y_pred) + hl = hosmer_lemeshow(y_true, y_pred) + sp = spiegelhalter(y_true, y_pred) + rel_s, rel_l = reliability(y_true, y_pred) + return { + "AUROC": auc, + "Brier": brier, + "Scaled Brier": brier_scaled, + "HL p-value": hl, + "Spiegelhalter p": sp, + "Rel-small": rel_s, + "Rel-large": rel_l, + } + + +def run_califorest(X_train, y_train, X_test, y_test, **kwargs): + """Train CaliForest and return predictions.""" + from pyhealth.models.califorest import CaliForest + + # Use the numpy-level API directly for efficiency + model = CaliForest.__new__(CaliForest) + model.n_estimators = kwargs.get("n_estimators", 300) + model.max_depth = kwargs.get("max_depth", 10) + model.min_samples_split = kwargs.get("min_samples_split", 3) + model.min_samples_leaf = kwargs.get("min_samples_leaf", 1) + model.ctype = kwargs.get("ctype", "isotonic") + 
model.alpha0 = kwargs.get("alpha0", 100.0) + model.beta0 = kwargs.get("beta0", 25.0) + model.estimators_ = [] + model.calibrator_ = None + model.is_fitted_ = False + + model._fit_numpy(X_train, y_train) + y_pred = model._predict_proba_numpy(X_test) + return y_pred + + +def run_rf_nocal(X_train, y_train, X_test, **kwargs): + """Train uncalibrated Random Forest baseline.""" + from sklearn.ensemble import RandomForestClassifier + + rf = RandomForestClassifier( + n_estimators=kwargs.get("n_estimators", 300), + max_depth=kwargs.get("max_depth", 10), + min_samples_split=kwargs.get("min_samples_split", 3), + min_samples_leaf=kwargs.get("min_samples_leaf", 1), + ) + rf.fit(X_train, y_train) + return rf.predict_proba(X_test)[:, 1] + + +def print_results(name, metrics): + """Pretty-print a single row of results.""" + parts = [f"{name:<20}"] + for k, v in metrics.items(): + parts.append(f"{k}={v:.4f}") + print(" ".join(parts)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--hdf5_path", type=str, default=None) + parser.add_argument("--target", type=str, default="mort_hosp") + parser.add_argument( + "--synthetic", action="store_true", default=False + ) + args = parser.parse_args() + + # ---- Load data ---- + if args.synthetic or args.hdf5_path is None: + print("Using synthetic data\n") + X_tr, X_te, y_tr, y_te = make_synthetic_data() + else: + from pyhealth.datasets.califorest_mimic_extract import ( + load_califorest_data, + ) + + X_tr, X_te, y_tr, y_te = load_califorest_data( + args.hdf5_path, target=args.target + ) + + # ---- Ablation 1: Calibration type ---- + print("=" * 60) + print("Ablation 1: Calibration Type") + print("=" * 60) + for ctype in ["isotonic", "logistic", "beta"]: + y_pred = run_califorest( + X_tr, y_tr, X_te, y_te, ctype=ctype + ) + print_results(f"CF-{ctype}", evaluate(y_te, y_pred)) + y_pred = run_rf_nocal(X_tr, y_tr, X_te) + print_results("RF-NoCal", evaluate(y_te, y_pred)) + + # ---- Ablation 2: Prior sensitivity ---- + 
print(f"\n{'=' * 60}") + print("Ablation 2: Prior Sensitivity (alpha0, beta0)") + print("=" * 60) + for alpha0, beta0 in [(50, 12.5), (100, 25), (200, 50), (10, 2.5)]: + y_pred = run_califorest( + X_tr, y_tr, X_te, y_te, + alpha0=alpha0, beta0=beta0, + ) + print_results( + f"a0={alpha0},b0={beta0}", evaluate(y_te, y_pred) + ) + + # ---- Ablation 3: Ensemble size ---- + print(f"\n{'=' * 60}") + print("Ablation 3: Number of Estimators") + print("=" * 60) + for n_est in [50, 100, 200, 300]: + y_pred = run_califorest( + X_tr, y_tr, X_te, y_te, n_estimators=n_est + ) + print_results(f"n_est={n_est}", evaluate(y_te, y_pred)) + + # ---- Ablation 4: Prediction targets (synthetic only) ---- + if args.synthetic or args.hdf5_path is None: + print("\n(Skipping target ablation on synthetic data)") + else: + print(f"\n{'=' * 60}") + print("Ablation 4: Prediction Targets") + print("=" * 60) + from pyhealth.datasets.califorest_mimic_extract import ( + load_califorest_data, + ) + + for tgt in ["mort_hosp", "mort_icu", "los_3", "los_7"]: + Xr, Xe, yr, ye = load_califorest_data( + args.hdf5_path, target=tgt + ) + y_pred = run_califorest(Xr, yr, Xe, ye) + print_results(tgt, evaluate(ye, y_pred)) + + +if __name__ == "__main__": + main() diff --git a/pyhealth/datasets/califorest_mimic_extract.py b/pyhealth/datasets/califorest_mimic_extract.py new file mode 100644 index 000000000..957639ade --- /dev/null +++ b/pyhealth/datasets/califorest_mimic_extract.py @@ -0,0 +1,353 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: MIMIC-Extract HDF5 loader for CaliForest flat features +"""MIMIC-Extract dataset for CaliForest replication. 
+ +Reads the ``all_hourly_data.h5`` file produced by the `MIMIC-Extract +`_ pipeline and +produces a flat (n_patients x n_features) matrix suitable for +tree-based models such as CaliForest. + +The preprocessing follows Park and Ho (2020): + +1. Filter to patients with ``max_hours > 30`` (24 h window + 6 h + gap). +2. Use only the first 24 hours of time-series data. +3. Impute missing values (forward-fill, ICU-stay mean, then 0). +4. Add binary mask and time-since-measured features. +5. Mean-centre features using training-set statistics. +6. Flatten hourly data into a single row per patient. +7. Extract four binary labels: ``mort_hosp``, ``mort_icu``, + ``los_3`` (LOS > 3 days), ``los_7`` (LOS > 7 days). + +The user must have already generated the HDF5 file via the +MIMIC-Extract pipeline or obtained it from GCP (requires PhysioNet +credentials). + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random Forest +for Health Data." ACM CHIL, 2020. +https://doi.org/10.1145/3368555.3384461 +""" + +import logging +import os +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd + +from .base_dataset import BaseDataset + +logger = logging.getLogger(__name__) + +GAP_TIME = 6 # hours +WINDOW_SIZE = 24 # hours +ID_COLS = ["subject_id", "hadm_id", "icustay_id"] + + +def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame: + """Ensure Aggregation Function is the first column level. + + The official MIMIC-Extract HDF5 stores columns as + ``(LEVEL2, Aggregation Function)`` while some generated + versions use ``(Aggregation Function, LEVEL2)``. This + helper normalises to ``(Aggregation Function, LEVEL2)``. + + Args: + df: DataFrame with a two-level MultiIndex on columns. + + Returns: + DataFrame with normalised column order. 
+ """ + if df.columns.nlevels != 2: + return df + agg_vals = {"mean", "count", "std", "mask"} + level0_vals = set(df.columns.get_level_values(0).unique()) + if not level0_vals & agg_vals: + df = df.swaplevel(axis=1).sort_index(axis=1) + return df + + +def _simple_imputer(df: pd.DataFrame) -> pd.DataFrame: + """Impute missing values following Che et al. (2018). + + Produces three feature channels per variable: ``mean`` + (imputed value), ``mask`` (observation indicator), and + ``time_since_measured``. + + Args: + df: Multi-level column DataFrame from MIMIC-Extract + ``vitals_labs`` table. + + Returns: + Imputed DataFrame with ``mean``, ``mask``, and + ``time_since_measured`` columns. + """ + idx = pd.IndexSlice + df = df.copy() + + if df.columns.nlevels > 2: + df.columns = df.columns.droplevel( + [n for n in df.columns.names + if n not in ("Aggregation Function", "LEVEL2")] + ) + + df = _normalize_columns(df) + + df_out = df.loc[:, idx[["mean", "count"], :]] + icustay_means = ( + df_out.loc[:, idx["mean", :]].groupby(ID_COLS).mean() + ) + imputed = ( + df_out.loc[:, idx["mean", :]] + .groupby(ID_COLS) + .ffill() + ) + # Fill remaining NaNs with per-ICU-stay means, then 0 + for col in imputed.columns: + imputed[col] = imputed[col].fillna(icustay_means[col]) + imputed = imputed.fillna(0).copy() + df_out.loc[:, idx["mean", :]] = imputed + + mask = ( + df.loc[:, idx["count", :]] > 0 + ).astype(float).copy() + df_out.loc[:, idx["count", :]] = mask + df_out = df_out.rename( + columns={"count": "mask"}, + level="Aggregation Function", + ) + + is_absent = 1 - df_out.loc[:, idx["mask", :]].copy() + hours_of_absence = is_absent.cumsum() + time_since = hours_of_absence - hours_of_absence[ + is_absent == 0 + ].ffill() + time_since = time_since.rename( + columns={"mask": "time_since_measured"}, + level="Aggregation Function", + ) + df_out = pd.concat((df_out, time_since), axis=1) + df_out.loc[:, idx["time_since_measured", :]] = ( + df_out.loc[:, idx["time_since_measured", :]] + 
.fillna(WINDOW_SIZE + 1) + .copy() + ) + df_out.sort_index(axis=1, inplace=True) + return df_out + + +class CaliForestMIMICExtractDataset(BaseDataset): + """MIMIC-Extract dataset for CaliForest experiments. + + Wraps the MIMIC-Extract ``all_hourly_data.h5`` output and + provides the flat feature matrix and binary labels used in + the CaliForest paper. + + Unlike the existing ``MIMICExtractDataset`` which produces + event-level data for deep learning models, this class + produces the flat, imputed, aggregated feature matrix that + tree-based models like CaliForest require. + + Args: + root: Directory containing ``all_hourly_data.h5``. + tables: Ignored (kept for BaseDataset compatibility). + Defaults to ``["vitals_labs"]``. + dataset_name: Name of the dataset. Defaults to + ``"califorest_mimic_extract"``. + **kwargs: Additional arguments passed to BaseDataset. + + Examples: + >>> from pyhealth.datasets import ( + ... CaliForestMIMICExtractDataset, + ... ) + >>> dataset = CaliForestMIMICExtractDataset( + ... root="/path/to/mimic_extract_output", + ... ) + >>> X_tr, X_te, y_tr, y_te = dataset.load_califorest_data( + ... target="mort_hosp", + ... ) + >>> print(X_tr.shape) # (n_train, 7488) + """ + + def __init__( + self, + root: str, + tables: Optional[List[str]] = None, + dataset_name: Optional[str] = None, + **kwargs, + ) -> None: + self._hdf5_path = os.path.join( + root, "all_hourly_data.h5" + ) + if not os.path.exists(self._hdf5_path): + raise FileNotFoundError( + f"all_hourly_data.h5 not found in {root}. " + "Run the MIMIC-Extract pipeline first." + ) + # BaseDataset requires tables and config_path but we + # handle data loading ourselves via HDF5. Pass minimal + # args so the parent __init__ stores root/dataset_name + # without attempting CSV loading. 
+ super().__init__( + root=root, + tables=tables or ["vitals_labs"], + dataset_name=( + dataset_name or "califorest_mimic_extract" + ), + **kwargs, + ) + + # Override to prevent BaseDataset from scanning CSVs + def load_data(self): # type: ignore[override] + """Not used. Data is loaded via :meth:`load_califorest_data`.""" + raise NotImplementedError( + "Use load_califorest_data() instead." + ) + + def load_califorest_data( + self, + target: str = "mort_hosp", + random_seed: int = 0, + train_frac: float = 0.7, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Load and preprocess data for CaliForest. + + Replicates the exact data pipeline from the CaliForest + paper, producing the flat feature matrix and binary + labels used in the original experiments. + + Args: + target: Prediction target. One of ``"mort_hosp"``, + ``"mort_icu"``, ``"los_3"``, ``"los_7"``. + random_seed: Random seed for train/test split. + train_frac: Fraction of subjects for training. + + Returns: + A tuple ``(X_train, X_test, y_train, y_test)``. + + Examples: + >>> X_tr, X_te, y_tr, y_te = ( + ... dataset.load_califorest_data("mort_hosp") + ... ) + """ + return load_califorest_data( + self._hdf5_path, + target=target, + random_seed=random_seed, + train_frac=train_frac, + ) + + +def load_califorest_data( + hdf5_path: str, + target: str = "mort_hosp", + random_seed: int = 0, + train_frac: float = 0.7, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Load and preprocess MIMIC-Extract data for CaliForest. + + This is a standalone convenience function. For integration + with PyHealth pipelines, prefer + :class:`CaliForestMIMICExtractDataset`. + + Args: + hdf5_path: Path to ``all_hourly_data.h5``. + target: One of ``"mort_hosp"``, ``"mort_icu"``, + ``"los_3"``, ``"los_7"``. + random_seed: Random seed for train/test split. + train_frac: Fraction of subjects for training. + + Returns: + A tuple ``(X_train, X_test, y_train, y_test)``. 
+ """ + statics = pd.read_hdf(hdf5_path, "patients") + data_full = pd.read_hdf(hdf5_path, "vitals_labs") + + # Cohort filter + statics = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME] + + # Labels + Ys = statics.loc[:, ["mort_hosp", "mort_icu", "los_icu"]] + Ys["mort_hosp"] = Ys["mort_hosp"].astype(int) + Ys["mort_icu"] = Ys["mort_icu"].astype(int) + Ys["los_3"] = (Ys["los_icu"] > 3).astype(int) + Ys["los_7"] = (Ys["los_icu"] > 7).astype(int) + Ys.drop(columns=["los_icu"], inplace=True) + + # Filter time-series to cohort and first 24 h + valid_ids = set( + Ys.index.get_level_values("icustay_id") + ) + lvl2 = data_full.loc[ + data_full.index.get_level_values("icustay_id").isin( + valid_ids + ) + & ( + data_full.index.get_level_values("hours_in") + < WINDOW_SIZE + ), + :, + ] + + # Train/test split by subject + subjects = list( + set(lvl2.index.get_level_values("subject_id")) + ) + np.random.seed(random_seed) + subjects = np.random.permutation(subjects) + n_train = int(train_frac * len(subjects)) + train_subj = set(subjects[:n_train]) + test_subj = set(subjects[n_train:]) + + def _filter(df, subj_set): + return df.loc[ + df.index.get_level_values("subject_id").isin( + subj_set + ) + ] + + lvl2_train = _filter(lvl2, train_subj) + lvl2_test = _filter(lvl2, test_subj) + Ys_train = _filter(Ys, train_subj) + Ys_test = _filter(Ys, test_subj) + + # Normalize column level order + lvl2_train = _normalize_columns(lvl2_train) + lvl2_test = _normalize_columns(lvl2_test) + + # Mean-centre using training statistics + idx = pd.IndexSlice + lvl2_means = lvl2_train.loc[:, idx["mean", :]].mean(axis=0) + lvl2_train.loc[:, idx["mean", :]] -= lvl2_means + lvl2_test.loc[:, idx["mean", :]] -= lvl2_means + + # Impute + lvl2_train = _simple_imputer(lvl2_train) + lvl2_test = _simple_imputer(lvl2_test) + + # Flatten + X_train_df = lvl2_train.pivot_table( + index=ID_COLS, columns=["hours_in"] + ) + X_test_df = lvl2_test.pivot_table( + index=ID_COLS, columns=["hours_in"] + ) + + # Align 
features and labels to shared patients + common_tr = Ys_train.index.intersection(X_train_df.index) + common_te = Ys_test.index.intersection(X_test_df.index) + X_train = X_train_df.loc[common_tr].values + X_test = X_test_df.loc[common_te].values + y_train = Ys_train.loc[common_tr, target].values + y_test = Ys_test.loc[common_te, target].values + + logger.info( + f"Loaded MIMIC-Extract: X_train={X_train.shape}, " + f"X_test={X_test.shape}, " + f"prevalence={y_train.mean():.3f}" + ) + return X_train, X_test, y_train, y_test diff --git a/pyhealth/datasets/configs/califorest_mimic_extract.yaml b/pyhealth/datasets/configs/califorest_mimic_extract.yaml new file mode 100644 index 000000000..3fcd1155b --- /dev/null +++ b/pyhealth/datasets/configs/califorest_mimic_extract.yaml @@ -0,0 +1,32 @@ +# CaliForest MIMIC-Extract dataset configuration. +# +# This dataset reads from the MIMIC-Extract all_hourly_data.h5 file +# rather than raw CSVs. The HDF5 contains pre-aggregated hourly +# vitals/labs with mean, count, and std columns. +# +# Tables in the HDF5: +# - patients: static demographics and outcomes +# - vitals_labs: hourly time-series (LEVEL2 x Aggregation Function) +# - interventions: hourly binary intervention indicators +# - codes: ICD9 diagnosis codes +# +# Reference: Wang et al. "MIMIC-Extract: A Data Extraction, +# Preprocessing, and Representation Pipeline for MIMIC-III." 2019. 
+version: "1.0" +format: "hdf5" +tables: + patients: + hdf5_key: "patients" + patient_id: "subject_id" + attributes: + - "mort_hosp" + - "mort_icu" + - "los_icu" + - "max_hours" + - "age" + - "gender" + - "ethnicity" + vitals_labs: + hdf5_key: "vitals_labs" + patient_id: "subject_id" + attributes: "all" diff --git a/pyhealth/metrics/califorest_calibration.py b/pyhealth/metrics/califorest_calibration.py new file mode 100644 index 000000000..a2d01ce27 --- /dev/null +++ b/pyhealth/metrics/califorest_calibration.py @@ -0,0 +1,176 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Six calibration metrics for binary risk prediction +"""CaliForest calibration metrics (Park and Ho, 2020). + +Six commonly published calibration metrics for binary risk +prediction models, as described in Table 1 of the CaliForest +paper. + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random +Forest for Health Data." ACM CHIL, 2020. +https://doi.org/10.1145/3368555.3384461 +""" + +import numpy as np +import pandas as pd + + +def hosmer_lemeshow( + y_true: np.ndarray, + y_score: np.ndarray, + n_groups: int = 10, +) -> float: + """Hosmer-Lemeshow goodness-of-fit test p-value. + + Divides predictions into ``n_groups`` equal-sized bins and + tests whether observed event rates match predicted rates + using a chi-squared statistic. + + A p-value close to 1 indicates good calibration. + + Args: + y_true: Binary ground-truth labels of shape ``(N,)``. + y_score: Predicted probabilities of shape ``(N,)``. + n_groups: Number of quantile groups. Default ``10``. + + Returns: + The Hosmer-Lemeshow p-value (higher is better). 
+ + Examples: + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.2, 0.8, 0.9]) + >>> p = hosmer_lemeshow(y_true, y_score) + """ + from scipy.stats import chi2 + + df = pd.DataFrame({"score": y_score, "target": y_true}) + df = df.sort_values("score") + df["score"] = np.clip(df["score"], 1e-8, 1 - 1e-8) + df["rank"] = range(len(df)) + df["decile"] = pd.qcut( + df["rank"], n_groups, duplicates="raise" + ) + + grp = df.groupby("decile", observed=False) + obs_pos = grp["target"].sum() + obs_neg = grp["target"].count() - obs_pos + exp_pos = grp["score"].sum() + exp_neg = grp["score"].count() - exp_pos + + hl = ( + (obs_pos - exp_pos) ** 2 / exp_pos + + (obs_neg - exp_neg) ** 2 / exp_neg + ).sum() + return float(1 - chi2.cdf(hl, n_groups - 2)) + + +def spiegelhalter( + y_true: np.ndarray, + y_score: np.ndarray, +) -> float: + """Spiegelhalter z-test p-value for calibration. + + Tests whether the Brier score is extreme under the null + hypothesis that predicted probabilities equal true + probabilities. + + A p-value close to 1 indicates good calibration. + + Args: + y_true: Binary ground-truth labels of shape ``(N,)``. + y_score: Predicted probabilities of shape ``(N,)``. + + Returns: + The two-sided Spiegelhalter p-value (higher is better). + + Examples: + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.2, 0.8, 0.9]) + >>> p = spiegelhalter(y_true, y_score) + """ + from scipy.stats import norm + + top = np.sum((y_true - y_score) * (1 - 2 * y_score)) + bot = np.sum( + (1 - 2 * y_score) ** 2 * y_score * (1 - y_score) + ) + z = top / np.sqrt(bot) + return float(norm.sf(np.abs(z)) * 2) + + +def reliability( + y_true: np.ndarray, + y_score: np.ndarray, + n_groups: int = 10, +) -> tuple: + """Reliability-in-the-small and reliability-in-the-large. + + Reliability-in-the-small measures the average squared + difference between observed and predicted event rates across + quantile bins. 
Reliability-in-the-large measures the squared + difference between the overall mean prediction and the + overall observed prevalence. + + Values close to 0 indicate good calibration. + + Args: + y_true: Binary ground-truth labels of shape ``(N,)``. + y_score: Predicted probabilities of shape ``(N,)``. + n_groups: Number of quantile groups. Default ``10``. + + Returns: + A tuple ``(rel_small, rel_large)``. + + Examples: + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.2, 0.8, 0.9]) + >>> rel_s, rel_l = reliability(y_true, y_score) + """ + df = pd.DataFrame({"score": y_score, "target": y_true}) + df = df.sort_values("score") + df["rank"] = range(len(df)) + df["decile"] = pd.qcut( + df["rank"], n_groups, duplicates="raise" + ) + + grp = df.groupby("decile", observed=False) + obs = grp["target"].mean() + exp = grp["score"].mean() + rel_small = float(np.mean((obs - exp) ** 2)) + rel_large = float( + (np.mean(y_true) - np.mean(y_score)) ** 2 + ) + return rel_small, rel_large + + +def scaled_brier_score( + y_true: np.ndarray, + y_score: np.ndarray, +) -> tuple: + """Brier score and scaled (prevalence-adjusted) Brier score. + + The scaled Brier score normalises by the maximum Brier score + achievable by always predicting the prevalence. A scaled + score of 1 indicates perfect calibration. + + Args: + y_true: Binary ground-truth labels of shape ``(N,)``. + y_score: Predicted probabilities of shape ``(N,)``. + + Returns: + A tuple ``(brier, scaled_brier)``. 
+ + Examples: + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.2, 0.8, 0.9]) + >>> brier, scaled = scaled_brier_score(y_true, y_score) + """ + brier = float(np.mean((y_true - y_score) ** 2)) + p = np.mean(y_true) + denom = p * (1 - p) + scaled = 1 - brier / denom if denom > 0 else 0.0 + return brier, float(scaled) diff --git a/pyhealth/models/califorest.py b/pyhealth/models/califorest.py new file mode 100644 index 000000000..a3e945835 --- /dev/null +++ b/pyhealth/models/califorest.py @@ -0,0 +1,357 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Calibrated random forest using OOB variance-weighted calibration +"""CaliForest: Calibrated Random Forest for Health Data. + +This module implements CaliForest (Park and Ho, 2020), a calibrated +random forest that uses out-of-bag (OOB) prediction variance to learn +a robust calibration function without requiring a held-out calibration +set. + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random Forest +for Health Data." ACM Conference on Health, Inference, and Learning, +2020. https://doi.org/10.1145/3368555.3384461 + +Note: + CaliForest wraps scikit-learn's ``DecisionTreeClassifier`` and + calibration estimators internally. The ``forward`` method accepts + and returns ``torch.Tensor`` objects to conform to PyHealth's + ``BaseModel`` interface. 
+""" + +from typing import Any, Dict, List, Optional + +import numpy as np +import torch +import torch.nn as nn +from sklearn.isotonic import IsotonicRegression +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier + +from pyhealth.datasets import SampleDataset +from pyhealth.models.base_model import BaseModel + + +class CaliForest(BaseModel): + """Calibrated Random Forest for health data. + + CaliForest retains per-tree OOB predictions for each training + sample and estimates prediction uncertainty via an Inverse-Gamma + conjugate prior on the noise variance. Samples with low variance + (high certainty) receive larger weights when fitting the + calibration function, yielding better-calibrated probabilities + without sacrificing discrimination. + + Two calibration modes are supported: + + * ``"isotonic"`` (CF-Iso): non-parametric isotonic regression. + * ``"logistic"`` (CF-Logit): Platt scaling via logistic + regression. + * ``"beta"`` (CF-Beta): Beta calibration via logistic + regression on log-transformed predictions. + + Args: + dataset: A ``SampleDataset`` used to query label information. + feature_keys: List of feature keys in the input batch. + label_key: Key for the label in the input batch. + n_estimators: Number of trees in the forest. Default ``300``. + max_depth: Maximum depth of each tree. Default ``10``. + min_samples_split: Minimum samples to split a node. + Default ``3``. + min_samples_leaf: Minimum samples in a leaf. Default ``1``. + ctype: Calibration type, ``"isotonic"``, ``"logistic"``, + or ``"beta"``. Default ``"isotonic"``. + alpha0: Prior shape parameter for the Inverse-Gamma + distribution. Default ``100``. + beta0: Prior scale parameter for the Inverse-Gamma + distribution. Default ``25`` (prior variance = + beta0 / alpha0 = 0.25). + + Examples: + >>> from pyhealth.datasets import create_sample_dataset + >>> samples = [ + ... { + ... "patient_id": "p0", + ... "visit_id": "v0", + ... 
"features": [0.1] * 10, + ... "label": 0, + ... }, + ... { + ... "patient_id": "p1", + ... "visit_id": "v1", + ... "features": [0.9] * 10, + ... "label": 1, + ... }, + ... ] + >>> dataset = create_sample_dataset( + ... samples=samples, + ... input_schema={"features": "tensor"}, + ... output_schema={"label": "binary"}, + ... dataset_name="demo", + ... ) + >>> model = CaliForest( + ... dataset=dataset, + ... feature_keys=["features"], + ... label_key="label", + ... n_estimators=50, + ... max_depth=5, + ... ) + """ + + def __init__( + self, + dataset: SampleDataset, + feature_keys: List[str], + label_key: str, + n_estimators: int = 300, + max_depth: int = 10, + min_samples_split: int = 3, + min_samples_leaf: int = 1, + ctype: str = "isotonic", + alpha0: float = 100.0, + beta0: float = 25.0, + ) -> None: + super().__init__(dataset) + self.feature_keys = feature_keys + self.label_key = label_key + self.n_estimators = n_estimators + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.ctype = ctype + self.alpha0 = alpha0 + self.beta0 = beta0 + + self.estimators_: List[DecisionTreeClassifier] = [] + self.calibrator_: Optional[Any] = None + self.is_fitted_ = False + + # Dummy parameter so BaseModel.device works + self._dummy_param = nn.Parameter(torch.empty(0)) + + # ---------------------------------------------------------- + # Training + # ---------------------------------------------------------- + + def fit( + self, + train_dataloader: Any, + ) -> "CaliForest": + """Fits the CaliForest model on training data. + + Collects all batches from the dataloader, trains individual + decision trees with bootstrap sampling, computes OOB + predictions and inverse-variance sample weights, then fits + the calibration function. + + Args: + train_dataloader: A PyTorch ``DataLoader`` yielding + batches with keys matching ``feature_keys`` and + ``label_key``. + + Returns: + The fitted ``CaliForest`` instance. 
+ """ + X_parts, y_parts = [], [] + for batch in train_dataloader: + feats = [batch[k].numpy() for k in self.feature_keys] + X_parts.append(np.concatenate(feats, axis=1)) + y_parts.append(batch[self.label_key].numpy().ravel()) + X = np.concatenate(X_parts, axis=0) + y = np.concatenate(y_parts, axis=0) + self._fit_numpy(X, y) + return self + + def _fit_numpy(self, X: np.ndarray, y: np.ndarray) -> None: + """Core fitting logic on numpy arrays. + + Args: + X: Feature matrix of shape ``(n_samples, n_features)``. + y: Binary label array of shape ``(n_samples,)``. + """ + n = X.shape[0] + self.estimators_ = [] + + # Build calibrator + if self.ctype == "logistic": + self.calibrator_ = LogisticRegression( + penalty=None, solver="saga", max_iter=5000 + ) + elif self.ctype == "beta": + self.calibrator_ = LogisticRegression( + penalty=None, solver="saga", max_iter=5000 + ) + else: + self.calibrator_ = IsotonicRegression( + y_min=0, y_max=1, out_of_bounds="clip" + ) + + # OOB prediction matrix: (n_samples, n_estimators), NaN if + # sample was in-bag for that tree + Y_oob = np.full((n, self.n_estimators), np.nan) + n_oob = np.zeros(n) + IB = np.zeros((n, self.n_estimators), dtype=int) + OOB = np.full((n, self.n_estimators), True) + + for eid in range(self.n_estimators): + IB[:, eid] = np.random.choice(n, n) + OOB[IB[:, eid], eid] = False + + for eid in range(self.n_estimators): + est = DecisionTreeClassifier( + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + ) + ib_idx = IB[:, eid] + oob_mask = OOB[:, eid] + est.fit(X[ib_idx], y[ib_idx]) + proba = est.predict_proba(X[oob_mask]) + # Handle case where tree only sees one class + if proba.shape[1] == 2: + Y_oob[oob_mask, eid] = proba[:, 1] + else: + Y_oob[oob_mask, eid] = proba[:, 0] + n_oob[oob_mask] += 1 + self.estimators_.append(est) + + # Compute inverse-variance weights via Inverse-Gamma update + valid = n_oob > 1 + Y_oob_valid = Y_oob[valid] + n_oob_valid = 
n_oob[valid] + z_hat = np.nanmean(Y_oob_valid, axis=1) + z_true = y[valid] + beta = ( + self.beta0 + + np.nanvar(Y_oob_valid, axis=1) * n_oob_valid / 2 + ) + alpha = self.alpha0 + n_oob_valid / 2 + z_weight = alpha / beta + + # Fit calibrator + if self.ctype == "logistic": + self.calibrator_.fit( + z_hat[:, np.newaxis], z_true, z_weight + ) + elif self.ctype == "beta": + self.calibrator_.fit( + self._beta_features(z_hat), z_true, z_weight + ) + else: + self.calibrator_.fit(z_hat, z_true, z_weight) + + self.is_fitted_ = True + + # ---------------------------------------------------------- + # Prediction + # ---------------------------------------------------------- + + def _beta_features(self, p: np.ndarray) -> np.ndarray: + """Compute Beta calibration features from probabilities. + + Beta calibration (Kull et al., 2017) fits a logistic + regression on three log-transformed features of the + predicted probability. + + Args: + p: Predicted probabilities of shape ``(n,)``. + + Returns: + Feature matrix of shape ``(n, 3)``. + """ + eps = 1e-7 + p = np.clip(p, eps, 1 - eps) + return np.column_stack([ + np.log(p / (1 - p)), + np.log(p), + np.log(1 - p), + ]) + + def _predict_proba_numpy(self, X: np.ndarray) -> np.ndarray: + """Predicts calibrated probabilities on numpy input. + + Args: + X: Feature matrix of shape ``(n_samples, n_features)``. + + Returns: + Array of shape ``(n_samples,)`` with calibrated + probabilities for the positive class. 
+ """ + z = np.zeros(X.shape[0]) + for est in self.estimators_: + proba = est.predict_proba(X) + if proba.shape[1] == 2: + z += proba[:, 1] + else: + z += proba[:, 0] + z /= len(self.estimators_) + + if self.ctype == "logistic": + return self.calibrator_.predict_proba( + z[:, np.newaxis] + )[:, 1] + if self.ctype == "beta": + return self.calibrator_.predict_proba( + self._beta_features(z) + )[:, 1] + return self.calibrator_.predict(z) + + # ---------------------------------------------------------- + # PyHealth interface + # ---------------------------------------------------------- + + def forward( + self, **kwargs: torch.Tensor + ) -> Dict[str, torch.Tensor]: + """Forward pass conforming to PyHealth's ``BaseModel``. + + During training (``self.training is True`` and model is not + yet fitted), this method stores data and returns a dummy loss. + After ``fit()`` has been called, it returns calibrated + predictions. + + Args: + **kwargs: Keyword arguments containing tensors for each + feature key and the label key. + + Returns: + A dictionary with keys ``loss``, ``y_prob``, ``y_true``, + and ``logit``. 
+ """ + feats = [kwargs[k] for k in self.feature_keys] + X_tensor = torch.cat(feats, dim=-1).float() + X_np = X_tensor.detach().cpu().numpy() + + if self.label_key in kwargs: + y_true = kwargs[self.label_key].float() + else: + y_true = torch.zeros(X_np.shape[0]) + + if not self.is_fitted_: + # Before fit: return dummy outputs + y_prob = torch.full_like(y_true, 0.5) + logit = torch.zeros_like(y_true) + loss = torch.tensor(0.0, requires_grad=True) + else: + proba = self._predict_proba_numpy(X_np) + y_prob = torch.tensor( + proba, dtype=torch.float32 + ).unsqueeze(-1) + logit = torch.log( + y_prob.clamp(1e-7, 1 - 1e-7) + / (1 - y_prob.clamp(1e-7, 1 - 1e-7)) + ) + loss = nn.functional.binary_cross_entropy( + y_prob.squeeze(-1), + y_true.squeeze(-1).float(), + ) + + return { + "loss": loss, + "y_prob": y_prob, + "y_true": y_true, + "logit": logit, + } diff --git a/pyhealth/tasks/mimic_extract_califorest.py b/pyhealth/tasks/mimic_extract_califorest.py new file mode 100644 index 000000000..cc4663640 --- /dev/null +++ b/pyhealth/tasks/mimic_extract_califorest.py @@ -0,0 +1,88 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Binary prediction task for mortality and length of stay +"""MIMIC-Extract flat-feature tasks for CaliForest. + +Defines a single parameterised task that converts MIMIC-Extract +hourly time-series data into a flat feature vector suitable for +tree-based models such as CaliForest. Four binary prediction +targets are supported: + +* ``mort_hosp`` – in-hospital mortality +* ``mort_icu`` – in-ICU mortality +* ``los_3`` – ICU length-of-stay > 3 days +* ``los_7`` – ICU length-of-stay > 7 days + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random Forest +for Health Data." ACM CHIL, 2020. 
+""" + +from typing import Any, Dict, List + +from .base_task import BaseTask + + +class MIMICExtractCaliForestTask(BaseTask): + """Flat-feature binary prediction task for MIMIC-Extract data. + + This task is designed for the ``MIMICExtractFlatDataset`` which + already produces flat feature vectors and labels. The task + simply selects the requested prediction target. + + Args: + target: One of ``"mort_hosp"``, ``"mort_icu"``, + ``"los_3"``, or ``"los_7"``. Default ``"mort_hosp"``. + + Examples: + >>> task = MIMICExtractCaliForestTask(target="mort_hosp") + >>> task.task_name + 'mimic_extract_califorest_mort_hosp' + >>> task.output_schema + {'label': 'binary'} + """ + + VALID_TARGETS = ("mort_hosp", "mort_icu", "los_3", "los_7") + + def __init__(self, target: str = "mort_hosp") -> None: + if target not in self.VALID_TARGETS: + raise ValueError( + f"target must be one of {self.VALID_TARGETS}, " + f"got '{target}'" + ) + self.target = target + self.task_name = f"mimic_extract_califorest_{target}" + self.input_schema: Dict[str, str] = { + "features": "tensor", + } + self.output_schema: Dict[str, str] = { + "label": "binary", + } + + def __call__(self, patient: Any) -> List[Dict]: + """Extracts a single sample from a patient record. + + Expects the patient object to carry ``features`` (a flat + numeric vector) and the target label attribute. + + Args: + patient: A ``Patient`` object from + ``MIMICExtractFlatDataset``. + + Returns: + A list containing one sample dictionary, or an empty + list if the patient lacks the required data. 
+ """ + features = getattr(patient, "features", None) + label = getattr(patient, self.target, None) + if features is None or label is None: + return [] + return [ + { + "patient_id": patient.patient_id, + "features": features, + "label": int(label), + } + ] diff --git a/tests/test_calibration_metrics.py b/tests/test_calibration_metrics.py new file mode 100644 index 000000000..200dab2a4 --- /dev/null +++ b/tests/test_calibration_metrics.py @@ -0,0 +1,91 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Tests for CaliForest calibration metrics +"""Tests for CaliForest calibration metrics.""" + +import numpy as np +import pytest + +from pyhealth.metrics.califorest_calibration import ( + hosmer_lemeshow, + reliability, + scaled_brier_score, + spiegelhalter, +) + + +def _perfect_data(): + y_true = np.array([0] * 50 + [1] * 50) + y_score = np.array([0.0] * 50 + [1.0] * 50) + return y_true, y_score + + +def _random_data(seed=42): + np.random.seed(seed) + y_true = np.random.randint(0, 2, 200) + y_score = np.clip( + y_true + np.random.randn(200) * 0.2, 0.01, 0.99 + ) + return y_true, y_score + + +class TestScaledBrierScore: + def test_perfect(self): + brier, scaled = scaled_brier_score(*_perfect_data()) + assert brier == pytest.approx(0.0, abs=1e-7) + assert scaled == pytest.approx(1.0, abs=1e-7) + + def test_returns_tuple(self): + result = scaled_brier_score(*_random_data()) + assert isinstance(result, tuple) + assert len(result) == 2 + + def test_brier_range(self): + brier, scaled = scaled_brier_score(*_random_data()) + assert 0 <= brier <= 1 + assert scaled <= 1 + + +class TestHosmerLemeshow: + def test_returns_float(self): + p = hosmer_lemeshow(*_random_data()) + assert isinstance(p, float) + + def test_range(self): + p = 
"""Tests for CaliForest calibration metrics."""

import numpy as np
import pytest

from pyhealth.metrics.califorest_calibration import (
    hosmer_lemeshow,
    reliability,
    scaled_brier_score,
    spiegelhalter,
)


def _perfect_data():
    """Perfectly separated scores: prediction equals the label."""
    labels = np.array([0] * 50 + [1] * 50)
    scores = np.array([0.0] * 50 + [1.0] * 50)
    return labels, scores


def _random_data(seed=42):
    """Noisy but informative scores clipped into (0, 1)."""
    np.random.seed(seed)
    labels = np.random.randint(0, 2, 200)
    scores = np.clip(
        labels + np.random.randn(200) * 0.2, 0.01, 0.99
    )
    return labels, scores


class TestScaledBrierScore:
    def test_perfect(self):
        brier, scaled = scaled_brier_score(*_perfect_data())
        assert scaled == pytest.approx(1.0, abs=1e-7)
        assert brier == pytest.approx(0.0, abs=1e-7)

    def test_returns_tuple(self):
        out = scaled_brier_score(*_random_data())
        assert isinstance(out, tuple)
        assert len(out) == 2

    def test_brier_range(self):
        brier, scaled = scaled_brier_score(*_random_data())
        assert scaled <= 1
        assert 0 <= brier <= 1


class TestHosmerLemeshow:
    def test_returns_float(self):
        pval = hosmer_lemeshow(*_random_data())
        assert isinstance(pval, float)

    def test_range(self):
        pval = hosmer_lemeshow(*_random_data())
        assert 0 <= pval <= 1

    def test_well_calibrated_high_pvalue(self):
        labels, scores = _random_data()
        pval = hosmer_lemeshow(labels, scores)
        # Well-calibrated data should have a reasonably high p-value
        assert pval > 0.01


class TestSpiegelhalter:
    def test_returns_float(self):
        pval = spiegelhalter(*_random_data())
        assert isinstance(pval, float)

    def test_range(self):
        pval = spiegelhalter(*_random_data())
        assert 0 <= pval <= 1


class TestReliability:
    def test_returns_tuple(self):
        out = reliability(*_random_data())
        assert isinstance(out, tuple)
        assert len(out) == 2

    def test_perfect_calibration(self):
        rel_small, rel_large = reliability(*_perfect_data())
        assert rel_large == pytest.approx(0.0, abs=1e-7)

    def test_non_negative(self):
        rel_small, rel_large = reliability(*_random_data())
        assert rel_small >= 0
        assert rel_large >= 0
"""Tests for the CaliForest model."""

import numpy as np
import pytest
import torch

from pyhealth.datasets import create_sample_dataset, get_dataloader
from pyhealth.models.califorest import CaliForest


def _make_dataset(n=40, d=10):
    """Build a small synthetic binary-classification dataset."""
    np.random.seed(42)
    records = []
    for idx in range(n):
        # Second half of the samples are the positive class, shifted
        # by +1 in every feature so the classes are separable.
        y = int(idx >= n // 2)
        vec = np.random.randn(d).tolist()
        if y == 1:
            vec = [v + 1.0 for v in vec]
        records.append(
            {
                "patient_id": f"p{idx}",
                "visit_id": f"v{idx}",
                "features": vec,
                "label": y,
            }
        )
    return create_sample_dataset(
        samples=records,
        input_schema={"features": "tensor"},
        output_schema={"label": "binary"},
        dataset_name="synth",
    )


class TestCaliForestInstantiation:
    def test_default_params(self):
        model = CaliForest(
            dataset=_make_dataset(),
            feature_keys=["features"],
            label_key="label",
            n_estimators=10,
        )
        assert model.is_fitted_ is False
        assert model.n_estimators == 10

    def test_invalid_ctype_still_creates(self):
        model = CaliForest(
            dataset=_make_dataset(),
            feature_keys=["features"],
            label_key="label",
            ctype="isotonic",
        )
        assert model.ctype == "isotonic"


class TestCaliForestForward:
    def test_unfitted_forward(self):
        ds = _make_dataset()
        model = CaliForest(
            dataset=ds,
            feature_keys=["features"],
            label_key="label",
            n_estimators=5,
        )
        batch = next(iter(get_dataloader(ds, batch_size=8, shuffle=False)))
        out = model(**batch)
        for key in ("loss", "y_prob", "y_true", "logit"):
            assert key in out

    def test_fitted_forward_isotonic(self):
        ds = _make_dataset()
        model = CaliForest(
            dataset=ds,
            feature_keys=["features"],
            label_key="label",
            n_estimators=10,
            max_depth=3,
            ctype="isotonic",
        )
        model.fit(get_dataloader(ds, batch_size=40, shuffle=False))
        assert model.is_fitted_

        batch = next(iter(get_dataloader(ds, batch_size=8, shuffle=False)))
        out = model(**batch)
        probs = out["y_prob"]
        assert probs.shape[0] == 8
        assert (probs >= 0).all()
        assert (probs <= 1).all()

    def test_fitted_forward_logistic(self):
        ds = _make_dataset()
        model = CaliForest(
            dataset=ds,
            feature_keys=["features"],
            label_key="label",
            n_estimators=10,
            max_depth=3,
            ctype="logistic",
        )
        loader = get_dataloader(ds, batch_size=40, shuffle=False)
        model.fit(loader)
        assert model.is_fitted_

        out = model(**next(iter(loader)))
        probs = out["y_prob"]
        assert (probs >= 0).all()
        assert (probs <= 1).all()

    def test_fitted_forward_beta(self):
        ds = _make_dataset()
        model = CaliForest(
            dataset=ds,
            feature_keys=["features"],
            label_key="label",
            n_estimators=10,
            max_depth=3,
            ctype="beta",
        )
        loader = get_dataloader(ds, batch_size=40, shuffle=False)
        model.fit(loader)
        assert model.is_fitted_

        out = model(**next(iter(loader)))
        probs = out["y_prob"]
        assert (probs >= 0).all()
        assert (probs <= 1).all()


class TestCaliForestNumpyFit:
    def test_fit_and_predict(self):
        np.random.seed(0)
        X = np.random.randn(60, 5)
        y = (X[:, 0] > 0).astype(int)

        model = CaliForest(
            dataset=_make_dataset(),
            feature_keys=["features"],
            label_key="label",
            n_estimators=20,
            max_depth=3,
        )
        model._fit_numpy(X, y)
        probs = model._predict_proba_numpy(X)
        assert probs.shape == (60,)
        assert np.all(probs >= 0) and np.all(probs <= 1)
+ "ethnicity": ["W", "W", "B"], + }, + index=pd.MultiIndex.from_tuples( + [(1, 100, 1000), (2, 200, 2000), (3, 300, 3000)], + names=["subject_id", "hadm_id", "icustay_id"], + ), + ) + rows = [] + for sid, hid, iid in [(1, 100, 1000), (2, 200, 2000), (3, 300, 3000)]: + for h in range(3): + rows.append((sid, hid, iid, h)) + idx = pd.MultiIndex.from_tuples( + rows, names=["subject_id", "hadm_id", "icustay_id", "hours_in"], + ) + np.random.seed(0) + n = len(rows) + data = { + ("heart_rate", "mean"): np.random.randn(n), + ("heart_rate", "count"): np.random.randint(0, 3, n).astype(float), + ("heart_rate", "std"): np.abs(np.random.randn(n)), + ("sodium", "mean"): np.random.randn(n), + ("sodium", "count"): np.random.randint(0, 3, n).astype(float), + ("sodium", "std"): np.abs(np.random.randn(n)), + } + vitals = pd.DataFrame(data, index=idx) + vitals.columns = pd.MultiIndex.from_tuples( + vitals.columns, names=["LEVEL2", "Aggregation Function"] + ) + patients.to_hdf(path, key="patients", mode="w") + vitals.to_hdf(path, key="vitals_labs", mode="a") + + +class TestDatasetInstantiation: + def test_creates_with_valid_path(self): + with tempfile.TemporaryDirectory() as tmpdir: + _make_fake_hdf5(os.path.join(tmpdir, "all_hourly_data.h5")) + ds = CaliForestMIMICExtractDataset(root=tmpdir) + assert ds.dataset_name == "califorest_mimic_extract" + + def test_raises_on_missing_file(self): + with tempfile.TemporaryDirectory() as tmpdir: + with pytest.raises(FileNotFoundError): + CaliForestMIMICExtractDataset(root=tmpdir) + + +class TestNormalizeColumns: + def test_swaps_when_agg_is_level1(self): + cols = pd.MultiIndex.from_tuples( + [("hr", "mean"), ("hr", "count")], + names=["LEVEL2", "Aggregation Function"], + ) + df = pd.DataFrame(np.ones((2, 2)), columns=cols) + result = _normalize_columns(df) + assert result.columns.get_level_values(0)[0] in {"mean", "count"} + + def test_noop_when_agg_is_level0(self): + cols = pd.MultiIndex.from_tuples( + [("mean", "hr"), ("count", "hr")], + 
names=["Aggregation Function", "LEVEL2"], + ) + df = pd.DataFrame(np.ones((2, 2)), columns=cols) + result = _normalize_columns(df) + assert result.columns.get_level_values(0)[0] == "mean" + + +class TestSimpleImputer: + def test_output_has_three_channels(self): + cols = pd.MultiIndex.from_tuples( + [("mean", "hr"), ("count", "hr"), ("std", "hr")], + names=["Aggregation Function", "LEVEL2"], + ) + idx = pd.MultiIndex.from_tuples( + [(1, 1, 1, 0), (1, 1, 1, 1)], + names=["subject_id", "hadm_id", "icustay_id", "hours_in"], + ) + df = pd.DataFrame( + [[1.0, 1.0, 0.1], [np.nan, 0.0, 0.0]], + index=idx, columns=cols, + ) + result = _simple_imputer(df) + agg_funcs = set(result.columns.get_level_values(0).unique()) + assert "mean" in agg_funcs + assert "mask" in agg_funcs + assert "time_since_measured" in agg_funcs + + +class TestLoadCaliforestData: + def test_returns_aligned_arrays(self): + with tempfile.TemporaryDirectory() as tmpdir: + _make_fake_hdf5(os.path.join(tmpdir, "all_hourly_data.h5")) + ds = CaliForestMIMICExtractDataset(root=tmpdir) + X_tr, X_te, y_tr, y_te = ds.load_califorest_data( + target="mort_hosp", train_frac=0.67, + ) + assert X_tr.ndim == 2 + assert X_te.ndim == 2 + assert len(y_tr) == X_tr.shape[0] + assert len(y_te) == X_te.shape[0] + assert set(np.unique(np.concatenate([y_tr, y_te]))).issubset({0, 1}) diff --git a/tests/test_mimic_extract_califorest_task.py b/tests/test_mimic_extract_califorest_task.py new file mode 100644 index 000000000..940524b65 --- /dev/null +++ b/tests/test_mimic_extract_califorest_task.py @@ -0,0 +1,69 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Tests for MIMIC-Extract CaliForest task +"""Tests for MIMICExtractCaliForestTask.""" + +import pytest + +from pyhealth.tasks.mimic_extract_califorest 
"""Tests for MIMICExtractCaliForestTask."""

import pytest

from pyhealth.tasks.mimic_extract_califorest import (
    MIMICExtractCaliForestTask,
)


class _FakePatient:
    """Attribute-bag stand-in for a Patient record."""

    def __init__(self, pid, features, mort_hosp, mort_icu, los_3, los_7):
        self.patient_id = pid
        self.features = features
        self.mort_hosp = mort_hosp
        self.mort_icu = mort_icu
        self.los_3 = los_3
        self.los_7 = los_7


class TestTaskInit:
    def test_valid_targets(self):
        for name in ("mort_hosp", "mort_icu", "los_3", "los_7"):
            task = MIMICExtractCaliForestTask(target=name)
            assert name in task.task_name
            assert task.target == name

    def test_invalid_target(self):
        with pytest.raises(ValueError):
            MIMICExtractCaliForestTask(target="invalid")

    def test_schema(self):
        task = MIMICExtractCaliForestTask()
        assert "features" in task.input_schema
        assert "label" in task.output_schema


class TestTaskCall:
    def test_produces_sample(self):
        task = MIMICExtractCaliForestTask(target="mort_hosp")
        record = _FakePatient("p0", [0.1] * 10, 1, 0, 1, 0)
        samples = task(record)
        assert len(samples) == 1
        sample = samples[0]
        assert sample["label"] == 1
        assert sample["features"] == [0.1] * 10

    def test_different_targets(self):
        record = _FakePatient("p0", [0.5] * 5, 0, 1, 1, 0)
        expected = {
            "mort_hosp": 0,
            "mort_icu": 1,
            "los_3": 1,
            "los_7": 0,
        }
        for target, label in expected.items():
            task = MIMICExtractCaliForestTask(target)
            assert task(record)[0]["label"] == label

    def test_missing_features(self):
        task = MIMICExtractCaliForestTask()
        record = _FakePatient("p0", None, 1, 0, 0, 0)
        assert task(record) == []

    def test_missing_label(self):
        task = MIMICExtractCaliForestTask(target="mort_hosp")
        record = _FakePatient("p0", [1.0], None, 0, 0, 0)
        assert task(record) == []