From 9ca079802db5774eb451b7285cf904304d8faa70 Mon Sep 17 00:00:00 2001 From: cgigliob Date: Sun, 12 Apr 2026 19:25:24 -0700 Subject: [PATCH] Add CaliForest model, data loader, task, and calibration metrics --- docs/api/datasets.rst | 1 + ...datasets.CaliForestMIMICExtractDataset.rst | 16 + docs/api/models.rst | 1 + .../api/models/pyhealth.models.CaliForest.rst | 7 + docs/api/tasks.rst | 1 + ...ealth.tasks.MIMICExtractCaliForestTask.rst | 7 + ...rest_mimic_extract_mortality_califorest.py | 192 ++++++++++ pyhealth/datasets/califorest_mimic_extract.py | 353 +++++++++++++++++ .../configs/califorest_mimic_extract.yaml | 32 ++ pyhealth/metrics/califorest_calibration.py | 176 +++++++++ pyhealth/models/califorest.py | 357 ++++++++++++++++++ pyhealth/tasks/mimic_extract_califorest.py | 88 +++++ tests/test_calibration_metrics.py | 91 +++++ tests/test_califorest.py | 159 ++++++++ .../test_califorest_mimic_extract_dataset.py | 131 +++++++ tests/test_mimic_extract_califorest_task.py | 69 ++++ 16 files changed, 1681 insertions(+) create mode 100644 docs/api/datasets/pyhealth.datasets.CaliForestMIMICExtractDataset.rst create mode 100644 docs/api/models/pyhealth.models.CaliForest.rst create mode 100644 docs/api/tasks/pyhealth.tasks.MIMICExtractCaliForestTask.rst create mode 100644 examples/califorest_mimic_extract_mortality_califorest.py create mode 100644 pyhealth/datasets/califorest_mimic_extract.py create mode 100644 pyhealth/datasets/configs/califorest_mimic_extract.yaml create mode 100644 pyhealth/metrics/califorest_calibration.py create mode 100644 pyhealth/models/califorest.py create mode 100644 pyhealth/tasks/mimic_extract_califorest.py create mode 100644 tests/test_calibration_metrics.py create mode 100644 tests/test_califorest.py create mode 100644 tests/test_califorest_mimic_extract_dataset.py create mode 100644 tests/test_mimic_extract_califorest_task.py diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst index b02439d26..6259bcca1 100644 --- 
a/docs/api/datasets.rst +++ b/docs/api/datasets.rst @@ -243,5 +243,6 @@ Available Datasets datasets/pyhealth.datasets.ClinVarDataset datasets/pyhealth.datasets.COSMICDataset datasets/pyhealth.datasets.TCGAPRADDataset + datasets/pyhealth.datasets.CaliForestMIMICExtractDataset datasets/pyhealth.datasets.splitter datasets/pyhealth.datasets.utils diff --git a/docs/api/datasets/pyhealth.datasets.CaliForestMIMICExtractDataset.rst b/docs/api/datasets/pyhealth.datasets.CaliForestMIMICExtractDataset.rst new file mode 100644 index 000000000..52d8d01b0 --- /dev/null +++ b/docs/api/datasets/pyhealth.datasets.CaliForestMIMICExtractDataset.rst @@ -0,0 +1,16 @@ +pyhealth.datasets.CaliForestMIMICExtractDataset +================================================= + +Overview +-------- + +MIMIC-Extract dataset loader for CaliForest experiments. Reads the +``all_hourly_data.h5`` file produced by the MIMIC-Extract pipeline and +produces a flat feature matrix suitable for tree-based models. + +.. autoclass:: pyhealth.datasets.califorest_mimic_extract.CaliForestMIMICExtractDataset + :members: + :undoc-members: + :show-inheritance: + +.. autofunction:: pyhealth.datasets.califorest_mimic_extract.load_califorest_data diff --git a/docs/api/models.rst b/docs/api/models.rst index 7368dec94..6ce96c93b 100644 --- a/docs/api/models.rst +++ b/docs/api/models.rst @@ -198,6 +198,7 @@ API Reference models/pyhealth.models.TCN models/pyhealth.models.TFMTokenizer models/pyhealth.models.GAN + models/pyhealth.models.CaliForest models/pyhealth.models.VAE models/pyhealth.models.SDOH models/pyhealth.models.VisionEmbeddingModel diff --git a/docs/api/models/pyhealth.models.CaliForest.rst b/docs/api/models/pyhealth.models.CaliForest.rst new file mode 100644 index 000000000..1c44bc715 --- /dev/null +++ b/docs/api/models/pyhealth.models.CaliForest.rst @@ -0,0 +1,7 @@ +pyhealth.models.CaliForest +=================================== + +.. 
autoclass:: pyhealth.models.califorest.CaliForest + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/tasks.rst b/docs/api/tasks.rst index 399b8f1aa..0b4c9804d 100644 --- a/docs/api/tasks.rst +++ b/docs/api/tasks.rst @@ -229,3 +229,4 @@ Available Tasks Mutation Pathogenicity (COSMIC) Cancer Survival Prediction (TCGA) Cancer Mutation Burden (TCGA) + MIMIC-Extract CaliForest diff --git a/docs/api/tasks/pyhealth.tasks.MIMICExtractCaliForestTask.rst b/docs/api/tasks/pyhealth.tasks.MIMICExtractCaliForestTask.rst new file mode 100644 index 000000000..12d5ebe67 --- /dev/null +++ b/docs/api/tasks/pyhealth.tasks.MIMICExtractCaliForestTask.rst @@ -0,0 +1,7 @@ +pyhealth.tasks.MIMICExtractCaliForestTask +========================================== + +.. autoclass:: pyhealth.tasks.mimic_extract_califorest.MIMICExtractCaliForestTask + :members: + :undoc-members: + :show-inheritance: diff --git a/examples/califorest_mimic_extract_mortality_califorest.py b/examples/califorest_mimic_extract_mortality_califorest.py new file mode 100644 index 000000000..e77d64585 --- /dev/null +++ b/examples/califorest_mimic_extract_mortality_califorest.py @@ -0,0 +1,192 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Ablation study for CaliForest hyperparameters +"""MIMIC-Extract CaliForest ablation study. + +This script demonstrates the full CaliForest pipeline and runs +ablation experiments on: + +1. **Calibration type**: CF-Iso vs CF-Logit vs RF-NoCal +2. **Prior sensitivity**: varying alpha0/beta0 +3. **Ensemble size**: 50, 100, 200, 300 trees +4. 
**Prediction target**: mort_hosp, mort_icu, los_3, los_7 + +Usage with real MIMIC-Extract data:: + + python mimic_extract_califorest_ablation.py \\ + --hdf5_path /path/to/all_hourly_data.h5 + +Usage with synthetic data (for testing):: + + python mimic_extract_califorest_ablation.py --synthetic + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random Forest +for Health Data." ACM CHIL, 2020. +https://doi.org/10.1145/3368555.3384461 +""" + +import argparse +import numpy as np +from sklearn.metrics import roc_auc_score + + +def make_synthetic_data(n=500, d=50, seed=42): + """Generate synthetic binary classification data.""" + np.random.seed(seed) + X = np.random.randn(n, d) + logits = X[:, 0] + 0.5 * X[:, 1] - 0.3 * X[:, 2] + y = (logits > 0).astype(int) + split = int(0.7 * n) + return X[:split], X[split:], y[:split], y[split:] + + +def evaluate(y_true, y_pred): + """Compute AUROC and all six calibration metrics.""" + from pyhealth.metrics.califorest_calibration import ( + hosmer_lemeshow, + reliability, + scaled_brier_score, + spiegelhalter, + ) + + auc = roc_auc_score(y_true, y_pred) + brier, brier_scaled = scaled_brier_score(y_true, y_pred) + hl = hosmer_lemeshow(y_true, y_pred) + sp = spiegelhalter(y_true, y_pred) + rel_s, rel_l = reliability(y_true, y_pred) + return { + "AUROC": auc, + "Brier": brier, + "Scaled Brier": brier_scaled, + "HL p-value": hl, + "Spiegelhalter p": sp, + "Rel-small": rel_s, + "Rel-large": rel_l, + } + + +def run_califorest(X_train, y_train, X_test, y_test, **kwargs): + """Train CaliForest and return predictions.""" + from pyhealth.models.califorest import CaliForest + + # Use the numpy-level API directly for efficiency + model = CaliForest.__new__(CaliForest) + model.n_estimators = kwargs.get("n_estimators", 300) + model.max_depth = kwargs.get("max_depth", 10) + model.min_samples_split = kwargs.get("min_samples_split", 3) + model.min_samples_leaf = kwargs.get("min_samples_leaf", 1) + model.ctype = kwargs.get("ctype", "isotonic") + 
model.alpha0 = kwargs.get("alpha0", 100.0) + model.beta0 = kwargs.get("beta0", 25.0) + model.estimators_ = [] + model.calibrator_ = None + model.is_fitted_ = False + + model._fit_numpy(X_train, y_train) + y_pred = model._predict_proba_numpy(X_test) + return y_pred + + +def run_rf_nocal(X_train, y_train, X_test, **kwargs): + """Train uncalibrated Random Forest baseline.""" + from sklearn.ensemble import RandomForestClassifier + + rf = RandomForestClassifier( + n_estimators=kwargs.get("n_estimators", 300), + max_depth=kwargs.get("max_depth", 10), + min_samples_split=kwargs.get("min_samples_split", 3), + min_samples_leaf=kwargs.get("min_samples_leaf", 1), + ) + rf.fit(X_train, y_train) + return rf.predict_proba(X_test)[:, 1] + + +def print_results(name, metrics): + """Pretty-print a single row of results.""" + parts = [f"{name:<20}"] + for k, v in metrics.items(): + parts.append(f"{k}={v:.4f}") + print(" ".join(parts)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--hdf5_path", type=str, default=None) + parser.add_argument("--target", type=str, default="mort_hosp") + parser.add_argument( + "--synthetic", action="store_true", default=False + ) + args = parser.parse_args() + + # ---- Load data ---- + if args.synthetic or args.hdf5_path is None: + print("Using synthetic data\n") + X_tr, X_te, y_tr, y_te = make_synthetic_data() + else: + from pyhealth.datasets.califorest_mimic_extract import ( + load_califorest_data, + ) + + X_tr, X_te, y_tr, y_te = load_califorest_data( + args.hdf5_path, target=args.target + ) + + # ---- Ablation 1: Calibration type ---- + print("=" * 60) + print("Ablation 1: Calibration Type") + print("=" * 60) + for ctype in ["isotonic", "logistic", "beta"]: + y_pred = run_califorest( + X_tr, y_tr, X_te, y_te, ctype=ctype + ) + print_results(f"CF-{ctype}", evaluate(y_te, y_pred)) + y_pred = run_rf_nocal(X_tr, y_tr, X_te) + print_results("RF-NoCal", evaluate(y_te, y_pred)) + + # ---- Ablation 2: Prior sensitivity ---- + 
print(f"\n{'=' * 60}") + print("Ablation 2: Prior Sensitivity (alpha0, beta0)") + print("=" * 60) + for alpha0, beta0 in [(50, 12.5), (100, 25), (200, 50), (10, 2.5)]: + y_pred = run_califorest( + X_tr, y_tr, X_te, y_te, + alpha0=alpha0, beta0=beta0, + ) + print_results( + f"a0={alpha0},b0={beta0}", evaluate(y_te, y_pred) + ) + + # ---- Ablation 3: Ensemble size ---- + print(f"\n{'=' * 60}") + print("Ablation 3: Number of Estimators") + print("=" * 60) + for n_est in [50, 100, 200, 300]: + y_pred = run_califorest( + X_tr, y_tr, X_te, y_te, n_estimators=n_est + ) + print_results(f"n_est={n_est}", evaluate(y_te, y_pred)) + + # ---- Ablation 4: Prediction targets (synthetic only) ---- + if args.synthetic or args.hdf5_path is None: + print("\n(Skipping target ablation on synthetic data)") + else: + print(f"\n{'=' * 60}") + print("Ablation 4: Prediction Targets") + print("=" * 60) + from pyhealth.datasets.califorest_mimic_extract import ( + load_califorest_data, + ) + + for tgt in ["mort_hosp", "mort_icu", "los_3", "los_7"]: + Xr, Xe, yr, ye = load_califorest_data( + args.hdf5_path, target=tgt + ) + y_pred = run_califorest(Xr, yr, Xe, ye) + print_results(tgt, evaluate(ye, y_pred)) + + +if __name__ == "__main__": + main() diff --git a/pyhealth/datasets/califorest_mimic_extract.py b/pyhealth/datasets/califorest_mimic_extract.py new file mode 100644 index 000000000..957639ade --- /dev/null +++ b/pyhealth/datasets/califorest_mimic_extract.py @@ -0,0 +1,353 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: MIMIC-Extract HDF5 loader for CaliForest flat features +"""MIMIC-Extract dataset for CaliForest replication. 
+ +Reads the ``all_hourly_data.h5`` file produced by the `MIMIC-Extract +`_ pipeline and +produces a flat (n_patients x n_features) matrix suitable for +tree-based models such as CaliForest. + +The preprocessing follows Park and Ho (2020): + +1. Filter to patients with ``max_hours > 30`` (24 h window + 6 h + gap). +2. Use only the first 24 hours of time-series data. +3. Impute missing values (forward-fill, ICU-stay mean, then 0). +4. Add binary mask and time-since-measured features. +5. Mean-centre features using training-set statistics. +6. Flatten hourly data into a single row per patient. +7. Extract four binary labels: ``mort_hosp``, ``mort_icu``, + ``los_3`` (LOS > 3 days), ``los_7`` (LOS > 7 days). + +The user must have already generated the HDF5 file via the +MIMIC-Extract pipeline or obtained it from GCP (requires PhysioNet +credentials). + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random Forest +for Health Data." ACM CHIL, 2020. +https://doi.org/10.1145/3368555.3384461 +""" + +import logging +import os +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd + +from .base_dataset import BaseDataset + +logger = logging.getLogger(__name__) + +GAP_TIME = 6 # hours +WINDOW_SIZE = 24 # hours +ID_COLS = ["subject_id", "hadm_id", "icustay_id"] + + +def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame: + """Ensure Aggregation Function is the first column level. + + The official MIMIC-Extract HDF5 stores columns as + ``(LEVEL2, Aggregation Function)`` while some generated + versions use ``(Aggregation Function, LEVEL2)``. This + helper normalises to ``(Aggregation Function, LEVEL2)``. + + Args: + df: DataFrame with a two-level MultiIndex on columns. + + Returns: + DataFrame with normalised column order. 
+ """ + if df.columns.nlevels != 2: + return df + agg_vals = {"mean", "count", "std", "mask"} + level0_vals = set(df.columns.get_level_values(0).unique()) + if not level0_vals & agg_vals: + df = df.swaplevel(axis=1).sort_index(axis=1) + return df + + +def _simple_imputer(df: pd.DataFrame) -> pd.DataFrame: + """Impute missing values following Che et al. (2018). + + Produces three feature channels per variable: ``mean`` + (imputed value), ``mask`` (observation indicator), and + ``time_since_measured``. + + Args: + df: Multi-level column DataFrame from MIMIC-Extract + ``vitals_labs`` table. + + Returns: + Imputed DataFrame with ``mean``, ``mask``, and + ``time_since_measured`` columns. + """ + idx = pd.IndexSlice + df = df.copy() + + if df.columns.nlevels > 2: + df.columns = df.columns.droplevel( + [n for n in df.columns.names + if n not in ("Aggregation Function", "LEVEL2")] + ) + + df = _normalize_columns(df) + + df_out = df.loc[:, idx[["mean", "count"], :]] + icustay_means = ( + df_out.loc[:, idx["mean", :]].groupby(ID_COLS).mean() + ) + imputed = ( + df_out.loc[:, idx["mean", :]] + .groupby(ID_COLS) + .ffill() + ) + # Fill remaining NaNs with per-ICU-stay means, then 0 + for col in imputed.columns: + imputed[col] = imputed[col].fillna(icustay_means[col]) + imputed = imputed.fillna(0).copy() + df_out.loc[:, idx["mean", :]] = imputed + + mask = ( + df.loc[:, idx["count", :]] > 0 + ).astype(float).copy() + df_out.loc[:, idx["count", :]] = mask + df_out = df_out.rename( + columns={"count": "mask"}, + level="Aggregation Function", + ) + + is_absent = 1 - df_out.loc[:, idx["mask", :]].copy() + hours_of_absence = is_absent.cumsum() + time_since = hours_of_absence - hours_of_absence[ + is_absent == 0 + ].ffill() + time_since = time_since.rename( + columns={"mask": "time_since_measured"}, + level="Aggregation Function", + ) + df_out = pd.concat((df_out, time_since), axis=1) + df_out.loc[:, idx["time_since_measured", :]] = ( + df_out.loc[:, idx["time_since_measured", :]] + 
.fillna(WINDOW_SIZE + 1) + .copy() + ) + df_out.sort_index(axis=1, inplace=True) + return df_out + + +class CaliForestMIMICExtractDataset(BaseDataset): + """MIMIC-Extract dataset for CaliForest experiments. + + Wraps the MIMIC-Extract ``all_hourly_data.h5`` output and + provides the flat feature matrix and binary labels used in + the CaliForest paper. + + Unlike the existing ``MIMICExtractDataset`` which produces + event-level data for deep learning models, this class + produces the flat, imputed, aggregated feature matrix that + tree-based models like CaliForest require. + + Args: + root: Directory containing ``all_hourly_data.h5``. + tables: Ignored (kept for BaseDataset compatibility). + Defaults to ``["vitals_labs"]``. + dataset_name: Name of the dataset. Defaults to + ``"califorest_mimic_extract"``. + **kwargs: Additional arguments passed to BaseDataset. + + Examples: + >>> from pyhealth.datasets import ( + ... CaliForestMIMICExtractDataset, + ... ) + >>> dataset = CaliForestMIMICExtractDataset( + ... root="/path/to/mimic_extract_output", + ... ) + >>> X_tr, X_te, y_tr, y_te = dataset.load_califorest_data( + ... target="mort_hosp", + ... ) + >>> print(X_tr.shape) # (n_train, 7488) + """ + + def __init__( + self, + root: str, + tables: Optional[List[str]] = None, + dataset_name: Optional[str] = None, + **kwargs, + ) -> None: + self._hdf5_path = os.path.join( + root, "all_hourly_data.h5" + ) + if not os.path.exists(self._hdf5_path): + raise FileNotFoundError( + f"all_hourly_data.h5 not found in {root}. " + "Run the MIMIC-Extract pipeline first." + ) + # BaseDataset requires tables and config_path but we + # handle data loading ourselves via HDF5. Pass minimal + # args so the parent __init__ stores root/dataset_name + # without attempting CSV loading. 
+ super().__init__( + root=root, + tables=tables or ["vitals_labs"], + dataset_name=( + dataset_name or "califorest_mimic_extract" + ), + **kwargs, + ) + + # Override to prevent BaseDataset from scanning CSVs + def load_data(self): # type: ignore[override] + """Not used. Data is loaded via :meth:`load_califorest_data`.""" + raise NotImplementedError( + "Use load_califorest_data() instead." + ) + + def load_califorest_data( + self, + target: str = "mort_hosp", + random_seed: int = 0, + train_frac: float = 0.7, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Load and preprocess data for CaliForest. + + Replicates the exact data pipeline from the CaliForest + paper, producing the flat feature matrix and binary + labels used in the original experiments. + + Args: + target: Prediction target. One of ``"mort_hosp"``, + ``"mort_icu"``, ``"los_3"``, ``"los_7"``. + random_seed: Random seed for train/test split. + train_frac: Fraction of subjects for training. + + Returns: + A tuple ``(X_train, X_test, y_train, y_test)``. + + Examples: + >>> X_tr, X_te, y_tr, y_te = ( + ... dataset.load_califorest_data("mort_hosp") + ... ) + """ + return load_califorest_data( + self._hdf5_path, + target=target, + random_seed=random_seed, + train_frac=train_frac, + ) + + +def load_califorest_data( + hdf5_path: str, + target: str = "mort_hosp", + random_seed: int = 0, + train_frac: float = 0.7, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Load and preprocess MIMIC-Extract data for CaliForest. + + This is a standalone convenience function. For integration + with PyHealth pipelines, prefer + :class:`CaliForestMIMICExtractDataset`. + + Args: + hdf5_path: Path to ``all_hourly_data.h5``. + target: One of ``"mort_hosp"``, ``"mort_icu"``, + ``"los_3"``, ``"los_7"``. + random_seed: Random seed for train/test split. + train_frac: Fraction of subjects for training. + + Returns: + A tuple ``(X_train, X_test, y_train, y_test)``. 
+ """ + statics = pd.read_hdf(hdf5_path, "patients") + data_full = pd.read_hdf(hdf5_path, "vitals_labs") + + # Cohort filter + statics = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME] + + # Labels + Ys = statics.loc[:, ["mort_hosp", "mort_icu", "los_icu"]] + Ys["mort_hosp"] = Ys["mort_hosp"].astype(int) + Ys["mort_icu"] = Ys["mort_icu"].astype(int) + Ys["los_3"] = (Ys["los_icu"] > 3).astype(int) + Ys["los_7"] = (Ys["los_icu"] > 7).astype(int) + Ys.drop(columns=["los_icu"], inplace=True) + + # Filter time-series to cohort and first 24 h + valid_ids = set( + Ys.index.get_level_values("icustay_id") + ) + lvl2 = data_full.loc[ + data_full.index.get_level_values("icustay_id").isin( + valid_ids + ) + & ( + data_full.index.get_level_values("hours_in") + < WINDOW_SIZE + ), + :, + ] + + # Train/test split by subject + subjects = list( + set(lvl2.index.get_level_values("subject_id")) + ) + np.random.seed(random_seed) + subjects = np.random.permutation(subjects) + n_train = int(train_frac * len(subjects)) + train_subj = set(subjects[:n_train]) + test_subj = set(subjects[n_train:]) + + def _filter(df, subj_set): + return df.loc[ + df.index.get_level_values("subject_id").isin( + subj_set + ) + ] + + lvl2_train = _filter(lvl2, train_subj) + lvl2_test = _filter(lvl2, test_subj) + Ys_train = _filter(Ys, train_subj) + Ys_test = _filter(Ys, test_subj) + + # Normalize column level order + lvl2_train = _normalize_columns(lvl2_train) + lvl2_test = _normalize_columns(lvl2_test) + + # Mean-centre using training statistics + idx = pd.IndexSlice + lvl2_means = lvl2_train.loc[:, idx["mean", :]].mean(axis=0) + lvl2_train.loc[:, idx["mean", :]] -= lvl2_means + lvl2_test.loc[:, idx["mean", :]] -= lvl2_means + + # Impute + lvl2_train = _simple_imputer(lvl2_train) + lvl2_test = _simple_imputer(lvl2_test) + + # Flatten + X_train_df = lvl2_train.pivot_table( + index=ID_COLS, columns=["hours_in"] + ) + X_test_df = lvl2_test.pivot_table( + index=ID_COLS, columns=["hours_in"] + ) + + # Align 
features and labels to shared patients + common_tr = Ys_train.index.intersection(X_train_df.index) + common_te = Ys_test.index.intersection(X_test_df.index) + X_train = X_train_df.loc[common_tr].values + X_test = X_test_df.loc[common_te].values + y_train = Ys_train.loc[common_tr, target].values + y_test = Ys_test.loc[common_te, target].values + + logger.info( + f"Loaded MIMIC-Extract: X_train={X_train.shape}, " + f"X_test={X_test.shape}, " + f"prevalence={y_train.mean():.3f}" + ) + return X_train, X_test, y_train, y_test diff --git a/pyhealth/datasets/configs/califorest_mimic_extract.yaml b/pyhealth/datasets/configs/califorest_mimic_extract.yaml new file mode 100644 index 000000000..3fcd1155b --- /dev/null +++ b/pyhealth/datasets/configs/califorest_mimic_extract.yaml @@ -0,0 +1,32 @@ +# CaliForest MIMIC-Extract dataset configuration. +# +# This dataset reads from the MIMIC-Extract all_hourly_data.h5 file +# rather than raw CSVs. The HDF5 contains pre-aggregated hourly +# vitals/labs with mean, count, and std columns. +# +# Tables in the HDF5: +# - patients: static demographics and outcomes +# - vitals_labs: hourly time-series (LEVEL2 x Aggregation Function) +# - interventions: hourly binary intervention indicators +# - codes: ICD9 diagnosis codes +# +# Reference: Wang et al. "MIMIC-Extract: A Data Extraction, +# Preprocessing, and Representation Pipeline for MIMIC-III." 2019. 
+version: "1.0" +format: "hdf5" +tables: + patients: + hdf5_key: "patients" + patient_id: "subject_id" + attributes: + - "mort_hosp" + - "mort_icu" + - "los_icu" + - "max_hours" + - "age" + - "gender" + - "ethnicity" + vitals_labs: + hdf5_key: "vitals_labs" + patient_id: "subject_id" + attributes: "all" diff --git a/pyhealth/metrics/califorest_calibration.py b/pyhealth/metrics/califorest_calibration.py new file mode 100644 index 000000000..a2d01ce27 --- /dev/null +++ b/pyhealth/metrics/califorest_calibration.py @@ -0,0 +1,176 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Six calibration metrics for binary risk prediction +"""CaliForest calibration metrics (Park and Ho, 2020). + +Six commonly published calibration metrics for binary risk +prediction models, as described in Table 1 of the CaliForest +paper. + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random +Forest for Health Data." ACM CHIL, 2020. +https://doi.org/10.1145/3368555.3384461 +""" + +import numpy as np +import pandas as pd + + +def hosmer_lemeshow( + y_true: np.ndarray, + y_score: np.ndarray, + n_groups: int = 10, +) -> float: + """Hosmer-Lemeshow goodness-of-fit test p-value. + + Divides predictions into ``n_groups`` equal-sized bins and + tests whether observed event rates match predicted rates + using a chi-squared statistic. + + A p-value close to 1 indicates good calibration. + + Args: + y_true: Binary ground-truth labels of shape ``(N,)``. + y_score: Predicted probabilities of shape ``(N,)``. + n_groups: Number of quantile groups. Default ``10``. + + Returns: + The Hosmer-Lemeshow p-value (higher is better). 
+ + Examples: + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.2, 0.8, 0.9]) + >>> p = hosmer_lemeshow(y_true, y_score) + """ + from scipy.stats import chi2 + + df = pd.DataFrame({"score": y_score, "target": y_true}) + df = df.sort_values("score") + df["score"] = np.clip(df["score"], 1e-8, 1 - 1e-8) + df["rank"] = range(len(df)) + df["decile"] = pd.qcut( + df["rank"], n_groups, duplicates="raise" + ) + + grp = df.groupby("decile", observed=False) + obs_pos = grp["target"].sum() + obs_neg = grp["target"].count() - obs_pos + exp_pos = grp["score"].sum() + exp_neg = grp["score"].count() - exp_pos + + hl = ( + (obs_pos - exp_pos) ** 2 / exp_pos + + (obs_neg - exp_neg) ** 2 / exp_neg + ).sum() + return float(1 - chi2.cdf(hl, n_groups - 2)) + + +def spiegelhalter( + y_true: np.ndarray, + y_score: np.ndarray, +) -> float: + """Spiegelhalter z-test p-value for calibration. + + Tests whether the Brier score is extreme under the null + hypothesis that predicted probabilities equal true + probabilities. + + A p-value close to 1 indicates good calibration. + + Args: + y_true: Binary ground-truth labels of shape ``(N,)``. + y_score: Predicted probabilities of shape ``(N,)``. + + Returns: + The two-sided Spiegelhalter p-value (higher is better). + + Examples: + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.2, 0.8, 0.9]) + >>> p = spiegelhalter(y_true, y_score) + """ + from scipy.stats import norm + + top = np.sum((y_true - y_score) * (1 - 2 * y_score)) + bot = np.sum( + (1 - 2 * y_score) ** 2 * y_score * (1 - y_score) + ) + z = top / np.sqrt(bot) + return float(norm.sf(np.abs(z)) * 2) + + +def reliability( + y_true: np.ndarray, + y_score: np.ndarray, + n_groups: int = 10, +) -> tuple: + """Reliability-in-the-small and reliability-in-the-large. + + Reliability-in-the-small measures the average squared + difference between observed and predicted event rates across + quantile bins. 
Reliability-in-the-large measures the squared + difference between the overall mean prediction and the + overall observed prevalence. + + Values close to 0 indicate good calibration. + + Args: + y_true: Binary ground-truth labels of shape ``(N,)``. + y_score: Predicted probabilities of shape ``(N,)``. + n_groups: Number of quantile groups. Default ``10``. + + Returns: + A tuple ``(rel_small, rel_large)``. + + Examples: + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.2, 0.8, 0.9]) + >>> rel_s, rel_l = reliability(y_true, y_score) + """ + df = pd.DataFrame({"score": y_score, "target": y_true}) + df = df.sort_values("score") + df["rank"] = range(len(df)) + df["decile"] = pd.qcut( + df["rank"], n_groups, duplicates="raise" + ) + + grp = df.groupby("decile", observed=False) + obs = grp["target"].mean() + exp = grp["score"].mean() + rel_small = float(np.mean((obs - exp) ** 2)) + rel_large = float( + (np.mean(y_true) - np.mean(y_score)) ** 2 + ) + return rel_small, rel_large + + +def scaled_brier_score( + y_true: np.ndarray, + y_score: np.ndarray, +) -> tuple: + """Brier score and scaled (prevalence-adjusted) Brier score. + + The scaled Brier score normalises by the maximum Brier score + achievable by always predicting the prevalence. A scaled + score of 1 indicates perfect calibration. + + Args: + y_true: Binary ground-truth labels of shape ``(N,)``. + y_score: Predicted probabilities of shape ``(N,)``. + + Returns: + A tuple ``(brier, scaled_brier)``. 
+ + Examples: + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_score = np.array([0.1, 0.2, 0.8, 0.9]) + >>> brier, scaled = scaled_brier_score(y_true, y_score) + """ + brier = float(np.mean((y_true - y_score) ** 2)) + p = np.mean(y_true) + denom = p * (1 - p) + scaled = 1 - brier / denom if denom > 0 else 0.0 + return brier, float(scaled) diff --git a/pyhealth/models/califorest.py b/pyhealth/models/califorest.py new file mode 100644 index 000000000..a3e945835 --- /dev/null +++ b/pyhealth/models/califorest.py @@ -0,0 +1,357 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Calibrated random forest using OOB variance-weighted calibration +"""CaliForest: Calibrated Random Forest for Health Data. + +This module implements CaliForest (Park and Ho, 2020), a calibrated +random forest that uses out-of-bag (OOB) prediction variance to learn +a robust calibration function without requiring a held-out calibration +set. + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random Forest +for Health Data." ACM Conference on Health, Inference, and Learning, +2020. https://doi.org/10.1145/3368555.3384461 + +Note: + CaliForest wraps scikit-learn's ``DecisionTreeClassifier`` and + calibration estimators internally. The ``forward`` method accepts + and returns ``torch.Tensor`` objects to conform to PyHealth's + ``BaseModel`` interface. 
+""" + +from typing import Any, Dict, List, Optional + +import numpy as np +import torch +import torch.nn as nn +from sklearn.isotonic import IsotonicRegression +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier + +from pyhealth.datasets import SampleDataset +from pyhealth.models.base_model import BaseModel + + +class CaliForest(BaseModel): + """Calibrated Random Forest for health data. + + CaliForest retains per-tree OOB predictions for each training + sample and estimates prediction uncertainty via an Inverse-Gamma + conjugate prior on the noise variance. Samples with low variance + (high certainty) receive larger weights when fitting the + calibration function, yielding better-calibrated probabilities + without sacrificing discrimination. + + Two calibration modes are supported: + + * ``"isotonic"`` (CF-Iso): non-parametric isotonic regression. + * ``"logistic"`` (CF-Logit): Platt scaling via logistic + regression. + * ``"beta"`` (CF-Beta): Beta calibration via logistic + regression on log-transformed predictions. + + Args: + dataset: A ``SampleDataset`` used to query label information. + feature_keys: List of feature keys in the input batch. + label_key: Key for the label in the input batch. + n_estimators: Number of trees in the forest. Default ``300``. + max_depth: Maximum depth of each tree. Default ``10``. + min_samples_split: Minimum samples to split a node. + Default ``3``. + min_samples_leaf: Minimum samples in a leaf. Default ``1``. + ctype: Calibration type, ``"isotonic"``, ``"logistic"``, + or ``"beta"``. Default ``"isotonic"``. + alpha0: Prior shape parameter for the Inverse-Gamma + distribution. Default ``100``. + beta0: Prior scale parameter for the Inverse-Gamma + distribution. Default ``25`` (prior variance = + beta0 / alpha0 = 0.25). + + Examples: + >>> from pyhealth.datasets import create_sample_dataset + >>> samples = [ + ... { + ... "patient_id": "p0", + ... "visit_id": "v0", + ... 
"features": [0.1] * 10, + ... "label": 0, + ... }, + ... { + ... "patient_id": "p1", + ... "visit_id": "v1", + ... "features": [0.9] * 10, + ... "label": 1, + ... }, + ... ] + >>> dataset = create_sample_dataset( + ... samples=samples, + ... input_schema={"features": "tensor"}, + ... output_schema={"label": "binary"}, + ... dataset_name="demo", + ... ) + >>> model = CaliForest( + ... dataset=dataset, + ... feature_keys=["features"], + ... label_key="label", + ... n_estimators=50, + ... max_depth=5, + ... ) + """ + + def __init__( + self, + dataset: SampleDataset, + feature_keys: List[str], + label_key: str, + n_estimators: int = 300, + max_depth: int = 10, + min_samples_split: int = 3, + min_samples_leaf: int = 1, + ctype: str = "isotonic", + alpha0: float = 100.0, + beta0: float = 25.0, + ) -> None: + super().__init__(dataset) + self.feature_keys = feature_keys + self.label_key = label_key + self.n_estimators = n_estimators + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.ctype = ctype + self.alpha0 = alpha0 + self.beta0 = beta0 + + self.estimators_: List[DecisionTreeClassifier] = [] + self.calibrator_: Optional[Any] = None + self.is_fitted_ = False + + # Dummy parameter so BaseModel.device works + self._dummy_param = nn.Parameter(torch.empty(0)) + + # ---------------------------------------------------------- + # Training + # ---------------------------------------------------------- + + def fit( + self, + train_dataloader: Any, + ) -> "CaliForest": + """Fits the CaliForest model on training data. + + Collects all batches from the dataloader, trains individual + decision trees with bootstrap sampling, computes OOB + predictions and inverse-variance sample weights, then fits + the calibration function. + + Args: + train_dataloader: A PyTorch ``DataLoader`` yielding + batches with keys matching ``feature_keys`` and + ``label_key``. + + Returns: + The fitted ``CaliForest`` instance. 
+ """ + X_parts, y_parts = [], [] + for batch in train_dataloader: + feats = [batch[k].numpy() for k in self.feature_keys] + X_parts.append(np.concatenate(feats, axis=1)) + y_parts.append(batch[self.label_key].numpy().ravel()) + X = np.concatenate(X_parts, axis=0) + y = np.concatenate(y_parts, axis=0) + self._fit_numpy(X, y) + return self + + def _fit_numpy(self, X: np.ndarray, y: np.ndarray) -> None: + """Core fitting logic on numpy arrays. + + Args: + X: Feature matrix of shape ``(n_samples, n_features)``. + y: Binary label array of shape ``(n_samples,)``. + """ + n = X.shape[0] + self.estimators_ = [] + + # Build calibrator + if self.ctype == "logistic": + self.calibrator_ = LogisticRegression( + penalty=None, solver="saga", max_iter=5000 + ) + elif self.ctype == "beta": + self.calibrator_ = LogisticRegression( + penalty=None, solver="saga", max_iter=5000 + ) + else: + self.calibrator_ = IsotonicRegression( + y_min=0, y_max=1, out_of_bounds="clip" + ) + + # OOB prediction matrix: (n_samples, n_estimators), NaN if + # sample was in-bag for that tree + Y_oob = np.full((n, self.n_estimators), np.nan) + n_oob = np.zeros(n) + IB = np.zeros((n, self.n_estimators), dtype=int) + OOB = np.full((n, self.n_estimators), True) + + for eid in range(self.n_estimators): + IB[:, eid] = np.random.choice(n, n) + OOB[IB[:, eid], eid] = False + + for eid in range(self.n_estimators): + est = DecisionTreeClassifier( + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + ) + ib_idx = IB[:, eid] + oob_mask = OOB[:, eid] + est.fit(X[ib_idx], y[ib_idx]) + proba = est.predict_proba(X[oob_mask]) + # Handle case where tree only sees one class + if proba.shape[1] == 2: + Y_oob[oob_mask, eid] = proba[:, 1] + else: + Y_oob[oob_mask, eid] = proba[:, 0] + n_oob[oob_mask] += 1 + self.estimators_.append(est) + + # Compute inverse-variance weights via Inverse-Gamma update + valid = n_oob > 1 + Y_oob_valid = Y_oob[valid] + n_oob_valid = 
n_oob[valid] + z_hat = np.nanmean(Y_oob_valid, axis=1) + z_true = y[valid] + beta = ( + self.beta0 + + np.nanvar(Y_oob_valid, axis=1) * n_oob_valid / 2 + ) + alpha = self.alpha0 + n_oob_valid / 2 + z_weight = alpha / beta + + # Fit calibrator + if self.ctype == "logistic": + self.calibrator_.fit( + z_hat[:, np.newaxis], z_true, z_weight + ) + elif self.ctype == "beta": + self.calibrator_.fit( + self._beta_features(z_hat), z_true, z_weight + ) + else: + self.calibrator_.fit(z_hat, z_true, z_weight) + + self.is_fitted_ = True + + # ---------------------------------------------------------- + # Prediction + # ---------------------------------------------------------- + + def _beta_features(self, p: np.ndarray) -> np.ndarray: + """Compute Beta calibration features from probabilities. + + Beta calibration (Kull et al., 2017) fits a logistic + regression on three log-transformed features of the + predicted probability. + + Args: + p: Predicted probabilities of shape ``(n,)``. + + Returns: + Feature matrix of shape ``(n, 3)``. + """ + eps = 1e-7 + p = np.clip(p, eps, 1 - eps) + return np.column_stack([ + np.log(p / (1 - p)), + np.log(p), + np.log(1 - p), + ]) + + def _predict_proba_numpy(self, X: np.ndarray) -> np.ndarray: + """Predicts calibrated probabilities on numpy input. + + Args: + X: Feature matrix of shape ``(n_samples, n_features)``. + + Returns: + Array of shape ``(n_samples,)`` with calibrated + probabilities for the positive class. 
+ """ + z = np.zeros(X.shape[0]) + for est in self.estimators_: + proba = est.predict_proba(X) + if proba.shape[1] == 2: + z += proba[:, 1] + else: + z += proba[:, 0] + z /= len(self.estimators_) + + if self.ctype == "logistic": + return self.calibrator_.predict_proba( + z[:, np.newaxis] + )[:, 1] + if self.ctype == "beta": + return self.calibrator_.predict_proba( + self._beta_features(z) + )[:, 1] + return self.calibrator_.predict(z) + + # ---------------------------------------------------------- + # PyHealth interface + # ---------------------------------------------------------- + + def forward( + self, **kwargs: torch.Tensor + ) -> Dict[str, torch.Tensor]: + """Forward pass conforming to PyHealth's ``BaseModel``. + + During training (``self.training is True`` and model is not + yet fitted), this method stores data and returns a dummy loss. + After ``fit()`` has been called, it returns calibrated + predictions. + + Args: + **kwargs: Keyword arguments containing tensors for each + feature key and the label key. + + Returns: + A dictionary with keys ``loss``, ``y_prob``, ``y_true``, + and ``logit``. 
+ """ + feats = [kwargs[k] for k in self.feature_keys] + X_tensor = torch.cat(feats, dim=-1).float() + X_np = X_tensor.detach().cpu().numpy() + + if self.label_key in kwargs: + y_true = kwargs[self.label_key].float() + else: + y_true = torch.zeros(X_np.shape[0]) + + if not self.is_fitted_: + # Before fit: return dummy outputs + y_prob = torch.full_like(y_true, 0.5) + logit = torch.zeros_like(y_true) + loss = torch.tensor(0.0, requires_grad=True) + else: + proba = self._predict_proba_numpy(X_np) + y_prob = torch.tensor( + proba, dtype=torch.float32 + ).unsqueeze(-1) + logit = torch.log( + y_prob.clamp(1e-7, 1 - 1e-7) + / (1 - y_prob.clamp(1e-7, 1 - 1e-7)) + ) + loss = nn.functional.binary_cross_entropy( + y_prob.squeeze(-1), + y_true.squeeze(-1).float(), + ) + + return { + "loss": loss, + "y_prob": y_prob, + "y_true": y_true, + "logit": logit, + } diff --git a/pyhealth/tasks/mimic_extract_califorest.py b/pyhealth/tasks/mimic_extract_califorest.py new file mode 100644 index 000000000..cc4663640 --- /dev/null +++ b/pyhealth/tasks/mimic_extract_califorest.py @@ -0,0 +1,88 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Binary prediction task for mortality and length of stay +"""MIMIC-Extract flat-feature tasks for CaliForest. + +Defines a single parameterised task that converts MIMIC-Extract +hourly time-series data into a flat feature vector suitable for +tree-based models such as CaliForest. Four binary prediction +targets are supported: + +* ``mort_hosp`` – in-hospital mortality +* ``mort_icu`` – in-ICU mortality +* ``los_3`` – ICU length-of-stay > 3 days +* ``los_7`` – ICU length-of-stay > 7 days + +Paper: Y. Park and J. C. Ho. "CaliForest: Calibrated Random Forest +for Health Data." ACM CHIL, 2020. 
+""" + +from typing import Any, Dict, List + +from .base_task import BaseTask + + +class MIMICExtractCaliForestTask(BaseTask): + """Flat-feature binary prediction task for MIMIC-Extract data. + + This task is designed for the ``MIMICExtractFlatDataset`` which + already produces flat feature vectors and labels. The task + simply selects the requested prediction target. + + Args: + target: One of ``"mort_hosp"``, ``"mort_icu"``, + ``"los_3"``, or ``"los_7"``. Default ``"mort_hosp"``. + + Examples: + >>> task = MIMICExtractCaliForestTask(target="mort_hosp") + >>> task.task_name + 'mimic_extract_califorest_mort_hosp' + >>> task.output_schema + {'label': 'binary'} + """ + + VALID_TARGETS = ("mort_hosp", "mort_icu", "los_3", "los_7") + + def __init__(self, target: str = "mort_hosp") -> None: + if target not in self.VALID_TARGETS: + raise ValueError( + f"target must be one of {self.VALID_TARGETS}, " + f"got '{target}'" + ) + self.target = target + self.task_name = f"mimic_extract_califorest_{target}" + self.input_schema: Dict[str, str] = { + "features": "tensor", + } + self.output_schema: Dict[str, str] = { + "label": "binary", + } + + def __call__(self, patient: Any) -> List[Dict]: + """Extracts a single sample from a patient record. + + Expects the patient object to carry ``features`` (a flat + numeric vector) and the target label attribute. + + Args: + patient: A ``Patient`` object from + ``MIMICExtractFlatDataset``. + + Returns: + A list containing one sample dictionary, or an empty + list if the patient lacks the required data. 
+ """ + features = getattr(patient, "features", None) + label = getattr(patient, self.target, None) + if features is None or label is None: + return [] + return [ + { + "patient_id": patient.patient_id, + "features": features, + "label": int(label), + } + ] diff --git a/tests/test_calibration_metrics.py b/tests/test_calibration_metrics.py new file mode 100644 index 000000000..200dab2a4 --- /dev/null +++ b/tests/test_calibration_metrics.py @@ -0,0 +1,91 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Tests for CaliForest calibration metrics +"""Tests for CaliForest calibration metrics.""" + +import numpy as np +import pytest + +from pyhealth.metrics.califorest_calibration import ( + hosmer_lemeshow, + reliability, + scaled_brier_score, + spiegelhalter, +) + + +def _perfect_data(): + y_true = np.array([0] * 50 + [1] * 50) + y_score = np.array([0.0] * 50 + [1.0] * 50) + return y_true, y_score + + +def _random_data(seed=42): + np.random.seed(seed) + y_true = np.random.randint(0, 2, 200) + y_score = np.clip( + y_true + np.random.randn(200) * 0.2, 0.01, 0.99 + ) + return y_true, y_score + + +class TestScaledBrierScore: + def test_perfect(self): + brier, scaled = scaled_brier_score(*_perfect_data()) + assert brier == pytest.approx(0.0, abs=1e-7) + assert scaled == pytest.approx(1.0, abs=1e-7) + + def test_returns_tuple(self): + result = scaled_brier_score(*_random_data()) + assert isinstance(result, tuple) + assert len(result) == 2 + + def test_brier_range(self): + brier, scaled = scaled_brier_score(*_random_data()) + assert 0 <= brier <= 1 + assert scaled <= 1 + + +class TestHosmerLemeshow: + def test_returns_float(self): + p = hosmer_lemeshow(*_random_data()) + assert isinstance(p, float) + + def test_range(self): + p = 
"""Tests for CaliForest calibration metrics."""

import numpy as np
import pytest

from pyhealth.metrics.califorest_calibration import (
    hosmer_lemeshow,
    reliability,
    scaled_brier_score,
    spiegelhalter,
)


def _perfect_data():
    """Perfectly separated scores: prediction equals the label."""
    labels = np.array([0] * 50 + [1] * 50)
    scores = np.array([0.0] * 50 + [1.0] * 50)
    return labels, scores


def _random_data(seed=42):
    """Noisy but informative scores clipped into (0, 1)."""
    np.random.seed(seed)
    labels = np.random.randint(0, 2, 200)
    scores = np.clip(
        labels + np.random.randn(200) * 0.2, 0.01, 0.99
    )
    return labels, scores


class TestScaledBrierScore:
    def test_perfect(self):
        brier, scaled = scaled_brier_score(*_perfect_data())
        assert scaled == pytest.approx(1.0, abs=1e-7)
        assert brier == pytest.approx(0.0, abs=1e-7)

    def test_returns_tuple(self):
        out = scaled_brier_score(*_random_data())
        assert isinstance(out, tuple)
        assert len(out) == 2

    def test_brier_range(self):
        brier, scaled = scaled_brier_score(*_random_data())
        assert scaled <= 1
        assert 0 <= brier <= 1


class TestHosmerLemeshow:
    def test_returns_float(self):
        pval = hosmer_lemeshow(*_random_data())
        assert isinstance(pval, float)

    def test_range(self):
        pval = hosmer_lemeshow(*_random_data())
        assert 0 <= pval <= 1

    def test_well_calibrated_high_pvalue(self):
        labels, scores = _random_data()
        pval = hosmer_lemeshow(labels, scores)
        # Well-calibrated data should have a reasonably high p-value
        assert pval > 0.01


class TestSpiegelhalter:
    def test_returns_float(self):
        pval = spiegelhalter(*_random_data())
        assert isinstance(pval, float)

    def test_range(self):
        pval = spiegelhalter(*_random_data())
        assert 0 <= pval <= 1


class TestReliability:
    def test_returns_tuple(self):
        out = reliability(*_random_data())
        assert isinstance(out, tuple)
        assert len(out) == 2

    def test_perfect_calibration(self):
        rel_small, rel_large = reliability(*_perfect_data())
        assert rel_large == pytest.approx(0.0, abs=1e-7)

    def test_non_negative(self):
        rel_small, rel_large = reliability(*_random_data())
        assert rel_small >= 0
        assert rel_large >= 0
"""Tests for the CaliForest model."""

import numpy as np
import pytest
import torch

from pyhealth.datasets import create_sample_dataset, get_dataloader
from pyhealth.models.califorest import CaliForest


def _make_dataset(n=40, d=10):
    """Build a small synthetic binary-classification dataset."""
    np.random.seed(42)
    records = []
    for idx in range(n):
        # Second half of the samples are the positive class, shifted
        # by +1 in every feature so the classes are separable.
        y = int(idx >= n // 2)
        vec = np.random.randn(d).tolist()
        if y == 1:
            vec = [v + 1.0 for v in vec]
        records.append(
            {
                "patient_id": f"p{idx}",
                "visit_id": f"v{idx}",
                "features": vec,
                "label": y,
            }
        )
    return create_sample_dataset(
        samples=records,
        input_schema={"features": "tensor"},
        output_schema={"label": "binary"},
        dataset_name="synth",
    )


class TestCaliForestInstantiation:
    def test_default_params(self):
        model = CaliForest(
            dataset=_make_dataset(),
            feature_keys=["features"],
            label_key="label",
            n_estimators=10,
        )
        assert model.is_fitted_ is False
        assert model.n_estimators == 10

    def test_invalid_ctype_still_creates(self):
        model = CaliForest(
            dataset=_make_dataset(),
            feature_keys=["features"],
            label_key="label",
            ctype="isotonic",
        )
        assert model.ctype == "isotonic"


class TestCaliForestForward:
    def test_unfitted_forward(self):
        ds = _make_dataset()
        model = CaliForest(
            dataset=ds,
            feature_keys=["features"],
            label_key="label",
            n_estimators=5,
        )
        batch = next(iter(get_dataloader(ds, batch_size=8, shuffle=False)))
        out = model(**batch)
        for key in ("loss", "y_prob", "y_true", "logit"):
            assert key in out

    def test_fitted_forward_isotonic(self):
        ds = _make_dataset()
        model = CaliForest(
            dataset=ds,
            feature_keys=["features"],
            label_key="label",
            n_estimators=10,
            max_depth=3,
            ctype="isotonic",
        )
        model.fit(get_dataloader(ds, batch_size=40, shuffle=False))
        assert model.is_fitted_

        batch = next(iter(get_dataloader(ds, batch_size=8, shuffle=False)))
        out = model(**batch)
        probs = out["y_prob"]
        assert probs.shape[0] == 8
        assert (probs >= 0).all()
        assert (probs <= 1).all()

    def test_fitted_forward_logistic(self):
        ds = _make_dataset()
        model = CaliForest(
            dataset=ds,
            feature_keys=["features"],
            label_key="label",
            n_estimators=10,
            max_depth=3,
            ctype="logistic",
        )
        loader = get_dataloader(ds, batch_size=40, shuffle=False)
        model.fit(loader)
        assert model.is_fitted_

        out = model(**next(iter(loader)))
        probs = out["y_prob"]
        assert (probs >= 0).all()
        assert (probs <= 1).all()

    def test_fitted_forward_beta(self):
        ds = _make_dataset()
        model = CaliForest(
            dataset=ds,
            feature_keys=["features"],
            label_key="label",
            n_estimators=10,
            max_depth=3,
            ctype="beta",
        )
        loader = get_dataloader(ds, batch_size=40, shuffle=False)
        model.fit(loader)
        assert model.is_fitted_

        out = model(**next(iter(loader)))
        probs = out["y_prob"]
        assert (probs >= 0).all()
        assert (probs <= 1).all()


class TestCaliForestNumpyFit:
    def test_fit_and_predict(self):
        np.random.seed(0)
        X = np.random.randn(60, 5)
        y = (X[:, 0] > 0).astype(int)

        model = CaliForest(
            dataset=_make_dataset(),
            feature_keys=["features"],
            label_key="label",
            n_estimators=20,
            max_depth=3,
        )
        model._fit_numpy(X, y)
        probs = model._predict_proba_numpy(X)
        assert probs.shape == (60,)
        assert np.all(probs >= 0) and np.all(probs <= 1)
+ "ethnicity": ["W", "W", "B"], + }, + index=pd.MultiIndex.from_tuples( + [(1, 100, 1000), (2, 200, 2000), (3, 300, 3000)], + names=["subject_id", "hadm_id", "icustay_id"], + ), + ) + rows = [] + for sid, hid, iid in [(1, 100, 1000), (2, 200, 2000), (3, 300, 3000)]: + for h in range(3): + rows.append((sid, hid, iid, h)) + idx = pd.MultiIndex.from_tuples( + rows, names=["subject_id", "hadm_id", "icustay_id", "hours_in"], + ) + np.random.seed(0) + n = len(rows) + data = { + ("heart_rate", "mean"): np.random.randn(n), + ("heart_rate", "count"): np.random.randint(0, 3, n).astype(float), + ("heart_rate", "std"): np.abs(np.random.randn(n)), + ("sodium", "mean"): np.random.randn(n), + ("sodium", "count"): np.random.randint(0, 3, n).astype(float), + ("sodium", "std"): np.abs(np.random.randn(n)), + } + vitals = pd.DataFrame(data, index=idx) + vitals.columns = pd.MultiIndex.from_tuples( + vitals.columns, names=["LEVEL2", "Aggregation Function"] + ) + patients.to_hdf(path, key="patients", mode="w") + vitals.to_hdf(path, key="vitals_labs", mode="a") + + +class TestDatasetInstantiation: + def test_creates_with_valid_path(self): + with tempfile.TemporaryDirectory() as tmpdir: + _make_fake_hdf5(os.path.join(tmpdir, "all_hourly_data.h5")) + ds = CaliForestMIMICExtractDataset(root=tmpdir) + assert ds.dataset_name == "califorest_mimic_extract" + + def test_raises_on_missing_file(self): + with tempfile.TemporaryDirectory() as tmpdir: + with pytest.raises(FileNotFoundError): + CaliForestMIMICExtractDataset(root=tmpdir) + + +class TestNormalizeColumns: + def test_swaps_when_agg_is_level1(self): + cols = pd.MultiIndex.from_tuples( + [("hr", "mean"), ("hr", "count")], + names=["LEVEL2", "Aggregation Function"], + ) + df = pd.DataFrame(np.ones((2, 2)), columns=cols) + result = _normalize_columns(df) + assert result.columns.get_level_values(0)[0] in {"mean", "count"} + + def test_noop_when_agg_is_level0(self): + cols = pd.MultiIndex.from_tuples( + [("mean", "hr"), ("count", "hr")], + 
names=["Aggregation Function", "LEVEL2"], + ) + df = pd.DataFrame(np.ones((2, 2)), columns=cols) + result = _normalize_columns(df) + assert result.columns.get_level_values(0)[0] == "mean" + + +class TestSimpleImputer: + def test_output_has_three_channels(self): + cols = pd.MultiIndex.from_tuples( + [("mean", "hr"), ("count", "hr"), ("std", "hr")], + names=["Aggregation Function", "LEVEL2"], + ) + idx = pd.MultiIndex.from_tuples( + [(1, 1, 1, 0), (1, 1, 1, 1)], + names=["subject_id", "hadm_id", "icustay_id", "hours_in"], + ) + df = pd.DataFrame( + [[1.0, 1.0, 0.1], [np.nan, 0.0, 0.0]], + index=idx, columns=cols, + ) + result = _simple_imputer(df) + agg_funcs = set(result.columns.get_level_values(0).unique()) + assert "mean" in agg_funcs + assert "mask" in agg_funcs + assert "time_since_measured" in agg_funcs + + +class TestLoadCaliforestData: + def test_returns_aligned_arrays(self): + with tempfile.TemporaryDirectory() as tmpdir: + _make_fake_hdf5(os.path.join(tmpdir, "all_hourly_data.h5")) + ds = CaliForestMIMICExtractDataset(root=tmpdir) + X_tr, X_te, y_tr, y_te = ds.load_califorest_data( + target="mort_hosp", train_frac=0.67, + ) + assert X_tr.ndim == 2 + assert X_te.ndim == 2 + assert len(y_tr) == X_tr.shape[0] + assert len(y_te) == X_te.shape[0] + assert set(np.unique(np.concatenate([y_tr, y_te]))).issubset({0, 1}) diff --git a/tests/test_mimic_extract_califorest_task.py b/tests/test_mimic_extract_califorest_task.py new file mode 100644 index 000000000..940524b65 --- /dev/null +++ b/tests/test_mimic_extract_califorest_task.py @@ -0,0 +1,69 @@ +# Authors: Cesar Jesus Giglio Badoino (cesarjg2@illinois.edu) +# Arjun Tangella (avtange2@illinois.edu) +# Tony Nguyen (tonyln2@illinois.edu) +# Paper: CaliForest: Calibrated Random Forest for Health Data +# Paper link: https://doi.org/10.1145/3368555.3384461 +# Description: Tests for MIMIC-Extract CaliForest task +"""Tests for MIMICExtractCaliForestTask.""" + +import pytest + +from pyhealth.tasks.mimic_extract_califorest 
"""Tests for MIMICExtractCaliForestTask."""

import pytest

from pyhealth.tasks.mimic_extract_califorest import (
    MIMICExtractCaliForestTask,
)


class _FakePatient:
    """Attribute-bag stand-in for a Patient record."""

    def __init__(self, pid, features, mort_hosp, mort_icu, los_3, los_7):
        self.patient_id = pid
        self.features = features
        self.mort_hosp = mort_hosp
        self.mort_icu = mort_icu
        self.los_3 = los_3
        self.los_7 = los_7


class TestTaskInit:
    def test_valid_targets(self):
        for name in ("mort_hosp", "mort_icu", "los_3", "los_7"):
            task = MIMICExtractCaliForestTask(target=name)
            assert name in task.task_name
            assert task.target == name

    def test_invalid_target(self):
        with pytest.raises(ValueError):
            MIMICExtractCaliForestTask(target="invalid")

    def test_schema(self):
        task = MIMICExtractCaliForestTask()
        assert "features" in task.input_schema
        assert "label" in task.output_schema


class TestTaskCall:
    def test_produces_sample(self):
        task = MIMICExtractCaliForestTask(target="mort_hosp")
        record = _FakePatient("p0", [0.1] * 10, 1, 0, 1, 0)
        samples = task(record)
        assert len(samples) == 1
        sample = samples[0]
        assert sample["label"] == 1
        assert sample["features"] == [0.1] * 10

    def test_different_targets(self):
        record = _FakePatient("p0", [0.5] * 5, 0, 1, 1, 0)
        expected = {
            "mort_hosp": 0,
            "mort_icu": 1,
            "los_3": 1,
            "los_7": 0,
        }
        for target, label in expected.items():
            task = MIMICExtractCaliForestTask(target)
            assert task(record)[0]["label"] == label

    def test_missing_features(self):
        task = MIMICExtractCaliForestTask()
        record = _FakePatient("p0", None, 1, 0, 0, 0)
        assert task(record) == []

    def test_missing_label(self):
        task = MIMICExtractCaliForestTask(target="mort_hosp")
        record = _FakePatient("p0", [1.0], None, 0, 0, 0)
        assert task(record) == []