From 35817a5f1634456bcf30275ad575e7166e176492 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Tue, 24 Mar 2026 00:11:08 -0400
Subject: [PATCH 1/5] Add structural mortgage interest data support

---
 .../structural-mortgage-interest.added.md     |   1 +
 .../datasets/cps/extended_cps.py              |  27 +-
 policyengine_us_data/datasets/puf/puf.py      |  11 +
 .../test_mortgage_interest.py                 | 222 +++++
 .../utils/mortgage_interest.py                | 770 ++++++++++++++++++
 5 files changed, 1027 insertions(+), 4 deletions(-)
 create mode 100644 changelog.d/structural-mortgage-interest.added.md
 create mode 100644 policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
 create mode 100644 policyengine_us_data/utils/mortgage_interest.py

diff --git a/changelog.d/structural-mortgage-interest.added.md b/changelog.d/structural-mortgage-interest.added.md
new file mode 100644
index 00000000..bfecbc02
--- /dev/null
+++ b/changelog.d/structural-mortgage-interest.added.md
@@ -0,0 +1 @@
+Convert imputed deductible mortgage interest into structural mortgage balance, interest, and origination-year inputs when the installed `policyengine-us` supports federal MID cap modeling, while preserving total current-law interest deductions via residual investment interest inputs.
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index f38d5746..288e3e46 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -6,9 +6,14 @@
 import pandas as pd
 from policyengine_core.data import Dataset
 
-from policyengine_us_data.datasets.cps.cps import *  # noqa: F403
-from policyengine_us_data.datasets.puf import *  # noqa: F403
+from policyengine_us_data.datasets.cps.cps import CPS, CPS_2024, CPS_2024_Full
+from policyengine_us_data.datasets.puf import PUF, PUF_2024
 from policyengine_us_data.storage import STORAGE_FOLDER
+from policyengine_us_data.utils.mortgage_interest import (
+    convert_mortgage_interest_to_structural_inputs,
+    impute_tax_unit_mortgage_balance_hints,
+    supports_structural_mortgage_inputs,
+)
 from policyengine_us_data.utils.retirement_limits import (
     get_retirement_limits,
     get_se_pension_limits,
@@ -445,6 +450,14 @@ def generate(self):
         )
 
         new_data = self._rename_imputed_to_inputs(new_data)
+        new_data = impute_tax_unit_mortgage_balance_hints(
+            new_data,
+            self.time_period,
+        )
+        new_data = convert_mortgage_interest_to_structural_inputs(
+            new_data,
+            self.time_period,
+        )
         new_data = self._drop_formula_variables(new_data)
         self.save_dataset(new_data)
 
@@ -472,11 +485,17 @@ def _rename_imputed_to_inputs(cls, data):
     # due to entity shape mismatch.
     _KEEP_FORMULA_VARS = {
         "person_id",
-        "interest_deduction",
         "self_employed_pension_contribution_ald",
         "self_employed_health_insurance_ald",
     }
 
+    @classmethod
+    def _keep_formula_vars(cls):
+        keep = set(cls._KEEP_FORMULA_VARS)
+        if not supports_structural_mortgage_inputs():
+            keep.add("interest_deduction")
+        return keep
+
     # QRF imputes formula-level variables (e.g. taxable_pension_income)
     # but we must store them under leaf input names so
     # _drop_formula_variables doesn't discard them. The engine then
@@ -526,7 +545,7 @@ def _drop_formula_variables(cls, data):
             if (hasattr(var, "formulas") and len(var.formulas) > 0)
             or getattr(var, "adds", None)
             or getattr(var, "subtracts", None)
-        } - cls._KEEP_FORMULA_VARS
+        } - cls._keep_formula_vars()
         dropped = sorted(set(data.keys()) & formula_vars)
         if dropped:
             logger.info(
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index baedce8a..9a0e8a22 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -12,6 +12,9 @@
 from policyengine_us_data.datasets.puf.disaggregate_puf import (
     disaggregate_aggregate_records,
 )
+from policyengine_us_data.utils.mortgage_interest import (
+    convert_mortgage_interest_to_structural_inputs,
+)
 from policyengine_us_data.utils.uprating import (
     create_policyengine_uprating_factors_table,
 )
@@ -643,6 +646,14 @@ def generate(self):
                 self.holder[key] = np.array(self.holder[key]).astype(float)
                 assert not np.isnan(self.holder[key]).any(), f"{key} has NaNs."
 
+        holder_tp = {variable: {self.time_period: values} for variable, values in self.holder.items()}
+        holder_tp = convert_mortgage_interest_to_structural_inputs(
+            holder_tp,
+            self.time_period,
+        )
+        self.holder = {
+            variable: values[self.time_period] for variable, values in holder_tp.items()
+        }
         self.save_dataset(self.holder)
 
     def add_tax_unit(self, row, tax_unit_id):
diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
new file mode 100644
index 00000000..94aed99e
--- /dev/null
+++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
@@ -0,0 +1,222 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from policyengine_us_data.utils.mortgage_interest import (
+    convert_mortgage_interest_to_structural_inputs,
+    impute_tax_unit_mortgage_balance_hints,
+    supports_structural_mortgage_inputs,
+)
+
+
+def _base_dataset_dict(deductible_mortgage_interest, interest_deduction):
+    """Build a one-tax-unit, two-person JOINT dataset keyed by period 2024."""
+    year = 2024
+    def col(values, dtype=None):
+        return {year: np.array(values, dtype=dtype)}
+
+    return {
+        "person_id": col([1, 2]),
+        "tax_unit_id": col([1]),
+        "marital_unit_id": col([1]),
+        "spm_unit_id": col([1]),
+        "family_id": col([1]),
+        "household_id": col([1]),
+        "person_tax_unit_id": col([1, 1]),
+        "person_marital_unit_id": col([1, 1]),
+        "person_spm_unit_id": col([1, 1]),
+        "person_family_id": col([1, 1]),
+        "person_household_id": col([1, 1]),
+        "is_tax_unit_head": col([True, False]),
+        "is_tax_unit_spouse": col([False, True]),
+        "age": col([55, 53]),
+        "filing_status": col([b"JOINT"]),
+        "deductible_mortgage_interest": col(deductible_mortgage_interest, np.float32),
+        "interest_deduction": col(interest_deduction, np.float32),
+    }
+
+
+def _current_law_cap(filing_status: bytes, origination_year: int) -> float:
+    """Return the balance cap these tests assume: 1M through 2017, 750k after."""
+    joint_cap = 1_000_000.0 if origination_year <= 2017 else 750_000.0
+    # A status containing b"SEPARATE" gets half of the cap (500k / 375k).
+    return joint_cap / 2 if b"SEPARATE" in filing_status else joint_cap
+
+
+@pytest.mark.skipif(
+    not supports_structural_mortgage_inputs(),
+    reason="Installed policyengine-us does not yet expose structural MID inputs.",
+)
+def test_structural_mortgage_conversion_preserves_current_law_interest_deduction():
+    """6,000 of mortgage interest plus 1,000 of other interest survive conversion."""
+    converted = convert_mortgage_interest_to_structural_inputs(
+        _base_dataset_dict(
+            deductible_mortgage_interest=[6_000.0, 0.0],
+            interest_deduction=[7_000.0],
+        ),
+        2024,
+    )
+    balance = converted["first_home_mortgage_balance"][2024][0]
+    total_interest = converted["first_home_mortgage_interest"][2024][0]
+    origination_year = converted["first_home_mortgage_origination_year"][2024][0]
+    residual_interest = converted["investment_interest_expense"][2024].sum()
+
+    # The formula-level inputs must be gone after the conversion.
+    for dropped in ("deductible_mortgage_interest", "interest_deduction"):
+        assert dropped not in converted
+    assert balance > 0
+    assert total_interest >= 6_000
+    assert origination_year > 0
+    assert residual_interest == pytest.approx(1_000.0)
+
+    # Re-apply the balance cap by hand to recover the deductible portion.
+    cap = _current_law_cap(converted["filing_status"][2024][0], int(origination_year))
+    deductible_share = min(1.0, cap / balance) if balance > 0 else 0.0
+    deductible_interest = total_interest * deductible_share
+    assert deductible_interest == pytest.approx(6_000.0)
+    assert converted["home_mortgage_interest"][2024].sum() == pytest.approx(
+        total_interest
+    )
+    assert deductible_interest + residual_interest == pytest.approx(7_000.0)
+
+
+@pytest.mark.skipif(
+    not supports_structural_mortgage_inputs(),
+    reason="Installed policyengine-us does not yet expose structural MID inputs.",
+)
+def test_structural_mortgage_conversion_preserves_non_mortgage_interest():
+    """A mortgage-free 2,500 deduction lands wholly in investment interest."""
+    no_mortgage = _base_dataset_dict(
+        deductible_mortgage_interest=[0.0, 0.0],
+        interest_deduction=[2_500.0],
+    )
+    converted = convert_mortgage_interest_to_structural_inputs(no_mortgage, 2024)
+    # With a zero mortgage target, balance and interest are expected to be zero.
+    for name in ("first_home_mortgage_balance", "first_home_mortgage_interest"):
+        assert converted[name][2024][0] == 0
+    assert converted["home_mortgage_interest"][2024].sum() == 0
+    residual = converted["investment_interest_expense"][2024].sum()
+    assert residual == pytest.approx(2_500.0)
+
+
+@pytest.mark.skipif(
+    not supports_structural_mortgage_inputs(),
+    reason="Installed policyengine-us does not yet expose structural MID inputs.",
+)
+def test_structural_mortgage_conversion_keeps_balance_hints_for_non_itemizers():
+    """Balance hints alone yield balances and origination years but no interest."""
+    data = _base_dataset_dict(
+        deductible_mortgage_interest=[0.0, 0.0],
+        interest_deduction=[0.0],
+    )
+    first_hint = np.array([250_000.0], dtype=np.float32)
+    second_hint = np.array([25_000.0], dtype=np.float32)
+    data["imputed_first_home_mortgage_balance_hint"] = {2024: first_hint}
+    data["imputed_second_home_mortgage_balance_hint"] = {2024: second_hint}
+
+    converted = convert_mortgage_interest_to_structural_inputs(data, 2024)
+
+    def unit_value(name):
+        # Value for the first (and only) tax unit of a tax-unit-level output.
+        return converted[name][2024][0]
+
+    assert unit_value("first_home_mortgage_balance") == pytest.approx(250_000.0)
+    assert unit_value("second_home_mortgage_balance") == pytest.approx(25_000.0)
+    assert unit_value("first_home_mortgage_interest") == 0
+    assert unit_value("second_home_mortgage_interest") == 0
+    assert unit_value("first_home_mortgage_origination_year") > 0
+    assert unit_value("second_home_mortgage_origination_year") >= 2018
+    # The person-level interest outputs must stay empty as well.
+    assert converted["home_mortgage_interest"][2024].sum() == 0
+    assert converted["investment_interest_expense"][2024].sum() == 0
+
+
+@pytest.mark.skipif(
+    not supports_structural_mortgage_inputs(),
+    reason="Installed policyengine-us does not yet expose structural MID inputs.",
+)
+def test_scf_balance_hint_imputation_zeroes_non_mortgaged_owner(monkeypatch):
+    import microimpute.models.qrf as qrf_module
+    import policyengine_us_data.datasets.scf.scf as scf_module
+    # Stub QRF: each prediction is a fixed multiple of the owner-status predictor.
+    class DummyQRF:
+        def fit(self, *args, **kwargs):
+            return self
+
+        def predict(self, X_test):
+            return pd.DataFrame(
+                {
+                    "imputed_first_home_mortgage_balance_hint": X_test[
+                        "mortgage_owner_status"
+                    ]
+                    * 100_000,
+                    "imputed_second_home_mortgage_balance_hint": X_test[
+                        "mortgage_owner_status"
+                    ]
+                    * 10_000,
+                }
+            )
+
+    monkeypatch.setattr(qrf_module, "QRF", DummyQRF)
+    monkeypatch.setattr(
+        scf_module.SCF_2022,
+        "load_dataset",
+        lambda self: {
+            "age": np.array([45, 55]),
+            "is_female": np.array([0, 1]),
+            "cps_race": np.array([1, 2]),
+            "is_married": np.array([1, 0]),
+            "own_children_in_household": np.array([1, 0]),
+            "employment_income": np.array([80_000, 40_000]),
+            "interest_dividend_income": np.array([2_000, 1_000]),
+            "social_security_pension_income": np.array([0, 5_000]),
+            "nh_mort": np.array([250_000, 0]),
+            "heloc": np.array([25_000, 0]),
+            "houses": np.array([500_000, 350_000]),
+            "wgt": np.array([1, 1]),
+        },
+    )
+    # Two single-person tax units whose SPM-unit tenure differs (with / without mortgage).
+    data = {
+        "person_id": {2024: np.array([1, 2])},
+        "tax_unit_id": {2024: np.array([1, 2])},
+        "marital_unit_id": {2024: np.array([1, 2])},
+        "spm_unit_id": {2024: np.array([1, 2])},
+        "family_id": {2024: np.array([1, 2])},
+        "household_id": {2024: np.array([1, 2])},
+        "person_tax_unit_id": {2024: np.array([1, 2])},
+        "person_marital_unit_id": {2024: np.array([1, 2])},
+        "person_spm_unit_id": {2024: np.array([1, 2])},
+        "person_family_id": {2024: np.array([1, 2])},
+        "person_household_id": {2024: np.array([1, 2])},
+        "is_tax_unit_head": {2024: np.array([True, True])},
+        "is_tax_unit_spouse": {2024: np.array([False, False])},
+        "age": {2024: np.array([45, 55])},
+        "is_male": {2024: np.array([1, 0])},
+        "cps_race": {2024: np.array([1, 2])},
+        "employment_income": {2024: np.array([80_000, 40_000])},
+        "taxable_interest_income": {2024: np.array([1_000, 500])},
+        "tax_exempt_interest_income": {2024: np.array([0, 0])},
+        "qualified_dividend_income": {2024: np.array([500, 250])},
+        "non_qualified_dividend_income": {2024: np.array([0, 0])},
+        "social_security_retirement": {2024: np.array([0, 5_000])},
+        "taxable_private_pension_income": {2024: np.array([0, 0])},
+        "tax_exempt_private_pension_income": {2024: np.array([0, 0])},
+        "tenure_type": {
+            2024: np.array([b"OWNED_WITH_MORTGAGE", b"OWNED_WITH_MORTGAGE"])
+        },
+        "spm_unit_tenure_type": {
+            2024: np.array([b"OWNER_WITH_MORTGAGE", b"OWNER_WITHOUT_MORTGAGE"])
+        },
+    }
+
+    imputed = impute_tax_unit_mortgage_balance_hints(data, 2024)
+    # Expected: unit one keeps status 2 times the stub multiplier; unit two is zeroed.
+    assert imputed["imputed_first_home_mortgage_balance_hint"][2024].tolist() == [
+        200_000.0,
+        0.0,
+    ]
+    assert imputed["imputed_second_home_mortgage_balance_hint"][2024].tolist() == [
+        20_000.0,
+        0.0,
+    ]
diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py
new file mode 100644
index 00000000..bcf1be48
--- /dev/null
+++ b/policyengine_us_data/utils/mortgage_interest.py
@@ -0,0 +1,770 @@
+from __future__ import annotations
+
+from typing import Dict
+
+import numpy as np
+import pandas as pd
+
+# Structural MID inputs that must all exist in the installed policyengine-us.
+STRUCTURAL_MORTGAGE_VARIABLES = (
+    "first_home_mortgage_balance",
+    "second_home_mortgage_balance",
+    "first_home_mortgage_interest",
+    "second_home_mortgage_interest",
+    "first_home_mortgage_origination_year",
+    "second_home_mortgage_origination_year",
+)
+# Auxiliary tax-unit hint columns (data-layer only); also the QRF imputation targets.
+MORTGAGE_HINT_VARIABLES = (
+    "imputed_first_home_mortgage_balance_hint",
+    "imputed_second_home_mortgage_balance_hint",
+)
+# Predictor columns shared by the SCF donor frame and the tax-unit receiver frame.
+MORTGAGE_IMPUTATION_PREDICTORS = [
+    "age",
+    "is_female",
+    "cps_race",
+    "is_married",
+    "own_children_in_household",
+    "employment_income",
+    "interest_dividend_income",
+    "social_security_pension_income",
+    "mortgage_owner_status",
+]
+
+
+def supports_structural_mortgage_inputs() -> bool:
+    """Tell whether the installed policyengine-us defines every structural MID input."""
+    try:
+        from policyengine_us import CountryTaxBenefitSystem
+    except ImportError:
+        return False
+    # A single missing structural variable means the inputs are unsupported.
+    known = CountryTaxBenefitSystem().variables
+    return not any(name not in known for name in STRUCTURAL_MORTGAGE_VARIABLES)
+
+
+def impute_tax_unit_mortgage_balance_hints(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+) -> Dict[str, Dict[int, np.ndarray]]:
+    """Attach SCF-based first/second mortgage balance hints to each tax unit.
+
+    The hints are auxiliary data-layer columns, not policyengine-us inputs.
+    They let the structural MID conversion reuse an SCF-like balance
+    distribution without making the baseline use mortgage interest for
+    non-itemizers.
+
+    ``data`` is updated in place and returned. It comes back unchanged when
+    structural inputs are unsupported or when the receiver or donor frame
+    is empty.
+    """
+    if not supports_structural_mortgage_inputs():
+        return data
+
+    receiver = _build_tax_unit_mortgage_receiver(data, time_period)
+    if receiver.empty:
+        return data
+
+    # Deferred imports: only reached once the guards above have passed.
+    from microimpute.models.qrf import QRF
+    from policyengine_us_data.datasets.scf.scf import SCF_2022
+
+    scf_frame = pd.DataFrame(SCF_2022().load_dataset())
+    donor = _build_scf_mortgage_donor(scf_frame)
+    if donor.empty:
+        return data
+
+    # Fit on a seeded half-sample of the donor rows.
+    training = donor.sample(frac=0.5, random_state=42).reset_index(drop=True)
+    # fit() hands back the fitted model that predict() is then called on.
+    model = QRF().fit(
+        X_train=training,
+        predictors=MORTGAGE_IMPUTATION_PREDICTORS,
+        imputed_variables=list(MORTGAGE_HINT_VARIABLES),
+        weight_col="wgt",
+        tune_hyperparameters=False,
+    )
+    predicted = model.predict(X_test=receiver[MORTGAGE_IMPUTATION_PREDICTORS])
+
+    # Only status-2 units (owners with a mortgage) keep a hint, floored at zero.
+    mortgaged = receiver["mortgage_owner_status"].values == 2
+
+    def _kept_hint(column: str) -> np.ndarray:
+        floored = np.maximum(predicted[column].values, 0)
+        return np.where(mortgaged, floored, 0).astype(np.float32)
+
+    first = _kept_hint("imputed_first_home_mortgage_balance_hint")
+    second = _kept_hint("imputed_second_home_mortgage_balance_hint")
+
+    # A balance that exists only in the second slot is moved to the first.
+    promote = (first == 0) & (second > 0)
+    first[promote] = second[promote]
+    second[promote] = 0
+
+    # Both hints are stored under the requested period only.
+    data["imputed_first_home_mortgage_balance_hint"] = {time_period: first}
+    data["imputed_second_home_mortgage_balance_hint"] = {time_period: second}
+    return data
+
+
+def convert_mortgage_interest_to_structural_inputs(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+) -> Dict[str, Dict[int, np.ndarray]]:
+    """Replace formula-level mortgage inputs with structural mortgage data.
+
+    The current us-data calibration pipeline imputes a person-level
+    ``deductible_mortgage_interest`` and a tax-unit-level
+    ``interest_deduction``. That short-circuits structural MID reforms in
+    policyengine-us. When structural mortgage inputs are available, convert
+    those imputed amounts into:
+
+    * tax-unit mortgage balances, interest, and origination years
+    * person-level ``home_mortgage_interest`` for within-tax-unit allocation
+    * person-level ``investment_interest_expense`` for the residual non-mortgage
+      interest share of ``interest_deduction``
+
+    The conversion is intentionally conservative:
+    * current-law deductible mortgage interest is preserved exactly
+    * current-law total interest deduction is preserved exactly
+    * the origination year is heuristic, because the current public pipeline
+      does not carry a mortgage-vintage input
+    """
+    if not supports_structural_mortgage_inputs():
+        return data
+
+    tp = time_period
+    person_ids = data.get("person_id", {}).get(tp)
+    tax_unit_ids = data.get("tax_unit_id", {}).get(tp)
+    person_tax_unit_ids = data.get("person_tax_unit_id", {}).get(tp)
+    filing_status = data.get("filing_status", {}).get(tp)
+    # Without IDs and filing status nothing can be converted; return data untouched.
+    if (
+        person_ids is None
+        or tax_unit_ids is None
+        or person_tax_unit_ids is None
+        or filing_status is None
+    ):
+        return data
+    # Map each person to the positional index of their tax unit.
+    n_persons = len(person_ids)
+    n_tax_units = len(tax_unit_ids)
+    tax_unit_index = {
+        int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids)
+    }
+    person_tax_unit_idx = np.array(
+        [tax_unit_index[int(tax_unit_id)] for tax_unit_id in person_tax_unit_ids],
+        dtype=np.int32,
+    )
+    # Aggregate person-level deductible interest up to the tax unit.
+    person_deductible = _get_person_mortgage_interest_target(data, tp, n_persons)
+    tax_unit_deductible = np.zeros(n_tax_units, dtype=np.float32)
+    np.add.at(tax_unit_deductible, person_tax_unit_idx, person_deductible)
+    (
+        first_balance_hint,
+        second_balance_hint,
+    ) = _get_tax_unit_mortgage_balance_hints(data, tp, n_tax_units)
+
+    total_interest_deduction = _get_tax_unit_interest_deduction_target(
+        data,
+        tp,
+        tax_unit_deductible,
+    )
+
+    fallback_person_share = _filer_share(data, tp, person_tax_unit_idx, n_tax_units)
+    person_share = _normalize_person_share(
+        person_deductible,
+        person_tax_unit_idx,
+        n_tax_units,
+        fallback_person_share,
+    )
+
+    tax_unit_age = _tax_unit_age(data, tp, person_tax_unit_idx, n_tax_units)
+    filing_status_str = np.array(
+        [_decode_filing_status(value) for value in filing_status]
+    )
+    # Post- and pre-TCJA caps per tax unit, from helpers keyed on filing status.
+    post_cap = np.array(
+        [_post_tcja_cap(status) for status in filing_status_str],
+        dtype=np.float32,
+    )
+    pre_cap = np.array(
+        [_pre_tcja_cap(status) for status in filing_status_str],
+        dtype=np.float32,
+    )
+    # A positive hinted balance overrides the estimated balance further down.
+    has_mortgage = tax_unit_deductible > 0
+    hinted_balance = np.maximum(first_balance_hint + second_balance_hint, 0)
+    balance, origination_year = _estimate_mortgage_balance_and_year(
+        tax_unit_ids,
+        tax_unit_deductible,
+        post_cap,
+        tax_unit_age,
+        tp,
+        hinted_balance,
+    )
+    use_balance_hint = hinted_balance > 0
+    first_balance = np.where(use_balance_hint, first_balance_hint, balance).astype(
+        np.float32
+    )
+    second_balance = np.where(use_balance_hint, second_balance_hint, 0).astype(
+        np.float32
+    )
+    # Keep any lone balance in the first-mortgage slot.
+    swap_mask = (first_balance == 0) & (second_balance > 0)
+    first_balance[swap_mask] = second_balance[swap_mask]
+    second_balance[swap_mask] = 0
+    total_balance = first_balance + second_balance
+    # Origination year <= 2017 selects the pre-TCJA cap; over-cap balances deduct pro rata.
+    applicable_cap = np.where(origination_year <= 2017, pre_cap, post_cap)
+    deductible_share = np.ones(n_tax_units, dtype=np.float32)
+    capped_mask = has_mortgage & (total_balance > applicable_cap)
+    deductible_share[capped_mask] = (
+        applicable_cap[capped_mask] / total_balance[capped_mask]
+    )
+    # Gross up deductible interest so that interest * share reproduces the target.
+    total_mortgage_interest = np.zeros(n_tax_units, dtype=np.float32)
+    positive_share = has_mortgage & (deductible_share > 0)
+    total_mortgage_interest[positive_share] = (
+        tax_unit_deductible[positive_share] / deductible_share[positive_share]
+    )
+    first_interest, second_interest = _split_interest_by_balance(
+        total_mortgage_interest,
+        first_balance,
+        second_balance,
+    )
+    second_origination_year = np.where(
+        second_balance > 0,
+        np.maximum(2018, origination_year),
+        0,
+    ).astype(np.int32)
+    # Interest deduction beyond the mortgage part becomes investment interest.
+    investment_interest = np.maximum(
+        total_interest_deduction - tax_unit_deductible,
+        0,
+    ).astype(np.float32)
+    # Allocate tax-unit totals back to persons using the two share vectors.
+    person_home_mortgage_interest = (
+        total_mortgage_interest[person_tax_unit_idx] * person_share
+    ).astype(np.float32)
+    person_investment_interest = (
+        investment_interest[person_tax_unit_idx] * fallback_person_share
+    ).astype(np.float32)
+
+    data["first_home_mortgage_balance"] = {tp: first_balance}
+    data["second_home_mortgage_balance"] = {tp: second_balance}
+    data["first_home_mortgage_interest"] = {tp: first_interest}
+    data["second_home_mortgage_interest"] = {tp: second_interest}
+    data["first_home_mortgage_origination_year"] = {
+        tp: origination_year.astype(np.int32)
+    }
+    data["second_home_mortgage_origination_year"] = {tp: second_origination_year}
+    data["home_mortgage_interest"] = {tp: person_home_mortgage_interest}
+    data["investment_interest_expense"] = {tp: person_investment_interest}
+    # Drop the formula-level inputs that the structural values replace.
+    data.pop("deductible_mortgage_interest", None)
+    data.pop("interest_deduction", None)
+    return data
+
+
+def _get_person_mortgage_interest_target(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+    n_persons: int,
+) -> np.ndarray:
+    """Return person mortgage-interest targets floored at zero; zeros if absent."""
+    if "deductible_mortgage_interest" not in data:
+        return np.zeros(n_persons, dtype=np.float32)
+    # Negative imputed amounts are floored at zero.
+    raw = data["deductible_mortgage_interest"][time_period]
+    amounts = np.asarray(raw, dtype=np.float32)
+    return np.maximum(amounts, 0)
+
+
+def _get_tax_unit_interest_deduction_target(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+    tax_unit_deductible: np.ndarray,
+) -> np.ndarray:
+    """Return the interest deduction target, floored at the mortgage deductible."""
+    floor = tax_unit_deductible  # also the target when no deduction was imputed
+    raw = data.get("interest_deduction", {time_period: floor})[time_period]
+    return np.maximum(np.asarray(raw, dtype=np.float32), floor).astype(np.float32)
+
+
+def _get_tax_unit_mortgage_balance_hints(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+    n_tax_units: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Read the optional tax-unit mortgage balance hints.
+
+    Returns ``(first, second)`` float32 arrays floored at zero; a hint that is
+    absent for the variable or period becomes ``n_tax_units`` zeros.
+    """
+
+    def _hint(name: str) -> np.ndarray:
+        stored = data.get(name, {}).get(time_period, np.zeros(n_tax_units))
+        return np.maximum(np.asarray(stored, dtype=np.float32), 0)
+
+    first = _hint("imputed_first_home_mortgage_balance_hint")
+    second = _hint("imputed_second_home_mortgage_balance_hint")
+    return first, second
+
+
+def _build_tax_unit_mortgage_receiver(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+) -> pd.DataFrame:
+    """Build the per-tax-unit predictor frame; empty if IDs or head flags are missing."""
+    tax_unit_ids = data.get("tax_unit_id", {}).get(time_period)
+    person_tax_unit_ids = data.get("person_tax_unit_id", {}).get(time_period)
+    is_head = data.get("is_tax_unit_head", {}).get(time_period)
+    if tax_unit_ids is None or person_tax_unit_ids is None or is_head is None:
+        return pd.DataFrame()
+    tax_unit_ids = np.asarray(tax_unit_ids)
+    person_tax_unit_ids = np.asarray(person_tax_unit_ids)
+    is_head = np.asarray(is_head, dtype=bool)
+    n_tax_units = len(tax_unit_ids)
+    tax_unit_index = {int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids)}
+    person_tax_unit_idx = np.array(
+        [tax_unit_index[int(tax_unit_id)] for tax_unit_id in person_tax_unit_ids],
+        dtype=np.int32,
+    )
+    # head_index[i] is the person position of tax unit i's head; -1 until resolved.
+    head_index = np.full(n_tax_units, -1, dtype=np.int32)
+    head_positions = np.flatnonzero(is_head)
+    if head_positions.size > 0:
+        head_index[person_tax_unit_idx[head_positions]] = head_positions
+    # Units with no flagged head fall back to their first listed person.
+    missing_head = head_index < 0
+    if np.any(missing_head):
+        first_person = np.full(n_tax_units, -1, dtype=np.int32)
+        for person_idx, tax_unit_idx in enumerate(person_tax_unit_idx):
+            if first_person[tax_unit_idx] < 0:
+                first_person[tax_unit_idx] = person_idx
+        head_index[missing_head] = first_person[missing_head]
+
+    receiver = pd.DataFrame(
+        {
+            "tax_unit_id": tax_unit_ids,
+            "head_index": head_index,
+        }
+    )
+    head_take = head_index.clip(min=0)
+    # Demographic predictors are read from the head's person record.
+    receiver["age"] = _take_person_values(data, time_period, "age", head_take)
+    is_male = _take_person_values(data, time_period, "is_male", head_take)
+    receiver["is_female"] = (1 - is_male).astype(np.float32)
+    receiver["cps_race"] = _take_person_values(
+        data, time_period, "cps_race", head_take
+    ).astype(np.float32)
+    receiver["own_children_in_household"] = _take_person_values(
+        data, time_period, "own_children_in_household", head_take
+    )
+    receiver["mortgage_owner_status"] = _tax_unit_mortgage_owner_status(
+        data,
+        time_period,
+        head_take,
+    )
+    # Married means at least one person in the unit is flagged as a spouse.
+    spouse_count = np.zeros(n_tax_units, dtype=np.float32)
+    spouse = np.asarray(
+        data.get("is_tax_unit_spouse", {}).get(
+            time_period, np.zeros(len(person_tax_unit_idx))
+        ),
+        dtype=np.float32,
+    )
+    np.add.at(spouse_count, person_tax_unit_idx, spouse)
+    receiver["is_married"] = (spouse_count > 0).astype(np.float32)
+    # Income predictors are summed over every person in the unit.
+    receiver["employment_income"] = _sum_person_values_to_tax_unit(
+        data,
+        time_period,
+        person_tax_unit_idx,
+        n_tax_units,
+        ["employment_income"],
+    )
+    receiver["interest_dividend_income"] = _sum_person_values_to_tax_unit(
+        data,
+        time_period,
+        person_tax_unit_idx,
+        n_tax_units,
+        [
+            "taxable_interest_income",
+            "tax_exempt_interest_income",
+            "qualified_dividend_income",
+            "non_qualified_dividend_income",
+        ],
+    )
+    receiver["social_security_pension_income"] = _sum_person_values_to_tax_unit(
+        data,
+        time_period,
+        person_tax_unit_idx,
+        n_tax_units,
+        [
+            "social_security_retirement",
+            "taxable_private_pension_income",
+            "tax_exempt_private_pension_income",
+        ],
+    )
+    return receiver[MORTGAGE_IMPUTATION_PREDICTORS]
+
+
+def _build_scf_mortgage_donor(scf: pd.DataFrame) -> pd.DataFrame:
+    """Assemble the SCF donor frame used to fit the balance-hint model.
+
+    The result holds the imputation predictors, the two balance-hint targets
+    and the ``wgt`` weight column; rows with any missing value are dropped.
+    ``scf`` itself is only read.
+    """
+    donor = pd.DataFrame()
+    # Predictors taken as-is from the SCF (zero-filled if a column is absent).
+    for passthrough in (
+        "age",
+        "is_female",
+        "cps_race",
+        "is_married",
+        "own_children_in_household",
+        "employment_income",
+        "interest_dividend_income",
+        "social_security_pension_income",
+    ):
+        donor[passthrough] = _frame_column(scf, passthrough)
+
+    def _raw(column: str) -> np.ndarray:
+        # A missing column degrades to the scalar 0, which broadcasts below.
+        return np.asarray(scf.get(column, 0), dtype=np.float32)
+
+    # Mortgage debt is the larger of the two SCF fields; the HELOC part is
+    # clipped to [0, total] so the two hints never exceed the total.
+    total_debt = np.maximum(_raw("nh_mort"), _raw("mortgage_debt"))
+    heloc_debt = np.minimum(np.maximum(_raw("heloc"), 0), total_debt)
+    is_owner = _raw("houses") > 0
+
+    # Status codes: 2 = has mortgage debt, 1 = home owner without it, 0 = neither.
+    status = np.where(total_debt > 0, 2, np.where(is_owner, 1, 0))
+    donor["mortgage_owner_status"] = status.astype(np.float32)
+    non_heloc = np.maximum(total_debt - heloc_debt, 0)
+    donor["imputed_first_home_mortgage_balance_hint"] = non_heloc.astype(np.float32)
+    donor["imputed_second_home_mortgage_balance_hint"] = heloc_debt.astype(np.float32)
+    donor["wgt"] = _frame_column(scf, "wgt", default=1)
+
+    # Fix the column order, then drop incomplete donor rows.
+    columns = MORTGAGE_IMPUTATION_PREDICTORS + list(MORTGAGE_HINT_VARIABLES) + ["wgt"]
+    return donor[columns].dropna()
+
+
+def _take_person_values(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+    variable: str,
+    head_take: np.ndarray,
+) -> np.ndarray:
+    """Return float32 ``variable`` values at the person positions in ``head_take``."""
+    values = data.get(variable, {}).get(time_period)
+    # Absent or empty input: return zeros directly. Indexing a default array sized
+    # by head_take with person positions could run out of bounds.
+    if values is None or np.size(values) == 0:
+        return np.zeros(head_take.size, dtype=np.float32)
+    return np.asarray(values, dtype=np.float32)[head_take]
+
+
+def _frame_column(frame: pd.DataFrame, column: str, default: float = 0) -> np.ndarray:
+    """Return ``frame[column]`` as a float32 array.
+
+    When the column is absent, return ``len(frame)`` copies of ``default`` instead.
+    """
+    if column not in frame:
+        return np.full(len(frame), default, dtype=np.float32)
+    return np.asarray(frame[column], dtype=np.float32)
+
+
+def _sum_person_values_to_tax_unit(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+    person_tax_unit_idx: np.ndarray,
+    n_tax_units: int,
+    variables: list[str],
+) -> np.ndarray:
+    """Sum the listed person-level variables per tax unit, skipping absent ones."""
+    totals = np.zeros(n_tax_units, dtype=np.float32)
+    for name in (v for v in variables if v in data):
+        amounts = np.asarray(data[name][time_period], dtype=np.float32)
+        # Unbuffered add so several persons in one tax unit all accumulate.
+        np.add.at(totals, person_tax_unit_idx, amounts)
+    return totals
+
+
+def _tax_unit_mortgage_owner_status(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+    head_take: np.ndarray,
+) -> np.ndarray:
+    household_status = np.zeros(head_take.size, dtype=np.float32)
+    household_tenure = data.get("tenure_type", {}).get(time_period)
+    person_household_id = data.get("person_household_id", {}).get(time_period)
+    household_ids = data.get("household_id", {}).get(time_period)
+    if (
+        household_tenure is not None
+        and person_household_id is not None
+        and household_ids is not None
+    ):
+        household_map = {
+            int(household_id): _decode_owner_status(value)
+            for household_id, value in zip(household_ids, household_tenure)
+        }
+        household_status = np.array(
+            [
+                household_map.get(int(household_id), 0)
+                for household_id in np.asarray(person_household_id)[head_take]
+            ],
+            dtype=np.float32,
+        )
+
+    spm_status = np.zeros(head_take.size, dtype=np.float32)
+    spm_tenure = data.get("spm_unit_tenure_type", {}).get(time_period)
+    person_spm_unit_id = data.get("person_spm_unit_id", {}).get(time_period)
+    spm_unit_ids = data.get("spm_unit_id", {}).get(time_period)
+    if spm_tenure is not None and person_spm_unit_id is not None and spm_unit_ids is not None:
+        spm_map = {
+            int(spm_unit_id): _decode_owner_status(value)
+            for spm_unit_id, value in zip(spm_unit_ids, spm_tenure)
+        }
+        spm_status = np.array(
+            [
+                spm_map.get(int(spm_unit_id), 0)
+                for spm_unit_id in np.asarray(person_spm_unit_id)[head_take]
+            ],
+            dtype=np.float32,
+        )
+
+    return np.where(spm_status > 0, spm_status, household_status).astype(np.float32)
+
+
+def _decode_owner_status(value) -> int:
+    # Tenure label -> 2 (owner with mortgage), 1 (owner without), 0 (otherwise).
+    label = (value.decode("utf-8") if isinstance(value, bytes) else str(value)).upper()
+    if "OWNER_WITH_MORTGAGE" in label or "OWNED_WITH_MORTGAGE" in label:
+        return 2
+    # OWNED_OUTRIGHT: presumed household-tenure spelling of this case - TODO confirm.
+    if "OWNER_WITHOUT_MORTGAGE" in label or "OWNED_OUTRIGHT" in label:
+        return 1
+    return 0
+
+
+def _filer_share(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+    person_tax_unit_idx: np.ndarray,
+    n_tax_units: int,
+) -> np.ndarray:
+    is_head = np.asarray(
+        data.get("is_tax_unit_head", {}).get(
+            time_period, np.zeros(len(person_tax_unit_idx), dtype=bool)
+        ),
+        dtype=bool,
+    )
+    is_spouse = np.asarray(
+        data.get("is_tax_unit_spouse", {}).get(
+            time_period, np.zeros(len(person_tax_unit_idx), dtype=bool)
+        ),
+        dtype=bool,
+    )
+    filer_mask = (is_head | is_spouse).astype(np.float32)
+    filer_count = np.zeros(n_tax_units, dtype=np.float32)
+    np.add.at(filer_count, person_tax_unit_idx, filer_mask)
+
+    share = np.zeros(len(person_tax_unit_idx), dtype=np.float32)
+    positive_filers = filer_count[person_tax_unit_idx] > 0
+    share[positive_filers] = (
+        filer_mask[positive_filers] / filer_count[person_tax_unit_idx][positive_filers]
+    )
+
+    no_filer_mask = filer_count[person_tax_unit_idx] == 0
+    if np.any(no_filer_mask):
+        share[no_filer_mask] = _equal_person_share(
+            person_tax_unit_idx[no_filer_mask],
+            n_tax_units,
+        )
+
+    return share
+
+
+def _normalize_person_share(
+    person_values: np.ndarray,
+    person_tax_unit_idx: np.ndarray,
+    n_tax_units: int,
+    fallback_share: np.ndarray,
+) -> np.ndarray:
+    tax_unit_totals = np.zeros(n_tax_units, dtype=np.float32)
+    np.add.at(tax_unit_totals, person_tax_unit_idx, person_values)
+    share = np.zeros_like(person_values, dtype=np.float32)
+    positive = tax_unit_totals[person_tax_unit_idx] > 0
+    share[positive] = (
+        person_values[positive] / tax_unit_totals[person_tax_unit_idx][positive]
+    )
+    share[~positive] = fallback_share[~positive]
+    return share
+
+
+def _equal_person_share(
+    person_tax_unit_idx: np.ndarray,
+    n_tax_units: int,
+) -> np.ndarray:
+    counts = np.zeros(n_tax_units, dtype=np.float32)
+    np.add.at(counts, person_tax_unit_idx, 1)
+    return (1 / counts[person_tax_unit_idx]).astype(np.float32)
+
+
+def _tax_unit_age(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+    person_tax_unit_idx: np.ndarray,
+    n_tax_units: int,
+) -> np.ndarray:
+    ages = np.asarray(
+        data.get("age", {}).get(time_period, np.zeros(len(person_tax_unit_idx))),
+        dtype=np.float32,
+    )
+    is_head = np.asarray(
+        data.get("is_tax_unit_head", {}).get(
+            time_period, np.zeros(len(person_tax_unit_idx), dtype=bool)
+        ),
+        dtype=bool,
+    )
+    is_spouse = np.asarray(
+        data.get("is_tax_unit_spouse", {}).get(
+            time_period, np.zeros(len(person_tax_unit_idx), dtype=bool)
+        ),
+        dtype=bool,
+    )
+    filer_ages = np.where(is_head | is_spouse, ages, 0)
+    tax_unit_age = np.zeros(n_tax_units, dtype=np.float32)
+    np.maximum.at(tax_unit_age, person_tax_unit_idx, filer_ages)
+
+    missing_age = tax_unit_age == 0
+    if np.any(missing_age):
+        any_age = np.zeros(n_tax_units, dtype=np.float32)
+        np.maximum.at(any_age, person_tax_unit_idx, ages)
+        tax_unit_age[missing_age] = any_age[missing_age]
+
+    tax_unit_age[missing_age & (tax_unit_age == 0)] = 45
+    return tax_unit_age
+
+
+def _estimate_mortgage_balance_and_year(
+    tax_unit_ids: np.ndarray,
+    deductible_mortgage_interest: np.ndarray,
+    post_cap: np.ndarray,
+    tax_unit_age: np.ndarray,
+    time_period: int,
+    hinted_balance: np.ndarray,
+) -> tuple[np.ndarray, np.ndarray]:
+    balance = np.zeros_like(deductible_mortgage_interest, dtype=np.float32)
+    year = np.zeros_like(deductible_mortgage_interest, dtype=np.int32)
+    has_mortgage = (deductible_mortgage_interest > 0) | (hinted_balance > 0)
+    if not np.any(has_mortgage):
+        return balance, year
+
+    older_draw = _stable_uniform(tax_unit_ids, salt=17)
+    year_draw = _stable_uniform(tax_unit_ids, salt=31)
+
+    pre_probability = np.clip(
+        0.10 + 0.012 * np.maximum(tax_unit_age - 30, 0),
+        0.10,
+        0.85,
+    )
+
+    provisional_rate = 0.045
+    provisional_balance = np.where(
+        hinted_balance > 0,
+        hinted_balance,
+        deductible_mortgage_interest / provisional_rate,
+    )
+    pre_probability += 0.20 * (provisional_balance > post_cap)
+    pre_probability = np.clip(pre_probability, 0.10, 0.90)
+
+    if time_period <= 2017:
+        is_pre_tcja = has_mortgage
+    else:
+        is_pre_tcja = has_mortgage & (older_draw < pre_probability)
+
+    pre_span = 13  # 2005-2017 inclusive
+    year[is_pre_tcja] = 2005 + np.floor(year_draw[is_pre_tcja] * pre_span).astype(
+        np.int32
+    )
+
+    post_mask = has_mortgage & ~is_pre_tcja
+    post_start = 2018 if time_period >= 2018 else time_period
+    post_span = max(1, time_period - post_start + 1)
+    year[post_mask] = post_start + np.floor(year_draw[post_mask] * post_span).astype(
+        np.int32
+    )
+
+    rate = _mortgage_rate(year)
+    balance[has_mortgage] = np.where(
+        hinted_balance[has_mortgage] > 0,
+        hinted_balance[has_mortgage],
+        deductible_mortgage_interest[has_mortgage] / rate[has_mortgage],
+    )
+    return balance, year
+
+
+def _split_interest_by_balance(
+    total_interest: np.ndarray,
+    first_balance: np.ndarray,
+    second_balance: np.ndarray,
+) -> tuple[np.ndarray, np.ndarray]:
+    total_balance = first_balance + second_balance
+    first_interest = np.zeros_like(total_interest, dtype=np.float32)
+    second_interest = np.zeros_like(total_interest, dtype=np.float32)
+
+    with_second = total_balance > 0
+    first_interest[with_second] = (
+        total_interest[with_second] * first_balance[with_second] / total_balance[with_second]
+    )
+    second_interest[with_second] = total_interest[with_second] - first_interest[with_second]
+
+    no_second = second_balance == 0
+    first_interest[no_second] = total_interest[no_second]
+    second_interest[no_second] = 0
+    return first_interest.astype(np.float32), second_interest.astype(np.float32)
+
+
+def _mortgage_rate(origination_year: np.ndarray) -> np.ndarray:
+    year = np.asarray(origination_year, dtype=np.int32)
+    rate = np.full(year.shape, 0.045, dtype=np.float32)
+    rate[year <= 2017] = 0.040
+    rate[(year >= 2018) & (year <= 2019)] = 0.045
+    rate[(year >= 2020) & (year <= 2021)] = 0.035
+    rate[year == 2022] = 0.0525
+    rate[year >= 2023] = 0.0675
+    return rate
+
+
+def _stable_uniform(ids: np.ndarray, salt: int) -> np.ndarray:
+    values = np.asarray(ids, dtype=np.uint64)
+    hashed = values * np.uint64(1_103_515_245 + salt) + np.uint64(12_345 + salt)
+    draw = (hashed % np.uint64(2**31)).astype(np.float64) / float(2**31)
+    # Draws within 2**-25 of 1 round up to 1.0 in float32; clamp to stay in [0, 1).
+    return np.minimum(draw.astype(np.float32), np.float32(1 - 2**-24))
+
+
+def _decode_filing_status(value) -> str:
+    if isinstance(value, bytes):
+        return value.decode("utf-8").upper()
+    return str(value).upper()
+
+
+def _post_tcja_cap(status: str) -> float:
+    if "SEPARATE" in status:
+        return 375_000.0
+    return 750_000.0
+
+
+def _pre_tcja_cap(status: str) -> float:
+    if "SEPARATE" in status:
+        return 500_000.0
+    return 1_000_000.0

From 9f7ad45983f0007626a663c95161b2cf831acf19 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Tue, 24 Mar 2026 05:53:11 -0400
Subject: [PATCH 2/5] Format structural mortgage support files

---
 policyengine_us_data/datasets/puf/puf.py      |  5 +++-
 .../test_mortgage_interest.py                 |  8 ++----
 .../utils/mortgage_interest.py                | 26 ++++++++++++-------
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index 9a0e8a22..9747df41 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -646,7 +646,10 @@ def generate(self):
                 self.holder[key] = np.array(self.holder[key]).astype(float)
                 assert not np.isnan(self.holder[key]).any(), f"{key} has NaNs."
 
-        holder_tp = {variable: {self.time_period: values} for variable, values in self.holder.items()}
+        holder_tp = {
+            variable: {self.time_period: values}
+            for variable, values in self.holder.items()
+        }
         holder_tp = convert_mortgage_interest_to_structural_inputs(
             holder_tp,
             self.time_period,
diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
index 94aed99e..82a42d1f 100644
--- a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
+++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
@@ -117,12 +117,8 @@ def test_structural_mortgage_conversion_keeps_balance_hints_for_non_itemizers():
 
     converted = convert_mortgage_interest_to_structural_inputs(data, 2024)
 
-    assert converted["first_home_mortgage_balance"][2024][0] == pytest.approx(
-        250_000.0
-    )
-    assert converted["second_home_mortgage_balance"][2024][0] == pytest.approx(
-        25_000.0
-    )
+    assert converted["first_home_mortgage_balance"][2024][0] == pytest.approx(250_000.0)
+    assert converted["second_home_mortgage_balance"][2024][0] == pytest.approx(25_000.0)
     assert converted["first_home_mortgage_interest"][2024][0] == 0
     assert converted["second_home_mortgage_interest"][2024][0] == 0
     assert converted["first_home_mortgage_origination_year"][2024][0] > 0
diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py
index bcf1be48..f49530bf 100644
--- a/policyengine_us_data/utils/mortgage_interest.py
+++ b/policyengine_us_data/utils/mortgage_interest.py
@@ -327,7 +327,9 @@ def _build_tax_unit_mortgage_receiver(
     person_tax_unit_ids = np.asarray(person_tax_unit_ids)
     is_head = np.asarray(is_head, dtype=bool)
     n_tax_units = len(tax_unit_ids)
-    tax_unit_index = {int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids)}
+    tax_unit_index = {
+        int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids)
+    }
     person_tax_unit_idx = np.array(
         [tax_unit_index[int(tax_unit_id)] for tax_unit_id in person_tax_unit_ids],
         dtype=np.int32,
@@ -418,13 +420,9 @@ def _build_scf_mortgage_donor(scf: pd.DataFrame) -> pd.DataFrame:
     donor["is_female"] = _frame_column(scf, "is_female")
     donor["cps_race"] = _frame_column(scf, "cps_race")
     donor["is_married"] = _frame_column(scf, "is_married")
-    donor["own_children_in_household"] = _frame_column(
-        scf, "own_children_in_household"
-    )
+    donor["own_children_in_household"] = _frame_column(scf, "own_children_in_household")
     donor["employment_income"] = _frame_column(scf, "employment_income")
-    donor["interest_dividend_income"] = _frame_column(
-        scf, "interest_dividend_income"
-    )
+    donor["interest_dividend_income"] = _frame_column(scf, "interest_dividend_income")
     donor["social_security_pension_income"] = _frame_column(
         scf, "social_security_pension_income"
     )
@@ -527,7 +525,11 @@ def _tax_unit_mortgage_owner_status(
     spm_tenure = data.get("spm_unit_tenure_type", {}).get(time_period)
     person_spm_unit_id = data.get("person_spm_unit_id", {}).get(time_period)
     spm_unit_ids = data.get("spm_unit_id", {}).get(time_period)
-    if spm_tenure is not None and person_spm_unit_id is not None and spm_unit_ids is not None:
+    if (
+        spm_tenure is not None
+        and person_spm_unit_id is not None
+        and spm_unit_ids is not None
+    ):
         spm_map = {
             int(spm_unit_id): _decode_owner_status(value)
             for spm_unit_id, value in zip(spm_unit_ids, spm_tenure)
@@ -723,9 +725,13 @@ def _split_interest_by_balance(
 
     with_second = total_balance > 0
     first_interest[with_second] = (
-        total_interest[with_second] * first_balance[with_second] / total_balance[with_second]
+        total_interest[with_second]
+        * first_balance[with_second]
+        / total_balance[with_second]
+    )
+    second_interest[with_second] = (
+        total_interest[with_second] - first_interest[with_second]
     )
-    second_interest[with_second] = total_interest[with_second] - first_interest[with_second]
 
     no_second = second_balance == 0
     first_interest[no_second] = total_interest[no_second]

From 32254ddc63085d8132da409f53df92a2b6b867f5 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 25 Mar 2026 21:43:50 -0400
Subject: [PATCH 3/5] Refactor structural mortgage input compatibility

---
 .../datasets/cps/extended_cps.py              |  27 +-
 policyengine_us_data/datasets/puf/puf.py      |  11 +-
 .../test_mortgage_interest.py                 | 268 +++++++++++-------
 .../utils/mortgage_interest.py                |  20 +-
 policyengine_us_data/utils/policyengine.py    |  17 ++
 5 files changed, 206 insertions(+), 137 deletions(-)
 create mode 100644 policyengine_us_data/utils/policyengine.py

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index 288e3e46..e0871736 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -10,10 +10,11 @@
 from policyengine_us_data.datasets.puf import PUF, PUF_2024
 from policyengine_us_data.storage import STORAGE_FOLDER
 from policyengine_us_data.utils.mortgage_interest import (
+    STRUCTURAL_MORTGAGE_VARIABLES,
     convert_mortgage_interest_to_structural_inputs,
     impute_tax_unit_mortgage_balance_hints,
-    supports_structural_mortgage_inputs,
 )
+from policyengine_us_data.utils.policyengine import has_policyengine_us_variables
 from policyengine_us_data.utils.retirement_limits import (
     get_retirement_limits,
     get_se_pension_limits,
@@ -21,6 +22,11 @@
 
 logger = logging.getLogger(__name__)
 
+
+def _supports_structural_mortgage_inputs() -> bool:
+    return has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES)
+
+
 # CPS-only variables that should be QRF-imputed for the PUF clone half
 # instead of naively duplicated from the CPS donor. These are
 # income-correlated variables that exist only in the CPS; demographics,
@@ -450,14 +456,15 @@ def generate(self):
         )
 
         new_data = self._rename_imputed_to_inputs(new_data)
-        new_data = impute_tax_unit_mortgage_balance_hints(
-            new_data,
-            self.time_period,
-        )
-        new_data = convert_mortgage_interest_to_structural_inputs(
-            new_data,
-            self.time_period,
-        )
+        if _supports_structural_mortgage_inputs():
+            new_data = impute_tax_unit_mortgage_balance_hints(
+                new_data,
+                self.time_period,
+            )
+            new_data = convert_mortgage_interest_to_structural_inputs(
+                new_data,
+                self.time_period,
+            )
         new_data = self._drop_formula_variables(new_data)
         self.save_dataset(new_data)
 
@@ -492,7 +499,7 @@ def _rename_imputed_to_inputs(cls, data):
     @classmethod
     def _keep_formula_vars(cls):
         keep = set(cls._KEEP_FORMULA_VARS)
-        if not supports_structural_mortgage_inputs():
+        if not _supports_structural_mortgage_inputs():
             keep.add("interest_deduction")
         return keep
 
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index 9747df41..bde0f33f 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -13,8 +13,10 @@
     disaggregate_aggregate_records,
 )
 from policyengine_us_data.utils.mortgage_interest import (
+    STRUCTURAL_MORTGAGE_VARIABLES,
     convert_mortgage_interest_to_structural_inputs,
 )
+from policyengine_us_data.utils.policyengine import has_policyengine_us_variables
 from policyengine_us_data.utils.uprating import (
     create_policyengine_uprating_factors_table,
 )
@@ -650,10 +652,11 @@ def generate(self):
             variable: {self.time_period: values}
             for variable, values in self.holder.items()
         }
-        holder_tp = convert_mortgage_interest_to_structural_inputs(
-            holder_tp,
-            self.time_period,
-        )
+        if has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES):
+            holder_tp = convert_mortgage_interest_to_structural_inputs(
+                holder_tp,
+                self.time_period,
+            )
         self.holder = {
             variable: values[self.time_period] for variable, values in holder_tp.items()
         }
diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
index 82a42d1f..bbcfea45 100644
--- a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
+++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
@@ -3,36 +3,103 @@
 import pytest
 
 from policyengine_us_data.utils.mortgage_interest import (
+    STRUCTURAL_MORTGAGE_VARIABLES,
     convert_mortgage_interest_to_structural_inputs,
     impute_tax_unit_mortgage_balance_hints,
-    supports_structural_mortgage_inputs,
 )
+from policyengine_us_data.utils.policyengine import has_policyengine_us_variables
 
+TIME_PERIOD = 2024
+HAS_STRUCTURAL_MORTGAGE_INPUTS = has_policyengine_us_variables(
+    *STRUCTURAL_MORTGAGE_VARIABLES
+)
+
+
+def _at_time_period(values, dtype=None):
+    return {TIME_PERIOD: np.array(values, dtype=dtype)}
+
+
+def _time_period_variables(**variables):
+    return {name: _at_time_period(values) for name, values in variables.items()}
+
+
+def _head_and_spouse_flags(person_tax_unit_ids):
+    first_seen = {}
+    heads = np.zeros(len(person_tax_unit_ids), dtype=bool)
+    spouses = np.zeros(len(person_tax_unit_ids), dtype=bool)
+
+    for idx, tax_unit_id in enumerate(person_tax_unit_ids):
+        occurrence = first_seen.get(int(tax_unit_id), 0)
+        if occurrence == 0:
+            heads[idx] = True
+        elif occurrence == 1:
+            spouses[idx] = True
+        first_seen[int(tax_unit_id)] = occurrence + 1
+
+    return heads, spouses
+
+
+def _base_dataset_dict(
+    *,
+    person_tax_unit_ids,
+    ages,
+    deductible_mortgage_interest=None,
+    interest_deduction=None,
+    filing_status=None,
+):
+    person_tax_unit_ids = np.array(person_tax_unit_ids, dtype=np.int32)
+    tax_unit_ids = np.unique(person_tax_unit_ids)
+    n_people = len(person_tax_unit_ids)
+    person_ids = np.arange(1, n_people + 1, dtype=np.int32)
+    heads, spouses = _head_and_spouse_flags(person_tax_unit_ids)
+
+    data = {
+        "person_id": _at_time_period(person_ids),
+        "tax_unit_id": _at_time_period(tax_unit_ids),
+        "marital_unit_id": _at_time_period(tax_unit_ids),
+        "spm_unit_id": _at_time_period(tax_unit_ids),
+        "family_id": _at_time_period(tax_unit_ids),
+        "household_id": _at_time_period(tax_unit_ids),
+        "person_tax_unit_id": _at_time_period(person_tax_unit_ids),
+        "person_marital_unit_id": _at_time_period(person_tax_unit_ids),
+        "person_spm_unit_id": _at_time_period(person_tax_unit_ids),
+        "person_family_id": _at_time_period(person_tax_unit_ids),
+        "person_household_id": _at_time_period(person_tax_unit_ids),
+        "is_tax_unit_head": _at_time_period(heads),
+        "is_tax_unit_spouse": _at_time_period(spouses),
+        "age": _at_time_period(ages),
+    }
+
+    if filing_status is not None:
+        data["filing_status"] = _at_time_period(filing_status)
+    if deductible_mortgage_interest is not None:
+        data["deductible_mortgage_interest"] = _at_time_period(
+            deductible_mortgage_interest,
+            dtype=np.float32,
+        )
+    if interest_deduction is not None:
+        data["interest_deduction"] = _at_time_period(
+            interest_deduction,
+            dtype=np.float32,
+        )
 
-def _base_dataset_dict(deductible_mortgage_interest, interest_deduction):
-    time_period = 2024
+    return data
+
+
+def _mock_scf_dataset():
     return {
-        "person_id": {time_period: np.array([1, 2])},
-        "tax_unit_id": {time_period: np.array([1])},
-        "marital_unit_id": {time_period: np.array([1])},
-        "spm_unit_id": {time_period: np.array([1])},
-        "family_id": {time_period: np.array([1])},
-        "household_id": {time_period: np.array([1])},
-        "person_tax_unit_id": {time_period: np.array([1, 1])},
-        "person_marital_unit_id": {time_period: np.array([1, 1])},
-        "person_spm_unit_id": {time_period: np.array([1, 1])},
-        "person_family_id": {time_period: np.array([1, 1])},
-        "person_household_id": {time_period: np.array([1, 1])},
-        "is_tax_unit_head": {time_period: np.array([True, False])},
-        "is_tax_unit_spouse": {time_period: np.array([False, True])},
-        "age": {time_period: np.array([55, 53])},
-        "filing_status": {time_period: np.array([b"JOINT"])},
-        "deductible_mortgage_interest": {
-            time_period: np.array(deductible_mortgage_interest, dtype=np.float32)
-        },
-        "interest_deduction": {
-            time_period: np.array(interest_deduction, dtype=np.float32)
-        },
+        "age": np.array([45, 55]),
+        "is_female": np.array([0, 1]),
+        "cps_race": np.array([1, 2]),
+        "is_married": np.array([1, 0]),
+        "own_children_in_household": np.array([1, 0]),
+        "employment_income": np.array([80_000, 40_000]),
+        "interest_dividend_income": np.array([2_000, 1_000]),
+        "social_security_pension_income": np.array([0, 5_000]),
+        "nh_mort": np.array([250_000, 0]),
+        "heloc": np.array([25_000, 0]),
+        "houses": np.array([500_000, 350_000]),
+        "wgt": np.array([1, 1]),
     }
 
 
@@ -44,91 +111,104 @@ def _current_law_cap(filing_status: bytes, origination_year: int) -> float:
 
 
 @pytest.mark.skipif(
-    not supports_structural_mortgage_inputs(),
+    not HAS_STRUCTURAL_MORTGAGE_INPUTS,
     reason="Installed policyengine-us does not yet expose structural MID inputs.",
 )
 def test_structural_mortgage_conversion_preserves_current_law_interest_deduction():
     data = _base_dataset_dict(
+        person_tax_unit_ids=[1, 1],
+        ages=[55, 53],
         deductible_mortgage_interest=[6_000.0, 0.0],
         interest_deduction=[7_000.0],
+        filing_status=[b"JOINT"],
     )
-    converted = convert_mortgage_interest_to_structural_inputs(data, 2024)
+    converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD)
 
     assert "deductible_mortgage_interest" not in converted
     assert "interest_deduction" not in converted
-    assert converted["first_home_mortgage_balance"][2024][0] > 0
-    assert converted["first_home_mortgage_interest"][2024][0] >= 6_000
-    assert converted["first_home_mortgage_origination_year"][2024][0] > 0
-    assert converted["investment_interest_expense"][2024].sum() == pytest.approx(
+    assert converted["first_home_mortgage_balance"][TIME_PERIOD][0] > 0
+    assert converted["first_home_mortgage_interest"][TIME_PERIOD][0] >= 6_000
+    assert converted["first_home_mortgage_origination_year"][TIME_PERIOD][0] > 0
+    assert converted["investment_interest_expense"][TIME_PERIOD].sum() == pytest.approx(
         1_000.0
     )
     cap = _current_law_cap(
-        converted["filing_status"][2024][0],
-        int(converted["first_home_mortgage_origination_year"][2024][0]),
+        converted["filing_status"][TIME_PERIOD][0],
+        int(converted["first_home_mortgage_origination_year"][TIME_PERIOD][0]),
     )
-    balance = converted["first_home_mortgage_balance"][2024][0]
-    total_interest = converted["first_home_mortgage_interest"][2024][0]
+    balance = converted["first_home_mortgage_balance"][TIME_PERIOD][0]
+    total_interest = converted["first_home_mortgage_interest"][TIME_PERIOD][0]
     deductible_share = min(1.0, cap / balance) if balance > 0 else 0.0
 
     assert total_interest * deductible_share == pytest.approx(6_000.0)
-    assert converted["home_mortgage_interest"][2024].sum() == pytest.approx(
+    assert converted["home_mortgage_interest"][TIME_PERIOD].sum() == pytest.approx(
         total_interest
     )
     assert (
         total_interest * deductible_share
-        + converted["investment_interest_expense"][2024].sum()
+        + converted["investment_interest_expense"][TIME_PERIOD].sum()
     ) == pytest.approx(7_000.0)
 
 
 @pytest.mark.skipif(
-    not supports_structural_mortgage_inputs(),
+    not HAS_STRUCTURAL_MORTGAGE_INPUTS,
     reason="Installed policyengine-us does not yet expose structural MID inputs.",
 )
 def test_structural_mortgage_conversion_preserves_non_mortgage_interest():
     data = _base_dataset_dict(
+        person_tax_unit_ids=[1, 1],
+        ages=[55, 53],
         deductible_mortgage_interest=[0.0, 0.0],
         interest_deduction=[2_500.0],
+        filing_status=[b"JOINT"],
     )
-    converted = convert_mortgage_interest_to_structural_inputs(data, 2024)
+    converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD)
 
-    assert converted["first_home_mortgage_balance"][2024][0] == 0
-    assert converted["first_home_mortgage_interest"][2024][0] == 0
-    assert converted["home_mortgage_interest"][2024].sum() == 0
-    assert converted["investment_interest_expense"][2024].sum() == pytest.approx(
+    assert converted["first_home_mortgage_balance"][TIME_PERIOD][0] == 0
+    assert converted["first_home_mortgage_interest"][TIME_PERIOD][0] == 0
+    assert converted["home_mortgage_interest"][TIME_PERIOD].sum() == 0
+    assert converted["investment_interest_expense"][TIME_PERIOD].sum() == pytest.approx(
         2_500.0
     )
 
 
 @pytest.mark.skipif(
-    not supports_structural_mortgage_inputs(),
+    not HAS_STRUCTURAL_MORTGAGE_INPUTS,
     reason="Installed policyengine-us does not yet expose structural MID inputs.",
 )
 def test_structural_mortgage_conversion_keeps_balance_hints_for_non_itemizers():
     data = _base_dataset_dict(
+        person_tax_unit_ids=[1, 1],
+        ages=[55, 53],
         deductible_mortgage_interest=[0.0, 0.0],
         interest_deduction=[0.0],
+        filing_status=[b"JOINT"],
     )
     data["imputed_first_home_mortgage_balance_hint"] = {
-        2024: np.array([250_000.0], dtype=np.float32)
+        TIME_PERIOD: np.array([250_000.0], dtype=np.float32)
     }
     data["imputed_second_home_mortgage_balance_hint"] = {
-        2024: np.array([25_000.0], dtype=np.float32)
+        TIME_PERIOD: np.array([25_000.0], dtype=np.float32)
     }
 
-    converted = convert_mortgage_interest_to_structural_inputs(data, 2024)
+    converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD)
 
-    assert converted["first_home_mortgage_balance"][2024][0] == pytest.approx(250_000.0)
-    assert converted["second_home_mortgage_balance"][2024][0] == pytest.approx(25_000.0)
-    assert converted["first_home_mortgage_interest"][2024][0] == 0
-    assert converted["second_home_mortgage_interest"][2024][0] == 0
-    assert converted["first_home_mortgage_origination_year"][2024][0] > 0
-    assert converted["second_home_mortgage_origination_year"][2024][0] >= 2018
-    assert converted["home_mortgage_interest"][2024].sum() == 0
-    assert converted["investment_interest_expense"][2024].sum() == 0
+    assert converted["first_home_mortgage_balance"][TIME_PERIOD][0] == pytest.approx(
+        250_000.0
+    )
+    assert converted["second_home_mortgage_balance"][TIME_PERIOD][0] == pytest.approx(
+        25_000.0
+    )
+    assert converted["first_home_mortgage_interest"][TIME_PERIOD][0] == 0
+    assert converted["second_home_mortgage_interest"][TIME_PERIOD][0] == 0
+    assert converted["first_home_mortgage_origination_year"][TIME_PERIOD][0] > 0
+    assert converted["second_home_mortgage_origination_year"][TIME_PERIOD][0] >= 2018
+    assert converted["home_mortgage_interest"][TIME_PERIOD].sum() == 0
+    assert converted["investment_interest_expense"][TIME_PERIOD].sum() == 0
 
 
 @pytest.mark.skipif(
-    not supports_structural_mortgage_inputs(),
+    not HAS_STRUCTURAL_MORTGAGE_INPUTS,
     reason="Installed policyengine-us does not yet expose structural MID inputs.",
 )
 def test_scf_balance_hint_imputation_zeroes_non_mortgaged_owner(monkeypatch):
@@ -157,62 +237,42 @@ def predict(self, X_test):
     monkeypatch.setattr(
         scf_module.SCF_2022,
         "load_dataset",
-        lambda self: {
-            "age": np.array([45, 55]),
-            "is_female": np.array([0, 1]),
-            "cps_race": np.array([1, 2]),
-            "is_married": np.array([1, 0]),
-            "own_children_in_household": np.array([1, 0]),
-            "employment_income": np.array([80_000, 40_000]),
-            "interest_dividend_income": np.array([2_000, 1_000]),
-            "social_security_pension_income": np.array([0, 5_000]),
-            "nh_mort": np.array([250_000, 0]),
-            "heloc": np.array([25_000, 0]),
-            "houses": np.array([500_000, 350_000]),
-            "wgt": np.array([1, 1]),
-        },
+        lambda self: _mock_scf_dataset(),
     )
 
-    data = {
-        "person_id": {2024: np.array([1, 2])},
-        "tax_unit_id": {2024: np.array([1, 2])},
-        "marital_unit_id": {2024: np.array([1, 2])},
-        "spm_unit_id": {2024: np.array([1, 2])},
-        "family_id": {2024: np.array([1, 2])},
-        "household_id": {2024: np.array([1, 2])},
-        "person_tax_unit_id": {2024: np.array([1, 2])},
-        "person_marital_unit_id": {2024: np.array([1, 2])},
-        "person_spm_unit_id": {2024: np.array([1, 2])},
-        "person_family_id": {2024: np.array([1, 2])},
-        "person_household_id": {2024: np.array([1, 2])},
-        "is_tax_unit_head": {2024: np.array([True, True])},
-        "is_tax_unit_spouse": {2024: np.array([False, False])},
-        "age": {2024: np.array([45, 55])},
-        "is_male": {2024: np.array([1, 0])},
-        "cps_race": {2024: np.array([1, 2])},
-        "employment_income": {2024: np.array([80_000, 40_000])},
-        "taxable_interest_income": {2024: np.array([1_000, 500])},
-        "tax_exempt_interest_income": {2024: np.array([0, 0])},
-        "qualified_dividend_income": {2024: np.array([500, 250])},
-        "non_qualified_dividend_income": {2024: np.array([0, 0])},
-        "social_security_retirement": {2024: np.array([0, 5_000])},
-        "taxable_private_pension_income": {2024: np.array([0, 0])},
-        "tax_exempt_private_pension_income": {2024: np.array([0, 0])},
-        "tenure_type": {
-            2024: np.array([b"OWNED_WITH_MORTGAGE", b"OWNED_WITH_MORTGAGE"])
-        },
-        "spm_unit_tenure_type": {
-            2024: np.array([b"OWNER_WITH_MORTGAGE", b"OWNER_WITHOUT_MORTGAGE"])
-        },
-    }
+    data = _base_dataset_dict(
+        person_tax_unit_ids=[1, 2],
+        ages=[45, 55],
+    )
+    data |= _time_period_variables(
+        is_male=[1, 0],
+        cps_race=[1, 2],
+        employment_income=[80_000, 40_000],
+        taxable_interest_income=[1_000, 500],
+        tax_exempt_interest_income=[0, 0],
+        qualified_dividend_income=[500, 250],
+        non_qualified_dividend_income=[0, 0],
+        social_security_retirement=[0, 5_000],
+        taxable_private_pension_income=[0, 0],
+        tax_exempt_private_pension_income=[0, 0],
+        tenure_type=[b"OWNED_WITH_MORTGAGE", b"OWNED_WITH_MORTGAGE"],
+        spm_unit_tenure_type=[
+            b"OWNER_WITH_MORTGAGE",
+            b"OWNER_WITHOUT_MORTGAGE",
+        ],
+    )
 
-    imputed = impute_tax_unit_mortgage_balance_hints(data, 2024)
+    imputed = impute_tax_unit_mortgage_balance_hints(data, TIME_PERIOD)
 
-    assert imputed["imputed_first_home_mortgage_balance_hint"][2024].tolist() == [
+    assert imputed["imputed_first_home_mortgage_balance_hint"][
+        TIME_PERIOD
+    ].tolist() == [
         200_000.0,
         0.0,
     ]
-    assert imputed["imputed_second_home_mortgage_balance_hint"][2024].tolist() == [
+    assert imputed["imputed_second_home_mortgage_balance_hint"][
+        TIME_PERIOD
+    ].tolist() == [
         20_000.0,
         0.0,
     ]
diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py
index f49530bf..fbd6df94 100644
--- a/policyengine_us_data/utils/mortgage_interest.py
+++ b/policyengine_us_data/utils/mortgage_interest.py
@@ -33,17 +33,6 @@
 ]
 
 
-def supports_structural_mortgage_inputs() -> bool:
-    """Return whether the installed policyengine-us exposes structural MID inputs."""
-    try:
-        from policyengine_us import CountryTaxBenefitSystem
-    except ImportError:
-        return False
-
-    tbs = CountryTaxBenefitSystem()
-    return all(name in tbs.variables for name in STRUCTURAL_MORTGAGE_VARIABLES)
-
-
 def impute_tax_unit_mortgage_balance_hints(
     data: Dict[str, Dict[int, np.ndarray]],
     time_period: int,
@@ -55,9 +44,6 @@ def impute_tax_unit_mortgage_balance_hints(
     mortgage balance distribution without forcing the baseline to use mortgage
     interest for non-itemizers.
     """
-    if not supports_structural_mortgage_inputs():
-        return data
-
     receiver = _build_tax_unit_mortgage_receiver(data, time_period)
     if receiver.empty:
         return data
@@ -117,8 +103,7 @@ def convert_mortgage_interest_to_structural_inputs(
     The current us-data calibration pipeline imputes a person-level
     ``deductible_mortgage_interest`` and a tax-unit-level
     ``interest_deduction``. That short-circuits structural MID reforms in
-    policyengine-us. When structural mortgage inputs are available, convert
-    those imputed amounts into:
+    policyengine-us, so this converts those imputed amounts into:
 
     * tax-unit mortgage balances, interest, and origination years
     * person-level ``home_mortgage_interest`` for within-tax-unit allocation
@@ -131,9 +116,6 @@ def convert_mortgage_interest_to_structural_inputs(
     * the origination year is heuristic, because the current public pipeline
       does not carry a mortgage-vintage input
     """
-    if not supports_structural_mortgage_inputs():
-        return data
-
     tp = time_period
     person_ids = data.get("person_id", {}).get(tp)
     tax_unit_ids = data.get("tax_unit_id", {}).get(tp)
diff --git a/policyengine_us_data/utils/policyengine.py b/policyengine_us_data/utils/policyengine.py
new file mode 100644
index 00000000..18b9050f
--- /dev/null
+++ b/policyengine_us_data/utils/policyengine.py
@@ -0,0 +1,17 @@
+from functools import lru_cache
+
+
+@lru_cache(maxsize=1)  # no arguments, so a single cache slot suffices
+def _policyengine_us_variable_names() -> frozenset[str]:
+    from policyengine_us import CountryTaxBenefitSystem  # deferred import
+
+    return frozenset(CountryTaxBenefitSystem().variables)
+
+
+def has_policyengine_us_variables(*variables: str) -> bool:
+    try:
+        available_variables = _policyengine_us_variable_names()
+    except Exception:  # NOTE(review): also swallows non-import failures
+        return False
+
+    return set(variables).issubset(available_variables)  # True if none given

From 857b31c769c554a300a7f6ba6446f7897e13fef8 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Fri, 27 Mar 2026 09:17:16 -0400
Subject: [PATCH 4/5] Improve MID structure and SOI targeting

---
 .../test_mortgage_interest.py                 | 49 ++++++++++++--
 .../tests/test_calibration/test_soi.py        |  9 +++
 policyengine_us_data/utils/loss.py            |  3 +-
 .../utils/mortgage_interest.py                | 64 ++++++++++++++++++-
 policyengine_us_data/utils/soi.py             | 12 +++-
 5 files changed, 126 insertions(+), 11 deletions(-)
 create mode 100644 policyengine_us_data/tests/test_calibration/test_soi.py

diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
index bbcfea45..cc689be6 100644
--- a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
+++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
@@ -4,6 +4,7 @@
 
 from policyengine_us_data.utils.mortgage_interest import (
     STRUCTURAL_MORTGAGE_VARIABLES,
+    _interest_implied_balance_floor,
     convert_mortgage_interest_to_structural_inputs,
     impute_tax_unit_mortgage_balance_hints,
 )
@@ -23,6 +24,17 @@ def _time_period_variables(**variables):
     return {name: _at_time_period(values) for name, values in variables.items()}
 
 
+def _set_balance_hints(data, *, first, second):
+    """Attach first/second mortgage balance hints to ``data`` in place."""
+    hints = {
+        "imputed_first_home_mortgage_balance_hint": first,
+        "imputed_second_home_mortgage_balance_hint": second,
+    }
+    for name, values in hints.items():
+        # Same float32 wrapping for both hint slots.
+        data[name] = _at_time_period(values, dtype=np.float32)
+
+
 def _head_and_spouse_flags(person_tax_unit_ids):
     first_seen = {}
     heads = np.zeros(len(person_tax_unit_ids), dtype=bool)
@@ -184,12 +196,7 @@ def test_structural_mortgage_conversion_keeps_balance_hints_for_non_itemizers():
         interest_deduction=[0.0],
         filing_status=[b"JOINT"],
     )
-    data["imputed_first_home_mortgage_balance_hint"] = {
-        TIME_PERIOD: np.array([250_000.0], dtype=np.float32)
-    }
-    data["imputed_second_home_mortgage_balance_hint"] = {
-        TIME_PERIOD: np.array([25_000.0], dtype=np.float32)
-    }
+    _set_balance_hints(data, first=[250_000.0], second=[25_000.0])
 
     converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD)
 
@@ -276,3 +283,33 @@ def predict(self, X_test):
         20_000.0,
         0.0,
     ]
+
+
+@pytest.mark.skipif(
+    not HAS_STRUCTURAL_MORTGAGE_INPUTS,
+    reason="Installed policyengine-us does not yet expose structural MID inputs.",
+)
+def test_structural_mortgage_conversion_scales_hints_to_interest_floor():
+    data = _base_dataset_dict(
+        person_tax_unit_ids=[1, 1],
+        ages=[55, 53],
+        deductible_mortgage_interest=[30_000.0, 0.0],
+        interest_deduction=[30_000.0],
+        filing_status=[b"JOINT"],
+    )
+    _set_balance_hints(data, first=[200_000.0], second=[25_000.0])
+
+    converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD)
+    first_balance = converted["first_home_mortgage_balance"][TIME_PERIOD][0]
+    second_balance = converted["second_home_mortgage_balance"][TIME_PERIOD][0]
+    expected_floor = _interest_implied_balance_floor(
+        np.array([30_000.0], dtype=np.float32),
+        TIME_PERIOD,
+    )[0]
+    # The 225k hinted total is lifted to the floor with the 8:1 split intact.
+    assert first_balance + second_balance == pytest.approx(expected_floor)
+    assert first_balance / second_balance == pytest.approx(8.0)
+    assert converted["home_mortgage_interest"][TIME_PERIOD].sum() == pytest.approx(
+        converted["first_home_mortgage_interest"][TIME_PERIOD][0]
+        + converted["second_home_mortgage_interest"][TIME_PERIOD][0]
+    )
diff --git a/policyengine_us_data/tests/test_calibration/test_soi.py b/policyengine_us_data/tests/test_calibration/test_soi.py
new file mode 100644
index 00000000..c8354b95
--- /dev/null
+++ b/policyengine_us_data/tests/test_calibration/test_soi.py
@@ -0,0 +1,9 @@
+from policyengine_us_data.utils.soi import get_soi
+
+
+def test_get_soi_includes_mortgage_interest_deduction_targets():
+    """The 2024 SOI table has mortgage-interest rows, all with positive values."""
+    targets = get_soi(2024)
+    is_mid = targets["Variable"] == "mortgage_interest_deductions"
+    assert is_mid.any()
+    assert (targets.loc[is_mid, "Value"] > 0).all()
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
index bfbf49db..02507f63 100644
--- a/policyengine_us_data/utils/loss.py
+++ b/policyengine_us_data/utils/loss.py
@@ -4,7 +4,7 @@
 import numpy as np
 import logging
 
-from policyengine_us_data.storage import STORAGE_FOLDER, CALIBRATION_FOLDER
+from policyengine_us_data.storage import CALIBRATION_FOLDER
 from policyengine_us_data.storage.calibration_targets.pull_soi_targets import (
     STATE_ABBR_TO_FIPS,
 )
@@ -127,6 +127,7 @@ def build_loss_matrix(dataset: type, time_period):
         "partnership_and_s_corp_income",
         "qualified_dividends",
         "taxable_interest_income",
+        "mortgage_interest_deductions",
         "total_pension_income",
         "total_social_security",
     ]
diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py
index fbd6df94..5cae7a1d 100644
--- a/policyengine_us_data/utils/mortgage_interest.py
+++ b/policyengine_us_data/utils/mortgage_interest.py
@@ -113,6 +113,9 @@ def convert_mortgage_interest_to_structural_inputs(
     The conversion is intentionally conservative:
     * current-law deductible mortgage interest is preserved exactly
     * current-law total interest deduction is preserved exactly
+    * SCF-imputed first-lien and HELOC splits are preserved when available
+    * weak balance hints are lifted to a conservative lower bound implied by
+      the observed deductible mortgage interest
     * the origination year is heuristic, because the current public pipeline
       does not carry a mortgage-vintage input
     """
@@ -147,6 +150,8 @@ def convert_mortgage_interest_to_structural_inputs(
         first_balance_hint,
         second_balance_hint,
     ) = _get_tax_unit_mortgage_balance_hints(data, tp, n_tax_units)
+    hinted_total_balance = np.maximum(first_balance_hint + second_balance_hint, 0)
+    balance_floor = _interest_implied_balance_floor(tax_unit_deductible, tp)
 
     total_interest_deduction = _get_tax_unit_interest_deduction_target(
         data,
@@ -177,7 +182,7 @@ def convert_mortgage_interest_to_structural_inputs(
     )
 
     has_mortgage = tax_unit_deductible > 0
-    hinted_balance = np.maximum(first_balance_hint + second_balance_hint, 0)
+    hinted_balance = np.maximum(hinted_total_balance, balance_floor)
     balance, origination_year = _estimate_mortgage_balance_and_year(
         tax_unit_ids,
         tax_unit_deductible,
@@ -186,13 +191,18 @@ def convert_mortgage_interest_to_structural_inputs(
         tp,
         hinted_balance,
     )
-    use_balance_hint = hinted_balance > 0
+    use_balance_hint = hinted_total_balance > 0
     first_balance = np.where(use_balance_hint, first_balance_hint, balance).astype(
         np.float32
     )
     second_balance = np.where(use_balance_hint, second_balance_hint, 0).astype(
         np.float32
     )
+    first_balance, second_balance = _apply_interest_implied_balance_floor(
+        first_balance,
+        second_balance,
+        balance_floor,
+    )
 
     swap_mask = (first_balance == 0) & (second_balance > 0)
     first_balance[swap_mask] = second_balance[swap_mask]
@@ -696,6 +706,56 @@ def _estimate_mortgage_balance_and_year(
     return balance, year
 
 
+def _interest_implied_balance_floor(
+    deductible_mortgage_interest: np.ndarray,
+    time_period: int,
+) -> np.ndarray:
+    """Return a conservative floor on balance given deductible interest.
+
+    Positive interest is divided by the current-period market mortgage rate,
+    so the result is a lower bound, not a reconstruction of total acquisition
+    debt; non-positive interest or a non-positive rate yields zero.
+    """
+    period = np.array([time_period], dtype=np.int32)
+    rate = float(_mortgage_rate(period)[0])
+    if rate <= 0:
+        return np.zeros_like(deductible_mortgage_interest, dtype=np.float32)
+
+    # Mask rather than clip, so non-positive interest maps to exactly zero.
+    pays_interest = deductible_mortgage_interest > 0
+    implied_balance = deductible_mortgage_interest / rate
+    floor = np.where(pays_interest, implied_balance, 0)
+    return floor.astype(np.float32)
+
+
+def _apply_interest_implied_balance_floor(
+    first_balance: np.ndarray,
+    second_balance: np.ndarray,
+    balance_floor: np.ndarray,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Raise (first, second) balances so each pair sums to at least the floor.
+
+    Positive totals are scaled proportionally, keeping the split; zero or
+    negative totals put the whole floor in the first slot. Inputs are copied.
+    """
+    first_balance = np.asarray(first_balance, dtype=np.float32).copy()
+    second_balance = np.asarray(second_balance, dtype=np.float32).copy()
+    balance_floor = np.maximum(np.asarray(balance_floor, dtype=np.float32), 0)
+
+    total_balance = first_balance + second_balance
+    needs_floor = balance_floor > total_balance
+    scalable = needs_floor & (total_balance > 0)
+    scale = balance_floor[scalable] / total_balance[scalable]
+    first_balance[scalable] *= scale
+    second_balance[scalable] *= scale
+
+    # `<=` rather than `==`: a negative total must not slip past both branches.
+    unsplit = needs_floor & (total_balance <= 0)
+    first_balance[unsplit] = balance_floor[unsplit]
+    second_balance[unsplit] = 0
+    return first_balance, second_balance
+
+
 def _split_interest_by_balance(
     total_interest: np.ndarray,
     first_balance: np.ndarray,
diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py
index 997a8078..27a92956 100644
--- a/policyengine_us_data/utils/soi.py
+++ b/policyengine_us_data/utils/soi.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from .uprating import create_policyengine_uprating_factors_table
-from policyengine_us_data.storage import STORAGE_FOLDER, CALIBRATION_FOLDER
+from policyengine_us_data.storage import CALIBRATION_FOLDER
 
 
 def pe_to_soi(pe_dataset, year):
@@ -11,7 +11,8 @@ def pe_to_soi(pe_dataset, year):
     pe_sim.default_calculation_period = year
     df = pd.DataFrame()
 
-    pe = lambda variable: np.array(pe_sim.calculate(variable, map_to="tax_unit"))
+    def pe(variable):
+        return np.array(pe_sim.calculate(variable, map_to="tax_unit"))
 
     df["adjusted_gross_income"] = pe("adjusted_gross_income")
     df["exemption"] = pe("exemptions")
@@ -62,6 +63,7 @@ def pe_to_soi(pe_dataset, year):
     )
     df["charitable_contributions_deduction"] = pe("charitable_deduction")
     df["interest_paid_deductions"] = pe("interest_deduction")
+    df["mortgage_interest_deductions"] = pe("deductible_mortgage_interest")
     df["medical_expense_deductions_uncapped"] = pe("medical_expense_deduction")
     df["state_and_local_tax_deductions"] = pe("salt_deduction")
     df["itemized_state_income_and_sales_tax_deductions"] = pe(
@@ -108,6 +110,11 @@ def puf_to_soi(puf, year):
     df["employment_income"] = puf.E00200
     df["charitable_contributions_deduction"] = puf.E19700
     df["interest_paid_deductions"] = puf.E19200
+    df["mortgage_interest_deductions"] = (
+        puf["deductible_mortgage_interest"]
+        if "deductible_mortgage_interest" in puf
+        else puf.E19200
+    )
     df["medical_expense_deductions_uncapped"] = puf.E17500
     df["itemized_state_income_and_sales_tax_deductions"] = puf.E18400
     df["itemized_real_estate_tax_deductions"] = puf.E18500
@@ -146,6 +153,7 @@ def get_soi(year: int) -> pd.DataFrame:
         "partnership_and_s_corp_income": "partnership_s_corp_income",
         "qualified_dividends": "qualified_dividend_income",
         "taxable_interest_income": "taxable_interest_income",
+        "mortgage_interest_deductions": "interest_deduction",
         "total_pension_income": "pension_income",
         "total_social_security": "social_security",
         "business_net_losses": "self_employment_income",

From 22ab8f10e789fb85cb348fe6837197fdd009a292 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Fri, 27 Mar 2026 11:28:18 -0400
Subject: [PATCH 5/5] Clarify MID proxy assumptions

---
 .../test_mortgage_interest.py                 | 28 +++++++++++++++++++
 policyengine_us_data/utils/loss.py            |  4 ++-
 .../utils/mortgage_interest.py                | 14 ++++++++++
 policyengine_us_data/utils/soi.py             |  3 ++
 4 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
index cc689be6..bb6fbc1c 100644
--- a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
+++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py
@@ -5,6 +5,7 @@
 from policyengine_us_data.utils.mortgage_interest import (
     STRUCTURAL_MORTGAGE_VARIABLES,
     _interest_implied_balance_floor,
+    _post_tcja_cap,
     convert_mortgage_interest_to_structural_inputs,
     impute_tax_unit_mortgage_balance_hints,
 )
@@ -313,3 +314,30 @@ def test_structural_mortgage_conversion_scales_hints_to_interest_floor():
         converted["first_home_mortgage_interest"][TIME_PERIOD][0]
         + converted["second_home_mortgage_interest"][TIME_PERIOD][0]
     )
+
+
+def test_post_tcja_cap_uses_mfs_limit():
+    for status in ("SEPARATE", "MARRIED_FILING_SEPARATELY"):
+        assert _post_tcja_cap(status) == pytest.approx(375_000.0)
+
+
+@pytest.mark.skipif(
+    not HAS_STRUCTURAL_MORTGAGE_INPUTS,
+    reason="Installed policyengine-us does not yet expose structural MID inputs.",
+)
+def test_structural_mortgage_conversion_swaps_partial_hints():
+    data = _base_dataset_dict(
+        person_tax_unit_ids=[1, 1],
+        ages=[55, 53],
+        deductible_mortgage_interest=[0.0, 0.0],
+        interest_deduction=[0.0],
+        filing_status=[b"JOINT"],
+    )
+    _set_balance_hints(data, first=[0.0], second=[25_000.0])
+
+    converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD)
+    # A hint present only in the second slot should end up in the first slot.
+    assert converted["first_home_mortgage_balance"][TIME_PERIOD][0] == pytest.approx(
+        25_000.0
+    )
+    assert converted["second_home_mortgage_balance"][TIME_PERIOD][0] == 0
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
index 02507f63..49662850 100644
--- a/policyengine_us_data/utils/loss.py
+++ b/policyengine_us_data/utils/loss.py
@@ -127,7 +127,6 @@ def build_loss_matrix(dataset: type, time_period):
         "partnership_and_s_corp_income",
         "qualified_dividends",
         "taxable_interest_income",
-        "mortgage_interest_deductions",
         "total_pension_income",
         "total_social_security",
     ]
@@ -142,6 +141,9 @@ def build_loss_matrix(dataset: type, time_period):
         "partnership_and_s_corp_losses",
         "rent_and_royalty_net_income",
         "rent_and_royalty_net_losses",
+        # The current SOI source only exposes taxable-only aggregate targets for
+        # mortgage-interest deductions, not the AGI-bin detail used above.
+        "mortgage_interest_deductions",
         "taxable_pension_income",
         "taxable_social_security",
         "unemployment_compensation",
diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py
index 5cae7a1d..af26974e 100644
--- a/policyengine_us_data/utils/mortgage_interest.py
+++ b/policyengine_us_data/utils/mortgage_interest.py
@@ -43,6 +43,11 @@ def impute_tax_unit_mortgage_balance_hints(
     data-layer hints that let the structural MID conversion reuse an SCF-like
     mortgage balance distribution without forcing the baseline to use mortgage
     interest for non-itemizers.
+
+    The second hint is a generic secondary acquisition-debt slot. In the
+    public SCF, HELOC balances are the best observable proxy for that slot even
+    though the downstream ``second_home_mortgage_*`` variables in
+    policyengine-us are named around a second home.
     """
     receiver = _build_tax_unit_mortgage_receiver(data, time_period)
     if receiver.empty:
@@ -118,6 +123,10 @@ def convert_mortgage_interest_to_structural_inputs(
       the observed deductible mortgage interest
     * the origination year is heuristic, because the current public pipeline
       does not carry a mortgage-vintage input
+
+    The structural model has two mortgage slots. In public data, we use those
+    slots for "first-lien" and "secondary acquisition debt" rather than trying
+    to identify literal primary-residence versus second-home mortgages.
     """
     tp = time_period
     person_ids = data.get("person_id", {}).get(tp)
@@ -228,6 +237,9 @@ def convert_mortgage_interest_to_structural_inputs(
     )
     second_origination_year = np.where(
         second_balance > 0,
+        # The public data's second slot is mainly a HELOC/secondary-debt proxy,
+        # so treat it as post-TCJA unless a richer vintage input becomes
+        # available.
         np.maximum(2018, origination_year),
         0,
     ).astype(np.int32)
@@ -435,6 +447,8 @@ def _build_scf_mortgage_donor(scf: pd.DataFrame) -> pd.DataFrame:
         2,
         np.where(owns_home, 1, 0),
     ).astype(np.float32)
+    # The second slot is not a literal second-home mortgage in SCF. We use
+    # HELOC balances as the best public proxy for secondary acquisition debt.
     donor["imputed_first_home_mortgage_balance_hint"] = np.maximum(
         total_mortgage - heloc,
         0,
diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py
index 27a92956..41f2365c 100644
--- a/policyengine_us_data/utils/soi.py
+++ b/policyengine_us_data/utils/soi.py
@@ -153,6 +153,9 @@ def get_soi(year: int) -> pd.DataFrame:
         "partnership_and_s_corp_income": "partnership_s_corp_income",
         "qualified_dividends": "qualified_dividend_income",
         "taxable_interest_income": "taxable_interest_income",
+        # There is no separate published uprating factor for mortgage-interest
+        # deductions, so use total interest deductions as the closest available
+        # proxy.
         "mortgage_interest_deductions": "interest_deduction",
         "total_pension_income": "pension_income",
         "total_social_security": "social_security",