From 35817a5f1634456bcf30275ad575e7166e176492 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 24 Mar 2026 00:11:08 -0400 Subject: [PATCH 1/5] Add structural mortgage interest data support --- .../structural-mortgage-interest.added.md | 1 + .../datasets/cps/extended_cps.py | 27 +- policyengine_us_data/datasets/puf/puf.py | 11 + .../test_mortgage_interest.py | 222 +++++ .../utils/mortgage_interest.py | 770 ++++++++++++++++++ 5 files changed, 1027 insertions(+), 4 deletions(-) create mode 100644 changelog.d/structural-mortgage-interest.added.md create mode 100644 policyengine_us_data/tests/test_calibration/test_mortgage_interest.py create mode 100644 policyengine_us_data/utils/mortgage_interest.py diff --git a/changelog.d/structural-mortgage-interest.added.md b/changelog.d/structural-mortgage-interest.added.md new file mode 100644 index 00000000..bfecbc02 --- /dev/null +++ b/changelog.d/structural-mortgage-interest.added.md @@ -0,0 +1 @@ +Convert imputed deductible mortgage interest into structural mortgage balance, interest, and origination-year inputs when the installed `policyengine-us` supports federal MID cap modeling, while preserving total current-law interest deductions via residual investment interest inputs. diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index f38d5746..288e3e46 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -6,9 +6,14 @@ import pandas as pd from policyengine_core.data import Dataset -from policyengine_us_data.datasets.cps.cps import * # noqa: F403 -from policyengine_us_data.datasets.puf import * # noqa: F403 +from policyengine_us_data.datasets.cps.cps import CPS, CPS_2024, CPS_2024_Full +from policyengine_us_data.datasets.puf import PUF, PUF_2024 from policyengine_us_data.storage import STORAGE_FOLDER +from policyengine_us_data.utils.mortgage_interest import ( + convert_mortgage_interest_to_structural_inputs, + impute_tax_unit_mortgage_balance_hints, + supports_structural_mortgage_inputs, +) from policyengine_us_data.utils.retirement_limits import ( get_retirement_limits, get_se_pension_limits, @@ -445,6 +450,14 @@ def generate(self): ) new_data = self._rename_imputed_to_inputs(new_data) + new_data = impute_tax_unit_mortgage_balance_hints( + new_data, + self.time_period, + ) + new_data = convert_mortgage_interest_to_structural_inputs( + new_data, + self.time_period, + ) new_data = self._drop_formula_variables(new_data) self.save_dataset(new_data) @@ -472,11 +485,17 @@ def _rename_imputed_to_inputs(cls, data): # due to entity shape mismatch. _KEEP_FORMULA_VARS = { "person_id", - "interest_deduction", "self_employed_pension_contribution_ald", "self_employed_health_insurance_ald", } + @classmethod + def _keep_formula_vars(cls): + keep = set(cls._KEEP_FORMULA_VARS) + if not supports_structural_mortgage_inputs(): + keep.add("interest_deduction") + return keep + # QRF imputes formula-level variables (e.g. taxable_pension_income) # but we must store them under leaf input names so # _drop_formula_variables doesn't discard them. The engine then @@ -526,7 +545,7 @@ def _drop_formula_variables(cls, data): if (hasattr(var, "formulas") and len(var.formulas) > 0) or getattr(var, "adds", None) or getattr(var, "subtracts", None) - } - cls._KEEP_FORMULA_VARS + } - cls._keep_formula_vars() dropped = sorted(set(data.keys()) & formula_vars) if dropped: logger.info( diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index baedce8a..9a0e8a22 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -12,6 +12,9 @@ from policyengine_us_data.datasets.puf.disaggregate_puf import ( disaggregate_aggregate_records, ) +from policyengine_us_data.utils.mortgage_interest import ( + convert_mortgage_interest_to_structural_inputs, +) from policyengine_us_data.utils.uprating import ( create_policyengine_uprating_factors_table, ) @@ -643,6 +646,14 @@ def generate(self): self.holder[key] = np.array(self.holder[key]).astype(float) assert not np.isnan(self.holder[key]).any(), f"{key} has NaNs." + holder_tp = {variable: {self.time_period: values} for variable, values in self.holder.items()} + holder_tp = convert_mortgage_interest_to_structural_inputs( + holder_tp, + self.time_period, + ) + self.holder = { + variable: values[self.time_period] for variable, values in holder_tp.items() + } self.save_dataset(self.holder) def add_tax_unit(self, row, tax_unit_id): diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py new file mode 100644 index 00000000..94aed99e --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py @@ -0,0 +1,222 @@ +import numpy as np +import pandas as pd +import pytest + +from policyengine_us_data.utils.mortgage_interest import ( + convert_mortgage_interest_to_structural_inputs, + impute_tax_unit_mortgage_balance_hints, + supports_structural_mortgage_inputs, +) + + +def _base_dataset_dict(deductible_mortgage_interest, interest_deduction): + time_period = 2024 + return { + "person_id": {time_period: np.array([1, 2])}, + "tax_unit_id": {time_period: np.array([1])}, + "marital_unit_id": {time_period: np.array([1])}, + "spm_unit_id": {time_period: np.array([1])}, + "family_id": {time_period: np.array([1])}, + "household_id": {time_period: np.array([1])}, + "person_tax_unit_id": {time_period: np.array([1, 1])}, + "person_marital_unit_id": {time_period: np.array([1, 1])}, + "person_spm_unit_id": {time_period: np.array([1, 1])}, + "person_family_id": {time_period: np.array([1, 1])}, + "person_household_id": {time_period: np.array([1, 1])}, + "is_tax_unit_head": {time_period: np.array([True, False])}, + "is_tax_unit_spouse": {time_period: np.array([False, True])}, + "age": {time_period: np.array([55, 53])}, + "filing_status": {time_period: np.array([b"JOINT"])}, + "deductible_mortgage_interest": { + time_period: np.array(deductible_mortgage_interest, dtype=np.float32) + }, + "interest_deduction": { + time_period: np.array(interest_deduction, dtype=np.float32) + }, + } + + +def _current_law_cap(filing_status: bytes, origination_year: int) -> float: + is_separate = b"SEPARATE" in filing_status + if origination_year <= 2017: + return 500_000.0 if is_separate else 1_000_000.0 + return 375_000.0 if is_separate else 750_000.0 + + +@pytest.mark.skipif( + not supports_structural_mortgage_inputs(), + reason="Installed policyengine-us does not yet expose structural MID inputs.", +) +def test_structural_mortgage_conversion_preserves_current_law_interest_deduction(): + data = _base_dataset_dict( + deductible_mortgage_interest=[6_000.0, 0.0], + interest_deduction=[7_000.0], + ) + converted = convert_mortgage_interest_to_structural_inputs(data, 2024) + + assert "deductible_mortgage_interest" not in converted + assert "interest_deduction" not in converted + assert converted["first_home_mortgage_balance"][2024][0] > 0 + assert converted["first_home_mortgage_interest"][2024][0] >= 6_000 + assert converted["first_home_mortgage_origination_year"][2024][0] > 0 + assert converted["investment_interest_expense"][2024].sum() == pytest.approx( + 1_000.0 + ) + cap = _current_law_cap( + converted["filing_status"][2024][0], + int(converted["first_home_mortgage_origination_year"][2024][0]), + ) + balance = converted["first_home_mortgage_balance"][2024][0] + total_interest = converted["first_home_mortgage_interest"][2024][0] + deductible_share = min(1.0, cap / balance) if balance > 0 else 0.0 + + assert total_interest * deductible_share == pytest.approx(6_000.0) + assert converted["home_mortgage_interest"][2024].sum() == pytest.approx( + total_interest + ) + assert ( + total_interest * deductible_share + + converted["investment_interest_expense"][2024].sum() + ) == pytest.approx(7_000.0) + + +@pytest.mark.skipif( + not supports_structural_mortgage_inputs(), + reason="Installed policyengine-us does not yet expose structural MID inputs.", +) +def test_structural_mortgage_conversion_preserves_non_mortgage_interest(): + data = _base_dataset_dict( + deductible_mortgage_interest=[0.0, 0.0], + interest_deduction=[2_500.0], + ) + converted = convert_mortgage_interest_to_structural_inputs(data, 2024) + + assert converted["first_home_mortgage_balance"][2024][0] == 0 + assert converted["first_home_mortgage_interest"][2024][0] == 0 + assert converted["home_mortgage_interest"][2024].sum() == 0 + assert converted["investment_interest_expense"][2024].sum() == pytest.approx( + 2_500.0 + ) + + +@pytest.mark.skipif( + not supports_structural_mortgage_inputs(), + reason="Installed policyengine-us does not yet expose structural MID inputs.", +) +def test_structural_mortgage_conversion_keeps_balance_hints_for_non_itemizers(): + data = _base_dataset_dict( + deductible_mortgage_interest=[0.0, 0.0], + interest_deduction=[0.0], + ) + data["imputed_first_home_mortgage_balance_hint"] = { + 2024: np.array([250_000.0], dtype=np.float32) + } + data["imputed_second_home_mortgage_balance_hint"] = { + 2024: np.array([25_000.0], dtype=np.float32) + } + + converted = convert_mortgage_interest_to_structural_inputs(data, 2024) + + assert converted["first_home_mortgage_balance"][2024][0] == pytest.approx( + 250_000.0 + ) + assert converted["second_home_mortgage_balance"][2024][0] == pytest.approx( + 25_000.0 + ) + assert converted["first_home_mortgage_interest"][2024][0] == 0 + assert converted["second_home_mortgage_interest"][2024][0] == 0 + assert converted["first_home_mortgage_origination_year"][2024][0] > 0 + assert converted["second_home_mortgage_origination_year"][2024][0] >= 2018 + assert converted["home_mortgage_interest"][2024].sum() == 0 + assert converted["investment_interest_expense"][2024].sum() == 0 + + +@pytest.mark.skipif( + not supports_structural_mortgage_inputs(), + reason="Installed policyengine-us does not yet expose structural MID inputs.", +) +def test_scf_balance_hint_imputation_zeroes_non_mortgaged_owner(monkeypatch): + import microimpute.models.qrf as qrf_module + import policyengine_us_data.datasets.scf.scf as scf_module + + class DummyQRF: + def fit(self, *args, **kwargs): + return self + + def predict(self, X_test): + return pd.DataFrame( + { + "imputed_first_home_mortgage_balance_hint": X_test[ + "mortgage_owner_status" + ] + * 100_000, + "imputed_second_home_mortgage_balance_hint": X_test[ + "mortgage_owner_status" + ] + * 10_000, + } + ) + + monkeypatch.setattr(qrf_module, "QRF", DummyQRF) + monkeypatch.setattr( + scf_module.SCF_2022, + "load_dataset", + lambda self: { + "age": np.array([45, 55]), + "is_female": np.array([0, 1]), + "cps_race": np.array([1, 2]), + "is_married": np.array([1, 0]), + "own_children_in_household": np.array([1, 0]), + "employment_income": np.array([80_000, 40_000]), + "interest_dividend_income": np.array([2_000, 1_000]), + "social_security_pension_income": np.array([0, 5_000]), + "nh_mort": np.array([250_000, 0]), + "heloc": np.array([25_000, 0]), + "houses": np.array([500_000, 350_000]), + "wgt": np.array([1, 1]), + }, + ) + + data = { + "person_id": {2024: np.array([1, 2])}, + "tax_unit_id": {2024: np.array([1, 2])}, + "marital_unit_id": {2024: np.array([1, 2])}, + "spm_unit_id": {2024: np.array([1, 2])}, + "family_id": {2024: np.array([1, 2])}, + "household_id": {2024: np.array([1, 2])}, + "person_tax_unit_id": {2024: np.array([1, 2])}, + "person_marital_unit_id": {2024: np.array([1, 2])}, + "person_spm_unit_id": {2024: np.array([1, 2])}, + "person_family_id": {2024: np.array([1, 2])}, + "person_household_id": {2024: np.array([1, 2])}, + "is_tax_unit_head": {2024: np.array([True, True])}, + "is_tax_unit_spouse": {2024: np.array([False, False])}, + "age": {2024: np.array([45, 55])}, + "is_male": {2024: np.array([1, 0])}, + "cps_race": {2024: np.array([1, 2])}, + "employment_income": {2024: np.array([80_000, 40_000])}, + "taxable_interest_income": {2024: np.array([1_000, 500])}, + "tax_exempt_interest_income": {2024: np.array([0, 0])}, + "qualified_dividend_income": {2024: np.array([500, 250])}, + "non_qualified_dividend_income": {2024: np.array([0, 0])}, + "social_security_retirement": {2024: np.array([0, 5_000])}, + "taxable_private_pension_income": {2024: np.array([0, 0])}, + "tax_exempt_private_pension_income": {2024: np.array([0, 0])}, + "tenure_type": { + 2024: np.array([b"OWNED_WITH_MORTGAGE", b"OWNED_WITH_MORTGAGE"]) + }, + "spm_unit_tenure_type": { + 2024: np.array([b"OWNER_WITH_MORTGAGE", b"OWNER_WITHOUT_MORTGAGE"]) + }, + } + + imputed = impute_tax_unit_mortgage_balance_hints(data, 2024) + + assert imputed["imputed_first_home_mortgage_balance_hint"][2024].tolist() == [ + 200_000.0, + 0.0, + ] + assert imputed["imputed_second_home_mortgage_balance_hint"][2024].tolist() == [ + 20_000.0, + 0.0, + ] diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py new file mode 100644 index 00000000..bcf1be48 --- /dev/null +++ b/policyengine_us_data/utils/mortgage_interest.py @@ -0,0 +1,770 @@ +from __future__ import annotations + +from typing import Dict + +import numpy as np +import pandas as pd + + +STRUCTURAL_MORTGAGE_VARIABLES = ( + "first_home_mortgage_balance", + "second_home_mortgage_balance", + "first_home_mortgage_interest", + "second_home_mortgage_interest", + "first_home_mortgage_origination_year", + "second_home_mortgage_origination_year", +) + +MORTGAGE_HINT_VARIABLES = ( + "imputed_first_home_mortgage_balance_hint", + "imputed_second_home_mortgage_balance_hint", +) + +MORTGAGE_IMPUTATION_PREDICTORS = [ + "age", + "is_female", + "cps_race", + "is_married", + "own_children_in_household", + "employment_income", + "interest_dividend_income", + "social_security_pension_income", + "mortgage_owner_status", +] + + +def supports_structural_mortgage_inputs() -> bool: + """Return whether the installed policyengine-us exposes structural MID inputs.""" + try: + from policyengine_us import CountryTaxBenefitSystem + except ImportError: + return False + + tbs = CountryTaxBenefitSystem() + return all(name in tbs.variables for name in STRUCTURAL_MORTGAGE_VARIABLES) + + +def impute_tax_unit_mortgage_balance_hints( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, +) -> Dict[str, Dict[int, np.ndarray]]: + """Impute tax-unit mortgage balance hints from SCF data. + + The output variables are not policyengine-us inputs. They are auxiliary + data-layer hints that let the structural MID conversion reuse an SCF-like + mortgage balance distribution without forcing the baseline to use mortgage + interest for non-itemizers. + """ + if not supports_structural_mortgage_inputs(): + return data + + receiver = _build_tax_unit_mortgage_receiver(data, time_period) + if receiver.empty: + return data + + from microimpute.models.qrf import QRF + from policyengine_us_data.datasets.scf.scf import SCF_2022 + + scf = pd.DataFrame(SCF_2022().load_dataset()) + donor = _build_scf_mortgage_donor(scf) + if donor.empty: + return data + + qrf = QRF() + donor_sample = donor.sample(frac=0.5, random_state=42).reset_index(drop=True) + fitted = qrf.fit( + X_train=donor_sample, + predictors=MORTGAGE_IMPUTATION_PREDICTORS, + imputed_variables=list(MORTGAGE_HINT_VARIABLES), + weight_col="wgt", + tune_hyperparameters=False, + ) + predictions = fitted.predict(X_test=receiver[MORTGAGE_IMPUTATION_PREDICTORS]) + + owner_with_mortgage = receiver["mortgage_owner_status"].values == 2 + first_hint = np.where( + owner_with_mortgage, + np.maximum( + predictions["imputed_first_home_mortgage_balance_hint"].values, + 0, + ), + 0, + ).astype(np.float32) + second_hint = np.where( + owner_with_mortgage, + np.maximum( + predictions["imputed_second_home_mortgage_balance_hint"].values, + 0, + ), + 0, + ).astype(np.float32) + + swap_mask = (first_hint == 0) & (second_hint > 0) + first_hint[swap_mask] = second_hint[swap_mask] + second_hint[swap_mask] = 0 + + data["imputed_first_home_mortgage_balance_hint"] = {time_period: first_hint} + data["imputed_second_home_mortgage_balance_hint"] = {time_period: second_hint} + return data + + +def convert_mortgage_interest_to_structural_inputs( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, +) -> Dict[str, Dict[int, np.ndarray]]: + """Replace formula-level mortgage inputs with structural mortgage data. + + The current us-data calibration pipeline imputes a person-level + ``deductible_mortgage_interest`` and a tax-unit-level + ``interest_deduction``. That short-circuits structural MID reforms in + policyengine-us. When structural mortgage inputs are available, convert + those imputed amounts into: + + * tax-unit mortgage balances, interest, and origination years + * person-level ``home_mortgage_interest`` for within-tax-unit allocation + * person-level ``investment_interest_expense`` for the residual non-mortgage + interest share of ``interest_deduction`` + + The conversion is intentionally conservative: + * current-law deductible mortgage interest is preserved exactly + * current-law total interest deduction is preserved exactly + * the origination year is heuristic, because the current public pipeline + does not carry a mortgage-vintage input + """ + if not supports_structural_mortgage_inputs(): + return data + + tp = time_period + person_ids = data.get("person_id", {}).get(tp) + tax_unit_ids = data.get("tax_unit_id", {}).get(tp) + person_tax_unit_ids = data.get("person_tax_unit_id", {}).get(tp) + filing_status = data.get("filing_status", {}).get(tp) + + if ( + person_ids is None + or tax_unit_ids is None + or person_tax_unit_ids is None + or filing_status is None + ): + return data + + n_persons = len(person_ids) + n_tax_units = len(tax_unit_ids) + tax_unit_index = { + int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids) + } + person_tax_unit_idx = np.array( + [tax_unit_index[int(tax_unit_id)] for tax_unit_id in person_tax_unit_ids], + dtype=np.int32, + ) + + person_deductible = _get_person_mortgage_interest_target(data, tp, n_persons) + tax_unit_deductible = np.zeros(n_tax_units, dtype=np.float32) + np.add.at(tax_unit_deductible, person_tax_unit_idx, person_deductible) + ( + first_balance_hint, + second_balance_hint, + ) = _get_tax_unit_mortgage_balance_hints(data, tp, n_tax_units) + + total_interest_deduction = _get_tax_unit_interest_deduction_target( + data, + tp, + tax_unit_deductible, + ) + + fallback_person_share = _filer_share(data, tp, person_tax_unit_idx, n_tax_units) + person_share = _normalize_person_share( + person_deductible, + person_tax_unit_idx, + n_tax_units, + fallback_person_share, + ) + + tax_unit_age = _tax_unit_age(data, tp, person_tax_unit_idx, n_tax_units) + filing_status_str = np.array( + [_decode_filing_status(value) for value in filing_status] + ) + + post_cap = np.array( + [_post_tcja_cap(status) for status in filing_status_str], + dtype=np.float32, + ) + pre_cap = np.array( + [_pre_tcja_cap(status) for status in filing_status_str], + dtype=np.float32, + ) + + has_mortgage = tax_unit_deductible > 0 + hinted_balance = np.maximum(first_balance_hint + second_balance_hint, 0) + balance, origination_year = _estimate_mortgage_balance_and_year( + tax_unit_ids, + tax_unit_deductible, + post_cap, + tax_unit_age, + tp, + hinted_balance, + ) + use_balance_hint = hinted_balance > 0 + first_balance = np.where(use_balance_hint, first_balance_hint, balance).astype( + np.float32 + ) + second_balance = np.where(use_balance_hint, second_balance_hint, 0).astype( + np.float32 + ) + + swap_mask = (first_balance == 0) & (second_balance > 0) + first_balance[swap_mask] = second_balance[swap_mask] + second_balance[swap_mask] = 0 + total_balance = first_balance + second_balance + + applicable_cap = np.where(origination_year <= 2017, pre_cap, post_cap) + deductible_share = np.ones(n_tax_units, dtype=np.float32) + capped_mask = has_mortgage & (total_balance > applicable_cap) + deductible_share[capped_mask] = ( + applicable_cap[capped_mask] / total_balance[capped_mask] + ) + + total_mortgage_interest = np.zeros(n_tax_units, dtype=np.float32) + positive_share = has_mortgage & (deductible_share > 0) + total_mortgage_interest[positive_share] = ( + tax_unit_deductible[positive_share] / deductible_share[positive_share] + ) + first_interest, second_interest = _split_interest_by_balance( + total_mortgage_interest, + first_balance, + second_balance, + ) + second_origination_year = np.where( + second_balance > 0, + np.maximum(2018, origination_year), + 0, + ).astype(np.int32) + + investment_interest = np.maximum( + total_interest_deduction - tax_unit_deductible, + 0, + ).astype(np.float32) + + person_home_mortgage_interest = ( + total_mortgage_interest[person_tax_unit_idx] * person_share + ).astype(np.float32) + person_investment_interest = ( + investment_interest[person_tax_unit_idx] * fallback_person_share + ).astype(np.float32) + + data["first_home_mortgage_balance"] = {tp: first_balance} + data["second_home_mortgage_balance"] = {tp: second_balance} + data["first_home_mortgage_interest"] = {tp: first_interest} + data["second_home_mortgage_interest"] = {tp: second_interest} + data["first_home_mortgage_origination_year"] = { + tp: origination_year.astype(np.int32) + } + data["second_home_mortgage_origination_year"] = {tp: second_origination_year} + data["home_mortgage_interest"] = {tp: person_home_mortgage_interest} + data["investment_interest_expense"] = {tp: person_investment_interest} + + data.pop("deductible_mortgage_interest", None) + data.pop("interest_deduction", None) + return data + + +def _get_person_mortgage_interest_target( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + n_persons: int, +) -> np.ndarray: + if "deductible_mortgage_interest" in data: + values = np.asarray( + data["deductible_mortgage_interest"][time_period], + dtype=np.float32, + ) + return np.maximum(values, 0) + return np.zeros(n_persons, dtype=np.float32) + + +def _get_tax_unit_interest_deduction_target( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + tax_unit_deductible: np.ndarray, +) -> np.ndarray: + if "interest_deduction" not in data: + return tax_unit_deductible.astype(np.float32) + values = np.asarray(data["interest_deduction"][time_period], dtype=np.float32) + return np.maximum(values, tax_unit_deductible).astype(np.float32) + + +def _get_tax_unit_mortgage_balance_hints( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + n_tax_units: int, +) -> tuple[np.ndarray, np.ndarray]: + first_hint = np.asarray( + data.get("imputed_first_home_mortgage_balance_hint", {}).get( + time_period, np.zeros(n_tax_units) + ), + dtype=np.float32, + ) + second_hint = np.asarray( + data.get("imputed_second_home_mortgage_balance_hint", {}).get( + time_period, np.zeros(n_tax_units) + ), + dtype=np.float32, + ) + return np.maximum(first_hint, 0), np.maximum(second_hint, 0) + + +def _build_tax_unit_mortgage_receiver( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, +) -> pd.DataFrame: + tax_unit_ids = data.get("tax_unit_id", {}).get(time_period) + person_tax_unit_ids = data.get("person_tax_unit_id", {}).get(time_period) + is_head = data.get("is_tax_unit_head", {}).get(time_period) + if tax_unit_ids is None or person_tax_unit_ids is None or is_head is None: + return pd.DataFrame() + + tax_unit_ids = np.asarray(tax_unit_ids) + person_tax_unit_ids = np.asarray(person_tax_unit_ids) + is_head = np.asarray(is_head, dtype=bool) + n_tax_units = len(tax_unit_ids) + tax_unit_index = {int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids)} + person_tax_unit_idx = np.array( + [tax_unit_index[int(tax_unit_id)] for tax_unit_id in person_tax_unit_ids], + dtype=np.int32, + ) + + head_index = np.full(n_tax_units, -1, dtype=np.int32) + head_positions = np.flatnonzero(is_head) + if head_positions.size > 0: + head_index[person_tax_unit_idx[head_positions]] = head_positions + + missing_head = head_index < 0 + if np.any(missing_head): + first_person = np.full(n_tax_units, -1, dtype=np.int32) + for person_idx, tax_unit_idx in enumerate(person_tax_unit_idx): + if first_person[tax_unit_idx] < 0: + first_person[tax_unit_idx] = person_idx + head_index[missing_head] = first_person[missing_head] + + receiver = pd.DataFrame( + { + "tax_unit_id": tax_unit_ids, + "head_index": head_index, + } + ) + head_take = head_index.clip(min=0) + + receiver["age"] = _take_person_values(data, time_period, "age", head_take) + is_male = _take_person_values(data, time_period, "is_male", head_take) + receiver["is_female"] = (1 - is_male).astype(np.float32) + receiver["cps_race"] = _take_person_values( + data, time_period, "cps_race", head_take + ).astype(np.float32) + receiver["own_children_in_household"] = _take_person_values( + data, time_period, "own_children_in_household", head_take + ) + receiver["mortgage_owner_status"] = _tax_unit_mortgage_owner_status( + data, + time_period, + head_take, + ) + + spouse_count = np.zeros(n_tax_units, dtype=np.float32) + spouse = np.asarray( + data.get("is_tax_unit_spouse", {}).get( + time_period, np.zeros(len(person_tax_unit_idx)) + ), + dtype=np.float32, + ) + np.add.at(spouse_count, person_tax_unit_idx, spouse) + receiver["is_married"] = (spouse_count > 0).astype(np.float32) + + receiver["employment_income"] = _sum_person_values_to_tax_unit( + data, + time_period, + person_tax_unit_idx, + n_tax_units, + ["employment_income"], + ) + receiver["interest_dividend_income"] = _sum_person_values_to_tax_unit( + data, + time_period, + person_tax_unit_idx, + n_tax_units, + [ + "taxable_interest_income", + "tax_exempt_interest_income", + "qualified_dividend_income", + "non_qualified_dividend_income", + ], + ) + receiver["social_security_pension_income"] = _sum_person_values_to_tax_unit( + data, + time_period, + person_tax_unit_idx, + n_tax_units, + [ + "social_security_retirement", + "taxable_private_pension_income", + "tax_exempt_private_pension_income", + ], + ) + return receiver[MORTGAGE_IMPUTATION_PREDICTORS] + + +def _build_scf_mortgage_donor(scf: pd.DataFrame) -> pd.DataFrame: + donor = pd.DataFrame() + donor["age"] = _frame_column(scf, "age") + donor["is_female"] = _frame_column(scf, "is_female") + donor["cps_race"] = _frame_column(scf, "cps_race") + donor["is_married"] = _frame_column(scf, "is_married") + donor["own_children_in_household"] = _frame_column( + scf, "own_children_in_household" + ) + donor["employment_income"] = _frame_column(scf, "employment_income") + donor["interest_dividend_income"] = _frame_column( + scf, "interest_dividend_income" + ) + donor["social_security_pension_income"] = _frame_column( + scf, "social_security_pension_income" + ) + + total_mortgage = np.maximum( + np.asarray(scf.get("nh_mort", 0), dtype=np.float32), + np.asarray(scf.get("mortgage_debt", 0), dtype=np.float32), + ) + heloc = np.minimum( + np.maximum(np.asarray(scf.get("heloc", 0), dtype=np.float32), 0), + total_mortgage, + ) + owns_home = np.asarray(scf.get("houses", 0), dtype=np.float32) > 0 + has_mortgage = total_mortgage > 0 + + donor["mortgage_owner_status"] = np.where( + has_mortgage, + 2, + np.where(owns_home, 1, 0), + ).astype(np.float32) + donor["imputed_first_home_mortgage_balance_hint"] = np.maximum( + total_mortgage - heloc, + 0, + ).astype(np.float32) + donor["imputed_second_home_mortgage_balance_hint"] = heloc.astype(np.float32) + donor["wgt"] = _frame_column(scf, "wgt", default=1) + return donor[ + MORTGAGE_IMPUTATION_PREDICTORS + list(MORTGAGE_HINT_VARIABLES) + ["wgt"] + ].dropna() + + +def _take_person_values( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + variable: str, + head_take: np.ndarray, +) -> np.ndarray: + values = np.asarray( + data.get(variable, {}).get(time_period, np.zeros(head_take.size)), + dtype=np.float32, + ) + if values.size == 0: + return np.zeros(head_take.size, dtype=np.float32) + return values[head_take].astype(np.float32) + + +def _frame_column( + frame: pd.DataFrame, + column: str, + default: float = 0, +) -> np.ndarray: + if column in frame: + return np.asarray(frame[column], dtype=np.float32) + return np.full(len(frame), default, dtype=np.float32) + + +def _sum_person_values_to_tax_unit( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + person_tax_unit_idx: np.ndarray, + n_tax_units: int, + variables: list[str], +) -> np.ndarray: + total = np.zeros(n_tax_units, dtype=np.float32) + for variable in variables: + if variable not in data: + continue + values = np.asarray(data[variable][time_period], dtype=np.float32) + np.add.at(total, person_tax_unit_idx, values) + return total + + +def _tax_unit_mortgage_owner_status( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + head_take: np.ndarray, +) -> np.ndarray: + household_status = np.zeros(head_take.size, dtype=np.float32) + household_tenure = data.get("tenure_type", {}).get(time_period) + person_household_id = data.get("person_household_id", {}).get(time_period) + household_ids = data.get("household_id", {}).get(time_period) + if ( + household_tenure is not None + and person_household_id is not None + and household_ids is not None + ): + household_map = { + int(household_id): _decode_owner_status(value) + for household_id, value in zip(household_ids, household_tenure) + } + household_status = np.array( + [ + household_map.get(int(household_id), 0) + for household_id in np.asarray(person_household_id)[head_take] + ], + dtype=np.float32, + ) + + spm_status = np.zeros(head_take.size, dtype=np.float32) + spm_tenure = data.get("spm_unit_tenure_type", {}).get(time_period) + person_spm_unit_id = data.get("person_spm_unit_id", {}).get(time_period) + spm_unit_ids = data.get("spm_unit_id", {}).get(time_period) + if spm_tenure is not None and person_spm_unit_id is not None and spm_unit_ids is not None: + spm_map = { + int(spm_unit_id): _decode_owner_status(value) + for spm_unit_id, value in zip(spm_unit_ids, spm_tenure) + } + spm_status = np.array( + [ + spm_map.get(int(spm_unit_id), 0) + for spm_unit_id in np.asarray(person_spm_unit_id)[head_take] + ], + dtype=np.float32, + ) + + return np.where(spm_status > 0, spm_status, household_status).astype(np.float32) + + +def _decode_owner_status(value) -> int: + if isinstance(value, bytes): + value = value.decode("utf-8") + value = str(value).upper() + if "OWNER_WITH_MORTGAGE" in value or "OWNED_WITH_MORTGAGE" in value: + return 2 + if "OWNER_WITHOUT_MORTGAGE" in value: + return 1 + return 0 + + +def _filer_share( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + person_tax_unit_idx: np.ndarray, + n_tax_units: int, +) -> np.ndarray: + is_head = np.asarray( + data.get("is_tax_unit_head", {}).get( + time_period, np.zeros(len(person_tax_unit_idx), dtype=bool) + ), + dtype=bool, + ) + is_spouse = np.asarray( + data.get("is_tax_unit_spouse", {}).get( + time_period, np.zeros(len(person_tax_unit_idx), dtype=bool) + ), + dtype=bool, + ) + filer_mask = (is_head | is_spouse).astype(np.float32) + filer_count = np.zeros(n_tax_units, dtype=np.float32) + np.add.at(filer_count, person_tax_unit_idx, filer_mask) + + share = np.zeros(len(person_tax_unit_idx), dtype=np.float32) + positive_filers = filer_count[person_tax_unit_idx] > 0 + share[positive_filers] = ( + filer_mask[positive_filers] / filer_count[person_tax_unit_idx][positive_filers] + ) + + no_filer_mask = filer_count[person_tax_unit_idx] == 0 + if np.any(no_filer_mask): + share[no_filer_mask] = _equal_person_share( + person_tax_unit_idx[no_filer_mask], + n_tax_units, + ) + + return share + + +def _normalize_person_share( + person_values: np.ndarray, + person_tax_unit_idx: np.ndarray, + n_tax_units: int, + fallback_share: np.ndarray, +) -> np.ndarray: + tax_unit_totals = np.zeros(n_tax_units, dtype=np.float32) + np.add.at(tax_unit_totals, person_tax_unit_idx, person_values) + share = np.zeros_like(person_values, dtype=np.float32) + positive = tax_unit_totals[person_tax_unit_idx] > 0 + share[positive] = ( + person_values[positive] / tax_unit_totals[person_tax_unit_idx][positive] + ) + share[~positive] = fallback_share[~positive] + return share + + +def _equal_person_share( + person_tax_unit_idx: np.ndarray, + n_tax_units: int, +) -> np.ndarray: + counts = np.zeros(n_tax_units, dtype=np.float32) + np.add.at(counts, person_tax_unit_idx, 1) + return (1 / counts[person_tax_unit_idx]).astype(np.float32) + + +def _tax_unit_age( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + person_tax_unit_idx: np.ndarray, + n_tax_units: int, +) -> np.ndarray: + ages = np.asarray( + data.get("age", {}).get(time_period, np.zeros(len(person_tax_unit_idx))), + dtype=np.float32, + ) + is_head = np.asarray( + data.get("is_tax_unit_head", {}).get( + time_period, np.zeros(len(person_tax_unit_idx), dtype=bool) + ), + dtype=bool, + ) + is_spouse = np.asarray( + data.get("is_tax_unit_spouse", {}).get( + time_period, np.zeros(len(person_tax_unit_idx), dtype=bool) + ), + dtype=bool, + ) + filer_ages = np.where(is_head | is_spouse, ages, 0) + tax_unit_age = np.zeros(n_tax_units, dtype=np.float32) + np.maximum.at(tax_unit_age, person_tax_unit_idx, filer_ages) + + missing_age = tax_unit_age == 0 + if np.any(missing_age): + any_age = np.zeros(n_tax_units, dtype=np.float32) + np.maximum.at(any_age, person_tax_unit_idx, ages) + tax_unit_age[missing_age] = any_age[missing_age] + + tax_unit_age[missing_age & (tax_unit_age == 0)] = 45 + return tax_unit_age + + +def _estimate_mortgage_balance_and_year( + tax_unit_ids: np.ndarray, + deductible_mortgage_interest: np.ndarray, + post_cap: np.ndarray, + tax_unit_age: np.ndarray, + time_period: int, + hinted_balance: np.ndarray, +) -> tuple[np.ndarray, np.ndarray]: + balance = np.zeros_like(deductible_mortgage_interest, dtype=np.float32) + year = np.zeros_like(deductible_mortgage_interest, dtype=np.int32) + has_mortgage = (deductible_mortgage_interest > 0) | (hinted_balance > 0) + if not np.any(has_mortgage): + return balance, year + + older_draw = _stable_uniform(tax_unit_ids, salt=17) + year_draw = _stable_uniform(tax_unit_ids, salt=31) + + pre_probability = np.clip( + 0.10 + 0.012 * np.maximum(tax_unit_age - 30, 0), + 0.10, + 0.85, + ) + + provisional_rate = 0.045 + provisional_balance = np.where( + hinted_balance > 0, + hinted_balance, + deductible_mortgage_interest / provisional_rate, + ) + pre_probability += 0.20 * (provisional_balance > post_cap) + pre_probability = np.clip(pre_probability, 0.10, 0.90) + + if time_period <= 2017: + is_pre_tcja = has_mortgage + else: + is_pre_tcja = has_mortgage & (older_draw < pre_probability) + + pre_span = 13 # 2005-2017 inclusive + year[is_pre_tcja] = 2005 + np.floor(year_draw[is_pre_tcja] * pre_span).astype( + np.int32 + ) + + post_mask = has_mortgage & ~is_pre_tcja + post_start = 2018 if time_period >= 2018 else time_period + post_span = max(1, time_period - post_start + 1) + year[post_mask] = post_start + np.floor(year_draw[post_mask] * post_span).astype( + np.int32 + ) + + rate = _mortgage_rate(year) + balance[has_mortgage] = np.where( + hinted_balance[has_mortgage] > 0, + hinted_balance[has_mortgage], + deductible_mortgage_interest[has_mortgage] / rate[has_mortgage], + ) + return balance, year + + +def _split_interest_by_balance( + total_interest: np.ndarray, + first_balance: np.ndarray, + second_balance: np.ndarray, +) -> tuple[np.ndarray, np.ndarray]: + total_balance = first_balance + second_balance + first_interest = np.zeros_like(total_interest, dtype=np.float32) + second_interest = np.zeros_like(total_interest, dtype=np.float32) + + with_second = total_balance > 0 + first_interest[with_second] = ( + total_interest[with_second] * first_balance[with_second] / total_balance[with_second] + ) + second_interest[with_second] = total_interest[with_second] - first_interest[with_second] + + no_second = second_balance == 0 + first_interest[no_second] = total_interest[no_second] + second_interest[no_second] = 0 + return first_interest.astype(np.float32), second_interest.astype(np.float32) + + +def _mortgage_rate(origination_year: np.ndarray) -> np.ndarray: + year = np.asarray(origination_year, dtype=np.int32) + rate = np.full(year.shape, 0.045, dtype=np.float32) + rate[year <= 2017] = 0.040 + rate[(year >= 2018) & (year <= 2019)] = 0.045 + rate[(year >= 2020) & (year <= 2021)] = 0.035 + rate[year == 2022] = 0.0525 + rate[year >= 2023] = 0.0675 + return rate + + +def _stable_uniform(ids: np.ndarray, salt: int) -> np.ndarray: + values = np.asarray(ids, dtype=np.uint64) + hashed = values * np.uint64(1_103_515_245 + salt) + np.uint64(12_345 + salt) + return ((hashed % np.uint64(2**31)).astype(np.float64) / float(2**31)).astype( + np.float32 + ) + + +def _decode_filing_status(value) -> str: + if isinstance(value, bytes): + return value.decode("utf-8").upper() + return str(value).upper() + + +def _post_tcja_cap(status: str) -> float: + if "SEPARATE" in status: + return 375_000.0 + return 750_000.0 + + +def _pre_tcja_cap(status: str) -> float: + if "SEPARATE" in status: + return 500_000.0 + return 1_000_000.0 From 9f7ad45983f0007626a663c95161b2cf831acf19 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 24 Mar 2026 05:53:11 -0400 Subject: [PATCH 2/5] Format structural mortgage support files --- policyengine_us_data/datasets/puf/puf.py | 5 +++- .../test_mortgage_interest.py | 8 ++---- .../utils/mortgage_interest.py | 26 ++++++++++++------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 9a0e8a22..9747df41 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -646,7 +646,10 @@ def generate(self): self.holder[key] = np.array(self.holder[key]).astype(float) assert not np.isnan(self.holder[key]).any(), f"{key} has NaNs." - holder_tp = {variable: {self.time_period: values} for variable, values in self.holder.items()} + holder_tp = { + variable: {self.time_period: values} + for variable, values in self.holder.items() + } holder_tp = convert_mortgage_interest_to_structural_inputs( holder_tp, self.time_period, diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py index 94aed99e..82a42d1f 100644 --- a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py +++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py @@ -117,12 +117,8 @@ def test_structural_mortgage_conversion_keeps_balance_hints_for_non_itemizers(): converted = convert_mortgage_interest_to_structural_inputs(data, 2024) - assert converted["first_home_mortgage_balance"][2024][0] == pytest.approx( - 250_000.0 - ) - assert converted["second_home_mortgage_balance"][2024][0] == pytest.approx( - 25_000.0 - ) + assert converted["first_home_mortgage_balance"][2024][0] == pytest.approx(250_000.0) + assert converted["second_home_mortgage_balance"][2024][0] == pytest.approx(25_000.0) assert converted["first_home_mortgage_interest"][2024][0] == 0 assert converted["second_home_mortgage_interest"][2024][0] == 0 assert converted["first_home_mortgage_origination_year"][2024][0] > 0 diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py index bcf1be48..f49530bf 100644 --- a/policyengine_us_data/utils/mortgage_interest.py +++ b/policyengine_us_data/utils/mortgage_interest.py @@ -327,7 +327,9 @@ def _build_tax_unit_mortgage_receiver( person_tax_unit_ids = np.asarray(person_tax_unit_ids) is_head = np.asarray(is_head, dtype=bool) n_tax_units = len(tax_unit_ids) - tax_unit_index = {int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids)} + tax_unit_index = { + int(tax_unit_id): idx for idx, tax_unit_id in enumerate(tax_unit_ids) + } person_tax_unit_idx = np.array( [tax_unit_index[int(tax_unit_id)] for tax_unit_id in person_tax_unit_ids], dtype=np.int32, @@ -418,13 +420,9 @@ def _build_scf_mortgage_donor(scf: pd.DataFrame) -> pd.DataFrame: donor["is_female"] = _frame_column(scf, "is_female") donor["cps_race"] = _frame_column(scf, "cps_race") donor["is_married"] = _frame_column(scf, "is_married") - donor["own_children_in_household"] = _frame_column( - scf, "own_children_in_household" - ) + donor["own_children_in_household"] = _frame_column(scf, "own_children_in_household") donor["employment_income"] = _frame_column(scf, "employment_income") - donor["interest_dividend_income"] = _frame_column( - scf, "interest_dividend_income" - ) + donor["interest_dividend_income"] = _frame_column(scf, "interest_dividend_income") donor["social_security_pension_income"] = _frame_column( scf, "social_security_pension_income" ) @@ -527,7 +525,11 @@ def _tax_unit_mortgage_owner_status( spm_tenure = data.get("spm_unit_tenure_type", {}).get(time_period) person_spm_unit_id = data.get("person_spm_unit_id", {}).get(time_period) spm_unit_ids = data.get("spm_unit_id", {}).get(time_period) - if spm_tenure is not None and person_spm_unit_id is not None and spm_unit_ids is not None: + if ( + spm_tenure is not None + and person_spm_unit_id is not None + and spm_unit_ids is not None + ): spm_map = { int(spm_unit_id): _decode_owner_status(value) for spm_unit_id, value in zip(spm_unit_ids, spm_tenure) @@ -723,9 +725,13 @@ def _split_interest_by_balance( with_second = total_balance > 0 first_interest[with_second] = ( - total_interest[with_second] * first_balance[with_second] / total_balance[with_second] + total_interest[with_second] + * first_balance[with_second] + / total_balance[with_second] + ) + second_interest[with_second] = ( + total_interest[with_second] - first_interest[with_second] ) - second_interest[with_second] = total_interest[with_second] - first_interest[with_second] no_second = second_balance == 0 first_interest[no_second] = total_interest[no_second] From 32254ddc63085d8132da409f53df92a2b6b867f5 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 25 Mar 2026 21:43:50 -0400 Subject: [PATCH 3/5] Refactor structural mortgage input compatibility --- .../datasets/cps/extended_cps.py | 27 +- policyengine_us_data/datasets/puf/puf.py | 11 +- .../test_mortgage_interest.py | 268 +++++++++++------- .../utils/mortgage_interest.py | 20 +- policyengine_us_data/utils/policyengine.py | 17 ++ 5 files changed, 206 insertions(+), 137 deletions(-) create mode 100644 policyengine_us_data/utils/policyengine.py diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 288e3e46..e0871736 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -10,10 +10,11 @@ from policyengine_us_data.datasets.puf import PUF, PUF_2024 from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.utils.mortgage_interest import ( + STRUCTURAL_MORTGAGE_VARIABLES, convert_mortgage_interest_to_structural_inputs, impute_tax_unit_mortgage_balance_hints, - supports_structural_mortgage_inputs, ) +from policyengine_us_data.utils.policyengine import has_policyengine_us_variables from policyengine_us_data.utils.retirement_limits import ( get_retirement_limits, get_se_pension_limits, @@ -21,6 +22,11 @@ logger = logging.getLogger(__name__) + +def _supports_structural_mortgage_inputs() -> bool: + return has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES) + + # CPS-only variables that should be QRF-imputed for the PUF clone half # instead of naively duplicated from the CPS donor. These are # income-correlated variables that exist only in the CPS; demographics, @@ -450,14 +456,15 @@ def generate(self): ) new_data = self._rename_imputed_to_inputs(new_data) - new_data = impute_tax_unit_mortgage_balance_hints( - new_data, - self.time_period, - ) - new_data = convert_mortgage_interest_to_structural_inputs( - new_data, - self.time_period, - ) + if _supports_structural_mortgage_inputs(): + new_data = impute_tax_unit_mortgage_balance_hints( + new_data, + self.time_period, + ) + new_data = convert_mortgage_interest_to_structural_inputs( + new_data, + self.time_period, + ) new_data = self._drop_formula_variables(new_data) self.save_dataset(new_data) @@ -492,7 +499,7 @@ def _rename_imputed_to_inputs(cls, data): @classmethod def _keep_formula_vars(cls): keep = set(cls._KEEP_FORMULA_VARS) - if not supports_structural_mortgage_inputs(): + if not _supports_structural_mortgage_inputs(): keep.add("interest_deduction") return keep diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 9747df41..bde0f33f 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -13,8 +13,10 @@ disaggregate_aggregate_records, ) from policyengine_us_data.utils.mortgage_interest import ( + STRUCTURAL_MORTGAGE_VARIABLES, convert_mortgage_interest_to_structural_inputs, ) +from policyengine_us_data.utils.policyengine import has_policyengine_us_variables from policyengine_us_data.utils.uprating import ( create_policyengine_uprating_factors_table, ) @@ -650,10 +652,11 @@ def generate(self): variable: {self.time_period: values} for variable, values in self.holder.items() } - holder_tp = convert_mortgage_interest_to_structural_inputs( - holder_tp, - self.time_period, - ) + if has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES): + holder_tp = convert_mortgage_interest_to_structural_inputs( + holder_tp, + self.time_period, + ) self.holder = { variable: values[self.time_period] for variable, values in holder_tp.items() } diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py index 82a42d1f..bbcfea45 100644 --- a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py +++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py @@ -3,36 +3,103 @@ import pytest from policyengine_us_data.utils.mortgage_interest import ( + STRUCTURAL_MORTGAGE_VARIABLES, convert_mortgage_interest_to_structural_inputs, impute_tax_unit_mortgage_balance_hints, - supports_structural_mortgage_inputs, ) +from policyengine_us_data.utils.policyengine import has_policyengine_us_variables +TIME_PERIOD = 2024 +HAS_STRUCTURAL_MORTGAGE_INPUTS = has_policyengine_us_variables( + *STRUCTURAL_MORTGAGE_VARIABLES +) + + +def _at_time_period(values, dtype=None): + return {TIME_PERIOD: np.array(values, dtype=dtype)} + + +def _time_period_variables(**variables): + return {name: _at_time_period(values) for name, values in variables.items()} + + +def _head_and_spouse_flags(person_tax_unit_ids): + first_seen = {} + heads = np.zeros(len(person_tax_unit_ids), dtype=bool) + spouses = np.zeros(len(person_tax_unit_ids), dtype=bool) + + for idx, tax_unit_id in enumerate(person_tax_unit_ids): + occurrence = first_seen.get(int(tax_unit_id), 0) + if occurrence == 0: + heads[idx] = True + elif occurrence == 1: + spouses[idx] = True + first_seen[int(tax_unit_id)] = occurrence + 1 + + return heads, spouses + + +def _base_dataset_dict( + *, + person_tax_unit_ids, + ages, + deductible_mortgage_interest=None, + interest_deduction=None, + filing_status=None, +): + person_tax_unit_ids = np.array(person_tax_unit_ids, dtype=np.int32) + tax_unit_ids = np.unique(person_tax_unit_ids) + n_people = len(person_tax_unit_ids) + person_ids = np.arange(1, n_people + 1, dtype=np.int32) + heads, spouses = _head_and_spouse_flags(person_tax_unit_ids) + + data = { + "person_id": _at_time_period(person_ids), + "tax_unit_id": _at_time_period(tax_unit_ids), + "marital_unit_id": _at_time_period(tax_unit_ids), + "spm_unit_id": _at_time_period(tax_unit_ids), + "family_id": _at_time_period(tax_unit_ids), + "household_id": _at_time_period(tax_unit_ids), + "person_tax_unit_id": _at_time_period(person_tax_unit_ids), + "person_marital_unit_id": _at_time_period(person_tax_unit_ids), + "person_spm_unit_id": _at_time_period(person_tax_unit_ids), + "person_family_id": _at_time_period(person_tax_unit_ids), + "person_household_id": _at_time_period(person_tax_unit_ids), + "is_tax_unit_head": _at_time_period(heads), + "is_tax_unit_spouse": _at_time_period(spouses), + "age": _at_time_period(ages), + } + + if filing_status is not None: + data["filing_status"] = _at_time_period(filing_status) + if deductible_mortgage_interest is not None: + data["deductible_mortgage_interest"] = _at_time_period( + deductible_mortgage_interest, + dtype=np.float32, + ) + if interest_deduction is not None: + data["interest_deduction"] = _at_time_period( + interest_deduction, + dtype=np.float32, + ) -def _base_dataset_dict(deductible_mortgage_interest, interest_deduction): - time_period = 2024 + return data + + +def _mock_scf_dataset(): return { - "person_id": {time_period: np.array([1, 2])}, - "tax_unit_id": {time_period: np.array([1])}, - "marital_unit_id": {time_period: np.array([1])}, - "spm_unit_id": {time_period: np.array([1])}, - "family_id": {time_period: np.array([1])}, - "household_id": {time_period: np.array([1])}, - "person_tax_unit_id": {time_period: np.array([1, 1])}, - "person_marital_unit_id": {time_period: np.array([1, 1])}, - "person_spm_unit_id": {time_period: np.array([1, 1])}, - "person_family_id": {time_period: np.array([1, 1])}, - "person_household_id": {time_period: np.array([1, 1])}, - "is_tax_unit_head": {time_period: np.array([True, False])}, - "is_tax_unit_spouse": {time_period: np.array([False, True])}, - "age": {time_period: np.array([55, 53])}, - "filing_status": {time_period: np.array([b"JOINT"])}, - "deductible_mortgage_interest": { - time_period: np.array(deductible_mortgage_interest, dtype=np.float32) - }, - "interest_deduction": { - time_period: np.array(interest_deduction, dtype=np.float32) - }, + "age": np.array([45, 55]), + "is_female": np.array([0, 1]), + "cps_race": np.array([1, 2]), + "is_married": np.array([1, 0]), + "own_children_in_household": np.array([1, 0]), + "employment_income": np.array([80_000, 40_000]), + "interest_dividend_income": np.array([2_000, 1_000]), + "social_security_pension_income": np.array([0, 5_000]), + "nh_mort": np.array([250_000, 0]), + "heloc": np.array([25_000, 0]), + "houses": np.array([500_000, 350_000]), + "wgt": np.array([1, 1]), } @@ -44,91 +111,104 @@ def _current_law_cap(filing_status: bytes, origination_year: int) -> float: @pytest.mark.skipif( - not supports_structural_mortgage_inputs(), + not HAS_STRUCTURAL_MORTGAGE_INPUTS, reason="Installed policyengine-us does not yet expose structural MID inputs.", ) def test_structural_mortgage_conversion_preserves_current_law_interest_deduction(): data = _base_dataset_dict( + person_tax_unit_ids=[1, 1], + ages=[55, 53], deductible_mortgage_interest=[6_000.0, 0.0], interest_deduction=[7_000.0], + filing_status=[b"JOINT"], ) - converted = convert_mortgage_interest_to_structural_inputs(data, 2024) + converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD) assert "deductible_mortgage_interest" not in converted assert "interest_deduction" not in converted - assert converted["first_home_mortgage_balance"][2024][0] > 0 - assert converted["first_home_mortgage_interest"][2024][0] >= 6_000 - assert converted["first_home_mortgage_origination_year"][2024][0] > 0 - assert converted["investment_interest_expense"][2024].sum() == pytest.approx( + assert converted["first_home_mortgage_balance"][TIME_PERIOD][0] > 0 + assert converted["first_home_mortgage_interest"][TIME_PERIOD][0] >= 6_000 + assert converted["first_home_mortgage_origination_year"][TIME_PERIOD][0] > 0 + assert converted["investment_interest_expense"][TIME_PERIOD].sum() == pytest.approx( 1_000.0 ) cap = _current_law_cap( - converted["filing_status"][2024][0], - int(converted["first_home_mortgage_origination_year"][2024][0]), + converted["filing_status"][TIME_PERIOD][0], + int(converted["first_home_mortgage_origination_year"][TIME_PERIOD][0]), ) - balance = converted["first_home_mortgage_balance"][2024][0] - total_interest = converted["first_home_mortgage_interest"][2024][0] + balance = converted["first_home_mortgage_balance"][TIME_PERIOD][0] + total_interest = converted["first_home_mortgage_interest"][TIME_PERIOD][0] deductible_share = min(1.0, cap / balance) if balance > 0 else 0.0 assert total_interest * deductible_share == pytest.approx(6_000.0) - assert converted["home_mortgage_interest"][2024].sum() == pytest.approx( + assert converted["home_mortgage_interest"][TIME_PERIOD].sum() == pytest.approx( total_interest ) assert ( total_interest * deductible_share - + converted["investment_interest_expense"][2024].sum() + + converted["investment_interest_expense"][TIME_PERIOD].sum() ) == pytest.approx(7_000.0) @pytest.mark.skipif( - not supports_structural_mortgage_inputs(), + not HAS_STRUCTURAL_MORTGAGE_INPUTS, reason="Installed policyengine-us does not yet expose structural MID inputs.", ) def test_structural_mortgage_conversion_preserves_non_mortgage_interest(): data = _base_dataset_dict( + person_tax_unit_ids=[1, 1], + ages=[55, 53], deductible_mortgage_interest=[0.0, 0.0], interest_deduction=[2_500.0], + filing_status=[b"JOINT"], ) - converted = convert_mortgage_interest_to_structural_inputs(data, 2024) + converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD) - assert converted["first_home_mortgage_balance"][2024][0] == 0 - assert converted["first_home_mortgage_interest"][2024][0] == 0 - assert converted["home_mortgage_interest"][2024].sum() == 0 - assert converted["investment_interest_expense"][2024].sum() == pytest.approx( + assert converted["first_home_mortgage_balance"][TIME_PERIOD][0] == 0 + assert converted["first_home_mortgage_interest"][TIME_PERIOD][0] == 0 + assert converted["home_mortgage_interest"][TIME_PERIOD].sum() == 0 + assert converted["investment_interest_expense"][TIME_PERIOD].sum() == pytest.approx( 2_500.0 ) @pytest.mark.skipif( - not supports_structural_mortgage_inputs(), + not HAS_STRUCTURAL_MORTGAGE_INPUTS, reason="Installed policyengine-us does not yet expose structural MID inputs.", ) def test_structural_mortgage_conversion_keeps_balance_hints_for_non_itemizers(): data = _base_dataset_dict( + person_tax_unit_ids=[1, 1], + ages=[55, 53], deductible_mortgage_interest=[0.0, 0.0], interest_deduction=[0.0], + filing_status=[b"JOINT"], ) data["imputed_first_home_mortgage_balance_hint"] = { - 2024: np.array([250_000.0], dtype=np.float32) + TIME_PERIOD: np.array([250_000.0], dtype=np.float32) } data["imputed_second_home_mortgage_balance_hint"] = { - 2024: np.array([25_000.0], dtype=np.float32) + TIME_PERIOD: np.array([25_000.0], dtype=np.float32) } - converted = convert_mortgage_interest_to_structural_inputs(data, 2024) + converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD) - assert converted["first_home_mortgage_balance"][2024][0] == pytest.approx(250_000.0) - assert converted["second_home_mortgage_balance"][2024][0] == pytest.approx(25_000.0) - assert converted["first_home_mortgage_interest"][2024][0] == 0 - assert converted["second_home_mortgage_interest"][2024][0] == 0 - assert converted["first_home_mortgage_origination_year"][2024][0] > 0 - assert converted["second_home_mortgage_origination_year"][2024][0] >= 2018 - assert converted["home_mortgage_interest"][2024].sum() == 0 - assert converted["investment_interest_expense"][2024].sum() == 0 + assert converted["first_home_mortgage_balance"][TIME_PERIOD][0] == pytest.approx( + 250_000.0 + ) + assert converted["second_home_mortgage_balance"][TIME_PERIOD][0] == pytest.approx( + 25_000.0 + ) + assert converted["first_home_mortgage_interest"][TIME_PERIOD][0] == 0 + assert converted["second_home_mortgage_interest"][TIME_PERIOD][0] == 0 + assert converted["first_home_mortgage_origination_year"][TIME_PERIOD][0] > 0 + assert converted["second_home_mortgage_origination_year"][TIME_PERIOD][0] >= 2018 + assert converted["home_mortgage_interest"][TIME_PERIOD].sum() == 0 + assert converted["investment_interest_expense"][TIME_PERIOD].sum() == 0 @pytest.mark.skipif( - not supports_structural_mortgage_inputs(), + not HAS_STRUCTURAL_MORTGAGE_INPUTS, reason="Installed policyengine-us does not yet expose structural MID inputs.", ) def test_scf_balance_hint_imputation_zeroes_non_mortgaged_owner(monkeypatch): @@ -157,62 +237,42 @@ def predict(self, X_test): monkeypatch.setattr( scf_module.SCF_2022, "load_dataset", - lambda self: { - "age": np.array([45, 55]), - "is_female": np.array([0, 1]), - "cps_race": np.array([1, 2]), - "is_married": np.array([1, 0]), - "own_children_in_household": np.array([1, 0]), - "employment_income": np.array([80_000, 40_000]), - "interest_dividend_income": np.array([2_000, 1_000]), - "social_security_pension_income": np.array([0, 5_000]), - "nh_mort": np.array([250_000, 0]), - "heloc": np.array([25_000, 0]), - "houses": np.array([500_000, 350_000]), - "wgt": np.array([1, 1]), - }, + lambda self: _mock_scf_dataset(), ) - data = { - "person_id": {2024: np.array([1, 2])}, - "tax_unit_id": {2024: np.array([1, 2])}, - "marital_unit_id": {2024: np.array([1, 2])}, - "spm_unit_id": {2024: np.array([1, 2])}, - "family_id": {2024: np.array([1, 2])}, - "household_id": {2024: np.array([1, 2])}, - "person_tax_unit_id": {2024: np.array([1, 2])}, - "person_marital_unit_id": {2024: np.array([1, 2])}, - "person_spm_unit_id": {2024: np.array([1, 2])}, - "person_family_id": {2024: np.array([1, 2])}, - "person_household_id": {2024: np.array([1, 2])}, - "is_tax_unit_head": {2024: np.array([True, True])}, - "is_tax_unit_spouse": {2024: np.array([False, False])}, - "age": {2024: np.array([45, 55])}, - "is_male": {2024: np.array([1, 0])}, - "cps_race": {2024: np.array([1, 2])}, - "employment_income": {2024: np.array([80_000, 40_000])}, - "taxable_interest_income": {2024: np.array([1_000, 500])}, - "tax_exempt_interest_income": {2024: np.array([0, 0])}, - "qualified_dividend_income": {2024: np.array([500, 250])}, - "non_qualified_dividend_income": {2024: np.array([0, 0])}, - "social_security_retirement": {2024: np.array([0, 5_000])}, - "taxable_private_pension_income": {2024: np.array([0, 0])}, - "tax_exempt_private_pension_income": {2024: np.array([0, 0])}, - "tenure_type": { - 2024: np.array([b"OWNED_WITH_MORTGAGE", b"OWNED_WITH_MORTGAGE"]) - }, - "spm_unit_tenure_type": { - 2024: np.array([b"OWNER_WITH_MORTGAGE", b"OWNER_WITHOUT_MORTGAGE"]) - }, - } + data = _base_dataset_dict( + person_tax_unit_ids=[1, 2], + ages=[45, 55], + ) + data |= _time_period_variables( + is_male=[1, 0], + cps_race=[1, 2], + employment_income=[80_000, 40_000], + taxable_interest_income=[1_000, 500], + tax_exempt_interest_income=[0, 0], + qualified_dividend_income=[500, 250], + non_qualified_dividend_income=[0, 0], + social_security_retirement=[0, 5_000], + taxable_private_pension_income=[0, 0], + tax_exempt_private_pension_income=[0, 0], + tenure_type=[b"OWNED_WITH_MORTGAGE", b"OWNED_WITH_MORTGAGE"], + spm_unit_tenure_type=[ + b"OWNER_WITH_MORTGAGE", + b"OWNER_WITHOUT_MORTGAGE", + ], + ) - imputed = impute_tax_unit_mortgage_balance_hints(data, 2024) + imputed = impute_tax_unit_mortgage_balance_hints(data, TIME_PERIOD) - assert imputed["imputed_first_home_mortgage_balance_hint"][2024].tolist() == [ + assert imputed["imputed_first_home_mortgage_balance_hint"][ + TIME_PERIOD + ].tolist() == [ 200_000.0, 0.0, ] - assert imputed["imputed_second_home_mortgage_balance_hint"][2024].tolist() == [ + assert imputed["imputed_second_home_mortgage_balance_hint"][ + TIME_PERIOD + ].tolist() == [ 20_000.0, 0.0, ] diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py index f49530bf..fbd6df94 100644 --- a/policyengine_us_data/utils/mortgage_interest.py +++ b/policyengine_us_data/utils/mortgage_interest.py @@ -33,17 +33,6 @@ ] -def supports_structural_mortgage_inputs() -> bool: - """Return whether the installed policyengine-us exposes structural MID inputs.""" - try: - from policyengine_us import CountryTaxBenefitSystem - except ImportError: - return False - - tbs = CountryTaxBenefitSystem() - return all(name in tbs.variables for name in STRUCTURAL_MORTGAGE_VARIABLES) - - def impute_tax_unit_mortgage_balance_hints( data: Dict[str, Dict[int, np.ndarray]], time_period: int, @@ -55,9 +44,6 @@ def impute_tax_unit_mortgage_balance_hints( mortgage balance distribution without forcing the baseline to use mortgage interest for non-itemizers. """ - if not supports_structural_mortgage_inputs(): - return data - receiver = _build_tax_unit_mortgage_receiver(data, time_period) if receiver.empty: return data @@ -117,8 +103,7 @@ def convert_mortgage_interest_to_structural_inputs( The current us-data calibration pipeline imputes a person-level ``deductible_mortgage_interest`` and a tax-unit-level ``interest_deduction``. That short-circuits structural MID reforms in - policyengine-us. When structural mortgage inputs are available, convert - those imputed amounts into: + policyengine-us, so this converts those imputed amounts into: * tax-unit mortgage balances, interest, and origination years * person-level ``home_mortgage_interest`` for within-tax-unit allocation @@ -131,9 +116,6 @@ def convert_mortgage_interest_to_structural_inputs( * the origination year is heuristic, because the current public pipeline does not carry a mortgage-vintage input """ - if not supports_structural_mortgage_inputs(): - return data - tp = time_period person_ids = data.get("person_id", {}).get(tp) tax_unit_ids = data.get("tax_unit_id", {}).get(tp) diff --git a/policyengine_us_data/utils/policyengine.py b/policyengine_us_data/utils/policyengine.py new file mode 100644 index 00000000..18b9050f --- /dev/null +++ b/policyengine_us_data/utils/policyengine.py @@ -0,0 +1,17 @@ +from functools import lru_cache + + +@lru_cache(maxsize=1) +def _policyengine_us_variable_names() -> frozenset[str]: + from policyengine_us import CountryTaxBenefitSystem + + return frozenset(CountryTaxBenefitSystem().variables) + + +def has_policyengine_us_variables(*variables: str) -> bool: + try: + available_variables = _policyengine_us_variable_names() + except Exception: + return False + + return set(variables).issubset(available_variables) From 857b31c769c554a300a7f6ba6446f7897e13fef8 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 27 Mar 2026 09:17:16 -0400 Subject: [PATCH 4/5] Improve MID structure and SOI targeting --- .../test_mortgage_interest.py | 49 ++++++++++++-- .../tests/test_calibration/test_soi.py | 9 +++ policyengine_us_data/utils/loss.py | 3 +- .../utils/mortgage_interest.py | 64 ++++++++++++++++++- policyengine_us_data/utils/soi.py | 12 +++- 5 files changed, 126 insertions(+), 11 deletions(-) create mode 100644 policyengine_us_data/tests/test_calibration/test_soi.py diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py index bbcfea45..cc689be6 100644 --- a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py +++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py @@ -4,6 +4,7 @@ from policyengine_us_data.utils.mortgage_interest import ( STRUCTURAL_MORTGAGE_VARIABLES, + _interest_implied_balance_floor, convert_mortgage_interest_to_structural_inputs, impute_tax_unit_mortgage_balance_hints, ) @@ -23,6 +24,17 @@ def _time_period_variables(**variables): return {name: _at_time_period(values) for name, values in variables.items()} +def _set_balance_hints(data, *, first, second): + data["imputed_first_home_mortgage_balance_hint"] = _at_time_period( + first, + dtype=np.float32, + ) + data["imputed_second_home_mortgage_balance_hint"] = _at_time_period( + second, + dtype=np.float32, + ) + + def _head_and_spouse_flags(person_tax_unit_ids): first_seen = {} heads = np.zeros(len(person_tax_unit_ids), dtype=bool) @@ -184,12 +196,7 @@ def test_structural_mortgage_conversion_keeps_balance_hints_for_non_itemizers(): interest_deduction=[0.0], filing_status=[b"JOINT"], ) - data["imputed_first_home_mortgage_balance_hint"] = { - TIME_PERIOD: np.array([250_000.0], dtype=np.float32) - } - data["imputed_second_home_mortgage_balance_hint"] = { - TIME_PERIOD: np.array([25_000.0], dtype=np.float32) - } + _set_balance_hints(data, first=[250_000.0], second=[25_000.0]) converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD) @@ -276,3 +283,33 @@ def predict(self, X_test): 20_000.0, 0.0, ] + + +@pytest.mark.skipif( + not HAS_STRUCTURAL_MORTGAGE_INPUTS, + reason="Installed policyengine-us does not yet expose structural MID inputs.", +) +def test_structural_mortgage_conversion_scales_hints_to_interest_floor(): + data = _base_dataset_dict( + person_tax_unit_ids=[1, 1], + ages=[55, 53], + deductible_mortgage_interest=[30_000.0, 0.0], + interest_deduction=[30_000.0], + filing_status=[b"JOINT"], + ) + _set_balance_hints(data, first=[200_000.0], second=[25_000.0]) + + converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD) + first_balance = converted["first_home_mortgage_balance"][TIME_PERIOD][0] + second_balance = converted["second_home_mortgage_balance"][TIME_PERIOD][0] + expected_floor = _interest_implied_balance_floor( + np.array([30_000.0], dtype=np.float32), + TIME_PERIOD, + )[0] + + assert first_balance + second_balance == pytest.approx(expected_floor) + assert first_balance / second_balance == pytest.approx(8.0) + assert converted["home_mortgage_interest"][TIME_PERIOD].sum() == pytest.approx( + converted["first_home_mortgage_interest"][TIME_PERIOD][0] + + converted["second_home_mortgage_interest"][TIME_PERIOD][0] + ) diff --git a/policyengine_us_data/tests/test_calibration/test_soi.py b/policyengine_us_data/tests/test_calibration/test_soi.py new file mode 100644 index 00000000..c8354b95 --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_soi.py @@ -0,0 +1,9 @@ +from policyengine_us_data.utils.soi import get_soi + + +def test_get_soi_includes_mortgage_interest_deduction_targets(): + soi = get_soi(2024) + mortgage_interest = soi[soi.Variable == "mortgage_interest_deductions"] + + assert not mortgage_interest.empty + assert mortgage_interest["Value"].gt(0).all() diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index bfbf49db..02507f63 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -4,7 +4,7 @@ import numpy as np import logging -from policyengine_us_data.storage import STORAGE_FOLDER, CALIBRATION_FOLDER +from policyengine_us_data.storage import CALIBRATION_FOLDER from policyengine_us_data.storage.calibration_targets.pull_soi_targets import ( STATE_ABBR_TO_FIPS, ) @@ -127,6 +127,7 @@ def build_loss_matrix(dataset: type, time_period): "partnership_and_s_corp_income", "qualified_dividends", "taxable_interest_income", + "mortgage_interest_deductions", "total_pension_income", "total_social_security", ] diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py index fbd6df94..5cae7a1d 100644 --- a/policyengine_us_data/utils/mortgage_interest.py +++ b/policyengine_us_data/utils/mortgage_interest.py @@ -113,6 +113,9 @@ def convert_mortgage_interest_to_structural_inputs( The conversion is intentionally conservative: * current-law deductible mortgage interest is preserved exactly * current-law total interest deduction is preserved exactly + * SCF-imputed first-lien and HELOC splits are preserved when available + * weak balance hints are lifted to a conservative lower bound implied by + the observed deductible mortgage interest * the origination year is heuristic, because the current public pipeline does not carry a mortgage-vintage input """ @@ -147,6 +150,8 @@ def convert_mortgage_interest_to_structural_inputs( first_balance_hint, second_balance_hint, ) = _get_tax_unit_mortgage_balance_hints(data, tp, n_tax_units) + hinted_total_balance = np.maximum(first_balance_hint + second_balance_hint, 0) + balance_floor = _interest_implied_balance_floor(tax_unit_deductible, tp) total_interest_deduction = _get_tax_unit_interest_deduction_target( data, @@ -177,7 +182,7 @@ def convert_mortgage_interest_to_structural_inputs( ) has_mortgage = tax_unit_deductible > 0 - hinted_balance = np.maximum(first_balance_hint + second_balance_hint, 0) + hinted_balance = np.maximum(hinted_total_balance, balance_floor) balance, origination_year = _estimate_mortgage_balance_and_year( tax_unit_ids, tax_unit_deductible, @@ -186,13 +191,18 @@ def convert_mortgage_interest_to_structural_inputs( tp, hinted_balance, ) - use_balance_hint = hinted_balance > 0 + use_balance_hint = hinted_total_balance > 0 first_balance = np.where(use_balance_hint, first_balance_hint, balance).astype( np.float32 ) second_balance = np.where(use_balance_hint, second_balance_hint, 0).astype( np.float32 ) + first_balance, second_balance = _apply_interest_implied_balance_floor( + first_balance, + second_balance, + balance_floor, + ) swap_mask = (first_balance == 0) & (second_balance > 0) first_balance[swap_mask] = second_balance[swap_mask] @@ -696,6 +706,56 @@ def _estimate_mortgage_balance_and_year( return balance, year +def _interest_implied_balance_floor( + deductible_mortgage_interest: np.ndarray, + time_period: int, +) -> np.ndarray: + """Conservative balance lower bound implied by deductible interest. + + Uses the current-period market mortgage rate as the denominator, so the + inferred balance is a lower bound rather than an aggressive reconstruction + of total acquisition debt. + """ + current_market_rate = float( + _mortgage_rate(np.array([time_period], dtype=np.int32))[0] + ) + if current_market_rate <= 0: + return np.zeros_like(deductible_mortgage_interest, dtype=np.float32) + return np.where( + deductible_mortgage_interest > 0, + deductible_mortgage_interest / current_market_rate, + 0, + ).astype(np.float32) + + +def _apply_interest_implied_balance_floor( + first_balance: np.ndarray, + second_balance: np.ndarray, + balance_floor: np.ndarray, +) -> tuple[np.ndarray, np.ndarray]: + """Prevent donor balance hints from understating observed mortgage interest.""" + first_balance = np.asarray(first_balance, dtype=np.float32).copy() + second_balance = np.asarray(second_balance, dtype=np.float32).copy() + balance_floor = np.maximum(np.asarray(balance_floor, dtype=np.float32), 0) + + total_balance = first_balance + second_balance + needs_floor = balance_floor > total_balance + with_existing_split = needs_floor & (total_balance > 0) + + scale = np.ones_like(total_balance, dtype=np.float32) + scale[with_existing_split] = ( + balance_floor[with_existing_split] / total_balance[with_existing_split] + ) + first_balance[with_existing_split] *= scale[with_existing_split] + second_balance[with_existing_split] *= scale[with_existing_split] + + no_existing_balance = needs_floor & (total_balance == 0) + first_balance[no_existing_balance] = balance_floor[no_existing_balance] + second_balance[no_existing_balance] = 0 + + return first_balance.astype(np.float32), second_balance.astype(np.float32) + + def _split_interest_by_balance( total_interest: np.ndarray, first_balance: np.ndarray, diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py index 997a8078..27a92956 100644 --- a/policyengine_us_data/utils/soi.py +++ b/policyengine_us_data/utils/soi.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np from .uprating import create_policyengine_uprating_factors_table -from policyengine_us_data.storage import STORAGE_FOLDER, CALIBRATION_FOLDER +from policyengine_us_data.storage import CALIBRATION_FOLDER def pe_to_soi(pe_dataset, year): @@ -11,7 +11,8 @@ def pe_to_soi(pe_dataset, year): pe_sim.default_calculation_period = year df = pd.DataFrame() - pe = lambda variable: np.array(pe_sim.calculate(variable, map_to="tax_unit")) + def pe(variable): + return np.array(pe_sim.calculate(variable, map_to="tax_unit")) df["adjusted_gross_income"] = pe("adjusted_gross_income") df["exemption"] = pe("exemptions") @@ -62,6 +63,7 @@ def pe_to_soi(pe_dataset, year): ) df["charitable_contributions_deduction"] = pe("charitable_deduction") df["interest_paid_deductions"] = pe("interest_deduction") + df["mortgage_interest_deductions"] = pe("deductible_mortgage_interest") df["medical_expense_deductions_uncapped"] = pe("medical_expense_deduction") df["state_and_local_tax_deductions"] = pe("salt_deduction") df["itemized_state_income_and_sales_tax_deductions"] = pe( @@ -108,6 +110,11 @@ def puf_to_soi(puf, year): df["employment_income"] = puf.E00200 df["charitable_contributions_deduction"] = puf.E19700 df["interest_paid_deductions"] = puf.E19200 + df["mortgage_interest_deductions"] = ( + puf["deductible_mortgage_interest"] + if "deductible_mortgage_interest" in puf + else puf.E19200 + ) df["medical_expense_deductions_uncapped"] = puf.E17500 df["itemized_state_income_and_sales_tax_deductions"] = puf.E18400 df["itemized_real_estate_tax_deductions"] = puf.E18500 @@ -146,6 +153,7 @@ def get_soi(year: int) -> pd.DataFrame: "partnership_and_s_corp_income": "partnership_s_corp_income", "qualified_dividends": "qualified_dividend_income", "taxable_interest_income": "taxable_interest_income", + "mortgage_interest_deductions": "interest_deduction", "total_pension_income": "pension_income", "total_social_security": "social_security", "business_net_losses": "self_employment_income", From 22ab8f10e789fb85cb348fe6837197fdd009a292 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 27 Mar 2026 11:28:18 -0400 Subject: [PATCH 5/5] Clarify MID proxy assumptions --- .../test_mortgage_interest.py | 28 +++++++++++++++++++ policyengine_us_data/utils/loss.py | 4 ++- .../utils/mortgage_interest.py | 14 ++++++++++ policyengine_us_data/utils/soi.py | 3 ++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py index cc689be6..bb6fbc1c 100644 --- a/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py +++ b/policyengine_us_data/tests/test_calibration/test_mortgage_interest.py @@ -5,6 +5,7 @@ from policyengine_us_data.utils.mortgage_interest import ( STRUCTURAL_MORTGAGE_VARIABLES, _interest_implied_balance_floor, + _post_tcja_cap, convert_mortgage_interest_to_structural_inputs, impute_tax_unit_mortgage_balance_hints, ) @@ -313,3 +314,30 @@ def test_structural_mortgage_conversion_scales_hints_to_interest_floor(): converted["first_home_mortgage_interest"][TIME_PERIOD][0] + converted["second_home_mortgage_interest"][TIME_PERIOD][0] ) + + +def test_post_tcja_cap_uses_mfs_limit(): + assert _post_tcja_cap("SEPARATE") == pytest.approx(375_000.0) + assert _post_tcja_cap("MARRIED_FILING_SEPARATELY") == pytest.approx(375_000.0) + + +@pytest.mark.skipif( + not HAS_STRUCTURAL_MORTGAGE_INPUTS, + reason="Installed policyengine-us does not yet expose structural MID inputs.", +) +def test_structural_mortgage_conversion_swaps_partial_hints(): + data = _base_dataset_dict( + person_tax_unit_ids=[1, 1], + ages=[55, 53], + deductible_mortgage_interest=[0.0, 0.0], + interest_deduction=[0.0], + filing_status=[b"JOINT"], + ) + _set_balance_hints(data, first=[0.0], second=[25_000.0]) + + converted = convert_mortgage_interest_to_structural_inputs(data, TIME_PERIOD) + + assert converted["first_home_mortgage_balance"][TIME_PERIOD][0] == pytest.approx( + 25_000.0 + ) + assert converted["second_home_mortgage_balance"][TIME_PERIOD][0] == 0 diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 02507f63..49662850 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -127,7 +127,6 @@ def build_loss_matrix(dataset: type, time_period): "partnership_and_s_corp_income", "qualified_dividends", "taxable_interest_income", - "mortgage_interest_deductions", "total_pension_income", "total_social_security", ] @@ -142,6 +141,9 @@ def build_loss_matrix(dataset: type, time_period): "partnership_and_s_corp_losses", "rent_and_royalty_net_income", "rent_and_royalty_net_losses", + # The current SOI source only exposes taxable-only aggregate targets for + # mortgage-interest deductions, not the AGI-bin detail used above. + "mortgage_interest_deductions", "taxable_pension_income", "taxable_social_security", "unemployment_compensation", diff --git a/policyengine_us_data/utils/mortgage_interest.py b/policyengine_us_data/utils/mortgage_interest.py index 5cae7a1d..af26974e 100644 --- a/policyengine_us_data/utils/mortgage_interest.py +++ b/policyengine_us_data/utils/mortgage_interest.py @@ -43,6 +43,11 @@ def impute_tax_unit_mortgage_balance_hints( data-layer hints that let the structural MID conversion reuse an SCF-like mortgage balance distribution without forcing the baseline to use mortgage interest for non-itemizers. + + The second hint is a generic secondary acquisition-debt slot. In the + public SCF, HELOC balances are the best observable proxy for that slot even + though the downstream ``second_home_mortgage_*`` variables in + policyengine-us are named around a second home. """ receiver = _build_tax_unit_mortgage_receiver(data, time_period) if receiver.empty: @@ -118,6 +123,10 @@ def convert_mortgage_interest_to_structural_inputs( the observed deductible mortgage interest * the origination year is heuristic, because the current public pipeline does not carry a mortgage-vintage input + + The structural model has two mortgage slots. In public data, we use those + slots for "first-lien" and "secondary acquisition debt" rather than trying + to identify literal primary-residence versus second-home mortgages. """ tp = time_period person_ids = data.get("person_id", {}).get(tp) @@ -228,6 +237,9 @@ def convert_mortgage_interest_to_structural_inputs( ) second_origination_year = np.where( second_balance > 0, + # The public data's second slot is mainly a HELOC/secondary-debt proxy, + # so treat it as post-TCJA unless a richer vintage input becomes + # available. np.maximum(2018, origination_year), 0, ).astype(np.int32) @@ -435,6 +447,8 @@ def _build_scf_mortgage_donor(scf: pd.DataFrame) -> pd.DataFrame: 2, np.where(owns_home, 1, 0), ).astype(np.float32) + # The second slot is not a literal second-home mortgage in SCF. We use + # HELOC balances as the best public proxy for secondary acquisition debt. donor["imputed_first_home_mortgage_balance_hint"] = np.maximum( total_mortgage - heloc, 0, diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py index 27a92956..41f2365c 100644 --- a/policyengine_us_data/utils/soi.py +++ b/policyengine_us_data/utils/soi.py @@ -153,6 +153,9 @@ def get_soi(year: int) -> pd.DataFrame: "partnership_and_s_corp_income": "partnership_s_corp_income", "qualified_dividends": "qualified_dividend_income", "taxable_interest_income": "taxable_interest_income", + # There is no separate published uprating factor for mortgage-interest + # deductions, so use total interest deductions as the closest available + # proxy. "mortgage_interest_deductions": "interest_deduction", "total_pension_income": "pension_income", "total_social_security": "social_security",