from policyengine_core.data import Dataset
from policyengine_us_data.storage import STORAGE_FOLDER
from typing import Type
from policyengine_us_data.datasets.cps.cps import *
from policyengine_us_data.datasets.puf import *
import numpy as np
import pandas as pd
from microimpute.models.qrf import QRF
import time
import logging
import gc

# These are sorted by magnitude.
# The first 15 contain 90% of the total; the first 7 contain 75%.
# If you're debugging this part of the code and don't want to wait ages to
# see whether something breaks, try limiting the list to those (see the
# commented-out example after the list).
IMPUTED_VARIABLES = [
    "employment_income",
    "partnership_s_corp_income",
    "social_security",
    "taxable_pension_income",
    "interest_deduction",
    "tax_exempt_pension_income",
    "long_term_capital_gains",
    "unreimbursed_business_employee_expenses",
    "pre_tax_contributions",
    "taxable_ira_distributions",
    "self_employment_income",
    "w2_wages_from_qualified_business",
    "unadjusted_basis_qualified_property",
    "business_is_sstb",  # bool
    "short_term_capital_gains",
    "qualified_dividend_income",
    "charitable_cash_donations",
    "self_employed_pension_contribution_ald",
    "unrecaptured_section_1250_gain",
    "taxable_unemployment_compensation",
    "taxable_interest_income",
    "domestic_production_ald",
    "self_employed_health_insurance_ald",
    "rental_income",
    "non_qualified_dividend_income",
    "cdcc_relevant_expenses",
    "tax_exempt_interest_income",
    "salt_refund_income",
    "foreign_tax_credit",
    "estate_income",
    "charitable_non_cash_donations",
    "american_opportunity_credit",
    "miscellaneous_income",
    "alimony_expense",
    "farm_income",
    "partnership_se_income",
    "alimony_income",
    "health_savings_account_ald",
    "non_sch_d_capital_gains",
    "general_business_credit",
    "energy_efficient_home_improvement_credit",
    "traditional_ira_contributions",
    "amt_foreign_tax_credit",
    "excess_withheld_payroll_tax",
    "savers_credit",
    "student_loan_interest",
    "investment_income_elected_form_4952",
    "early_withdrawal_penalty",
    "prior_year_minimum_tax_credit",
    "farm_rent_income",
    "qualified_tuition_expenses",
    "educator_expense",
    "long_term_capital_gains_on_collectibles",
    "other_credits",
    "casualty_loss",
    "unreported_payroll_tax",
    "recapture_of_investment_credit",
    "deductible_mortgage_interest",
    "qualified_reit_and_ptp_income",
    "qualified_bdc_income",
    "farm_operations_income",
    "estate_income_would_be_qualified",
    "farm_operations_income_would_be_qualified",
    "farm_rent_income_would_be_qualified",
    "partnership_s_corp_income_would_be_qualified",
    "rental_income_would_be_qualified",
    "self_employment_income_would_be_qualified",
]
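
# A minimal debugging shortcut following the note above (illustrative and
# commented out; the slice is an assumption based on the magnitude ranking,
# not a production setting):
# IMPUTED_VARIABLES = IMPUTED_VARIABLES[:7]  # first 7 hold ~75% of magnitude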

# For these variables, the QRF imputations also replace the original CPS
# values: in generate() below, both halves of the stacked dataset receive
# the predicted values.
OVERRIDDEN_IMPUTED_VARIABLES = [
    "partnership_s_corp_income",
    "interest_deduction",
    "unreimbursed_business_employee_expenses",
    "pre_tax_contributions",
    "w2_wages_from_qualified_business",
    "unadjusted_basis_qualified_property",
    "business_is_sstb",
    "charitable_cash_donations",
    "self_employed_pension_contribution_ald",
    "unrecaptured_section_1250_gain",
    "taxable_unemployment_compensation",
    "domestic_production_ald",
    "self_employed_health_insurance_ald",
    "cdcc_relevant_expenses",
    "salt_refund_income",
    "foreign_tax_credit",
    "estate_income",
    "charitable_non_cash_donations",
    "american_opportunity_credit",
    "miscellaneous_income",
    "alimony_expense",
    "health_savings_account_ald",
    "non_sch_d_capital_gains",
    "general_business_credit",
    "energy_efficient_home_improvement_credit",
    "amt_foreign_tax_credit",
    "excess_withheld_payroll_tax",
    "savers_credit",
    "student_loan_interest",
    "investment_income_elected_form_4952",
    "early_withdrawal_penalty",
    "prior_year_minimum_tax_credit",
    "farm_rent_income",
    "qualified_tuition_expenses",
    "educator_expense",
    "long_term_capital_gains_on_collectibles",
    "other_credits",
    "casualty_loss",
    "unreported_payroll_tax",
    "recapture_of_investment_credit",
    "deductible_mortgage_interest",
    "qualified_reit_and_ptp_income",
    "qualified_bdc_income",
    "farm_operations_income",
    "estate_income_would_be_qualified",
    "farm_operations_income_would_be_qualified",
    "farm_rent_income_would_be_qualified",
    "partnership_s_corp_income_would_be_qualified",
    "rental_income_would_be_qualified",
]


class ExtendedCPS(Dataset):
    """Stacks the CPS with a duplicate of its records whose tax variables
    are imputed from the PUF."""

    cps: Type[CPS]
    puf: Type[PUF]
    data_format = Dataset.TIME_PERIOD_ARRAYS

    def generate(self):
        from policyengine_us import Microsimulation

        cps_sim = Microsimulation(dataset=self.cps)
        puf_sim = Microsimulation(dataset=self.puf)
        puf_sim.subsample(10_000)

        INPUTS = [
            "age",
            "is_male",
            "tax_unit_is_joint",
            "tax_unit_count_dependents",
            "is_tax_unit_head",
            "is_tax_unit_spouse",
            "is_tax_unit_dependent",
        ]
        y_full_imputations = impute_income_variables(
            cps_sim,
            puf_sim,
            predictors=INPUTS,
            outputs=IMPUTED_VARIABLES,
        )
        y_cps_imputations = impute_income_variables(
            cps_sim,
            puf_sim,
            predictors=INPUTS,
            outputs=OVERRIDDEN_IMPUTED_VARIABLES,
        )
        cps_sim = Microsimulation(dataset=self.cps)
        data = cps_sim.dataset.load_dataset()
        new_data = {}

        # Pre-compute the weeks_unemployed imputation for the PUF copy,
        # preserving the relationship between UC and weeks from the CPS.
        puf_weeks_unemployed = impute_weeks_unemployed_for_puf(
            cps_sim, y_full_imputations
        )

        for variable in list(data) + IMPUTED_VARIABLES:
            variable_metadata = cps_sim.tax_benefit_system.variables.get(
                variable
            )
            if variable in data:
                values = data[variable][...]
            else:
                values = cps_sim.calculate(variable).values
            if variable in OVERRIDDEN_IMPUTED_VARIABLES:
                pred_values = y_cps_imputations[variable].values
                entity = variable_metadata.entity.key
                if entity != "person":
                    pred_values = cps_sim.populations[
                        entity
                    ].value_from_first_person(pred_values)
                values = np.concatenate([pred_values, pred_values])
            elif variable in IMPUTED_VARIABLES:
                pred_values = y_full_imputations[variable].values
                entity = variable_metadata.entity.key
                if entity != "person":
                    pred_values = cps_sim.populations[
                        entity
                    ].value_from_first_person(pred_values)
                values = np.concatenate([values, pred_values])
            elif "_id" in variable:
                # Offset IDs (including person_id) in the duplicated half
                # so they remain unique.
                values = np.concatenate([values, values + values.max()])
            elif "_weight" in variable:
                # The duplicated records start with zero weight.
                values = np.concatenate([values, values * 0])
            elif variable == "weeks_unemployed":
                # Use imputed weeks for the PUF copy to preserve the UC
                # relationship.
                values = np.concatenate([values, puf_weeks_unemployed])
            else:
                values = np.concatenate([values, values])
            new_data[variable] = {
                self.time_period: values,
            }

        self.save_dataset(new_data)


def impute_income_variables(
    cps_sim,
    puf_sim,
    predictors: list[str] = None,
    outputs: list[str] = None,
):
    # Calculate all variables together to preserve dependencies.
    X_train = puf_sim.calculate_dataframe(predictors + outputs)
    # Check which outputs are actually in the result.
    available_outputs = [col for col in outputs if col in X_train.columns]
    missing_outputs = [col for col in outputs if col not in X_train.columns]
    if missing_outputs:
        logging.warning(
            f"The following {len(missing_outputs)} variables were not "
            f"calculated: {missing_outputs}"
        )
        # Log the specific missing variable that's known to cause issues.
        if "recapture_of_investment_credit" in missing_outputs:
            logging.error(
                "recapture_of_investment_credit is missing from PUF "
                "calculation!"
            )
    logging.info(
        f"X_train shape: {X_train.shape}, columns: {len(X_train.columns)}"
    )
    X_test = cps_sim.calculate_dataframe(predictors)
    logging.info(
        f"Imputing {len(available_outputs)} variables using batched "
        f"sequential QRF"
    )
    total_start = time.time()

    # Batch variables to avoid memory issues with sequential imputation.
    batch_size = 10  # Impute 10 variables at a time.
    result = pd.DataFrame(index=X_test.index)

    # Sample the training data aggressively upfront to bound memory use.
    sample_size = min(5000, len(X_train))
    if len(X_train) > sample_size:
        logging.info(
            f"Sampling training data from {len(X_train)} to "
            f"{sample_size} rows"
        )
        X_train_sampled = X_train.sample(n=sample_size, random_state=42)
    else:
        X_train_sampled = X_train

    for batch_start in range(0, len(available_outputs), batch_size):
        batch_end = min(batch_start + batch_size, len(available_outputs))
        batch_vars = available_outputs[batch_start:batch_end]
        logging.info(
            f"Processing batch {batch_start // batch_size + 1}: variables "
            f"{batch_start + 1}-{batch_end} ({batch_vars})"
        )
        # Force garbage collection before each batch.
        gc.collect()
        # Create a fresh QRF for each batch.
        qrf = QRF(
            log_level="INFO",
            memory_efficient=True,
            batch_size=10,
            cleanup_interval=5,
        )
        # Use the pre-sampled data for this batch.
        batch_X_train = X_train_sampled[predictors + batch_vars].copy()
        # Fit the model for this batch, with sequential imputation within
        # the batch.
        fitted_model = qrf.fit(
            X_train=batch_X_train,
            predictors=predictors,
            imputed_variables=batch_vars,
            n_jobs=1,  # Single thread to reduce memory overhead.
        )
        # Predict for this batch.
        batch_predictions = fitted_model.predict(X_test=X_test)
        # Extract median predictions and add them to the result.
        for var in batch_vars:
            result[var] = batch_predictions[var]
        # Clean up batch objects.
        del fitted_model
        del batch_predictions
        del batch_X_train
        gc.collect()
        logging.info(f"Completed batch {batch_start // batch_size + 1}")

    # Fill variables that could not be calculated with zeros.
    for var in missing_outputs:
        result[var] = 0
    logging.info(
        f"Imputing {len(available_outputs)} variables took "
        f"{time.time() - total_start:.2f} seconds total"
    )
    return result


def impute_weeks_unemployed_for_puf(cps_sim, puf_imputations):
    """
    Impute weeks_unemployed for the PUF copy using a QRF trained on CPS data.

    Uses microimpute's Quantile Random Forest to impute weeks_unemployed
    for PUF records based on CPS data, preserving the joint distribution
    of weeks with UC, age, and other predictors.

    This is the reverse of the income imputation (CPS → PUF instead of
    PUF → CPS) because weeks_unemployed exists in the CPS but not in
    the PUF.
    """
    # Get CPS weeks.
    try:
        cps_weeks = cps_sim.calculate("weeks_unemployed").values
    except (ValueError, KeyError):
        logging.warning(
            "weeks_unemployed not available in CPS, "
            "returning zeros for PUF copy"
        )
        n_persons = len(puf_imputations.index)
        return np.zeros(n_persons)

    # Predictors available in both the CPS and the imputed PUF data.
    WEEKS_PREDICTORS = [
        "age",
        "is_male",
        "tax_unit_is_joint",
        "is_tax_unit_head",
        "is_tax_unit_spouse",
        "is_tax_unit_dependent",
    ]
    # Build training data from the CPS.
    X_train = cps_sim.calculate_dataframe(WEEKS_PREDICTORS)
    X_train["weeks_unemployed"] = cps_weeks
    # Add UC as a predictor if available in the imputations (it is a
    # strong predictor).
    if "taxable_unemployment_compensation" in puf_imputations.columns:
        cps_uc = cps_sim.calculate("unemployment_compensation").values
        X_train["unemployment_compensation"] = cps_uc
        WEEKS_PREDICTORS = WEEKS_PREDICTORS + ["unemployment_compensation"]

    # Build test data for the PUF copy, using the CPS sim to get
    # demographics (the PUF copy shares the CPS demographic records).
    X_test = cps_sim.calculate_dataframe(
        [p for p in WEEKS_PREDICTORS if p != "unemployment_compensation"]
    )
    # Add imputed UC if available.
    if "taxable_unemployment_compensation" in puf_imputations.columns:
        X_test["unemployment_compensation"] = puf_imputations[
            "taxable_unemployment_compensation"
        ].values

    logging.info(
        f"Imputing weeks_unemployed using QRF with "
        f"predictors: {WEEKS_PREDICTORS}"
    )
    # Use a QRF to impute weeks.
    qrf = QRF(
        log_level="INFO",
        memory_efficient=True,
    )
    # Sample the training data for efficiency.
    sample_size = min(5000, len(X_train))
    if len(X_train) > sample_size:
        X_train_sampled = X_train.sample(n=sample_size, random_state=42)
    else:
        X_train_sampled = X_train
    fitted_model = qrf.fit(
        X_train=X_train_sampled,
        predictors=WEEKS_PREDICTORS,
        imputed_variables=["weeks_unemployed"],
        n_jobs=1,
    )
    predictions = fitted_model.predict(X_test=X_test)
    imputed_weeks = predictions["weeks_unemployed"].values

    # Enforce constraints: 0-52 weeks, and zero weeks if there is no UC.
    imputed_weeks = np.clip(imputed_weeks, 0, 52)
    if "unemployment_compensation" in X_test.columns:
        imputed_weeks = np.where(
            X_test["unemployment_compensation"].values > 0,
            imputed_weeks,
            0,
        )
    logging.info(
        f"Imputed weeks_unemployed for PUF: "
        f"{(imputed_weeks > 0).sum()} with weeks > 0, "
        f"mean = {imputed_weeks[imputed_weeks > 0].mean():.1f} weeks"
    )
    return imputed_weeks


class ExtendedCPS_2024(ExtendedCPS):
    cps = CPS_2024_Full
    puf = PUF_2024
    name = "extended_cps_2024"
    label = "Extended CPS (2024)"
    file_path = STORAGE_FOLDER / "extended_cps_2024.h5"
    time_period = 2024


if __name__ == "__main__":
    # Emit the INFO-level progress messages logged throughout this module.
    logging.basicConfig(level=logging.INFO)
    ExtendedCPS_2024().generate()
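
# A minimal consumption sketch (an assumption following the
# Microsimulation(dataset=...) pattern used in generate() above, not part
# of this module's pipeline):
#
#     from policyengine_us import Microsimulation
#     sim = Microsimulation(dataset=ExtendedCPS_2024)
#     employment_income = sim.calculate("employment_income")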