From 194fb4a2334599177acd814ade17be84d266715d Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 7 Feb 2026 17:18:33 -0500 Subject: [PATCH 1/4] Add voluntary tax filer assignment SOI data shows many low-AGI filers who file taxes voluntarily even when not required and not receiving a refund. This affects calibration accuracy when comparing CPS-based filer counts to SOI totals. Add would_file_taxes_voluntarily variable at tax_unit level with ~5% probability, using seeded RNG for reproducibility. This enables policyengine-us to incorporate voluntary filing behavior in its tax_unit_is_filer variable. Co-Authored-By: Claude Opus 4.5 --- policyengine_us_data/datasets/cps/cps.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 84969500..da57e492 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -289,6 +289,16 @@ def add_takeup(self): imputed_risk = rng.random(n_persons) < wic_risk_rate_by_person data["is_wic_at_nutritional_risk"] = receives_wic | imputed_risk + # Voluntary tax filing: some people file even when not required and + # not getting a refund. SOI shows ~21M filers with AGI < $10k, many + # of whom file voluntarily. Estimate ~5% of tax units file voluntarily + # (state requirements, documentation, habit). + voluntary_filing_rate = 0.05 + rng = seeded_rng("would_file_taxes_voluntarily") + data["would_file_taxes_voluntarily"] = ( + rng.random(n_tax_units) < voluntary_filing_rate + ) + self.save_dataset(data) From 062a0abdf77dcc6ddb48537d7375d4aa65451057 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 7 Feb 2026 17:20:57 -0500 Subject: [PATCH 2/4] Improve voluntary tax filer logic with refund-seeking behavior Replace simple 5% voluntary filing rate with more nuanced approach: 1. 
Add would_file_for_refund variable that identifies tax units taking up EITC (95% of EITC takers are assumed to know they'll get a refund) 2. Apply voluntary filing rate (3%) only to those NOT already filing for a refund, to avoid double-counting This better models the actual filing decision process where refundable credit recipients have a clear financial incentive to file, while others may file for state requirements, documentation, or habit. Co-Authored-By: Claude Opus 4.5 --- policyengine_us_data/datasets/cps/cps.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index da57e492..ac85904e 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -289,13 +289,29 @@ def add_takeup(self): imputed_risk = rng.random(n_persons) < wic_risk_rate_by_person data["is_wic_at_nutritional_risk"] = receives_wic | imputed_risk - # Voluntary tax filing: some people file even when not required and - # not getting a refund. SOI shows ~21M filers with AGI < $10k, many - # of whom file voluntarily. Estimate ~5% of tax units file voluntarily - # (state requirements, documentation, habit). - voluntary_filing_rate = 0.05 + # Tax filing behavior assignment + # + # Some people file taxes even when not strictly required: + # 1. Those eligible for refundable credits (know they'll get money back) + # 2. Those who file voluntarily for other reasons (state requirements, + # documentation, habit) + # + # We use EITC take-up as a proxy for refund-seeking behavior, since + # EITC recipients know they'll get money back and will file. 
+ + # People who take up EITC are likely filing for a refund + # Use a high probability (95%) since some may not know about it + rng = seeded_rng("would_file_for_refund") + data["would_file_for_refund"] = data["takes_up_eitc"] & ( + rng.random(n_tax_units) < 0.95 + ) + + # Voluntary filers: file for other reasons (state requirements, + # documentation, habit). Apply only to those not already filing for + # refund. ~3% of remaining tax units file voluntarily. + voluntary_filing_rate = 0.03 rng = seeded_rng("would_file_taxes_voluntarily") - data["would_file_taxes_voluntarily"] = ( + data["would_file_taxes_voluntarily"] = ~data["would_file_for_refund"] & ( rng.random(n_tax_units) < voluntary_filing_rate ) From ee214a8a3bcd875c2da71bcf98c1cdaf20e0fde1 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 7 Feb 2026 17:24:59 -0500 Subject: [PATCH 3/4] Simplify voluntary filer logic and add filer count calibration targets Voluntary filer changes (cps.py): - Remove redundant would_file_for_refund variable since takes_up_eitc already captures refund-seeking behavior - Simplify to single would_file_taxes_voluntarily variable that applies only to tax units NOT taking up EITC - Use 5% voluntary filing rate for non-EITC takers Calibration target changes (loss.py): - Add SOI Table 1.1 filer counts by AGI band as calibration targets - Covers 7 bands: <$0, $0-5k, $5k-10k, $10k-25k, $25k-50k, $50k-100k, $100k+ - Includes all filers (not just taxable returns) to properly calibrate low-income filer counts which are important for distribution accuracy - Uprates 2015 SOI counts to current year using population growth This consolidates PR #514 into PR #513. 
Co-Authored-By: Claude Opus 4.5 --- policyengine_us_data/datasets/cps/cps.py | 29 +++++---------------- policyengine_us_data/utils/loss.py | 33 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index ac85904e..558a5353 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -289,29 +289,14 @@ def add_takeup(self): imputed_risk = rng.random(n_persons) < wic_risk_rate_by_person data["is_wic_at_nutritional_risk"] = receives_wic | imputed_risk - # Tax filing behavior assignment - # - # Some people file taxes even when not strictly required: - # 1. Those eligible for refundable credits (know they'll get money back) - # 2. Those who file voluntarily for other reasons (state requirements, - # documentation, habit) - # - # We use EITC take-up as a proxy for refund-seeking behavior, since - # EITC recipients know they'll get money back and will file. - - # People who take up EITC are likely filing for a refund - # Use a high probability (95%) since some may not know about it - rng = seeded_rng("would_file_for_refund") - data["would_file_for_refund"] = data["takes_up_eitc"] & ( - rng.random(n_tax_units) < 0.95 - ) - - # Voluntary filers: file for other reasons (state requirements, - # documentation, habit). Apply only to those not already filing for - # refund. ~3% of remaining tax units file voluntarily. - voluntary_filing_rate = 0.03 + # Voluntary tax filing: some people file even when not required and not + # seeking a refund. EITC take-up already captures refund-seeking behavior + # (if you take up EITC, you file). This variable captures people who file + # for other reasons: state requirements, documentation, habit. + # ~5% of tax units who don't take up EITC still file voluntarily. 
+ voluntary_filing_rate = 0.05 rng = seeded_rng("would_file_taxes_voluntarily") - data["would_file_taxes_voluntarily"] = ~data["would_file_for_refund"] & ( + data["would_file_taxes_voluntarily"] = ~data["takes_up_eitc"] & ( rng.random(n_tax_units) < voluntary_filing_rate ) diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index f798c0dc..05caf44b 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -330,6 +330,39 @@ def build_loss_matrix(dataset: type, time_period): ) targets_array.append(row["eitc_total"] * eitc_spending_uprating) + # Tax filer counts by AGI band (SOI Table 1.1) + # This calibrates total filers (not just taxable returns) including + # low-AGI filers who are important for income distribution accuracy + SOI_FILER_COUNTS_2015 = { + # (agi_lower, agi_upper): total_returns + (-np.inf, 0): 2_072_066, + (0, 5_000): 10_134_703, + (5_000, 10_000): 11_398_595, + (10_000, 25_000): 23_447_927, + (25_000, 50_000): 23_727_745, + (50_000, 100_000): 32_801_908, + (100_000, np.inf): 25_120_985, + } + + # Get AGI and filer status at tax unit level (mapped to household below) + agi_tu = sim.calculate("adjusted_gross_income").values + is_filer_tu = sim.calculate("tax_unit_is_filer").values > 0 + + for ( + agi_lower, + agi_upper, + ), filer_count_2015 in SOI_FILER_COUNTS_2015.items(): + in_band = (agi_tu >= agi_lower) & (agi_tu < agi_upper) + label = f"nation/soi/filer_count/agi_{fmt(agi_lower)}_{fmt(agi_upper)}" + loss_matrix[label] = sim.map_result( + (is_filer_tu & in_band).astype(float), + "tax_unit", + "household", + ) + # Uprate from 2015 to current year using population growth + uprated_target = filer_count_2015 * population_uprating + targets_array.append(uprated_target) + # Hard-coded totals for variable_name, target in HARD_CODED_TOTALS.items(): label = f"nation/census/{variable_name}" From c442bd87f91c71a304631c221389b0aae80282a8 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 7 Feb 
2026 17:55:19 -0500 Subject: [PATCH 4/4] Add changelog entry Co-Authored-By: Claude Opus 4.5 --- changelog_entry.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..cafe3f63 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Add voluntary tax filer variable and filer count calibration targets by AGI band.