From 569c5b3a1b36fb1b3418cd65bedddfc7b5eecc32 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 21 Jan 2026 20:38:01 +0530 Subject: [PATCH 1/6] adding test for matrix building logic --- changelog_entry.yaml | 6 +- .../sparse_matrix_builder.py | 20 +- .../test_sparse_matrix_builder.py | 876 ++++++++++++++++++ 3 files changed, 885 insertions(+), 17 deletions(-) create mode 100644 policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 0f82eb65..591f325d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ -- bump: patch +- bump: minor changes: - fixed: - - Versioning workflow checkout for push events + added: + - tests to verify SparseMatrixBuilder correctly calculates variables and constraints into the calibration matrix. diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py index 3af0a8d8..d8748014 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py @@ -199,20 +199,12 @@ def build_matrix( ): mask[:] = False else: - try: - values = state_sim.calculate( - c["variable"], map_to="household" - ).values - mask &= apply_op( - values, c["operation"], c["value"] - ) - except Exception as e: - # Variable may not exist or may not be - # calculable at household level - skip - logger.debug( - f"Could not evaluate constraint " - f"{c['variable']}: {e}" - ) + values = state_sim.calculate( + c["variable"], map_to="household" + ).values + mask &= apply_op( + values, c["operation"], c["value"] + ) if not mask.any(): continue diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py new file mode 100644 index 00000000..175488be --- /dev/null +++ b/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py @@ -0,0 +1,876 @@ +""" +Tests for sparse matrix builder correctness. + +These tests verify that: +1. Matrix shape and structure are correct +2. Matrix cell values match simulation calculations for households in their + original state +3. Variable aggregation (person to household) preserves totals +4. National-level targets receive contributions from all states (no geographic + bias) + +The key verification approach: +- When households are "borrowed" to different geographic areas, state_fips is + changed and variables are recalculated +- For households borrowed to CDs in their ORIGINAL state, the recalculated + values should match the original simulation values exactly (since state_fips + is unchanged) +- This provides a ground-truth verification without needing end-to-end H5 + creation + +IMPORTANT NOTE on stochastic eligibility: +Some variables like SNAP have eligibility tests that use PolicyEngine's +random() function. When variables are recalculated in the matrix builder (via +fresh simulations), the random seed sequence may differ, causing ~1-3% of +households to have different eligibility outcomes. This is expected behavior, +so tests allow up to 2% mismatch rate for such variables. 
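+
+Matrix layout assumed throughout these tests: one row per calibration target
+and one column per (CD, household) pair, so a household's value within the
+column block for CD index ``cd_idx`` sits at column
+``cd_idx * n_households + hh_idx``.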
+""" + +import pytest +import numpy as np +import pandas as pd +from policyengine_us import Microsimulation +from policyengine_us_data.storage import STORAGE_FOLDER +from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( + SparseMatrixBuilder, +) + + +# ============================================================================= +# CONFIGURATION - Update these lists as new variables are added +# ============================================================================= + +# Variables to test for state-level value matching +# Format: (variable_name, rtol) - rtol is relative tolerance for comparison +VARIABLES_TO_TEST = [ + ("snap", 1e-2), + ("health_insurance_premiums_without_medicare_part_b", 1e-2), + ("medicaid", 1e-2), + ("medicare_part_b_premiums", 1e-2), + ("other_medical_expenses", 1e-2), + ("over_the_counter_health_expenses", 1e-2), + ("salt_deduction", 1e-2), + ("spm_unit_capped_work_childcare_expenses", 1e-2), + ("spm_unit_capped_housing_subsidy", 1e-2), + ("ssi", 1e-2), + ("tanf", 1e-2), + ("tip_income", 1e-2), + ("unemployment_compensation", 1e-2), +] + +# Combined filter config to build matrix with all variables at once +COMBINED_FILTER_CONFIG = { + "stratum_group_ids": [ + 4, # SNAP targets + 5, # Medicaid targets + 112, # Unemployment compensation targets + ], + "variables": [ + "snap", + "health_insurance_premiums_without_medicare_part_b", + "medicaid", + "medicare_part_b_premiums", + "other_medical_expenses", + "over_the_counter_health_expenses", + "salt_deduction", + "spm_unit_capped_work_childcare_expenses", + "spm_unit_capped_housing_subsidy", + "ssi", + "tanf", + "tip_income", + "unemployment_compensation", + ], +} + +VARIABLES_WITH_STATE_VARIATION = [ + "snap", +] + +# Complications: +# (snap) +# (unemployment_compensation) +# income_tax +# qualified_business_income_deduction +# taxable_social_security +# taxable_pension_income +# taxable_ira_distributions +# taxable_interest_income +# tax_exempt_interest_income +# self_employment_income +# salt +# refundable_ctc +# real_estate_taxes +# qualified_dividend_income +# dividend_income +# adjusted_gross_income +# eitc + +# Maximum allowed mismatch rate for state-level value comparison +MAX_MISMATCH_RATE = 0.02 + + +# ============================================================================= +# FIXTURES +# ============================================================================= + + +@pytest.fixture(scope="module") +def db_uri(): + """Database URI for calibration targets.""" + db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" + return f"sqlite:///{db_path}" + + +@pytest.fixture(scope="module") +def dataset_path(): + """Path to stratified extended CPS dataset.""" + return str(STORAGE_FOLDER / "stratified_extended_cps_2023.h5") + + +@pytest.fixture(scope="module") +def sim(dataset_path): + """Base simulation loaded from stratified CPS.""" + return Microsimulation(dataset=dataset_path) + + +@pytest.fixture(scope="module") +def test_cds(): + """ + Test CDs spanning multiple states for comprehensive testing. 
+ + Selected to include: + - Small states (1-2 CDs): AL, MT + - Medium states: NC + - Large states: CA, TX, NY + """ + return [ + "101", # Alabama CD-1 (state_fips=1) + "102", # Alabama CD-2 + "601", # California CD-1 (state_fips=6) + "602", # California CD-2 + "3001", # Montana CD-1 (state_fips=30) + "3002", # Montana CD-2 + "3701", # North Carolina CD-1 (state_fips=37) + "3702", # North Carolina CD-2 + "3601", # New York CD-1 (state_fips=36) + "3602", # New York CD-2 + "4801", # Texas CD-1 (state_fips=48) + "4802", # Texas CD-2 + ] + + +@pytest.fixture(scope="module") +def builder(db_uri, dataset_path, test_cds): + """SparseMatrixBuilder configured with test CDs.""" + return SparseMatrixBuilder( + db_uri=db_uri, + time_period=2023, + cds_to_calibrate=test_cds, + dataset_path=dataset_path, + ) + + +@pytest.fixture(scope="module") +def combined_matrix_data(sim, builder): + """ + Build matrix once with all configured variables. + + This fixture is used by the consolidated test to avoid rebuilding + the matrix for each variable. + """ + targets_df, X_sparse, hh_mapping = builder.build_matrix( + sim, + target_filter=COMBINED_FILTER_CONFIG, + ) + + household_ids = sim.calculate("household_id", map_to="household").values + state_fips = sim.calculate("state_fips", map_to="household").values + + return { + "targets_df": targets_df, + "X_sparse": X_sparse, + "hh_mapping": hh_mapping, + "household_ids": household_ids, + "state_fips": state_fips, + "cds": builder.cds_to_calibrate, + "n_households": len(household_ids), + } + + +# ============================================================================= +# HELPER FUNCTIONS +# ============================================================================= + + +def _get_geo_level(geo_id) -> str: + """Determine geographic level from geographic_id.""" + if geo_id == "US": + return "national" + try: + val = int(geo_id) + if 1 <= val <= 56: + return "state" + else: + return "district" + except (ValueError, TypeError): + return "unknown" + + +def _verify_state_level_values( + X_sparse, + targets_df, + original_values, + original_state_fips, + cds, + n_households, + variable_name, + rtol=1e-2, +): + """ + Verify that matrix values match original values for households in their + original state. 
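+
+    For example, a household whose original state_fips is 37 that is borrowed
+    to CD 3701 or 3702 keeps state_fips == 37, so its recalculated value
+    should match the original simulation value (up to the given rtol).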
+ + Returns: + Tuple of (verified_count, mismatches_list, skipped_reason or None) + """ + # Get state-level targets + state_targets = targets_df[ + (targets_df["variable"] == variable_name) + & (targets_df["geographic_id"].apply(lambda x: str(x).isdigit())) + & ( + targets_df["geographic_id"].apply( + lambda x: 1 <= int(x) <= 56 if str(x).isdigit() else False + ) + ) + ] + + if len(state_targets) == 0: + return 0, [], f"No state-level targets for {variable_name}" + + mismatches = [] + verified_count = 0 + + for _, target_row in state_targets.iterrows(): + target_state = int(target_row["geographic_id"]) + row_idx = target_row.name + + # Find all CDs in this state + state_cds = [ + (cd_idx, cd) + for cd_idx, cd in enumerate(cds) + if int(cd) // 100 == target_state + ] + + if not state_cds: + continue + + # Find households originally from this state + hh_from_state_mask = original_state_fips == target_state + hh_indices_from_state = np.where(hh_from_state_mask)[0] + + if len(hh_indices_from_state) == 0: + continue + + # For each CD in the state, check matrix values + for cd_idx, cd in state_cds: + col_start = cd_idx * n_households + + for hh_idx in hh_indices_from_state: + col_idx = col_start + hh_idx + matrix_val = X_sparse[row_idx, col_idx] + original_val = original_values[hh_idx] + + if original_val == 0 and matrix_val == 0: + verified_count += 1 + continue + + if original_val != 0: + rel_diff = abs(matrix_val - original_val) / abs( + original_val + ) + if rel_diff > rtol: + mismatches.append( + { + "variable": variable_name, + "state": target_state, + "cd": cd, + "hh_idx": hh_idx, + "matrix_val": float(matrix_val), + "original_val": float(original_val), + "rel_diff": rel_diff, + } + ) + else: + verified_count += 1 + elif matrix_val != 0: + mismatches.append( + { + "variable": variable_name, + "state": target_state, + "cd": cd, + "hh_idx": hh_idx, + "matrix_val": float(matrix_val), + "original_val": float(original_val), + "rel_diff": float("inf"), + } + ) + + return verified_count, mismatches, None + + +# ============================================================================= +# BASIC STRUCTURE TESTS +# ============================================================================= + + +def test_person_level_aggregation_preserves_totals(sim): + """Health insurance premiums (person-level) sum correctly to household.""" + var = "health_insurance_premiums_without_medicare_part_b" + person_total = sim.calculate(var, 2023, map_to="person").values.sum() + household_total = sim.calculate(var, 2023, map_to="household").values.sum() + assert np.isclose(person_total, household_total, rtol=1e-6) + + +def test_matrix_shape(sim, builder): + """Matrix should have (n_targets, n_households * n_cds) shape.""" + targets_df, X_sparse, _ = builder.build_matrix( + sim, + target_filter={ + "variables": ["health_insurance_premiums_without_medicare_part_b"] + }, + ) + n_households = len( + sim.calculate("household_id", map_to="household").values + ) + n_cds = len(builder.cds_to_calibrate) + assert X_sparse.shape[1] == n_households * n_cds + + +def test_combined_variables_in_matrix(sim, builder): + """Matrix should include all configured variables.""" + targets_df, X_sparse, _ = builder.build_matrix( + sim, + target_filter=COMBINED_FILTER_CONFIG, + ) + variables = targets_df["variable"].unique() + + for var_name, _ in VARIABLES_TO_TEST: + assert var_name in variables, f"Missing variable: {var_name}" + + +# ============================================================================= +# CONSOLIDATED STATE-LEVEL 
VALUE TEST +# ============================================================================= + + +class TestStateLevelValues: + """ + Consolidated test for verifying matrix values match original simulation + values for households in their original state. + + Builds matrix once and iterates through all configured variables. + """ + + def test_all_variables_state_level_match(self, sim, combined_matrix_data): + """ + Verify all configured variables have correct state-level values. + + For each variable: + 1. Calculate original values from simulation + 2. Compare to matrix values for households in their original state + 3. Allow up to MAX_MISMATCH_RATE due to stochastic eligibility + """ + results = [] + all_mismatches = [] + + for variable_name, rtol in VARIABLES_TO_TEST: + # Calculate original values for this variable + original_values = sim.calculate( + variable_name, map_to="household" + ).values + + verified, mismatches, skip_reason = _verify_state_level_values( + X_sparse=combined_matrix_data["X_sparse"], + targets_df=combined_matrix_data["targets_df"], + original_values=original_values, + original_state_fips=combined_matrix_data["state_fips"], + cds=combined_matrix_data["cds"], + n_households=combined_matrix_data["n_households"], + variable_name=variable_name, + rtol=rtol, + ) + + total_checked = verified + len(mismatches) + mismatch_rate = ( + len(mismatches) / total_checked if total_checked > 0 else 0 + ) + + results.append( + { + "variable": variable_name, + "verified": verified, + "mismatches": len(mismatches), + "total": total_checked, + "mismatch_rate": mismatch_rate, + "skip_reason": skip_reason, + "passed": ( + skip_reason is not None + or mismatch_rate <= MAX_MISMATCH_RATE + ), + } + ) + + all_mismatches.extend(mismatches) + + # Print summary + print("\n" + "=" * 70) + print("STATE-LEVEL VALUE VERIFICATION SUMMARY") + print("=" * 70) + + results_df = pd.DataFrame(results) + for _, row in results_df.iterrows(): + if row["skip_reason"]: + status = f"SKIPPED: {row['skip_reason']}" + elif row["passed"]: + status = ( + f"PASSED: {row['verified']:,} verified, " + f"{row['mismatch_rate']:.1%} mismatch rate" + ) + else: + status = ( + f"FAILED: {row['mismatches']:,} mismatches, " + f"{row['mismatch_rate']:.1%} > {MAX_MISMATCH_RATE:.0%}" + ) + print(f" {row['variable']}: {status}") + + # Show sample mismatches if any + if all_mismatches: + print(f"\nSample mismatches ({len(all_mismatches)} total):") + mismatch_df = pd.DataFrame(all_mismatches) + print(mismatch_df.head(15).to_string()) + + mismatch_df.to_csv("state_level_mismatches.csv", index=False) + + # Assert all variables passed + failed = [r for r in results if not r["passed"]] + assert len(failed) == 0, ( + f"{len(failed)} variable(s) failed state-level verification: " + f"{[r['variable'] for r in failed]}" + ) + + +# ============================================================================= +# NATIONAL-LEVEL CONTRIBUTION TEST +# ============================================================================= + + +class TestNationalLevelContributions: + """ + Tests verifying that national-level targets receive contributions from + households across all states, not just a geographic subset. + + The key insight: for a national target, when we look at a single CD's + column block, households from ALL original states should potentially + contribute (subject to meeting eligibility constraints). There should + be no systematic geographic bias where only households from certain + states contribute to the national total. 
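+
+    Concretely: within the Alabama CD-1 (``101``) column block of a national
+    target row, non-zero entries are expected from households whose original
+    state is not Alabama as well, so that, averaged over CD blocks,
+    contributions come from at least two original states.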
+ """ + + def test_national_targets_receive_multistate_contributions( + self, sim, combined_matrix_data + ): + """ + Verify that national-level targets have contributions from households + originally from multiple states. + + For each national target: + 1. Look at the matrix row + 2. For EACH CD's column block, identify which original states have + non-zero contributions + 3. Verify contributions come from multiple states (not geographically + biased) + """ + targets_df = combined_matrix_data["targets_df"] + X_sparse = combined_matrix_data["X_sparse"] + state_fips = combined_matrix_data["state_fips"] + n_households = combined_matrix_data["n_households"] + cds = combined_matrix_data["cds"] + + # Find national-level targets + national_targets = targets_df[ + targets_df["geographic_id"].apply( + lambda x: _get_geo_level(x) == "national" + ) + ] + + if len(national_targets) == 0: + pytest.skip("No national-level targets found") + + results = [] + + for _, target in national_targets.iterrows(): + row_idx = target.name + variable = target["variable"] + row = X_sparse[row_idx, :].toarray().flatten() + + # For each CD block, check which original states contribute + cd_contribution_stats = [] + + for cd_idx, cd in enumerate(cds): + col_start = cd_idx * n_households + col_end = col_start + n_households + cd_values = row[col_start:col_end] + + # Find households with non-zero values in this CD block + nonzero_mask = cd_values != 0 + nonzero_indices = np.where(nonzero_mask)[0] + + if len(nonzero_indices) == 0: + continue + + # Get original states of contributing households + contributing_states = set(state_fips[nonzero_indices]) + + cd_contribution_stats.append( + { + "cd": cd, + "cd_state": int(cd) // 100, + "n_contributing": len(nonzero_indices), + "n_states": len(contributing_states), + "contributing_states": contributing_states, + } + ) + + if not cd_contribution_stats: + results.append( + { + "variable": variable, + "status": "NO_CONTRIBUTIONS", + "details": "No non-zero values in any CD block", + } + ) + continue + + # Aggregate stats + stats_df = pd.DataFrame(cd_contribution_stats) + avg_states = stats_df["n_states"].mean() + min_states = stats_df["n_states"].min() + + # Check: on average, contributions should come from multiple states + # (at least 2, since we have CDs from 6 different states) + passed = avg_states >= 2 and min_states >= 1 + + results.append( + { + "variable": variable, + "status": "PASSED" if passed else "FAILED", + "avg_contributing_states": avg_states, + "min_contributing_states": min_states, + "n_cd_blocks_with_data": len(stats_df), + } + ) + + # Assert no geographic bias + failed = [r for r in results if r["status"] == "FAILED"] + assert len(failed) == 0, ( + f"Geographic bias detected in national targets: " + f"{[r['variable'] for r in failed]}" + ) + + def test_state_distribution_in_national_targets( + self, sim, combined_matrix_data + ): + """ + Verify the distribution of contributing states in national targets + roughly matches the original data distribution. + + This catches cases where one state dominates the contributions + disproportionately. 
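+
+        Worked example of the bound used below: if the dominant state holds
+        30% of households in the original data, its share of contributions to
+        a national target may not exceed min(0.7, 0.3 + 0.2) = 50%.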
+ """ + targets_df = combined_matrix_data["targets_df"] + X_sparse = combined_matrix_data["X_sparse"] + state_fips = combined_matrix_data["state_fips"] + n_households = combined_matrix_data["n_households"] + cds = combined_matrix_data["cds"] + + # Get original state distribution (count of households per state) + unique_states, original_counts = np.unique( + state_fips, return_counts=True + ) + original_dist = dict(zip(unique_states, original_counts)) + total_hh = len(state_fips) + + # Find national-level targets + national_targets = targets_df[ + targets_df["geographic_id"].apply( + lambda x: _get_geo_level(x) == "national" + ) + ] + + if len(national_targets) == 0: + pytest.skip("No national-level targets found") + + for _, target in national_targets.iterrows(): + row_idx = target.name + variable = target["variable"] + row = X_sparse[row_idx, :].toarray().flatten() + + # Count contributions by original state across ALL CD blocks + state_contribution_counts = {} + + for cd_idx, cd in enumerate(cds): + col_start = cd_idx * n_households + col_end = col_start + n_households + cd_values = row[col_start:col_end] + + nonzero_mask = cd_values != 0 + nonzero_indices = np.where(nonzero_mask)[0] + + for hh_idx in nonzero_indices: + orig_state = state_fips[hh_idx] + state_contribution_counts[orig_state] = ( + state_contribution_counts.get(orig_state, 0) + 1 + ) + + if not state_contribution_counts: + continue + + # Check that no single state dominates excessively + total_contributions = sum(state_contribution_counts.values()) + max_contribution = max(state_contribution_counts.values()) + max_state = max( + state_contribution_counts, key=state_contribution_counts.get + ) + max_share = max_contribution / total_contributions + + # The max share should not exceed 70% (unless that state has 70%+ + # of households in the original data) + original_max_share = original_dist.get(max_state, 0) / total_hh + + # Allow 20% margin above original share + threshold = min(0.7, original_max_share + 0.2) + + assert max_share <= threshold, ( + f"State {max_state} dominates national {variable} target with " + f"{max_share:.1%} of contributions " + f"(original share: {original_max_share:.1%})" + ) + + +# ============================================================================= +# CROSS-STATE RECALCULATION TEST +# ============================================================================= + + +class TestCrossStateRecalculation: + """ + Tests verifying that household values change when borrowed to different + states, confirming state-specific rules are being applied. + + The key insight: for national-level targets (no state constraint), each + household appears in every CD block. The value in each CD block represents + what the variable would be if that household lived in that CD's state. + For state-dependent variables (like SNAP), values should differ across + states for at least some households. + """ + + def test_values_change_across_states_for_national_targets( + self, combined_matrix_data + ): + """ + Verify that for national targets, household values vary across CD + blocks from different states. + + This confirms the matrix builder is correctly recalculating variables + with state-specific rules when households are "borrowed" to different + geographic areas. + + The test checks: + 1. For each national target, examine households with non-zero values + 2. Compare each household's value across CD blocks from different states + 3. 
At least some households should have different values in different + states (confirming recalculation with different state rules) + """ + targets_df = combined_matrix_data["targets_df"] + X_sparse = combined_matrix_data["X_sparse"] + n_households = combined_matrix_data["n_households"] + cds = combined_matrix_data["cds"] + + # Group CDs by state + cds_by_state = {} + for cd_idx, cd in enumerate(cds): + state = int(cd) // 100 + if state not in cds_by_state: + cds_by_state[state] = [] + cds_by_state[state].append((cd_idx, cd)) + + states = list(cds_by_state.keys()) + if len(states) < 2: + pytest.skip("Need at least 2 states to test cross-state variation") + + # Find national-level targets + national_targets = targets_df[ + targets_df["geographic_id"].apply( + lambda x: _get_geo_level(x) == "national" + ) + ] + + if len(national_targets) == 0: + pytest.skip("No national-level targets found") + + results = [] + + for _, target in national_targets.iterrows(): + if target["variable"] not in VARIABLES_WITH_STATE_VARIATION: + continue + row_idx = target.name + variable = target["variable"] + row = X_sparse[row_idx, :].toarray().flatten() + + # For each household, collect values from different states + households_with_variation = 0 + households_checked = 0 + + # Sample households (check every 10th to keep test fast) + for hh_idx in range(0, n_households, 10): + # Get this household's value in each state (use first CD of + # each state) + state_values = {} + for state, cd_list in cds_by_state.items(): + cd_idx, _ = cd_list[0] # First CD in this state + col_idx = cd_idx * n_households + hh_idx + state_values[state] = row[col_idx] + + # Skip if all values are zero (household doesn't qualify for + # this variable) + nonzero_values = [v for v in state_values.values() if v != 0] + if len(nonzero_values) < 2: + continue + + households_checked += 1 + + # Check if values differ across states + unique_values = set(nonzero_values) + if len(unique_values) > 1: + households_with_variation += 1 + + variation_rate = ( + households_with_variation / households_checked + if households_checked > 0 + else 0 + ) + + results.append( + { + "variable": variable, + "households_checked": households_checked, + "households_with_variation": households_with_variation, + "variation_rate": variation_rate, + } + ) + + # For state-dependent variables, we expect SOME variation + # (not all households will vary - some may have $0 or max benefits + # regardless of state) + # The key is that variation exists, confirming recalculation occurs + for r in results: + if r["households_checked"] > 0: + # At least 10% of households should show variation for + # state-dependent variables + assert ( + r["variation_rate"] > 0.1 or r["households_checked"] < 10 + ), ( + f"No cross-state variation found for {r['variable']}. " + f"This suggests state-specific rules may not be applied " + f"when households are borrowed to different states." + ) + + def test_same_household_different_states_shows_rule_changes( + self, combined_matrix_data + ): + """ + Deep dive test: pick specific households and verify their values + differ across states in a way consistent with state-specific rules. + + For SNAP specifically, different states have different: + - Standard deductions + - Shelter deduction caps + - Vehicle allowances + - Categorical eligibility rules + + This test finds households where we can verify the recalculation + is applying different state rules. 
+ """ + targets_df = combined_matrix_data["targets_df"] + X_sparse = combined_matrix_data["X_sparse"] + n_households = combined_matrix_data["n_households"] + cds = combined_matrix_data["cds"] + state_fips_orig = combined_matrix_data["state_fips"] + + # Group CDs by state + cds_by_state = {} + for cd_idx, cd in enumerate(cds): + state = int(cd) // 100 + if state not in cds_by_state: + cds_by_state[state] = [] + cds_by_state[state].append((cd_idx, cd)) + + states = sorted(cds_by_state.keys()) + if len(states) < 2: + pytest.skip("Need at least 2 states") + + # Find national SNAP target (most state-dependent) + snap_national = targets_df[ + (targets_df["variable"] == "snap") + & ( + targets_df["geographic_id"].apply( + lambda x: _get_geo_level(x) == "national" + ) + ) + ] + + if len(snap_national) == 0: + pytest.skip("No national SNAP target found") + + row_idx = snap_national.iloc[0].name + row = X_sparse[row_idx, :].toarray().flatten() + + # Find households with interesting variation patterns + example_households = [] + + for hh_idx in range(n_households): + state_values = {} + for state, cd_list in cds_by_state.items(): + cd_idx, _ = cd_list[0] + col_idx = cd_idx * n_households + hh_idx + state_values[state] = row[col_idx] + + # Look for households where: + # 1. At least 2 states have non-zero SNAP + # 2. The values differ significantly (>10% relative difference) + nonzero_states = {s: v for s, v in state_values.items() if v > 0} + + if len(nonzero_states) >= 2: + values = list(nonzero_states.values()) + max_val = max(values) + min_val = min(values) + if min_val > 0 and (max_val - min_val) / min_val > 0.1: + example_households.append( + { + "hh_idx": hh_idx, + "original_state": state_fips_orig[hh_idx], + "state_values": nonzero_states, + "max_val": max_val, + "min_val": min_val, + "variation": (max_val - min_val) / min_val, + } + ) + + if len(example_households) >= 5: + break + + # Assert we found at least one household with variation + assert len(example_households) > 0, ( + "Expected to find households with >10% SNAP variation across " + "states, confirming state-specific rules are applied" + ) From fe70932ae8f714a8e86696cb3bb8d67b0e8045f2 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 21 Jan 2026 20:47:13 +0530 Subject: [PATCH 2/6] lint after updating black --- policyengine_us_data/datasets/cps/cps.py | 1 - policyengine_us_data/datasets/cps/enhanced_cps.py | 1 - .../datasets/cps/local_area_calibration/calibration_utils.py | 1 - .../datasets/cps/local_area_calibration/matrix_tracer.py | 1 - policyengine_us_data/datasets/puf/puf.py | 1 - policyengine_us_data/datasets/puf/uprate_puf.py | 1 - policyengine_us_data/db/create_database_tables.py | 1 - policyengine_us_data/db/etl_age.py | 1 - policyengine_us_data/db/etl_irs_soi.py | 1 - policyengine_us_data/db/validate_database.py | 1 - .../storage/calibration_targets/pull_snap_targets.py | 1 - policyengine_us_data/tests/test_datasets/test_county_fips.py | 1 - .../test_local_area_calibration/test_sparse_matrix_builder.py | 1 - policyengine_us_data/utils/census.py | 1 - policyengine_us_data/utils/huggingface.py | 1 - policyengine_us_data/utils/loss.py | 1 - policyengine_us_data/utils/spm.py | 1 - tests/test_h6_reform.py | 1 - 18 files changed, 18 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 19ee9249..27a41bec 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -15,7 +15,6 @@ from microimpute.models.qrf import QRF import 
logging - test_lite = os.environ.get("TEST_LITE") == "true" print(f"TEST_LITE == {test_lite}") diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 8bbe67bc..4eb0a660 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -22,7 +22,6 @@ from pathlib import Path import logging - try: import torch except ImportError: diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index c2e2a08f..f01465a2 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -17,7 +17,6 @@ StateCode, ) - # State/Geographic Mappings STATE_CODES = { 1: "AL", diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py b/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py index e7cbf57b..4823de1e 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py @@ -46,7 +46,6 @@ create_target_groups, ) - logger = logging.getLogger(__name__) diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 9d605aca..c90255e3 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -15,7 +15,6 @@ create_policyengine_uprating_factors_table, ) - rng = np.random.default_rng(seed=64) # Get Qualified Business Income simulation parameters --- diff --git a/policyengine_us_data/datasets/puf/uprate_puf.py b/policyengine_us_data/datasets/puf/uprate_puf.py index 1cf0eb9c..96144615 100644 --- a/policyengine_us_data/datasets/puf/uprate_puf.py +++ b/policyengine_us_data/datasets/puf/uprate_puf.py @@ -2,7 +2,6 @@ import numpy as np from policyengine_us_data.storage import STORAGE_FOLDER - ITMDED_GROW_RATE = 0.02 # annual growth rate in itemized deduction amounts USE_VARIABLE_SPECIFIC_POPULATION_GROWTH_DIVISORS = False diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index df03772d..920d1449 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -15,7 +15,6 @@ from policyengine_us_data.storage import STORAGE_FOLDER - logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index bb83067c..d80faf06 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -11,7 +11,6 @@ ) from policyengine_us_data.utils.census import get_census_docs, pull_acs_table - LABEL_TO_SHORT = { "Estimate!!Total!!Total population!!AGE!!Under 5 years": "0-4", "Estimate!!Total!!Total population!!AGE!!5 to 9 years": "5-9", diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 786abb1c..6607a5dd 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -24,7 +24,6 @@ get_district_mapping, ) - """See the 22incddocguide.docx manual from the IRS SOI""" # Let's make this work with strict inequalities # Language in the doc: '$10,000 under $25,000' diff --git 
a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py index fee6a49d..53ac0985 100644 --- a/policyengine_us_data/db/validate_database.py +++ b/policyengine_us_data/db/validate_database.py @@ -9,7 +9,6 @@ import pandas as pd from policyengine_us.system import system - conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db") stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn) diff --git a/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py b/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py index 349e6fbd..1830bdb3 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py @@ -9,7 +9,6 @@ STATE_NAME_TO_ABBREV, ) - STATE_NAME_TO_FIPS = { "Alabama": "01", "Alaska": "02", diff --git a/policyengine_us_data/tests/test_datasets/test_county_fips.py b/policyengine_us_data/tests/test_datasets/test_county_fips.py index ad1f10c5..d692cf55 100644 --- a/policyengine_us_data/tests/test_datasets/test_county_fips.py +++ b/policyengine_us_data/tests/test_datasets/test_county_fips.py @@ -10,7 +10,6 @@ LOCAL_FOLDER, ) - # Sample data that mimics the format from census.gov SAMPLE_CENSUS_DATA = """STATE|STATEFP|COUNTYFP|COUNTYNAME AL|01|001|Autauga County diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py index 175488be..76309388 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py @@ -35,7 +35,6 @@ SparseMatrixBuilder, ) - # ============================================================================= # CONFIGURATION - Update these lists as new variables are added # ============================================================================= diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index 2f424ccb..8081b616 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -4,7 +4,6 @@ import pandas as pd import numpy as np - STATE_NAME_TO_FIPS = { "Alabama": "01", "Alaska": "02", diff --git a/policyengine_us_data/utils/huggingface.py b/policyengine_us_data/utils/huggingface.py index 2860adf3..a312b524 100644 --- a/policyengine_us_data/utils/huggingface.py +++ b/policyengine_us_data/utils/huggingface.py @@ -1,7 +1,6 @@ from huggingface_hub import hf_hub_download, login, HfApi import os - TOKEN = os.environ.get("HUGGING_FACE_TOKEN") if not TOKEN: raise ValueError( diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index cbea6dab..e368d504 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -9,7 +9,6 @@ from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi - # CPS-derived statistics # Medical expenses, sum of spm thresholds # Child support expenses diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index 070db533..b2e4538b 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -3,7 +3,6 @@ import numpy as np from spm_calculator import SPMCalculator, spm_equivalence_scale - TENURE_CODE_MAP = { 1: "owner_with_mortgage", 2: "owner_without_mortgage", diff --git 
a/tests/test_h6_reform.py b/tests/test_h6_reform.py index 7253ed97..e68ed8db 100644 --- a/tests/test_h6_reform.py +++ b/tests/test_h6_reform.py @@ -11,7 +11,6 @@ import pytest - # Constants from the H6 reform implementation HI_SINGLE = 34_000 HI_JOINT = 44_000 From 653415139cd7fffd966d22d1488db30dcfe9107f Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 22 Jan 2026 14:39:20 +0530 Subject: [PATCH 3/6] remove redundant tests and fixtures --- .../test_local_area_calibration/conftest.py | 65 ++- .../test_cross_state.py | 58 ++- ...r.py => test_matrix_national_variation.py} | 436 +----------------- .../test_same_state.py | 59 ++- 4 files changed, 172 insertions(+), 446 deletions(-) rename policyengine_us_data/tests/test_local_area_calibration/{test_sparse_matrix_builder.py => test_matrix_national_variation.py} (53%) diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py index 04d6d7f5..d4b5edc6 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/conftest.py +++ b/policyengine_us_data/tests/test_local_area_calibration/conftest.py @@ -1,4 +1,7 @@ -"""Shared fixtures for local area calibration tests.""" +"""Shared fixtures for local area calibration tests. + +Importantly, this file determines which variables will be included in the sparse matrix and calibrating routine. +""" import pytest import numpy as np @@ -16,6 +19,56 @@ get_calculated_variables, ) +# Variables to test for state-level value matching +# Format: (variable_name, rtol) +# variable_name as per the targets in policy_data.db +# rtol is relative tolerance for comparison +VARIABLES_TO_TEST = [ + ("snap", 1e-2), + ("health_insurance_premiums_without_medicare_part_b", 1e-2), + ("medicaid", 1e-2), + ("medicare_part_b_premiums", 1e-2), + ("other_medical_expenses", 1e-2), + ("over_the_counter_health_expenses", 1e-2), + ("salt_deduction", 1e-2), + ("spm_unit_capped_work_childcare_expenses", 1e-2), + ("spm_unit_capped_housing_subsidy", 1e-2), + ("ssi", 1e-2), + ("tanf", 1e-2), + ("tip_income", 1e-2), + ("unemployment_compensation", 1e-2), +] + +# Combined filter config to build matrix with all variables at once +COMBINED_FILTER_CONFIG = { + "stratum_group_ids": [ + 4, # SNAP targets + 5, # Medicaid targets + 112, # Unemployment compensation targets + ], + "variables": [ + "snap", + "health_insurance_premiums_without_medicare_part_b", + "medicaid", + "medicare_part_b_premiums", + "other_medical_expenses", + "over_the_counter_health_expenses", + "salt_deduction", + "spm_unit_capped_work_childcare_expenses", + "spm_unit_capped_housing_subsidy", + "ssi", + "tanf", + "tip_income", + "unemployment_compensation", + ], +} + +# Maximum allowed mismatch rate for state-level value comparison +MAX_MISMATCH_RATE = 0.02 + +# Number of samples for cell-level verification tests +N_VERIFICATION_SAMPLES = 200 + @pytest.fixture(scope="module") def db_uri(): @@ -30,7 +83,7 @@ def dataset_path(): @pytest.fixture(scope="module") def test_cds(db_uri): - """CDs from NC, HI, MT, AK (manageable size, multiple same-state CDs).""" + """CDs from multiple states for comprehensive testing.""" engine = create_engine(db_uri) query = """ SELECT DISTINCT sc.value as cd_geoid @@ -43,6 +96,10 @@ def test_cds(db_uri): OR sc.value LIKE '150_' OR sc.value LIKE '300_' OR sc.value = '200' OR sc.value = '201' + OR sc.value IN ('101', '102') + OR sc.value IN ('601', '602') + OR sc.value IN ('3601', '3602') + OR sc.value IN ('4801', '4802') ) ORDER BY sc.value """ @@ -58,7 
+115,7 @@ def sim(dataset_path): @pytest.fixture(scope="module") def matrix_data(db_uri, dataset_path, test_cds, sim): - """Build sparse matrix, return (targets_df, X_sparse, household_id_mapping).""" + """Build sparse matrix with all configured variables.""" builder = SparseMatrixBuilder( db_uri, time_period=2023, @@ -66,7 +123,7 @@ def matrix_data(db_uri, dataset_path, test_cds, sim): dataset_path=dataset_path, ) targets_df, X_sparse, household_id_mapping = builder.build_matrix( - sim, target_filter={"stratum_group_ids": [4], "variables": ["snap"]} + sim, target_filter=COMBINED_FILTER_CONFIG ) return targets_df, X_sparse, household_id_mapping diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py index ea9eca6f..f3615e30 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py @@ -2,17 +2,19 @@ import pytest import numpy as np +from collections import defaultdict from policyengine_us import Microsimulation from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( get_calculated_variables, ) +from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES + def test_cross_state_matches_swapped_sim( X_sparse, targets_df, - tracer, test_cds, dataset_path, n_households, @@ -25,8 +27,10 @@ def test_cross_state_matches_swapped_sim( When household moves to different state, X_sparse should contain the value calculated from a fresh simulation with state_fips set to destination state. + + Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST + are covered with approximately equal samples per variable. 
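+
+    With the defaults assumed here (N_VERIFICATION_SAMPLES = 200 and 13
+    entries in VARIABLES_TO_TEST), that works out to roughly 15 sampled cells
+    per variable whenever every variable has cross-state non-zero cells.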
""" - n_samples = 200 seed = 42 rng = np.random.default_rng(seed) n_hh = n_households @@ -48,28 +52,46 @@ def get_state_sim(state): nonzero_rows, nonzero_cols = X_sparse.nonzero() - cross_state_indices = [] + # Group cross-state cells by variable for stratified sampling + variable_to_indices = defaultdict(list) + variables_to_test = {v[0] for v in VARIABLES_TO_TEST} + for i in range(len(nonzero_rows)): + row_idx = nonzero_rows[i] col_idx = nonzero_cols[i] cd_idx = col_idx // n_hh hh_idx = col_idx % n_hh cd = test_cds[cd_idx] dest_state = int(cd) // 100 orig_state = int(hh_states[hh_idx]) - if dest_state != orig_state: - cross_state_indices.append(i) - if not cross_state_indices: - pytest.skip("No cross-state non-zero cells found") + # Only include cross-state cells + if dest_state == orig_state: + continue + + # Get variable for this row + variable = targets_df.iloc[row_idx]["variable"] + if variable in variables_to_test: + variable_to_indices[variable].append(i) + + if not variable_to_indices: + pytest.skip("No cross-state non-zero cells found for test variables") - sample_idx = rng.choice( - cross_state_indices, - min(n_samples, len(cross_state_indices)), - replace=False, + # Stratified sampling: sample proportionally from each variable + samples_per_var = max( + 1, N_VERIFICATION_SAMPLES // len(variable_to_indices) ) + sample_indices = [] + + for variable, indices in variable_to_indices.items(): + n_to_sample = min(samples_per_var, len(indices)) + sampled = rng.choice(indices, n_to_sample, replace=False) + sample_indices.extend(sampled) + errors = [] + variables_tested = set() - for idx in sample_idx: + for idx in sample_indices: row_idx = nonzero_rows[idx] col_idx = nonzero_cols[idx] cd_idx = col_idx // n_hh @@ -83,6 +105,8 @@ def get_state_sim(state): state_sim.calculate(variable, map_to="household").values[hh_idx] ) + variables_tested.add(variable) + if not np.isclose(actual, expected, atol=0.5): errors.append( { @@ -95,7 +119,13 @@ def get_state_sim(state): } ) + # Report which variables were tested + missing_vars = variables_to_test - variables_tested + if missing_vars: + print(f"Warning: No cross-state cells found for: {missing_vars}") + assert not errors, ( - f"Cross-state verification failed: {len(errors)}/{len(sample_idx)} " - f"mismatches. First 5: {errors[:5]}" + f"Cross-state verification failed: {len(errors)}/{len(sample_indices)} " + f"mismatches across {len(variables_tested)} variables. " + f"First 5: {errors[:5]}" ) diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py similarity index 53% rename from policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py rename to policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py index 76309388..09cba3d1 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py @@ -1,162 +1,31 @@ """ -Tests for sparse matrix builder correctness. +Tests for correctness in the sparse matrix builder, particularly for national level contributions. These tests verify that: 1. Matrix shape and structure are correct -2. Matrix cell values match simulation calculations for households in their - original state -3. Variable aggregation (person to household) preserves totals -4. 
National-level targets receive contributions from all states (no geographic +2. Variable aggregation (person to household) preserves totals +3. National-level targets receive contributions from all states (no geographic bias) - -The key verification approach: -- When households are "borrowed" to different geographic areas, state_fips is - changed and variables are recalculated -- For households borrowed to CDs in their ORIGINAL state, the recalculated - values should match the original simulation values exactly (since state_fips - is unchanged) -- This provides a ground-truth verification without needing end-to-end H5 - creation - -IMPORTANT NOTE on stochastic eligibility: -Some variables like SNAP have eligibility tests that use PolicyEngine's -random() function. When variables are recalculated in the matrix builder (via -fresh simulations), the random seed sequence may differ, causing ~1-3% of -households to have different eligibility outcomes. This is expected behavior, -so tests allow up to 2% mismatch rate for such variables. +4. Cross-state recalculation applies state-specific rules """ import pytest import numpy as np import pandas as pd -from policyengine_us import Microsimulation -from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( SparseMatrixBuilder, ) -# ============================================================================= -# CONFIGURATION - Update these lists as new variables are added -# ============================================================================= - -# Variables to test for state-level value matching -# Format: (variable_name, rtol) - rtol is relative tolerance for comparison -VARIABLES_TO_TEST = [ - ("snap", 1e-2), - ("health_insurance_premiums_without_medicare_part_b", 1e-2), - ("medicaid", 1e-2), - ("medicare_part_b_premiums", 1e-2), - ("other_medical_expenses", 1e-2), - ("over_the_counter_health_expenses", 1e-2), - ("salt_deduction", 1e-2), - ("spm_unit_capped_work_childcare_expenses", 1e-2), - ("spm_unit_capped_housing_subsidy", 1e-2), - ("ssi", 1e-2), - ("tanf", 1e-2), - ("tip_income", 1e-2), - ("unemployment_compensation", 1e-2), -] - -# Combined filter config to build matrix with all variables at once -COMBINED_FILTER_CONFIG = { - "stratum_group_ids": [ - 4, # SNAP targets - 5, # Medicaid targets - 112, # Unemployment compensation targets - ], - "variables": [ - "snap", - "health_insurance_premiums_without_medicare_part_b", - "medicaid", - "medicare_part_b_premiums", - "other_medical_expenses", - "over_the_counter_health_expenses", - "salt_deduction", - "spm_unit_capped_work_childcare_expenses", - "spm_unit_capped_housing_subsidy", - "ssi", - "tanf", - "tip_income", - "unemployment_compensation", - ], -} +from .conftest import ( + VARIABLES_TO_TEST, + COMBINED_FILTER_CONFIG, +) +# Variables with state-specific variation (e.g., SNAP eligibility) VARIABLES_WITH_STATE_VARIATION = [ "snap", ] -# Complications: -# (snap) -# (unemployment_compensation) -# income_tax -# qualified_business_income_deduction -# taxable_social_security -# taxable_pension_income -# taxable_ira_distributions -# taxable_interest_income -# tax_exempt_interest_income -# self_employment_income -# salt -# refundable_ctc -# real_estate_taxes -# qualified_dividend_income -# dividend_income -# adjusted_gross_income -# eitc - -# Maximum allowed mismatch rate for state-level value comparison -MAX_MISMATCH_RATE = 0.02 - - -# 
============================================================================= -# FIXTURES -# ============================================================================= - - -@pytest.fixture(scope="module") -def db_uri(): - """Database URI for calibration targets.""" - db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" - return f"sqlite:///{db_path}" - - -@pytest.fixture(scope="module") -def dataset_path(): - """Path to stratified extended CPS dataset.""" - return str(STORAGE_FOLDER / "stratified_extended_cps_2023.h5") - - -@pytest.fixture(scope="module") -def sim(dataset_path): - """Base simulation loaded from stratified CPS.""" - return Microsimulation(dataset=dataset_path) - - -@pytest.fixture(scope="module") -def test_cds(): - """ - Test CDs spanning multiple states for comprehensive testing. - - Selected to include: - - Small states (1-2 CDs): AL, MT - - Medium states: NC - - Large states: CA, TX, NY - """ - return [ - "101", # Alabama CD-1 (state_fips=1) - "102", # Alabama CD-2 - "601", # California CD-1 (state_fips=6) - "602", # California CD-2 - "3001", # Montana CD-1 (state_fips=30) - "3002", # Montana CD-2 - "3701", # North Carolina CD-1 (state_fips=37) - "3702", # North Carolina CD-2 - "3601", # New York CD-1 (state_fips=36) - "3602", # New York CD-2 - "4801", # Texas CD-1 (state_fips=48) - "4802", # Texas CD-2 - ] - @pytest.fixture(scope="module") def builder(db_uri, dataset_path, test_cds): @@ -169,38 +38,6 @@ def builder(db_uri, dataset_path, test_cds): ) -@pytest.fixture(scope="module") -def combined_matrix_data(sim, builder): - """ - Build matrix once with all configured variables. - - This fixture is used by the consolidated test to avoid rebuilding - the matrix for each variable. - """ - targets_df, X_sparse, hh_mapping = builder.build_matrix( - sim, - target_filter=COMBINED_FILTER_CONFIG, - ) - - household_ids = sim.calculate("household_id", map_to="household").values - state_fips = sim.calculate("state_fips", map_to="household").values - - return { - "targets_df": targets_df, - "X_sparse": X_sparse, - "hh_mapping": hh_mapping, - "household_ids": household_ids, - "state_fips": state_fips, - "cds": builder.cds_to_calibrate, - "n_households": len(household_ids), - } - - -# ============================================================================= -# HELPER FUNCTIONS -# ============================================================================= - - def _get_geo_level(geo_id) -> str: """Determine geographic level from geographic_id.""" if geo_id == "US": @@ -215,113 +52,6 @@ def _get_geo_level(geo_id) -> str: return "unknown" -def _verify_state_level_values( - X_sparse, - targets_df, - original_values, - original_state_fips, - cds, - n_households, - variable_name, - rtol=1e-2, -): - """ - Verify that matrix values match original values for households in their - original state. 
- - Returns: - Tuple of (verified_count, mismatches_list, skipped_reason or None) - """ - # Get state-level targets - state_targets = targets_df[ - (targets_df["variable"] == variable_name) - & (targets_df["geographic_id"].apply(lambda x: str(x).isdigit())) - & ( - targets_df["geographic_id"].apply( - lambda x: 1 <= int(x) <= 56 if str(x).isdigit() else False - ) - ) - ] - - if len(state_targets) == 0: - return 0, [], f"No state-level targets for {variable_name}" - - mismatches = [] - verified_count = 0 - - for _, target_row in state_targets.iterrows(): - target_state = int(target_row["geographic_id"]) - row_idx = target_row.name - - # Find all CDs in this state - state_cds = [ - (cd_idx, cd) - for cd_idx, cd in enumerate(cds) - if int(cd) // 100 == target_state - ] - - if not state_cds: - continue - - # Find households originally from this state - hh_from_state_mask = original_state_fips == target_state - hh_indices_from_state = np.where(hh_from_state_mask)[0] - - if len(hh_indices_from_state) == 0: - continue - - # For each CD in the state, check matrix values - for cd_idx, cd in state_cds: - col_start = cd_idx * n_households - - for hh_idx in hh_indices_from_state: - col_idx = col_start + hh_idx - matrix_val = X_sparse[row_idx, col_idx] - original_val = original_values[hh_idx] - - if original_val == 0 and matrix_val == 0: - verified_count += 1 - continue - - if original_val != 0: - rel_diff = abs(matrix_val - original_val) / abs( - original_val - ) - if rel_diff > rtol: - mismatches.append( - { - "variable": variable_name, - "state": target_state, - "cd": cd, - "hh_idx": hh_idx, - "matrix_val": float(matrix_val), - "original_val": float(original_val), - "rel_diff": rel_diff, - } - ) - else: - verified_count += 1 - elif matrix_val != 0: - mismatches.append( - { - "variable": variable_name, - "state": target_state, - "cd": cd, - "hh_idx": hh_idx, - "matrix_val": float(matrix_val), - "original_val": float(original_val), - "rel_diff": float("inf"), - } - ) - - return verified_count, mismatches, None - - -# ============================================================================= -# BASIC STRUCTURE TESTS -# ============================================================================= - - def test_person_level_aggregation_preserves_totals(sim): """Health insurance premiums (person-level) sum correctly to household.""" var = "health_insurance_premiums_without_medicare_part_b" @@ -357,112 +87,6 @@ def test_combined_variables_in_matrix(sim, builder): assert var_name in variables, f"Missing variable: {var_name}" -# ============================================================================= -# CONSOLIDATED STATE-LEVEL VALUE TEST -# ============================================================================= - - -class TestStateLevelValues: - """ - Consolidated test for verifying matrix values match original simulation - values for households in their original state. - - Builds matrix once and iterates through all configured variables. - """ - - def test_all_variables_state_level_match(self, sim, combined_matrix_data): - """ - Verify all configured variables have correct state-level values. - - For each variable: - 1. Calculate original values from simulation - 2. Compare to matrix values for households in their original state - 3. 
Allow up to MAX_MISMATCH_RATE due to stochastic eligibility - """ - results = [] - all_mismatches = [] - - for variable_name, rtol in VARIABLES_TO_TEST: - # Calculate original values for this variable - original_values = sim.calculate( - variable_name, map_to="household" - ).values - - verified, mismatches, skip_reason = _verify_state_level_values( - X_sparse=combined_matrix_data["X_sparse"], - targets_df=combined_matrix_data["targets_df"], - original_values=original_values, - original_state_fips=combined_matrix_data["state_fips"], - cds=combined_matrix_data["cds"], - n_households=combined_matrix_data["n_households"], - variable_name=variable_name, - rtol=rtol, - ) - - total_checked = verified + len(mismatches) - mismatch_rate = ( - len(mismatches) / total_checked if total_checked > 0 else 0 - ) - - results.append( - { - "variable": variable_name, - "verified": verified, - "mismatches": len(mismatches), - "total": total_checked, - "mismatch_rate": mismatch_rate, - "skip_reason": skip_reason, - "passed": ( - skip_reason is not None - or mismatch_rate <= MAX_MISMATCH_RATE - ), - } - ) - - all_mismatches.extend(mismatches) - - # Print summary - print("\n" + "=" * 70) - print("STATE-LEVEL VALUE VERIFICATION SUMMARY") - print("=" * 70) - - results_df = pd.DataFrame(results) - for _, row in results_df.iterrows(): - if row["skip_reason"]: - status = f"SKIPPED: {row['skip_reason']}" - elif row["passed"]: - status = ( - f"PASSED: {row['verified']:,} verified, " - f"{row['mismatch_rate']:.1%} mismatch rate" - ) - else: - status = ( - f"FAILED: {row['mismatches']:,} mismatches, " - f"{row['mismatch_rate']:.1%} > {MAX_MISMATCH_RATE:.0%}" - ) - print(f" {row['variable']}: {status}") - - # Show sample mismatches if any - if all_mismatches: - print(f"\nSample mismatches ({len(all_mismatches)} total):") - mismatch_df = pd.DataFrame(all_mismatches) - print(mismatch_df.head(15).to_string()) - - mismatch_df.to_csv("state_level_mismatches.csv", index=False) - - # Assert all variables passed - failed = [r for r in results if not r["passed"]] - assert len(failed) == 0, ( - f"{len(failed)} variable(s) failed state-level verification: " - f"{[r['variable'] for r in failed]}" - ) - - -# ============================================================================= -# NATIONAL-LEVEL CONTRIBUTION TEST -# ============================================================================= - - class TestNationalLevelContributions: """ Tests verifying that national-level targets receive contributions from @@ -476,7 +100,7 @@ class TestNationalLevelContributions: """ def test_national_targets_receive_multistate_contributions( - self, sim, combined_matrix_data + self, targets_df, X_sparse, household_states, n_households, test_cds ): """ Verify that national-level targets have contributions from households @@ -489,11 +113,8 @@ def test_national_targets_receive_multistate_contributions( 3. 
Verify contributions come from multiple states (not geographically biased) """ - targets_df = combined_matrix_data["targets_df"] - X_sparse = combined_matrix_data["X_sparse"] - state_fips = combined_matrix_data["state_fips"] - n_households = combined_matrix_data["n_households"] - cds = combined_matrix_data["cds"] + state_fips = household_states + cds = test_cds # Find national-level targets national_targets = targets_df[ @@ -577,7 +198,7 @@ def test_national_targets_receive_multistate_contributions( ) def test_state_distribution_in_national_targets( - self, sim, combined_matrix_data + self, targets_df, X_sparse, household_states, n_households, test_cds ): """ Verify the distribution of contributing states in national targets @@ -586,11 +207,8 @@ def test_state_distribution_in_national_targets( This catches cases where one state dominates the contributions disproportionately. """ - targets_df = combined_matrix_data["targets_df"] - X_sparse = combined_matrix_data["X_sparse"] - state_fips = combined_matrix_data["state_fips"] - n_households = combined_matrix_data["n_households"] - cds = combined_matrix_data["cds"] + state_fips = household_states + cds = test_cds # Get original state distribution (count of households per state) unique_states, original_counts = np.unique( @@ -656,11 +274,6 @@ def test_state_distribution_in_national_targets( ) -# ============================================================================= -# CROSS-STATE RECALCULATION TEST -# ============================================================================= - - class TestCrossStateRecalculation: """ Tests verifying that household values change when borrowed to different @@ -671,10 +284,13 @@ class TestCrossStateRecalculation: what the variable would be if that household lived in that CD's state. For state-dependent variables (like SNAP), values should differ across states for at least some households. + + NOTE: This complements test_cross_state.py which verifies exact values. + These tests verify that variation exists (state rules are applied). """ def test_values_change_across_states_for_national_targets( - self, combined_matrix_data + self, targets_df, X_sparse, n_households, test_cds ): """ Verify that for national targets, household values vary across CD @@ -690,10 +306,7 @@ def test_values_change_across_states_for_national_targets( 3. At least some households should have different values in different states (confirming recalculation with different state rules) """ - targets_df = combined_matrix_data["targets_df"] - X_sparse = combined_matrix_data["X_sparse"] - n_households = combined_matrix_data["n_households"] - cds = combined_matrix_data["cds"] + cds = test_cds # Group CDs by state cds_by_state = {} @@ -785,7 +398,7 @@ def test_values_change_across_states_for_national_targets( ) def test_same_household_different_states_shows_rule_changes( - self, combined_matrix_data + self, targets_df, X_sparse, household_states, n_households, test_cds ): """ Deep dive test: pick specific households and verify their values @@ -800,11 +413,8 @@ def test_same_household_different_states_shows_rule_changes( This test finds households where we can verify the recalculation is applying different state rules. 
""" - targets_df = combined_matrix_data["targets_df"] - X_sparse = combined_matrix_data["X_sparse"] - n_households = combined_matrix_data["n_households"] - cds = combined_matrix_data["cds"] - state_fips_orig = combined_matrix_data["state_fips"] + state_fips_orig = household_states + cds = test_cds # Group CDs by state cds_by_state = {} diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py index a13f459d..c9507aaf 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py @@ -2,18 +2,19 @@ import pytest import numpy as np +from collections import defaultdict from policyengine_us import Microsimulation from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( get_calculated_variables, ) +from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES + def test_same_state_matches_original( X_sparse, targets_df, - tracer, - sim, test_cds, dataset_path, n_households, @@ -25,8 +26,10 @@ def test_same_state_matches_original( When household stays in same state, X_sparse should contain the value calculated from a fresh simulation with state_fips set to that state. + + Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST + are covered with approximately equal samples per variable. """ - n_samples = 200 seed = 42 rng = np.random.default_rng(seed) n_hh = n_households @@ -48,28 +51,46 @@ def get_state_sim(state): nonzero_rows, nonzero_cols = X_sparse.nonzero() - same_state_indices = [] + # Group same-state cells by variable for stratified sampling + variable_to_indices = defaultdict(list) + variables_to_test = {v[0] for v in VARIABLES_TO_TEST} + for i in range(len(nonzero_rows)): + row_idx = nonzero_rows[i] col_idx = nonzero_cols[i] cd_idx = col_idx // n_hh hh_idx = col_idx % n_hh cd = test_cds[cd_idx] dest_state = int(cd) // 100 orig_state = int(hh_states[hh_idx]) - if dest_state == orig_state: - same_state_indices.append(i) - if not same_state_indices: - pytest.skip("No same-state non-zero cells found") + # Only include same-state cells + if dest_state != orig_state: + continue + + # Get variable for this row + variable = targets_df.iloc[row_idx]["variable"] + if variable in variables_to_test: + variable_to_indices[variable].append(i) + + if not variable_to_indices: + pytest.skip("No same-state non-zero cells found for test variables") - sample_idx = rng.choice( - same_state_indices, - min(n_samples, len(same_state_indices)), - replace=False, + # Stratified sampling: sample proportionally from each variable + samples_per_var = max( + 1, N_VERIFICATION_SAMPLES // len(variable_to_indices) ) + sample_indices = [] + + for variable, indices in variable_to_indices.items(): + n_to_sample = min(samples_per_var, len(indices)) + sampled = rng.choice(indices, n_to_sample, replace=False) + sample_indices.extend(sampled) + errors = [] + variables_tested = set() - for idx in sample_idx: + for idx in sample_indices: row_idx = nonzero_rows[idx] col_idx = nonzero_cols[idx] cd_idx = col_idx // n_hh @@ -83,6 +104,8 @@ def get_state_sim(state): state_sim.calculate(variable, map_to="household").values[hh_idx] ) + variables_tested.add(variable) + if not np.isclose(actual, expected, atol=0.5): errors.append( { @@ -93,7 +116,13 @@ def get_state_sim(state): } ) + # Report which variables were tested + missing_vars = variables_to_test - variables_tested + if 
missing_vars: + print(f"Warning: No same-state cells found for: {missing_vars}") + assert not errors, ( - f"Same-state verification failed: {len(errors)}/{len(sample_idx)} " - f"mismatches. First 5: {errors[:5]}" + f"Same-state verification failed: {len(errors)}/{len(sample_indices)} " + f"mismatches across {len(variables_tested)} variables. " + f"First 5: {errors[:5]}" ) From 89ce2c8ad93a40fd5cb19abdcd29de5ea457891d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 22 Jan 2026 18:35:12 +0530 Subject: [PATCH 4/6] update test_same_test so it compares to original values instead of fresh calculations --- .../test_same_state.py | 55 +++++++++---------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py index c9507aaf..ec9200b3 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py @@ -1,31 +1,26 @@ -"""Test same-state values match fresh simulations.""" +"""Test same-state values match original simulation values.""" import pytest import numpy as np from collections import defaultdict -from policyengine_us import Microsimulation -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - get_calculated_variables, -) - from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES def test_same_state_matches_original( + sim, X_sparse, targets_df, test_cds, - dataset_path, n_households, household_ids, household_states, ): """ - Same-state non-zero cells must match fresh same-state simulation. + Same-state non-zero cells must match ORIGINAL simulation values. When household stays in same state, X_sparse should contain the value - calculated from a fresh simulation with state_fips set to that state. + from the original simulation (ground truth from H5 dataset). Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST are covered with approximately equal samples per variable. 
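The stratified sampling described in the docstring above groups the matrix's non-zero cells by the variable of their target row and then draws roughly N_VERIFICATION_SAMPLES / n_variables cells from each group, so sparsely targeted variables are not crowded out by common ones. A minimal standalone sketch of that idea follows; the cell list and sample count are made up for illustration and are not part of the patch:

    import numpy as np
    from collections import defaultdict

    rng = np.random.default_rng(42)

    # Hypothetical non-zero cells, labelled by the variable of their target row.
    cells = [("snap", 0), ("snap", 1), ("eitc", 2), ("income_tax", 3), ("income_tax", 4)]

    by_variable = defaultdict(list)
    for variable, cell_idx in cells:
        by_variable[variable].append(cell_idx)

    # Roughly equal samples per variable, mirroring the test above.
    total_samples = 6
    per_variable = max(1, total_samples // len(by_variable))

    sample_indices = []
    for variable, indices in by_variable.items():
        n_to_sample = min(per_variable, len(indices))
        sample_indices.extend(rng.choice(indices, n_to_sample, replace=False))

With uniform sampling over all non-zero cells, rarely targeted variables could easily receive zero samples; the per-variable split is what lets the test report any variables that were never exercised.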
@@ -36,19 +31,6 @@ def test_same_state_matches_original( hh_ids = household_ids hh_states = household_states - state_sims = {} - - def get_state_sim(state): - if state not in state_sims: - s = Microsimulation(dataset=dataset_path) - s.set_input( - "state_fips", 2023, np.full(n_hh, state, dtype=np.int32) - ) - for var in get_calculated_variables(s): - s.delete_arrays(var) - state_sims[state] = s - return state_sims[state] - nonzero_rows, nonzero_cols = X_sparse.nonzero() # Group same-state cells by variable for stratified sampling @@ -68,7 +50,6 @@ def get_state_sim(state): if dest_state != orig_state: continue - # Get variable for this row variable = targets_df.iloc[row_idx]["variable"] if variable in variables_to_test: variable_to_indices[variable].append(i) @@ -87,6 +68,16 @@ def get_state_sim(state): sampled = rng.choice(indices, n_to_sample, replace=False) sample_indices.extend(sampled) + # Cache original values per variable to avoid repeated calculations + original_values_cache = {} + + def get_original_values(variable): + if variable not in original_values_cache: + original_values_cache[variable] = sim.calculate( + variable, map_to="household" + ).values + return original_values_cache[variable] + errors = [] variables_tested = set() @@ -95,14 +86,12 @@ def get_state_sim(state): col_idx = nonzero_cols[idx] cd_idx = col_idx // n_hh hh_idx = col_idx % n_hh - cd = test_cds[cd_idx] - dest_state = int(cd) // 100 variable = targets_df.iloc[row_idx]["variable"] actual = float(X_sparse[row_idx, col_idx]) - state_sim = get_state_sim(dest_state) - expected = float( - state_sim.calculate(variable, map_to="household").values[hh_idx] - ) + + # Compare to ORIGINAL simulation values (ground truth) + original_values = get_original_values(variable) + expected = float(original_values[hh_idx]) variables_tested.add(variable) @@ -110,13 +99,19 @@ def get_state_sim(state): errors.append( { "hh_id": hh_ids[hh_idx], + "hh_idx": hh_idx, "variable": variable, "actual": actual, "expected": expected, + "diff": actual - expected, + "rel_diff": ( + (actual - expected) / expected + if expected != 0 + else np.inf + ), } ) - # Report which variables were tested missing_vars = variables_to_test - variables_tested if missing_vars: print(f"Warning: No same-state cells found for: {missing_vars}") From b18680d7bc6cf49923ccd6eb01c8849991383d3d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 22 Jan 2026 21:04:23 +0530 Subject: [PATCH 5/6] adding matrix builder improvements --- .../calibration_utils.py | 64 +++++--- .../sparse_matrix_builder.py | 138 ++++++++++++++++-- .../test_local_area_calibration/conftest.py | 48 +++++- 3 files changed, 220 insertions(+), 30 deletions(-) diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index f01465a2..aa954aba 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -192,15 +192,21 @@ def get_calculated_variables(sim) -> List[str]: """ Return variables that should be cleared for state-swap recalculation. - Includes variables with formulas, adds, or subtracts. - - Excludes ID variables (person_id, household_id, etc.) because: - 1. They have formulas that generate sequential IDs (0, 1, 2, ...) - 2. We need the original H5 values, not regenerated sequences - 3. 
PolicyEngine's random() function uses entity IDs as seeds: - seed = abs(entity_id * 100 + count_random_calls) - If IDs change, random-dependent variables (SSI resource test, - WIC nutritional risk, WIC takeup) produce different results. + Includes variables with formulas, or adds/subtracts that are lists. + + Excludes: + 1. ID variables (person_id, household_id, etc.) - needed for random seeds + 2. Variables with string adds/subtracts (parameter paths) - these are + pseudo-inputs stored in H5 that would recalculate differently using + parameter lookups. Examples: pre_tax_contributions. + 3. Variables in input_variables (have stored H5 values) even if they + have formulas - the stored values represent original survey data + that should be preserved. Examples: cdcc_relevant_expenses, rent. + + The exclusions are critical because: + - The H5 file stores pre-computed values from original CPS processing + - If deleted, recalculation produces different values, corrupting + downstream calculations like income_tax """ exclude_ids = { "person_id", @@ -210,16 +216,36 @@ def get_calculated_variables(sim) -> List[str]: "family_id", "marital_unit_id", } - return [ - name - for name, var in sim.tax_benefit_system.variables.items() - if ( - var.formulas - or getattr(var, "adds", None) - or getattr(var, "subtracts", None) - ) - and name not in exclude_ids - ] + + # Get stored input variables to exclude + input_vars = set(sim.input_variables) + + result = [] + for name, var in sim.tax_benefit_system.variables.items(): + if name in exclude_ids: + continue + + # Exclude variables that have stored values (input_variables) + # These represent original survey data that should be preserved + if name in input_vars: + continue + + # Include if has formulas + if var.formulas: + result.append(name) + continue + + # Include if adds/subtracts is a list (explicit component aggregation) + # Exclude if adds/subtracts is a string (parameter path - pseudo-input) + adds = getattr(var, "adds", None) + subtracts = getattr(var, "subtracts", None) + + if adds and isinstance(adds, list): + result.append(name) + elif subtracts and isinstance(subtracts, list): + result.append(name) + + return result def get_pseudo_input_variables(sim) -> set: diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py index d8748014..b12629fb 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py @@ -38,6 +38,105 @@ def __init__( self.time_period = time_period self.cds_to_calibrate = cds_to_calibrate self.dataset_path = dataset_path + self._entity_rel_cache = None + + def _build_entity_relationship(self, sim) -> pd.DataFrame: + """ + Build entity relationship DataFrame mapping persons to all entity IDs. + + This is used to evaluate constraints at the person level and then + aggregate to household level, handling variables defined at different + entity levels (person, tax_unit, household, spm_unit). 
+ + Returns: + DataFrame with person_id, household_id, tax_unit_id, spm_unit_id + """ + if self._entity_rel_cache is not None: + return self._entity_rel_cache + + self._entity_rel_cache = pd.DataFrame( + { + "person_id": sim.calculate( + "person_id", map_to="person" + ).values, + "household_id": sim.calculate( + "household_id", map_to="person" + ).values, + "tax_unit_id": sim.calculate( + "tax_unit_id", map_to="person" + ).values, + "spm_unit_id": sim.calculate( + "spm_unit_id", map_to="person" + ).values, + } + ) + return self._entity_rel_cache + + def _evaluate_constraints_entity_aware( + self, state_sim, constraints: List[dict], n_households: int + ) -> np.ndarray: + """ + Evaluate non-geographic constraints at person level, aggregate to + household level using .any(). + + This properly handles constraints on variables defined at different + entity levels (e.g., tax_unit_is_filer at tax_unit level). Instead of + summing values at household level (which would give 2, 3, etc. for + households with multiple tax units), we evaluate at person level and + use .any() aggregation ("does this household have at least one person + satisfying all constraints?"). + + Args: + state_sim: Microsimulation with state_fips set + constraints: List of constraint dicts with variable, operation, + value keys (geographic constraints should be pre-filtered) + n_households: Number of households + + Returns: + Boolean mask array of length n_households + """ + if not constraints: + return np.ones(n_households, dtype=bool) + + entity_rel = self._build_entity_relationship(state_sim) + n_persons = len(entity_rel) + + person_mask = np.ones(n_persons, dtype=bool) + + for c in constraints: + var = c["variable"] + op = c["operation"] + val = c["value"] + + # Calculate constraint variable at person level + constraint_values = state_sim.calculate( + var, map_to="person" + ).values + + # Apply operation at person level + person_mask &= apply_op(constraint_values, op, val) + + # Aggregate to household level using .any() + # "At least one person in this household satisfies ALL constraints" + entity_rel_with_mask = entity_rel.copy() + entity_rel_with_mask["satisfies"] = person_mask + + household_mask_series = entity_rel_with_mask.groupby("household_id")[ + "satisfies" + ].any() + + # Ensure we return a mask aligned with household order + household_ids = state_sim.calculate( + "household_id", map_to="household" + ).values + household_mask = np.array( + [ + household_mask_series.get(hh_id, False) + for hh_id in household_ids + ] + ) + + return household_mask def _query_targets(self, target_filter: dict) -> pd.DataFrame: """Query targets based on filter criteria using OR logic.""" @@ -166,6 +265,9 @@ def build_matrix( cds_by_state[state].append((cd_idx, cd)) for state, cd_list in cds_by_state.items(): + # Clear entity relationship cache when creating new simulation + self._entity_rel_cache = None + if self.dataset_path: state_sim = self._create_state_sim(state, n_households) else: @@ -184,27 +286,43 @@ def build_matrix( for row_idx, (_, target) in enumerate(targets_df.iterrows()): constraints = self._get_constraints(target["stratum_id"]) - mask = np.ones(n_households, dtype=bool) + geo_constraints = [] + non_geo_constraints = [] for c in constraints: + if c["variable"] in ( + "state_fips", + "congressional_district_geoid", + ): + geo_constraints.append(c) + else: + non_geo_constraints.append(c) + + # Check geographic constraints first (quick fail) + geo_mask = np.ones(n_households, dtype=bool) + for c in geo_constraints: if 
c["variable"] == "congressional_district_geoid": if ( c["operation"] in ("==", "=") and c["value"] != cd ): - mask[:] = False + geo_mask[:] = False elif c["variable"] == "state_fips": if ( c["operation"] in ("==", "=") and int(c["value"]) != state ): - mask[:] = False - else: - values = state_sim.calculate( - c["variable"], map_to="household" - ).values - mask &= apply_op( - values, c["operation"], c["value"] - ) + geo_mask[:] = False + + if not geo_mask.any(): + continue + + # Evaluate non-geographic constraints at entity level + entity_mask = self._evaluate_constraints_entity_aware( + state_sim, non_geo_constraints, n_households + ) + + # Combine geographic and entity-aware masks + mask = geo_mask & entity_mask if not mask.any(): continue diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py index d4b5edc6..633b391f 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/conftest.py +++ b/policyengine_us_data/tests/test_local_area_calibration/conftest.py @@ -37,6 +37,22 @@ ("tanf", 1e-2), ("tip_income", 1e-2), ("unemployment_compensation", 1e-2), + ("income_tax", 1e-2), + ("income_tax", 1e-2), + ("qualified_business_income_deduction", 1e-2), + ("taxable_social_security", 1e-2), + ("taxable_pension_income", 1e-2), + ("taxable_ira_distributions", 1e-2), + ("taxable_interest_income", 1e-2), + ("tax_exempt_interest_income", 1e-2), + ("self_employment_income", 1e-2), + ("salt", 1e-2), + ("refundable_ctc", 1e-2), + ("real_estate_taxes", 1e-2), + ("qualified_dividend_income", 1e-2), + ("dividend_income", 1e-2), + ("adjusted_gross_income", 1e-2), + ("eitc", 1e-2), ] # Combined filter config to build matrix with all variables at once @@ -45,6 +61,20 @@ 4, # SNAP targets 5, # Medicaid targets 112, # Unemployment compensation targets + 117, # Income tax targets + 100, # QBID targets + 111, # Taxable social security targets + 114, # Taxable pension income targets + 105, # Taxable IRA distributions targets + 106, # Taxable interest income targets + 107, # Tax exempt interest income targets + 101, # Self-employment income targets + 116, # Salt targets + 115, # Refundable CTC targets + 103, # Real estate taxes targets + 109, # Qualified dividend income targets + 108, # Dividend income targets + 3, # Adjusted gross income targets ], "variables": [ "snap", @@ -60,6 +90,22 @@ "tanf", "tip_income", "unemployment_compensation", + "income_tax", + "income_tax", + "qualified_business_income_deduction", + "taxable_social_security", + "taxable_pension_income", + "taxable_ira_distributions", + "taxable_interest_income", + "tax_exempt_interest_income", + "self_employment_income", + "salt", + "refundable_ctc", + "real_estate_taxes", + "qualified_dividend_income", + "dividend_income", + "adjusted_gross_income", + "eitc", ], } @@ -67,7 +113,7 @@ MAX_MISMATCH_RATE = 0.02 # Number of samples for cell-level verification tests -N_VERIFICATION_SAMPLES = 200 +N_VERIFICATION_SAMPLES = 2000 @pytest.fixture(scope="module") From b5b1f1dd6faa4845c7e1e26f90d085109b32599c Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 22 Jan 2026 14:15:02 -0500 Subject: [PATCH 6/6] Reduce test scope for CI performance - Reduce VARIABLES_TO_TEST to 3 representative variables (snap, income_tax, eitc) - Reduce COMBINED_FILTER_CONFIG to minimal subset for fast CI runs - Reduce N_VERIFICATION_SAMPLES from 2000 to 500 - Revert test_cds to original 4 states (NC, HI, MT, AK) instead of 8 states Tests now complete in ~4 minutes 
instead of 3+ hours. Co-Authored-By: Claude Opus 4.5 --- .../test_local_area_calibration/conftest.py | 80 ++----------------- .../test_matrix_national_variation.py | 2 +- 2 files changed, 6 insertions(+), 76 deletions(-) diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py index 633b391f..7abcbafb 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/conftest.py +++ b/policyengine_us_data/tests/test_local_area_calibration/conftest.py @@ -19,92 +19,26 @@ get_calculated_variables, ) -# Variables to test for state-level value matching +# Variables to test for state-level value matching (CI uses subset for speed) # Format: (variable_name, rtol) # variable_name as per the targets in policy_data.db # rtol is relative tolerance for comparison VARIABLES_TO_TEST = [ ("snap", 1e-2), - ("health_insurance_premiums_without_medicare_part_b", 1e-2), - ("medicaid", 1e-2), - ("medicare_part_b_premiums", 1e-2), - ("other_medical_expenses", 1e-2), - ("over_the_counter_health_expenses", 1e-2), - ("salt_deduction", 1e-2), - ("spm_unit_capped_work_childcare_expenses", 1e-2), - ("spm_unit_capped_housing_subsidy", 1e-2), - ("ssi", 1e-2), - ("tanf", 1e-2), - ("tip_income", 1e-2), - ("unemployment_compensation", 1e-2), ("income_tax", 1e-2), - ("income_tax", 1e-2), - ("qualified_business_income_deduction", 1e-2), - ("taxable_social_security", 1e-2), - ("taxable_pension_income", 1e-2), - ("taxable_ira_distributions", 1e-2), - ("taxable_interest_income", 1e-2), - ("tax_exempt_interest_income", 1e-2), - ("self_employment_income", 1e-2), - ("salt", 1e-2), - ("refundable_ctc", 1e-2), - ("real_estate_taxes", 1e-2), - ("qualified_dividend_income", 1e-2), - ("dividend_income", 1e-2), - ("adjusted_gross_income", 1e-2), ("eitc", 1e-2), ] -# Combined filter config to build matrix with all variables at once +# CI filter config - minimal subset for fast CI runs +# Tests 3 representative variables covering benefits, taxes, and credits COMBINED_FILTER_CONFIG = { "stratum_group_ids": [ 4, # SNAP targets - 5, # Medicaid targets - 112, # Unemployment compensation targets 117, # Income tax targets - 100, # QBID targets - 111, # Taxable social security targets - 114, # Taxable pension income targets - 105, # Taxable IRA distributions targets - 106, # Taxable interest income targets - 107, # Tax exempt interest income targets - 101, # Self-employment income targets - 116, # Salt targets - 115, # Refundable CTC targets - 103, # Real estate taxes targets - 109, # Qualified dividend income targets - 108, # Dividend income targets - 3, # Adjusted gross income targets ], "variables": [ "snap", - "health_insurance_premiums_without_medicare_part_b", - "medicaid", - "medicare_part_b_premiums", - "other_medical_expenses", - "over_the_counter_health_expenses", - "salt_deduction", - "spm_unit_capped_work_childcare_expenses", - "spm_unit_capped_housing_subsidy", - "ssi", - "tanf", - "tip_income", - "unemployment_compensation", - "income_tax", "income_tax", - "qualified_business_income_deduction", - "taxable_social_security", - "taxable_pension_income", - "taxable_ira_distributions", - "taxable_interest_income", - "tax_exempt_interest_income", - "self_employment_income", - "salt", - "refundable_ctc", - "real_estate_taxes", - "qualified_dividend_income", - "dividend_income", - "adjusted_gross_income", "eitc", ], } @@ -113,7 +47,7 @@ MAX_MISMATCH_RATE = 0.02 # Number of samples for cell-level verification tests -N_VERIFICATION_SAMPLES = 2000 
+N_VERIFICATION_SAMPLES = 500 @pytest.fixture(scope="module") @@ -129,7 +63,7 @@ def dataset_path(): @pytest.fixture(scope="module") def test_cds(db_uri): - """CDs from multiple states for comprehensive testing.""" + """CDs from NC, HI, MT, AK (manageable size for CI, multiple same-state CDs).""" engine = create_engine(db_uri) query = """ SELECT DISTINCT sc.value as cd_geoid @@ -142,10 +76,6 @@ def test_cds(db_uri): OR sc.value LIKE '150_' OR sc.value LIKE '300_' OR sc.value = '200' OR sc.value = '201' - OR sc.value IN ('101', '102') - OR sc.value IN ('601', '602') - OR sc.value IN ('3601', '3602') - OR sc.value IN ('4801', '4802') ) ORDER BY sc.value """ diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py index 09cba3d1..b5950089 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py @@ -177,7 +177,7 @@ def test_national_targets_receive_multistate_contributions( min_states = stats_df["n_states"].min() # Check: on average, contributions should come from multiple states - # (at least 2, since we have CDs from 6 different states) + # (at least 2, since we have CDs from 4 different states) passed = avg_states >= 2 and min_states >= 1 results.append(
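For reference, the entity-aware constraint evaluation introduced in PATCH 5/6 reduces a person-level boolean mask to households with .any() rather than mapping constraint values to household level and summing. A toy pandas sketch of that aggregation is below; the entity layout and constraint results are invented for illustration and this is not the PolicyEngine API:

    import numpy as np
    import pandas as pd

    # Two households; household 1 contains two tax units.
    entity_rel = pd.DataFrame(
        {
            "person_id": [0, 1, 2, 3],
            "household_id": [0, 0, 1, 1],
            "tax_unit_id": [0, 0, 1, 2],
        }
    )

    # Person-level result of a constraint such as tax_unit_is_filer.
    satisfies = np.array([False, False, True, True])

    # "At least one person in the household satisfies all constraints."
    any_mask = (
        entity_rel.assign(satisfies=satisfies)
        .groupby("household_id")["satisfies"]
        .any()
    )
    print(any_mask.tolist())  # [False, True]

    # Summing instead counts qualifying persons per household (0 and 2 here),
    # which is how mapping such constraints straight to household level can
    # over-count households that contain multiple tax units.
    counts = (
        entity_rel.assign(satisfies=satisfies)
        .groupby("household_id")["satisfies"]
        .sum()
    )
    print(counts.tolist())  # [0, 2]

The boolean include/exclude mask is what the matrix builder needs when deciding which household columns contribute to a constrained target, which is why the .any() reduction was chosen over a household-level sum.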