From 569c5b3a1b36fb1b3418cd65bedddfc7b5eecc32 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 21 Jan 2026 20:38:01 +0530 Subject: [PATCH 1/6] adding test for matrix building logic --- changelog_entry.yaml | 6 +- .../sparse_matrix_builder.py | 20 +- .../test_sparse_matrix_builder.py | 876 ++++++++++++++++++ 3 files changed, 885 insertions(+), 17 deletions(-) create mode 100644 policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 0f82eb65..591f325d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ -- bump: patch +- bump: minor changes: - fixed: - - Versioning workflow checkout for push events + added: + - tests to verify SparseMatrixBuilder correctly calculates variables and constraints into the calibration matrix. diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py index 3af0a8d8..d8748014 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py @@ -199,20 +199,12 @@ def build_matrix( ): mask[:] = False else: - try: - values = state_sim.calculate( - c["variable"], map_to="household" - ).values - mask &= apply_op( - values, c["operation"], c["value"] - ) - except Exception as e: - # Variable may not exist or may not be - # calculable at household level - skip - logger.debug( - f"Could not evaluate constraint " - f"{c['variable']}: {e}" - ) + values = state_sim.calculate( + c["variable"], map_to="household" + ).values + mask &= apply_op( + values, c["operation"], c["value"] + ) if not mask.any(): continue diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py new file mode 100644 index 00000000..175488be --- /dev/null +++ b/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py @@ -0,0 +1,876 @@ +""" +Tests for sparse matrix builder correctness. + +These tests verify that: +1. Matrix shape and structure are correct +2. Matrix cell values match simulation calculations for households in their + original state +3. Variable aggregation (person to household) preserves totals +4. National-level targets receive contributions from all states (no geographic + bias) + +The key verification approach: +- When households are "borrowed" to different geographic areas, state_fips is + changed and variables are recalculated +- For households borrowed to CDs in their ORIGINAL state, the recalculated + values should match the original simulation values exactly (since state_fips + is unchanged) +- This provides a ground-truth verification without needing end-to-end H5 + creation + +IMPORTANT NOTE on stochastic eligibility: +Some variables like SNAP have eligibility tests that use PolicyEngine's +random() function. When variables are recalculated in the matrix builder (via +fresh simulations), the random seed sequence may differ, causing ~1-3% of +households to have different eligibility outcomes. This is expected behavior, +so tests allow up to 2% mismatch rate for such variables. 
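+
+Matrix layout assumed throughout these tests: one row per calibration target
+and one column per (CD, household) pair, so a household's value within the
+column block for CD index ``cd_idx`` sits at column
+``cd_idx * n_households + hh_idx``.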
+""" + +import pytest +import numpy as np +import pandas as pd +from policyengine_us import Microsimulation +from policyengine_us_data.storage import STORAGE_FOLDER +from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( + SparseMatrixBuilder, +) + + +# ============================================================================= +# CONFIGURATION - Update these lists as new variables are added +# ============================================================================= + +# Variables to test for state-level value matching +# Format: (variable_name, rtol) - rtol is relative tolerance for comparison +VARIABLES_TO_TEST = [ + ("snap", 1e-2), + ("health_insurance_premiums_without_medicare_part_b", 1e-2), + ("medicaid", 1e-2), + ("medicare_part_b_premiums", 1e-2), + ("other_medical_expenses", 1e-2), + ("over_the_counter_health_expenses", 1e-2), + ("salt_deduction", 1e-2), + ("spm_unit_capped_work_childcare_expenses", 1e-2), + ("spm_unit_capped_housing_subsidy", 1e-2), + ("ssi", 1e-2), + ("tanf", 1e-2), + ("tip_income", 1e-2), + ("unemployment_compensation", 1e-2), +] + +# Combined filter config to build matrix with all variables at once +COMBINED_FILTER_CONFIG = { + "stratum_group_ids": [ + 4, # SNAP targets + 5, # Medicaid targets + 112, # Unemployment compensation targets + ], + "variables": [ + "snap", + "health_insurance_premiums_without_medicare_part_b", + "medicaid", + "medicare_part_b_premiums", + "other_medical_expenses", + "over_the_counter_health_expenses", + "salt_deduction", + "spm_unit_capped_work_childcare_expenses", + "spm_unit_capped_housing_subsidy", + "ssi", + "tanf", + "tip_income", + "unemployment_compensation", + ], +} + +VARIABLES_WITH_STATE_VARIATION = [ + "snap", +] + +# Complications: +# (snap) +# (unemployment_compensation) +# income_tax +# qualified_business_income_deduction +# taxable_social_security +# taxable_pension_income +# taxable_ira_distributions +# taxable_interest_income +# tax_exempt_interest_income +# self_employment_income +# salt +# refundable_ctc +# real_estate_taxes +# qualified_dividend_income +# dividend_income +# adjusted_gross_income +# eitc + +# Maximum allowed mismatch rate for state-level value comparison +MAX_MISMATCH_RATE = 0.02 + + +# ============================================================================= +# FIXTURES +# ============================================================================= + + +@pytest.fixture(scope="module") +def db_uri(): + """Database URI for calibration targets.""" + db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" + return f"sqlite:///{db_path}" + + +@pytest.fixture(scope="module") +def dataset_path(): + """Path to stratified extended CPS dataset.""" + return str(STORAGE_FOLDER / "stratified_extended_cps_2023.h5") + + +@pytest.fixture(scope="module") +def sim(dataset_path): + """Base simulation loaded from stratified CPS.""" + return Microsimulation(dataset=dataset_path) + + +@pytest.fixture(scope="module") +def test_cds(): + """ + Test CDs spanning multiple states for comprehensive testing. 
+ + Selected to include: + - Small states (1-2 CDs): AL, MT + - Medium states: NC + - Large states: CA, TX, NY + """ + return [ + "101", # Alabama CD-1 (state_fips=1) + "102", # Alabama CD-2 + "601", # California CD-1 (state_fips=6) + "602", # California CD-2 + "3001", # Montana CD-1 (state_fips=30) + "3002", # Montana CD-2 + "3701", # North Carolina CD-1 (state_fips=37) + "3702", # North Carolina CD-2 + "3601", # New York CD-1 (state_fips=36) + "3602", # New York CD-2 + "4801", # Texas CD-1 (state_fips=48) + "4802", # Texas CD-2 + ] + + +@pytest.fixture(scope="module") +def builder(db_uri, dataset_path, test_cds): + """SparseMatrixBuilder configured with test CDs.""" + return SparseMatrixBuilder( + db_uri=db_uri, + time_period=2023, + cds_to_calibrate=test_cds, + dataset_path=dataset_path, + ) + + +@pytest.fixture(scope="module") +def combined_matrix_data(sim, builder): + """ + Build matrix once with all configured variables. + + This fixture is used by the consolidated test to avoid rebuilding + the matrix for each variable. + """ + targets_df, X_sparse, hh_mapping = builder.build_matrix( + sim, + target_filter=COMBINED_FILTER_CONFIG, + ) + + household_ids = sim.calculate("household_id", map_to="household").values + state_fips = sim.calculate("state_fips", map_to="household").values + + return { + "targets_df": targets_df, + "X_sparse": X_sparse, + "hh_mapping": hh_mapping, + "household_ids": household_ids, + "state_fips": state_fips, + "cds": builder.cds_to_calibrate, + "n_households": len(household_ids), + } + + +# ============================================================================= +# HELPER FUNCTIONS +# ============================================================================= + + +def _get_geo_level(geo_id) -> str: + """Determine geographic level from geographic_id.""" + if geo_id == "US": + return "national" + try: + val = int(geo_id) + if 1 <= val <= 56: + return "state" + else: + return "district" + except (ValueError, TypeError): + return "unknown" + + +def _verify_state_level_values( + X_sparse, + targets_df, + original_values, + original_state_fips, + cds, + n_households, + variable_name, + rtol=1e-2, +): + """ + Verify that matrix values match original values for households in their + original state. 
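+
+    For example, a household whose original state_fips is 37 that is borrowed
+    to CD 3701 or 3702 keeps state_fips == 37, so its recalculated value
+    should match the original simulation value (up to the given rtol).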
+ + Returns: + Tuple of (verified_count, mismatches_list, skipped_reason or None) + """ + # Get state-level targets + state_targets = targets_df[ + (targets_df["variable"] == variable_name) + & (targets_df["geographic_id"].apply(lambda x: str(x).isdigit())) + & ( + targets_df["geographic_id"].apply( + lambda x: 1 <= int(x) <= 56 if str(x).isdigit() else False + ) + ) + ] + + if len(state_targets) == 0: + return 0, [], f"No state-level targets for {variable_name}" + + mismatches = [] + verified_count = 0 + + for _, target_row in state_targets.iterrows(): + target_state = int(target_row["geographic_id"]) + row_idx = target_row.name + + # Find all CDs in this state + state_cds = [ + (cd_idx, cd) + for cd_idx, cd in enumerate(cds) + if int(cd) // 100 == target_state + ] + + if not state_cds: + continue + + # Find households originally from this state + hh_from_state_mask = original_state_fips == target_state + hh_indices_from_state = np.where(hh_from_state_mask)[0] + + if len(hh_indices_from_state) == 0: + continue + + # For each CD in the state, check matrix values + for cd_idx, cd in state_cds: + col_start = cd_idx * n_households + + for hh_idx in hh_indices_from_state: + col_idx = col_start + hh_idx + matrix_val = X_sparse[row_idx, col_idx] + original_val = original_values[hh_idx] + + if original_val == 0 and matrix_val == 0: + verified_count += 1 + continue + + if original_val != 0: + rel_diff = abs(matrix_val - original_val) / abs( + original_val + ) + if rel_diff > rtol: + mismatches.append( + { + "variable": variable_name, + "state": target_state, + "cd": cd, + "hh_idx": hh_idx, + "matrix_val": float(matrix_val), + "original_val": float(original_val), + "rel_diff": rel_diff, + } + ) + else: + verified_count += 1 + elif matrix_val != 0: + mismatches.append( + { + "variable": variable_name, + "state": target_state, + "cd": cd, + "hh_idx": hh_idx, + "matrix_val": float(matrix_val), + "original_val": float(original_val), + "rel_diff": float("inf"), + } + ) + + return verified_count, mismatches, None + + +# ============================================================================= +# BASIC STRUCTURE TESTS +# ============================================================================= + + +def test_person_level_aggregation_preserves_totals(sim): + """Health insurance premiums (person-level) sum correctly to household.""" + var = "health_insurance_premiums_without_medicare_part_b" + person_total = sim.calculate(var, 2023, map_to="person").values.sum() + household_total = sim.calculate(var, 2023, map_to="household").values.sum() + assert np.isclose(person_total, household_total, rtol=1e-6) + + +def test_matrix_shape(sim, builder): + """Matrix should have (n_targets, n_households * n_cds) shape.""" + targets_df, X_sparse, _ = builder.build_matrix( + sim, + target_filter={ + "variables": ["health_insurance_premiums_without_medicare_part_b"] + }, + ) + n_households = len( + sim.calculate("household_id", map_to="household").values + ) + n_cds = len(builder.cds_to_calibrate) + assert X_sparse.shape[1] == n_households * n_cds + + +def test_combined_variables_in_matrix(sim, builder): + """Matrix should include all configured variables.""" + targets_df, X_sparse, _ = builder.build_matrix( + sim, + target_filter=COMBINED_FILTER_CONFIG, + ) + variables = targets_df["variable"].unique() + + for var_name, _ in VARIABLES_TO_TEST: + assert var_name in variables, f"Missing variable: {var_name}" + + +# ============================================================================= +# CONSOLIDATED STATE-LEVEL 
VALUE TEST +# ============================================================================= + + +class TestStateLevelValues: + """ + Consolidated test for verifying matrix values match original simulation + values for households in their original state. + + Builds matrix once and iterates through all configured variables. + """ + + def test_all_variables_state_level_match(self, sim, combined_matrix_data): + """ + Verify all configured variables have correct state-level values. + + For each variable: + 1. Calculate original values from simulation + 2. Compare to matrix values for households in their original state + 3. Allow up to MAX_MISMATCH_RATE due to stochastic eligibility + """ + results = [] + all_mismatches = [] + + for variable_name, rtol in VARIABLES_TO_TEST: + # Calculate original values for this variable + original_values = sim.calculate( + variable_name, map_to="household" + ).values + + verified, mismatches, skip_reason = _verify_state_level_values( + X_sparse=combined_matrix_data["X_sparse"], + targets_df=combined_matrix_data["targets_df"], + original_values=original_values, + original_state_fips=combined_matrix_data["state_fips"], + cds=combined_matrix_data["cds"], + n_households=combined_matrix_data["n_households"], + variable_name=variable_name, + rtol=rtol, + ) + + total_checked = verified + len(mismatches) + mismatch_rate = ( + len(mismatches) / total_checked if total_checked > 0 else 0 + ) + + results.append( + { + "variable": variable_name, + "verified": verified, + "mismatches": len(mismatches), + "total": total_checked, + "mismatch_rate": mismatch_rate, + "skip_reason": skip_reason, + "passed": ( + skip_reason is not None + or mismatch_rate <= MAX_MISMATCH_RATE + ), + } + ) + + all_mismatches.extend(mismatches) + + # Print summary + print("\n" + "=" * 70) + print("STATE-LEVEL VALUE VERIFICATION SUMMARY") + print("=" * 70) + + results_df = pd.DataFrame(results) + for _, row in results_df.iterrows(): + if row["skip_reason"]: + status = f"SKIPPED: {row['skip_reason']}" + elif row["passed"]: + status = ( + f"PASSED: {row['verified']:,} verified, " + f"{row['mismatch_rate']:.1%} mismatch rate" + ) + else: + status = ( + f"FAILED: {row['mismatches']:,} mismatches, " + f"{row['mismatch_rate']:.1%} > {MAX_MISMATCH_RATE:.0%}" + ) + print(f" {row['variable']}: {status}") + + # Show sample mismatches if any + if all_mismatches: + print(f"\nSample mismatches ({len(all_mismatches)} total):") + mismatch_df = pd.DataFrame(all_mismatches) + print(mismatch_df.head(15).to_string()) + + mismatch_df.to_csv("state_level_mismatches.csv", index=False) + + # Assert all variables passed + failed = [r for r in results if not r["passed"]] + assert len(failed) == 0, ( + f"{len(failed)} variable(s) failed state-level verification: " + f"{[r['variable'] for r in failed]}" + ) + + +# ============================================================================= +# NATIONAL-LEVEL CONTRIBUTION TEST +# ============================================================================= + + +class TestNationalLevelContributions: + """ + Tests verifying that national-level targets receive contributions from + households across all states, not just a geographic subset. + + The key insight: for a national target, when we look at a single CD's + column block, households from ALL original states should potentially + contribute (subject to meeting eligibility constraints). There should + be no systematic geographic bias where only households from certain + states contribute to the national total. 
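+
+    Concretely: within the Alabama CD-1 (``101``) column block of a national
+    target row, non-zero entries are expected from households whose original
+    state is not Alabama as well, so that, averaged over CD blocks,
+    contributions come from at least two original states.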
+ """ + + def test_national_targets_receive_multistate_contributions( + self, sim, combined_matrix_data + ): + """ + Verify that national-level targets have contributions from households + originally from multiple states. + + For each national target: + 1. Look at the matrix row + 2. For EACH CD's column block, identify which original states have + non-zero contributions + 3. Verify contributions come from multiple states (not geographically + biased) + """ + targets_df = combined_matrix_data["targets_df"] + X_sparse = combined_matrix_data["X_sparse"] + state_fips = combined_matrix_data["state_fips"] + n_households = combined_matrix_data["n_households"] + cds = combined_matrix_data["cds"] + + # Find national-level targets + national_targets = targets_df[ + targets_df["geographic_id"].apply( + lambda x: _get_geo_level(x) == "national" + ) + ] + + if len(national_targets) == 0: + pytest.skip("No national-level targets found") + + results = [] + + for _, target in national_targets.iterrows(): + row_idx = target.name + variable = target["variable"] + row = X_sparse[row_idx, :].toarray().flatten() + + # For each CD block, check which original states contribute + cd_contribution_stats = [] + + for cd_idx, cd in enumerate(cds): + col_start = cd_idx * n_households + col_end = col_start + n_households + cd_values = row[col_start:col_end] + + # Find households with non-zero values in this CD block + nonzero_mask = cd_values != 0 + nonzero_indices = np.where(nonzero_mask)[0] + + if len(nonzero_indices) == 0: + continue + + # Get original states of contributing households + contributing_states = set(state_fips[nonzero_indices]) + + cd_contribution_stats.append( + { + "cd": cd, + "cd_state": int(cd) // 100, + "n_contributing": len(nonzero_indices), + "n_states": len(contributing_states), + "contributing_states": contributing_states, + } + ) + + if not cd_contribution_stats: + results.append( + { + "variable": variable, + "status": "NO_CONTRIBUTIONS", + "details": "No non-zero values in any CD block", + } + ) + continue + + # Aggregate stats + stats_df = pd.DataFrame(cd_contribution_stats) + avg_states = stats_df["n_states"].mean() + min_states = stats_df["n_states"].min() + + # Check: on average, contributions should come from multiple states + # (at least 2, since we have CDs from 6 different states) + passed = avg_states >= 2 and min_states >= 1 + + results.append( + { + "variable": variable, + "status": "PASSED" if passed else "FAILED", + "avg_contributing_states": avg_states, + "min_contributing_states": min_states, + "n_cd_blocks_with_data": len(stats_df), + } + ) + + # Assert no geographic bias + failed = [r for r in results if r["status"] == "FAILED"] + assert len(failed) == 0, ( + f"Geographic bias detected in national targets: " + f"{[r['variable'] for r in failed]}" + ) + + def test_state_distribution_in_national_targets( + self, sim, combined_matrix_data + ): + """ + Verify the distribution of contributing states in national targets + roughly matches the original data distribution. + + This catches cases where one state dominates the contributions + disproportionately. 
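+
+        Worked example of the bound used below: if the dominant state holds
+        30% of households in the original data, its share of contributions to
+        a national target may not exceed min(0.7, 0.3 + 0.2) = 50%.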
+ """ + targets_df = combined_matrix_data["targets_df"] + X_sparse = combined_matrix_data["X_sparse"] + state_fips = combined_matrix_data["state_fips"] + n_households = combined_matrix_data["n_households"] + cds = combined_matrix_data["cds"] + + # Get original state distribution (count of households per state) + unique_states, original_counts = np.unique( + state_fips, return_counts=True + ) + original_dist = dict(zip(unique_states, original_counts)) + total_hh = len(state_fips) + + # Find national-level targets + national_targets = targets_df[ + targets_df["geographic_id"].apply( + lambda x: _get_geo_level(x) == "national" + ) + ] + + if len(national_targets) == 0: + pytest.skip("No national-level targets found") + + for _, target in national_targets.iterrows(): + row_idx = target.name + variable = target["variable"] + row = X_sparse[row_idx, :].toarray().flatten() + + # Count contributions by original state across ALL CD blocks + state_contribution_counts = {} + + for cd_idx, cd in enumerate(cds): + col_start = cd_idx * n_households + col_end = col_start + n_households + cd_values = row[col_start:col_end] + + nonzero_mask = cd_values != 0 + nonzero_indices = np.where(nonzero_mask)[0] + + for hh_idx in nonzero_indices: + orig_state = state_fips[hh_idx] + state_contribution_counts[orig_state] = ( + state_contribution_counts.get(orig_state, 0) + 1 + ) + + if not state_contribution_counts: + continue + + # Check that no single state dominates excessively + total_contributions = sum(state_contribution_counts.values()) + max_contribution = max(state_contribution_counts.values()) + max_state = max( + state_contribution_counts, key=state_contribution_counts.get + ) + max_share = max_contribution / total_contributions + + # The max share should not exceed 70% (unless that state has 70%+ + # of households in the original data) + original_max_share = original_dist.get(max_state, 0) / total_hh + + # Allow 20% margin above original share + threshold = min(0.7, original_max_share + 0.2) + + assert max_share <= threshold, ( + f"State {max_state} dominates national {variable} target with " + f"{max_share:.1%} of contributions " + f"(original share: {original_max_share:.1%})" + ) + + +# ============================================================================= +# CROSS-STATE RECALCULATION TEST +# ============================================================================= + + +class TestCrossStateRecalculation: + """ + Tests verifying that household values change when borrowed to different + states, confirming state-specific rules are being applied. + + The key insight: for national-level targets (no state constraint), each + household appears in every CD block. The value in each CD block represents + what the variable would be if that household lived in that CD's state. + For state-dependent variables (like SNAP), values should differ across + states for at least some households. + """ + + def test_values_change_across_states_for_national_targets( + self, combined_matrix_data + ): + """ + Verify that for national targets, household values vary across CD + blocks from different states. + + This confirms the matrix builder is correctly recalculating variables + with state-specific rules when households are "borrowed" to different + geographic areas. + + The test checks: + 1. For each national target, examine households with non-zero values + 2. Compare each household's value across CD blocks from different states + 3. 
At least some households should have different values in different + states (confirming recalculation with different state rules) + """ + targets_df = combined_matrix_data["targets_df"] + X_sparse = combined_matrix_data["X_sparse"] + n_households = combined_matrix_data["n_households"] + cds = combined_matrix_data["cds"] + + # Group CDs by state + cds_by_state = {} + for cd_idx, cd in enumerate(cds): + state = int(cd) // 100 + if state not in cds_by_state: + cds_by_state[state] = [] + cds_by_state[state].append((cd_idx, cd)) + + states = list(cds_by_state.keys()) + if len(states) < 2: + pytest.skip("Need at least 2 states to test cross-state variation") + + # Find national-level targets + national_targets = targets_df[ + targets_df["geographic_id"].apply( + lambda x: _get_geo_level(x) == "national" + ) + ] + + if len(national_targets) == 0: + pytest.skip("No national-level targets found") + + results = [] + + for _, target in national_targets.iterrows(): + if target["variable"] not in VARIABLES_WITH_STATE_VARIATION: + continue + row_idx = target.name + variable = target["variable"] + row = X_sparse[row_idx, :].toarray().flatten() + + # For each household, collect values from different states + households_with_variation = 0 + households_checked = 0 + + # Sample households (check every 10th to keep test fast) + for hh_idx in range(0, n_households, 10): + # Get this household's value in each state (use first CD of + # each state) + state_values = {} + for state, cd_list in cds_by_state.items(): + cd_idx, _ = cd_list[0] # First CD in this state + col_idx = cd_idx * n_households + hh_idx + state_values[state] = row[col_idx] + + # Skip if all values are zero (household doesn't qualify for + # this variable) + nonzero_values = [v for v in state_values.values() if v != 0] + if len(nonzero_values) < 2: + continue + + households_checked += 1 + + # Check if values differ across states + unique_values = set(nonzero_values) + if len(unique_values) > 1: + households_with_variation += 1 + + variation_rate = ( + households_with_variation / households_checked + if households_checked > 0 + else 0 + ) + + results.append( + { + "variable": variable, + "households_checked": households_checked, + "households_with_variation": households_with_variation, + "variation_rate": variation_rate, + } + ) + + # For state-dependent variables, we expect SOME variation + # (not all households will vary - some may have $0 or max benefits + # regardless of state) + # The key is that variation exists, confirming recalculation occurs + for r in results: + if r["households_checked"] > 0: + # At least 10% of households should show variation for + # state-dependent variables + assert ( + r["variation_rate"] > 0.1 or r["households_checked"] < 10 + ), ( + f"No cross-state variation found for {r['variable']}. " + f"This suggests state-specific rules may not be applied " + f"when households are borrowed to different states." + ) + + def test_same_household_different_states_shows_rule_changes( + self, combined_matrix_data + ): + """ + Deep dive test: pick specific households and verify their values + differ across states in a way consistent with state-specific rules. + + For SNAP specifically, different states have different: + - Standard deductions + - Shelter deduction caps + - Vehicle allowances + - Categorical eligibility rules + + This test finds households where we can verify the recalculation + is applying different state rules. 
+ """ + targets_df = combined_matrix_data["targets_df"] + X_sparse = combined_matrix_data["X_sparse"] + n_households = combined_matrix_data["n_households"] + cds = combined_matrix_data["cds"] + state_fips_orig = combined_matrix_data["state_fips"] + + # Group CDs by state + cds_by_state = {} + for cd_idx, cd in enumerate(cds): + state = int(cd) // 100 + if state not in cds_by_state: + cds_by_state[state] = [] + cds_by_state[state].append((cd_idx, cd)) + + states = sorted(cds_by_state.keys()) + if len(states) < 2: + pytest.skip("Need at least 2 states") + + # Find national SNAP target (most state-dependent) + snap_national = targets_df[ + (targets_df["variable"] == "snap") + & ( + targets_df["geographic_id"].apply( + lambda x: _get_geo_level(x) == "national" + ) + ) + ] + + if len(snap_national) == 0: + pytest.skip("No national SNAP target found") + + row_idx = snap_national.iloc[0].name + row = X_sparse[row_idx, :].toarray().flatten() + + # Find households with interesting variation patterns + example_households = [] + + for hh_idx in range(n_households): + state_values = {} + for state, cd_list in cds_by_state.items(): + cd_idx, _ = cd_list[0] + col_idx = cd_idx * n_households + hh_idx + state_values[state] = row[col_idx] + + # Look for households where: + # 1. At least 2 states have non-zero SNAP + # 2. The values differ significantly (>10% relative difference) + nonzero_states = {s: v for s, v in state_values.items() if v > 0} + + if len(nonzero_states) >= 2: + values = list(nonzero_states.values()) + max_val = max(values) + min_val = min(values) + if min_val > 0 and (max_val - min_val) / min_val > 0.1: + example_households.append( + { + "hh_idx": hh_idx, + "original_state": state_fips_orig[hh_idx], + "state_values": nonzero_states, + "max_val": max_val, + "min_val": min_val, + "variation": (max_val - min_val) / min_val, + } + ) + + if len(example_households) >= 5: + break + + # Assert we found at least one household with variation + assert len(example_households) > 0, ( + "Expected to find households with >10% SNAP variation across " + "states, confirming state-specific rules are applied" + ) From fe70932ae8f714a8e86696cb3bb8d67b0e8045f2 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 21 Jan 2026 20:47:13 +0530 Subject: [PATCH 2/6] lint after updating black --- policyengine_us_data/datasets/cps/cps.py | 1 - policyengine_us_data/datasets/cps/enhanced_cps.py | 1 - .../datasets/cps/local_area_calibration/calibration_utils.py | 1 - .../datasets/cps/local_area_calibration/matrix_tracer.py | 1 - policyengine_us_data/datasets/puf/puf.py | 1 - policyengine_us_data/datasets/puf/uprate_puf.py | 1 - policyengine_us_data/db/create_database_tables.py | 1 - policyengine_us_data/db/etl_age.py | 1 - policyengine_us_data/db/etl_irs_soi.py | 1 - policyengine_us_data/db/validate_database.py | 1 - .../storage/calibration_targets/pull_snap_targets.py | 1 - policyengine_us_data/tests/test_datasets/test_county_fips.py | 1 - .../test_local_area_calibration/test_sparse_matrix_builder.py | 1 - policyengine_us_data/utils/census.py | 1 - policyengine_us_data/utils/huggingface.py | 1 - policyengine_us_data/utils/loss.py | 1 - policyengine_us_data/utils/spm.py | 1 - tests/test_h6_reform.py | 1 - 18 files changed, 18 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 19ee9249..27a41bec 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -15,7 +15,6 @@ from microimpute.models.qrf import QRF import 
logging - test_lite = os.environ.get("TEST_LITE") == "true" print(f"TEST_LITE == {test_lite}") diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 8bbe67bc..4eb0a660 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -22,7 +22,6 @@ from pathlib import Path import logging - try: import torch except ImportError: diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index c2e2a08f..f01465a2 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -17,7 +17,6 @@ StateCode, ) - # State/Geographic Mappings STATE_CODES = { 1: "AL", diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py b/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py index e7cbf57b..4823de1e 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py @@ -46,7 +46,6 @@ create_target_groups, ) - logger = logging.getLogger(__name__) diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 9d605aca..c90255e3 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -15,7 +15,6 @@ create_policyengine_uprating_factors_table, ) - rng = np.random.default_rng(seed=64) # Get Qualified Business Income simulation parameters --- diff --git a/policyengine_us_data/datasets/puf/uprate_puf.py b/policyengine_us_data/datasets/puf/uprate_puf.py index 1cf0eb9c..96144615 100644 --- a/policyengine_us_data/datasets/puf/uprate_puf.py +++ b/policyengine_us_data/datasets/puf/uprate_puf.py @@ -2,7 +2,6 @@ import numpy as np from policyengine_us_data.storage import STORAGE_FOLDER - ITMDED_GROW_RATE = 0.02 # annual growth rate in itemized deduction amounts USE_VARIABLE_SPECIFIC_POPULATION_GROWTH_DIVISORS = False diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index df03772d..920d1449 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -15,7 +15,6 @@ from policyengine_us_data.storage import STORAGE_FOLDER - logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", diff --git a/policyengine_us_data/db/etl_age.py b/policyengine_us_data/db/etl_age.py index bb83067c..d80faf06 100644 --- a/policyengine_us_data/db/etl_age.py +++ b/policyengine_us_data/db/etl_age.py @@ -11,7 +11,6 @@ ) from policyengine_us_data.utils.census import get_census_docs, pull_acs_table - LABEL_TO_SHORT = { "Estimate!!Total!!Total population!!AGE!!Under 5 years": "0-4", "Estimate!!Total!!Total population!!AGE!!5 to 9 years": "5-9", diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 786abb1c..6607a5dd 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -24,7 +24,6 @@ get_district_mapping, ) - """See the 22incddocguide.docx manual from the IRS SOI""" # Let's make this work with strict inequalities # Language in the doc: '$10,000 under $25,000' diff --git 
a/policyengine_us_data/db/validate_database.py b/policyengine_us_data/db/validate_database.py index fee6a49d..53ac0985 100644 --- a/policyengine_us_data/db/validate_database.py +++ b/policyengine_us_data/db/validate_database.py @@ -9,7 +9,6 @@ import pandas as pd from policyengine_us.system import system - conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db") stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn) diff --git a/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py b/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py index 349e6fbd..1830bdb3 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_snap_targets.py @@ -9,7 +9,6 @@ STATE_NAME_TO_ABBREV, ) - STATE_NAME_TO_FIPS = { "Alabama": "01", "Alaska": "02", diff --git a/policyengine_us_data/tests/test_datasets/test_county_fips.py b/policyengine_us_data/tests/test_datasets/test_county_fips.py index ad1f10c5..d692cf55 100644 --- a/policyengine_us_data/tests/test_datasets/test_county_fips.py +++ b/policyengine_us_data/tests/test_datasets/test_county_fips.py @@ -10,7 +10,6 @@ LOCAL_FOLDER, ) - # Sample data that mimics the format from census.gov SAMPLE_CENSUS_DATA = """STATE|STATEFP|COUNTYFP|COUNTYNAME AL|01|001|Autauga County diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py index 175488be..76309388 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py @@ -35,7 +35,6 @@ SparseMatrixBuilder, ) - # ============================================================================= # CONFIGURATION - Update these lists as new variables are added # ============================================================================= diff --git a/policyengine_us_data/utils/census.py b/policyengine_us_data/utils/census.py index 2f424ccb..8081b616 100644 --- a/policyengine_us_data/utils/census.py +++ b/policyengine_us_data/utils/census.py @@ -4,7 +4,6 @@ import pandas as pd import numpy as np - STATE_NAME_TO_FIPS = { "Alabama": "01", "Alaska": "02", diff --git a/policyengine_us_data/utils/huggingface.py b/policyengine_us_data/utils/huggingface.py index 2860adf3..a312b524 100644 --- a/policyengine_us_data/utils/huggingface.py +++ b/policyengine_us_data/utils/huggingface.py @@ -1,7 +1,6 @@ from huggingface_hub import hf_hub_download, login, HfApi import os - TOKEN = os.environ.get("HUGGING_FACE_TOKEN") if not TOKEN: raise ValueError( diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index cbea6dab..e368d504 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -9,7 +9,6 @@ from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi - # CPS-derived statistics # Medical expenses, sum of spm thresholds # Child support expenses diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index 070db533..b2e4538b 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -3,7 +3,6 @@ import numpy as np from spm_calculator import SPMCalculator, spm_equivalence_scale - TENURE_CODE_MAP = { 1: "owner_with_mortgage", 2: "owner_without_mortgage", diff --git 
a/tests/test_h6_reform.py b/tests/test_h6_reform.py index 7253ed97..e68ed8db 100644 --- a/tests/test_h6_reform.py +++ b/tests/test_h6_reform.py @@ -11,7 +11,6 @@ import pytest - # Constants from the H6 reform implementation HI_SINGLE = 34_000 HI_JOINT = 44_000 From 653415139cd7fffd966d22d1488db30dcfe9107f Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 22 Jan 2026 14:39:20 +0530 Subject: [PATCH 3/6] remove redundant tests and fixtures --- .../test_local_area_calibration/conftest.py | 65 ++- .../test_cross_state.py | 58 ++- ...r.py => test_matrix_national_variation.py} | 436 +----------------- .../test_same_state.py | 59 ++- 4 files changed, 172 insertions(+), 446 deletions(-) rename policyengine_us_data/tests/test_local_area_calibration/{test_sparse_matrix_builder.py => test_matrix_national_variation.py} (53%) diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py index 04d6d7f5..d4b5edc6 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/conftest.py +++ b/policyengine_us_data/tests/test_local_area_calibration/conftest.py @@ -1,4 +1,7 @@ -"""Shared fixtures for local area calibration tests.""" +"""Shared fixtures for local area calibration tests. + +Importantly, this file determines which variables will be included in the sparse matrix and calibrating routine. +""" import pytest import numpy as np @@ -16,6 +19,56 @@ get_calculated_variables, ) +# Variables to test for state-level value matching +# Format: (variable_name, rtol) +# variable_name as per the targets in policy_data.db +# rtol is relative tolerance for comparison +VARIABLES_TO_TEST = [ + ("snap", 1e-2), + ("health_insurance_premiums_without_medicare_part_b", 1e-2), + ("medicaid", 1e-2), + ("medicare_part_b_premiums", 1e-2), + ("other_medical_expenses", 1e-2), + ("over_the_counter_health_expenses", 1e-2), + ("salt_deduction", 1e-2), + ("spm_unit_capped_work_childcare_expenses", 1e-2), + ("spm_unit_capped_housing_subsidy", 1e-2), + ("ssi", 1e-2), + ("tanf", 1e-2), + ("tip_income", 1e-2), + ("unemployment_compensation", 1e-2), +] + +# Combined filter config to build matrix with all variables at once +COMBINED_FILTER_CONFIG = { + "stratum_group_ids": [ + 4, # SNAP targets + 5, # Medicaid targets + 112, # Unemployment compensation targets + ], + "variables": [ + "snap", + "health_insurance_premiums_without_medicare_part_b", + "medicaid", + "medicare_part_b_premiums", + "other_medical_expenses", + "over_the_counter_health_expenses", + "salt_deduction", + "spm_unit_capped_work_childcare_expenses", + "spm_unit_capped_housing_subsidy", + "ssi", + "tanf", + "tip_income", + "unemployment_compensation", + ], +} + +# Maximum allowed mismatch rate for state-level value comparison +MAX_MISMATCH_RATE = 0.02 + +# Number of samples for cell-level verification tests +N_VERIFICATION_SAMPLES = 200 + @pytest.fixture(scope="module") def db_uri(): @@ -30,7 +83,7 @@ def dataset_path(): @pytest.fixture(scope="module") def test_cds(db_uri): - """CDs from NC, HI, MT, AK (manageable size, multiple same-state CDs).""" + """CDs from multiple states for comprehensive testing.""" engine = create_engine(db_uri) query = """ SELECT DISTINCT sc.value as cd_geoid @@ -43,6 +96,10 @@ def test_cds(db_uri): OR sc.value LIKE '150_' OR sc.value LIKE '300_' OR sc.value = '200' OR sc.value = '201' + OR sc.value IN ('101', '102') + OR sc.value IN ('601', '602') + OR sc.value IN ('3601', '3602') + OR sc.value IN ('4801', '4802') ) ORDER BY sc.value """ @@ -58,7 
+115,7 @@ def sim(dataset_path): @pytest.fixture(scope="module") def matrix_data(db_uri, dataset_path, test_cds, sim): - """Build sparse matrix, return (targets_df, X_sparse, household_id_mapping).""" + """Build sparse matrix with all configured variables.""" builder = SparseMatrixBuilder( db_uri, time_period=2023, @@ -66,7 +123,7 @@ def matrix_data(db_uri, dataset_path, test_cds, sim): dataset_path=dataset_path, ) targets_df, X_sparse, household_id_mapping = builder.build_matrix( - sim, target_filter={"stratum_group_ids": [4], "variables": ["snap"]} + sim, target_filter=COMBINED_FILTER_CONFIG ) return targets_df, X_sparse, household_id_mapping diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py index ea9eca6f..f3615e30 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py @@ -2,17 +2,19 @@ import pytest import numpy as np +from collections import defaultdict from policyengine_us import Microsimulation from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( get_calculated_variables, ) +from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES + def test_cross_state_matches_swapped_sim( X_sparse, targets_df, - tracer, test_cds, dataset_path, n_households, @@ -25,8 +27,10 @@ def test_cross_state_matches_swapped_sim( When household moves to different state, X_sparse should contain the value calculated from a fresh simulation with state_fips set to destination state. + + Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST + are covered with approximately equal samples per variable. 
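+
+    With the defaults assumed here (N_VERIFICATION_SAMPLES = 200 and 13
+    entries in VARIABLES_TO_TEST), that works out to roughly 15 sampled cells
+    per variable whenever every variable has cross-state non-zero cells.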
""" - n_samples = 200 seed = 42 rng = np.random.default_rng(seed) n_hh = n_households @@ -48,28 +52,46 @@ def get_state_sim(state): nonzero_rows, nonzero_cols = X_sparse.nonzero() - cross_state_indices = [] + # Group cross-state cells by variable for stratified sampling + variable_to_indices = defaultdict(list) + variables_to_test = {v[0] for v in VARIABLES_TO_TEST} + for i in range(len(nonzero_rows)): + row_idx = nonzero_rows[i] col_idx = nonzero_cols[i] cd_idx = col_idx // n_hh hh_idx = col_idx % n_hh cd = test_cds[cd_idx] dest_state = int(cd) // 100 orig_state = int(hh_states[hh_idx]) - if dest_state != orig_state: - cross_state_indices.append(i) - if not cross_state_indices: - pytest.skip("No cross-state non-zero cells found") + # Only include cross-state cells + if dest_state == orig_state: + continue + + # Get variable for this row + variable = targets_df.iloc[row_idx]["variable"] + if variable in variables_to_test: + variable_to_indices[variable].append(i) + + if not variable_to_indices: + pytest.skip("No cross-state non-zero cells found for test variables") - sample_idx = rng.choice( - cross_state_indices, - min(n_samples, len(cross_state_indices)), - replace=False, + # Stratified sampling: sample proportionally from each variable + samples_per_var = max( + 1, N_VERIFICATION_SAMPLES // len(variable_to_indices) ) + sample_indices = [] + + for variable, indices in variable_to_indices.items(): + n_to_sample = min(samples_per_var, len(indices)) + sampled = rng.choice(indices, n_to_sample, replace=False) + sample_indices.extend(sampled) + errors = [] + variables_tested = set() - for idx in sample_idx: + for idx in sample_indices: row_idx = nonzero_rows[idx] col_idx = nonzero_cols[idx] cd_idx = col_idx // n_hh @@ -83,6 +105,8 @@ def get_state_sim(state): state_sim.calculate(variable, map_to="household").values[hh_idx] ) + variables_tested.add(variable) + if not np.isclose(actual, expected, atol=0.5): errors.append( { @@ -95,7 +119,13 @@ def get_state_sim(state): } ) + # Report which variables were tested + missing_vars = variables_to_test - variables_tested + if missing_vars: + print(f"Warning: No cross-state cells found for: {missing_vars}") + assert not errors, ( - f"Cross-state verification failed: {len(errors)}/{len(sample_idx)} " - f"mismatches. First 5: {errors[:5]}" + f"Cross-state verification failed: {len(errors)}/{len(sample_indices)} " + f"mismatches across {len(variables_tested)} variables. " + f"First 5: {errors[:5]}" ) diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py similarity index 53% rename from policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py rename to policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py index 76309388..09cba3d1 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_sparse_matrix_builder.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py @@ -1,162 +1,31 @@ """ -Tests for sparse matrix builder correctness. +Tests for correctness in the sparse matrix builder, particularly for national level contributions. These tests verify that: 1. Matrix shape and structure are correct -2. Matrix cell values match simulation calculations for households in their - original state -3. Variable aggregation (person to household) preserves totals -4. 
National-level targets receive contributions from all states (no geographic +2. Variable aggregation (person to household) preserves totals +3. National-level targets receive contributions from all states (no geographic bias) - -The key verification approach: -- When households are "borrowed" to different geographic areas, state_fips is - changed and variables are recalculated -- For households borrowed to CDs in their ORIGINAL state, the recalculated - values should match the original simulation values exactly (since state_fips - is unchanged) -- This provides a ground-truth verification without needing end-to-end H5 - creation - -IMPORTANT NOTE on stochastic eligibility: -Some variables like SNAP have eligibility tests that use PolicyEngine's -random() function. When variables are recalculated in the matrix builder (via -fresh simulations), the random seed sequence may differ, causing ~1-3% of -households to have different eligibility outcomes. This is expected behavior, -so tests allow up to 2% mismatch rate for such variables. +4. Cross-state recalculation applies state-specific rules """ import pytest import numpy as np import pandas as pd -from policyengine_us import Microsimulation -from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( SparseMatrixBuilder, ) -# ============================================================================= -# CONFIGURATION - Update these lists as new variables are added -# ============================================================================= - -# Variables to test for state-level value matching -# Format: (variable_name, rtol) - rtol is relative tolerance for comparison -VARIABLES_TO_TEST = [ - ("snap", 1e-2), - ("health_insurance_premiums_without_medicare_part_b", 1e-2), - ("medicaid", 1e-2), - ("medicare_part_b_premiums", 1e-2), - ("other_medical_expenses", 1e-2), - ("over_the_counter_health_expenses", 1e-2), - ("salt_deduction", 1e-2), - ("spm_unit_capped_work_childcare_expenses", 1e-2), - ("spm_unit_capped_housing_subsidy", 1e-2), - ("ssi", 1e-2), - ("tanf", 1e-2), - ("tip_income", 1e-2), - ("unemployment_compensation", 1e-2), -] - -# Combined filter config to build matrix with all variables at once -COMBINED_FILTER_CONFIG = { - "stratum_group_ids": [ - 4, # SNAP targets - 5, # Medicaid targets - 112, # Unemployment compensation targets - ], - "variables": [ - "snap", - "health_insurance_premiums_without_medicare_part_b", - "medicaid", - "medicare_part_b_premiums", - "other_medical_expenses", - "over_the_counter_health_expenses", - "salt_deduction", - "spm_unit_capped_work_childcare_expenses", - "spm_unit_capped_housing_subsidy", - "ssi", - "tanf", - "tip_income", - "unemployment_compensation", - ], -} +from .conftest import ( + VARIABLES_TO_TEST, + COMBINED_FILTER_CONFIG, +) +# Variables with state-specific variation (e.g., SNAP eligibility) VARIABLES_WITH_STATE_VARIATION = [ "snap", ] -# Complications: -# (snap) -# (unemployment_compensation) -# income_tax -# qualified_business_income_deduction -# taxable_social_security -# taxable_pension_income -# taxable_ira_distributions -# taxable_interest_income -# tax_exempt_interest_income -# self_employment_income -# salt -# refundable_ctc -# real_estate_taxes -# qualified_dividend_income -# dividend_income -# adjusted_gross_income -# eitc - -# Maximum allowed mismatch rate for state-level value comparison -MAX_MISMATCH_RATE = 0.02 - - -# 
============================================================================= -# FIXTURES -# ============================================================================= - - -@pytest.fixture(scope="module") -def db_uri(): - """Database URI for calibration targets.""" - db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" - return f"sqlite:///{db_path}" - - -@pytest.fixture(scope="module") -def dataset_path(): - """Path to stratified extended CPS dataset.""" - return str(STORAGE_FOLDER / "stratified_extended_cps_2023.h5") - - -@pytest.fixture(scope="module") -def sim(dataset_path): - """Base simulation loaded from stratified CPS.""" - return Microsimulation(dataset=dataset_path) - - -@pytest.fixture(scope="module") -def test_cds(): - """ - Test CDs spanning multiple states for comprehensive testing. - - Selected to include: - - Small states (1-2 CDs): AL, MT - - Medium states: NC - - Large states: CA, TX, NY - """ - return [ - "101", # Alabama CD-1 (state_fips=1) - "102", # Alabama CD-2 - "601", # California CD-1 (state_fips=6) - "602", # California CD-2 - "3001", # Montana CD-1 (state_fips=30) - "3002", # Montana CD-2 - "3701", # North Carolina CD-1 (state_fips=37) - "3702", # North Carolina CD-2 - "3601", # New York CD-1 (state_fips=36) - "3602", # New York CD-2 - "4801", # Texas CD-1 (state_fips=48) - "4802", # Texas CD-2 - ] - @pytest.fixture(scope="module") def builder(db_uri, dataset_path, test_cds): @@ -169,38 +38,6 @@ def builder(db_uri, dataset_path, test_cds): ) -@pytest.fixture(scope="module") -def combined_matrix_data(sim, builder): - """ - Build matrix once with all configured variables. - - This fixture is used by the consolidated test to avoid rebuilding - the matrix for each variable. - """ - targets_df, X_sparse, hh_mapping = builder.build_matrix( - sim, - target_filter=COMBINED_FILTER_CONFIG, - ) - - household_ids = sim.calculate("household_id", map_to="household").values - state_fips = sim.calculate("state_fips", map_to="household").values - - return { - "targets_df": targets_df, - "X_sparse": X_sparse, - "hh_mapping": hh_mapping, - "household_ids": household_ids, - "state_fips": state_fips, - "cds": builder.cds_to_calibrate, - "n_households": len(household_ids), - } - - -# ============================================================================= -# HELPER FUNCTIONS -# ============================================================================= - - def _get_geo_level(geo_id) -> str: """Determine geographic level from geographic_id.""" if geo_id == "US": @@ -215,113 +52,6 @@ def _get_geo_level(geo_id) -> str: return "unknown" -def _verify_state_level_values( - X_sparse, - targets_df, - original_values, - original_state_fips, - cds, - n_households, - variable_name, - rtol=1e-2, -): - """ - Verify that matrix values match original values for households in their - original state. 
- - Returns: - Tuple of (verified_count, mismatches_list, skipped_reason or None) - """ - # Get state-level targets - state_targets = targets_df[ - (targets_df["variable"] == variable_name) - & (targets_df["geographic_id"].apply(lambda x: str(x).isdigit())) - & ( - targets_df["geographic_id"].apply( - lambda x: 1 <= int(x) <= 56 if str(x).isdigit() else False - ) - ) - ] - - if len(state_targets) == 0: - return 0, [], f"No state-level targets for {variable_name}" - - mismatches = [] - verified_count = 0 - - for _, target_row in state_targets.iterrows(): - target_state = int(target_row["geographic_id"]) - row_idx = target_row.name - - # Find all CDs in this state - state_cds = [ - (cd_idx, cd) - for cd_idx, cd in enumerate(cds) - if int(cd) // 100 == target_state - ] - - if not state_cds: - continue - - # Find households originally from this state - hh_from_state_mask = original_state_fips == target_state - hh_indices_from_state = np.where(hh_from_state_mask)[0] - - if len(hh_indices_from_state) == 0: - continue - - # For each CD in the state, check matrix values - for cd_idx, cd in state_cds: - col_start = cd_idx * n_households - - for hh_idx in hh_indices_from_state: - col_idx = col_start + hh_idx - matrix_val = X_sparse[row_idx, col_idx] - original_val = original_values[hh_idx] - - if original_val == 0 and matrix_val == 0: - verified_count += 1 - continue - - if original_val != 0: - rel_diff = abs(matrix_val - original_val) / abs( - original_val - ) - if rel_diff > rtol: - mismatches.append( - { - "variable": variable_name, - "state": target_state, - "cd": cd, - "hh_idx": hh_idx, - "matrix_val": float(matrix_val), - "original_val": float(original_val), - "rel_diff": rel_diff, - } - ) - else: - verified_count += 1 - elif matrix_val != 0: - mismatches.append( - { - "variable": variable_name, - "state": target_state, - "cd": cd, - "hh_idx": hh_idx, - "matrix_val": float(matrix_val), - "original_val": float(original_val), - "rel_diff": float("inf"), - } - ) - - return verified_count, mismatches, None - - -# ============================================================================= -# BASIC STRUCTURE TESTS -# ============================================================================= - - def test_person_level_aggregation_preserves_totals(sim): """Health insurance premiums (person-level) sum correctly to household.""" var = "health_insurance_premiums_without_medicare_part_b" @@ -357,112 +87,6 @@ def test_combined_variables_in_matrix(sim, builder): assert var_name in variables, f"Missing variable: {var_name}" -# ============================================================================= -# CONSOLIDATED STATE-LEVEL VALUE TEST -# ============================================================================= - - -class TestStateLevelValues: - """ - Consolidated test for verifying matrix values match original simulation - values for households in their original state. - - Builds matrix once and iterates through all configured variables. - """ - - def test_all_variables_state_level_match(self, sim, combined_matrix_data): - """ - Verify all configured variables have correct state-level values. - - For each variable: - 1. Calculate original values from simulation - 2. Compare to matrix values for households in their original state - 3. 
Allow up to MAX_MISMATCH_RATE due to stochastic eligibility - """ - results = [] - all_mismatches = [] - - for variable_name, rtol in VARIABLES_TO_TEST: - # Calculate original values for this variable - original_values = sim.calculate( - variable_name, map_to="household" - ).values - - verified, mismatches, skip_reason = _verify_state_level_values( - X_sparse=combined_matrix_data["X_sparse"], - targets_df=combined_matrix_data["targets_df"], - original_values=original_values, - original_state_fips=combined_matrix_data["state_fips"], - cds=combined_matrix_data["cds"], - n_households=combined_matrix_data["n_households"], - variable_name=variable_name, - rtol=rtol, - ) - - total_checked = verified + len(mismatches) - mismatch_rate = ( - len(mismatches) / total_checked if total_checked > 0 else 0 - ) - - results.append( - { - "variable": variable_name, - "verified": verified, - "mismatches": len(mismatches), - "total": total_checked, - "mismatch_rate": mismatch_rate, - "skip_reason": skip_reason, - "passed": ( - skip_reason is not None - or mismatch_rate <= MAX_MISMATCH_RATE - ), - } - ) - - all_mismatches.extend(mismatches) - - # Print summary - print("\n" + "=" * 70) - print("STATE-LEVEL VALUE VERIFICATION SUMMARY") - print("=" * 70) - - results_df = pd.DataFrame(results) - for _, row in results_df.iterrows(): - if row["skip_reason"]: - status = f"SKIPPED: {row['skip_reason']}" - elif row["passed"]: - status = ( - f"PASSED: {row['verified']:,} verified, " - f"{row['mismatch_rate']:.1%} mismatch rate" - ) - else: - status = ( - f"FAILED: {row['mismatches']:,} mismatches, " - f"{row['mismatch_rate']:.1%} > {MAX_MISMATCH_RATE:.0%}" - ) - print(f" {row['variable']}: {status}") - - # Show sample mismatches if any - if all_mismatches: - print(f"\nSample mismatches ({len(all_mismatches)} total):") - mismatch_df = pd.DataFrame(all_mismatches) - print(mismatch_df.head(15).to_string()) - - mismatch_df.to_csv("state_level_mismatches.csv", index=False) - - # Assert all variables passed - failed = [r for r in results if not r["passed"]] - assert len(failed) == 0, ( - f"{len(failed)} variable(s) failed state-level verification: " - f"{[r['variable'] for r in failed]}" - ) - - -# ============================================================================= -# NATIONAL-LEVEL CONTRIBUTION TEST -# ============================================================================= - - class TestNationalLevelContributions: """ Tests verifying that national-level targets receive contributions from @@ -476,7 +100,7 @@ class TestNationalLevelContributions: """ def test_national_targets_receive_multistate_contributions( - self, sim, combined_matrix_data + self, targets_df, X_sparse, household_states, n_households, test_cds ): """ Verify that national-level targets have contributions from households @@ -489,11 +113,8 @@ def test_national_targets_receive_multistate_contributions( 3. 
Verify contributions come from multiple states (not geographically biased) """ - targets_df = combined_matrix_data["targets_df"] - X_sparse = combined_matrix_data["X_sparse"] - state_fips = combined_matrix_data["state_fips"] - n_households = combined_matrix_data["n_households"] - cds = combined_matrix_data["cds"] + state_fips = household_states + cds = test_cds # Find national-level targets national_targets = targets_df[ @@ -577,7 +198,7 @@ def test_national_targets_receive_multistate_contributions( ) def test_state_distribution_in_national_targets( - self, sim, combined_matrix_data + self, targets_df, X_sparse, household_states, n_households, test_cds ): """ Verify the distribution of contributing states in national targets @@ -586,11 +207,8 @@ def test_state_distribution_in_national_targets( This catches cases where one state dominates the contributions disproportionately. """ - targets_df = combined_matrix_data["targets_df"] - X_sparse = combined_matrix_data["X_sparse"] - state_fips = combined_matrix_data["state_fips"] - n_households = combined_matrix_data["n_households"] - cds = combined_matrix_data["cds"] + state_fips = household_states + cds = test_cds # Get original state distribution (count of households per state) unique_states, original_counts = np.unique( @@ -656,11 +274,6 @@ def test_state_distribution_in_national_targets( ) -# ============================================================================= -# CROSS-STATE RECALCULATION TEST -# ============================================================================= - - class TestCrossStateRecalculation: """ Tests verifying that household values change when borrowed to different @@ -671,10 +284,13 @@ class TestCrossStateRecalculation: what the variable would be if that household lived in that CD's state. For state-dependent variables (like SNAP), values should differ across states for at least some households. + + NOTE: This complements test_cross_state.py which verifies exact values. + These tests verify that variation exists (state rules are applied). """ def test_values_change_across_states_for_national_targets( - self, combined_matrix_data + self, targets_df, X_sparse, n_households, test_cds ): """ Verify that for national targets, household values vary across CD @@ -690,10 +306,7 @@ def test_values_change_across_states_for_national_targets( 3. At least some households should have different values in different states (confirming recalculation with different state rules) """ - targets_df = combined_matrix_data["targets_df"] - X_sparse = combined_matrix_data["X_sparse"] - n_households = combined_matrix_data["n_households"] - cds = combined_matrix_data["cds"] + cds = test_cds # Group CDs by state cds_by_state = {} @@ -785,7 +398,7 @@ def test_values_change_across_states_for_national_targets( ) def test_same_household_different_states_shows_rule_changes( - self, combined_matrix_data + self, targets_df, X_sparse, household_states, n_households, test_cds ): """ Deep dive test: pick specific households and verify their values @@ -800,11 +413,8 @@ def test_same_household_different_states_shows_rule_changes( This test finds households where we can verify the recalculation is applying different state rules. 
""" - targets_df = combined_matrix_data["targets_df"] - X_sparse = combined_matrix_data["X_sparse"] - n_households = combined_matrix_data["n_households"] - cds = combined_matrix_data["cds"] - state_fips_orig = combined_matrix_data["state_fips"] + state_fips_orig = household_states + cds = test_cds # Group CDs by state cds_by_state = {} diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py index a13f459d..c9507aaf 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py @@ -2,18 +2,19 @@ import pytest import numpy as np +from collections import defaultdict from policyengine_us import Microsimulation from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( get_calculated_variables, ) +from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES + def test_same_state_matches_original( X_sparse, targets_df, - tracer, - sim, test_cds, dataset_path, n_households, @@ -25,8 +26,10 @@ def test_same_state_matches_original( When household stays in same state, X_sparse should contain the value calculated from a fresh simulation with state_fips set to that state. + + Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST + are covered with approximately equal samples per variable. """ - n_samples = 200 seed = 42 rng = np.random.default_rng(seed) n_hh = n_households @@ -48,28 +51,46 @@ def get_state_sim(state): nonzero_rows, nonzero_cols = X_sparse.nonzero() - same_state_indices = [] + # Group same-state cells by variable for stratified sampling + variable_to_indices = defaultdict(list) + variables_to_test = {v[0] for v in VARIABLES_TO_TEST} + for i in range(len(nonzero_rows)): + row_idx = nonzero_rows[i] col_idx = nonzero_cols[i] cd_idx = col_idx // n_hh hh_idx = col_idx % n_hh cd = test_cds[cd_idx] dest_state = int(cd) // 100 orig_state = int(hh_states[hh_idx]) - if dest_state == orig_state: - same_state_indices.append(i) - if not same_state_indices: - pytest.skip("No same-state non-zero cells found") + # Only include same-state cells + if dest_state != orig_state: + continue + + # Get variable for this row + variable = targets_df.iloc[row_idx]["variable"] + if variable in variables_to_test: + variable_to_indices[variable].append(i) + + if not variable_to_indices: + pytest.skip("No same-state non-zero cells found for test variables") - sample_idx = rng.choice( - same_state_indices, - min(n_samples, len(same_state_indices)), - replace=False, + # Stratified sampling: sample proportionally from each variable + samples_per_var = max( + 1, N_VERIFICATION_SAMPLES // len(variable_to_indices) ) + sample_indices = [] + + for variable, indices in variable_to_indices.items(): + n_to_sample = min(samples_per_var, len(indices)) + sampled = rng.choice(indices, n_to_sample, replace=False) + sample_indices.extend(sampled) + errors = [] + variables_tested = set() - for idx in sample_idx: + for idx in sample_indices: row_idx = nonzero_rows[idx] col_idx = nonzero_cols[idx] cd_idx = col_idx // n_hh @@ -83,6 +104,8 @@ def get_state_sim(state): state_sim.calculate(variable, map_to="household").values[hh_idx] ) + variables_tested.add(variable) + if not np.isclose(actual, expected, atol=0.5): errors.append( { @@ -93,7 +116,13 @@ def get_state_sim(state): } ) + # Report which variables were tested + missing_vars = variables_to_test - variables_tested + if 
missing_vars: + print(f"Warning: No same-state cells found for: {missing_vars}") + assert not errors, ( - f"Same-state verification failed: {len(errors)}/{len(sample_idx)} " - f"mismatches. First 5: {errors[:5]}" + f"Same-state verification failed: {len(errors)}/{len(sample_indices)} " + f"mismatches across {len(variables_tested)} variables. " + f"First 5: {errors[:5]}" ) From 89ce2c8ad93a40fd5cb19abdcd29de5ea457891d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 22 Jan 2026 18:35:12 +0530 Subject: [PATCH 4/6] update test_same_test so it compares to original values instead of fresh calculations --- .../test_same_state.py | 55 +++++++++---------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py index c9507aaf..ec9200b3 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py @@ -1,31 +1,26 @@ -"""Test same-state values match fresh simulations.""" +"""Test same-state values match original simulation values.""" import pytest import numpy as np from collections import defaultdict -from policyengine_us import Microsimulation -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - get_calculated_variables, -) - from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES def test_same_state_matches_original( + sim, X_sparse, targets_df, test_cds, - dataset_path, n_households, household_ids, household_states, ): """ - Same-state non-zero cells must match fresh same-state simulation. + Same-state non-zero cells must match ORIGINAL simulation values. When household stays in same state, X_sparse should contain the value - calculated from a fresh simulation with state_fips set to that state. + from the original simulation (ground truth from H5 dataset). Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST are covered with approximately equal samples per variable. 
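The stratified sampling described in the docstring above groups the matrix's non-zero cells by the variable of their target row and then draws roughly N_VERIFICATION_SAMPLES / n_variables cells from each group, so sparsely targeted variables are not crowded out by common ones. A minimal standalone sketch of that idea follows; the cell list and sample count are made up for illustration and are not part of the patch:

    import numpy as np
    from collections import defaultdict

    rng = np.random.default_rng(42)

    # Hypothetical non-zero cells, labelled by the variable of their target row.
    cells = [("snap", 0), ("snap", 1), ("eitc", 2), ("income_tax", 3), ("income_tax", 4)]

    by_variable = defaultdict(list)
    for variable, cell_idx in cells:
        by_variable[variable].append(cell_idx)

    # Roughly equal samples per variable, mirroring the test above.
    total_samples = 6
    per_variable = max(1, total_samples // len(by_variable))

    sample_indices = []
    for variable, indices in by_variable.items():
        n_to_sample = min(per_variable, len(indices))
        sample_indices.extend(rng.choice(indices, n_to_sample, replace=False))

With uniform sampling over all non-zero cells, rarely targeted variables could easily receive zero samples; the per-variable split is what lets the test report any variables that were never exercised.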
@@ -36,19 +31,6 @@ def test_same_state_matches_original( hh_ids = household_ids hh_states = household_states - state_sims = {} - - def get_state_sim(state): - if state not in state_sims: - s = Microsimulation(dataset=dataset_path) - s.set_input( - "state_fips", 2023, np.full(n_hh, state, dtype=np.int32) - ) - for var in get_calculated_variables(s): - s.delete_arrays(var) - state_sims[state] = s - return state_sims[state] - nonzero_rows, nonzero_cols = X_sparse.nonzero() # Group same-state cells by variable for stratified sampling @@ -68,7 +50,6 @@ def get_state_sim(state): if dest_state != orig_state: continue - # Get variable for this row variable = targets_df.iloc[row_idx]["variable"] if variable in variables_to_test: variable_to_indices[variable].append(i) @@ -87,6 +68,16 @@ def get_state_sim(state): sampled = rng.choice(indices, n_to_sample, replace=False) sample_indices.extend(sampled) + # Cache original values per variable to avoid repeated calculations + original_values_cache = {} + + def get_original_values(variable): + if variable not in original_values_cache: + original_values_cache[variable] = sim.calculate( + variable, map_to="household" + ).values + return original_values_cache[variable] + errors = [] variables_tested = set() @@ -95,14 +86,12 @@ def get_state_sim(state): col_idx = nonzero_cols[idx] cd_idx = col_idx // n_hh hh_idx = col_idx % n_hh - cd = test_cds[cd_idx] - dest_state = int(cd) // 100 variable = targets_df.iloc[row_idx]["variable"] actual = float(X_sparse[row_idx, col_idx]) - state_sim = get_state_sim(dest_state) - expected = float( - state_sim.calculate(variable, map_to="household").values[hh_idx] - ) + + # Compare to ORIGINAL simulation values (ground truth) + original_values = get_original_values(variable) + expected = float(original_values[hh_idx]) variables_tested.add(variable) @@ -110,13 +99,19 @@ def get_state_sim(state): errors.append( { "hh_id": hh_ids[hh_idx], + "hh_idx": hh_idx, "variable": variable, "actual": actual, "expected": expected, + "diff": actual - expected, + "rel_diff": ( + (actual - expected) / expected + if expected != 0 + else np.inf + ), } ) - # Report which variables were tested missing_vars = variables_to_test - variables_tested if missing_vars: print(f"Warning: No same-state cells found for: {missing_vars}") From b18680d7bc6cf49923ccd6eb01c8849991383d3d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 22 Jan 2026 21:04:23 +0530 Subject: [PATCH 5/6] adding matrix builder improvements --- .../calibration_utils.py | 64 +++++--- .../sparse_matrix_builder.py | 138 ++++++++++++++++-- .../test_local_area_calibration/conftest.py | 48 +++++- 3 files changed, 220 insertions(+), 30 deletions(-) diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index f01465a2..aa954aba 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -192,15 +192,21 @@ def get_calculated_variables(sim) -> List[str]: """ Return variables that should be cleared for state-swap recalculation. - Includes variables with formulas, adds, or subtracts. - - Excludes ID variables (person_id, household_id, etc.) because: - 1. They have formulas that generate sequential IDs (0, 1, 2, ...) - 2. We need the original H5 values, not regenerated sequences - 3. 
PolicyEngine's random() function uses entity IDs as seeds: - seed = abs(entity_id * 100 + count_random_calls) - If IDs change, random-dependent variables (SSI resource test, - WIC nutritional risk, WIC takeup) produce different results. + Includes variables with formulas, or adds/subtracts that are lists. + + Excludes: + 1. ID variables (person_id, household_id, etc.) - needed for random seeds + 2. Variables with string adds/subtracts (parameter paths) - these are + pseudo-inputs stored in H5 that would recalculate differently using + parameter lookups. Examples: pre_tax_contributions. + 3. Variables in input_variables (have stored H5 values) even if they + have formulas - the stored values represent original survey data + that should be preserved. Examples: cdcc_relevant_expenses, rent. + + The exclusions are critical because: + - The H5 file stores pre-computed values from original CPS processing + - If deleted, recalculation produces different values, corrupting + downstream calculations like income_tax """ exclude_ids = { "person_id", @@ -210,16 +216,36 @@ def get_calculated_variables(sim) -> List[str]: "family_id", "marital_unit_id", } - return [ - name - for name, var in sim.tax_benefit_system.variables.items() - if ( - var.formulas - or getattr(var, "adds", None) - or getattr(var, "subtracts", None) - ) - and name not in exclude_ids - ] + + # Get stored input variables to exclude + input_vars = set(sim.input_variables) + + result = [] + for name, var in sim.tax_benefit_system.variables.items(): + if name in exclude_ids: + continue + + # Exclude variables that have stored values (input_variables) + # These represent original survey data that should be preserved + if name in input_vars: + continue + + # Include if has formulas + if var.formulas: + result.append(name) + continue + + # Include if adds/subtracts is a list (explicit component aggregation) + # Exclude if adds/subtracts is a string (parameter path - pseudo-input) + adds = getattr(var, "adds", None) + subtracts = getattr(var, "subtracts", None) + + if adds and isinstance(adds, list): + result.append(name) + elif subtracts and isinstance(subtracts, list): + result.append(name) + + return result def get_pseudo_input_variables(sim) -> set: diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py index d8748014..b12629fb 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py @@ -38,6 +38,105 @@ def __init__( self.time_period = time_period self.cds_to_calibrate = cds_to_calibrate self.dataset_path = dataset_path + self._entity_rel_cache = None + + def _build_entity_relationship(self, sim) -> pd.DataFrame: + """ + Build entity relationship DataFrame mapping persons to all entity IDs. + + This is used to evaluate constraints at the person level and then + aggregate to household level, handling variables defined at different + entity levels (person, tax_unit, household, spm_unit). 
+ + Returns: + DataFrame with person_id, household_id, tax_unit_id, spm_unit_id + """ + if self._entity_rel_cache is not None: + return self._entity_rel_cache + + self._entity_rel_cache = pd.DataFrame( + { + "person_id": sim.calculate( + "person_id", map_to="person" + ).values, + "household_id": sim.calculate( + "household_id", map_to="person" + ).values, + "tax_unit_id": sim.calculate( + "tax_unit_id", map_to="person" + ).values, + "spm_unit_id": sim.calculate( + "spm_unit_id", map_to="person" + ).values, + } + ) + return self._entity_rel_cache + + def _evaluate_constraints_entity_aware( + self, state_sim, constraints: List[dict], n_households: int + ) -> np.ndarray: + """ + Evaluate non-geographic constraints at person level, aggregate to + household level using .any(). + + This properly handles constraints on variables defined at different + entity levels (e.g., tax_unit_is_filer at tax_unit level). Instead of + summing values at household level (which would give 2, 3, etc. for + households with multiple tax units), we evaluate at person level and + use .any() aggregation ("does this household have at least one person + satisfying all constraints?"). + + Args: + state_sim: Microsimulation with state_fips set + constraints: List of constraint dicts with variable, operation, + value keys (geographic constraints should be pre-filtered) + n_households: Number of households + + Returns: + Boolean mask array of length n_households + """ + if not constraints: + return np.ones(n_households, dtype=bool) + + entity_rel = self._build_entity_relationship(state_sim) + n_persons = len(entity_rel) + + person_mask = np.ones(n_persons, dtype=bool) + + for c in constraints: + var = c["variable"] + op = c["operation"] + val = c["value"] + + # Calculate constraint variable at person level + constraint_values = state_sim.calculate( + var, map_to="person" + ).values + + # Apply operation at person level + person_mask &= apply_op(constraint_values, op, val) + + # Aggregate to household level using .any() + # "At least one person in this household satisfies ALL constraints" + entity_rel_with_mask = entity_rel.copy() + entity_rel_with_mask["satisfies"] = person_mask + + household_mask_series = entity_rel_with_mask.groupby("household_id")[ + "satisfies" + ].any() + + # Ensure we return a mask aligned with household order + household_ids = state_sim.calculate( + "household_id", map_to="household" + ).values + household_mask = np.array( + [ + household_mask_series.get(hh_id, False) + for hh_id in household_ids + ] + ) + + return household_mask def _query_targets(self, target_filter: dict) -> pd.DataFrame: """Query targets based on filter criteria using OR logic.""" @@ -166,6 +265,9 @@ def build_matrix( cds_by_state[state].append((cd_idx, cd)) for state, cd_list in cds_by_state.items(): + # Clear entity relationship cache when creating new simulation + self._entity_rel_cache = None + if self.dataset_path: state_sim = self._create_state_sim(state, n_households) else: @@ -184,27 +286,43 @@ def build_matrix( for row_idx, (_, target) in enumerate(targets_df.iterrows()): constraints = self._get_constraints(target["stratum_id"]) - mask = np.ones(n_households, dtype=bool) + geo_constraints = [] + non_geo_constraints = [] for c in constraints: + if c["variable"] in ( + "state_fips", + "congressional_district_geoid", + ): + geo_constraints.append(c) + else: + non_geo_constraints.append(c) + + # Check geographic constraints first (quick fail) + geo_mask = np.ones(n_households, dtype=bool) + for c in geo_constraints: if 
c["variable"] == "congressional_district_geoid": if ( c["operation"] in ("==", "=") and c["value"] != cd ): - mask[:] = False + geo_mask[:] = False elif c["variable"] == "state_fips": if ( c["operation"] in ("==", "=") and int(c["value"]) != state ): - mask[:] = False - else: - values = state_sim.calculate( - c["variable"], map_to="household" - ).values - mask &= apply_op( - values, c["operation"], c["value"] - ) + geo_mask[:] = False + + if not geo_mask.any(): + continue + + # Evaluate non-geographic constraints at entity level + entity_mask = self._evaluate_constraints_entity_aware( + state_sim, non_geo_constraints, n_households + ) + + # Combine geographic and entity-aware masks + mask = geo_mask & entity_mask if not mask.any(): continue diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py index d4b5edc6..633b391f 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/conftest.py +++ b/policyengine_us_data/tests/test_local_area_calibration/conftest.py @@ -37,6 +37,22 @@ ("tanf", 1e-2), ("tip_income", 1e-2), ("unemployment_compensation", 1e-2), + ("income_tax", 1e-2), + ("income_tax", 1e-2), + ("qualified_business_income_deduction", 1e-2), + ("taxable_social_security", 1e-2), + ("taxable_pension_income", 1e-2), + ("taxable_ira_distributions", 1e-2), + ("taxable_interest_income", 1e-2), + ("tax_exempt_interest_income", 1e-2), + ("self_employment_income", 1e-2), + ("salt", 1e-2), + ("refundable_ctc", 1e-2), + ("real_estate_taxes", 1e-2), + ("qualified_dividend_income", 1e-2), + ("dividend_income", 1e-2), + ("adjusted_gross_income", 1e-2), + ("eitc", 1e-2), ] # Combined filter config to build matrix with all variables at once @@ -45,6 +61,20 @@ 4, # SNAP targets 5, # Medicaid targets 112, # Unemployment compensation targets + 117, # Income tax targets + 100, # QBID targets + 111, # Taxable social security targets + 114, # Taxable pension income targets + 105, # Taxable IRA distributions targets + 106, # Taxable interest income targets + 107, # Tax exempt interest income targets + 101, # Self-employment income targets + 116, # Salt targets + 115, # Refundable CTC targets + 103, # Real estate taxes targets + 109, # Qualified dividend income targets + 108, # Dividend income targets + 3, # Adjusted gross income targets ], "variables": [ "snap", @@ -60,6 +90,22 @@ "tanf", "tip_income", "unemployment_compensation", + "income_tax", + "income_tax", + "qualified_business_income_deduction", + "taxable_social_security", + "taxable_pension_income", + "taxable_ira_distributions", + "taxable_interest_income", + "tax_exempt_interest_income", + "self_employment_income", + "salt", + "refundable_ctc", + "real_estate_taxes", + "qualified_dividend_income", + "dividend_income", + "adjusted_gross_income", + "eitc", ], } @@ -67,7 +113,7 @@ MAX_MISMATCH_RATE = 0.02 # Number of samples for cell-level verification tests -N_VERIFICATION_SAMPLES = 200 +N_VERIFICATION_SAMPLES = 2000 @pytest.fixture(scope="module") From b5b1f1dd6faa4845c7e1e26f90d085109b32599c Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 22 Jan 2026 14:15:02 -0500 Subject: [PATCH 6/6] Reduce test scope for CI performance - Reduce VARIABLES_TO_TEST to 3 representative variables (snap, income_tax, eitc) - Reduce COMBINED_FILTER_CONFIG to minimal subset for fast CI runs - Reduce N_VERIFICATION_SAMPLES from 2000 to 500 - Revert test_cds to original 4 states (NC, HI, MT, AK) instead of 8 states Tests now complete in ~4 minutes 
instead of 3+ hours. Co-Authored-By: Claude Opus 4.5 --- .../test_local_area_calibration/conftest.py | 80 ++----------------- .../test_matrix_national_variation.py | 2 +- 2 files changed, 6 insertions(+), 76 deletions(-) diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py index 633b391f..7abcbafb 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/conftest.py +++ b/policyengine_us_data/tests/test_local_area_calibration/conftest.py @@ -19,92 +19,26 @@ get_calculated_variables, ) -# Variables to test for state-level value matching +# Variables to test for state-level value matching (CI uses subset for speed) # Format: (variable_name, rtol) # variable_name as per the targets in policy_data.db # rtol is relative tolerance for comparison VARIABLES_TO_TEST = [ ("snap", 1e-2), - ("health_insurance_premiums_without_medicare_part_b", 1e-2), - ("medicaid", 1e-2), - ("medicare_part_b_premiums", 1e-2), - ("other_medical_expenses", 1e-2), - ("over_the_counter_health_expenses", 1e-2), - ("salt_deduction", 1e-2), - ("spm_unit_capped_work_childcare_expenses", 1e-2), - ("spm_unit_capped_housing_subsidy", 1e-2), - ("ssi", 1e-2), - ("tanf", 1e-2), - ("tip_income", 1e-2), - ("unemployment_compensation", 1e-2), ("income_tax", 1e-2), - ("income_tax", 1e-2), - ("qualified_business_income_deduction", 1e-2), - ("taxable_social_security", 1e-2), - ("taxable_pension_income", 1e-2), - ("taxable_ira_distributions", 1e-2), - ("taxable_interest_income", 1e-2), - ("tax_exempt_interest_income", 1e-2), - ("self_employment_income", 1e-2), - ("salt", 1e-2), - ("refundable_ctc", 1e-2), - ("real_estate_taxes", 1e-2), - ("qualified_dividend_income", 1e-2), - ("dividend_income", 1e-2), - ("adjusted_gross_income", 1e-2), ("eitc", 1e-2), ] -# Combined filter config to build matrix with all variables at once +# CI filter config - minimal subset for fast CI runs +# Tests 3 representative variables covering benefits, taxes, and credits COMBINED_FILTER_CONFIG = { "stratum_group_ids": [ 4, # SNAP targets - 5, # Medicaid targets - 112, # Unemployment compensation targets 117, # Income tax targets - 100, # QBID targets - 111, # Taxable social security targets - 114, # Taxable pension income targets - 105, # Taxable IRA distributions targets - 106, # Taxable interest income targets - 107, # Tax exempt interest income targets - 101, # Self-employment income targets - 116, # Salt targets - 115, # Refundable CTC targets - 103, # Real estate taxes targets - 109, # Qualified dividend income targets - 108, # Dividend income targets - 3, # Adjusted gross income targets ], "variables": [ "snap", - "health_insurance_premiums_without_medicare_part_b", - "medicaid", - "medicare_part_b_premiums", - "other_medical_expenses", - "over_the_counter_health_expenses", - "salt_deduction", - "spm_unit_capped_work_childcare_expenses", - "spm_unit_capped_housing_subsidy", - "ssi", - "tanf", - "tip_income", - "unemployment_compensation", - "income_tax", "income_tax", - "qualified_business_income_deduction", - "taxable_social_security", - "taxable_pension_income", - "taxable_ira_distributions", - "taxable_interest_income", - "tax_exempt_interest_income", - "self_employment_income", - "salt", - "refundable_ctc", - "real_estate_taxes", - "qualified_dividend_income", - "dividend_income", - "adjusted_gross_income", "eitc", ], } @@ -113,7 +47,7 @@ MAX_MISMATCH_RATE = 0.02 # Number of samples for cell-level verification tests -N_VERIFICATION_SAMPLES = 2000 
+N_VERIFICATION_SAMPLES = 500 @pytest.fixture(scope="module") @@ -129,7 +63,7 @@ def dataset_path(): @pytest.fixture(scope="module") def test_cds(db_uri): - """CDs from multiple states for comprehensive testing.""" + """CDs from NC, HI, MT, AK (manageable size for CI, multiple same-state CDs).""" engine = create_engine(db_uri) query = """ SELECT DISTINCT sc.value as cd_geoid @@ -142,10 +76,6 @@ def test_cds(db_uri): OR sc.value LIKE '150_' OR sc.value LIKE '300_' OR sc.value = '200' OR sc.value = '201' - OR sc.value IN ('101', '102') - OR sc.value IN ('601', '602') - OR sc.value IN ('3601', '3602') - OR sc.value IN ('4801', '4802') ) ORDER BY sc.value """ diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py index 09cba3d1..b5950089 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py @@ -177,7 +177,7 @@ def test_national_targets_receive_multistate_contributions( min_states = stats_df["n_states"].min() # Check: on average, contributions should come from multiple states - # (at least 2, since we have CDs from 6 different states) + # (at least 2, since we have CDs from 4 different states) passed = avg_states >= 2 and min_states >= 1 results.append(
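For reference, the entity-aware constraint evaluation introduced in PATCH 5/6 reduces a person-level boolean mask to households with .any() rather than mapping constraint values to household level and summing. A toy pandas sketch of that aggregation is below; the entity layout and constraint results are invented for illustration and this is not the PolicyEngine API:

    import numpy as np
    import pandas as pd

    # Two households; household 1 contains two tax units.
    entity_rel = pd.DataFrame(
        {
            "person_id": [0, 1, 2, 3],
            "household_id": [0, 0, 1, 1],
            "tax_unit_id": [0, 0, 1, 2],
        }
    )

    # Person-level result of a constraint such as tax_unit_is_filer.
    satisfies = np.array([False, False, True, True])

    # "At least one person in the household satisfies all constraints."
    any_mask = (
        entity_rel.assign(satisfies=satisfies)
        .groupby("household_id")["satisfies"]
        .any()
    )
    print(any_mask.tolist())  # [False, True]

    # Summing instead counts qualifying persons per household (0 and 2 here),
    # which is how mapping such constraints straight to household level can
    # over-count households that contain multiple tax units.
    counts = (
        entity_rel.assign(satisfies=satisfies)
        .groupby("household_id")["satisfies"]
        .sum()
    )
    print(counts.tolist())  # [0, 2]

The boolean include/exclude mask is what the matrix builder needs when deciding which household columns contribute to a constrained target, which is why the .any() reduction was chosen over a household-level sum.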