From 320dfc044fc60ce37b73594cb90d4438164a6b25 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Thu, 26 Mar 2026 11:06:18 -0400 Subject: [PATCH 1/3] Add tax_unit_itemizes constraint for itemized deduction targets SOI targets for SALT, real estate taxes, and medical expense deduction are reported only for the ~10% of filers who itemize, but the existing `variable > 0` constraint captures everyone with economic exposure (~80-90% of filers). This mismatch causes massive count and dollar overestimates. Adding `tax_unit_itemizes == 1` fixes the population alignment. Changes: - etl_irs_soi.py: For salt, real_estate_taxes, and medical_expense_deduction, append a `tax_unit_itemizes == 1` constraint to child strata in the generic target loop. - etl_national_targets.py: Split JCT itemized deduction targets (salt_deduction, medical_expense_deduction, charitable_deduction, interest_deduction) into a separate itemizer_targets list loaded into a new "United States - Itemizing Tax Filers" stratum with both filer and itemizer constraints. QBI deduction remains in the plain filer stratum (it is not an itemized deduction; filers can claim it whether or not they itemize). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- policyengine_us_data/db/etl_irs_soi.py | 16 ++- .../db/etl_national_targets.py | 108 +++++++++++++++--- 2 files changed, 110 insertions(+), 14 deletions(-) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index f6bda07bc..33f08cef0 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -29,6 +29,7 @@ logger = logging.getLogger(__name__) +ITEMIZED_DEDUCTION_VARIABLES = {"salt", "real_estate_taxes", "medical_expense_deduction"} # IRS SOI data is typically available ~2 years after the tax year IRS_SOI_LAG_YEARS = 2 @@ -661,7 +662,11 @@ def load_soi_data(long_dfs, year): # Create child stratum with constraint for this IRS variable # Note: This stratum will have the constraint that amount_variable > 0 - note = f"{geo_description} filers with {amount_variable_name} > 0" + is_itemized = amount_variable_name in ITEMIZED_DEDUCTION_VARIABLES + if is_itemized: + note = f"{geo_description} itemizing filers with {amount_variable_name} > 0" + else: + note = f"{geo_description} filers with {amount_variable_name} > 0" # Check if child stratum already exists existing_stratum = ( @@ -698,6 +703,15 @@ def load_soi_data(long_dfs, year): ] ) + if is_itemized: + child_stratum.constraints_rel.append( + StratumConstraint( + constraint_variable="tax_unit_itemizes", + operation="==", + value="1", + ) + ) + # Add geographic constraints if applicable if geo_info["type"] == "state": child_stratum.constraints_rel.append( diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 0e87aa84a..a5e208687 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -57,6 +57,17 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): # Separate tax-related targets that need filer constraint tax_filer_targets = [ + { + "variable": 
"qualified_business_income_deduction", + "value": 63.1e9, + "source": "Joint Committee on Taxation", + "notes": "QBI deduction tax expenditure", + "year": HARDCODED_YEAR, + }, + ] + + # Itemized deduction targets need both filer and itemizer constraints + itemizer_targets = [ { "variable": "salt_deduction", "value": 21.247e9, @@ -85,13 +96,6 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "notes": "Mortgage interest deduction tax expenditure", "year": HARDCODED_YEAR, }, - { - "variable": "qualified_business_income_deduction", - "value": 63.1e9, - "source": "Joint Committee on Taxation", - "notes": "QBI deduction tax expenditure", - "year": HARDCODED_YEAR, - }, ] direct_sum_targets = [ @@ -394,6 +398,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): return { "direct_sum_targets": direct_sum_targets, "tax_filer_targets": tax_filer_targets, + "itemizer_targets": itemizer_targets, "conditional_count_targets": conditional_count_targets, "cbo_targets": cbo_targets, "treasury_targets": treasury_targets, @@ -413,9 +418,10 @@ def transform_national_targets(raw_targets): Returns ------- tuple - (direct_targets_df, tax_filer_df, conditional_targets) + (direct_targets_df, tax_filer_df, itemizer_df, conditional_targets) - direct_targets_df: DataFrame with direct sum targets - tax_filer_df: DataFrame with tax-related targets needing filer constraint + - itemizer_df: DataFrame with itemized deduction targets needing filer + itemizer constraints - conditional_targets: List of conditional count targets """ @@ -444,14 +450,19 @@ def transform_national_targets(raw_targets): tax_filer_df = ( pd.DataFrame(all_tax_filer_targets) if all_tax_filer_targets else pd.DataFrame() ) + itemizer_df = ( + pd.DataFrame(raw_targets["itemizer_targets"]) + if raw_targets["itemizer_targets"] + else pd.DataFrame() + ) # Conditional targets stay as list for special processing conditional_targets = raw_targets["conditional_count_targets"] - return direct_df, tax_filer_df, 
conditional_targets + return direct_df, tax_filer_df, itemizer_df, conditional_targets -def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets): +def load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditional_targets): """ Load national targets into the database. @@ -461,6 +472,8 @@ def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets): DataFrame with direct sum target data tax_filer_df : pd.DataFrame DataFrame with tax-related targets needing filer constraint + itemizer_df : pd.DataFrame + DataFrame with itemized deduction targets needing filer + itemizer constraints conditional_targets : list List of conditional count targets requiring strata """ @@ -590,6 +603,74 @@ def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets): session.add(target) print(f"Added filer target: {target_data['variable']}") + # Process itemized deduction targets that need filer + itemizer constraints + if not itemizer_df.empty: + national_itemizer_stratum = ( + session.query(Stratum) + .filter( + Stratum.parent_stratum_id == us_stratum.stratum_id, + Stratum.notes == "United States - Itemizing Tax Filers", + ) + .first() + ) + + if not national_itemizer_stratum: + national_itemizer_stratum = Stratum( + parent_stratum_id=us_stratum.stratum_id, + notes="United States - Itemizing Tax Filers", + ) + national_itemizer_stratum.constraints_rel = [ + StratumConstraint( + constraint_variable="tax_unit_is_filer", + operation="==", + value="1", + ), + StratumConstraint( + constraint_variable="tax_unit_itemizes", + operation="==", + value="1", + ), + ] + session.add(national_itemizer_stratum) + session.flush() + print("Created national itemizer stratum") + + for _, target_data in itemizer_df.iterrows(): + target_year = target_data["year"] + existing_target = ( + session.query(Target) + .filter( + Target.stratum_id == national_itemizer_stratum.stratum_id, + Target.variable == target_data["variable"], + 
Target.period == target_year, + ) + .first() + ) + + notes_parts = [] + if pd.notna(target_data.get("notes")): + notes_parts.append(target_data["notes"]) + notes_parts.append(f"Source: {target_data.get('source', 'Unknown')}") + combined_notes = " | ".join(notes_parts) + + if existing_target: + existing_target.value = target_data["value"] + existing_target.notes = combined_notes + existing_target.source = "PolicyEngine" + print(f"Updated itemizer target: {target_data['variable']}") + else: + target = Target( + stratum_id=national_itemizer_stratum.stratum_id, + variable=target_data["variable"], + period=target_year, + value=target_data["value"], + active=True, + source="PolicyEngine", + notes=combined_notes, + ) + session.add(target) + print(f"Added itemizer target: {target_data['variable']}") + # Process conditional count targets (enrollment counts) for cond_target in conditional_targets: constraint_var = cond_target["constraint_variable"] @@ -686,11 +767,12 @@ def load_national_targets(direct_targets_df, tax_filer_df, conditional_targets): session.commit() total_targets = ( - len(direct_targets_df) + len(tax_filer_df) + len(conditional_targets) + len(direct_targets_df) + len(tax_filer_df) + len(itemizer_df) + len(conditional_targets) ) print(f"\nSuccessfully loaded {total_targets} national targets") print(f" - {len(direct_targets_df)} direct sum targets") print(f" - {len(tax_filer_df)} tax filer targets") + print(f" - {len(itemizer_df)} itemizer targets") print(f" - {len(conditional_targets)} enrollment count targets (as strata)") @@ -706,13 +788,13 @@ def main(): # Transform print("Transforming targets...") - direct_targets_df, tax_filer_df, conditional_targets = transform_national_targets( + direct_targets_df, tax_filer_df, itemizer_df, conditional_targets = transform_national_targets( raw_targets ) # Load print("Loading targets into database...") - load_national_targets(direct_targets_df, tax_filer_df, conditional_targets) + 
load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditional_targets) print("\nETL pipeline complete!") From 089000276e4d6d7c29a65081b9a67dbfe62e452b Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 26 Mar 2026 14:35:58 -0400 Subject: [PATCH 2/3] Fix JCT tax expenditure target handling --- .../calibration/unified_matrix_builder.py | 196 ++++++++++++++++-- .../calibration/validate_staging.py | 86 +++++++- .../db/create_database_tables.py | 3 +- .../db/etl_national_targets.py | 146 ++++++++----- .../test_unified_matrix_builder.py | 71 +++++-- .../tests/test_schema_views_and_lookups.py | 28 +++ 6 files changed, 431 insertions(+), 99 deletions(-) diff --git a/policyengine_us_data/calibration/unified_matrix_builder.py b/policyengine_us_data/calibration/unified_matrix_builder.py index 0e7a1188f..09c121935 100644 --- a/policyengine_us_data/calibration/unified_matrix_builder.py +++ b/policyengine_us_data/calibration/unified_matrix_builder.py @@ -40,6 +40,59 @@ } +def _make_neutralize_variable_reform(variable_name: str): + from policyengine_core.reforms import Reform + + class NeutralizeVariable(Reform): + def apply(self): + self.neutralize_variable(variable_name) + + NeutralizeVariable.__name__ = f"Neutralize_{variable_name}" + return NeutralizeVariable + + +def _compute_reform_household_values( + dataset_path: str, + time_period: int, + state: int, + n_hh: int, + reform_vars: list, + baseline_income_tax: np.ndarray, +) -> dict: + """Compute repeal-based household income tax deltas for target vars.""" + from policyengine_us import Microsimulation + + reform_hh = {} + if not reform_vars: + return reform_hh + + state_input = np.full(n_hh, state, dtype=np.int32) + for var in reform_vars: + try: + reform_sim = Microsimulation( + dataset=dataset_path, + reform=_make_neutralize_variable_reform(var), + ) + reform_sim.set_input("state_fips", time_period, state_input) + for calc_var in get_calculated_variables(reform_sim): + reform_sim.delete_arrays(calc_var) 
+ reform_income_tax = reform_sim.calculate( + "income_tax", + time_period, + map_to="household", + ).values.astype(np.float32) + reform_hh[var] = reform_income_tax - baseline_income_tax + except Exception as exc: + logger.warning( + "Cannot calculate tax expenditure '%s' for state %d: %s", + var, + state, + exc, + ) + + return reform_hh + + def _compute_single_state( dataset_path: str, time_period: int, @@ -47,6 +100,7 @@ def _compute_single_state( n_hh: int, target_vars: list, constraint_vars: list, + reform_vars: list, rerandomize_takeup: bool, affected_targets: dict, ): @@ -118,6 +172,23 @@ def _compute_single_state( exc, ) + baseline_income_tax = None + reform_hh = {} + if reform_vars: + baseline_income_tax = state_sim.calculate( + "income_tax", + time_period, + map_to="household", + ).values.astype(np.float32) + reform_hh = _compute_reform_household_values( + dataset_path, + time_period, + state, + n_hh, + reform_vars, + baseline_income_tax, + ) + if rerandomize_takeup: for spec in SIMPLE_TAKEUP_VARS: entity = spec["entity"] @@ -177,6 +248,7 @@ def _compute_single_state( { "hh": hh, "person": person, + "reform_hh": reform_hh, "entity": entity_vals, "entity_wf_false": entity_wf_false, }, @@ -347,6 +419,7 @@ def _assemble_clone_values_standalone( person_hh_indices: np.ndarray, target_vars: set, constraint_vars: set, + reform_vars: set = None, county_values: dict = None, clone_counties: np.ndarray = None, county_dependent_vars: set = None, @@ -409,7 +482,23 @@ def _assemble_clone_values_standalone( arr[mask] = state_values[int(state)]["person"][var][mask] person_vars[var] = arr - return hh_vars, person_vars + reform_hh_vars: dict = {} + for var in reform_vars or set(): + if not any( + var in state_values[int(state)].get("reform_hh", {}) + for state in unique_clone_states + ): + continue + arr = np.zeros(n_records, dtype=np.float32) + for state in unique_clone_states: + mask = state_masks[int(state)] + arr[mask] = state_values[int(state)].get("reform_hh", {}).get( 
+ var, + np.zeros(mask.sum(), dtype=np.float32), + ) + reform_hh_vars[var] = arr + + return hh_vars, person_vars, reform_hh_vars def _evaluate_constraints_standalone( @@ -452,10 +541,12 @@ def _calculate_target_values_standalone( non_geo_constraints: list, n_households: int, hh_vars: dict, + reform_hh_vars: dict, person_vars: dict, entity_rel: pd.DataFrame, household_ids: np.ndarray, variable_entity_map: dict, + reform_id: int = 0, ) -> np.ndarray: """Standalone target-value calculation (no class instance). @@ -472,7 +563,8 @@ def _calculate_target_values_standalone( household_ids, n_households, ) - vals = hh_vars.get(target_variable) + source_vars = reform_hh_vars if reform_id > 0 else hh_vars + vals = source_vars.get(target_variable) if vals is None: return np.zeros(n_households, dtype=np.float32) return (vals * mask).astype(np.float32) @@ -559,8 +651,10 @@ def _process_single_clone( unique_constraint_vars = sd["unique_constraint_vars"] county_dep_targets = sd["county_dep_targets"] target_variables = sd["target_variables"] + target_reform_ids = sd["target_reform_ids"] target_geo_info = sd["target_geo_info"] non_geo_constraints_list = sd["non_geo_constraints_list"] + reform_vars = sd["reform_vars"] n_records = sd["n_records"] n_total = sd["n_total"] n_targets = sd["n_targets"] @@ -580,12 +674,13 @@ def _process_single_clone( clone_counties = geo_counties[col_start:col_end] # Assemble hh/person values from precomputed state/county - hh_vars, person_vars = _assemble_clone_values_standalone( + hh_vars, person_vars, reform_hh_vars = _assemble_clone_values_standalone( state_values, clone_states, person_hh_indices, unique_variables, unique_constraint_vars, + reform_vars=reform_vars, county_values=county_values, clone_counties=clone_counties, county_dependent_vars=county_dep_targets, @@ -715,6 +810,7 @@ def _process_single_clone( for row_idx in range(n_targets): variable = target_variables[row_idx] + reform_id = target_reform_ids[row_idx] geo_level, geo_id = 
target_geo_info[row_idx] non_geo = non_geo_constraints_list[row_idx] @@ -758,6 +854,7 @@ def _process_single_clone( non_geo, n_records, hh_vars, + reform_hh_vars, person_vars, entity_rel, household_ids, @@ -765,7 +862,8 @@ def _process_single_clone( ) values = count_cache[vkey] else: - if variable not in hh_vars: + source_vars = reform_hh_vars if reform_id > 0 else hh_vars + if variable not in source_vars: continue if constraint_key not in mask_cache: mask_cache[constraint_key] = _evaluate_constraints_standalone( @@ -776,7 +874,7 @@ def _process_single_clone( n_records, ) mask = mask_cache[constraint_key] - values = hh_vars[variable] * mask + values = source_vars[variable] * mask vals = values[rec_indices] nonzero = vals != 0 @@ -857,6 +955,7 @@ def _build_state_values( sim, target_vars: set, constraint_vars: set, + reform_vars: set, geography, rerandomize_takeup: bool = True, workers: int = 1, @@ -919,6 +1018,7 @@ def _build_state_values( # Convert sets to sorted lists for deterministic iteration target_vars_list = sorted(target_vars) constraint_vars_list = sorted(constraint_vars) + reform_vars_list = sorted(reform_vars) state_values = {} @@ -942,6 +1042,7 @@ def _build_state_values( n_hh, target_vars_list, constraint_vars_list, + reform_vars_list, rerandomize_takeup, affected_targets, ): st @@ -1015,6 +1116,22 @@ def _build_state_values( exc, ) + reform_hh = {} + if reform_vars_list: + baseline_income_tax = state_sim.calculate( + "income_tax", + self.time_period, + map_to="household", + ).values.astype(np.float32) + reform_hh = _compute_reform_household_values( + self.dataset_path, + self.time_period, + state, + n_hh, + reform_vars_list, + baseline_income_tax, + ) + if rerandomize_takeup: for spec in SIMPLE_TAKEUP_VARS: entity = spec["entity"] @@ -1085,6 +1202,7 @@ def _build_state_values( state_values[state] = { "hh": hh, "person": person, + "reform_hh": reform_hh, "entity": entity_vals, "entity_wf_false": entity_wf_false, } @@ -1272,6 +1390,7 @@ def 
_assemble_clone_values( person_hh_indices: np.ndarray, target_vars: set, constraint_vars: set, + reform_vars: set = None, county_values: dict = None, clone_counties: np.ndarray = None, county_dependent_vars: set = None, @@ -1296,9 +1415,11 @@ def _assemble_clone_values( be looked up by county instead of state. Returns: - (hh_vars, person_vars) where hh_vars maps variable - name to household-level float32 array and person_vars - maps constraint variable name to person-level array. + (hh_vars, person_vars, reform_hh_vars) where hh_vars maps + baseline variables to household-level float32 arrays, + person_vars maps constraint variables to person-level arrays, + and reform_hh_vars maps repeal-based expenditure targets to + household-level arrays. """ n_records = len(clone_states) n_persons = len(person_hh_indices) @@ -1353,7 +1474,23 @@ def _assemble_clone_values( arr[mask] = state_values[int(state)]["person"][var][mask] person_vars[var] = arr - return hh_vars, person_vars + reform_hh_vars = {} + for var in reform_vars or set(): + if not any( + var in state_values[int(state)].get("reform_hh", {}) + for state in unique_clone_states + ): + continue + arr = np.zeros(n_records, dtype=np.float32) + for state in unique_clone_states: + mask = state_masks[int(state)] + arr[mask] = state_values[int(state)].get("reform_hh", {}).get( + var, + np.zeros(mask.sum(), dtype=np.float32), + ) + reform_hh_vars[var] = arr + + return hh_vars, person_vars, reform_hh_vars # --------------------------------------------------------------- # Database queries @@ -1402,14 +1539,15 @@ def _query_targets(self, target_filter: dict) -> pd.DataFrame: query = f""" WITH filtered_targets AS ( - SELECT tv.target_id, tv.stratum_id, tv.variable, + SELECT tv.target_id, tv.stratum_id, tv.variable, tv.reform_id, tv.value, tv.period, tv.geo_level, tv.geographic_id, tv.domain_variable FROM target_overview tv - WHERE {where_clause} + WHERE tv.active = 1 + AND ({where_clause}) ), best_periods AS ( - SELECT 
stratum_id, variable, + SELECT stratum_id, variable, reform_id, CASE WHEN MAX(CASE WHEN period <= :time_period THEN period END) IS NOT NULL @@ -1418,13 +1556,14 @@ def _query_targets(self, target_filter: dict) -> pd.DataFrame: ELSE MIN(period) END as best_period FROM filtered_targets - GROUP BY stratum_id, variable + GROUP BY stratum_id, variable, reform_id ) SELECT ft.* FROM filtered_targets ft JOIN best_periods bp ON ft.stratum_id = bp.stratum_id AND ft.variable = bp.variable + AND ft.reform_id = bp.reform_id AND ft.period = bp.best_period ORDER BY ft.target_id """ @@ -1821,7 +1960,9 @@ def build_matrix( # 2. Sort targets by geographic level targets_df["_geo_level"] = targets_df["geographic_id"].apply(get_geo_level) - targets_df = targets_df.sort_values(["_geo_level", "variable", "geographic_id"]) + targets_df = targets_df.sort_values( + ["_geo_level", "variable", "reform_id", "geographic_id"] + ) targets_df = targets_df.drop(columns=["_geo_level"]).reset_index(drop=True) # 3. Build column index structures from geography @@ -1838,6 +1979,7 @@ def build_matrix( target_geo_info: List[Tuple[str, str]] = [] target_names: List[str] = [] non_geo_constraints_list: List[List[dict]] = [] + target_reform_ids: List[int] = [] for _, row in targets_df.iterrows(): sid = int(row["stratum_id"]) @@ -1851,12 +1993,23 @@ def build_matrix( non_geo = [c for c in constraints if c["variable"] not in _GEO_VARS] non_geo_constraints_list.append(non_geo) + reform_id = int(row.get("reform_id", 0)) + target_reform_ids.append(reform_id) target_names.append( - self._make_target_name(str(row["variable"]), constraints) + self._make_target_name( + str(row["variable"]), + constraints, + reform_id=reform_id, + ) ) unique_variables = set(targets_df["variable"].values) + reform_variables = { + str(row["variable"]) + for _, row in targets_df.iterrows() + if int(row.get("reform_id", 0)) > 0 + } # 5a. 
Collect unique constraint variables unique_constraint_vars = set() @@ -1870,6 +2023,7 @@ def build_matrix( sim, unique_variables, unique_constraint_vars, + reform_variables, geography, rerandomize_takeup=rerandomize_takeup, workers=workers, @@ -2003,8 +2157,10 @@ def build_matrix( "person_hh_indices": person_hh_indices, "unique_variables": unique_variables, "unique_constraint_vars": unique_constraint_vars, + "reform_vars": reform_variables, "county_dep_targets": county_dep_targets, "target_variables": target_variables, + "target_reform_ids": target_reform_ids, "target_geo_info": target_geo_info, "non_geo_constraints_list": (non_geo_constraints_list), "n_records": n_records, @@ -2103,12 +2259,13 @@ def build_matrix( len(np.unique(clone_states)), ) - hh_vars, person_vars = self._assemble_clone_values( + hh_vars, person_vars, reform_hh_vars = self._assemble_clone_values( state_values, clone_states, person_hh_indices, unique_variables, unique_constraint_vars, + reform_vars=reform_variables, county_values=county_values, clone_counties=clone_counties, county_dependent_vars=(county_dep_targets), @@ -2245,6 +2402,7 @@ def build_matrix( for row_idx in range(n_targets): variable = str(targets_df.iloc[row_idx]["variable"]) + reform_id = int(targets_df.iloc[row_idx].get("reform_id", 0)) geo_level, geo_id = target_geo_info[row_idx] non_geo = non_geo_constraints_list[row_idx] @@ -2291,6 +2449,7 @@ def build_matrix( non_geo_constraints=non_geo, n_households=n_records, hh_vars=hh_vars, + reform_hh_vars=reform_hh_vars, person_vars=person_vars, entity_rel=entity_rel, household_ids=household_ids, @@ -2298,7 +2457,8 @@ def build_matrix( ) values = count_cache[vkey] else: - if variable not in hh_vars: + source_vars = reform_hh_vars if reform_id > 0 else hh_vars + if variable not in source_vars: continue if constraint_key not in mask_cache: mask_cache[constraint_key] = ( @@ -2311,7 +2471,7 @@ def build_matrix( ) ) mask = mask_cache[constraint_key] - values = hh_vars[variable] * mask + 
values = source_vars[variable] * mask vals = values[rec_indices] nonzero = vals != 0 diff --git a/policyengine_us_data/calibration/validate_staging.py b/policyengine_us_data/calibration/validate_staging.py index eb46287f4..f13f441ad 100644 --- a/policyengine_us_data/calibration/validate_staging.py +++ b/policyengine_us_data/calibration/validate_staging.py @@ -33,6 +33,7 @@ UnifiedMatrixBuilder, _calculate_target_values_standalone, _GEO_VARS, + _make_neutralize_variable_reform, ) from policyengine_us_data.calibration.calibration_utils import ( STATE_CODES, @@ -122,7 +123,7 @@ def _run_sanity_check( def _query_all_active_targets(engine, period: int) -> pd.DataFrame: query = """ WITH best_periods AS ( - SELECT stratum_id, variable, + SELECT stratum_id, variable, reform_id, CASE WHEN MAX(CASE WHEN period <= :period THEN period END) IS NOT NULL @@ -132,15 +133,16 @@ def _query_all_active_targets(engine, period: int) -> pd.DataFrame: END as best_period FROM target_overview WHERE active = 1 - GROUP BY stratum_id, variable + GROUP BY stratum_id, variable, reform_id ) - SELECT tv.target_id, tv.stratum_id, tv.variable, + SELECT tv.target_id, tv.stratum_id, tv.variable, tv.reform_id, tv.value, tv.period, tv.geo_level, tv.geographic_id, tv.domain_variable FROM target_overview tv JOIN best_periods bp ON tv.stratum_id = bp.stratum_id AND tv.variable = bp.variable + AND tv.reform_id = bp.reform_id AND tv.period = bp.best_period WHERE tv.active = 1 ORDER BY tv.target_id @@ -268,6 +270,29 @@ def _build_entity_rel(sim) -> pd.DataFrame: ) +def _get_reform_household_values( + dataset_path: str, + period: int, + variable: str, + reform_hh_cache: dict, +) -> np.ndarray: + if variable in reform_hh_cache: + return reform_hh_cache[variable] + + from policyengine_us import Microsimulation + + reform_sim = Microsimulation( + dataset=dataset_path, + reform=_make_neutralize_variable_reform(variable), + ) + reform_hh_cache[variable] = reform_sim.calculate( + "income_tax", + map_to="household", 
+ period=period, + ).values + return reform_hh_cache[variable] + + def validate_area( sim, targets_df: pd.DataFrame, @@ -275,6 +300,7 @@ def validate_area( area_type: str, area_id: str, display_id: str, + dataset_path: str, period: int, training_mask: np.ndarray, variable_entity_map: dict, @@ -291,6 +317,7 @@ def validate_area( ).values.astype(np.float64) hh_vars_cache = {} + reform_hh_cache = {} person_vars_cache = {} training_arr = np.asarray(training_mask, dtype=bool) @@ -300,6 +327,7 @@ def validate_area( results = [] for i, (idx, row) in enumerate(targets_df.iterrows()): variable = row["variable"] + reform_id = int(row.get("reform_id", 0)) target_value = float(row["value"]) stratum_id = int(row["stratum_id"]) @@ -336,15 +364,32 @@ def validate_area( except Exception: pass + if reform_id > 0 and "income_tax" not in hh_vars_cache: + hh_vars_cache["income_tax"] = sim.calculate( + "income_tax", + map_to="household", + period=period, + ).values + if reform_id > 0 and variable not in reform_hh_cache: + reform_income_tax = _get_reform_household_values( + dataset_path, + period, + variable, + reform_hh_cache, + ) + reform_hh_cache[variable] = reform_income_tax - hh_vars_cache["income_tax"] + per_hh = _calculate_target_values_standalone( target_variable=variable, non_geo_constraints=non_geo, n_households=n_households, hh_vars=hh_vars_cache, + reform_hh_vars=reform_hh_cache, person_vars=person_vars_cache, entity_rel=entity_rel, household_ids=household_ids, variable_entity_map=variable_entity_map, + reform_id=reform_id, ) sim_value = float(np.dot(per_hh, hh_weight)) @@ -361,6 +406,7 @@ def validate_area( target_name = UnifiedMatrixBuilder._make_target_name( variable, constraints, + reform_id=reform_id, ) sanity_check, sanity_reason = _run_sanity_check( @@ -526,6 +572,7 @@ def _validate_single_area( area_type=area_type, area_id=area_id, display_id=display_id, + dataset_path=h5_path, period=period, training_mask=area_training, variable_entity_map=variable_entity_map, @@ 
-580,11 +627,13 @@ def _compute_district_contributions( ).values.astype(np.float64) hh_vars_cache = {} + reform_hh_cache = {} person_vars_cache = {} results = [] for i, (idx, row) in enumerate(state_targets_df.iterrows()): variable = row["variable"] + reform_id = int(row.get("reform_id", 0)) stratum_id = int(row["stratum_id"]) constraints = constraints_map.get(stratum_id, []) @@ -615,15 +664,32 @@ def _compute_district_contributions( except Exception: pass + if reform_id > 0 and "income_tax" not in hh_vars_cache: + hh_vars_cache["income_tax"] = sim.calculate( + "income_tax", + map_to="household", + period=period, + ).values + if reform_id > 0 and variable not in reform_hh_cache: + reform_income_tax = _get_reform_household_values( + district_h5_path, + period, + variable, + reform_hh_cache, + ) + reform_hh_cache[variable] = reform_income_tax - hh_vars_cache["income_tax"] + per_hh = _calculate_target_values_standalone( target_variable=variable, non_geo_constraints=non_geo, n_households=n_households, hh_vars=hh_vars_cache, + reform_hh_vars=reform_hh_cache, person_vars=person_vars_cache, entity_rel=entity_rel, household_ids=household_ids, variable_entity_map=variable_entity_map, + reform_id=reform_id, ) sim_value = float(np.dot(per_hh, hh_weight)) @@ -709,9 +775,14 @@ def _run_state_via_districts( row_data = state_targets.iloc[tidx] target_value = float(row_data["value"]) variable = row_data["variable"] + reform_id = int(row_data.get("reform_id", 0)) stratum_id = int(row_data["stratum_id"]) constraints = constraints_map.get(stratum_id, []) - target_name = UnifiedMatrixBuilder._make_target_name(variable, constraints) + target_name = UnifiedMatrixBuilder._make_target_name( + variable, + constraints, + reform_id=reform_id, + ) per_district_rows.append( { @@ -737,12 +808,17 @@ def _run_state_via_districts( for i in range(n_targets): row_data = state_targets.iloc[i] variable = row_data["variable"] + reform_id = int(row_data.get("reform_id", 0)) target_value = 
float(row_data["value"]) sim_value = float(aggregated[i]) stratum_id = int(row_data["stratum_id"]) constraints = constraints_map.get(stratum_id, []) - target_name = UnifiedMatrixBuilder._make_target_name(variable, constraints) + target_name = UnifiedMatrixBuilder._make_target_name( + variable, + constraints, + reform_id=reform_id, + ) error = sim_value - target_value abs_error = abs(error) diff --git a/policyengine_us_data/db/create_database_tables.py b/policyengine_us_data/db/create_database_tables.py index 4999a6f7f..86121f1d7 100644 --- a/policyengine_us_data/db/create_database_tables.py +++ b/policyengine_us_data/db/create_database_tables.py @@ -309,6 +309,7 @@ def validate_parent_child_constraints(mapper, connection, target: Stratum): t.target_id, t.stratum_id, t.variable, + t.reform_id, t.value, t.period, t.active, @@ -348,7 +349,7 @@ def validate_parent_child_constraints(mapper, connection, target: Stratum): FROM targets t LEFT JOIN stratum_constraints sc ON t.stratum_id = sc.stratum_id GROUP BY t.target_id, t.stratum_id, t.variable, - t.value, t.period, t.active; + t.reform_id, t.value, t.period, t.active; """ diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index a5e208687..12ec523bb 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -14,6 +14,8 @@ etl_argparser, ) +TAX_EXPENDITURE_REFORM_ID = 1 + def extract_national_targets(dataset: str = DEFAULT_DATASET): """ @@ -31,6 +33,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): Dictionary containing: - direct_sum_targets: Variables that can be summed directly - tax_filer_targets: Tax-related variables requiring filer constraint + - tax_expenditure_targets: Variables targeted via repeal-based tax expenditures - conditional_count_targets: Enrollment counts requiring constraints - cbo_targets: List of CBO projection targets - treasury_targets: List of Treasury/JCT 
targets @@ -56,18 +59,12 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): ) # Separate tax-related targets that need filer constraint - tax_filer_targets = [ - { - "variable": "qualified_business_income_deduction", - "value": 63.1e9, - "source": "Joint Committee on Taxation", - "notes": "QBI deduction tax expenditure", - "year": HARDCODED_YEAR, - }, - ] + tax_filer_targets = [] - # Itemized deduction targets need both filer and itemizer constraints - itemizer_targets = [ + # These JCT values are tax expenditures, not baseline deduction totals. + # They must be matched against repeal-based income tax deltas in the + # unified calibration path. + tax_expenditure_targets = [ { "variable": "salt_deduction", "value": 21.247e9, @@ -96,6 +93,13 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "notes": "Mortgage interest deduction tax expenditure", "year": HARDCODED_YEAR, }, + { + "variable": "qualified_business_income_deduction", + "value": 63.1e9, + "source": "Joint Committee on Taxation", + "notes": "QBI deduction tax expenditure", + "year": HARDCODED_YEAR, + }, ] direct_sum_targets = [ @@ -398,7 +402,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): return { "direct_sum_targets": direct_sum_targets, "tax_filer_targets": tax_filer_targets, - "itemizer_targets": itemizer_targets, + "tax_expenditure_targets": tax_expenditure_targets, "conditional_count_targets": conditional_count_targets, "cbo_targets": cbo_targets, "treasury_targets": treasury_targets, @@ -418,10 +422,10 @@ def transform_national_targets(raw_targets): Returns ------- tuple - (direct_targets_df, tax_filer_df, itemizer_df, conditional_targets) + (direct_targets_df, tax_filer_df, tax_expenditure_df, conditional_targets) - direct_targets_df: DataFrame with direct sum targets - tax_filer_df: DataFrame with tax-related targets needing filer constraint - - itemizer_df: DataFrame with itemized deduction targets needing filer + itemizer constraints + - 
tax_expenditure_df: DataFrame with reform-based tax expenditure targets - conditional_targets: List of conditional count targets """ @@ -450,19 +454,24 @@ def transform_national_targets(raw_targets): tax_filer_df = ( pd.DataFrame(all_tax_filer_targets) if all_tax_filer_targets else pd.DataFrame() ) - itemizer_df = ( - pd.DataFrame(raw_targets["itemizer_targets"]) - if raw_targets["itemizer_targets"] + tax_expenditure_df = ( + pd.DataFrame(raw_targets["tax_expenditure_targets"]) + if raw_targets["tax_expenditure_targets"] else pd.DataFrame() ) # Conditional targets stay as list for special processing conditional_targets = raw_targets["conditional_count_targets"] - return direct_df, tax_filer_df, itemizer_df, conditional_targets + return direct_df, tax_filer_df, tax_expenditure_df, conditional_targets -def load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditional_targets): +def load_national_targets( + direct_targets_df, + tax_filer_df, + tax_expenditure_df, + conditional_targets, +): """ Load national targets into the database. @@ -472,8 +481,8 @@ def load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditio DataFrame with direct sum target data tax_filer_df : pd.DataFrame DataFrame with tax-related targets needing filer constraint - itemizer_df : pd.DataFrame - DataFrame with itemized deduction targets needing filer + itemizer constraints + tax_expenditure_df : pd.DataFrame + DataFrame with reform-based tax expenditure targets conditional_targets : list List of conditional count targets requiring strata """ @@ -603,46 +612,49 @@ def load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditio session.add(target) print(f"Added filer target: {target_data['variable']}") - # Process itemized deduction targets that need filer + itemizer constraints - if not itemizer_df.empty: - national_itemizer_stratum = ( + # Process reform-based tax expenditure targets. 
+ if not tax_expenditure_df.empty: + migrated_strata = ( session.query(Stratum) .filter( Stratum.parent_stratum_id == us_stratum.stratum_id, - Stratum.notes == "United States - Itemizing Tax Filers", + Stratum.notes.in_( + [ + "United States - Tax Filers", + "United States - Itemizing Tax Filers", + ] + ), ) - .first() + .all() ) + migrated_stratum_ids = [s.stratum_id for s in migrated_strata] - if not national_itemizer_stratum: - national_itemizer_stratum = Stratum( - parent_stratum_id=us_stratum.stratum_id, - notes="United States - Itemizing Tax Filers", - ) - national_itemizer_stratum.constraints_rel = [ - StratumConstraint( - constraint_variable="tax_unit_is_filer", - operation="==", - value="1", - ), - StratumConstraint( - constraint_variable="tax_unit_itemizes", - operation="==", - value="1", - ), - ] - session.add(national_itemizer_stratum) - session.flush() - print("Created national itemizer stratum") - - for _, target_data in itemizer_df.iterrows(): + for _, target_data in tax_expenditure_df.iterrows(): target_year = target_data["year"] + + # Clean up incorrectly scoped baseline rows from older DBs. 
+ if migrated_stratum_ids: + stale_targets = ( + session.query(Target) + .filter( + Target.stratum_id.in_(migrated_stratum_ids), + Target.variable == target_data["variable"], + Target.period == target_year, + Target.reform_id == 0, + Target.active == True, + ) + .all() + ) + for stale_target in stale_targets: + stale_target.active = False + existing_target = ( session.query(Target) .filter( - Target.stratum_id == national_itemizer_stratum.stratum_id, + Target.stratum_id == us_stratum.stratum_id, Target.variable == target_data["variable"], Target.period == target_year, + Target.reform_id == TAX_EXPENDITURE_REFORM_ID, ) .first() ) @@ -650,6 +662,9 @@ def load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditio notes_parts = [] if pd.notna(target_data.get("notes")): notes_parts.append(target_data["notes"]) + notes_parts.append( + "Modeled as repeal-based income tax expenditure target" + ) notes_parts.append(f"Source: {target_data.get('source', 'Unknown')}") combined_notes = " | ".join(notes_parts) @@ -657,19 +672,25 @@ def load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditio existing_target.value = target_data["value"] existing_target.notes = combined_notes existing_target.source = "PolicyEngine" - print(f"Updated itemizer target: {target_data['variable']}") + existing_target.active = True + print( + f"Updated tax expenditure target: {target_data['variable']}" + ) else: target = Target( - stratum_id=national_itemizer_stratum.stratum_id, + stratum_id=us_stratum.stratum_id, variable=target_data["variable"], period=target_year, + reform_id=TAX_EXPENDITURE_REFORM_ID, value=target_data["value"], active=True, source="PolicyEngine", notes=combined_notes, ) session.add(target) - print(f"Added itemizer target: {target_data['variable']}") + print( + f"Added tax expenditure target: {target_data['variable']}" + ) # Process conditional count targets (enrollment counts) for cond_target in conditional_targets: @@ -767,12 +788,15 @@ def 
load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditio session.commit() total_targets = ( - len(direct_targets_df) + len(tax_filer_df) + len(itemizer_df) + len(conditional_targets) + len(direct_targets_df) + + len(tax_filer_df) + + len(tax_expenditure_df) + + len(conditional_targets) ) print(f"\nSuccessfully loaded {total_targets} national targets") print(f" - {len(direct_targets_df)} direct sum targets") print(f" - {len(tax_filer_df)} tax filer targets") - print(f" - {len(itemizer_df)} itemizer targets") + print(f" - {len(tax_expenditure_df)} tax expenditure targets") print(f" - {len(conditional_targets)} enrollment count targets (as strata)") @@ -788,13 +812,23 @@ def main(): # Transform print("Transforming targets...") - direct_targets_df, tax_filer_df, itemizer_df, conditional_targets = transform_national_targets( + ( + direct_targets_df, + tax_filer_df, + tax_expenditure_df, + conditional_targets, + ) = transform_national_targets( raw_targets ) # Load print("Loading targets into database...") - load_national_targets(direct_targets_df, tax_filer_df, itemizer_df, conditional_targets) + load_national_targets( + direct_targets_df, + tax_filer_df, + tax_expenditure_df, + conditional_targets, + ) print("\nETL pipeline complete!") diff --git a/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py index 492719d9e..da0f49882 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py @@ -49,6 +49,7 @@ def _create_test_db(db_path): "target_id INTEGER PRIMARY KEY, " "stratum_id INTEGER, " "variable TEXT, " + "reform_id INTEGER DEFAULT 0, " "value REAL, " "period INTEGER, " "active INTEGER DEFAULT 1)" @@ -110,33 +111,41 @@ def _insert_aca_ptc_data(engine): ) targets = [ - (1, 1, "aca_ptc", 10000.0, 2022), - (2, 1, "tax_unit_count", 500.0, 2022), - 
(3, 2, "aca_ptc", 6000.0, 2022), - (4, 2, "tax_unit_count", 300.0, 2022), - (5, 3, "aca_ptc", 4000.0, 2022), - (6, 3, "tax_unit_count", 200.0, 2022), - (7, 4, "aca_ptc", 2000.0, 2022), - (8, 5, "aca_ptc", 2500.0, 2022), - (9, 6, "aca_ptc", 1500.0, 2022), - (10, 4, "tax_unit_count", 100.0, 2022), - (11, 5, "tax_unit_count", 120.0, 2022), - (12, 6, "tax_unit_count", 80.0, 2022), - (13, 7, "aca_ptc", 2200.0, 2022), - (14, 8, "aca_ptc", 1800.0, 2022), - (15, 7, "tax_unit_count", 110.0, 2022), - (16, 8, "tax_unit_count", 90.0, 2022), - (17, 9, "person_count", 19743689.0, 2024), + (1, 1, "aca_ptc", 0, 10000.0, 2022, 1), + (2, 1, "tax_unit_count", 0, 500.0, 2022, 1), + (3, 2, "aca_ptc", 0, 6000.0, 2022, 1), + (4, 2, "tax_unit_count", 0, 300.0, 2022, 1), + (5, 3, "aca_ptc", 0, 4000.0, 2022, 1), + (6, 3, "tax_unit_count", 0, 200.0, 2022, 1), + (7, 4, "aca_ptc", 0, 2000.0, 2022, 1), + (8, 5, "aca_ptc", 0, 2500.0, 2022, 1), + (9, 6, "aca_ptc", 0, 1500.0, 2022, 1), + (10, 4, "tax_unit_count", 0, 100.0, 2022, 1), + (11, 5, "tax_unit_count", 0, 120.0, 2022, 1), + (12, 6, "tax_unit_count", 0, 80.0, 2022, 1), + (13, 7, "aca_ptc", 0, 2200.0, 2022, 1), + (14, 8, "aca_ptc", 0, 1800.0, 2022, 1), + (15, 7, "tax_unit_count", 0, 110.0, 2022, 1), + (16, 8, "tax_unit_count", 0, 90.0, 2022, 1), + (17, 9, "person_count", 0, 19743689.0, 2024, 1), + (18, 1, "aca_ptc", 1, 999.0, 2022, 1), + (19, 1, "aca_ptc", 0, 12345.0, 2024, 0), ] - for tid, sid, var, val, period in targets: + for tid, sid, var, reform_id, val, period, active in targets: conn.execute( - text("INSERT INTO targets VALUES (:tid, :sid, :var, :val, :period, 1)"), + text( + "INSERT INTO targets " + "(target_id, stratum_id, variable, reform_id, value, period, active) " + "VALUES (:tid, :sid, :var, :reform_id, :val, :period, :active)" + ), { "tid": tid, "sid": sid, "var": var, + "reform_id": reform_id, "val": val, "period": period, + "active": active, }, ) conn.commit() @@ -192,6 +201,30 @@ def test_geographic_id_populated(self): 
state_ca = df[(df["geo_level"] == "state") & (df["geographic_id"] == "6")] self.assertGreater(len(state_ca), 0) + def test_reform_targets_preserved(self): + b = self._make_builder() + df = b._query_targets({"domain_variables": ["aca_ptc"]}) + reform_rows = df[(df["variable"] == "aca_ptc") & (df["reform_id"] == 1)] + baseline_rows = df[(df["variable"] == "aca_ptc") & (df["reform_id"] == 0)] + self.assertEqual(len(reform_rows), 1) + self.assertGreater(len(baseline_rows), 0) + + def test_inactive_targets_are_excluded(self): + b = self._make_builder(time_period=2024) + df = b._query_targets({"stratum_ids": [1], "variables": ["aca_ptc"]}) + baseline_rows = df[(df["variable"] == "aca_ptc") & (df["reform_id"] == 0)] + self.assertEqual(len(baseline_rows), 1) + self.assertEqual(int(baseline_rows.iloc[0]["period"]), 2022) + self.assertEqual(float(baseline_rows.iloc[0]["value"]), 10000.0) + + def test_target_name_adds_expenditure_suffix_for_reforms(self): + name = UnifiedMatrixBuilder._make_target_name( + "salt_deduction", + [], + reform_id=1, + ) + self.assertEqual(name, "national/salt_deduction_expenditure") + class TestHierarchicalUprating(unittest.TestCase): @classmethod diff --git a/policyengine_us_data/tests/test_schema_views_and_lookups.py b/policyengine_us_data/tests/test_schema_views_and_lookups.py index c8e5f4f8a..e4fea0f08 100644 --- a/policyengine_us_data/tests/test_schema_views_and_lookups.py +++ b/policyengine_us_data/tests/test_schema_views_and_lookups.py @@ -66,6 +66,7 @@ def _add_target( period: int, value: float, active: bool = True, + reform_id: int = 0, ) -> Target: """Insert a target row.""" target = Target( @@ -74,6 +75,7 @@ def _add_target( period=period, value=value, active=active, + reform_id=reform_id, ) session.add(target) session.commit() @@ -371,6 +373,32 @@ def test_active_flag_passthrough(self): elif r[var_idx] == "household_count": self.assertFalse(bool(r[active_idx])) + def test_reform_id_passthrough(self): + """Reform targets retain their 
reform_id in target_overview.""" + with Session(self.engine) as session: + _add_target( + session, + self.national_id, + "salt_deduction", + 2024, + 21.247e9, + reform_id=1, + ) + + rows = self._query_target_overview() + cols = self._overview_columns() + sid_idx = cols.index("stratum_id") + var_idx = cols.index("variable") + reform_idx = cols.index("reform_id") + + matches = [ + r + for r in rows + if r[sid_idx] == self.national_id and r[var_idx] == "salt_deduction" + ] + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0][reform_idx], 1) + # ---------------------------------------------------------------- # get_geographic_strata() # ---------------------------------------------------------------- From e75dc7c217317cd1c1510a0fb9ced71dfc116167 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 27 Mar 2026 11:15:41 -0400 Subject: [PATCH 3/3] Format files for lint --- .../calibration/unified_matrix_builder.py | 20 +++++++++++++------ policyengine_us_data/db/etl_irs_soi.py | 6 +++++- .../db/etl_national_targets.py | 12 +++-------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/policyengine_us_data/calibration/unified_matrix_builder.py b/policyengine_us_data/calibration/unified_matrix_builder.py index 09c121935..1e1bb0055 100644 --- a/policyengine_us_data/calibration/unified_matrix_builder.py +++ b/policyengine_us_data/calibration/unified_matrix_builder.py @@ -492,9 +492,13 @@ def _assemble_clone_values_standalone( arr = np.zeros(n_records, dtype=np.float32) for state in unique_clone_states: mask = state_masks[int(state)] - arr[mask] = state_values[int(state)].get("reform_hh", {}).get( - var, - np.zeros(mask.sum(), dtype=np.float32), + arr[mask] = ( + state_values[int(state)] + .get("reform_hh", {}) + .get( + var, + np.zeros(mask.sum(), dtype=np.float32), + ) ) reform_hh_vars[var] = arr @@ -1484,9 +1488,13 @@ def _assemble_clone_values( arr = np.zeros(n_records, dtype=np.float32) for state in unique_clone_states: mask = 
state_masks[int(state)] - arr[mask] = state_values[int(state)].get("reform_hh", {}).get( - var, - np.zeros(mask.sum(), dtype=np.float32), + arr[mask] = ( + state_values[int(state)] + .get("reform_hh", {}) + .get( + var, + np.zeros(mask.sum(), dtype=np.float32), + ) ) reform_hh_vars[var] = arr diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 33f08cef0..8e9543da8 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -29,7 +29,11 @@ logger = logging.getLogger(__name__) -ITEMIZED_DEDUCTION_VARIABLES = {"salt", "real_estate_taxes", "medical_expense_deduction"} +ITEMIZED_DEDUCTION_VARIABLES = { + "salt", + "real_estate_taxes", + "medical_expense_deduction", +} # IRS SOI data is typically available ~2 years after the tax year IRS_SOI_LAG_YEARS = 2 diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 12ec523bb..278e3a909 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -673,9 +673,7 @@ def load_national_targets( existing_target.notes = combined_notes existing_target.source = "PolicyEngine" existing_target.active = True - print( - f"Updated tax expenditure target: {target_data['variable']}" - ) + print(f"Updated tax expenditure target: {target_data['variable']}") else: target = Target( stratum_id=us_stratum.stratum_id, @@ -688,9 +686,7 @@ def load_national_targets( notes=combined_notes, ) session.add(target) - print( - f"Added tax expenditure target: {target_data['variable']}" - ) + print(f"Added tax expenditure target: {target_data['variable']}") # Process conditional count targets (enrollment counts) for cond_target in conditional_targets: @@ -817,9 +813,7 @@ def main(): tax_filer_df, tax_expenditure_df, conditional_targets, - ) = transform_national_targets( - raw_targets - ) + ) = transform_national_targets(raw_targets) # Load print("Loading 
targets into database...")