diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..2e4cbce3 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,8 @@ +- bump: minor + changes: + added: + - Census block-level geographic assignment for households in CD-stacked datasets + - Comprehensive geography variables in output (block_geoid, tract_geoid, cbsa_code, sldu, sldl, place_fips, vtd, puma, zcta) + - Block crosswalk file mapping 8.1M blocks to all Census geographies + - Block-to-CD distribution file for population-weighted assignment + - ZCTA (ZIP Code Tabulation Area) lookup from census block diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/block_assignment.py b/policyengine_us_data/datasets/cps/local_area_calibration/block_assignment.py new file mode 100644 index 00000000..73b435f6 --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/block_assignment.py @@ -0,0 +1,597 @@ +""" +Census block assignment for congressional districts. + +Provides population-weighted random census block assignment for households +within each congressional district. This enables consistent lookup of all +geographic variables from a single block GEOID: + +- State, County, Tract (derived from block GEOID structure) +- CBSA/Metro area (via county crosswalk) +- SLDU/SLDL (State Legislative Districts) +- Place/City (via Census BAF) +- PUMA (via tract crosswalk) +- VTD (Voting Tabulation District) +- ZCTA (ZIP Code Tabulation Area) + +The distributions are computed from Census block-level population data and +stored in storage/block_cd_distributions.csv.gz. Block GEOIDs are 15-digit +strings in format SSCCCTTTTTTBBBB (state, county, tract, block). + +Additional geography lookups use storage/block_crosswalk.csv.gz which maps +blocks to SLDU, SLDL, Place, VTD, PUMA, and ZCTA. +""" + +import random +import re +from functools import lru_cache +from io import StringIO +from typing import Dict, Optional + +import numpy as np +import pandas as pd +import requests + +from policyengine_us.variables.household.demographic.geographic.county.county_enum import ( + County, +) +from policyengine_us_data.storage import STORAGE_FOLDER + +# === GEOID Parsing Functions === +# Block GEOID format: SSCCCTTTTTTBBBB (15 chars) +# SS = State FIPS (2 digits) +# CCC = County FIPS (3 digits) +# TTTTTT = Tract (6 digits) +# BBBB = Block (4 digits) + + +def get_state_fips_from_block(block_geoid: str) -> str: + """Extract 2-digit state FIPS from block GEOID.""" + return block_geoid[:2] + + +def get_county_fips_from_block(block_geoid: str) -> str: + """Extract 5-digit county FIPS (state + county) from block GEOID.""" + return block_geoid[:5] + + +def get_tract_geoid_from_block(block_geoid: str) -> str: + """Extract 11-digit tract GEOID (state + county + tract) from block GEOID.""" + return block_geoid[:11] + + +# === County FIPS to Enum Mapping === + + +@lru_cache(maxsize=1) +def _build_county_fips_to_enum() -> Dict[str, str]: + """ + Build mapping from 5-digit county FIPS to County enum name. + + Downloads Census county FIPS file and matches to County enum names. + Cached to avoid repeated downloads. 
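+
+    Illustrative transformations (examples only; whether a given name is
+    actually a County enum member depends on policyengine-us):
+        "St. Clair County" (AL)       -> "ST_CLAIR_COUNTY_AL"
+        "Prince George's County" (MD) -> "PRINCE_GEORGES_COUNTY_MD"
+        "Miami-Dade County" (FL)      -> "MIAMI_DADE_COUNTY_FL"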
+    """
+    url = "https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt"
+    response = requests.get(url, timeout=60)
+    df = pd.read_csv(
+        StringIO(response.text),
+        delimiter="|",
+        dtype=str,
+        usecols=["STATE", "STATEFP", "COUNTYFP", "COUNTYNAME"],
+    )
+
+    valid_enum_names = set(County._member_names_)
+    fips_to_enum = {}
+
+    for _, row in df.iterrows():
+        county_fips = row["STATEFP"] + row["COUNTYFP"]
+        state_code = row["STATE"]
+        county_name = row["COUNTYNAME"]
+
+        # Transform to enum name format
+        enum_name = county_name.upper()
+        enum_name = re.sub(r"[.'\"]", "", enum_name)
+        enum_name = enum_name.replace("-", "_")
+        enum_name = enum_name.replace(" ", "_")
+        enum_name = f"{enum_name}_{state_code}"
+
+        if enum_name in valid_enum_names:
+            fips_to_enum[county_fips] = enum_name
+
+    return fips_to_enum
+
+
+def get_county_enum_index_from_block(block_geoid: str) -> int:
+    """
+    Get County enum index from block GEOID.
+
+    Args:
+        block_geoid: 15-digit census block GEOID
+
+    Returns:
+        Integer index into County enum, or UNKNOWN index if not found
+    """
+    county_fips = get_county_fips_from_block(block_geoid)
+    fips_to_enum = _build_county_fips_to_enum()
+    enum_name = fips_to_enum.get(county_fips, "UNKNOWN")
+    return County._member_names_.index(enum_name)
+
+
+# === CBSA Lookup ===
+
+
+@lru_cache(maxsize=1)
+def _load_cbsa_crosswalk() -> Dict[str, str]:
+    """
+    Load county FIPS to CBSA code crosswalk from NBER.
+
+    Returns:
+        Dict mapping 5-digit county FIPS to CBSA code; counties that are
+        not part of any CBSA are simply absent from the dict.
+    """
+    url = "https://data.nber.org/cbsa-csa-fips-county-crosswalk/2023/cbsa2fipsxw_2023.csv"
+    try:
+        df = pd.read_csv(url, dtype=str)
+        # Build 5-digit county FIPS from state + county codes
+        df["county_fips"] = df["fipsstatecode"] + df["fipscountycode"]
+        # Only include rows with valid CBSA codes (not blank/NA)
+        df = df.dropna(subset=["cbsacode"])
+        df = df[df["cbsacode"].str.strip() != ""]
+
+        return dict(zip(df["county_fips"], df["cbsacode"]))
+    except Exception:
+        # Return empty dict if download fails - rural areas will return None
+        return {}
+
+
+def get_cbsa_from_county(county_fips: str) -> Optional[str]:
+    """
+    Get CBSA code for a county.
+
+    Args:
+        county_fips: 5-digit county FIPS code
+
+    Returns:
+        CBSA code (e.g., "35620" for NYC metro) or None if not in CBSA
+    """
+    crosswalk = _load_cbsa_crosswalk()
+    return crosswalk.get(county_fips)
+
+
+# === Block Crosswalk for Additional Geographies ===
+
+
+@lru_cache(maxsize=1)
+def _load_block_crosswalk() -> pd.DataFrame:
+    """
+    Load block-level crosswalk for SLDU, SLDL, Place, VTD, PUMA, and
+    ZCTA (if present in the file).
+
+    Returns:
+        DataFrame indexed by block_geoid with columns for each geography.
+    """
+    csv_path = STORAGE_FOLDER / "block_crosswalk.csv.gz"
+
+    if not csv_path.exists():
+        print(
+            f"Warning: {csv_path} not found. "
+            "Run make_block_crosswalk.py to generate."
+ ) + return pd.DataFrame() + + # Load all columns as strings, then set index + # (using index_col directly can convert to int, dropping leading zeros) + df = pd.read_csv(csv_path, dtype=str) + df = df.set_index("block_geoid") + return df + + +def get_sldu_from_block(block_geoid: str) -> Optional[str]: + """Get State Legislative District Upper from block GEOID.""" + crosswalk = _load_block_crosswalk() + if block_geoid in crosswalk.index: + val = crosswalk.loc[block_geoid, "sldu"] + return val if pd.notna(val) else None + return None + + +def get_sldl_from_block(block_geoid: str) -> Optional[str]: + """Get State Legislative District Lower from block GEOID.""" + crosswalk = _load_block_crosswalk() + if block_geoid in crosswalk.index: + val = crosswalk.loc[block_geoid, "sldl"] + return val if pd.notna(val) else None + return None + + +def get_place_fips_from_block(block_geoid: str) -> Optional[str]: + """Get Place/City FIPS from block GEOID.""" + crosswalk = _load_block_crosswalk() + if block_geoid in crosswalk.index: + val = crosswalk.loc[block_geoid, "place_fips"] + return val if pd.notna(val) else None + return None + + +def get_vtd_from_block(block_geoid: str) -> Optional[str]: + """Get Voting Tabulation District from block GEOID.""" + crosswalk = _load_block_crosswalk() + if block_geoid in crosswalk.index: + val = crosswalk.loc[block_geoid, "vtd"] + return val if pd.notna(val) else None + return None + + +def get_puma_from_block(block_geoid: str) -> Optional[str]: + """Get PUMA (Public Use Microdata Area) from block GEOID.""" + crosswalk = _load_block_crosswalk() + if block_geoid in crosswalk.index: + val = crosswalk.loc[block_geoid, "puma"] + return val if pd.notna(val) else None + return None + + +def get_zcta_from_block(block_geoid: str) -> Optional[str]: + """Get ZCTA (ZIP Code Tabulation Area) from block GEOID.""" + crosswalk = _load_block_crosswalk() + if "zcta" not in crosswalk.columns: + return None + if block_geoid in crosswalk.index: + val = crosswalk.loc[block_geoid, "zcta"] + return val if pd.notna(val) else None + return None + + +def get_all_geography_from_block(block_geoid: str) -> Dict[str, Optional[str]]: + """ + Get all geographic variables from a single block GEOID lookup. + + More efficient than calling individual functions when you need multiple + geographies, as it only does one crosswalk lookup. + + Args: + block_geoid: 15-digit census block GEOID + + Returns: + Dict with keys: sldu, sldl, place_fips, vtd, puma, zcta + Values are strings or None if not available. + """ + crosswalk = _load_block_crosswalk() + has_zcta = "zcta" in crosswalk.columns + if block_geoid in crosswalk.index: + row = crosswalk.loc[block_geoid] + result = { + "sldu": row["sldu"] if pd.notna(row["sldu"]) else None, + "sldl": row["sldl"] if pd.notna(row["sldl"]) else None, + "place_fips": ( + row["place_fips"] if pd.notna(row["place_fips"]) else None + ), + "vtd": row["vtd"] if pd.notna(row["vtd"]) else None, + "puma": row["puma"] if pd.notna(row["puma"]) else None, + "zcta": ( + row["zcta"] if has_zcta and pd.notna(row["zcta"]) else None + ), + } + return result + return { + "sldu": None, + "sldl": None, + "place_fips": None, + "vtd": None, + "puma": None, + "zcta": None, + } + + +# === Block Distribution Loading/Generation === + + +def _load_block_distributions() -> Dict[str, Dict[str, float]]: + """ + Load pre-computed P(block|CD) distributions from CSV. 
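+
+    The CSV has one row per (CD, block) pair with columns cd_geoid,
+    block_geoid, probability; a hypothetical row
+    "3610,360610001001000,0.00012" becomes
+    distributions["3610"]["360610001001000"] = 0.00012.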
+
+    Returns:
+        Dict mapping CD GEOID to Dict[block_geoid, probability]
+    """
+    csv_path = STORAGE_FOLDER / "block_cd_distributions.csv.gz"
+
+    if not csv_path.exists():
+        print(
+            f"Warning: {csv_path} not found. "
+            "Run make_block_cd_distributions.py to generate."
+        )
+        return {}
+
+    df = pd.read_csv(csv_path, dtype={"block_geoid": str})
+    distributions = {}
+    for cd_geoid, group in df.groupby("cd_geoid"):
+        distributions[str(int(cd_geoid))] = dict(
+            zip(group["block_geoid"], group["probability"])
+        )
+    return distributions
+
+
+# Module-level cache; distributions are lazily loaded on first use
+_BLOCK_DISTRIBUTIONS: Dict[str, Dict[str, float]] = {}
+
+
+def _get_block_distributions() -> Dict[str, Dict[str, float]]:
+    """Get block distributions, loading if not already loaded."""
+    global _BLOCK_DISTRIBUTIONS
+    if not _BLOCK_DISTRIBUTIONS:
+        _BLOCK_DISTRIBUTIONS = _load_block_distributions()
+    return _BLOCK_DISTRIBUTIONS
+
+
+# === Assignment Functions ===
+
+
+def _generate_fallback_blocks(cd_geoid: str, n_households: int) -> np.ndarray:
+    """
+    Generate fallback block GEOIDs for CDs not in pre-computed data.
+
+    Uses county assignment as a fallback and generates synthetic but
+    structurally valid block GEOIDs. Used primarily for testing.
+
+    Args:
+        cd_geoid: Congressional district geoid
+        n_households: Number of blocks to generate
+
+    Returns:
+        Array of 15-character block GEOID strings
+    """
+    # Import here to avoid circular dependency
+    from policyengine_us_data.datasets.cps.local_area_calibration.county_assignment import (
+        assign_counties_for_cd,
+    )
+
+    # Fall back to county assignment, seeded deterministically from the
+    # CD geoid (int() rather than hash(), which is salted per process)
+    county_indices = assign_counties_for_cd(
+        cd_geoid, n_households, seed=int(cd_geoid) % (2**31)
+    )
+
+    # Convert county indices to block GEOIDs
+    fips_to_enum = _build_county_fips_to_enum()
+    enum_to_fips = {v: k for k, v in fips_to_enum.items()}
+
+    blocks = []
+    for idx in county_indices:
+        county_name = County._member_names_[idx]
+        county_fips = enum_to_fips.get(county_name, "00000")
+        # Generate synthetic block: county_fips + tract (000100) + block (1000)
+        block_geoid = county_fips + "0001001000"
+        blocks.append(block_geoid)
+
+    return np.array(blocks)
+
+
+def assign_blocks_for_cd(
+    cd_geoid: str,
+    n_households: int,
+    seed: int,
+    distributions: Dict[str, Dict[str, float]] = None,
+) -> np.ndarray:
+    """
+    Assign census block GEOIDs to households in a CD using weighted random selection.
+
+    Uses pre-computed P(block|CD) distributions from Census population data.
+    Falls back to county-based synthetic blocks for CDs not in pre-computed data.
+
+    Args:
+        cd_geoid: Congressional district geoid (e.g., "3610")
+        n_households: Number of households to assign
+        seed: Random seed for reproducibility
+        distributions: Optional override distributions. If None, uses
+            pre-computed distributions from CSV.
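+            Same shape as the pre-computed data, e.g. (probabilities
+            invented for illustration):
+            {"3610": {"360610001001000": 0.6, "360470001001000": 0.4}}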
+
+    Returns:
+        Array of 15-character block GEOID strings
+    """
+    random.seed(seed)
+
+    if distributions is None:
+        distributions = _get_block_distributions()
+
+    cd_key = str(int(cd_geoid))
+
+    if cd_key not in distributions:
+        # Fall back to county-based assignment for unknown CDs (e.g., in tests)
+        return _generate_fallback_blocks(cd_geoid, n_households)
+
+    dist = distributions[cd_key]
+    blocks = list(dist.keys())
+    weights = list(dist.values())
+    selected = random.choices(blocks, weights=weights, k=n_households)
+    return np.array(selected)
+
+
+def assign_geography_for_cd(
+    cd_geoid: str,
+    n_households: int,
+    seed: int,
+    distributions: Dict[str, Dict[str, float]] = None,
+) -> Dict[str, np.ndarray]:
+    """
+    Assign all geographic variables for households in a CD.
+
+    This is the main entry point that assigns a census block and then
+    derives all other geographic variables from it, ensuring consistency.
+
+    Args:
+        cd_geoid: Congressional district geoid (e.g., "3610")
+        n_households: Number of households to assign
+        seed: Random seed for reproducibility
+        distributions: Optional override distributions
+
+    Returns:
+        Dict with arrays for each geography:
+        - block_geoid: 15-char block GEOID strings
+        - county_fips: 5-digit county FIPS strings
+        - tract_geoid: 11-digit tract GEOID strings
+        - state_fips: 2-digit state FIPS strings
+        - cbsa_code: CBSA code strings (or "" if not in CBSA)
+        - sldu: State Legislative District Upper (or "" if not available)
+        - sldl: State Legislative District Lower (or "" if not available)
+        - place_fips: Place/City FIPS (or "" if not in a place)
+        - vtd: Voting Tabulation District (or "" if not available)
+        - puma: Public Use Microdata Area (or "" if not available)
+        - zcta: ZIP Code Tabulation Area (or "" if not available)
+        - county_index: int32 indices into County enum (for backwards compat)
+    """
+    # Assign blocks first
+    block_geoids = assign_blocks_for_cd(
+        cd_geoid, n_households, seed, distributions
+    )
+
+    # Derive geography directly from block GEOID structure
+    county_fips = np.array(
+        [get_county_fips_from_block(b) for b in block_geoids]
+    )
+    tract_geoids = np.array(
+        [get_tract_geoid_from_block(b) for b in block_geoids]
+    )
+    state_fips = np.array([get_state_fips_from_block(b) for b in block_geoids])
+
+    # CBSA lookup via county (may be None for rural areas)
+    cbsa_codes = np.array([get_cbsa_from_county(c) or "" for c in county_fips])
+
+    # County enum indices for backwards compatibility
+    county_indices = np.array(
+        [get_county_enum_index_from_block(b) for b in block_geoids],
+        dtype=np.int32,
+    )
+
+    # Look up additional geographies from the block crosswalk directly:
+    # one row probe per block instead of six separate helper calls
+    crosswalk = _load_block_crosswalk()
+    has_zcta = "zcta" in crosswalk.columns
+
+    sldu_list = []
+    sldl_list = []
+    place_fips_list = []
+    vtd_list = []
+    puma_list = []
+    zcta_list = []
+
+    for b in block_geoids:
+        if not crosswalk.empty and b in crosswalk.index:
+            row = crosswalk.loc[b]
+            sldu_list.append(row["sldu"] if pd.notna(row["sldu"]) else "")
+            sldl_list.append(row["sldl"] if pd.notna(row["sldl"]) else "")
+            place_fips_list.append(
+                row["place_fips"] if pd.notna(row["place_fips"]) else ""
+            )
+            vtd_list.append(row["vtd"] if pd.notna(row["vtd"]) else "")
+            puma_list.append(row["puma"] if pd.notna(row["puma"]) else "")
+            if has_zcta:
+                zcta_list.append(row["zcta"] if pd.notna(row["zcta"]) else "")
+            else:
+                zcta_list.append("")
+        else:
+            sldu_list.append("")
+            sldl_list.append("")
+            place_fips_list.append("")
+            vtd_list.append("")
puma_list.append("") + zcta_list.append("") + + return { + "block_geoid": block_geoids, + "county_fips": county_fips, + "tract_geoid": tract_geoids, + "state_fips": state_fips, + "cbsa_code": cbsa_codes, + "sldu": np.array(sldu_list), + "sldl": np.array(sldl_list), + "place_fips": np.array(place_fips_list), + "vtd": np.array(vtd_list), + "puma": np.array(puma_list), + "zcta": np.array(zcta_list), + "county_index": county_indices, + } + + +# === County Filter Functions (for city-level datasets) === + + +def get_county_filter_probability( + cd_geoid: str, + county_filter: set, +) -> float: + """ + Calculate P(county in filter | CD) using block-level data. + + Returns the probability that a household in this CD would be in the + target area (e.g., NYC). Used for weight scaling when building + city-level datasets. + + Args: + cd_geoid: Congressional district geoid (e.g., "3610") + county_filter: Set of county enum names that define the target area + + Returns: + Probability between 0 and 1 + """ + distributions = _get_block_distributions() + cd_key = str(int(cd_geoid)) + + if cd_key not in distributions: + return 0.0 + + dist = distributions[cd_key] + + # Convert county enum names to FIPS codes for comparison + fips_to_enum = _build_county_fips_to_enum() + enum_to_fips = {v: k for k, v in fips_to_enum.items()} + target_fips = {enum_to_fips.get(name) for name in county_filter} + target_fips.discard(None) + + # Sum probabilities of blocks in target counties + return sum( + prob + for block, prob in dist.items() + if get_county_fips_from_block(block) in target_fips + ) + + +def get_filtered_block_distribution( + cd_geoid: str, + county_filter: set, +) -> Dict[str, float]: + """ + Get normalized distribution over blocks in target counties only. + + Used when building city-level datasets to assign only blocks in valid + counties while maintaining relative proportions within the target area. + + Args: + cd_geoid: Congressional district geoid (e.g., "3610") + county_filter: Set of county enum names that define the target area + + Returns: + Dictionary mapping block GEOIDs to normalized probabilities. + Empty dict if CD has no overlap with target area. 
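+
+    Example (sketch; KINGS_COUNTY_NY is assumed to follow the enum naming
+    pattern, NEW_YORK_COUNTY_NY is exercised in the tests):
+        dist = get_filtered_block_distribution(
+            "3610", {"NEW_YORK_COUNTY_NY", "KINGS_COUNTY_NY"}
+        )
+        # Only blocks whose GEOID starts with 36061 or 36047 remain,
+        # and their probabilities are rescaled to sum to 1.0.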
+ """ + distributions = _get_block_distributions() + cd_key = str(int(cd_geoid)) + + if cd_key not in distributions: + return {} + + dist = distributions[cd_key] + + # Convert county enum names to FIPS codes for comparison + fips_to_enum = _build_county_fips_to_enum() + enum_to_fips = {v: k for k, v in fips_to_enum.items()} + target_fips = {enum_to_fips.get(name) for name in county_filter} + target_fips.discard(None) + + # Filter to blocks in target counties + filtered = { + block: prob + for block, prob in dist.items() + if get_county_fips_from_block(block) in target_fips + } + + # Normalize + total = sum(filtered.values()) + if total > 0: + return {block: prob / total for block, prob in filtered.items()} + return {} diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py index c3559071..209218cf 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py @@ -24,10 +24,10 @@ from policyengine_us.variables.household.demographic.geographic.county.county_enum import ( County, ) -from policyengine_us_data.datasets.cps.local_area_calibration.county_assignment import ( - assign_counties_for_cd, +from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, get_county_filter_probability, - get_filtered_county_distribution, + get_filtered_block_distribution, ) NYC_COUNTIES = { @@ -338,28 +338,40 @@ def create_sparse_cd_stacked_dataset( np.full(n_households_orig, cd_geoid_int, dtype=np.int32), ) - # Set county for this CD - # For city datasets: use only target counties (normalized distribution) + # Assign all geography using census block assignment + # For city datasets: use only blocks in target counties if county_filter is not None: - filtered_dist = get_filtered_county_distribution( + filtered_dist = get_filtered_block_distribution( cd_geoid, county_filter ) if not filtered_dist: # Should not happen if we already checked p_target > 0 continue - county_indices = assign_counties_for_cd( + geography = assign_geography_for_cd( cd_geoid=cd_geoid, n_households=n_households_orig, seed=seed + int(cd_geoid), distributions={cd_geoid: filtered_dist}, ) else: - county_indices = assign_counties_for_cd( + geography = assign_geography_for_cd( cd_geoid=cd_geoid, n_households=n_households_orig, seed=seed + int(cd_geoid), ) - cd_sim.set_input("county", time_period, county_indices) + # Set county using indices for backwards compatibility with PolicyEngine-US + cd_sim.set_input("county", time_period, geography["county_index"]) + + # Set all other geography variables from block assignment + cd_sim.set_input("block_geoid", time_period, geography["block_geoid"]) + cd_sim.set_input("tract_geoid", time_period, geography["tract_geoid"]) + cd_sim.set_input("cbsa_code", time_period, geography["cbsa_code"]) + cd_sim.set_input("sldu", time_period, geography["sldu"]) + cd_sim.set_input("sldl", time_period, geography["sldl"]) + cd_sim.set_input("place_fips", time_period, geography["place_fips"]) + cd_sim.set_input("vtd", time_period, geography["vtd"]) + cd_sim.set_input("puma", time_period, geography["puma"]) + cd_sim.set_input("zcta", time_period, geography["zcta"]) # Note: We no longer use binary filtering for county_filter. 
# Instead, weights are scaled by P(target|CD) and all households
@@ -635,6 +647,17 @@
     # spm_unit_spm_threshold is recalculated with CD-specific geo-adjustment
     vars_to_save.add("spm_unit_spm_threshold")
 
+    # Add all geography variables set during block assignment
+    vars_to_save.add("block_geoid")
+    vars_to_save.add("tract_geoid")
+    vars_to_save.add("cbsa_code")
+    vars_to_save.add("sldu")
+    vars_to_save.add("sldl")
+    vars_to_save.add("place_fips")
+    vars_to_save.add("vtd")
+    vars_to_save.add("puma")
+    vars_to_save.add("zcta")
+
     variables_saved = 0
     variables_skipped = 0
 
diff --git a/policyengine_us_data/storage/block_cd_distributions.csv.gz b/policyengine_us_data/storage/block_cd_distributions.csv.gz
new file mode 100644
index 00000000..10a5b527
Binary files /dev/null and b/policyengine_us_data/storage/block_cd_distributions.csv.gz differ
diff --git a/policyengine_us_data/storage/block_crosswalk.csv.gz b/policyengine_us_data/storage/block_crosswalk.csv.gz
new file mode 100644
index 00000000..cf4fb8ae
Binary files /dev/null and b/policyengine_us_data/storage/block_crosswalk.csv.gz differ
diff --git a/policyengine_us_data/storage/calibration_targets/make_block_cd_distributions.py b/policyengine_us_data/storage/calibration_targets/make_block_cd_distributions.py
new file mode 100644
index 00000000..afdccf4d
--- /dev/null
+++ b/policyengine_us_data/storage/calibration_targets/make_block_cd_distributions.py
@@ -0,0 +1,121 @@
+"""
+Generate P(block|CD) distributions from Census block-level data.
+
+Uses 119th Congress block assignments and 2020 Census block populations.
+Saves to storage/block_cd_distributions.csv.gz for use by block_assignment.py.
+"""
+
+import pandas as pd
+import us
+
+from policyengine_us_data.storage import STORAGE_FOLDER
+from policyengine_us_data.storage.calibration_targets.make_district_mapping import (
+    fetch_block_to_district_map,
+    fetch_block_population,
+)
+
+
+def build_block_cd_distributions():
+    """
+    Build P(block|CD) distributions from Census block data.
+
+    Algorithm:
+    1. Get block → CD mapping (119th Congress)
+    2. Get block population (2020 Census)
+    3. Merge to get populated blocks with CD assignments
+    4. Calculate P(block|CD) = pop(block) / pop(CD)
+    5. Save as gzipped CSV
+    """
+    print("Building P(block|CD) distributions from Census block data...")
+
+    # Step 1: Block to CD mapping (119th Congress)
+    print("\nFetching 119th Congress block-to-CD mapping...")
+    bef = fetch_block_to_district_map(119)
+    # Filter out 'ZZ' (unassigned blocks)
+    bef = bef[bef["CD119"] != "ZZ"]
+    print(f"  {len(bef):,} blocks with CD assignments")
+
+    # Step 2: Block population (all 50 states + DC)
+    print("\nFetching block population data (this takes a few minutes)...")
+    state_pops = []
+
+    # Get 50 states + DC
+    states_to_process = [
+        s
+        for s in us.states.STATES_AND_TERRITORIES
+        if not s.is_territory and s.abbr not in ["ZZ"]
+    ]
+
+    import time
+
+    for i, s in enumerate(states_to_process):
+        print(f"  {s.abbr} ({i + 1}/{len(states_to_process)})")
+        for attempt in range(3):
+            try:
+                state_pops.append(fetch_block_population(s.abbr))
+                break
+            except Exception as e:
+                if attempt < 2:
+                    print(f"    Retry {attempt + 1} for {s.abbr}...")
+                    time.sleep(2)
+                else:
+                    print(f"    Warning: Failed to fetch {s.abbr}: {e}")
+                    continue
+
+    block_pop = pd.concat(state_pops, ignore_index=True)
+    print(f"  Total blocks with population: {len(block_pop):,}")
+
+    # Step 3: Merge block data
+    print("\nMerging block data...")
+    df = bef.merge(block_pop, on="GEOID", how="inner")
+    print(f"  Matched blocks: {len(df):,}")
+
+    # Filter to blocks with non-zero population
+    df = df[df["POP20"] > 0]
+    print(f"  Populated blocks: {len(df):,}")
+
+    df["state_fips"] = df["GEOID"].str[:2]
+
+    # Create CD geoid in our format: state_fips * 100 + district
+    # Examples: AL-1 = 101, NY-10 = 3610, DC = 1198
+    df["cd_geoid"] = df["state_fips"].astype(int) * 100 + df["CD119"].astype(
+        int
+    )
+
+    # Step 4: Calculate P(block|CD)
+    print("\nCalculating block probabilities...")
+    cd_totals = df.groupby("cd_geoid")["POP20"].transform("sum")
+    df["probability"] = df["POP20"] / cd_totals
+
+    # Verify probabilities sum to 1 for each CD
+    cd_sums = df.groupby("cd_geoid")["probability"].sum()
+    bad_sums = cd_sums[~cd_sums.between(0.9999, 1.0001)]
+    if len(bad_sums) > 0:
+        print(f"Warning: {len(bad_sums)} CDs don't sum to 1.0")
+
+    # Step 5: Prepare output
+    output = df[["cd_geoid", "GEOID", "probability"]].rename(
+        columns={"GEOID": "block_geoid"}
+    )
+    output = output.sort_values(
+        ["cd_geoid", "probability"], ascending=[True, False]
+    )
+
+    # Step 6: Save as gzipped CSV (parquet requires pyarrow)
+    output_path = STORAGE_FOLDER / "block_cd_distributions.csv.gz"
+    output.to_csv(output_path, index=False, compression="gzip")
+    print(f"\nSaved to {output_path}")
+    print(f"  Total rows: {len(output):,}")
+    print(f"  Unique CDs: {output['cd_geoid'].nunique()}")
+    print(f"  File size: {output_path.stat().st_size / 1024 / 1024:.1f} MB")
+
+    # Print some stats
+    blocks_per_cd = output.groupby("cd_geoid").size()
+    print(f"\nBlocks per CD:")
+    print(f"  Min: {blocks_per_cd.min():,}")
+    print(f"  Median: {blocks_per_cd.median():,.0f}")
+    print(f"  Max: {blocks_per_cd.max():,}")
+
+
+if __name__ == "__main__":
+    build_block_cd_distributions()
diff --git a/policyengine_us_data/storage/calibration_targets/make_block_crosswalk.py b/policyengine_us_data/storage/calibration_targets/make_block_crosswalk.py
new file mode 100644
index 00000000..c7498068
--- /dev/null
+++ b/policyengine_us_data/storage/calibration_targets/make_block_crosswalk.py
@@ -0,0 +1,235 @@
+"""
+Build comprehensive block-level geographic crosswalk from Census data.
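+
+The output feeds block_assignment.py, which looks rows up by 15-character
+block GEOID. A hypothetical output row (all values invented for
+illustration):
+
+    block_geoid=010010201001000, sldu=022, sldl=042,
+    place_fips=62328, vtd=000001, puma=02100, zcta=36067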
+ +Downloads Block Assignment Files (BAFs) for all states and creates a single +crosswalk file mapping block GEOID to: +- SLDU (State Legislative District Upper) +- SLDL (State Legislative District Lower) +- Place FIPS (City/CDP) +- PUMA (via tract lookup) +- ZCTA (ZIP Code Tabulation Area) + +Data sources: +- BAFs: https://www2.census.gov/geo/docs/maps-data/data/baf2020/ +- Tract-to-PUMA: https://www2.census.gov/geo/docs/maps-data/data/rel2020/ +- ZCTA-to-Block: https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/ +""" + +import io +import requests +import zipfile +from pathlib import Path +import pandas as pd +import us + +from policyengine_us_data.storage import STORAGE_FOLDER + +BAF_BASE_URL = "https://www2.census.gov/geo/docs/maps-data/data/baf2020/" +TRACT_PUMA_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel2020/2020_Census_Tract_to_2020_PUMA.txt" +ZCTA_BLOCK_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_tabblock20_natl.txt" + + +def download_state_baf(state_fips: str, state_abbr: str) -> dict: + """ + Download and parse Block Assignment Files for a state. + + Returns dict with DataFrames for SLDU, SLDL, Place. + """ + url = f"{BAF_BASE_URL}BlockAssign_ST{state_fips}_{state_abbr}.zip" + + response = requests.get(url, timeout=120) + response.raise_for_status() + + results = {} + + with zipfile.ZipFile(io.BytesIO(response.content)) as z: + # SLDU - State Legislative District Upper + sldu_file = f"BlockAssign_ST{state_fips}_{state_abbr}_SLDU.txt" + if sldu_file in z.namelist(): + df = pd.read_csv(z.open(sldu_file), sep="|", dtype=str) + results["sldu"] = df.rename( + columns={"BLOCKID": "block_geoid", "DISTRICT": "sldu"} + ) + + # SLDL - State Legislative District Lower + sldl_file = f"BlockAssign_ST{state_fips}_{state_abbr}_SLDL.txt" + if sldl_file in z.namelist(): + df = pd.read_csv(z.open(sldl_file), sep="|", dtype=str) + results["sldl"] = df.rename( + columns={"BLOCKID": "block_geoid", "DISTRICT": "sldl"} + ) + + # Place (City/CDP) + place_file = ( + f"BlockAssign_ST{state_fips}_{state_abbr}_INCPLACE_CDP.txt" + ) + if place_file in z.namelist(): + df = pd.read_csv(z.open(place_file), sep="|", dtype=str) + results["place"] = df.rename( + columns={"BLOCKID": "block_geoid", "PLACEFP": "place_fips"} + ) + + # VTD - Voting Tabulation District + vtd_file = f"BlockAssign_ST{state_fips}_{state_abbr}_VTD.txt" + if vtd_file in z.namelist(): + df = pd.read_csv(z.open(vtd_file), sep="|", dtype=str) + # VTD has COUNTYFP and DISTRICT columns + df["vtd"] = df["DISTRICT"] + results["vtd"] = df[["BLOCKID", "vtd"]].rename( + columns={"BLOCKID": "block_geoid"} + ) + + return results + + +def download_tract_puma_crosswalk() -> pd.DataFrame: + """Download tract-to-PUMA crosswalk from Census.""" + df = pd.read_csv(TRACT_PUMA_URL, dtype=str) + + # Build tract GEOID (11 chars: state + county + tract) + df["tract_geoid"] = df["STATEFP"] + df["COUNTYFP"] + df["TRACTCE"] + df["puma"] = df["PUMA5CE"] + + return df[["tract_geoid", "puma"]] + + +def download_zcta_crosswalk() -> pd.DataFrame: + """ + Download ZCTA-to-block crosswalk from Census. + + Returns DataFrame mapping block_geoid to zcta (5-digit ZCTA code). + Note: Some blocks have no ZCTA (water, uninhabited areas). 
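+
+    A hypothetical relationship-file row (values invented for illustration):
+        GEOID_ZCTA5_20="36067", GEOID_TABBLOCK_20="010010201001000"
+        maps to block_geoid="010010201001000", zcta="36067".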
+ """ + print("Downloading ZCTA-to-block crosswalk (~1GB file)...") + df = pd.read_csv( + ZCTA_BLOCK_URL, + sep="|", + dtype=str, + usecols=["GEOID_ZCTA5_20", "GEOID_TABBLOCK_20"], + ) + df = df.dropna(subset=["GEOID_ZCTA5_20"]) + df = df[df["GEOID_ZCTA5_20"].str.strip() != ""] + df = df.rename( + columns={ + "GEOID_TABBLOCK_20": "block_geoid", + "GEOID_ZCTA5_20": "zcta", + } + ) + return df[["block_geoid", "zcta"]] + + +def build_block_crosswalk(): + """ + Build comprehensive block-level geographic crosswalk. + + Creates block_crosswalk.csv.gz with columns: + - block_geoid (15-char) + - sldu (3-char state legislative upper) + - sldl (3-char state legislative lower) + - place_fips (5-char place/city FIPS) + - puma (5-char PUMA via tract lookup) + - zcta (5-char ZIP Code Tabulation Area) + """ + print("Building comprehensive block geographic crosswalk...") + + # Download tract-to-PUMA crosswalk + print("\nDownloading tract-to-PUMA crosswalk...") + tract_puma = download_tract_puma_crosswalk() + print(f" {len(tract_puma):,} tract-PUMA mappings") + + # Download ZCTA-to-block crosswalk + print("\nDownloading ZCTA-to-block crosswalk...") + zcta_blocks = download_zcta_crosswalk() + print(f" {len(zcta_blocks):,} block-ZCTA mappings") + + # Process each state + print("\nDownloading Block Assignment Files...") + all_blocks = [] + + states_to_process = [ + s + for s in us.states.STATES_AND_TERRITORIES + if not s.is_territory and s.abbr not in ["ZZ"] + ] + + import time + + for i, s in enumerate(states_to_process): + state_fips = s.fips + print(f" {s.abbr} ({i + 1}/{len(states_to_process)})") + + for attempt in range(3): + try: + bafs = download_state_baf(state_fips, s.abbr) + + # Start with SLDU as base (has all blocks) + if "sldu" in bafs: + df = bafs["sldu"].copy() + + # Merge other geographies + if "sldl" in bafs: + df = df.merge( + bafs["sldl"], on="block_geoid", how="left" + ) + else: + df["sldl"] = None + + if "place" in bafs: + df = df.merge( + bafs["place"], on="block_geoid", how="left" + ) + else: + df["place_fips"] = None + + if "vtd" in bafs: + df = df.merge( + bafs["vtd"], on="block_geoid", how="left" + ) + else: + df["vtd"] = None + + # Add tract GEOID for PUMA lookup + df["tract_geoid"] = df["block_geoid"].str[:11] + + # Merge PUMA via tract + df = df.merge(tract_puma, on="tract_geoid", how="left") + + # Drop tract_geoid (can be derived from block) + df = df.drop(columns=["tract_geoid"]) + + all_blocks.append(df) + + break + except Exception as e: + if attempt < 2: + print(f" Retry {attempt + 1}...") + time.sleep(2) + else: + print(f" Warning: Failed to process {s.abbr}: {e}") + + # Combine all states + print("\nCombining all states...") + combined = pd.concat(all_blocks, ignore_index=True) + print(f" Total blocks: {len(combined):,}") + + # Merge ZCTA (national file, merged after combining states) + print("\nMerging ZCTA...") + combined = combined.merge(zcta_blocks, on="block_geoid", how="left") + + # Save + output_path = STORAGE_FOLDER / "block_crosswalk.csv.gz" + combined.to_csv(output_path, index=False, compression="gzip") + print(f"\nSaved to {output_path}") + print(f" File size: {output_path.stat().st_size / 1024 / 1024:.1f} MB") + + # Stats + print(f"\nCoverage:") + print(f" Blocks with SLDU: {combined['sldu'].notna().sum():,}") + print(f" Blocks with SLDL: {combined['sldl'].notna().sum():,}") + print(f" Blocks with Place: {combined['place_fips'].notna().sum():,}") + print(f" Blocks with PUMA: {combined['puma'].notna().sum():,}") + print(f" Blocks with ZCTA: 
{combined['zcta'].notna().sum():,}")
+
+
+if __name__ == "__main__":
+    build_block_crosswalk()
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_block_assignment.py b/policyengine_us_data/tests/test_local_area_calibration/test_block_assignment.py
new file mode 100644
index 00000000..0f100138
--- /dev/null
+++ b/policyengine_us_data/tests/test_local_area_calibration/test_block_assignment.py
@@ -0,0 +1,413 @@
+"""Tests for census block assignment functionality.
+
+This tests the new block-based geographic assignment that replaces county-only
+assignment, allowing consistent lookup of all geographic variables from a
+single census block GEOID.
+"""
+
+import pytest
+import numpy as np
+
+
+class TestBlockAssignment:
+    """Test census block assignment for CDs."""
+
+    def test_assign_returns_correct_shape(self):
+        """Verify assign_blocks_for_cd returns correct shape."""
+        from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import (
+            assign_blocks_for_cd,
+        )
+
+        n_households = 100
+        result = assign_blocks_for_cd("3610", n_households, seed=42)
+        assert result.shape == (n_households,)
+        # Block GEOIDs are 15-character strings
+        assert all(len(geoid) == 15 for geoid in result)
+
+    def test_assign_is_deterministic(self):
+        """Verify same seed produces same results."""
+        from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import (
+            assign_blocks_for_cd,
+        )
+
+        result1 = assign_blocks_for_cd("3610", 50, seed=42)
+        result2 = assign_blocks_for_cd("3610", 50, seed=42)
+        np.testing.assert_array_equal(result1, result2)
+
+    def test_different_seeds_different_results(self):
+        """Verify different seeds produce different results."""
+        from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import (
+            assign_blocks_for_cd,
+        )
+
+        result1 = assign_blocks_for_cd("3610", 100, seed=42)
+        result2 = assign_blocks_for_cd("3610", 100, seed=99)
+        assert not np.array_equal(result1, result2)
+
+    def test_ny_cd_gets_ny_blocks(self):
+        """Verify NY CDs get NY blocks."""
+        from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import (
+            assign_blocks_for_cd,
+        )
+
+        # NY-10 (Manhattan/Brooklyn area)
+        result = assign_blocks_for_cd("3610", 100, seed=42)
+
+        # All block GEOIDs should start with NY state FIPS (36)
+        for geoid in result:
+            assert geoid.startswith("36"), f"Got non-NY block: {geoid}"
+
+    def test_ca_cd_gets_ca_blocks(self):
+        """Verify CA CDs get CA blocks."""
+        from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import (
+            assign_blocks_for_cd,
+        )
+
+        # CA-12 (Oakland/East Bay area)
+        result = assign_blocks_for_cd("612", 100, seed=42)
+
+        # All block GEOIDs should start with CA state FIPS (06)
+        for geoid in result:
+            assert geoid.startswith("06"), f"Got non-CA block: {geoid}"
+
+
+class TestGeographyLookup:
+    """Test looking up geographic variables from block GEOID."""
+
+    def test_get_county_from_block(self):
+        """Verify county FIPS extraction from block GEOID."""
+        from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import (
+            get_county_fips_from_block,
+        )
+
+        # New York County (Manhattan) block example
+        # Block GEOID format: SSCCCTTTTTTBBBB (state, county, tract, block)
+        # 36061 = NY state (36) + New York County (061)
+        block_geoid = "360610001001000"  # Example Manhattan block
+        county_fips = get_county_fips_from_block(block_geoid)
+        assert county_fips == "36061"
+
+    def test_get_tract_from_block(self):
+        """Verify tract GEOID
extraction from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_tract_geoid_from_block, + ) + + block_geoid = "360610001001000" + tract_geoid = get_tract_geoid_from_block(block_geoid) + # Tract is positions 0-10 (state + county + tract) + assert tract_geoid == "36061000100" + + def test_get_state_fips_from_block(self): + """Verify state FIPS extraction from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_state_fips_from_block, + ) + + block_geoid = "360610001001000" + state_fips = get_state_fips_from_block(block_geoid) + assert state_fips == "36" + + +class TestCBSALookup: + """Test CBSA/metro area lookup from county.""" + + def test_manhattan_in_nyc_metro(self): + """Verify Manhattan (New York County) is in NYC metro area.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_cbsa_from_county, + ) + + # New York County FIPS = 36061 + cbsa_code = get_cbsa_from_county("36061") + # NYC metro area CBSA code = 35620 + assert cbsa_code == "35620" + + def test_sf_county_in_sf_metro(self): + """Verify San Francisco County is in SF metro area.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_cbsa_from_county, + ) + + # San Francisco County FIPS = 06075 + cbsa_code = get_cbsa_from_county("06075") + # SF-Oakland-Berkeley metro area CBSA code = 41860 + assert cbsa_code == "41860" + + def test_rural_county_no_cbsa(self): + """Verify rural county not in any metro area returns None.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_cbsa_from_county, + ) + + # Wheeler County, NE (FIPS 31183) - rural county not in CBSA + cbsa_code = get_cbsa_from_county("31183") + assert cbsa_code is None + + +class TestIntegratedAssignment: + """Test integrated assignment that returns all geography.""" + + def test_assign_geography_returns_all_fields(self): + """Verify assign_geography returns dict with all geography fields.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, + ) + + n_households = 50 + result = assign_geography_for_cd("3610", n_households, seed=42) + + # Should return dict with arrays for each geography + expected_fields = [ + "block_geoid", + "county_fips", + "tract_geoid", + "state_fips", + "cbsa_code", + "sldu", + "sldl", + "place_fips", + "vtd", + "puma", + "zcta", + "county_index", + ] + for field in expected_fields: + assert field in result, f"Missing field: {field}" + + # All arrays should have same length + for key, arr in result.items(): + assert len(arr) == n_households, f"{key} has wrong length" + + def test_geography_is_consistent(self): + """Verify all geography fields are consistent with each other.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, + ) + + result = assign_geography_for_cd("3610", 100, seed=42) + + # County should be derived from block + for i in range(100): + block = result["block_geoid"][i] + county = result["county_fips"][i] + tract = result["tract_geoid"][i] + state = result["state_fips"][i] + + # Block starts with county + assert block[:5] == county + # Tract is first 11 chars of block + assert block[:11] == tract + # State is first 2 chars + assert block[:2] == state + + +class TestStateLegislativeDistricts: + """Test state legislative district lookups.""" + + def 
test_get_sldu_from_block(self): + """Verify SLDU lookup from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_sldu_from_block, + ) + + # Alabama block from crosswalk + sldu = get_sldu_from_block("010010201001000") + # Should return a 3-character district code or None + assert sldu is None or (isinstance(sldu, str) and len(sldu) <= 3) + + def test_get_sldl_from_block(self): + """Verify SLDL lookup from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_sldl_from_block, + ) + + # Alabama block from crosswalk + sldl = get_sldl_from_block("010010201001000") + # Should return a 3-character district code or None + assert sldl is None or (isinstance(sldl, str) and len(sldl) <= 3) + + def test_assign_geography_includes_state_leg(self): + """Verify assign_geography includes SLDU and SLDL.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, + ) + + result = assign_geography_for_cd("3610", 50, seed=42) + + assert "sldu" in result + assert "sldl" in result + assert len(result["sldu"]) == 50 + assert len(result["sldl"]) == 50 + + +class TestPlaceLookup: + """Test place/city lookup from block.""" + + def test_get_place_fips_from_block(self): + """Verify place FIPS lookup from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_place_fips_from_block, + ) + + # Alabama block that's in a place + place = get_place_fips_from_block("010010201001000") + # Should return 5-char place FIPS or None + assert place is None or (isinstance(place, str) and len(place) == 5) + + def test_assign_geography_includes_place(self): + """Verify assign_geography includes place_fips.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, + ) + + result = assign_geography_for_cd("3610", 50, seed=42) + + assert "place_fips" in result + assert len(result["place_fips"]) == 50 + + +class TestPUMALookup: + """Test PUMA lookup from block.""" + + def test_get_puma_from_block(self): + """Verify PUMA lookup from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_puma_from_block, + ) + + # Alabama block + puma = get_puma_from_block("010010201001000") + # Should return 5-char PUMA code or None + assert puma is None or (isinstance(puma, str) and len(puma) == 5) + + def test_assign_geography_includes_puma(self): + """Verify assign_geography includes PUMA.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, + ) + + result = assign_geography_for_cd("3610", 50, seed=42) + + assert "puma" in result + assert len(result["puma"]) == 50 + + +class TestVTDLookup: + """Test VTD (Voting Tabulation District) lookup from block.""" + + def test_get_vtd_from_block(self): + """Verify VTD lookup from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_vtd_from_block, + ) + + # Alabama block + vtd = get_vtd_from_block("010010201001000") + # Should return VTD code string or None + assert vtd is None or isinstance(vtd, str) + + def test_assign_geography_includes_vtd(self): + """Verify assign_geography includes VTD.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, + ) + + result = 
assign_geography_for_cd("3610", 50, seed=42) + + assert "vtd" in result + assert len(result["vtd"]) == 50 + + +class TestAllGeographyLookup: + """Test bulk lookup of all geography from block.""" + + def test_get_all_geography_returns_all_fields(self): + """Verify get_all_geography_from_block returns all expected fields.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_all_geography_from_block, + ) + + result = get_all_geography_from_block("010010201001000") + + expected_keys = ["sldu", "sldl", "place_fips", "vtd", "puma", "zcta"] + for key in expected_keys: + assert key in result, f"Missing key: {key}" + + def test_get_all_geography_unknown_block(self): + """Verify get_all_geography handles unknown block gracefully.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_all_geography_from_block, + ) + + result = get_all_geography_from_block("999999999999999") + + # Should return dict with all None values + for key, val in result.items(): + assert val is None, f"{key} should be None for unknown block" + + +class TestCountyEnumIntegration: + """Test integration with existing County enum.""" + + def test_get_county_enum_from_block(self): + """Verify we can get County enum index from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_county_enum_index_from_block, + ) + from policyengine_us.variables.household.demographic.geographic.county.county_enum import ( + County, + ) + + # Manhattan block + block_geoid = "360610001001000" + county_idx = get_county_enum_index_from_block(block_geoid) + + # Should map to NEW_YORK_COUNTY_NY + assert County._member_names_[county_idx] == "NEW_YORK_COUNTY_NY" + + def test_assign_geography_includes_county_index(self): + """Verify assign_geography includes county_index for backwards compat.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, + ) + from policyengine_us.variables.household.demographic.geographic.county.county_enum import ( + County, + ) + + result = assign_geography_for_cd("3610", 50, seed=42) + + # Should include county_index for backwards compatibility + assert "county_index" in result + assert result["county_index"].dtype == np.int32 + + # All indices should be valid NY counties + for idx in result["county_index"]: + county_name = County._member_names_[idx] + assert county_name.endswith("_NY") + + +class TestZCTALookup: + """Test ZCTA (ZIP Code Tabulation Area) lookup from block.""" + + def test_get_zcta_from_block(self): + """Verify ZCTA lookup from block GEOID.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + get_zcta_from_block, + ) + + # Alabama block + zcta = get_zcta_from_block("010010201001000") + # Should return 5-char ZCTA code or None + assert zcta is None or (isinstance(zcta, str) and len(zcta) == 5) + + def test_assign_geography_includes_zcta(self): + """Verify assign_geography includes ZCTA.""" + from policyengine_us_data.datasets.cps.local_area_calibration.block_assignment import ( + assign_geography_for_cd, + ) + + result = assign_geography_for_cd("3610", 50, seed=42) + + assert "zcta" in result + assert len(result["zcta"]) == 50 diff --git a/uv.lock b/uv.lock index 0bf265e4..58b165e1 100644 --- a/uv.lock +++ b/uv.lock @@ -637,6 +637,7 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, @@ -644,6 +645,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, @@ -1842,7 +1844,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.520.0" +version = "1.524.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1850,9 +1852,9 @@ dependencies = [ { name = "policyengine-core" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c6/08/17c44c329c638aa237a5fb7f0f2c7c49c3acfdb26849624290f04deb24db/policyengine_us-1.520.0.tar.gz", hash = "sha256:8af04f4b181313a494a850362c3e6f75b7becac933cfd552f908083b192e3305", size = 8493366, upload-time = "2026-01-25T21:44:09.01Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/f0/623a10a84d501039b100808f3f42020ae68c5a793ff2865c58a034be1b32/policyengine_us-1.524.1.tar.gz", hash = "sha256:329ab166681264fcacbe4549afa023cf96bda2348d48bfc385e8328afc811ec8", size = 8503673, upload-time = "2026-01-26T20:47:54.189Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/32/ab/da817a45e7572499e66e040f6be7e5f72844cff1eda24f64905d61a2a30e/policyengine_us-1.520.0-py3-none-any.whl", hash = "sha256:b6084337218572da68beb9fb29a463e60a4a7453a029fb83e0f66763953840b8", size = 7374635, upload-time = "2026-01-25T21:44:07.303Z" }, + { url = "https://files.pythonhosted.org/packages/a2/08/c53fd0555b2fa95cb81994ae20f4fb9df1d35b14efad26f911ff6d0ead75/policyengine_us-1.524.1-py3-none-any.whl", hash = "sha256:e940b977090a2aea782724668d798ddf667b1b1bda7036e93f3caeaf853ea436", size = 7398760, upload-time = "2026-01-26T20:47:51.851Z" }, ] [[package]]