diff --git a/aeo_updates/NG Prices Preprocessing for AEO Inputs.xlsx b/aeo_updates/NG Prices Preprocessing for AEO Inputs.xlsx deleted file mode 100644 index 3a1b3af..0000000 Binary files a/aeo_updates/NG Prices Preprocessing for AEO Inputs.xlsx and /dev/null differ diff --git a/aeo_updates/README.md b/aeo_updates/README.md index f02f09f..b50cf8b 100644 --- a/aeo_updates/README.md +++ b/aeo_updates/README.md @@ -42,22 +42,8 @@ Once you have obtained your api key, create a new environment variable to store ### Input Data Changes When Updating AEO Data #### Natural Gas Prices and Demand -Natural gas prices and demand can be pulled using the EIA AEO data grabber. The spreadsheet "NG Prices Preprocessing for AEO Inputs.xlsx" is used to calculate the alphas using the preset betas. You need to paste in the NG prices and NG electricity sector demand into the relevant tabs. Historical data needs to be updated to the current dollar year. The deflator to convert the alphas back to 2004$ also need to be updated. The alphas are then put into a csv file to be added to the inputs/fuelprices folder of the ReEDS model repo. - -The prices and demand (both for the electricity sector and for all sectors) are also put into the relevant csv files in the inputs/fuelprices folder of the ReEDS model repo. Here are the NG input files that should be updated: - -* ng_tot_demand_AEO_{year}_HOG.csv -* ng_tot_demand_AEO_{year}_LOG.csv -* ng_tot_demand_AEO_{year}_reference.csv -* ng_demand_AEO_{year}_LOG.csv -* ng_demand_AEO_{year}_HOG.csv -* ng_demand_AEO_{year}_reference.csv -* ng_AEO_{year}_LOG.csv -* ng_AEO_{year}_HOG.csv -* ng_AEO_{year}_reference.csv -* alpha_AEO_{year}_LOG.csv -* alpha_AEO_{year}_HOG.csv -* alpha_AEO_{year}_reference.csv +Natural gas prices and demand are updated using the files in natural_gas_price_regression. +See the README.md in that folder for more information. #### Coal Prices Pulled using the EIA data grabber. Coal data are input into the coal_AEO_{year}_reference.csv. 
diff --git a/aeo_updates/natural_gas_price_regression/.gitignore b/aeo_updates/natural_gas_price_regression/.gitignore new file mode 100644 index 0000000..b7356a8 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/.gitignore @@ -0,0 +1,35 @@ +# Python cache +__pycache__/ +*.pyc + +# Visualization outputs (PNGs, PDFs) +*.png +*.pdf + +# Validation and visualization results (all generated output) +results validation/ + +# Alpha regression outputs (all CSVs are generated, reproducible) +outputs of alpha regression/*.csv +outputs of alpha regression/raw_aeo_data/ + +# Beta regression outputs (all generated, reproducible) +outputs of beta regression/*.csv +outputs of beta regression/raw_aeo_data/ +outputs of beta regression/bata raw data visualization/ +outputs of beta regression/alpha from beta regression/ + +# Intermediate beta inputs (copied by sync_beta_to_alpha_inputs.py, not source data) +inputs for alpha regression/cd_beta0.csv +inputs for alpha regression/national_beta.csv + +# Notebook checkpoints +.ipynb_checkpoints/ + +# Stale nested duplicate directory +AEO_Updates/ + +# Old scripts (replaced by visualization.py) +results_visualization.py +results_validation.py +beta_raw_data_visualization.py diff --git a/aeo_updates/natural_gas_price_regression/README.md b/aeo_updates/natural_gas_price_regression/README.md new file mode 100644 index 0000000..ea59dcb --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/README.md @@ -0,0 +1,93 @@ +# Natural Gas Price Regression + +This folder has the natural gas update workflow for ReEDS. + +Files to update in the ReEDS repository are described in "outputs of alpha regression/README.md". + +## Model Note + +`alpha` differs by step: + +- Beta step (`alpha1`) is `alpha1(region, year)` and is shared across scenarios. +- Alpha step (`alpha2`) is `alpha2(region, year, scenario)`, so each output + scenario has its own alpha path. + +## Prerequisites + +### EIA API Key + +The pipeline fetches data from the EIA API and requires an API key. Set it as an environment variable before running: + +**Windows (persistent — open a new terminal after running):** +```cmd +setx EIA_API_KEY your_api_key_here +``` + +**Windows (current session only):** +```cmd +set EIA_API_KEY=your_api_key_here +``` + +**macOS/Linux:** +```bash +export EIA_API_KEY=your_api_key_here +``` + +> The key is read from the `EIA_API_KEY` environment variable. Do **not** hardcode it in `aeo_pipeline_config.json`. + +## Run order + +1. Edit `aeo_pipeline_config.json` with your settings. + +2. Run **beta regression first**. + +```bash +python aeo_beta_regression.py --config aeo_pipeline_config.json +``` + +3. Sync beta outputs into alpha inputs. + +```bash +python sync_beta_to_alpha_inputs.py --config aeo_pipeline_config.json +``` + +4. Run **alpha regression**. + +```bash +python aeo_alpha_regression.py --config aeo_pipeline_config.json +``` + +5. Generate all diagnostic plots and validation. + +```bash +python visualization.py --config aeo_pipeline_config.json +``` + +Skip individual parts with flags: +```bash +python visualization.py --skip-raw-scatter --skip-validation +``` + +Default output folder: `results validation` (all plots and validation CSVs). + +## Scenario Configuration + +Set explicit scenarios in `aeo_pipeline_config.json` under `scenarios`: + +- `scenarios.beta_regression.include`: scenarios used to estimate beta. +- `scenarios.alpha_regression.fetch`: scenarios fetched for alpha preprocessing. 
+- `scenarios.alpha_regression.outputs`: mapping from output suffix (`reference`, `HOG`, `LOG`, etc.) to scenario aliases. + +Use canonical IDs (for example `ref{aeo_year}`, `highogs`, `lowogs`) for a clear setup. + +## One-command run (all platforms) + +```bash +python run_ng_pipeline.py +``` + +Optional custom config: + +```bash +python run_ng_pipeline.py --config my_config.json +``` diff --git a/aeo_updates/natural_gas_price_regression/aeo_alpha_regression.py b/aeo_updates/natural_gas_price_regression/aeo_alpha_regression.py new file mode 100644 index 0000000..e0a19b7 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/aeo_alpha_regression.py @@ -0,0 +1,1273 @@ +""" +AEO Natural Gas Price Preprocessing Pipeline for ReEDS Inputs +============================================================= + +This script automates the preprocessing of AEO (Annual Energy Outlook) +natural gas data from the EIA API into input files for the ReEDS capacity +expansion model. + +NG Supply Curve Model +--------------------- +ReEDS models regional natural gas prices using a linear supply curve +framework. The regional delivered price of natural gas in each census +division is decomposed into three components: + + Price(r,t,s) = Alpha(r,t,s) + Beta_regional(r) × Demand_regional(r,t,s) + + Beta_national × Demand_national(t,s) + +Where: + - Price(r,t,s) : AEO NG price for region r, year t, scenario s (2024$/MMBtu) + - Alpha(r,t,s) : Intercept / base price component (2004$/MMBtu) + - Beta_regional(r) : Region-specific demand sensitivity (2004$/MMBtu per Quad) + - Demand_regional(r,t,s) : Electric sector NG demand in region r (Quads) + - Beta_national : National demand sensitivity (2004$/MMBtu per Quad) + - Demand_national(t,s) : Total national electric sector NG demand (Quads) + +Alpha is solved as the residual after removing demand-driven price effects: + + Alpha(r,t,s) = Price(r,t,s) × deflator - Beta_regional(r) × Demand_regional(r,t,s) + - Beta_national × Demand_national(t,s) + +Alpha is solved at scenario level and indexed by (region, year, scenario), +so each scenario keeps its own residual alpha path. + +The deflator converts from the AEO's native dollar year (e.g., 2024$) to +2004$, the base dollar year used internally by ReEDS for NG pricing. 
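+
+For example, with purely illustrative numbers (not actual AEO values):
+Price = 4.00 2024$/MMBtu, deflator = 0.63, Beta_regional = 0.20,
+Demand_regional = 1.5 Quads, Beta_national = 0.05, and Demand_national = 10.0 Quads give
+
+    Alpha = 4.00 * 0.63 - 0.20 * 1.5 - 0.05 * 10.0 = 1.72 (2004$/MMBtu)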
+ +Data Sources +------------ +- Prices and demand projections: EIA AEO API (3 scenarios: Reference, High/Low O&G) +- Historical backfill: Local CSV files for pre-projection years +- Betas: Pre-computed regional and national sensitivity coefficients + +Output Files (per scenario) +--------------------------- +- alpha_AEO_{year}_{scenario}.csv : Regional alpha values (2004$/MMBtu) +- ng_AEO_{year}_{scenario}.csv : Regional NG prices (AEO dollar year) +- ng_demand_AEO_{year}_{scenario}.csv : Electric sector demand (Quads) +- ng_tot_demand_AEO_{year}_{scenario}.csv : Total sector demand (Quads) +- cd_beta0.csv : Electric sector regional betas + +Usage +----- + python aeo_alpha_regression.py --config aeo_pipeline_config.json + +Example config: see aeo_pipeline_config.json +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import re +import shutil +import sys +from pathlib import Path +from typing import Any + +import pandas as pd +import requests +from requests.adapters import HTTPAdapter +from urllib3 import disable_warnings +from urllib3.exceptions import InsecureRequestWarning +from urllib3.util.retry import Retry + +LOGGER = logging.getLogger("aeo_pipeline") + +# ============================================================================ +# Constants +# ============================================================================ + +# Mapping from normalized region names to canonical internal keys +CENDIV_CANONICAL = { + "newengland": "NewEngland", + "middleatlantic": "MiddleAtlantic", + "eastnorthcentral": "EastNorthCentral", + "westnorthcentral": "WestNorthCentral", + "southatlantic": "SouthAtlantic", + "eastsouthcentral": "EastSouthCentral", + "westsouthcentral": "WestSouthCentral", + "mountain": "Mountain", + "pacific": "Pacific", + "unitedstates": "UnitedStates", +} + +# Mapping from internal keys to ReEDS output format (underscore-separated) +CENDIV_OUTPUT = { + "NewEngland": "New_England", + "MiddleAtlantic": "Mid_Atlantic", + "EastNorthCentral": "East_North_Central", + "WestNorthCentral": "West_North_Central", + "SouthAtlantic": "South_Atlantic", + "EastSouthCentral": "East_South_Central", + "WestSouthCentral": "West_South_Central", + "Mountain": "Mountain", + "Pacific": "Pacific", +} + +# EIA AEO series names for NG data +NG_SERIES_NAMES = { + "price": "Energy Prices : Electric Power : Natural Gas", + "demand_elec": "Energy Use : Electric Power : Natural Gas", + "demand_total": "Energy Use : Total : Natural Gas", +} + +# Required output scenarios and their EIA API aliases +NG_OUTPUT_SCENARIOS = { + "reference": ["ref{aeo_year}", "Reference case", "Reference Case"], + "HOG": ["highogs", "High Oil and Gas Supply"], + "LOG": ["lowogs", "Low Oil and Gas Supply"], +} + +HISTORY_SUFFIX = "historical" + + +# ============================================================================ +# Utility Functions +# ============================================================================ + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="AEO NG update pipeline") + p.add_argument("--config", default="aeo_pipeline_config.json") + p.add_argument("--log-level", default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"]) + p.add_argument("--aeo-year", type=int, default=None) + return p.parse_args() + + +def setup_logging(level: str) -> None: + logging.basicConfig( + level=getattr(logging, level.upper()), + format="%(asctime)s | %(levelname)s | %(message)s", + ) + + +def require(condition: bool, message: 
str) -> None: + """Assert a condition with a descriptive error message.""" + if not condition: + raise ValueError(message) + + +def normalize_token(value: Any) -> str: + """Normalize a string for case-insensitive, whitespace-insensitive matching.""" + if value is None: + return "" + return re.sub(r"[^a-z0-9]+", "", str(value).replace("\xa0", " ").strip().lower()) + + +def canonical_cendiv(value: Any) -> str: + """Convert a region name to its canonical internal key.""" + key = normalize_token(value) + return CENDIV_CANONICAL.get(key, re.sub(r"[^A-Za-z0-9]", "", str(value))) + + +def cendiv_output_label(cendiv: str) -> str: + """Convert an internal key to ReEDS output format (e.g., 'New_England').""" + require(cendiv in CENDIV_OUTPUT, + f"Unsupported census division for NG output: {cendiv}") + return CENDIV_OUTPUT[cendiv] + + +def output_label_to_cendiv(label: str) -> str: + """Convert a ReEDS output label back to the internal key.""" + norm = normalize_token(label.replace("_", " ")) + for cendiv, output in CENDIV_OUTPUT.items(): + if normalize_token(output.replace("_", " ")) == norm: + return cendiv + return canonical_cendiv(label) + + +def resolve_case_insensitive(path: Path) -> Path: + """Resolve a path with case-insensitive matching on each component.""" + if path.exists(): + return path + path = path.resolve() + current = Path(path.anchor) + for part in path.parts[1:]: + if not current.exists(): + return path + try: + matches = [p for p in current.iterdir() + if p.name.lower() == part.lower()] + except PermissionError: + return path + if not matches: + return path + current = matches[0] + return current + + +def resolve_path(base_dir: Path, configured_path: str) -> Path: + """Resolve a configured path relative to the base directory.""" + p = Path(configured_path) + if not p.is_absolute(): + p = base_dir / p + return resolve_case_insensitive(p) + + +def load_config(config_path: Path) -> dict[str, Any]: + """Load and validate the JSON configuration file.""" + require(config_path.exists(), f"Config not found: {config_path}") + with config_path.open("r", encoding="utf-8") as f: + cfg = json.load(f) + require(isinstance(cfg, dict), f"Config root must be an object: {config_path}") + return cfg + + +def resolve_api_key(config: dict[str, Any]) -> str: + """Resolve the EIA API key from environment, config, or legacy fallback.""" + api_cfg = config["api"] + env_var = api_cfg.get("key_env_var", "EIA_API_KEY") + env_key = os.getenv(env_var, "").strip() + if env_key: + return env_key + fallback = api_cfg.get("key_fallback", "").strip() + if fallback: + LOGGER.warning("Using key_fallback from config.") + return fallback + try: + from _eia_api_functions import api_key as legacy_api_key # type: ignore + if legacy_api_key: + LOGGER.warning("Using API key from _eia_api_functions.py fallback.") + return str(legacy_api_key) + except Exception: + pass + raise ValueError(f"Missing EIA API key. 
Set {env_var} or api.key_fallback.") + + +# ============================================================================ +# EIA API Client +# ============================================================================ + +class EiaClient: + """Client for fetching data from the EIA AEO API with retry logic.""" + + def __init__(self, api_cfg: dict[str, Any], api_key: str): + self.base_url = api_cfg["base_url"].rstrip("/") + self.verify_ssl = bool(api_cfg.get("verify_ssl", True)) + self.timeout = int(api_cfg.get("timeout_seconds", 60)) + self.api_key = api_key + if not self.verify_ssl: + disable_warnings(InsecureRequestWarning) + retries = Retry( + total=int(api_cfg.get("max_retries", 4)), + connect=int(api_cfg.get("max_retries", 4)), + read=int(api_cfg.get("max_retries", 4)), + status=int(api_cfg.get("max_retries", 4)), + backoff_factor=float(api_cfg.get("backoff_seconds", 1.0)), + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["GET"], + raise_on_status=False, + ) + self.session = requests.Session() + adapter = HTTPAdapter(max_retries=retries) + self.session.mount("https://", adapter) + self.session.mount("http://", adapter) + + def get_json(self, path: str, + params: list[tuple[str, str]] | None = None) -> dict[str, Any]: + full_path = path if path.startswith("/") else f"/{path}" + query = [("api_key", self.api_key)] + if params: + query.extend(params) + resp = self.session.get( + f"{self.base_url}{full_path}", + params=query, + timeout=self.timeout, + verify=self.verify_ssl, + ) + resp.raise_for_status() + payload = resp.json() + require("response" in payload, + f"Unexpected payload from {path}: {payload}") + return payload + + def get_facets(self, aeo_year: int, facet: str) -> list[dict[str, Any]]: + return self.get_json( + f"/aeo/{aeo_year}/facet/{facet}")["response"]["facets"] + + def get_data(self, path: str, + params: list[tuple[str, str]]) -> pd.DataFrame: + payload = self.get_json(path, params) + warnings = payload["response"].get("warnings", []) + for w in warnings: + LOGGER.warning("EIA warning: %s | %s", + w.get("warning"), w.get("description")) + data = payload["response"].get("data", []) + require(data, f"No data from endpoint {path}") + return pd.DataFrame(data) + + +# ============================================================================ +# EIA API Resolution Helpers +# ============================================================================ + +def resolve_region_ids(client: EiaClient, aeo_year: int, + regions: list[str]) -> dict[str, str]: + """Map region display names to EIA region IDs.""" + facets = client.get_facets(aeo_year, "regionId") + region_map = {normalize_token(item["name"]): str(item["id"]) + for item in facets} + out: dict[str, str] = {} + for name in regions: + key = normalize_token(name) + require(key in region_map, f"Region not found in EIA facets: {name}") + out[name] = region_map[key] + return out + + +def resolve_series_ids(client: EiaClient, aeo_year: int, + series_name: str) -> list[str]: + """Find EIA series IDs matching a given series name.""" + facets = client.get_facets(aeo_year, "seriesId") + ids = [str(item["id"]) for item in facets + if normalize_token(item.get("name")) == normalize_token(series_name)] + require(ids, f"Series not found: {series_name}") + return ids + + +def resolve_ng_scenarios( + client: EiaClient, + aeo_year: int, + include_scenarios: list[str] | None = None, +) -> pd.DataFrame: + """ + Discover available AEO scenarios for NG data. 
+ + If include_scenarios is provided, only those scenarios are kept + (matching by scenario id or scenario name, case-insensitive). + """ + facets = client.get_facets(aeo_year, "scenario") + rows: list[dict[str, str]] = [] + for item in facets: + scenario_id = str(item.get("id", "")).strip() + scenario_name = str(item.get("name", "")).strip() + if not scenario_id: + continue + # Skip legacy composite IDs (e.g., "aeo2023ref") + if normalize_token(scenario_id).startswith("aeo"): + continue + rows.append({"scenario_id": scenario_id, + "scenario_name": scenario_name}) + require(rows, "No NG scenarios left after filtering legacy IDs.") + + if include_scenarios: + picked: list[dict[str, str]] = [] + missing: list[str] = [] + for raw in include_scenarios: + token = normalize_token(str(raw).replace("{aeo_year}", str(aeo_year))) + found = None + for row in rows: + sid = str(row["scenario_id"]) + sname = str(row["scenario_name"]) + if normalize_token(sid) == token or normalize_token(sname) == token: + found = row + break + if found is None: + missing.append(str(raw)) + continue + if all(str(r["scenario_id"]) != str(found["scenario_id"]) for r in picked): + picked.append(found) + require( + not missing, + f"Configured alpha_regression.fetch scenarios not found: {missing}", + ) + out = pd.DataFrame(picked) + LOGGER.info( + "Selected NG scenarios from config (%d): %s", + len(out), + out["scenario_id"].tolist(), + ) + return out.reset_index(drop=True) + + out = ( + pd.DataFrame(rows) + .drop_duplicates(subset=["scenario_id"]) + .sort_values("scenario_id") + .reset_index(drop=True) + ) + LOGGER.info("Selected NG scenarios (%d): %s", len(out), out["scenario_id"].tolist()) + return out + + +def resolve_output_scenario_aliases( + aeo_year: int, + configured_outputs: Any, +) -> dict[str, list[str]]: + """Build output scenario alias map from config, with defaults.""" + if configured_outputs is None: + raw_map: dict[str, list[str]] = NG_OUTPUT_SCENARIOS + elif isinstance(configured_outputs, dict): + raw_map = {} + for suffix, aliases in configured_outputs.items(): + require( + isinstance(aliases, list), + f"alpha_regression.outputs['{suffix}'] must be a list.", + ) + raw_map[str(suffix)] = [str(a) for a in aliases] + elif isinstance(configured_outputs, list): + raw_map = {} + for row in configured_outputs: + require( + isinstance(row, dict), + "alpha_regression.outputs list entries must be objects.", + ) + suffix = str(row.get("file_suffix", "")).strip() + require(suffix, "alpha_regression.outputs entry missing file_suffix.") + aliases = row.get("aliases", []) + require( + isinstance(aliases, list), + f"alpha_regression.outputs entry '{suffix}' aliases must be a list.", + ) + vals = [str(a).strip() for a in aliases if str(a).strip()] + scenario_id = str(row.get("scenario_id", "")).strip() + if scenario_id: + vals.insert(0, scenario_id) + require(vals, f"alpha_regression.outputs entry '{suffix}' has no aliases.") + raw_map[suffix] = vals + else: + raise ValueError( + "alpha_regression.outputs must be either an object or a list of objects." + ) + + out: dict[str, list[str]] = {} + for suffix, aliases in raw_map.items(): + suffix_txt = str(suffix).strip() + require(suffix_txt, "alpha_regression.outputs contains an empty file_suffix.") + clean_aliases = [str(a).strip() for a in aliases if str(a).strip()] + require(clean_aliases, f"alpha_regression.outputs['{suffix_txt}'] has no aliases.") + # Keep original strings for logs and output; matching happens via normalize_token. 
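+ # For example, normalize_token("High Oil and Gas Supply") -> "highoilandgassupply",
+ # so that alias matches a scenario *name*, while an alias like "highogs" matches
+ # the scenario *id*; both forms are accepted when output scenarios are resolved.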
+ out[suffix_txt] = clean_aliases + require(out, "alpha_regression.outputs resolved to an empty mapping.") + LOGGER.info( + "Configured alpha output scenario map for AEO %d: %s", + aeo_year, + {k: v for k, v in out.items()}, + ) + return out + + +def resolve_output_scenarios( + available_scenarios: pd.DataFrame, + aeo_year: int, + output_aliases: dict[str, list[str]], +) -> pd.DataFrame: + """Match available scenarios to configured output scenario aliases.""" + require(not available_scenarios.empty, + "No available scenarios to resolve NG output scenarios.") + rows: list[dict[str, str]] = [] + for suffix, aliases in output_aliases.items(): + alias_tokens = {normalize_token(a.replace("{aeo_year}", str(aeo_year))) + for a in aliases} + found_row = None + for row in available_scenarios.itertuples(index=False): + sid = str(row.scenario_id) + sname = str(row.scenario_name) + if (normalize_token(sid) in alias_tokens + or normalize_token(sname) in alias_tokens): + found_row = row + break + require(found_row is not None, + f"Could not resolve required NG output scenario: '{suffix}'") + rows.append({ + "scenario_id": str(found_row.scenario_id), + "scenario_name": str(found_row.scenario_name), + "file_suffix": str(suffix), + }) + out = pd.DataFrame(rows).reset_index(drop=True) + require( + out["scenario_id"].nunique() == len(out), + "alpha_regression.outputs resolved to duplicate scenario_id values. " + "Use distinct output scenarios.", + ) + LOGGER.info("Resolved NG output scenarios: %s", + out[["scenario_id", "file_suffix"]].to_dict(orient="records")) + return out + + +def resolve_ng_region_order(config: dict[str, Any]) -> list[str]: + """Get the ordered list of census divisions from the config.""" + configured = config["ng"]["regions"] + order = [output_label_to_cendiv(x) for x in configured] + require(len(order) == len(set(order)), "NG regions contains duplicates.") + return order + + +# ============================================================================ +# Data Fetching +# ============================================================================ + +def fetch_aeo_series_by_scenario( + client: EiaClient, + aeo_year: int, + series_name: str, + scenario_ids: list[str], + region_ids: list[str], + value_col: str, + start_year: int, + end_year: int, + raw_output_path: Path | None = None, +) -> pd.DataFrame: + """ + Fetch a single AEO series (price/demand) for all scenarios and regions. 
+ + Returns a DataFrame with columns: + [scenario_id, cendiv, year, ] + """ + series_ids = resolve_series_ids(client, aeo_year, series_name) + params: list[tuple[str, str]] = [ + ("data[]", "value"), + ("start", str(start_year)), + ("end", str(end_year)), + ] + params.extend(("facets[scenario][]", sid) for sid in scenario_ids) + params.extend(("facets[regionId][]", rid) for rid in region_ids) + params.extend(("facets[seriesId][]", sid) for sid in series_ids) + + df = client.get_data(f"/aeo/{aeo_year}/data", params=params) + if raw_output_path is not None: + raw_output_path.parent.mkdir(parents=True, exist_ok=True) + raw_export = df.copy() + raw_export.insert(0, "series_name", series_name) + raw_export.to_csv(raw_output_path, index=False, float_format="%.6f") + LOGGER.info( + "Wrote raw AEO rows for '%s' to %s (%d rows)", + series_name, raw_output_path, len(raw_export) + ) + need = {"scenario", "regionName", "period", "value"} + require( + not (need - set(df.columns)), + f"Missing columns for series '{series_name}': " + f"{sorted(need - set(df.columns))}", + ) + + df["scenario_id"] = df["scenario"].astype(str) + df["cendiv"] = df["regionName"].map(canonical_cendiv) + df["year"] = pd.to_numeric(df["period"], errors="coerce") + df[value_col] = pd.to_numeric(df["value"], errors="coerce") + df = df.dropna(subset=["scenario_id", "cendiv", "year", value_col]).copy() + df["year"] = df["year"].astype(int) + df = df[df["scenario_id"].isin(scenario_ids)].copy() + + # Verify no conflicting duplicate values + uniq = df.groupby(["scenario_id", "cendiv", "year"])[value_col].nunique() + require( + (uniq <= 1).all(), + f"Conflicting duplicate values for series '{series_name}'. " + f"Samples: {uniq[uniq > 1].head().to_dict()}", + ) + out = df.groupby( + ["scenario_id", "cendiv", "year"], as_index=False + )[value_col].mean() + LOGGER.info("Fetched %s rows for series '%s': %d", + value_col, series_name, len(out)) + return out + + +# ============================================================================ +# Data Validation and Filtering +# ============================================================================ + +def filter_complete_ng_scenarios( + scenario_table: pd.DataFrame, + series_frames: dict[str, pd.DataFrame], + region_order: list[str], + start_year: int, + end_year: int, +) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]]: + """Drop scenarios that have incomplete region × year coverage.""" + expected_count = len(region_order) * (end_year - start_year + 1) + keep = set(scenario_table["scenario_id"].tolist()) + + for name, frame in series_frames.items(): + counts = frame.groupby("scenario_id").size().to_dict() + complete = {sid for sid in keep + if counts.get(sid, 0) == expected_count} + dropped = sorted(keep - complete) + if dropped: + LOGGER.warning( + "Series '%s' has incomplete coverage for %s-%s. 
" + "Dropping scenarios: %s", + name, start_year, end_year, dropped, + ) + keep = complete + + require(keep, "No NG scenarios remain after coverage filtering.") + out_scen = (scenario_table[scenario_table["scenario_id"].isin(keep)] + .copy().reset_index(drop=True)) + out_frames = { + name: frame[frame["scenario_id"].isin(keep)].copy() + for name, frame in series_frames.items() + } + return out_scen, out_frames + + +def validate_ng_coverage( + frame: pd.DataFrame, + scenarios: list[str], + region_order: list[str], + start_year: int, + end_year: int, + label: str, +) -> None: + """Verify that a DataFrame has complete scenario × region × year coverage.""" + expected = { + (sid, reg, yr) + for sid in scenarios + for reg in region_order + for yr in range(start_year, end_year + 1) + } + actual = { + (r.scenario_id, r.cendiv, int(r.year)) + for r in frame[["scenario_id", "cendiv", "year"]].itertuples(index=False) + } + missing = expected - actual + require( + not missing, + f"{label} missing scenario/region/year combos. " + f"Sample: {list(sorted(missing))[:10]}", + ) + + +# ============================================================================ +# Historical Data Handling +# ============================================================================ + +def resolve_history_file(input_dir: Path, stem: str, + history_suffix: str) -> Path: + """Locate a historical data CSV file.""" + file_path = input_dir / f"{stem}_{history_suffix}.csv" + require(file_path.exists(), + f"History source file not found: {file_path}") + return file_path + + +def load_history_wide_file( + file_path: Path, + value_col: str, + region_order: list[str], +) -> pd.DataFrame: + """ + Load a wide-format historical CSV and melt to long format. + + Expected CSV format: year/t column + one column per region (output labels). + Returns: DataFrame with [cendiv, year, ]. + """ + require(file_path.exists(), + f"History source file not found: {file_path}") + df = pd.read_csv(file_path) + year_col = ("year" if "year" in df.columns + else ("t" if "t" in df.columns else None)) + require(year_col is not None, + f"History file missing 'year' or 't' column: {file_path}") + df = df.rename(columns={year_col: "year"}) + for col in df.columns: + if col != "year": + df[col] = pd.to_numeric(df[col], errors="coerce") + melted = (df.melt(id_vars=["year"], var_name="region_out", + value_name=value_col) + .dropna(subset=[value_col]).copy()) + melted["cendiv"] = melted["region_out"].map(output_label_to_cendiv) + melted["year"] = pd.to_numeric(melted["year"], errors="coerce").astype(int) + melted = melted[melted["cendiv"].isin(region_order)].copy() + return melted[["cendiv", "year", value_col]] + + +def apply_reference_history_to_all_scenarios( + frame: pd.DataFrame, + value_col: str, + history_frame: pd.DataFrame, + scenario_ids: list[str], + projection_start_year: int, +) -> pd.DataFrame: + """ + Backfill pre-projection years with historical data. + + Historical data is replicated identically across all scenarios, + since AEO scenarios share the same historical period. 
+ """ + hist = history_frame[history_frame["year"] < projection_start_year].copy() + if hist.empty: + return frame + replicated = pd.concat( + [hist.assign(scenario_id=sid) for sid in scenario_ids], + ignore_index=True, + ) + future = frame[frame["year"] >= projection_start_year].copy() + out = pd.concat([future, replicated], ignore_index=True) + out = out.groupby( + ["scenario_id", "cendiv", "year"], as_index=False + )[value_col].mean() + return out + + +def _append_year_to_history_csv( + csv_path: Path, + year: int, + data_frame: pd.DataFrame, + scenario_id: str, + value_col: str, +) -> None: + """Append a single year from the reference scenario to a history CSV if missing.""" + if not csv_path.exists(): + return + existing = pd.read_csv(csv_path) + year_col = "year" if "year" in existing.columns else "t" + if year_col not in existing.columns: + return + if year in existing[year_col].values: + return # Already present + + row_data = data_frame[ + (data_frame["scenario_id"] == scenario_id) + & (data_frame["year"] == year) + ].copy() + if row_data.empty: + LOGGER.warning("No data for year %d to append to %s.", year, csv_path.name) + return + + row_data["region_out"] = row_data["cendiv"].map(cendiv_output_label) + wide = row_data.pivot_table( + index="year", columns="region_out", values=value_col, aggfunc="mean", + ).reset_index().rename(columns={"year": year_col}) + + for col in existing.columns: + if col not in wide.columns: + wide[col] = float("nan") + wide = wide[existing.columns] + + updated = pd.concat([existing, wide], ignore_index=True) + updated = updated.sort_values(year_col).reset_index(drop=True) + updated.to_csv(csv_path, index=False, float_format="%.5f") + LOGGER.info("Appended year %d to %s.", year, csv_path.name) + + +# ============================================================================ +# Beta Loading +# ============================================================================ + +def load_regional_betas(beta_path: Path, + region_order: list[str]) -> dict[str, float]: + """ + Load regional beta coefficients from a CSV file. 
+ + The beta file (cd_beta0.csv) contains electric-sector-only betas, + representing the price sensitivity to regional electric sector demand: + Beta_regional(r) in units of 2004$/MMBtu per Quad + """ + require(beta_path.exists(), + f"Regional beta file not found: {beta_path}") + df = pd.read_csv(beta_path) + cendiv_col = next( + (c for c in df.columns if normalize_token(c).endswith("cendiv")), + None, + ) + value_col = next( + (c for c in df.columns if normalize_token(c) == "value"), + None, + ) + require(cendiv_col is not None and value_col is not None, + "Regional beta file missing cendiv/value columns.") + df["cendiv"] = df[cendiv_col].map(output_label_to_cendiv) + df["value"] = pd.to_numeric(df[value_col], errors="coerce") + df = df.dropna(subset=["cendiv", "value"]).copy() + betas = {row.cendiv: float(row.value) + for row in df[["cendiv", "value"]].itertuples(index=False)} + missing = [c for c in region_order if c not in betas] + require(not missing, f"Regional beta file missing regions: {missing}") + return betas + + +# ============================================================================ +# Alpha Computation (Core Calculation) +# ============================================================================ + +def compute_ng_alpha( + price_2004: pd.DataFrame, + demand_elec: pd.DataFrame, + beta_regional: dict[str, float], + beta_national: float, + first_model_year: int, +) -> pd.DataFrame: + """ + Compute NG alpha with a scenario-specific index alpha(region, year, scenario). + + For each scenario we compute an implied residual: + + alpha_2004 = price_2004 - beta_reg * q_reg - beta_nat * q_nat + + ReEDS sets both NG beta terms to zero in the first model year, so the + first-year alpha must carry the full converted price level. + + Parameters + ---------- + price_2004 : DataFrame + Columns: [scenario_id, cendiv, year, price_2004] + demand_elec : DataFrame + Columns: [scenario_id, cendiv, year, demand_elec_quads] + beta_regional : dict + {cendiv: beta_value} for each census division + beta_national : float + National beta coefficient (2004$/MMBtu per Quad) + first_model_year : int + First modeled year in ReEDS. NG elasticity is disabled in this year. + + Returns + ------- + DataFrame with columns: [scenario_id, cendiv, year, alpha_2004]. + """ + # Merge price and regional demand. + merged = price_2004.merge( + demand_elec, + on=["scenario_id", "cendiv", "year"], + how="inner", + ) + + # Compute national demand: sum of regional electric demands per scenario-year. + q_nat = ( + demand_elec + .groupby(["scenario_id", "year"], as_index=False)["demand_elec_quads"] + .sum() + .rename(columns={"demand_elec_quads": "q_nat"}) + ) + merged = merged.merge(q_nat, on=["scenario_id", "year"], how="left") + + # Map regional betas. + merged["beta_reg"] = merged["cendiv"].map(beta_regional) + require( + not merged["beta_reg"].isna().any(), + "Missing regional beta values while computing NG alpha.", + ) + require( + not merged["q_nat"].isna().any(), + "Missing national demand values while computing NG alpha.", + ) + + # Scenario-level residual alpha. 
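+ # alpha_2004 = price_2004 - beta_reg * q_reg - beta_nat * q_nat for each
+ # (scenario, region, year); the first model year is overridden below because
+ # ReEDS zeroes both beta terms in that year, so alpha carries the full price.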
+ merged["alpha_2004"] = ( + merged["price_2004"] + - merged["beta_reg"] * merged["demand_elec_quads"] + - beta_national * merged["q_nat"] + ) + first_year_mask = merged["year"] == first_model_year + merged.loc[first_year_mask, "alpha_2004"] = merged.loc[first_year_mask, "price_2004"] + out = merged[["scenario_id", "cendiv", "year", "alpha_2004"]].copy() + out = out.sort_values(["scenario_id", "cendiv", "year"]).reset_index(drop=True) + return out + + +# ============================================================================ +# Output Writing +# ============================================================================ +def pivot_ng_series( + frame: pd.DataFrame, + scenario_id: str, + value_col: str, + region_order: list[str], +) -> pd.DataFrame: + """Pivot a long-format series into wide format (year × region).""" + subset = frame[frame["scenario_id"] == scenario_id].copy() + subset["region_out"] = subset["cendiv"].map(cendiv_output_label) + pivot = subset.pivot_table( + index="year", columns="region_out", + values=value_col, aggfunc="mean", + ) + ordered_cols = [cendiv_output_label(c) for c in region_order] + pivot = pivot.reindex(columns=ordered_cols) + pivot = pivot.sort_index().reset_index() + return pivot + + +def write_ng_outputs( + scenario_table: pd.DataFrame, + price_raw: pd.DataFrame, + demand_elec: pd.DataFrame, + demand_total: pd.DataFrame, + alpha_2004: pd.DataFrame, + beta_regional: dict[str, float], + region_order: list[str], + config: dict[str, Any], + base_dir: Path, +) -> None: + """ + Write all NG output files to the configured output directory. + + Output files per scenario: + - ng_AEO_{year}_{suffix}.csv : NG prices (AEO native dollar year) + - ng_demand_AEO_{year}_{suffix}.csv : Electric sector demand (Quads) + - ng_tot_demand_AEO_{year}_{suffix}.csv : Total sector demand (Quads) + - alpha_AEO_{year}_{suffix}.csv : Alpha values (2004$/MMBtu) + + Shared output files: + - cd_beta0.csv : Electric sector regional betas + """ + ng_cfg = config["ng"] + aeo_year = int(config["aeo_year"]) + out_dir = resolve_path(base_dir, config["paths"]["output_dir"]) + out_dir.mkdir(parents=True, exist_ok=True) + + written_files: list[Path] = [] + + # --- Per-scenario output files --- + for row in scenario_table.itertuples(index=False): + scenario_id = str(row.scenario_id) + file_suffix = getattr(row, "file_suffix", None) + require(bool(file_suffix), + f"Missing output suffix for NG scenario '{scenario_id}'") + suffix = str(file_suffix) + + price_wide = pivot_ng_series( + price_raw, scenario_id, "ng_price", region_order) + elec_wide = pivot_ng_series( + demand_elec, scenario_id, "demand_elec_quads", region_order) + total_wide = pivot_ng_series( + demand_total, scenario_id, "demand_total_quads", region_order) + alpha_wide = pivot_ng_series( + alpha_2004, scenario_id, "alpha_2004", region_order + ).rename(columns={"year": "t"}) + + fn_price = out_dir / f"ng_AEO_{aeo_year}_{suffix}.csv" + fn_elec = out_dir / f"ng_demand_AEO_{aeo_year}_{suffix}.csv" + fn_total = out_dir / f"ng_tot_demand_AEO_{aeo_year}_{suffix}.csv" + fn_alpha = out_dir / f"alpha_AEO_{aeo_year}_{suffix}.csv" + + price_wide.to_csv(fn_price, index=False, float_format="%.6f") + elec_wide.to_csv(fn_elec, index=False, float_format="%.6f") + total_wide.to_csv(fn_total, index=False, float_format="%.6f") + alpha_wide.to_csv(fn_alpha, index=False, float_format="%.6f") + written_files.extend([fn_price, fn_elec, fn_total, fn_alpha]) + + # --- Beta output files (copy from input) --- + input_dir = resolve_path( + base_dir, + 
config["paths"].get("input_dir", config["paths"]["output_dir"]), + ) + beta_files = ["cd_beta0.csv", "national_beta.csv"] + for beta_name in beta_files: + src = resolve_case_insensitive(input_dir / beta_name) + dst = out_dir / beta_name + if src.exists(): + shutil.copy2(src, dst) + written_files.append(dst) + else: + LOGGER.warning("%s not found at %s; skipping copy.", beta_name, src) + + LOGGER.info("Wrote NG outputs to %s (%d files)", out_dir, len(written_files)) + + +# ============================================================================ +# Main Pipeline +# ============================================================================ + +def run_ng_pipeline(config: dict[str, Any], base_dir: Path) -> None: + """ + Execute the full NG preprocessing pipeline: + + 1. Connect to EIA API and discover available scenarios + 2. Fetch price & demand series for all scenarios and regions + 3. Backfill historical data for pre-projection years + 4. Validate completeness of all data + 5. Convert prices from AEO dollar year to 2004$ using the deflator + 6. Compute alpha values using the supply curve inversion: + Alpha = Price_2004 - Beta_reg × Q_reg - Beta_nat × Q_nat + 7. Write output CSV files for ReEDS consumption + """ + api_key = resolve_api_key(config) + client = EiaClient(config["api"], api_key) + + ng_cfg = config["ng"] + aeo_year = int(config["aeo_year"]) + start_year = int(config["start_year"]) + end_year = int(config["end_year"]) + region_order = resolve_ng_region_order(config) + out_dir = resolve_path(base_dir, config["paths"]["output_dir"]) + out_dir.mkdir(parents=True, exist_ok=True) + raw_dir = out_dir / "raw_aeo_data" + raw_dir.mkdir(parents=True, exist_ok=True) + + scenarios_cfg = config.get("scenarios", {}) + if scenarios_cfg is None: + scenarios_cfg = {} + require(isinstance(scenarios_cfg, dict), "Config key 'scenarios' must be an object.") + alpha_scen_cfg = scenarios_cfg.get("alpha_regression", {}) + if alpha_scen_cfg is None: + alpha_scen_cfg = {} + require(isinstance(alpha_scen_cfg, dict), "Config key 'scenarios.alpha_regression' must be an object.") + + fetch_cfg = alpha_scen_cfg.get("fetch") + require( + fetch_cfg is None or isinstance(fetch_cfg, list), + "Config key 'scenarios.alpha_regression.fetch' must be a list when provided.", + ) + fetch_scenarios = [str(x).strip() for x in (fetch_cfg or []) if str(x).strip()] + output_aliases = resolve_output_scenario_aliases( + aeo_year, + alpha_scen_cfg.get("outputs"), + ) + + # ---- Step 1: Discover and resolve scenarios ---- + LOGGER.info("Step 1: Resolving AEO %d scenarios...", aeo_year) + all_scenarios = resolve_ng_scenarios( + client, + aeo_year, + include_scenarios=fetch_scenarios or None, + ) + output_scenarios = resolve_output_scenarios( + available_scenarios=all_scenarios, + aeo_year=aeo_year, + output_aliases=output_aliases, + ) + pd.DataFrame(client.get_facets(aeo_year, "scenario")).to_csv( + raw_dir / "scenario_facets.csv", index=False + ) + all_scenarios.to_csv(raw_dir / "selected_scenarios_all.csv", index=False) + output_scenarios.to_csv(raw_dir / "selected_scenarios_outputs.csv", index=False) + scenario_ids = all_scenarios["scenario_id"].tolist() + region_ids = list( + resolve_region_ids(client, aeo_year, ng_cfg["regions"]).values() + ) + pd.DataFrame(client.get_facets(aeo_year, "regionId")).to_csv( + raw_dir / "region_facets.csv", index=False + ) + + # ---- Step 2: Fetch price and demand data from EIA API ---- + LOGGER.info("Step 2: Fetching AEO data from EIA API...") + price_raw = fetch_aeo_series_by_scenario( + 
client, aeo_year, + NG_SERIES_NAMES["price"], + scenario_ids, region_ids, + "ng_price", start_year, end_year, + raw_output_path=raw_dir / "aeo_raw_ng_price.csv", + ) + demand_elec = fetch_aeo_series_by_scenario( + client, aeo_year, + NG_SERIES_NAMES["demand_elec"], + scenario_ids, region_ids, + "demand_elec_quads", start_year, end_year, + raw_output_path=raw_dir / "aeo_raw_ng_demand_electric_power.csv", + ) + demand_total = fetch_aeo_series_by_scenario( + client, aeo_year, + NG_SERIES_NAMES["demand_total"], + scenario_ids, region_ids, + "demand_total_quads", start_year, end_year, + raw_output_path=raw_dir / "aeo_raw_ng_demand_total.csv", + ) + + # ---- Step 3: Backfill historical data ---- + LOGGER.info("Step 3: Backfilling historical data...") + projection_start_year = int(min( + price_raw["year"].min(), + demand_elec["year"].min(), + demand_total["year"].min(), + )) + validation_start_year = projection_start_year + + if start_year < projection_start_year: + input_dir = resolve_path( + base_dir, + config["paths"].get("input_dir", config["paths"]["output_dir"]), + ) + try: + history_price = load_history_wide_file( + resolve_history_file(input_dir, "ng_AEO", HISTORY_SUFFIX), + "ng_price", region_order, + ) + history_elec = load_history_wide_file( + resolve_history_file( + input_dir, "ng_demand_AEO", HISTORY_SUFFIX), + "demand_elec_quads", region_order, + ) + history_total = load_history_wide_file( + resolve_history_file( + input_dir, "ng_tot_demand_AEO", HISTORY_SUFFIX), + "demand_total_quads", region_order, + ) + price_raw = apply_reference_history_to_all_scenarios( + price_raw, "ng_price", history_price, + scenario_ids, projection_start_year, + ) + demand_elec = apply_reference_history_to_all_scenarios( + demand_elec, "demand_elec_quads", history_elec, + scenario_ids, projection_start_year, + ) + demand_total = apply_reference_history_to_all_scenarios( + demand_total, "demand_total_quads", history_total, + scenario_ids, projection_start_year, + ) + validation_start_year = start_year + LOGGER.info( + "Backfilled %d-%d from historical files in %s.", + start_year, projection_start_year - 1, input_dir, + ) + except Exception as exc: + LOGGER.warning( + "Could not backfill NG history (%s). 
" + "Using available years %d-%d only.", + exc, projection_start_year, end_year, + ) + validation_start_year = projection_start_year + + # ---- Step 4: Filter and validate data completeness ---- + LOGGER.info("Step 4: Validating data completeness...") + all_scenarios, series_frames = filter_complete_ng_scenarios( + scenario_table=all_scenarios, + series_frames={ + "price": price_raw, + "demand_elec": demand_elec, + "demand_total": demand_total, + }, + region_order=region_order, + start_year=validation_start_year, + end_year=end_year, + ) + price_raw = series_frames["price"] + demand_elec = series_frames["demand_elec"] + demand_total = series_frames["demand_total"] + scenario_ids = all_scenarios["scenario_id"].tolist() + output_scenarios = ( + output_scenarios[output_scenarios["scenario_id"].isin(scenario_ids)] + .copy().reset_index(drop=True) + ) + require( + len(output_scenarios) == len(output_aliases), + "One or more required output scenarios were dropped by coverage filtering.", + ) + validate_ng_coverage( + price_raw, scenario_ids, region_order, + validation_start_year, end_year, "NG price", + ) + validate_ng_coverage( + demand_elec, scenario_ids, region_order, + validation_start_year, end_year, "NG electric demand", + ) + validate_ng_coverage( + demand_total, scenario_ids, region_order, + validation_start_year, end_year, "NG total demand", + ) + + # ---- Step 5: Convert prices to 2004$ ---- + LOGGER.info("Step 5: Converting prices to 2004$ (deflator=%.6f)...", + float(ng_cfg["price_deflator_to_2004"])) + deflator = float(ng_cfg["price_deflator_to_2004"]) + price_2004 = price_raw.copy() + price_2004["price_2004"] = price_2004["ng_price"] * deflator + + # ---- Step 6: Compute alpha values ---- + LOGGER.info("Step 6: Computing NG alpha values...") + input_dir = resolve_path( + base_dir, + config["paths"].get("input_dir", config["paths"]["output_dir"]), + ) + beta_path = resolve_case_insensitive(input_dir / "cd_beta0.csv") + beta_regional = load_regional_betas(beta_path, region_order) + national_beta_path = resolve_case_insensitive(input_dir / "national_beta.csv") + require( + national_beta_path.exists(), + f"National beta file not found: {national_beta_path}", + ) + national_beta_df = pd.read_csv(national_beta_path) + require( + "beta" in national_beta_df.columns, + f"National beta file missing 'beta' column: {national_beta_path}", + ) + beta_vals = pd.to_numeric(national_beta_df["beta"], errors="coerce").dropna() + require( + not beta_vals.empty, + f"National beta file has no numeric beta value: {national_beta_path}", + ) + beta_national = float(beta_vals.iloc[0]) + LOGGER.info( + " Beta_national = %.10f (2004$/MMBtu per Quad, source=%s)", + beta_national, + national_beta_path, + ) + LOGGER.info( + " Beta_regional: %s", + {cendiv_output_label(k): f"{v:.6f}" for k, v in beta_regional.items()}, + ) + + alpha_2004 = compute_ng_alpha( + price_2004, + demand_elec, + beta_regional, + beta_national, + first_model_year=start_year, + ) + validate_ng_coverage( + alpha_2004, scenario_ids, region_order, + validation_start_year, end_year, "NG alpha", + ) + + # ---- Step 7: Write output files ---- + LOGGER.info("Step 7: Writing output files...") + write_ng_outputs( + scenario_table=output_scenarios, + price_raw=price_raw, + demand_elec=demand_elec, + demand_total=demand_total, + alpha_2004=alpha_2004, + beta_regional=beta_regional, + region_order=region_order, + config=config, + base_dir=base_dir, + ) + + # ---- Step 8: Append projection_start_year to history CSVs ---- + ref_rows = output_scenarios[ 
+ output_scenarios["file_suffix"] == "reference" + ] + if not ref_rows.empty: + ref_sid = str(ref_rows.iloc[0]["scenario_id"]) + hist_dir = resolve_path( + base_dir, + config["paths"].get("input_dir", config["paths"]["output_dir"]), + ) + for stem, vcol, df in [ + ("ng_AEO", "ng_price", price_raw), + ("ng_demand_AEO", "demand_elec_quads", demand_elec), + ("ng_tot_demand_AEO", "demand_total_quads", demand_total), + ]: + _append_year_to_history_csv( + hist_dir / f"{stem}_{HISTORY_SUFFIX}.csv", + projection_start_year, df, ref_sid, vcol, + ) + + +# ============================================================================ +# Entry Point +# ============================================================================ + +def main() -> int: + args = parse_args() + setup_logging(args.log_level) + + script_dir = Path(__file__).resolve().parent + cfg_path = Path(args.config) + if not cfg_path.is_absolute(): + cwd_candidate = resolve_case_insensitive( + (Path.cwd() / cfg_path).resolve()) + script_candidate = resolve_case_insensitive( + (script_dir / cfg_path).resolve()) + cfg_path = (cwd_candidate if cwd_candidate.exists() + else script_candidate) + + cfg = load_config(cfg_path) + if args.aeo_year is not None: + cfg["aeo_year"] = int(args.aeo_year) + + run_ng_pipeline(cfg, script_dir) + LOGGER.info("NG pipeline complete.") + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as exc: + LOGGER.exception("Pipeline failed: %s", exc) + sys.exit(1) + + diff --git a/aeo_updates/natural_gas_price_regression/aeo_beta_regression.py b/aeo_updates/natural_gas_price_regression/aeo_beta_regression.py new file mode 100644 index 0000000..5413931 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/aeo_beta_regression.py @@ -0,0 +1,735 @@ +""" +NG Beta Regression: Estimate Regional and National Betas from AEO Scenarios +============================================================================ + +This script is a preprocessing step that runs BEFORE the main aeo_pipeline.py. +It estimates the beta coefficients used in the ReEDS NG supply curve model. + +Method +------ +The NG supply curve decomposes regional prices as: + + Price(r,t) = Alpha(r,t) + Beta_reg(r) * Q_reg(r,t) + Beta_nat * Q_nat(t) + +Alpha varies by region and year (absorbing all time-varying effects). +Beta_reg and Beta_nat are structural parameters estimated via regression. + +Estimation uses a joint fixed-effects regression across all regions: + + Price_2004(r,t,s) = Alpha(r,t) + Beta_reg(r) * Q_reg(r,t,s) + Beta_nat * Q_nat(t,s) + +Alpha(r,t) is absorbed by demeaning each variable across scenarios within +each (region, year) group. The joint OLS then estimates Beta_nat and all +regional Beta_reg values simultaneously. + +Data: By default uses ALL available years for selected AEO scenarios +EXCEPT High/Low O&G Supply (held out for alpha computation), but +scenario and year selection can be overridden in config under +scenarios.beta_regression. 
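+
+An illustrative scenarios.beta_regression fragment (the scenario IDs are
+placeholders, not a definitive list; see the README for the documented keys):
+
+    "scenarios": {
+        "beta_regression": {
+            "include": ["ref{aeo_year}", "..."]
+        }
+    }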
+ +Usage +----- + python aeo_beta_regression.py --config aeo_pipeline_config.json + +Outputs +------- + outputs of beta regression/cd_beta0.csv : Regional betas + outputs of beta regression/national_beta.csv : National beta + outputs of beta regression/beta_regression_summary.csv : Full summary + outputs of beta regression/alpha_from_beta_regression.csv : Shared alpha(region, year) + outputs of beta regression/alpha_from_beta_by_scenario.csv : Scenario residual alpha candidates + outputs of beta regression/alpha from beta regression/alpha_from_beta_shared.csv : Wide shared alpha + outputs of beta regression/*.png : Generated by results_visualization.py +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import re +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd +import requests +from requests.adapters import HTTPAdapter + +from urllib3 import disable_warnings +from urllib3.exceptions import InsecureRequestWarning +from urllib3.util.retry import Retry + +LOGGER = logging.getLogger("beta_regression") + +# ============================================================================ +# Constants +# ============================================================================ + +REGION_TO_LABEL = { + "New England": "New_England", + "Middle Atlantic": "Mid_Atlantic", + "East North Central": "East_North_Central", + "West North Central": "West_North_Central", + "South Atlantic": "South_Atlantic", + "East South Central": "East_South_Central", + "West South Central": "West_South_Central", + "Mountain": "Mountain", + "Pacific": "Pacific", +} + +NG_SERIES_NAMES = { + "price": "Energy Prices : Electric Power : Natural Gas", + "demand_elec": "Energy Use : Electric Power : Natural Gas", +} + +# Scenarios to EXCLUDE from regression (held out for alpha computation) +EXCLUDE_SCENARIOS = {"highogs", "lowogs", "high oil and gas supply", + "low oil and gas supply"} + + +# ============================================================================ +# Helpers +# ============================================================================ + +def _norm(value: Any) -> str: + if value is None: + return "" + return re.sub(r"[^a-z0-9]+", "", + str(value).replace("\xa0", " ").strip().lower()) + + +def _region_label(region_name: str) -> str: + if region_name in REGION_TO_LABEL: + return REGION_TO_LABEL[region_name] + norm = _norm(region_name) + for name, label in REGION_TO_LABEL.items(): + if _norm(name) == norm or _norm(label) == norm: + return label + raise ValueError(f"Unknown region: {region_name}") + + +def _calc_r2(y: np.ndarray, y_hat: np.ndarray) -> float: + """R2 for demeaned data (where mean ~= 0, so SS_tot = sum(y^2)).""" + ss_res = float(np.sum((y - y_hat) ** 2)) + ss_tot = float(np.sum(y ** 2)) + return 1 - ss_res / ss_tot if ss_tot > 0 else 0.0 + + +def _parse_optional_int(value: Any, config_key: str) -> int | None: + if value is None: + return None + try: + return int(value) + except (TypeError, ValueError) as exc: + raise ValueError(f"Config key '{config_key}' must be an integer.") from exc + + +# ============================================================================ +# EIA API Client +# ============================================================================ + +class EiaClient: + def __init__(self, api_cfg: dict[str, Any], api_key: str): + self.base_url = api_cfg["base_url"].rstrip("/") + self.verify_ssl = bool(api_cfg.get("verify_ssl", True)) + self.timeout = 
int(api_cfg.get("timeout_seconds", 60)) + self.api_key = api_key + if not self.verify_ssl: + disable_warnings(InsecureRequestWarning) + retries = Retry( + total=int(api_cfg.get("max_retries", 4)), + backoff_factor=float(api_cfg.get("backoff_seconds", 1.0)), + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["GET"], + raise_on_status=False, + ) + self.session = requests.Session() + self.session.mount("https://", HTTPAdapter(max_retries=retries)) + self.session.mount("http://", HTTPAdapter(max_retries=retries)) + + def _get(self, path: str, + params: list[tuple[str, str]] | None = None) -> dict: + url = f"{self.base_url}/{path.lstrip('/')}" + query = [("api_key", self.api_key)] + (params or []) + resp = self.session.get(url, params=query, + timeout=self.timeout, verify=self.verify_ssl) + resp.raise_for_status() + payload = resp.json() + if "response" not in payload: + raise ValueError(f"Unexpected payload from {path}") + return payload + + def get_facets(self, aeo_year: int, facet: str) -> list[dict]: + return self._get( + f"/aeo/{aeo_year}/facet/{facet}")["response"]["facets"] + + def get_data(self, aeo_year: int, + params: list[tuple[str, str]]) -> pd.DataFrame: + payload = self._get(f"/aeo/{aeo_year}/data", params) + for w in payload["response"].get("warnings", []): + LOGGER.warning("EIA: %s | %s", + w.get("warning"), w.get("description")) + data = payload["response"].get("data", []) + if not data: + raise ValueError(f"No data returned for AEO {aeo_year}") + return pd.DataFrame(data) + + +# ============================================================================ +# Scenario Discovery +# ============================================================================ + +def discover_regression_scenarios( + client: EiaClient, + aeo_year: int, + include_scenarios: list[str] | None = None, + exclude_aliases: list[str] | None = None, +) -> list[str]: + """ + Resolve beta-regression scenarios. + + If include_scenarios is provided, only those scenarios are used + (matching by scenario id or scenario name, case-insensitive). + Otherwise, all non-legacy scenarios are used except exclude_aliases. 
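+
+ With the default exclusions, the High and Low Oil and Gas Supply cases
+ ("highogs", "lowogs") are dropped here so they can be held out for the
+ alpha step.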
+ """ + facets = client.get_facets(aeo_year, "scenario") + rows: list[dict[str, str]] = [] + for item in facets: + sid = str(item.get("id", "")).strip() + sname = str(item.get("name", "")).strip() + if not sid or _norm(sid).startswith("aeo"): + continue + rows.append({"scenario_id": sid, "scenario_name": sname}) + + if include_scenarios: + selected: list[str] = [] + missing: list[str] = [] + for raw in include_scenarios: + token = _norm(str(raw).replace("{aeo_year}", str(aeo_year))) + found = None + for row in rows: + sid = str(row["scenario_id"]) + sname = str(row["scenario_name"]) + if _norm(sid) == token or _norm(sname) == token: + found = sid + break + if found is None: + missing.append(str(raw)) + continue + if found not in selected: + selected.append(found) + if missing: + raise ValueError( + f"Configured beta_regression.include scenarios not found: {missing}" + ) + LOGGER.info("Regression scenarios from config (%d): %s", len(selected), selected) + if len(selected) < 3: + LOGGER.warning( + "Only %d scenarios in beta_regression.include - regression may be unreliable.", + len(selected), + ) + return selected + + excluded_tokens = {_norm(x) for x in (exclude_aliases or sorted(EXCLUDE_SCENARIOS))} + selected: list[str] = [] + excluded: list[str] = [] + for row in rows: + sid = str(row["scenario_id"]) + sname = str(row["scenario_name"]) + if _norm(sid) in excluded_tokens or _norm(sname) in excluded_tokens: + excluded.append(f"{sid} ({sname})") + continue + selected.append(sid) + LOGGER.info("Regression scenarios (%d): %s", len(selected), selected) + if excluded: + LOGGER.info("Excluded scenarios: %s", excluded) + if len(selected) < 3: + LOGGER.warning("Only %d scenarios - regression may be unreliable.", + len(selected)) + return selected + + +# ============================================================================ +# Data Fetching +# ============================================================================ + +def fetch_series(client: EiaClient, aeo_year: int, series_name: str, + scenario_ids: list[str], region_ids: list[str], + value_col: str, + raw_output_path: Path | None = None, + start_year: int | None = None, + end_year: int | None = None) -> pd.DataFrame: + """Fetch an AEO series for all scenarios/regions.""" + if (start_year is None) != (end_year is None): + raise ValueError( + "Both start_year and end_year must be provided together for fetch_series." 
+ ) + if start_year is not None and end_year is not None and start_year > end_year: + raise ValueError(f"Invalid year window: start_year={start_year} > end_year={end_year}") + + series_facets = client.get_facets(aeo_year, "seriesId") + series_ids = [str(item["id"]) for item in series_facets + if _norm(item.get("name")) == _norm(series_name)] + if not series_ids: + raise ValueError(f"Series not found: {series_name}") + + params: list[tuple[str, str]] = [("data[]", "value")] + if start_year is not None and end_year is not None: + params.append(("start", str(start_year))) + params.append(("end", str(end_year))) + params.extend(("facets[scenario][]", s) for s in scenario_ids) + params.extend(("facets[regionId][]", r) for r in region_ids) + params.extend(("facets[seriesId][]", s) for s in series_ids) + + df = client.get_data(aeo_year, params) + if raw_output_path is not None: + raw_output_path.parent.mkdir(parents=True, exist_ok=True) + df.to_csv(raw_output_path, index=False, float_format="%.6f") + LOGGER.info("Wrote raw data for '%s' to %s (%d rows)", + series_name, raw_output_path, len(df)) + + df = df.rename(columns={"scenario": "scenario_id"}) + df["region"] = df["regionName"].map(_region_label) + df["year"] = pd.to_numeric(df["period"], errors="coerce").astype(int) + df[value_col] = pd.to_numeric(df["value"], errors="coerce") + df = df.dropna(subset=[value_col]) + df = df[df["scenario_id"].isin(scenario_ids)] + + out = (df.groupby(["scenario_id", "region", "year"], as_index=False) + [value_col].mean()) + LOGGER.info("Fetched '%s': %d rows (%d scenarios x %d regions x %d years)", + series_name, len(out), out["scenario_id"].nunique(), + out["region"].nunique(), out["year"].nunique()) + return out + + +# ============================================================================ +# Joint Beta Regression +# ============================================================================ + +def estimate_joint_betas( + price: pd.DataFrame, + demand: pd.DataFrame, + deflator: float, + regions: list[str], +) -> tuple[float, float, dict[str, float], dict[str, dict], pd.DataFrame, float]: + """ + Estimate all betas jointly from the full equation: + + Price_2004(r,t,s) = Alpha(r,t) + Beta_reg(r)*Q_reg(r,t,s) + Beta_nat*Q_nat(t,s) + + Demeaning across scenarios within each (region, year) removes Alpha. + OLS on the demeaned system estimates Beta_nat and all Beta_reg simultaneously. + + Returns: (beta_nat, beta_nat_r2, beta_reg_dict, regional_diagnostics, + regression_data, model_r2) + """ + # National demand = sum of regional demands per scenario-year + demand_nat = (demand.groupby(["scenario_id", "year"], as_index=False) + ["demand"].sum() + .rename(columns={"demand": "demand_nat"})) + + merged = price.merge(demand, on=["scenario_id", "region", "year"]) + merged = merged.merge(demand_nat, on=["scenario_id", "year"]) + merged["price_2004"] = merged["price"] * deflator + + # Demean across scenarios within each (region, year) group + for col, dcol in [("price_2004", "dp"), ("demand", "dq_reg"), + ("demand_nat", "dq_nat")]: + means = merged.groupby(["year", "region"])[col].transform("mean") + merged[dcol] = merged[col] - means + + # Build X matrix: [dq_nat, dq_reg(region1), dq_reg(region2), ...] 
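+    # Shape note: with n demeaned rows and R = len(regions), X is n x (1 + R).
+    # Column 0 carries dq_nat for every row; column 1 + i carries dq_reg only on
+    # rows whose region equals regions[i] (zero elsewhere), so a single lstsq
+    # call recovers beta_nat and every regional beta_reg jointly.
+    # For example, with regions = ["NewEngland", "Pacific"], a Pacific row
+    # contributes [dq_nat, 0.0, dq_reg] to X.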
+ region_arr = merged["region"].to_numpy() + dq_reg_x = merged["dq_reg"].to_numpy() + x_cols = [merged["dq_nat"].to_numpy()] + for region in regions: + x_cols.append(np.where(region_arr == region, dq_reg_x, 0.0)) + X = np.column_stack(x_cols) + y = merged["dp"].to_numpy() + + # OLS + coeffs = np.linalg.lstsq(X, y, rcond=None)[0] + beta_nat = float(coeffs[0]) + beta_reg = {region: float(coeffs[i + 1]) for i, region in enumerate(regions)} + + # Diagnostics + y_hat = X @ coeffs + model_r2 = _calc_r2(y, y_hat) + + # Partial R2 for national beta (residualize out regional terms) + merged["dp_hat"] = y_hat + merged["beta_reg_val"] = merged["region"].map(beta_reg) + merged["dp_partial_nat"] = ( + merged["dp"] - merged["beta_reg_val"] * merged["dq_reg"] + ) + beta_nat_r2 = _calc_r2( + merged["dp_partial_nat"].to_numpy(), + beta_nat * merged["dq_nat"].to_numpy()) + + # Per-region partial R2 (residualize out national term) + merged["dp_partial_reg"] = merged["dp"] - beta_nat * merged["dq_nat"] + regional_diag: dict[str, dict] = {} + for region in regions: + reg = merged[merged["region"] == region] + if reg.empty: + continue + regional_diag[region] = { + "r2_partial": _calc_r2( + reg["dp_partial_reg"].to_numpy(), + beta_reg[region] * reg["dq_reg"].to_numpy()), + "r2_full": _calc_r2( + reg["dp"].to_numpy(), reg["dp_hat"].to_numpy()), + "n_obs": len(reg), + } + + # Log results + LOGGER.info("Joint model: beta_nat=%.10f (partial R2=%.4f, model R2=%.4f)", + beta_nat, beta_nat_r2, model_r2) + for region in regions: + diag = regional_diag.get(region, {}) + LOGGER.info(" %-25s beta=%.10f (partial R2=%.4f, n=%d)", + region, beta_reg[region], + diag.get("r2_partial", float("nan")), + diag.get("n_obs", 0)) + + return beta_nat, beta_nat_r2, beta_reg, regional_diag, merged, model_r2 + + +# ============================================================================ +# Output: CSVs +# ============================================================================ + +def save_outputs( + out_dir: Path, + beta_nat: float, + beta_nat_r2: float, + beta_reg: dict[str, float], + regional_diag: dict[str, dict], + reg_data: pd.DataFrame, + model_r2: float, + first_model_year: int, +) -> None: + """Write beta tables and regression diagnostics.""" + out_dir.mkdir(parents=True, exist_ok=True) + + # cd_beta0.csv + pd.DataFrame({ + "*cendiv": list(beta_reg.keys()), + "value": list(beta_reg.values()), + }).to_csv(out_dir / "cd_beta0.csv", index=False, float_format="%.6f") + + # national_beta.csv + pd.DataFrame([{ + "beta": beta_nat, + }]).to_csv(out_dir / "national_beta.csv", index=False, float_format="%.6f") + + # Summary table + rows = [{ + "scope": "national", "region": "ALL", "beta": beta_nat, + "r2": beta_nat_r2, "r2_full": model_r2, "n_obs": len(reg_data), + }] + for region, beta in beta_reg.items(): + diag = regional_diag.get(region, {}) + rows.append({ + "scope": "regional", "region": region, "beta": beta, + "r2": diag.get("r2_partial", float("nan")), + "r2_full": diag.get("r2_full", float("nan")), + "n_obs": diag.get("n_obs", 0), + }) + pd.DataFrame(rows).to_csv( + out_dir / "beta_regression_summary.csv", + index=False, float_format="%.6f") + + # Regression point data for review + point_cols = ["scenario_id", "year", "region", "demand", "demand_nat", + "price", "price_2004", "dp", "dq_reg", "dq_nat", + "dp_partial_reg", "dp_partial_nat", "dp_hat"] + available = [c for c in point_cols if c in reg_data.columns] + reg_data[available].to_csv( + out_dir / "regression_points.csv", index=False, float_format="%.6f") + + # Alpha 
from beta-step data: + # 1) Scenario residual alpha candidate: + # alpha_candidate(r,t,s) = price_2004 - beta_reg*regional_demand - beta_nat*national_demand + # except in the first modeled year, where ReEDS disables both beta terms + # and alpha must therefore equal the full converted price. + # 2) Model alpha indexed by (region, year): + # alpha_shared(r,t) = mean_s(alpha_candidate(r,t,s)) + alpha_need = {"scenario_id", "year", "region", "price_2004", "demand", "demand_nat"} + missing_alpha_cols = sorted(alpha_need - set(reg_data.columns)) + if missing_alpha_cols: + raise ValueError( + f"Regression data missing columns needed for alpha export: {missing_alpha_cols}" + ) + + alpha_long = reg_data[ + ["scenario_id", "year", "region", "price_2004", "demand", "demand_nat"] + ].copy() + alpha_long = alpha_long.rename( + columns={"demand": "demand_elec_quads", "demand_nat": "q_nat"} + ) + alpha_long["beta_reg"] = alpha_long["region"].map(beta_reg) + if alpha_long["beta_reg"].isna().any(): + bad = sorted(alpha_long.loc[alpha_long["beta_reg"].isna(), "region"].astype(str).unique().tolist()) + raise ValueError(f"Missing regional beta values while exporting alpha: {bad}") + alpha_long["beta_nat"] = beta_nat + alpha_long["alpha_candidate"] = ( + alpha_long["price_2004"] + - alpha_long["beta_reg"] * alpha_long["demand_elec_quads"] + - alpha_long["beta_nat"] * alpha_long["q_nat"] + ) + first_year_mask = alpha_long["year"] == first_model_year + alpha_long.loc[first_year_mask, "alpha_candidate"] = alpha_long.loc[first_year_mask, "price_2004"] + alpha_long = alpha_long.sort_values(["scenario_id", "year", "region"]).reset_index(drop=True) + alpha_long.to_csv( + out_dir / "alpha_from_beta_by_scenario.csv", + index=False, + float_format="%.6f", + ) + + alpha_shared = ( + alpha_long.groupby(["region", "year"], as_index=False)["alpha_candidate"] + .mean() + .rename(columns={"alpha_candidate": "alpha_2004"}) + ) + n_scenarios = ( + alpha_long.groupby(["region", "year"], as_index=False)["scenario_id"] + .nunique() + .rename(columns={"scenario_id": "n_scenarios_used"}) + ) + alpha_shared = alpha_shared.merge(n_scenarios, on=["region", "year"], how="left") + alpha_shared = alpha_shared.sort_values(["region", "year"]).reset_index(drop=True) + alpha_shared.to_csv( + out_dir / "alpha_from_beta_regression.csv", + index=False, + float_format="%.6f", + ) + + alpha_wide_dir = out_dir / "alpha from beta regression" + alpha_wide_dir.mkdir(parents=True, exist_ok=True) + # Remove stale per-scenario files from earlier output format. 
+ for old in alpha_wide_dir.glob("alpha_from_beta_*.csv"): + old.unlink() + old_index = alpha_wide_dir / "alpha_from_beta_files.csv" + if old_index.exists(): + old_index.unlink() + + alpha_wide = ( + alpha_shared.pivot_table( + index="year", + columns="region", + values="alpha_2004", + aggfunc="mean", + ) + .sort_index() + .reset_index() + ) + alpha_wide.to_csv( + alpha_wide_dir / "alpha_from_beta_shared.csv", + index=False, + float_format="%.6f", + ) + + LOGGER.info("Wrote outputs to %s", out_dir) + + +# ============================================================================ +# Pipeline +# ============================================================================ + +def run(config: dict[str, Any], base_dir: Path) -> None: + ng_cfg = config["ng"] + aeo_year = int(config["aeo_year"]) + deflator = float(ng_cfg["price_deflator_to_2004"]) + regions_config = ng_cfg["regions"] + regions = [REGION_TO_LABEL[r] for r in regions_config] + + # API key + api_key = os.getenv( + config["api"].get("key_env_var", "EIA_API_KEY"), + config["api"].get("key_fallback", ""), + ).strip() + if not api_key: + raise ValueError( + "Missing EIA API key. Set EIA_API_KEY or api.key_fallback.") + client = EiaClient(config["api"], api_key) + + scenarios_cfg = config.get("scenarios", {}) + if scenarios_cfg is None: + scenarios_cfg = {} + if not isinstance(scenarios_cfg, dict): + raise ValueError("Config key 'scenarios' must be an object.") + beta_scen_cfg = scenarios_cfg.get("beta_regression", {}) + if beta_scen_cfg is None: + beta_scen_cfg = {} + if not isinstance(beta_scen_cfg, dict): + raise ValueError("Config key 'scenarios.beta_regression' must be an object.") + + include_cfg = beta_scen_cfg.get("include") + if include_cfg is not None and not isinstance(include_cfg, list): + raise ValueError("Config key 'scenarios.beta_regression.include' must be a list.") + include_scenarios = [str(x).strip() for x in (include_cfg or []) if str(x).strip()] + + exclude_cfg = beta_scen_cfg.get("exclude_aliases") + if exclude_cfg is not None and not isinstance(exclude_cfg, list): + raise ValueError("Config key 'scenarios.beta_regression.exclude_aliases' must be a list.") + exclude_aliases = [str(x).strip() for x in (exclude_cfg or []) if str(x).strip()] + + beta_start_year = _parse_optional_int( + beta_scen_cfg.get("start_year"), "scenarios.beta_regression.start_year" + ) + beta_end_year = _parse_optional_int( + beta_scen_cfg.get("end_year"), "scenarios.beta_regression.end_year" + ) + if (beta_start_year is None) != (beta_end_year is None): + raise ValueError( + "Config keys 'scenarios.beta_regression.start_year' and " + "'scenarios.beta_regression.end_year' must be provided together." + ) + if beta_start_year is not None and beta_end_year is not None and beta_start_year > beta_end_year: + raise ValueError( + "Config key 'scenarios.beta_regression': start_year cannot be greater than end_year." 
+ ) + if beta_start_year is not None and beta_end_year is not None: + LOGGER.info( + "Beta regression year range from config: %s-%s", + beta_start_year, beta_end_year, + ) + else: + LOGGER.info("Beta regression year range: all available AEO years.") + + # Step 1: Discover scenarios + LOGGER.info("Step 1: Discovering AEO %d scenarios...", aeo_year) + scenario_ids = discover_regression_scenarios( + client, + aeo_year, + include_scenarios=include_scenarios or None, + exclude_aliases=exclude_aliases or None, + ) + + # Resolve region IDs + facets = client.get_facets(aeo_year, "regionId") + region_lookup = {_norm(item["name"]): str(item["id"]) for item in facets} + region_ids = [] + for name in regions_config: + key = _norm(name) + if key not in region_lookup: + raise ValueError(f"Region not in EIA facets: {name}") + region_ids.append(region_lookup[key]) + + # Save raw facets for reference + out_dir = base_dir / "outputs of beta regression" + raw_dir = out_dir / "raw_aeo_data" + raw_dir.mkdir(parents=True, exist_ok=True) + pd.DataFrame(client.get_facets(aeo_year, "scenario")).to_csv( + raw_dir / "scenario_facets.csv", index=False) + pd.DataFrame({"selected_scenario_id": scenario_ids}).to_csv( + raw_dir / "selected_scenarios.csv", index=False) + + # Step 2: Fetch data + LOGGER.info("Step 2: Fetching price and demand data...") + price_raw = fetch_series( + client, aeo_year, NG_SERIES_NAMES["price"], + scenario_ids, region_ids, "price", + raw_output_path=raw_dir / "raw_ng_price.csv", + start_year=beta_start_year, + end_year=beta_end_year, + ) + demand_raw = fetch_series( + client, aeo_year, NG_SERIES_NAMES["demand_elec"], + scenario_ids, region_ids, "demand", + raw_output_path=raw_dir / "raw_ng_demand_elec.csv", + start_year=beta_start_year, + end_year=beta_end_year, + ) + if price_raw.empty or demand_raw.empty: + raise ValueError( + "No data available for configured beta regression year range." 
+ ) + LOGGER.info( + "Price years used: %s-%s", + int(price_raw["year"].min()), + int(price_raw["year"].max()), + ) + LOGGER.info( + "Demand years used: %s-%s", + int(demand_raw["year"].min()), + int(demand_raw["year"].max()), + ) + + # Step 3: Estimate betas + LOGGER.info("Step 3: Estimating betas (joint fixed-effects regression)...") + (beta_nat, beta_nat_r2, beta_reg, regional_diag, + reg_data, model_r2) = estimate_joint_betas( + price_raw, + demand_raw, + deflator, + regions, + ) + + # Step 4: Save outputs + LOGGER.info("Step 4: Saving outputs...") + save_outputs(out_dir, beta_nat, beta_nat_r2, beta_reg, + regional_diag, reg_data, model_r2, + first_model_year=int(config["start_year"])) + + # Print summary + print("\n" + "=" * 60) + print("RESULTS") + print("=" * 60) + print(f"\nNational beta: {beta_nat:.15f}") + print(f"Model R2: {model_r2:.6f}") + print(f"\nRegional betas:") + for region, beta in beta_reg.items(): + diag = regional_diag.get(region, {}) + print(f" {region:25s} {beta:.12f} " + f"(partial R2={diag.get('r2_partial', float('nan')):.4f})") + print(f"\nOutputs: {out_dir}") + print("\nNext commands:") + print(" python sync_beta_to_alpha_inputs.py --config aeo_pipeline_config.json") + print(" python aeo_alpha_regression.py --config aeo_pipeline_config.json") + print(" python results_visualization.py --config aeo_pipeline_config.json") + print("=" * 60) + + +# ============================================================================ +# Entry Point +# ============================================================================ + +def main() -> int: + parser = argparse.ArgumentParser( + description="Estimate NG beta coefficients from AEO scenarios") + parser.add_argument("--config", default="aeo_pipeline_config.json") + parser.add_argument("--log-level", default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"]) + parser.add_argument("--aeo-year", type=int, default=None) + args = parser.parse_args() + + logging.basicConfig(level=getattr(logging, args.log_level), + format="%(asctime)s | %(levelname)s | %(message)s") + + script_dir = Path(__file__).resolve().parent + cfg_path = Path(args.config) + if not cfg_path.is_absolute(): + cfg_path = (Path.cwd() / cfg_path if (Path.cwd() / cfg_path).exists() + else script_dir / cfg_path) + + with cfg_path.open() as f: + config = json.load(f) + if args.aeo_year is not None: + config["aeo_year"] = args.aeo_year + + run(config, script_dir) + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as exc: + LOGGER.exception("Failed: %s", exc) + sys.exit(1) diff --git a/aeo_updates/natural_gas_price_regression/aeo_pipeline_config.json b/aeo_updates/natural_gas_price_regression/aeo_pipeline_config.json new file mode 100644 index 0000000..a1134bd --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/aeo_pipeline_config.json @@ -0,0 +1,71 @@ +{ + "aeo_year": 2025, + "start_year": 2010, + "end_year": 2050, + "paths": { + "output_dir": "outputs of alpha regression", + "input_dir": "inputs for alpha regression" + }, + "scenarios": { + "beta_regression": { + "start_year": 2024, + "end_year": 2050, + "include": [ + "lm2025", + "hm2025", + "highZTC", + "lowZTC", + "alttrnp", + "nocaa111", + "ref2025" + ], + "exclude_aliases": [ + "highogs", + "highprice", + "lowprice", + "lowogs" + ] + }, + "alpha_regression": { + "fetch": [ + "ref{aeo_year}", + "highogs", + "lowogs" + ], + "outputs": { + "reference": [ + "ref{aeo_year}" + ], + "HOG": [ + "highogs" + ], + "LOG": [ + "lowogs" + ] + } + } + }, + "api": { + 
"base_url": "https://api.eia.gov/v2", + "key_env_var": "EIA_API_KEY", + "key_fallback": "", + "verify_ssl": false, + "timeout_seconds": 60, + "max_retries": 4, + "backoff_seconds": 1 + }, + "ng": { + "regions": [ + "New England", + "Middle Atlantic", + "East North Central", + "West North Central", + "South Atlantic", + "East South Central", + "West South Central", + "Mountain", + "Pacific" + ], + "price_deflator_to_2004": 0.602782 + } +} diff --git a/aeo_updates/natural_gas_price_regression/inputs for alpha regression/README.md b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/README.md new file mode 100644 index 0000000..5a6f04a --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/README.md @@ -0,0 +1,16 @@ +# Inputs for Alpha Regression + +This directory contains input files used by the alpha regression step. + +## Historical CSVs (manual inputs) + +The following files are manually maintained and provide historical data to backfill years (2010 – most recent year) that are not covered by AEO projections: + +- `ng_AEO_historical.csv` — Historical NG prices +- `ng_demand_AEO_historical.csv` — Historical electric sector NG demand +- `ng_tot_demand_AEO_historical.csv` — Historical total sector NG demand +- `st_cendiv.csv` — State to Census Division mapping + +## Auto-generated files + +During the pipeline run, `sync_beta_to_alpha_inputs.py` copies beta regression results (`cd_beta0.csv`, `national_beta.csv`) into this directory. These are then read by `aeo_alpha_regression.py`. diff --git a/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_AEO_historical.csv b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_AEO_historical.csv new file mode 100644 index 0000000..21c2a70 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_AEO_historical.csv @@ -0,0 +1,16 @@ +year,East_North_Central,East_South_Central,Mid_Atlantic,Mountain,New_England,Pacific,South_Atlantic,West_North_Central,West_South_Central +2010,7.09099,6.86463,7.81636,7.21463,7.77043,6.92208,8.64559,7.73429,6.61385 +2011,6.41524,6.03580,7.18161,6.75904,6.97661,6.52309,7.61268,7.18783,6.02193 +2012,4.31840,4.17134,4.92095,4.81269,5.12569,4.98196,5.74414,4.83697,4.08624 +2013,5.64457,5.42566,6.22177,6.02671,8.06812,5.96659,6.48806,6.21740,5.31173 +2014,6.90157,6.19613,6.74979,6.53325,8.76920,6.65287,7.21768,7.39862,6.14191 +2015,3.72847,3.83186,3.85032,4.28349,5.66414,4.31823,5.20964,4.42822,3.70961 +2016,3.60433,3.34894,3.66107,3.54438,5.14830,3.66709,4.51428,3.50041,3.09747 +2017,4.25133,4.06024,4.15969,4.19525,5.35528,4.18916,4.87257,4.46950,3.81434 +2018,4.08306,4.02598,4.22836,4.31170,5.88125,4.31732,4.69319,4.33441,3.78456 +2019,3.07415,3.29214,3.56932,3.55569,5.47646,3.74884,4.09898,3.35261,2.85090 +2020,2.43360,2.77132,2.92226,3.11701,4.89456,3.30504,3.53561,2.85416,2.37295 +2021,5.21205,5.94087,5.32393,6.29594,7.94785,6.71271,6.63282,5.65821,5.44123 +2022,6.73681,7.91093,6.17517,7.81644,8.92891,8.04040,8.54369,7.62799,7.36052 +2023,2.57670,2.93427,3.09409,3.30029,5.18236,3.49938,3.74350,3.02199,2.51248 +2024,2.28508,2.64533,2.26953,2.84119,3.91474,3.17877,3.58168,2.60507,2.19948 diff --git a/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_demand_AEO_historical.csv b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_demand_AEO_historical.csv new file mode 100644 index 0000000..906ac82 --- /dev/null +++ 
b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_demand_AEO_historical.csv @@ -0,0 +1,16 @@ +year,East_North_Central,East_South_Central,Mid_Atlantic,Mountain,New_England,Pacific,South_Atlantic,West_North_Central,West_South_Central +2010,0.32600,0.56700,0.89000,0.64100,0.42000,0.98900,1.54400,0.12400,2.04900 +2011,0.39500,0.63900,0.96500,0.56700,0.45600,0.76600,1.67100,0.11700,2.18800 +2012,0.65600,0.79700,1.15400,0.66300,0.44800,1.04500,2.04300,0.17100,2.33700 +2013,0.47500,0.62900,1.06600,0.65100,0.37200,1.06900,1.88300,0.14000,2.07500 +2014,0.48286,0.67191,1.13148,0.64891,0.34175,1.07373,1.89625,0.10807,2.02567 +2015,0.71331,0.87542,1.23645,0.75092,0.39761,1.08443,2.33010,0.14716,2.43411 +2016,0.91214,0.96190,1.34450,0.76576,0.39318,0.91791,2.46983,0.19192,2.35184 +2017,0.81026,0.91090,1.19616,0.69033,0.37141,0.84090,2.47589,0.17219,2.06317 +2018,1.00257,0.99460,1.30006,0.76026,0.40041,0.87174,2.53553,0.18961,2.66131 +2019,1.18572,1.02982,1.45155,0.98821,0.33865,0.83400,2.79026,0.22831,2.92826 +2020,1.26278,1.04720,1.68422,0.95476,0.36085,0.84768,2.76804,0.24731,2.94039 +2021,1.08225,0.91391,1.65478,0.94725,0.34459,0.88703,2.46873,0.19506,2.73224 +2022,1.47769,0.90048,1.99829,0.94774,0.36677,0.85018,2.60534,0.25751,2.79701 +2023,1.33703,1.10878,1.78325,1.01090,0.38207,0.89752,2.93080,0.26185,3.11328 +2024,1.80377,1.12075,1.94787,1.20213,0.45162,1.02698,2.77112,0.32823,3.27809 diff --git a/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_tot_demand_AEO_historical.csv b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_tot_demand_AEO_historical.csv new file mode 100644 index 0000000..12fc1e0 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/ng_tot_demand_AEO_historical.csv @@ -0,0 +1,16 @@ +year,East_North_Central,East_South_Central,Mid_Atlantic,Mountain,New_England,Pacific,South_Atlantic,West_North_Central,West_South_Central +2010,3.35827,1.35412,2.64274,4.75630,0.85818,2.70713,2.86466,1.54981,5.37928 +2011,3.52540,1.41626,2.71985,4.79351,0.91535,2.54856,2.93492,1.55552,5.52935 +2012,3.54768,1.54045,2.77095,4.90369,0.87908,2.80361,3.28012,1.49108,5.73093 +2013,3.81810,1.45590,2.89180,4.97218,0.85950,2.89684,3.26105,1.64592,5.57736 +2014,4.04516,1.54466,3.12045,5.00056,0.85773,2.78484,3.32657,1.68047,5.64674 +2015,3.88672,1.69639,3.09914,5.25547,0.90592,2.76397,3.66395,1.57498,5.95167 +2016,3.92096,1.74669,3.15614,1.58165,0.85664,2.77540,3.73514,1.63864,5.80345 +2017,3.99711,1.76773,3.11356,1.57646,0.87954,2.82770,3.88878,1.72960,5.92221 +2018,4.39524,1.91248,3.31636,1.67603,0.93098,2.95235,4.06765,1.84624,6.69761 +2019,4.66544,1.96209,3.50768,1.98524,0.88497,2.94931,4.32576,1.97242,7.13452 +2020,4.57850,1.96789,3.61531,1.90072,0.84980,2.84772,4.22153,1.91562,7.06612 +2021,4.41846,1.84455,3.66561,1.92141,0.87847,2.94745,3.97168,1.89320,6.86597 +2022,4.94008,1.89274,4.04472,1.91471,0.92315,2.85715,4.15986,1.99868,7.15639 +2023,4.64873,1.86139,3.70918,1.73492,0.91437,2.86945,4.15805,1.87774,6.91027 +2024,4.96334,2.09747,3.90312,2.15953,0.96217,2.98253,4.37537,2.04819,7.42142 diff --git a/aeo_updates/natural_gas_price_regression/inputs for alpha regression/st_cendiv.csv b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/st_cendiv.csv new file mode 100644 index 0000000..ddd1f6c --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/inputs for alpha regression/st_cendiv.csv @@ -0,0 +1,49 @@ +st,cendiv +WA,Pacific +OR,Pacific +CA,Pacific +NV,Mountain +ID,Mountain 
+MT,Mountain +WY,Mountain +UT,Mountain +AZ,Mountain +NM,Mountain +CO,Mountain +ND,WestNorthCentral +SD,WestNorthCentral +NE,WestNorthCentral +MN,WestNorthCentral +IA,WestNorthCentral +WI,EastNorthCentral +TX,WestSouthCentral +OK,WestSouthCentral +KS,WestNorthCentral +MO,WestNorthCentral +AR,WestSouthCentral +LA,WestSouthCentral +MI,EastNorthCentral +IL,EastNorthCentral +MS,EastSouthCentral +AL,EastSouthCentral +FL,SouthAtlantic +TN,EastSouthCentral +KY,EastSouthCentral +GA,SouthAtlantic +SC,SouthAtlantic +NC,SouthAtlantic +VA,SouthAtlantic +IN,EastNorthCentral +OH,EastNorthCentral +PA,MiddleAtlantic +WV,SouthAtlantic +MD,SouthAtlantic +DE,SouthAtlantic +NJ,MiddleAtlantic +NY,MiddleAtlantic +VT,NewEngland +NH,NewEngland +MA,NewEngland +CT,NewEngland +RI,NewEngland +ME,NewEngland diff --git a/aeo_updates/natural_gas_price_regression/outputs of alpha regression/README.md b/aeo_updates/natural_gas_price_regression/outputs of alpha regression/README.md new file mode 100644 index 0000000..64cb5a1 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/outputs of alpha regression/README.md @@ -0,0 +1,13 @@ +# Outputs of Alpha Regression + +This directory contains the **final outputs** of the natural gas price regression pipeline. + +## Copying outputs to ReEDS + +Copy **all CSV files except `national_beta.csv`** to: + +``` +ReEDS/inputs/fuelprices/ +``` + +For `national_beta.csv`, copy its value into `ReEDS/inputs/scalars.csv` under the key **`nat_beta_nonenergy`**. diff --git a/aeo_updates/natural_gas_price_regression/outputs of beta regression/README.md b/aeo_updates/natural_gas_price_regression/outputs of beta regression/README.md new file mode 100644 index 0000000..191151b --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/outputs of beta regression/README.md @@ -0,0 +1,3 @@ +# Outputs of Beta Regression + +This directory stores intermediate files generated by the beta regression step of the pipeline. These files are consumed by downstream steps (sync and alpha regression) and are not final outputs. diff --git a/aeo_updates/natural_gas_price_regression/run_ng_pipeline.py b/aeo_updates/natural_gas_price_regression/run_ng_pipeline.py new file mode 100644 index 0000000..0e163d9 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/run_ng_pipeline.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Cross-platform runner for the natural gas price regression pipeline. + +Usage: + python run_ng_pipeline.py [--config CONFIG_FILE] + +Replaces run_ng_pipeline.bat for compatibility with macOS / Linux. 
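+
+Step order (see STEPS below):
+    1) aeo_beta_regression.py
+    2) sync_beta_to_alpha_inputs.py
+    3) aeo_alpha_regression.py
+    4) visualization.py
+Each step is run with the current Python interpreter and the same --config value;
+the pipeline stops at the first step that exits with a non-zero code.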
+""" + +import argparse +import subprocess +import sys +from pathlib import Path + + +STEPS = [ + ("1/4", "Running beta regression...", "aeo_beta_regression.py"), + ("2/4", "Syncing beta outputs to alpha inputs...", "sync_beta_to_alpha_inputs.py"), + ("3/4", "Running alpha regression...", "aeo_alpha_regression.py"), + ("4/4", "Generating visualization and validation...", "visualization.py"), +] + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run the NG price regression pipeline.") + parser.add_argument("--config", default="aeo_pipeline_config.json", + help="Path to the pipeline config JSON (default: aeo_pipeline_config.json)") + args = parser.parse_args() + + script_dir = Path(__file__).resolve().parent + + for tag, msg, script in STEPS: + print(f"[{tag}] {msg}") + result = subprocess.run( + [sys.executable, script, "--config", args.config], + cwd=script_dir, + ) + if result.returncode != 0: + print(f"\nNG pipeline failed at step {tag} with exit code {result.returncode}.") + return result.returncode + + print("\nNG pipeline finished successfully.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/aeo_updates/natural_gas_price_regression/sync_beta_to_alpha_inputs.py b/aeo_updates/natural_gas_price_regression/sync_beta_to_alpha_inputs.py new file mode 100644 index 0000000..300aed4 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/sync_beta_to_alpha_inputs.py @@ -0,0 +1,122 @@ +""" +Copy beta regression outputs into alpha regression inputs. + +This removes the manual handoff step between: +1) aeo_beta_regression.py +2) aeo_alpha_regression.py + +Copied files: +- outputs of beta regression/cd_beta0.csv -> inputs for alpha regression/cd_beta0.csv +- outputs of beta regression/national_beta.csv -> inputs for alpha regression/national_beta.csv +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from pathlib import Path +from typing import Any + + +def require(condition: bool, message: str) -> None: + """Raise ValueError when condition is false.""" + if not condition: + raise ValueError(message) + + +def resolve_case_insensitive(path: Path) -> Path: + """Resolve a path with case-insensitive matching on each component.""" + if path.exists(): + return path + path = path.resolve() + current = Path(path.anchor) + for part in path.parts[1:]: + if not current.exists(): + return path + try: + matches = [p for p in current.iterdir() if p.name.lower() == part.lower()] + except PermissionError: + return path + if not matches: + return path + current = matches[0] + return current + + +def resolve_path(base_dir: Path, configured_path: str) -> Path: + """Resolve a configured path relative to base_dir.""" + p = Path(configured_path) + if not p.is_absolute(): + p = base_dir / p + return resolve_case_insensitive(p) + + +def load_config(config_path: Path) -> dict[str, Any]: + """Load JSON config file.""" + require(config_path.exists(), f"Config not found: {config_path}") + with config_path.open("r", encoding="utf-8") as f: + cfg = json.load(f) + require(isinstance(cfg, dict), f"Config root must be an object: {config_path}") + return cfg + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Copy beta outputs into alpha input directory.") + parser.add_argument("--config", default="aeo_pipeline_config.json") + parser.add_argument("--beta-output-dir", default="outputs of beta regression") + parser.add_argument( + "--alpha-input-dir", + default=None, + help="Override 
alpha input directory (default: config paths.input_dir).", + ) + return parser.parse_args() + + +def copy_required_files(beta_out_dir: Path, alpha_input_dir: Path) -> None: + """Copy cd_beta0.csv and national_beta.csv from beta outputs to alpha inputs.""" + copies = [ + ("cd_beta0.csv", "cd_beta0.csv"), + ("national_beta.csv", "national_beta.csv"), + ] + alpha_input_dir.mkdir(parents=True, exist_ok=True) + + for src_name, dst_name in copies: + src = beta_out_dir / src_name + dst = alpha_input_dir / dst_name + require(src.exists(), f"Missing beta output file: {src}") + shutil.copy2(src, dst) + print(f"Copied: {src} -> {dst}") + + +def main() -> int: + args = parse_args() + + script_dir = Path(__file__).resolve().parent + cfg_path = Path(args.config) + if not cfg_path.is_absolute(): + cwd_candidate = resolve_case_insensitive((Path.cwd() / cfg_path).resolve()) + script_candidate = resolve_case_insensitive((script_dir / cfg_path).resolve()) + cfg_path = cwd_candidate if cwd_candidate.exists() else script_candidate + + config = load_config(cfg_path) + base_dir = cfg_path.parent + + input_dir_cfg = str(config.get("paths", {}).get("input_dir", "inputs for alpha regression")) + alpha_input_dir = resolve_path(base_dir, args.alpha_input_dir or input_dir_cfg) + beta_out_dir = resolve_path(base_dir, args.beta_output_dir) + require(beta_out_dir.exists(), f"Beta output directory not found: {beta_out_dir}") + + copy_required_files(beta_out_dir, alpha_input_dir) + print("Beta-to-alpha input sync completed.") + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as exc: + print(f"ERROR: {exc}", file=sys.stderr) + sys.exit(1) diff --git a/aeo_updates/natural_gas_price_regression/visualization.py b/aeo_updates/natural_gas_price_regression/visualization.py new file mode 100644 index 0000000..48bbaf5 --- /dev/null +++ b/aeo_updates/natural_gas_price_regression/visualization.py @@ -0,0 +1,1191 @@ +""" +Unified visualization and validation for the NG regression pipeline. 
+ +Consolidates three prior scripts into one entry point: + - Beta raw-data scatter grid (was beta_raw_data_visualization.py) + - Beta/alpha diagnostic plots (was results_visualization.py) + - Validation: actual vs predicted, parity, alpha comparison (was results_validation.py) + +Usage +----- + python visualization.py --config aeo_pipeline_config.json + +Run after: + 1) aeo_beta_regression.py + 2) sync_beta_to_alpha_inputs.py + 3) aeo_alpha_regression.py +""" + +from __future__ import annotations + +import argparse +import json +import logging +import re +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import pandas as pd + +try: + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + from matplotlib.lines import Line2D +except Exception as exc: # pragma: no cover + raise RuntimeError("matplotlib is required for visualization.py") from exc + +LOGGER = logging.getLogger("visualization") + +# ============================================================================ +# Constants +# ============================================================================ + +CENDIV_CANONICAL = { + "newengland": "NewEngland", + "middleatlantic": "MiddleAtlantic", + "eastnorthcentral": "EastNorthCentral", + "westnorthcentral": "WestNorthCentral", + "southatlantic": "SouthAtlantic", + "eastsouthcentral": "EastSouthCentral", + "westsouthcentral": "WestSouthCentral", + "mountain": "Mountain", + "pacific": "Pacific", +} + +CENDIV_OUTPUT = { + "NewEngland": "New_England", + "MiddleAtlantic": "Mid_Atlantic", + "EastNorthCentral": "East_North_Central", + "WestNorthCentral": "West_North_Central", + "SouthAtlantic": "South_Atlantic", + "EastSouthCentral": "East_South_Central", + "WestSouthCentral": "West_South_Central", + "Mountain": "Mountain", + "Pacific": "Pacific", +} + +SCENARIO_COLOR = { + "reference": "#1f77b4", + "HOG": "#2ca02c", + "LOG": "#d62728", +} + + +# ============================================================================ +# Shared helpers +# ============================================================================ + +def require(condition: bool, message: str) -> None: + if not condition: + raise ValueError(message) + + +def normalize_token(value: Any) -> str: + return re.sub( + r"[^a-z0-9]+", "", + str(value).replace("\xa0", " ").strip().lower(), + ) + + +def any_to_cendiv(value: str) -> str: + token = normalize_token(value) + if token in CENDIV_CANONICAL: + return CENDIV_CANONICAL[token] + for cendiv, out_label in CENDIV_OUTPUT.items(): + if token in {normalize_token(cendiv), normalize_token(out_label)}: + return cendiv + raise ValueError(f"Unknown region label: {value}") + + +def cendiv_output_label(cendiv: str) -> str: + require(cendiv in CENDIV_OUTPUT, f"Unknown cendiv key: {cendiv}") + return CENDIV_OUTPUT[cendiv] + + +def resolve_case_insensitive(path: Path) -> Path: + if path.exists(): + return path + path = path.resolve() + current = Path(path.anchor) + for part in path.parts[1:]: + if not current.exists(): + return path + try: + matches = [p for p in current.iterdir() if p.name.lower() == part.lower()] + except PermissionError: + return path + if not matches: + return path + current = matches[0] + return current + + +def resolve_path(base_dir: Path, configured_path: str) -> Path: + p = Path(configured_path) + if not p.is_absolute(): + p = base_dir / p + return resolve_case_insensitive(p) + + +def load_config(config_path: Path) -> dict[str, Any]: + require(config_path.exists(), f"Config not found: {config_path}") + with 
config_path.open("r", encoding="utf-8") as f: + cfg = json.load(f) + require(isinstance(cfg, dict), f"Config root must be an object: {config_path}") + return cfg + + +def region_order_from_config(config: dict[str, Any]) -> list[str]: + configured = config["ng"]["regions"] + return [cendiv_output_label(any_to_cendiv(x)) for x in configured] + + +def zero_ng_betas_in_first_model_year( + frame: pd.DataFrame, + first_model_year: int, + beta_reg_col: str = "beta_reg", + beta_nat_col: str = "beta_nat", +) -> None: + first_year_mask = frame["year"] == first_model_year + frame.loc[first_year_mask, beta_reg_col] = 0.0 + frame.loc[first_year_mask, beta_nat_col] = 0.0 + + +def zero_ng_terms_in_first_model_year( + frame: pd.DataFrame, + first_model_year: int, + regional_term_col: str = "term_reg", + national_term_col: str = "term_nat", +) -> None: + first_year_mask = frame["year"] == first_model_year + frame.loc[first_year_mask, regional_term_col] = 0.0 + frame.loc[first_year_mask, national_term_col] = 0.0 + + +def sanitize_file_component(value: str) -> str: + return re.sub(r"[^A-Za-z0-9_.-]+", "_", value) + + +def compute_r2(actual: pd.Series | np.ndarray, predicted: pd.Series | np.ndarray) -> float: + a = np.asarray(actual, dtype=float) + p = np.asarray(predicted, dtype=float) + mask = np.isfinite(a) & np.isfinite(p) + if not np.any(mask): + return float("nan") + a, p = a[mask], p[mask] + ss_res = float(np.sum(np.square(a - p))) + ss_tot = float(np.sum(np.square(a - np.mean(a)))) + if ss_tot <= 0: + return float("nan") + return 1.0 - (ss_res / ss_tot) + + +def compute_fit_metrics(actual: pd.Series | np.ndarray, predicted: pd.Series | np.ndarray) -> dict[str, float]: + a = np.asarray(actual, dtype=float) + p = np.asarray(predicted, dtype=float) + mask = np.isfinite(a) & np.isfinite(p) + if not np.any(mask): + return {"n_obs": 0.0, "mae": float("nan"), "rmse": float("nan"), + "max_abs": float("nan"), "r2": float("nan")} + a, p = a[mask], p[mask] + err = a - p + return { + "n_obs": float(len(a)), + "mae": float(np.mean(np.abs(err))), + "rmse": float(np.sqrt(np.mean(np.square(err)))), + "max_abs": float(np.max(np.abs(err))), + "r2": compute_r2(a, p), + } + + +def summarize_fit( + frame: pd.DataFrame, + group_cols: list[str], + actual_col: str, + predicted_col: str, + mae_col: str, + rmse_col: str, + max_abs_col: str, + r2_col: str, +) -> pd.DataFrame: + rows: list[dict[str, Any]] = [] + for keys, grp in frame.groupby(group_cols, sort=True): + if not isinstance(keys, tuple): + keys = (keys,) + row = {group_cols[i]: keys[i] for i in range(len(group_cols))} + m = compute_fit_metrics(grp[actual_col], grp[predicted_col]) + row["n_obs"] = int(m["n_obs"]) + row[mae_col] = m["mae"] + row[rmse_col] = m["rmse"] + row[max_abs_col] = m["max_abs"] + row[r2_col] = m["r2"] + rows.append(row) + out = pd.DataFrame(rows) + if out.empty: + return out + return out.sort_values(group_cols).reset_index(drop=True) + + +# ============================================================================ +# Beta / alpha CSV loaders +# ============================================================================ + +def load_regional_beta_from_csv(source_path: Path) -> dict[str, float]: + require(source_path.exists(), f"Missing regional beta file: {source_path}") + df = pd.read_csv(source_path) + cendiv_col = next( + (c for c in df.columns if normalize_token(c).endswith("cendiv")), None) + value_col = next( + (c for c in df.columns if normalize_token(c) == "value"), None) + require(cendiv_col is not None and value_col is not None, + 
f"Missing cendiv/value columns in {source_path}") + beta_map: dict[str, float] = {} + for raw_label, raw_beta in zip(df[cendiv_col], df[value_col]): + beta_val = pd.to_numeric([raw_beta], errors="coerce")[0] + if pd.isna(beta_val): + continue + region_label = cendiv_output_label(any_to_cendiv(str(raw_label))) + beta_map[region_label] = float(beta_val) + return beta_map + + +def load_national_beta_from_csv(source_path: Path) -> float: + require(source_path.exists(), f"Missing national beta file: {source_path}") + df = pd.read_csv(source_path) + require("beta" in df.columns, f"Missing 'beta' column in {source_path}") + beta_vals = pd.to_numeric(df["beta"], errors="coerce").dropna() + require(not beta_vals.empty, f"No numeric beta value found in {source_path}") + return float(beta_vals.iloc[0]) + + +def load_regional_beta_map(alpha_out_dir: Path, alpha_input_dir: Path) -> tuple[dict[str, float], Path]: + candidates = [alpha_out_dir / "cd_beta0.csv", alpha_input_dir / "cd_beta0.csv"] + source_path = next((p for p in candidates if p.exists()), None) + require(source_path is not None, f"Could not find cd_beta0.csv in: {candidates}") + return load_regional_beta_from_csv(source_path), source_path + + +def load_national_beta(alpha_out_dir: Path, alpha_input_dir: Path, beta_out_dir: Path) -> tuple[float, Path]: + candidates = [ + alpha_out_dir / "national_beta.csv", + alpha_input_dir / "national_beta.csv", + beta_out_dir / "national_beta.csv", + ] + source_path = next((p for p in candidates if p.exists()), None) + require(source_path is not None, f"Could not find national_beta.csv in: {candidates}") + return load_national_beta_from_csv(source_path), source_path + + +def load_alpha_from_beta_step(beta_out_dir: Path) -> tuple[pd.DataFrame, Path, list[str]]: + source_path = beta_out_dir / "alpha_from_beta_regression.csv" + require(source_path.exists(), + "Missing alpha_from_beta_regression.csv. 
Re-run aeo_beta_regression.py first.") + df = pd.read_csv(source_path) + if "scenario_id" in df.columns: + need = {"scenario_id", "region", "year", "alpha_2004"} + require(not (need - set(df.columns)), + f"Missing columns in {source_path}: {sorted(need - set(df.columns))}") + out = df[["scenario_id", "region", "year", "alpha_2004"]].copy() + out["scenario_id"] = out["scenario_id"].astype(str) + merge_cols = ["scenario_id", "region", "year"] + else: + need = {"region", "year", "alpha_2004"} + require(not (need - set(df.columns)), + f"Missing columns in {source_path}: {sorted(need - set(df.columns))}") + out = df[["region", "year", "alpha_2004"]].copy() + merge_cols = ["region", "year"] + + out["region"] = out["region"].astype(str).map( + lambda x: cendiv_output_label(any_to_cendiv(x))) + out["year"] = pd.to_numeric(out["year"], errors="coerce") + out["alpha_2004"] = pd.to_numeric(out["alpha_2004"], errors="coerce") + require(not out["year"].isna().any(), f"Non-numeric year in {source_path}") + require(not out["alpha_2004"].isna().any(), f"Non-numeric alpha_2004 in {source_path}") + out["year"] = out["year"].astype(int) + out = out.rename(columns={"alpha_2004": "alpha1"}) + return out[merge_cols + ["alpha1"]], source_path, merge_cols + + +# ============================================================================ +# Scenario discovery helpers +# ============================================================================ + +def scenario_display_label(suffix: str, scenario_id: str) -> str: + t = normalize_token(suffix) + if t.startswith("ref"): + return "reference" + if t in {"hog", "highogs"}: + return "HOG" + if t in {"log", "lowogs"}: + return "LOG" + return scenario_id + + +def discover_output_scenarios_with_labels( + alpha_out_dir: Path, aeo_year: int, +) -> list[tuple[str, str, str]]: + """Returns list of (suffix, scenario_id, display_label).""" + mapping_path = alpha_out_dir / "raw_aeo_data" / "selected_scenarios_outputs.csv" + suffix_to_scenario: dict[str, str] = {} + if mapping_path.exists(): + map_df = pd.read_csv(mapping_path) + if {"file_suffix", "scenario_id"}.issubset(map_df.columns): + for row in map_df.itertuples(index=False): + suffix_to_scenario[str(row.file_suffix)] = str(row.scenario_id) + + pattern = re.compile(rf"^alpha_AEO_{aeo_year}_(.+)\.csv$") + rows: list[tuple[str, str, str]] = [] + for path in sorted(alpha_out_dir.glob(f"alpha_AEO_{aeo_year}_*.csv")): + match = pattern.match(path.name) + if not match: + continue + suffix = match.group(1) + scenario_id = suffix_to_scenario.get(suffix, suffix) + label = scenario_display_label(suffix, scenario_id) + rows.append((suffix, scenario_id, label)) + require(rows, f"No alpha scenario files found in {alpha_out_dir}") + order_index = {"reference": 0, "HOG": 1, "LOG": 2} + rows.sort(key=lambda x: (order_index.get(x[2], 99), x[2], x[1])) + return rows + + +def discover_output_scenarios(alpha_out_dir: Path, aeo_year: int) -> list[tuple[str, str]]: + """Returns list of (suffix, scenario_id).""" + return [(s, sid) for s, sid, _ in discover_output_scenarios_with_labels(alpha_out_dir, aeo_year)] + + +# ============================================================================ +# Wide-series loader +# ============================================================================ + +def load_wide_series( + csv_path: Path, + value_col: str, + scenario_id: str, + region_order: list[str], + scenario_label: str | None = None, +) -> pd.DataFrame: + require(csv_path.exists(), f"Missing file: {csv_path}") + wide = pd.read_csv(csv_path) + 
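+    # Expected wide layout: a "t" or "year" column plus one column per
+    # census-division label from region_order (e.g. New_England, Mid_Atlantic);
+    # the melt below reshapes it into long (scenario_id, region, year, value) rows.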
year_col = "t" if "t" in wide.columns else ("year" if "year" in wide.columns else None) + require(year_col is not None, f"Missing year column (t/year): {csv_path}") + missing_regions = [c for c in region_order if c not in wide.columns] + require(not missing_regions, f"Missing region columns in {csv_path}: {missing_regions}") + long = wide[[year_col, *region_order]].melt( + id_vars=[year_col], var_name="region", value_name=value_col) + long["year"] = pd.to_numeric(long[year_col], errors="coerce") + long[value_col] = pd.to_numeric(long[value_col], errors="coerce") + require(not long["year"].isna().any(), f"Non-numeric year in {csv_path}") + require(not long[value_col].isna().any(), f"Non-numeric values in {csv_path}") + long["year"] = long["year"].astype(int) + long["scenario_id"] = scenario_id + cols = ["scenario_id", "region", "year", value_col] + if scenario_label is not None: + long["scenario_label"] = scenario_label + cols = ["scenario_id", "scenario_label", "region", "year", value_col] + return long[cols] + + +# ============================================================================ +# Part 1: Beta raw-data scatter grid +# ============================================================================ + +_RAW_KEY_COLS = ["period", "scenario", "regionId"] +_RAW_READ_COLS = ["period", "scenario", "scenarioDescription", "regionId", + "regionName", "value"] + + +def _coerce_numeric(df: pd.DataFrame, col: str) -> pd.DataFrame: + df[col] = pd.to_numeric(df[col], errors="coerce") + return df.dropna(subset=[col]) + + +def _load_beta_include_scenarios(config: dict[str, Any]) -> list[str]: + aeo_year = config.get("aeo_year") + include = (config.get("scenarios", {}) + .get("beta_regression", {}) + .get("include", [])) + if not isinstance(include, list): + raise ValueError("Expected list at scenarios.beta_regression.include") + resolved: list[str] = [] + for item in include: + token = str(item) + if aeo_year is not None: + token = token.replace("{aeo_year}", str(aeo_year)) + resolved.append(token) + return resolved + + +def _filter_to_beta_include(df: pd.DataFrame, include_tokens: list[str]) -> pd.DataFrame: + if not include_tokens: + return df + include_norm = {normalize_token(t) for t in include_tokens} + scenario_norm = df["scenario"].map(normalize_token) + desc_norm = df["scenarioDescription"].fillna("").map(normalize_token) + keep_mask = scenario_norm.isin(include_norm) | desc_norm.isin(include_norm) + filtered = df[keep_mask].copy() + if filtered.empty: + available = sorted(df["scenario"].unique()) + raise ValueError( + f"No rows matched beta_regression.include. " + f"Configured: {include_tokens}. 
Available: {available}") + return filtered + + +def _load_and_merge_raw(demand_csv: Path, price_csv: Path) -> pd.DataFrame: + demand = pd.read_csv(demand_csv, usecols=_RAW_READ_COLS).rename( + columns={"value": "demand"}) + price = pd.read_csv(price_csv, usecols=_RAW_READ_COLS).rename( + columns={"value": "price"}) + demand = _coerce_numeric(demand, "demand") + price = _coerce_numeric(price, "price") + for df in [demand, price]: + df["period"] = pd.to_numeric(df["period"], errors="coerce") + df.dropna(subset=["period"], inplace=True) + df["period"] = df["period"].astype(int) + demand = demand.groupby(_RAW_KEY_COLS, as_index=False).agg( + {"scenarioDescription": "first", "regionName": "first", "demand": "mean"}) + price = price.groupby(_RAW_KEY_COLS, as_index=False).agg({"price": "mean"}) + merged = demand.merge(price, on=_RAW_KEY_COLS, how="inner", validate="one_to_one") + return merged.sort_values(["period", "scenario", "regionId"]).reset_index(drop=True) + + +def _fit_line(panel_df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, float] | None: + x = panel_df["demand"].to_numpy() + y = panel_df["price"].to_numpy() + if x.size < 2 or np.unique(x).size < 2: + return None + slope, intercept = np.polyfit(x, y, deg=1) + x_line = np.linspace(x.min(), x.max(), 50) + y_line = slope * x_line + intercept + return x_line, y_line, float(slope) + + +def generate_raw_scatter_grid( + beta_out_dir: Path, + config: dict[str, Any], + output_dir: Path, + dpi: int = 150, +) -> None: + """Generate beta raw-data scatter grid (demand vs price by year/region).""" + raw_dir = beta_out_dir / "raw_aeo_data" + demand_csv = raw_dir / "raw_ng_demand_elec.csv" + price_csv = raw_dir / "raw_ng_price.csv" + if not demand_csv.exists() or not price_csv.exists(): + LOGGER.warning("Skipping raw scatter grid: raw CSVs not found in %s", raw_dir) + return + + merged = _load_and_merge_raw(demand_csv, price_csv) + include_scenarios = _load_beta_include_scenarios(config) + merged = _filter_to_beta_include(merged, include_scenarios) + + years = sorted(merged["period"].unique()) + scenarios = sorted(merged["scenario"].unique()) + region_table = (merged[["regionId", "regionName"]].drop_duplicates() + .sort_values(["regionId", "regionName"]).reset_index(drop=True)) + regions = region_table.to_dict("records") + scenario_colors = {s: plt.cm.tab10(i % 10) for i, s in enumerate(scenarios)} + + n_rows, n_cols = len(years), len(regions) + fig_w = max(15, n_cols * 2.6) + fig_h = max(12, n_rows * 1.8) + fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, + figsize=(fig_w, fig_h), sharex=True, sharey=True, squeeze=False) + + x_min, x_max = merged["demand"].min(), merged["demand"].max() + y_min, y_max = merged["price"].min(), merged["price"].max() + x_pad = 0.05 * (x_max - x_min) if x_max > x_min else 0.1 + y_pad = 0.05 * (y_max - y_min) if y_max > y_min else 0.1 + + for row_idx, year in enumerate(years): + for col_idx, region in enumerate(regions): + ax = axes[row_idx, col_idx] + panel = merged[(merged["period"] == year) & (merged["regionId"] == region["regionId"])] + ax.scatter(panel["demand"], panel["price"], s=20, alpha=0.95, + c=panel["scenario"].map(scenario_colors), edgecolors="none") + fit = _fit_line(panel) + if fit is not None: + x_line, y_line, slope = fit + is_negative = slope < 0 + line_color = "#c62828" if is_negative else "#4d4d4d" + ax.plot(x_line, y_line, color=line_color, linewidth=1.1) + slope_text = f"slope={slope:.3f}" + if is_negative: + slope_text += " (NEG)" + ax.text(0.03, 0.97, slope_text, transform=ax.transAxes, + 
ha="left", va="top", fontsize=6, + color="#8b0000" if is_negative else "#303030", + bbox={"boxstyle": "round,pad=0.2", + "facecolor": "#ffe6e6" if is_negative else "white", + "alpha": 0.85, + "edgecolor": "#c62828" if is_negative else "none", + "linewidth": 0.6 if is_negative else 0.0}) + if is_negative: + ax.set_facecolor("#fff5f5") + for spine in ax.spines.values(): + spine.set_edgecolor("#c62828") + spine.set_linewidth(0.9) + ax.set_xlim(x_min - x_pad, x_max + x_pad) + ax.set_ylim(y_min - y_pad, y_max + y_pad) + ax.grid(True, alpha=0.25, linewidth=0.5) + ax.tick_params(axis="both", labelsize=6, length=2) + if row_idx == 0: + ax.set_title(str(region["regionName"]), fontsize=8, pad=3) + if col_idx == 0: + ax.set_ylabel(str(year), fontsize=8, rotation=0, labelpad=18, va="center") + + legend_handles = [ + Line2D([0], [0], marker="o", linestyle="", markersize=5, + markerfacecolor=scenario_colors[s], markeredgecolor="none", label=s) + for s in scenarios + ] + fig.supxlabel("Natural Gas Demand (quads)", fontsize=11) + fig.supylabel("Natural Gas Price (2024 $/MMBtu)", fontsize=11) + fig.suptitle( + "Raw AEO NG Data: Demand vs Price\n" + "(Scatter + Linear Regression by Year and Region; negative slopes highlighted)", + fontsize=13, y=0.995) + fig.legend(handles=legend_handles, loc="upper center", + bbox_to_anchor=(0.5, 0.978), ncol=min(len(scenarios), 5), + fontsize=8, frameon=False, title="Scenario", title_fontsize=8) + fig.tight_layout(rect=[0.04, 0.02, 1, 0.95]) + + output_dir.mkdir(parents=True, exist_ok=True) + fig.savefig(output_dir / "beta_raw_data_scatter_linear_grid.png", dpi=dpi) + plt.close(fig) + LOGGER.info("Wrote raw scatter grid to %s", output_dir) + + +# ============================================================================ +# Part 2: Beta / alpha diagnostic plots +# ============================================================================ + +def generate_beta_plots(beta_out_dir: Path, region_order: list[str], plots_dir: Path) -> None: + plots_dir.mkdir(parents=True, exist_ok=True) + summary_path = beta_out_dir / "beta_regression_summary.csv" + points_path = beta_out_dir / "regression_points.csv" + require(summary_path.exists(), f"Missing file: {summary_path}") + require(points_path.exists(), f"Missing file: {points_path}") + + summary = pd.read_csv(summary_path) + points = pd.read_csv(points_path) + nat_y_col = "dp_partial_nat" + reg_x_col = "dq_reg" + for col in ["scope", "region", "beta"]: + require(col in summary.columns, f"Missing '{col}' in {summary_path}") + for col in ["region", "dq_nat", nat_y_col, reg_x_col, "dp_partial_reg"]: + require(col in points.columns, f"Missing '{col}' in {points_path}") + + national_rows = summary[summary["scope"].astype(str).str.lower() == "national"] + require(not national_rows.empty, f"No national row found in {summary_path}") + nrow = national_rows.iloc[0] + beta_nat = float(pd.to_numeric([nrow["beta"]], errors="coerce")[0]) + beta_nat_r2 = (float(pd.to_numeric([nrow["r2"]], errors="coerce")[0]) + if "r2" in summary.columns else float("nan")) + model_r2 = (float(pd.to_numeric([nrow["r2_full"]], errors="coerce")[0]) + if "r2_full" in summary.columns else float("nan")) + + # National beta plot + nat_points = points[["dq_nat", nat_y_col]].dropna() + require(not nat_points.empty, + f"No valid national plotting rows in {points_path}") + fig, ax = plt.subplots(figsize=(8, 6)) + ax.scatter(nat_points["dq_nat"], nat_points[nat_y_col], s=12, alpha=0.5, color="#1f77b4") + x_min, x_max = float(nat_points["dq_nat"].min()), 
float(nat_points["dq_nat"].max()) + x_range = np.linspace(x_min, x_max, 200) + ax.plot(x_range, beta_nat * x_range, color="#d62728", linewidth=2) + ax.axhline(0, color="#888888", linewidth=0.8) + ax.axvline(0, color="#888888", linewidth=0.8) + ax.set_xlabel("Demeaned national demand (partial)") + ax.set_ylabel("Price residual net of regional term (partial)") + ax.set_title(f"National beta | beta={beta_nat:.6f}, partial R2={beta_nat_r2:.3f}, model R2={model_r2:.3f}") + fig.tight_layout() + fig.savefig(plots_dir / "national_beta_regression.png", dpi=220) + plt.close(fig) + + # Regional beta plot + regional = summary[summary["scope"].astype(str).str.lower() == "regional"].copy() + regional["region"] = regional["region"].astype(str) + beta_map: dict[str, float] = {} + r2_map: dict[str, float] = {} + for row in regional.itertuples(index=False): + beta_val = pd.to_numeric([getattr(row, "beta")], errors="coerce")[0] + r2_val = (pd.to_numeric([getattr(row, "r2")], errors="coerce")[0] + if "r2" in regional.columns else float("nan")) + if pd.notna(beta_val): + beta_map[str(getattr(row, "region"))] = float(beta_val) + if pd.notna(r2_val): + r2_map[str(getattr(row, "region"))] = float(r2_val) + + regions = [r for r in region_order if r in beta_map] + if not regions: + regions = sorted(points["region"].dropna().astype(str).unique().tolist()) + + ncols = 3 + nrows = int(np.ceil(len(regions) / ncols)) + fig, axes = plt.subplots(nrows, ncols, figsize=(5.2 * ncols, 4.2 * nrows)) + axes = np.atleast_1d(axes).ravel() + for i, region in enumerate(regions): + ax = axes[i] + reg_r = points[points["region"] == region][[reg_x_col, "dp_partial_reg"]].dropna() + beta = beta_map.get(region, float("nan")) + r2 = r2_map.get(region, float("nan")) + ax.scatter(reg_r[reg_x_col], reg_r["dp_partial_reg"], s=10, alpha=0.65, color="#1f77b4") + if not reg_r.empty: + x_line = np.linspace(float(reg_r[reg_x_col].min()), float(reg_r[reg_x_col].max()), 120) + ax.plot(x_line, beta * x_line, color="#d62728", linewidth=1.6) + ax.axhline(0, color="#999999", linewidth=0.6) + ax.axvline(0, color="#999999", linewidth=0.6) + ax.set_title(f"{region}\nbeta={beta:.4f}, partial R2={r2:.3f}", fontsize=10) + ax.set_xlabel(f"{reg_x_col} (partial)") + ax.set_ylabel("dp_reg (partial)") + for i in range(len(regions), len(axes)): + axes[i].axis("off") + fig.suptitle("Regional betas (joint fixed-effects regression)", fontsize=14) + fig.tight_layout(rect=[0, 0, 1, 0.97]) + fig.savefig(plots_dir / "regional_beta_regression.png", dpi=220) + plt.close(fig) + LOGGER.info("Wrote beta plots to %s", plots_dir) + + +def generate_alpha_plots( + alpha_out_dir: Path, + alpha_input_dir: Path, + beta_out_dir: Path, + plots_dir: Path, + region_order: list[str], + aeo_year: int, + deflator_to_2004: float, + first_model_year: int, +) -> None: + plots_dir.mkdir(parents=True, exist_ok=True) + beta_regional, _ = load_regional_beta_map(alpha_out_dir, alpha_input_dir) + beta_national, _ = load_national_beta(alpha_out_dir, alpha_input_dir, beta_out_dir) + missing_regions = [r for r in region_order if r not in beta_regional] + require(not missing_regions, f"Missing regional beta values: {missing_regions}") + + scenarios = discover_output_scenarios(alpha_out_dir, aeo_year) + + for suffix, scenario_id in scenarios: + alpha_path = alpha_out_dir / f"alpha_AEO_{aeo_year}_{suffix}.csv" + price_path = alpha_out_dir / f"ng_AEO_{aeo_year}_{suffix}.csv" + demand_path = alpha_out_dir / f"ng_demand_AEO_{aeo_year}_{suffix}.csv" + if not (alpha_path.exists() and price_path.exists() and 
demand_path.exists()): + LOGGER.warning("Skipping suffix '%s' due to missing files.", suffix) + continue + + alpha_df = load_wide_series(alpha_path, "alpha_2004", scenario_id, region_order) + price_df = load_wide_series(price_path, "ng_price", scenario_id, region_order) + demand_df = load_wide_series(demand_path, "demand_elec_quads", scenario_id, region_order) + + merged = alpha_df.merge(price_df, on=["scenario_id", "region", "year"], how="inner") + merged = merged.merge(demand_df, on=["scenario_id", "region", "year"], how="inner") + require(not merged.empty, f"No merged data for scenario suffix '{suffix}'") + + q_nat = (demand_df.groupby(["scenario_id", "year"], as_index=False)["demand_elec_quads"] + .sum().rename(columns={"demand_elec_quads": "q_nat"})) + merged = merged.merge(q_nat, on=["scenario_id", "year"], how="left") + merged["price_2004"] = merged["ng_price"] * deflator_to_2004 + merged["beta_reg"] = merged["region"].map(beta_regional) + merged["term_reg"] = merged["beta_reg"] * merged["demand_elec_quads"] + merged["term_nat"] = beta_national * merged["q_nat"] + zero_ng_terms_in_first_model_year(merged, first_model_year) + + ncols = 3 + nrows = int(np.ceil(len(region_order) / ncols)) + fig, axes = plt.subplots(nrows, ncols, figsize=(5.5 * ncols, 4.0 * nrows), sharex=True) + axes = np.atleast_1d(axes).ravel() + for i, region in enumerate(region_order): + ax = axes[i] + reg = merged[merged["region"] == region].sort_values("year") + if reg.empty: + continue + years = reg["year"].to_numpy() + ax.fill_between(years, 0, reg["alpha_2004"].to_numpy(), + alpha=0.5, color="#2ca02c", label="Alpha") + ax.fill_between(years, reg["alpha_2004"].to_numpy(), + reg["alpha_2004"].to_numpy() + reg["term_reg"].to_numpy(), + alpha=0.5, color="#1f77b4", label="Beta_reg * Q_reg") + ax.fill_between(years, + reg["alpha_2004"].to_numpy() + reg["term_reg"].to_numpy(), + reg["alpha_2004"].to_numpy() + reg["term_reg"].to_numpy() + reg["term_nat"].to_numpy(), + alpha=0.5, color="#ff7f0e", label="Beta_nat * Q_nat") + ax.plot(years, reg["price_2004"].to_numpy(), color="#d62728", + linewidth=1.5, linestyle="--", label="Actual price") + ax.set_title(region, fontsize=12, pad=6) + ax.set_ylabel("2004$/MMBtu", fontsize=11) + ax.grid(True, alpha=0.3) + for i in range(len(region_order), len(axes)): + axes[i].axis("off") + handles, labels = axes[0].get_legend_handles_labels() + fig.legend(handles, labels, loc="lower center", ncol=4, fontsize=10, + bbox_to_anchor=(0.5, -0.02)) + fig.suptitle(f"Price decomposition: {scenario_id}\n" + "Price = Alpha + Beta_reg*Q_reg + Beta_nat*Q_nat", fontsize=15) + fig.tight_layout(rect=[0, 0.04, 1, 0.95]) + safe_id = sanitize_file_component(scenario_id) + fig.savefig(plots_dir / f"alpha_price_decomposition_{safe_id}.png", + dpi=220, bbox_inches="tight") + plt.close(fig) + LOGGER.info("Wrote alpha plot for %s", scenario_id) + + +# ============================================================================ +# Part 3: Validation +# ============================================================================ + +def build_validation_frame( + alpha_out_dir: Path, + aeo_year: int, + region_order: list[str], +) -> pd.DataFrame: + LOGGER.info("Building validation frame from alpha outputs in %s", alpha_out_dir) + frames: list[pd.DataFrame] = [] + for suffix, scenario_id, scenario_label in discover_output_scenarios_with_labels(alpha_out_dir, aeo_year): + alpha_path = alpha_out_dir / f"alpha_AEO_{aeo_year}_{suffix}.csv" + price_path = alpha_out_dir / f"ng_AEO_{aeo_year}_{suffix}.csv" + demand_path = 
alpha_out_dir / f"ng_demand_AEO_{aeo_year}_{suffix}.csv" + if not (alpha_path.exists() and price_path.exists() and demand_path.exists()): + LOGGER.warning("Skipping suffix '%s' due to missing files.", suffix) + continue + alpha_df = load_wide_series(alpha_path, "alpha_2004", scenario_id, region_order, scenario_label) + price_df = load_wide_series(price_path, "ng_price", scenario_id, region_order, scenario_label) + demand_df = load_wide_series(demand_path, "demand_elec_quads", scenario_id, region_order, scenario_label) + merged = alpha_df.merge(price_df, on=["scenario_id", "scenario_label", "region", "year"], how="inner") + merged = merged.merge(demand_df, on=["scenario_id", "scenario_label", "region", "year"], how="inner") + require(not merged.empty, f"No merged rows for scenario '{scenario_id}' ({suffix})") + frames.append(merged) + require(frames, "No scenario frames available for validation.") + out = pd.concat(frames, ignore_index=True) + q_nat = (out.groupby(["scenario_id", "scenario_label", "year"], as_index=False)["demand_elec_quads"] + .sum().rename(columns={"demand_elec_quads": "q_nat"})) + return out.merge(q_nat, on=["scenario_id", "scenario_label", "year"], how="left") + + +def load_beta_regression_points(beta_out_dir: Path) -> tuple[pd.DataFrame, Path]: + path = beta_out_dir / "regression_points.csv" + require(path.exists(), f"Missing regression_points.csv: {path}") + df = pd.read_csv(path) + need = {"scenario_id", "year", "region", "demand", "demand_nat", "price_2004", "dp", "dp_hat"} + require(not (need - set(df.columns)), f"Missing columns in {path}: {sorted(need - set(df.columns))}") + out = df[list(need)].copy() + out["scenario_id"] = out["scenario_id"].astype(str) + out["region"] = out["region"].astype(str).map(lambda x: cendiv_output_label(any_to_cendiv(x))) + for c in ["year", "demand", "demand_nat", "price_2004", "dp", "dp_hat"]: + out[c] = pd.to_numeric(out[c], errors="coerce") + require(not out[["year", "demand", "demand_nat", "price_2004", "dp", "dp_hat"]].isna().any().any(), + f"Non-numeric value(s) in {path}") + out["year"] = out["year"].astype(int) + return out, path + + +def build_step1_beta_validation_frame( + beta_out_dir: Path, + beta_reg_step_map: dict[str, float], + beta_nat_step: float, + alpha1_frame: pd.DataFrame, + alpha1_merge_cols: list[str], + first_model_year: int, +) -> tuple[pd.DataFrame, Path]: + points, points_path = load_beta_regression_points(beta_out_dir) + frame = points.rename(columns={ + "demand": "demand_elec_quads", "demand_nat": "q_nat", + "price_2004": "actual_2004", "dp": "actual_dprice", "dp_hat": "predicted_dprice", + }).copy() + frame["scenario_label"] = frame["scenario_id"] + frame["beta_reg"] = frame["region"].map(beta_reg_step_map) + require(not frame["beta_reg"].isna().any(), + "Missing regional beta mapping in step-1 validation.") + frame["beta_nat"] = beta_nat_step + frame = frame.merge(alpha1_frame, on=alpha1_merge_cols, how="left") + require(not frame["alpha1"].isna().any(), + "Missing alpha1 rows in step-1 validation.") + zero_ng_betas_in_first_model_year(frame, first_model_year) + frame["predicted_2004"] = (frame["alpha1"] + + frame["beta_reg"] * frame["demand_elec_quads"] + + frame["beta_nat"] * frame["q_nat"]) + frame["error"] = frame["actual_2004"] - frame["predicted_2004"] + frame["error_dprice"] = frame["actual_dprice"] - frame["predicted_dprice"] + return frame, points_path + + +def _parity_panel(ax, frame, title, actual_col, predicted_col, xlabel, ylabel): + m = compute_fit_metrics(frame[actual_col], 
frame[predicted_col]) + x = frame[actual_col].to_numpy(dtype=float) + y = frame[predicted_col].to_numpy(dtype=float) + mask = np.isfinite(x) & np.isfinite(y) + x, y = x[mask], y[mask] + ax.scatter(x, y, s=8, alpha=0.35, color="#1f77b4", edgecolors="none") + if len(x) > 0: + lo = float(min(np.min(x), np.min(y))) + hi = float(max(np.max(x), np.max(y))) + if hi <= lo: + hi = lo + 1.0 + pad = 0.05 * (hi - lo) + ax.plot([lo - pad, hi + pad], [lo - pad, hi + pad], color="#333333", + linestyle="--", linewidth=1.2) + ax.set_xlim(lo - pad, hi + pad) + ax.set_ylim(lo - pad, hi + pad) + ax.set_title(title, fontsize=11, pad=4) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.grid(True, alpha=0.25) + ax.text(0.02, 0.98, + f"N={int(m['n_obs'])}\nR2={m['r2']:.4f}\nRMSE={m['rmse']:.4f}\nMAE={m['mae']:.4f}", + transform=ax.transAxes, va="top", ha="left", fontsize=9, + bbox={"boxstyle": "round,pad=0.2", "facecolor": "white", + "alpha": 0.75, "edgecolor": "#bbbbbb"}) + return m + + +def plot_overall_parity(step1_frame, step2_frame, out_path): + fig, axes = plt.subplots(1, 2, figsize=(12.8, 5.6)) + step1_m = _parity_panel(axes[0], step1_frame, + "Step 1: beta-regression fit\n(demeaned price: dp vs dp_hat)", + "actual_dprice", "predicted_dprice", + "Actual demeaned price (2004$/MMBtu)", + "Predicted demeaned price (2004$/MMBtu)") + step2_m = _parity_panel(axes[1], step2_frame, + "Step 2: alpha-regression scenarios\n(level price: alpha2 + beta x demand)", + "actual_2004", "predicted_2004", + "Actual (2004$/MMBtu)", "Predicted (2004$/MMBtu)") + fig.suptitle("Results validation parity: step1 demeaned fit vs step2 level fit", + fontsize=14, y=0.98) + fig.tight_layout(rect=[0, 0, 1, 0.95]) + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, dpi=220, bbox_inches="tight") + plt.close(fig) + return step1_m, step2_m + + +def plot_actual_vs_predicted(frame, region_order, out_path): + scenarios = list(dict.fromkeys(frame["scenario_label"].astype(str).tolist())) + ncols = 3 + nrows = int(np.ceil(len(region_order) / ncols)) + fig, axes = plt.subplots(nrows, ncols, figsize=(5.3 * ncols, 4.0 * nrows), sharex=True) + axes = np.atleast_1d(axes).ravel() + for i, region in enumerate(region_order): + ax = axes[i] + reg = frame[frame["region"] == region].copy() + if reg.empty: + ax.set_title(region, fontsize=10) + ax.grid(True, alpha=0.25) + continue + for scen in scenarios: + sdf = reg[reg["scenario_label"] == scen].sort_values("year") + if sdf.empty: + continue + color = SCENARIO_COLOR.get(scen) + ax.plot(sdf["year"], sdf["actual_2004"], color=color, linewidth=1.6) + ax.plot(sdf["year"], sdf["predicted_2004"], color=color, + linewidth=3.0, linestyle="--", alpha=0.98, zorder=4) + ax.set_title(region, fontsize=10, pad=4) + ax.set_ylabel("2004$/MMBtu") + ax.grid(True, alpha=0.25) + for i in range(len(region_order), len(axes)): + axes[i].axis("off") + legend_handles = [] + for scen in scenarios: + color = SCENARIO_COLOR.get(scen, "#333333") + legend_handles.append(Line2D([0], [0], color=color, lw=2.0, linestyle="-", + label=f"{scen} actual")) + legend_handles.append(Line2D([0], [0], color=color, lw=2.0, linestyle="--", + alpha=0.7, label=f"{scen} predicted")) + fig.legend(handles=legend_handles, loc="lower center", + ncol=max(2, min(6, len(legend_handles))), fontsize=8, + bbox_to_anchor=(0.5, -0.005)) + fig.suptitle("Results validation: actual price vs predicted\n" + "(predicted = alpha(region,year,scenario) + beta x scenario demand)", + fontsize=14, y=0.98) + fig.tight_layout(rect=[0, 0.04, 1, 0.95]) + 
out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, dpi=220, bbox_inches="tight") + plt.close(fig) + + +def plot_alpha_vs_alpha1(frame, region_order, out_path): + scenarios = list(dict.fromkeys(frame["scenario_label"].astype(str).tolist())) + alpha1_shared = bool( + frame.groupby(["region", "year"], as_index=False)["alpha1"].nunique()["alpha1"].max() <= 1) + ncols = 3 + nrows = int(np.ceil(len(region_order) / ncols)) + fig, axes = plt.subplots(nrows, ncols, figsize=(5.3 * ncols, 4.0 * nrows), sharex=True) + axes = np.atleast_1d(axes).ravel() + for i, region in enumerate(region_order): + ax = axes[i] + reg = frame[frame["region"] == region].copy() + if reg.empty: + ax.set_title(region, fontsize=10) + ax.grid(True, alpha=0.25) + continue + for scen in scenarios: + sdf = reg[reg["scenario_label"] == scen].sort_values("year") + if sdf.empty: + continue + color = SCENARIO_COLOR.get(scen) + ax.plot(sdf["year"], sdf["alpha_2004"], color=color, linewidth=1.7) + if not alpha1_shared: + ax.plot(sdf["year"], sdf["alpha1"], color=color, linewidth=2.6, + linestyle="--", alpha=0.98, zorder=4) + if alpha1_shared: + a1 = reg[["year", "alpha1"]].drop_duplicates(subset=["year"]).sort_values("year") + ax.plot(a1["year"], a1["alpha1"], color="#222222", linewidth=2.8, + linestyle="--", alpha=0.98, zorder=5, label="alpha1 shared") + ax.set_title(region, fontsize=10, pad=4) + ax.set_ylabel("alpha") + ax.grid(True, alpha=0.25) + for i in range(len(region_order), len(axes)): + axes[i].axis("off") + legend_handles = [] + for scen in scenarios: + color = SCENARIO_COLOR.get(scen, "#333333") + legend_handles.append(Line2D([0], [0], color=color, lw=2.2, linestyle="-", + label=f"{scen} alpha2")) + if not alpha1_shared: + legend_handles.append(Line2D([0], [0], color=color, lw=2.6, linestyle="--", + alpha=0.98, label=f"{scen} alpha1")) + if alpha1_shared: + legend_handles.append(Line2D([0], [0], color="#222222", lw=2.8, linestyle="--", + alpha=0.98, label="alpha1 shared")) + fig.legend(handles=legend_handles, loc="lower center", + ncol=max(2, min(6, len(legend_handles))), fontsize=8, + bbox_to_anchor=(0.5, -0.005)) + fig.suptitle("Results validation: alpha2 vs alpha1\n" + "(alpha2 = alpha regression output, alpha1 = beta regression output)", + fontsize=14, y=0.98) + fig.tight_layout(rect=[0, 0.04, 1, 0.95]) + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, dpi=220, bbox_inches="tight") + plt.close(fig) + + +def run_validation( + config: dict[str, Any], + base_dir: Path, + beta_out_dir: Path, + alpha_out_dir: Path, + alpha_input_dir: Path, + validation_dir: Path, + region_order: list[str], +) -> None: + """Run full validation: CSV metrics + plots.""" + aeo_year = int(config["aeo_year"]) + first_model_year = int(config["start_year"]) + deflator = float(config["ng"]["price_deflator_to_2004"]) + validation_dir.mkdir(parents=True, exist_ok=True) + + # Load betas + beta_reg_map, beta_reg_src = load_regional_beta_map(alpha_out_dir, alpha_input_dir) + beta_nat, beta_nat_src = load_national_beta(alpha_out_dir, alpha_input_dir, beta_out_dir) + missing_regions = [r for r in region_order if r not in beta_reg_map] + require(not missing_regions, f"Missing regional beta: {missing_regions}") + + beta_reg_step_map = load_regional_beta_from_csv(beta_out_dir / "cd_beta0.csv") + beta_nat_step = load_national_beta_from_csv(beta_out_dir / "national_beta.csv") + + # Beta comparison CSV + beta_compare_rows = [{ + "scope": "national", "region": "ALL", + "beta1_from_beta_step": beta_nat_step, + 
"beta2_used_in_alpha_step": beta_nat, + "diff_beta1_minus_beta2": beta_nat_step - beta_nat, + }] + for region in region_order: + b1, b2 = beta_reg_step_map[region], beta_reg_map[region] + beta_compare_rows.append({ + "scope": "regional", "region": region, + "beta1_from_beta_step": b1, + "beta2_used_in_alpha_step": b2, + "diff_beta1_minus_beta2": b1 - b2, + }) + pd.DataFrame(beta_compare_rows).to_csv( + validation_dir / "results_validation_beta_comparison.csv", + index=False, float_format="%.6f") + + # Build step-2 validation frame + frame = build_validation_frame(alpha_out_dir, aeo_year, region_order) + frame["actual_2004"] = frame["ng_price"] * deflator + frame["beta_reg"] = frame["region"].map(beta_reg_map) + frame["beta_nat"] = beta_nat + zero_ng_betas_in_first_model_year(frame, first_model_year) + frame["predicted_2004"] = (frame["alpha_2004"] + + frame["beta_reg"] * frame["demand_elec_quads"] + + frame["beta_nat"] * frame["q_nat"]) + frame["error"] = frame["actual_2004"] - frame["predicted_2004"] + + # Alpha1 from beta step + alpha1_frame, _, alpha1_merge_cols = load_alpha_from_beta_step(beta_out_dir) + step1_frame, _ = build_step1_beta_validation_frame( + beta_out_dir, beta_reg_step_map, beta_nat_step, + alpha1_frame, alpha1_merge_cols, first_model_year) + + frame = frame.merge(alpha1_frame, on=alpha1_merge_cols, how="left") + frame["alpha_vs_alpha1_error"] = frame["alpha_2004"] - frame["alpha1"] + alpha_cmp_frame = frame[~frame["alpha1"].isna()].copy() + + # Write detail/summary CSVs + step1_frame[[ + "scenario_id", "scenario_label", "region", "year", + "actual_2004", "predicted_2004", "error", + "actual_dprice", "predicted_dprice", "error_dprice", + "alpha1", "beta_reg", "beta_nat", "demand_elec_quads", "q_nat", + ]].to_csv(validation_dir / "results_validation_step1_beta_actual_vs_predicted_detail.csv", + index=False, float_format="%.6f") + + summarize_fit(step1_frame, ["scenario_label", "region"], + "actual_dprice", "predicted_dprice", + "mae_error", "rmse_error", "max_abs_error", "r2").to_csv( + validation_dir / "results_validation_step1_beta_actual_vs_predicted_summary.csv", + index=False, float_format="%.6f") + + frame[[ + "scenario_id", "scenario_label", "region", "year", + "actual_2004", "predicted_2004", "error", + "alpha_2004", "beta_reg", "beta_nat", "demand_elec_quads", "q_nat", + "alpha1", "alpha_vs_alpha1_error", + ]].to_csv(validation_dir / "results_validation_actual_vs_predicted_detail.csv", + index=False, float_format="%.6f") + + summarize_fit(frame, ["scenario_label", "region"], + "actual_2004", "predicted_2004", + "mae_error", "rmse_error", "max_abs_error", "r2").to_csv( + validation_dir / "results_validation_actual_vs_predicted_summary.csv", + index=False, float_format="%.6f") + + if not alpha_cmp_frame.empty: + summarize_fit(alpha_cmp_frame, ["scenario_label", "region"], + "alpha_2004", "alpha1", + "mae_alpha1_error", "rmse_alpha1_error", + "max_abs_alpha1_error", "r2_alpha1").to_csv( + validation_dir / "results_validation_alpha_vs_alpha1_summary.csv", + index=False, float_format="%.6f") + + alpha_spread = (alpha_cmp_frame.groupby(["region", "year"], as_index=False)["alpha_2004"] + .agg(alpha_min="min", alpha_max="max")) + alpha_spread["alpha_spread"] = alpha_spread["alpha_max"] - alpha_spread["alpha_min"] + alpha_spread.to_csv(validation_dir / "results_validation_alpha_vs_alpha1_spread.csv", + index=False, float_format="%.6f") + + alpha_cmp_frame[[ + "scenario_id", "scenario_label", "region", "year", + "alpha_2004", "alpha1", "alpha_vs_alpha1_error", + 
]].to_csv(validation_dir / "results_validation_alpha_vs_alpha1_detail.csv", + index=False, float_format="%.6f") + + # Plots + plot_actual_vs_predicted(frame, region_order, + validation_dir / "results_validation_actual_vs_predicted.png") + if not alpha_cmp_frame.empty: + plot_alpha_vs_alpha1(alpha_cmp_frame, region_order, + validation_dir / "results_validation_alpha_vs_alpha1.png") + step1_m, step2_m = plot_overall_parity(step1_frame, frame, + validation_dir / "results_validation_parity_overall.png") + + pd.DataFrame([ + {"step": "step1_beta_regression_scenarios", "metric_basis": "demeaned_price_dp", + "n_obs": int(step1_m["n_obs"]), "mae_error": step1_m["mae"], + "rmse_error": step1_m["rmse"], "max_abs_error": step1_m["max_abs"], "r2": step1_m["r2"]}, + {"step": "step2_alpha_regression_scenarios", "metric_basis": "level_price_2004_per_mmbtu", + "n_obs": int(step2_m["n_obs"]), "mae_error": step2_m["mae"], + "rmse_error": step2_m["rmse"], "max_abs_error": step2_m["max_abs"], "r2": step2_m["r2"]}, + ]).to_csv(validation_dir / "results_validation_overall_metrics.csv", + index=False, float_format="%.6f") + + LOGGER.info("Validation complete. Outputs in %s", validation_dir) + + +# ============================================================================ +# CLI +# ============================================================================ + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Unified visualization and validation for NG regression pipeline.") + parser.add_argument("--config", default="aeo_pipeline_config.json") + parser.add_argument("--beta-output-dir", default="outputs of beta regression") + parser.add_argument("--alpha-output-dir", default=None) + parser.add_argument("--alpha-input-dir", default=None) + parser.add_argument("--output-dir", default="results validation", + help="Directory for all plots and validation CSVs.") + parser.add_argument("--skip-raw-scatter", action="store_true", + help="Skip beta raw-data scatter grid.") + parser.add_argument("--skip-beta", action="store_true", + help="Skip beta diagnostic plots.") + parser.add_argument("--skip-alpha", action="store_true", + help="Skip alpha decomposition plots.") + parser.add_argument("--skip-validation", action="store_true", + help="Skip validation plots and CSVs.") + parser.add_argument("--log-level", default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"]) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + logging.basicConfig(level=getattr(logging, args.log_level), + format="%(asctime)s | %(levelname)s | %(message)s") + + script_dir = Path(__file__).resolve().parent + cfg_path = Path(args.config) + if not cfg_path.is_absolute(): + cwd_candidate = resolve_case_insensitive((Path.cwd() / cfg_path).resolve()) + script_candidate = resolve_case_insensitive((script_dir / cfg_path).resolve()) + cfg_path = cwd_candidate if cwd_candidate.exists() else script_candidate + + config = load_config(cfg_path) + base_dir = cfg_path.parent + + beta_out_dir = resolve_path(base_dir, args.beta_output_dir) + alpha_out_dir = resolve_path(base_dir, + args.alpha_output_dir or config["paths"]["output_dir"]) + alpha_input_dir = resolve_path(base_dir, + args.alpha_input_dir or config["paths"].get("input_dir", "inputs for alpha regression")) + output_dir = resolve_path(base_dir, args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + region_order = region_order_from_config(config) + aeo_year = int(config["aeo_year"]) + first_model_year = 
int(config["start_year"]) + deflator = float(config["ng"]["price_deflator_to_2004"]) + + if not args.skip_raw_scatter: + generate_raw_scatter_grid(beta_out_dir, config, output_dir) + + if not args.skip_beta: + generate_beta_plots(beta_out_dir, region_order, output_dir) + + if not args.skip_alpha: + generate_alpha_plots( + alpha_out_dir=alpha_out_dir, + alpha_input_dir=alpha_input_dir, + beta_out_dir=beta_out_dir, + plots_dir=output_dir, + region_order=region_order, + aeo_year=aeo_year, + deflator_to_2004=deflator, + first_model_year=first_model_year, + ) + + if not args.skip_validation: + run_validation( + config=config, + base_dir=base_dir, + beta_out_dir=beta_out_dir, + alpha_out_dir=alpha_out_dir, + alpha_input_dir=alpha_input_dir, + validation_dir=output_dir, + region_order=region_order, + ) + + LOGGER.info("All visualization and validation complete.") + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as exc: + LOGGER.exception("Failed: %s", exc) + sys.exit(1)