VectorInstitute · ethancjackson · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/implementations/energy_oil_forecasting/04_systematic_backtest_eval.ipynb b/implementations/energy_oil_forecasting/04_systematic_backtest_eval.ipynb
@@ -33,11 +33,118 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "4760f015",
    "metadata": {},
-   "outputs": [],
-   "source": "import warnings\nfrom pathlib import Path\n\nimport energy_oil_forecasting\nimport pandas as pd\nimport yaml\nfrom aieng.forecasting.evaluation import (\n    MultiTargetBacktestSpec,\n    cached_multi_backtest,\n    describe_spec,\n)\nfrom energy_oil_forecasting.data import build_wti_service\n\n\nwarnings.filterwarnings(\"ignore\")\n\n# ── Mode ──────────────────────────────────────────────────────────────────────\n# Set SMOKE_TEST = True to run a 2-origin, 1-sample version of the notebook\n# for fast local development and end-to-end CI testing. The full specs run\n# 51 backtest + 8 eval origins; smoke runs 2 + 2.\nSMOKE_TEST = True\n\n# ── Model selection ───────────────────────────────────────────────────────────\n# Two project models: \"gemini-3.1-flash-lite-preview\" (lite/default) and\n# \"gemini-3.5-flash\" (advanced). Change these two lines to swap models for the\n# whole notebook (bare proxy names — no \"gemini/\" prefix).\nAGENT_MODEL = \"gemini-3.1-flash-lite-preview\"\nLLMP_MODEL = \"gemini-3.1-flash-lite-preview\"\n\n# ── Derived settings (do not edit below) ─────────────────────────────────────\nN_SAMPLES = 1 if SMOKE_TEST else 3  # trajectories per LLMP call\n\ndata_service = build_wti_service()\n\nspec_dir = Path(energy_oil_forecasting.__file__).parent / \"specs\"\nif SMOKE_TEST:\n    backtest_file, eval_file = \"energy_oil_smoke.yaml\", \"energy_oil_eval_smoke.yaml\"\nelse:\n    backtest_file, eval_file = \"energy_oil_backtest.yaml\", \"energy_oil_eval.yaml\"\n\nwith open(spec_dir / backtest_file) as f:\n    backtest_spec = MultiTargetBacktestSpec.model_validate(yaml.safe_load(f))\nwith open(spec_dir / eval_file) as f:\n    eval_spec = MultiTargetBacktestSpec.model_validate(yaml.safe_load(f))\n\nprint(\n    f\"{'⚡ SMOKE MODE' if SMOKE_TEST else '📊 FULL MODE'} — AGENT_MODEL={AGENT_MODEL!r}  LLMP_MODEL={LLMP_MODEL!r}  N_SAMPLES={N_SAMPLES}\"\n)\nprint()\nprint(\"━\" * 72)\nprint(\"LOADED SPECIFICATIONS:\")\nprint(\"━\" * 72)\nprint(describe_spec(backtest_spec, data_service))\nprint(describe_spec(eval_spec, data_service))"
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "⚡ SMOKE MODE — AGENT_MODEL='gemini-3.1-flash-lite-preview'  LLMP_MODEL='gemini-3.1-flash-lite-preview'  N_SAMPLES=1\n",
+      "\n",
+      "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+      "LOADED SPECIFICATIONS:\n",
+      "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+      "MultiTargetBacktestSpec (spec_id=energy_oil_smoke)\n",
+      "  description: Two-origin smoke backtest for local and CI testing of the NB04 pipeline. Uses the same tasks, horizons, and warmup as energy_oil_backtest but with only 2 weekly origins so the full notebook can be exercised without burning tokens on 51 × 5 predictor evaluations.\n",
+      "  start:       2025-06-02 00:00:00\n",
+      "  end:         2025-06-09 00:00:00\n",
+      "  stride:      5\n",
+      "  warmup:      250\n",
+      "  tasks:       1\n",
+      "\n",
+      "Task: wti_oil_price_forecast\n",
+      "  description: WTI Crude Oil continuous front-month futures Close price (yfinance symbol: CL=F), projected 5, 10, and 21 trading days ahead.\n",
+      "  horizons:    [5, 10, 21] (len=3)\n",
+      "  frequency:   B\n",
+      "  payload:     continuous\n",
+      "  resolution:  observed_value_at_resolution_timestamp\n",
+      "- target_series_id: wti_crude_oil_price\n",
+      "    description:    WTI Crude Oil continuous front-month futures adjusted close (Yahoo Finance CL=F)\n",
+      "    source:         yfinance\n",
+      "    units:          USD/bbl\n",
+      "    frequency:      B\n",
+      "\n",
+      "MultiTargetBacktestSpec (spec_id=energy_oil_eval_smoke)\n",
+      "  description: Two-origin smoke evaluation for local and CI testing of the NB04 pipeline. Uses the same tasks, horizons, warmup, and geopolitical period as energy_oil_eval but with only 2 origins to keep cost negligible.\n",
+      "  start:       2026-02-02 00:00:00\n",
+      "  end:         2026-02-09 00:00:00\n",
+      "  stride:      5\n",
+      "  warmup:      250\n",
+      "  tasks:       1\n",
+      "\n",
+      "Task: wti_oil_price_forecast\n",
+      "  description: WTI Crude Oil continuous front-month futures Close price (yfinance symbol: CL=F), projected 5, 10, and 21 trading days ahead.\n",
+      "  horizons:    [5, 10, 21] (len=3)\n",
+      "  frequency:   B\n",
+      "  payload:     continuous\n",
+      "  resolution:  observed_value_at_resolution_timestamp\n",
+      "- target_series_id: wti_crude_oil_price\n",
+      "    description:    WTI Crude Oil continuous front-month futures adjusted close (Yahoo Finance CL=F)\n",
+      "    source:         yfinance\n",
+      "    units:          USD/bbl\n",
+      "    frequency:      B\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import warnings\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import energy_oil_forecasting\n",
+    "import pandas as pd\n",
+    "import yaml\n",
+    "from aieng.forecasting.evaluation import (\n",
+    "    MultiTargetBacktestSpec,\n",
+    "    cached_multi_backtest,\n",
+    "    describe_spec,\n",
+    ")\n",
+    "from energy_oil_forecasting.data import build_wti_service\n",
+    "\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "# ── Mode ──────────────────────────────────────────────────────────────────────\n",
+    "# Set SMOKE_TEST = True to run a 2-origin, 1-sample version of the notebook\n",
+    "# for fast local development and end-to-end CI testing. The full specs run\n",
+    "# 51 backtest + 8 eval origins; smoke runs 2 + 2.\n",
+    "SMOKE_TEST = True\n",
+    "\n",
+    "# ── Model selection ───────────────────────────────────────────────────────────\n",
+    "# Two project models: \"gemini-3.1-flash-lite-preview\" (lite/default) and\n",
+    "# \"gemini-3.5-flash\" (advanced). Change these two lines to swap models for the\n",
+    "# whole notebook (bare proxy names — no \"gemini/\" prefix).\n",
+    "AGENT_MODEL = \"gemini-3.1-flash-lite-preview\"\n",
+    "LLMP_MODEL = \"gemini-3.1-flash-lite-preview\"\n",
+    "\n",
+    "# ── Derived settings (do not edit below) ─────────────────────────────────────\n",
+    "N_SAMPLES = 1 if SMOKE_TEST else 3  # trajectories per LLMP call\n",
+    "\n",
+    "data_service = build_wti_service()\n",
+    "\n",
+    "spec_dir = Path(energy_oil_forecasting.__file__).parent / \"specs\"\n",
+    "if SMOKE_TEST:\n",
+    "    backtest_file, eval_file = \"energy_oil_smoke.yaml\", \"energy_oil_eval_smoke.yaml\"\n",
+    "else:\n",
+    "    backtest_file, eval_file = \"energy_oil_backtest.yaml\", \"energy_oil_eval.yaml\"\n",
+    "\n",
+    "with open(spec_dir / backtest_file) as f:\n",
+    "    backtest_spec = MultiTargetBacktestSpec.model_validate(yaml.safe_load(f))\n",
+    "with open(spec_dir / eval_file) as f:\n",
+    "    eval_spec = MultiTargetBacktestSpec.model_validate(yaml.safe_load(f))\n",
+    "\n",
+    "print(\n",
+    "    f\"{'⚡ SMOKE MODE' if SMOKE_TEST else '📊 FULL MODE'} — AGENT_MODEL={AGENT_MODEL!r}  LLMP_MODEL={LLMP_MODEL!r}  N_SAMPLES={N_SAMPLES}\"\n",
+    ")\n",
+    "print()\n",
+    "print(\"━\" * 72)\n",
+    "print(\"LOADED SPECIFICATIONS:\")\n",
+    "print(\"━\" * 72)\n",
+    "print(describe_spec(backtest_spec, data_service))\n",
+    "print(describe_spec(eval_spec, data_service))"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -68,45 +175,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "2f52a6fe",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Active predictors (2):\n",
-      "  Naive (Last Value)\n",
-      "  AutoARIMA\n"
-     ]
-    }
-   ],
-   "source": [
-    "from aieng.forecasting.methods import (\n",
-    "    LastValuePredictor,\n",
-    ")\n",
-    "from aieng.forecasting.methods.numerical.darts_arima import DartsAutoARIMAPredictor\n",
-    "\n",
-    "\n",
-    "# ── Predictors ────────────────────────────────────────────────────────────────\n",
-    "# AutoARIMA is the primary method; Naive is the lower-bound baseline.\n",
-    "# Both are evaluated in every section — no contender selection needed.\n",
-    "# NOTE: AutoARIMA re-fits at every origin (slow on first run; cached after).\n",
-    "PREDICTORS = {\n",
-    "    \"Naive (Last Value)\": LastValuePredictor(),\n",
-    "    \"AutoARIMA\": DartsAutoARIMAPredictor(),\n",
-    "    # ── Optional comparisons (not the focus of this experiment) ──────────────\n",
-    "    # \"Prophet\": ProphetPredictor(predictor_id=\"prophet_daily\", ...),\n",
-    "    # f\"LLMP-Sampled ({LLMP_MODEL})\": SampledTrajectoryLLMPredictor(...),\n",
-    "    # f\"LLMP-Grid ({LLMP_MODEL})\": QuantileGridLLMPredictor(...),\n",
-    "    # f\"News Agent ({AGENT_MODEL})\": build_wti_agent_predictor(...),\n",
-    "}\n",
-    "\n",
-    "print(f\"Active predictors ({len(PREDICTORS)}):\")\n",
-    "for name in PREDICTORS:\n",
-    "    print(f\"  {name}\")"
-   ]
+   "outputs": [],
+   "source": "from aieng.forecasting.methods import (\n    LastValuePredictor,\n    QuantileGridLLMPredictor,  # noqa: F401\n    QuantileGridLLMPredictorConfig,  # noqa: F401\n    SampledTrajectoryLLMPredictor,  # noqa: F401\n    SampledTrajectoryLLMPredictorConfig,  # noqa: F401\n)\nfrom aieng.forecasting.methods.numerical.darts_arima import DartsAutoARIMAPredictor\nfrom energy_oil_forecasting.analyst_agent import build_wti_agent_predictor, build_wti_news_config  # noqa: F401\nfrom energy_oil_forecasting.prophet_baseline import ProphetPredictor  # noqa: F401\n\n\n# ── Predictors ────────────────────────────────────────────────────────────────\n# AutoARIMA is the primary method; Naive is the lower-bound baseline.\n# Both are evaluated in every section — no contender selection needed.\n# NOTE: AutoARIMA re-fits at every origin (slow on first run; cached after).\nPREDICTORS = {\n    \"Naive (Last Value)\": LastValuePredictor(),\n    \"AutoARIMA\": DartsAutoARIMAPredictor(),\n    # ── Optional comparisons (not the focus of this experiment) ──────────────\n    # \"Prophet\": ProphetPredictor(),\n    # f\"LLMP-Sampled ({LLMP_MODEL})\": SampledTrajectoryLLMPredictor(\n    #     SampledTrajectoryLLMPredictorConfig(model=LLMP_MODEL, n_samples=N_SAMPLES)\n    # ),\n    # f\"LLMP-Grid ({LLMP_MODEL})\": QuantileGridLLMPredictor(\n    #     QuantileGridLLMPredictorConfig(model=LLMP_MODEL)\n    # ),\n    # f\"News Agent ({AGENT_MODEL})\": build_wti_agent_predictor(\n    #     build_wti_news_config(model=AGENT_MODEL)\n    # ),\n}\n\nprint(f\"Active predictors ({len(PREDICTORS)}):\")\nfor name in PREDICTORS:\n    print(f\"  {name}\")"
   },
   {
    "cell_type": "markdown",
@@ -131,11 +204,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Running 2025 rolling backtest (2 predictor(s))...\n",
+      "Running 2025 rolling backtest (6 predictor(s))...\n",
       "LLM/agent runs are expensive — first run will take several minutes.\n",
       "\n",
       "  Naive (Last Value) ✓\n",
       "  AutoARIMA ✓\n",
+      "  Prophet ✓\n",
+      "  LLMP-Sampled (gemini-3.1-flash-lite-preview) ✓\n",
+      "  LLMP-Grid (gemini-3.1-flash-lite-preview) ✓\n",
+      "  News Agent (gemini-3.1-flash-lite-preview) ✓\n",
       "\n",
       "All 2025 backtests complete.\n"
      ]
@@ -183,12 +260,16 @@
       "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
       "2025 HISTORICAL BACKTEST — PERFORMANCE SUMMARY:\n",
       "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
-      "                    Mean CRPS  MAE h=21d\n",
-      "Predictor                               \n",
-      "AutoARIMA            2.472053   3.076572\n",
-      "Naive (Last Value)   2.929931   3.016445\n",
+      "                                              Mean CRPS  MAE h=21d\n",
+      "Predictor                                                         \n",
+      "AutoARIMA                                      3.866155        NaN\n",
+      "LLMP-Grid (gemini-3.1-flash-lite-preview)      4.095949        NaN\n",
+      "LLMP-Sampled (gemini-3.1-flash-lite-preview)   5.074999        NaN\n",
+      "Naive (Last Value)                             5.834998        NaN\n",
+      "News Agent (gemini-3.1-flash-lite-preview)     6.134875        NaN\n",
+      "Prophet                                       12.845009        NaN\n",
       "\n",
-      "AutoARIMA CRPS improvement over Naive: 0.4579 (15.6%)\n"
+      "AutoARIMA CRPS improvement over Naive: 1.9688 (33.7%)\n"
      ]
     }
    ],
@@ -231,29 +312,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "496dc416",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Saved 2 backtest result(s) to adaptive_agent/curriculum/\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# ── Save backtest results for NB05 / NB06 ────────────────────────────────────\n",
-    "# Downstream notebooks load these with BacktestResult.model_validate_json().\n",
-    "# The files are gitignored (derived data) and regenerated by re-running NB04.\n",
-    "# cached_multi_backtest returns {task_id: BacktestResult}; extract single task.\n",
+    "# Only the two baseline predictors are written to curriculum/ so that\n",
+    "# uncommenting the optional predictors above does not pollute the files\n",
+    "# that NB05 and NB06 depend on.\n",
     "_CURRICULUM_DIR = Path(\"adaptive_agent/curriculum\")\n",
     "_CURRICULUM_DIR.mkdir(exist_ok=True)\n",
+    "_BASELINE_PREDICTORS = {\"Naive (Last Value)\", \"AutoARIMA\"}\n",
     "for _name, _result_dict in backtest_results.items():\n",
+    "    if _name not in _BASELINE_PREDICTORS:\n",
+    "        continue\n",
     "    _result = next(iter(_result_dict.values()))\n",
     "    (_CURRICULUM_DIR / f\"backtest_{_name}.json\").write_text(_result.model_dump_json(), encoding=\"utf-8\")\n",
-    "print(f\"Saved {len(backtest_results)} backtest result(s) to {_CURRICULUM_DIR}/\")"
+    "print(f\"Saved {sum(n in _BASELINE_PREDICTORS for n in backtest_results)} backtest result(s) to {_CURRICULUM_DIR}/\")"
    ]
   },
   {
@@ -289,6 +365,10 @@
       "Running 2026 evaluation...\n",
       "  Naive (Last Value) ✓\n",
       "  AutoARIMA ✓\n",
+      "  Prophet ✓\n",
+      "  LLMP-Sampled (gemini-3.1-flash-lite-preview) ✓\n",
+      "  LLMP-Grid (gemini-3.1-flash-lite-preview) ✓\n",
+      "  News Agent (gemini-3.1-flash-lite-preview) ✓\n",
       "\n",
       "2026 evaluation complete.\n"
      ]
@@ -306,25 +386,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "a49c24d5",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Saved 2 eval result(s) to adaptive_agent/curriculum/\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# ── Save eval results for NB06 ───────────────────────────────────────────────\n",
-    "# eval_results is {predictor_name: {task_id: BacktestResult}}; extract single task.\n",
+    "# Only baseline predictors are written so uncommenting optional predictors\n",
+    "# above does not add extra rows to the NB06 scorecard.\n",
     "for _name, _result_dict in eval_results.items():\n",
+    "    if _name not in _BASELINE_PREDICTORS:\n",
+    "        continue\n",
     "    _result = next(iter(_result_dict.values()))\n",
     "    (_CURRICULUM_DIR / f\"eval_{_name}.json\").write_text(_result.model_dump_json(), encoding=\"utf-8\")\n",
-    "print(f\"Saved {len(eval_results)} eval result(s) to {_CURRICULUM_DIR}/\")"
+    "print(f\"Saved {sum(n in _BASELINE_PREDICTORS for n in eval_results)} eval result(s) to {_CURRICULUM_DIR}/\")"
    ]
   },
   {
@@ -353,10 +428,14 @@
       "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
       "2026 EVAL SCORECARD — STATELESS BASELINE:\n",
       "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
-      "                    Mean CRPS (2026)  MAE h=21d (2026)  80% CI Coverage\n",
-      "Predictor                                                              \n",
-      "AutoARIMA                  10.998416         14.100308             25.0\n",
-      "Naive (Last Value)         13.643182         13.999166              0.0\n"
+      "                                              Mean CRPS (2026)  MAE h=21d (2026)  80% CI Coverage\n",
+      "Predictor                                                                                        \n",
+      "Prophet                                               2.294532               NaN              NaN\n",
+      "AutoARIMA                                             6.198680               NaN              NaN\n",
+      "News Agent (gemini-3.1-flash-lite-preview)            7.060907               NaN              NaN\n",
+      "LLMP-Grid (gemini-3.1-flash-lite-preview)             7.624998               NaN              NaN\n",
+      "Naive (Last Value)                                    8.214998               NaN              NaN\n",
+      "LLMP-Sampled (gemini-3.1-flash-lite-preview)          9.069998               NaN              NaN\n"
      ]
     }
    ],