Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
231 changes: 155 additions & 76 deletions implementations/energy_oil_forecasting/04_systematic_backtest_eval.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,118 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "4760f015",
"metadata": {},
"outputs": [],
"source": "import warnings\nfrom pathlib import Path\n\nimport energy_oil_forecasting\nimport pandas as pd\nimport yaml\nfrom aieng.forecasting.evaluation import (\n MultiTargetBacktestSpec,\n cached_multi_backtest,\n describe_spec,\n)\nfrom energy_oil_forecasting.data import build_wti_service\n\n\nwarnings.filterwarnings(\"ignore\")\n\n# ── Mode ──────────────────────────────────────────────────────────────────────\n# Set SMOKE_TEST = True to run a 2-origin, 1-sample version of the notebook\n# for fast local development and end-to-end CI testing. The full specs run\n# 51 backtest + 8 eval origins; smoke runs 2 + 2.\nSMOKE_TEST = True\n\n# ── Model selection ───────────────────────────────────────────────────────────\n# Two project models: \"gemini-3.1-flash-lite-preview\" (lite/default) and\n# \"gemini-3.5-flash\" (advanced). Change these two lines to swap models for the\n# whole notebook (bare proxy names β€” no \"gemini/\" prefix).\nAGENT_MODEL = \"gemini-3.1-flash-lite-preview\"\nLLMP_MODEL = \"gemini-3.1-flash-lite-preview\"\n\n# ── Derived settings (do not edit below) ─────────────────────────────────────\nN_SAMPLES = 1 if SMOKE_TEST else 3 # trajectories per LLMP call\n\ndata_service = build_wti_service()\n\nspec_dir = Path(energy_oil_forecasting.__file__).parent / \"specs\"\nif SMOKE_TEST:\n backtest_file, eval_file = \"energy_oil_smoke.yaml\", \"energy_oil_eval_smoke.yaml\"\nelse:\n backtest_file, eval_file = \"energy_oil_backtest.yaml\", \"energy_oil_eval.yaml\"\n\nwith open(spec_dir / backtest_file) as f:\n backtest_spec = MultiTargetBacktestSpec.model_validate(yaml.safe_load(f))\nwith open(spec_dir / eval_file) as f:\n eval_spec = MultiTargetBacktestSpec.model_validate(yaml.safe_load(f))\n\nprint(\n f\"{'⚑ SMOKE MODE' if SMOKE_TEST else 'πŸ“Š FULL MODE'} β€” AGENT_MODEL={AGENT_MODEL!r} LLMP_MODEL={LLMP_MODEL!r} N_SAMPLES={N_SAMPLES}\"\n)\nprint()\nprint(\"━\" * 72)\nprint(\"LOADED SPECIFICATIONS:\")\nprint(\"━\" * 72)\nprint(describe_spec(backtest_spec, 
data_service))\nprint(describe_spec(eval_spec, data_service))"
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"⚑ SMOKE MODE β€” AGENT_MODEL='gemini-3.1-flash-lite-preview' LLMP_MODEL='gemini-3.1-flash-lite-preview' N_SAMPLES=1\n",
"\n",
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
"LOADED SPECIFICATIONS:\n",
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
"MultiTargetBacktestSpec (spec_id=energy_oil_smoke)\n",
" description: Two-origin smoke backtest for local and CI testing of the NB04 pipeline. Uses the same tasks, horizons, and warmup as energy_oil_backtest but with only 2 weekly origins so the full notebook can be exercised without burning tokens on 51 Γ— 5 predictor evaluations.\n",
" start: 2025-06-02 00:00:00\n",
" end: 2025-06-09 00:00:00\n",
" stride: 5\n",
" warmup: 250\n",
" tasks: 1\n",
"\n",
"Task: wti_oil_price_forecast\n",
" description: WTI Crude Oil continuous front-month futures Close price (yfinance symbol: CL=F), projected 5, 10, and 21 trading days ahead.\n",
" horizons: [5, 10, 21] (len=3)\n",
" frequency: B\n",
" payload: continuous\n",
" resolution: observed_value_at_resolution_timestamp\n",
"- target_series_id: wti_crude_oil_price\n",
" description: WTI Crude Oil continuous front-month futures adjusted close (Yahoo Finance CL=F)\n",
" source: yfinance\n",
" units: USD/bbl\n",
" frequency: B\n",
"\n",
"MultiTargetBacktestSpec (spec_id=energy_oil_eval_smoke)\n",
" description: Two-origin smoke evaluation for local and CI testing of the NB04 pipeline. Uses the same tasks, horizons, warmup, and geopolitical period as energy_oil_eval but with only 2 origins to keep cost negligible.\n",
" start: 2026-02-02 00:00:00\n",
" end: 2026-02-09 00:00:00\n",
" stride: 5\n",
" warmup: 250\n",
" tasks: 1\n",
"\n",
"Task: wti_oil_price_forecast\n",
" description: WTI Crude Oil continuous front-month futures Close price (yfinance symbol: CL=F), projected 5, 10, and 21 trading days ahead.\n",
" horizons: [5, 10, 21] (len=3)\n",
" frequency: B\n",
" payload: continuous\n",
" resolution: observed_value_at_resolution_timestamp\n",
"- target_series_id: wti_crude_oil_price\n",
" description: WTI Crude Oil continuous front-month futures adjusted close (Yahoo Finance CL=F)\n",
" source: yfinance\n",
" units: USD/bbl\n",
" frequency: B\n",
"\n"
]
}
],
"source": [
"import warnings\n",
"from pathlib import Path\n",
"\n",
"import energy_oil_forecasting\n",
"import pandas as pd\n",
"import yaml\n",
"from aieng.forecasting.evaluation import (\n",
" MultiTargetBacktestSpec,\n",
" cached_multi_backtest,\n",
" describe_spec,\n",
")\n",
"from energy_oil_forecasting.data import build_wti_service\n",
"\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# ── Mode ──────────────────────────────────────────────────────────────────────\n",
"# Set SMOKE_TEST = True to run a 2-origin, 1-sample version of the notebook\n",
"# for fast local development and end-to-end CI testing. The full specs run\n",
"# 51 backtest + 8 eval origins; smoke runs 2 + 2.\n",
"SMOKE_TEST = True\n",
"\n",
"# ── Model selection ───────────────────────────────────────────────────────────\n",
"# Two project models: \"gemini-3.1-flash-lite-preview\" (lite/default) and\n",
"# \"gemini-3.5-flash\" (advanced). Change these two lines to swap models for the\n",
"# whole notebook (bare proxy names — no \"gemini/\" prefix).\n",
"AGENT_MODEL = \"gemini-3.1-flash-lite-preview\"\n",
"LLMP_MODEL = \"gemini-3.1-flash-lite-preview\"\n",
"\n",
"# ── Derived settings (do not edit below) ──────────────────────────────────────\n",
"N_SAMPLES = 1 if SMOKE_TEST else 3 # trajectories per LLMP call\n",
"\n",
"data_service = build_wti_service()\n",
"\n",
"spec_dir = Path(energy_oil_forecasting.__file__).parent / \"specs\"\n",
"if SMOKE_TEST:\n",
" backtest_file, eval_file = \"energy_oil_smoke.yaml\", \"energy_oil_eval_smoke.yaml\"\n",
"else:\n",
" backtest_file, eval_file = \"energy_oil_backtest.yaml\", \"energy_oil_eval.yaml\"\n",
"\n",
"with open(spec_dir / backtest_file) as f:\n",
" backtest_spec = MultiTargetBacktestSpec.model_validate(yaml.safe_load(f))\n",
"with open(spec_dir / eval_file) as f:\n",
" eval_spec = MultiTargetBacktestSpec.model_validate(yaml.safe_load(f))\n",
"\n",
"print(\n",
" f\"{'⚑ SMOKE MODE' if SMOKE_TEST else 'πŸ“Š FULL MODE'} β€” AGENT_MODEL={AGENT_MODEL!r} LLMP_MODEL={LLMP_MODEL!r} N_SAMPLES={N_SAMPLES}\"\n",
")\n",
"print()\n",
"print(\"━\" * 72)\n",
"print(\"LOADED SPECIFICATIONS:\")\n",
"print(\"━\" * 72)\n",
"print(describe_spec(backtest_spec, data_service))\n",
"print(describe_spec(eval_spec, data_service))"
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -68,45 +175,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "2f52a6fe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Active predictors (2):\n",
" Naive (Last Value)\n",
" AutoARIMA\n"
]
}
],
"source": [
"from aieng.forecasting.methods import (\n",
" LastValuePredictor,\n",
")\n",
"from aieng.forecasting.methods.numerical.darts_arima import DartsAutoARIMAPredictor\n",
"\n",
"\n",
"# ── Predictors ────────────────────────────────────────────────────────────────\n",
"# AutoARIMA is the primary method; Naive is the lower-bound baseline.\n",
"# Both are evaluated in every section — no contender selection needed.\n",
"# NOTE: AutoARIMA re-fits at every origin (slow on first run; cached after).\n",
"PREDICTORS = {\n",
" \"Naive (Last Value)\": LastValuePredictor(),\n",
" \"AutoARIMA\": DartsAutoARIMAPredictor(),\n",
" # ── Optional comparisons (not the focus of this experiment) ──────────────\n",
" # \"Prophet\": ProphetPredictor(predictor_id=\"prophet_daily\", ...),\n",
" # f\"LLMP-Sampled ({LLMP_MODEL})\": SampledTrajectoryLLMPredictor(...),\n",
" # f\"LLMP-Grid ({LLMP_MODEL})\": QuantileGridLLMPredictor(...),\n",
" # f\"News Agent ({AGENT_MODEL})\": build_wti_agent_predictor(...),\n",
"}\n",
"\n",
"print(f\"Active predictors ({len(PREDICTORS)}):\")\n",
"for name in PREDICTORS:\n",
" print(f\" {name}\")"
]
"outputs": [],
"source": "from aieng.forecasting.methods import (\n LastValuePredictor,\n QuantileGridLLMPredictor, # noqa: F401\n QuantileGridLLMPredictorConfig, # noqa: F401\n SampledTrajectoryLLMPredictor, # noqa: F401\n SampledTrajectoryLLMPredictorConfig, # noqa: F401\n)\nfrom aieng.forecasting.methods.numerical.darts_arima import DartsAutoARIMAPredictor\nfrom energy_oil_forecasting.analyst_agent import build_wti_agent_predictor, build_wti_news_config # noqa: F401\nfrom energy_oil_forecasting.prophet_baseline import ProphetPredictor # noqa: F401\n\n\n# ── Predictors ────────────────────────────────────────────────────────────────\n# AutoARIMA is the primary method; Naive is the lower-bound baseline.\n# Both are evaluated in every section β€” no contender selection needed.\n# NOTE: AutoARIMA re-fits at every origin (slow on first run; cached after).\nPREDICTORS = {\n \"Naive (Last Value)\": LastValuePredictor(),\n \"AutoARIMA\": DartsAutoARIMAPredictor(),\n # ── Optional comparisons (not the focus of this experiment) ──────────────\n # \"Prophet\": ProphetPredictor(),\n # f\"LLMP-Sampled ({LLMP_MODEL})\": SampledTrajectoryLLMPredictor(\n # SampledTrajectoryLLMPredictorConfig(model=LLMP_MODEL, n_samples=N_SAMPLES)\n # ),\n # f\"LLMP-Grid ({LLMP_MODEL})\": QuantileGridLLMPredictor(\n # QuantileGridLLMPredictorConfig(model=LLMP_MODEL)\n # ),\n # f\"News Agent ({AGENT_MODEL})\": build_wti_agent_predictor(\n # build_wti_news_config(model=AGENT_MODEL)\n # ),\n}\n\nprint(f\"Active predictors ({len(PREDICTORS)}):\")\nfor name in PREDICTORS:\n print(f\" {name}\")"
},
{
"cell_type": "markdown",
Expand All @@ -131,11 +204,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Running 2025 rolling backtest (2 predictor(s))...\n",
"Running 2025 rolling backtest (6 predictor(s))...\n",
"LLM/agent runs are expensive β€” first run will take several minutes.\n",
"\n",
" Naive (Last Value) βœ“\n",
" AutoARIMA βœ“\n",
" Prophet βœ“\n",
" LLMP-Sampled (gemini-3.1-flash-lite-preview) βœ“\n",
" LLMP-Grid (gemini-3.1-flash-lite-preview) βœ“\n",
" News Agent (gemini-3.1-flash-lite-preview) βœ“\n",
"\n",
"All 2025 backtests complete.\n"
]
Expand Down Expand Up @@ -183,12 +260,16 @@
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
"2025 HISTORICAL BACKTEST β€” PERFORMANCE SUMMARY:\n",
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
" Mean CRPS MAE h=21d\n",
"Predictor \n",
"AutoARIMA 2.472053 3.076572\n",
"Naive (Last Value) 2.929931 3.016445\n",
" Mean CRPS MAE h=21d\n",
"Predictor \n",
"AutoARIMA 3.866155 NaN\n",
"LLMP-Grid (gemini-3.1-flash-lite-preview) 4.095949 NaN\n",
"LLMP-Sampled (gemini-3.1-flash-lite-preview) 5.074999 NaN\n",
"Naive (Last Value) 5.834998 NaN\n",
"News Agent (gemini-3.1-flash-lite-preview) 6.134875 NaN\n",
"Prophet 12.845009 NaN\n",
"\n",
"AutoARIMA CRPS improvement over Naive: 0.4579 (15.6%)\n"
"AutoARIMA CRPS improvement over Naive: 1.9688 (33.7%)\n"
]
}
],
Expand Down Expand Up @@ -231,29 +312,24 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "496dc416",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved 2 backtest result(s) to adaptive_agent/curriculum/\n"
]
}
],
"outputs": [],
"source": [
"# ── Save backtest results for NB05 / NB06 ─────────────────────────────────────\n",
"# Downstream notebooks load these with BacktestResult.model_validate_json().\n",
"# The files are gitignored (derived data) and regenerated by re-running NB04.\n",
"# cached_multi_backtest returns {task_id: BacktestResult}; extract single task.\n",
"# Only the two baseline predictors are written to curriculum/ so that\n",
"# uncommenting the optional predictors above does not pollute the files\n",
"# that NB05 and NB06 depend on.\n",
"_CURRICULUM_DIR = Path(\"adaptive_agent/curriculum\")\n",
"# The path is nested and relative to the working directory: create any\n",
"# missing parent as well, instead of raising FileNotFoundError when\n",
"# adaptive_agent/ does not exist yet. exist_ok keeps re-runs idempotent.\n",
"_CURRICULUM_DIR.mkdir(parents=True, exist_ok=True)\n",
"_BASELINE_PREDICTORS = {\"Naive (Last Value)\", \"AutoARIMA\"}\n",
"for _name, _result_dict in backtest_results.items():\n",
" if _name not in _BASELINE_PREDICTORS:\n",
" continue\n",
" _result = next(iter(_result_dict.values()))\n",
" (_CURRICULUM_DIR / f\"backtest_{_name}.json\").write_text(_result.model_dump_json(), encoding=\"utf-8\")\n",
"print(f\"Saved {len(backtest_results)} backtest result(s) to {_CURRICULUM_DIR}/\")"
"print(f\"Saved {sum(n in _BASELINE_PREDICTORS for n in backtest_results)} backtest result(s) to {_CURRICULUM_DIR}/\")"
]
},
{
Expand Down Expand Up @@ -289,6 +365,10 @@
"Running 2026 evaluation...\n",
" Naive (Last Value) βœ“\n",
" AutoARIMA βœ“\n",
" Prophet βœ“\n",
" LLMP-Sampled (gemini-3.1-flash-lite-preview) βœ“\n",
" LLMP-Grid (gemini-3.1-flash-lite-preview) βœ“\n",
" News Agent (gemini-3.1-flash-lite-preview) βœ“\n",
"\n",
"2026 evaluation complete.\n"
]
Expand All @@ -306,25 +386,20 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "a49c24d5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved 2 eval result(s) to adaptive_agent/curriculum/\n"
]
}
],
"outputs": [],
"source": [
"# ── Save eval results for NB06 ────────────────────────────────────────────────\n",
"# eval_results is {predictor_name: {task_id: BacktestResult}}; extract single task.\n",
"# Only baseline predictors are written so uncommenting optional predictors\n",
"# above does not add extra rows to the NB06 scorecard.\n",
"for _name, _result_dict in eval_results.items():\n",
" if _name not in _BASELINE_PREDICTORS:\n",
" continue\n",
" _result = next(iter(_result_dict.values()))\n",
" (_CURRICULUM_DIR / f\"eval_{_name}.json\").write_text(_result.model_dump_json(), encoding=\"utf-8\")\n",
"print(f\"Saved {len(eval_results)} eval result(s) to {_CURRICULUM_DIR}/\")"
"print(f\"Saved {sum(n in _BASELINE_PREDICTORS for n in eval_results)} eval result(s) to {_CURRICULUM_DIR}/\")"
]
},
{
Expand Down Expand Up @@ -353,10 +428,14 @@
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
"2026 EVAL SCORECARD β€” STATELESS BASELINE:\n",
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
" Mean CRPS (2026) MAE h=21d (2026) 80% CI Coverage\n",
"Predictor \n",
"AutoARIMA 10.998416 14.100308 25.0\n",
"Naive (Last Value) 13.643182 13.999166 0.0\n"
" Mean CRPS (2026) MAE h=21d (2026) 80% CI Coverage\n",
"Predictor \n",
"Prophet 2.294532 NaN NaN\n",
"AutoARIMA 6.198680 NaN NaN\n",
"News Agent (gemini-3.1-flash-lite-preview) 7.060907 NaN NaN\n",
"LLMP-Grid (gemini-3.1-flash-lite-preview) 7.624998 NaN NaN\n",
"Naive (Last Value) 8.214998 NaN NaN\n",
"LLMP-Sampled (gemini-3.1-flash-lite-preview) 9.069998 NaN NaN\n"
]
}
],
Expand Down
Loading