import warnings
from pathlib import Path

import energy_oil_forecasting
import pandas as pd
import yaml
from aieng.forecasting.evaluation import (
    MultiTargetBacktestSpec,
    cached_multi_backtest,
    describe_spec,
)
from energy_oil_forecasting.data import build_wti_service


# NOTE(review): this silences *every* warning for the whole notebook, including
# ones that may matter (deprecations, convergence warnings). Kept as-is to
# preserve the recorded output; consider narrowing it by category/module.
warnings.filterwarnings("ignore")

# ── Mode ──────────────────────────────────────────────────────────────────────
# Set SMOKE_TEST = True to run a 2-origin, 1-sample version of the notebook
# for fast local development and end-to-end CI testing.  The full specs run
# 51 backtest + 8 eval origins; smoke runs 2 + 2.
SMOKE_TEST = True

# ── Model selection ───────────────────────────────────────────────────────────
# Two project models: "gemini-3.1-flash-lite-preview" (lite/default) and
# "gemini-3.5-flash" (advanced).  Change these two lines to swap models for the
# whole notebook (bare proxy names — no "gemini/" prefix).
AGENT_MODEL = "gemini-3.1-flash-lite-preview"
LLMP_MODEL = "gemini-3.1-flash-lite-preview"

# ── Derived settings (do not edit below) ─────────────────────────────────────
N_SAMPLES = 1 if SMOKE_TEST else 3  # trajectories per LLMP call

data_service = build_wti_service()

# Spec YAML files ship inside the installed ``energy_oil_forecasting`` package.
spec_dir = Path(energy_oil_forecasting.__file__).parent / "specs"
if SMOKE_TEST:
    backtest_file, eval_file = "energy_oil_smoke.yaml", "energy_oil_eval_smoke.yaml"
else:
    backtest_file, eval_file = "energy_oil_backtest.yaml", "energy_oil_eval.yaml"


def _load_spec(filename: str) -> MultiTargetBacktestSpec:
    """Read one YAML spec from ``spec_dir`` and validate it.

    Args:
        filename: File name (not a path) of the spec inside ``spec_dir``.

    Returns:
        The parsed document validated via ``MultiTargetBacktestSpec.model_validate``.

    Raises:
        FileNotFoundError: If ``spec_dir / filename`` does not exist.
    """
    # The spec descriptions contain non-ASCII characters (e.g. "×"), so decode
    # explicitly as UTF-8 rather than with the platform's locale encoding,
    # which would garble the text on non-UTF-8 systems.
    with open(spec_dir / filename, encoding="utf-8") as spec_file:
        return MultiTargetBacktestSpec.model_validate(yaml.safe_load(spec_file))


backtest_spec = _load_spec(backtest_file)
eval_spec = _load_spec(eval_file)

print(
    f"{'⚡ SMOKE MODE' if SMOKE_TEST else '📊 FULL MODE'} — AGENT_MODEL={AGENT_MODEL!r}  LLMP_MODEL={LLMP_MODEL!r}  N_SAMPLES={N_SAMPLES}"
)
print()
print("━" * 72)
print("LOADED SPECIFICATIONS:")
print("━" * 72)
print(describe_spec(backtest_spec, data_service))
print(describe_spec(eval_spec, data_service))
from aieng.forecasting.methods import (
    LastValuePredictor,
    QuantileGridLLMPredictor,
    QuantileGridLLMPredictorConfig,
    SampledTrajectoryLLMPredictor,
    SampledTrajectoryLLMPredictorConfig,
)
from aieng.forecasting.methods.numerical.darts_arima import DartsAutoARIMAPredictor
from energy_oil_forecasting.analyst_agent import (
    build_wti_agent_predictor,
    build_wti_news_config,
)
from energy_oil_forecasting.prophet_baseline import ProphetPredictor


# ── Optional comparisons switch ───────────────────────────────────────────────
# False (default): only the two baseline predictors are evaluated.
# True: Prophet, both LLMP variants and the news agent are added as well.
# An explicit switch replaces the former commented-out dict entries, so the
# optional runs can be reproduced without editing code, and the imports above
# are genuinely used (no lint suppressions needed).
INCLUDE_OPTIONAL_PREDICTORS = False

# ── Predictors ────────────────────────────────────────────────────────────────
# AutoARIMA is the primary method; Naive is the lower-bound baseline.
# Both are evaluated in every section — no contender selection needed.
# NOTE: AutoARIMA re-fits at every origin (slow on first run; cached after).
PREDICTORS = {
    "Naive (Last Value)": LastValuePredictor(),
    "AutoARIMA": DartsAutoARIMAPredictor(),
}

if INCLUDE_OPTIONAL_PREDICTORS:
    # Optional comparisons (not the focus of this experiment).
    # NOTE(review): constructor arguments are carried over verbatim from the
    # previously commented-out entries — confirm against the current APIs.
    # LLMP_MODEL, AGENT_MODEL and N_SAMPLES come from the configuration cell.
    PREDICTORS.update(
        {
            "Prophet": ProphetPredictor(),
            f"LLMP-Sampled ({LLMP_MODEL})": SampledTrajectoryLLMPredictor(
                SampledTrajectoryLLMPredictorConfig(model=LLMP_MODEL, n_samples=N_SAMPLES)
            ),
            f"LLMP-Grid ({LLMP_MODEL})": QuantileGridLLMPredictor(
                QuantileGridLLMPredictorConfig(model=LLMP_MODEL)
            ),
            f"News Agent ({AGENT_MODEL})": build_wti_agent_predictor(
                build_wti_news_config(model=AGENT_MODEL)
            ),
        }
    )

print(f"Active predictors ({len(PREDICTORS)}):")
for name in PREDICTORS:
    print(f"  {name}")
# ── Save backtest results for NB05 / NB06 ────────────────────────────────────
# Only the two baseline predictors are written to curriculum/ so that
# enabling the optional predictors does not pollute the files that NB05 and
# NB06 depend on.
# backtest_results maps predictor name -> {task_id: result}; only the first
# task's result is written (the loaded specs define a single task).
# The path is relative to the notebook's working directory.
_CURRICULUM_DIR = Path("adaptive_agent/curriculum")
# parents=True: also create adaptive_agent/ when it does not exist yet, instead
# of failing with FileNotFoundError on a fresh checkout / different CWD.
_CURRICULUM_DIR.mkdir(parents=True, exist_ok=True)
_BASELINE_PREDICTORS = {"Naive (Last Value)", "AutoARIMA"}

_saved_backtests = []
for _name, _result_dict in backtest_results.items():
    if _name not in _BASELINE_PREDICTORS:
        continue
    _result = next(iter(_result_dict.values()))
    _path = _CURRICULUM_DIR / f"backtest_{_name}.json"
    _path.write_text(_result.model_dump_json(), encoding="utf-8")
    _saved_backtests.append(_path.name)

# Report what was actually written rather than re-deriving the count.
print(f"Saved {len(_saved_backtests)} backtest result(s) to {_CURRICULUM_DIR}/")

# The filter matches on display names, so a renamed or removed baseline would
# otherwise be skipped silently and leave stale files for downstream notebooks.
_missing_baselines = sorted(_BASELINE_PREDICTORS - set(backtest_results))
if _missing_baselines:
    print(f"WARNING: no backtest result found for baseline(s): {_missing_baselines}")
# ── Save eval results for NB06 ───────────────────────────────────────────────
# eval_results maps predictor name -> {task_id: result}.  Keep only the
# baseline predictors, so that any additional predictors evaluated in this
# notebook do not add extra rows to the NB06 scorecard, then write the first
# (single) task result of each one as JSON into the curriculum directory.
_eval_baselines = {
    _label: _by_task for _label, _by_task in eval_results.items() if _label in _BASELINE_PREDICTORS
}
for _label, _by_task in _eval_baselines.items():
    _first_task_result = next(iter(_by_task.values()))
    _target = _CURRICULUM_DIR / f"eval_{_label}.json"
    _target.write_text(_first_task_result.model_dump_json(), encoding="utf-8")
print(f"Saved {len(_eval_baselines)} eval result(s) to {_CURRICULUM_DIR}/")