Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,18 @@ recent research, with fully reproducible notebooks and evaluation pipelines.
uv run jupyter lab
```

5. Run integration tests to validate that your API keys are set up correctly:

```bash
uv run --env-file .env pytest -sv tests/test_integration.py
```

> **Note:** If your `.env` file is incomplete or needs to be updated, you can re-run onboarding manually from inside your Coder workspace (from the repo root):
>
> ```bash
> onboard --bootcamp-name "llm-interpretability-bootcamp" --output-dir "." --test-script "./aieng-llm-interp/tests/test_integration.py" --env-example "./.env.example" --test-marker "integration_test" --force
> ```

## License

This project is licensed under the terms of the [LICENSE](LICENSE.md) file in the root directory.
Expand Down
317 changes: 132 additions & 185 deletions implementations/agentic_vqa_eval/README.md

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions implementations/agentic_vqa_eval/analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"cell_type": "markdown",
"id": "7fb27b941602401d91542211134fc71a",
"metadata": {},
"source": "# ChartQAPro Evaluation — Analysis Walkthrough\n\nThis notebook walks through the full evaluation artifact stack produced by the\nagentic ChartQAPro framework. By the end you will be able to:\n\n- Load and inspect **MEPs** (Model Evaluation Packets) directly\n- Plot **accuracy by question type** from `metrics.jsonl`\n- Visualise the **verifier revision rate** and its effect on accuracy\n- Chart the **failure taxonomy** breakdown from `taxonomy.jsonl`\n- Browse individual samples — question, plan, vision answer, verifier verdict, chart image\n\n**Prerequisites:** Run these commands first:\n```bash\nuv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps --n 25 --config gemini_gemini\nuv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs --mep_dir meps/gemini_gemini/chartqapro/test --out output/metrics.jsonl\nuv run --env-file .env -m agentic_chartqapro_eval.eval.error_taxonomy --mep_dir meps/gemini_gemini/chartqapro/test --metrics_file output/metrics.jsonl --out output/taxonomy.jsonl\n```"
"source": "# ChartQAPro Evaluation — Analysis Walkthrough\n\nThis notebook walks through the full evaluation artifact stack produced by the\nagentic ChartQAPro framework. By the end you will be able to:\n\n- Load and inspect **MEPs** (Model Evaluation Packets) directly\n- Plot **accuracy by question type** from `metrics.jsonl`\n- Visualise the **verifier revision rate** and its effect on accuracy\n- Chart the **failure taxonomy** breakdown from `taxonomy.jsonl`\n- Browse individual samples — question, plan, vision answer, verifier verdict, chart image\n\n**Prerequisites:** Run these commands first (from any directory in the repo):\n```bash\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.runner.run_generate_meps --n 25 --config gemini_gemini\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.eval.eval_outputs --mep_dir meps/gemini_gemini/chartqapro/test --out output/metrics.jsonl\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.eval.error_taxonomy --mep_dir meps/gemini_gemini/chartqapro/test --metrics_file output/metrics.jsonl --out output/taxonomy.jsonl\n```"
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -272,7 +272,11 @@
"\n",
" wrong = tax_df[tax_df[\"failure_type\"] != \"correct\"]\n",
" print(f\"\\nTotal wrong: {len(wrong)} / {len(tax_df)}\")\n",
" print(f\"Most common failure: {counts[counts.index != 'correct'].idxmax()}\")"
" failure_counts = counts[counts.index != \"correct\"]\n",
" if failure_counts.empty:\n",
" print(\"Most common failure: none (all samples correct)\")\n",
" else:\n",
" print(f\"Most common failure: {failure_counts.idxmax()}\")"
]
},
{
Expand Down
138 changes: 45 additions & 93 deletions implementations/agentic_vqa_eval/run_pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"|---|---|\n",
"| 1 — Configuration | All tunable parameters in one place |\n",
"| 2 — Environment | Check API keys, install path, imports |\n",
"| 2.5 — Opik health check | Verify Opik stack is reachable and API-responsive before running |\n",
"| 2.5 — Langfuse health check | Verify Langfuse credentials are configured before running |\n",
"| 3 — Load dataset | Pull samples from HuggingFace |\n",
"| 4 — Instantiate agents | Build Planner, OCR, Vision, Verifier |\n",
"| 5 — Run pipeline | Generate MEPs (Plan → OCR → Vision → Verify) |\n",
Expand Down Expand Up @@ -119,7 +119,7 @@
" val = os.environ.get(var, \"\")\n",
" needed = needed_for in CONFIG\n",
" if val and not val.startswith(\"your_\"):\n",
" print(f\" ok {var} ({val[:12]}...)\")\n",
" print(f\" ok {var} ({val[:3]}...)\")\n",
" elif needed:\n",
" print(f\" MISSING {var} <- required for {CONFIG}\")\n",
" missing.append(var)\n",
Expand All @@ -141,8 +141,8 @@
"from agentic_chartqapro_eval.eval.eval_outputs import evaluate_mep # noqa: E402\n",
"from agentic_chartqapro_eval.eval.eval_traces import evaluate_trace # noqa: E402\n",
"from agentic_chartqapro_eval.eval.summarize import summarize, write_csv # noqa: E402\n",
"from agentic_chartqapro_eval.langfuse_integration.client import get_client # noqa: E402\n",
"from agentic_chartqapro_eval.mep.writer import iter_meps # noqa: E402\n",
"from agentic_chartqapro_eval.opik_integration.client import get_client # noqa: E402\n",
"from agentic_chartqapro_eval.runner.run_generate_meps import ( # noqa: E402\n",
" BACKEND_CONFIGS,\n",
" process_sample,\n",
Expand All @@ -159,19 +159,17 @@
"id": "cell-opik-hdr",
"metadata": {},
"source": [
"## 2.5 — Opik Health Check\n",
"## 2.5 — Langfuse Health Check\n",
"\n",
"Verifies that the self-hosted Opik stack is **fully operational** before the pipeline runs.\n",
"Three checks are run in sequence:\n",
"Verifies that Langfuse credentials are configured before the pipeline runs.\n",
"\n",
"| Check | What it tests |\n",
"|---|---|\n",
"| HTTP reachable | TCP connection to `OPIK_URL_OVERRIDE` succeeds within 5 s |\n",
"| Client init | `opik.Opik()` initialises without error |\n",
"| API read test | A lightweight `search_traces` call returns a valid response |\n",
"| Env vars present | `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set in `.env` |\n",
"| Client init | `Langfuse()` initialises without error |\n",
"\n",
"If `OPIK_URL_OVERRIDE` is not set the cell prints a skip notice and continues — Opik is optional.\n",
"If any check fails the pipeline still runs; only tracing is affected."
"If the keys are absent the cell prints a skip notice and continues — Langfuse is optional.\n",
"The pipeline produces identical MEPs with or without it; tracing is purely additive."
]
},
{
Expand All @@ -181,107 +179,61 @@
"metadata": {},
"outputs": [],
"source": [
"import urllib.error\n",
"import urllib.request\n",
"\n",
"# Force re-initialisation so re-running this cell after starting Docker works correctly\n",
"from agentic_chartqapro_eval.opik_integration.client import reset_client\n",
"from agentic_chartqapro_eval.langfuse_integration.client import reset_client\n",
"\n",
"\n",
"# Force re-initialisation so re-running this cell picks up any .env changes\n",
"reset_client()\n",
"\n",
"OPIK_URL = os.environ.get(\"OPIK_URL_OVERRIDE\", \"\")\n",
"lf_public = os.environ.get(\"LANGFUSE_PUBLIC_KEY\", \"\")\n",
"lf_secret = os.environ.get(\"LANGFUSE_SECRET_KEY\", \"\")\n",
"\n",
"if not OPIK_URL:\n",
" print(\"[skip] OPIK_URL_OVERRIDE is not set.\")\n",
" print(\" Opik tracing is disabled. Pipeline will run fine without it.\")\n",
"if not lf_public or not lf_secret:\n",
" print(\"[skip] LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are not set.\")\n",
" print(\" Langfuse tracing is disabled. Pipeline will run fine without it.\")\n",
" print()\n",
" print(\"To enable Opik tracing:\")\n",
" print(\" 1. Start the Docker stack:\")\n",
" print(\" cd /path/to/opik/deployment/docker-compose\")\n",
" print(\" docker compose --profile opik up -d\")\n",
" print(\" 2. Add to .env: OPIK_URL_OVERRIDE=http://localhost:5173/api\")\n",
" print(\" 3. Re-run this cell.\")\n",
" print(\"To enable Langfuse tracing, add to .env:\")\n",
" print(\" LANGFUSE_PUBLIC_KEY=pk-lf-...\")\n",
" print(\" LANGFUSE_SECRET_KEY=sk-lf-...\")\n",
" print(\" # LANGFUSE_HOST=https://cloud.langfuse.com (default; change for self-hosted)\")\n",
"else:\n",
" results = {}\n",
"\n",
" # -- Check 1: HTTP reachability (any response = server is up) --\n",
" try:\n",
" with urllib.request.urlopen(OPIK_URL, timeout=5) as r:\n",
" results[\"http\"] = (\"ok\", f\"HTTP {r.status}\")\n",
" except urllib.error.HTTPError as e:\n",
" # HTTPError means server responded -- it is up, just returned a non-200\n",
" results[\"http\"] = (\"ok\", f\"HTTP {e.code} (server responded)\")\n",
" except Exception as e:\n",
" results[\"http\"] = (\"fail\", str(e))\n",
" # -- Check 1: Env vars present --\n",
" results[\"env\"] = (\"ok\", f\"pk={lf_public[:3]}...\")\n",
"\n",
" # -- Check 2: Opik Python client initialises --\n",
" _opik_hc = None\n",
" # -- Check 2: Client initialises --\n",
" try:\n",
" from agentic_chartqapro_eval.opik_integration.client import get_client\n",
"\n",
" _opik_hc = get_client()\n",
" if _opik_hc is not None:\n",
" results[\"client\"] = (\"ok\", \"opik.Opik() ready\")\n",
" _lf_hc = get_client()\n",
" if _lf_hc is not None:\n",
" results[\"client\"] = (\"ok\", \"Langfuse() ready\")\n",
" else:\n",
" results[\"client\"] = (\"fail\", \"get_client() returned None\")\n",
" except Exception as e:\n",
" results[\"client\"] = (\"fail\", str(e))\n",
"\n",
" # -- Check 3: API actually responds to a lightweight read --\n",
" if results.get(\"client\", (\"\",))[0] == \"ok\" and _opik_hc is not None:\n",
" try:\n",
" traces = _opik_hc.search_traces(max_results=1)\n",
" results[\"api\"] = (\"ok\", f\"search_traces returned {len(traces)} result(s)\")\n",
" except Exception as e:\n",
" err_str = str(e)\n",
" hint = \"\"\n",
" if \"readonly\" in err_str.lower() or \"500\" in err_str:\n",
" hint = \" [ClickHouse replica may be read-only -- run SYSTEM RESTORE REPLICA]\"\n",
" results[\"api\"] = (\"fail\", err_str[:120] + hint)\n",
" else:\n",
" results[\"api\"] = (\"skip\", \"client unavailable\")\n",
"\n",
" # -- Report --\n",
" print(f\"Opik URL : {OPIK_URL}\")\n",
" print()\n",
" labels = [\n",
" (\"http\", \"HTTP reachable \"),\n",
" (\"client\", \"Client init \"),\n",
" (\"api\", \"API read test \"),\n",
" (\"env\", \"Env vars present\"),\n",
" (\"client\", \"Client init \"),\n",
" ]\n",
" all_ok = True\n",
" for key, label in labels:\n",
" status, detail = results.get(key, (\"skip\", \"\"))\n",
" if status == \"ok\":\n",
" marker = \"✓ OK \"\n",
" elif status == \"skip\":\n",
" marker = \"⊘ skip\"\n",
" else:\n",
" marker = \"✗ FAIL\"\n",
" marker = \"✓ OK \" if status == \"ok\" else (\"⊘ skip\" if status == \"skip\" else \"✗ FAIL\")\n",
" if status not in (\"ok\", \"skip\"):\n",
" all_ok = False\n",
" print(f\" {marker} {label} {detail}\")\n",
"\n",
" print()\n",
" if all_ok:\n",
" dashboard_url = OPIK_URL.rstrip(\"/\").removesuffix(\"/api\")\n",
" print(\"✓ Opik is fully operational.\")\n",
" print(f\"Dashboard : {dashboard_url}\")\n",
" lf_host = os.environ.get(\"LANGFUSE_HOST\") or os.environ.get(\"LANGFUSE_BASE_URL\") or \"https://cloud.langfuse.com\"\n",
" print(\"✓ Langfuse is configured.\")\n",
" print(f\"Host : {lf_host}\")\n",
" print(\"Traces and scores will be recorded automatically during the pipeline run.\")\n",
" else:\n",
" print(\"⚠ WARNING: One or more Opik checks failed.\")\n",
" print(\"The pipeline will still run; Opik tracing may not work correctly.\")\n",
" if results.get(\"http\", (\"\",))[0] == \"fail\":\n",
" print()\n",
" print(\" Docker stack appears to be down. To start it:\")\n",
" print(\" cd /path/to/opik/deployment/docker-compose\")\n",
" print(\" docker compose --profile opik up -d\")\n",
" if results.get(\"api\", (\"\",))[0] == \"fail\":\n",
" print()\n",
" print(\" API is reachable but not responding correctly.\")\n",
" print(\" Check ClickHouse replica state:\")\n",
" print(\" docker exec opik-clickhouse-1 clickhouse-client --query \\\\\")\n",
" print(\" \\\"SELECT database,table,is_readonly FROM system.replicas WHERE database='opik'\\\"\")"
" print(\"⚠ WARNING: Langfuse client failed to initialise.\")\n",
" print(\"The pipeline will still run; tracing will be skipped.\")"
]
},
{
Expand Down Expand Up @@ -376,10 +328,10 @@
"else:\n",
" print(\"OcrReaderTool : disabled (USE_OCR=False)\")\n",
"\n",
"# Opik observability (no-op if OPIK_URL_OVERRIDE not set)\n",
"opik_client = get_client()\n",
"opik_status = \"enabled\" if opik_client else \"not configured\"\n",
"print(f\"Opik : {opik_status}\")"
"# Langfuse observability (no-op if keys not set)\n",
"lf_client = get_client()\n",
"lf_status = \"enabled\" if lf_client else \"not configured\"\n",
"print(f\"Langfuse : {lf_status}\")"
]
},
{
Expand Down Expand Up @@ -421,7 +373,7 @@
" config,\n",
" RUN_ID,\n",
" OUT_DIR,\n",
" opik_client=opik_client,\n",
" lf_client=lf_client,\n",
" verifier_agent=verifier,\n",
" ocr_tool=ocr,\n",
" )\n",
Expand Down Expand Up @@ -459,8 +411,8 @@
"## 6 — Inspect First MEP\n",
"\n",
"MEPs are self-contained JSON files. Every field you see here is what the agent actually\n",
"produced — no post-processing. The `opik_trace_id` links this MEP back to the live trace\n",
"in the Opik dashboard if Opik is configured."
"produced — no post-processing. The `lf_trace_id` links this MEP back to the live trace\n",
"in the Langfuse dashboard if Langfuse is configured."
]
},
{
Expand Down Expand Up @@ -501,8 +453,8 @@
" print(\"Timestamps (ms):\")\n",
" for k in [\"planner_ms\", \"ocr_ms\", \"vision_ms\", \"verifier_ms\"]:\n",
" print(f\" {k:<16} {ts.get(k, 0):.0f}\")\n",
" if mep.get(\"opik_trace_id\"):\n",
" print(f\"Opik trace ID: {mep['opik_trace_id']}\")\n",
" if mep.get(\"lf_trace_id\"):\n",
" print(f\"Langfuse trace ID: {mep['lf_trace_id']}\")\n",
" print(\"=\" * 64)\n",
"\n",
" img_path = s.get(\"image_ref\", {}).get(\"path\", \"\")\n",
Expand Down Expand Up @@ -609,7 +561,7 @@
" config,\n",
" RUN_ID_NO_OCR,\n",
" OUT_DIR_NO_OCR,\n",
" opik_client=opik_client,\n",
" lf_client=lf_client,\n",
" verifier_agent=verifier,\n",
" ocr_tool=None, # <-- OCR disabled\n",
" )\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from crewai import LLM, Agent, Crew, Task

from ..datasets.perceived_sample import PerceivedSample
from ..opik_integration.tracing import close_span, open_llm_span
from ..langfuse_integration.tracing import close_span, open_llm_span
from ..utils.json_strict import parse_strict


Expand Down Expand Up @@ -137,7 +137,7 @@ def __init__(
self.api_key = api_key
self._llm = _build_llm(backend, model, api_key)

def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dict, bool, str]:
def run(self, sample: PerceivedSample, lf_trace: Any = None) -> Tuple[str, dict, bool, str]:
"""
Execute the planning phase for a new question.

Expand All @@ -148,7 +148,7 @@ def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dic
----------
sample : PerceivedSample
The question and context to plan for.
opik_trace : Any, optional
lf_trace : Any, optional
Observability object for logging.

Returns
Expand All @@ -165,7 +165,7 @@ def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dic
prompt = build_planner_prompt(sample)

span = open_llm_span(
opik_trace,
lf_trace,
name="planner",
input_data={"prompt": prompt},
model=self.model,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from openai import OpenAI
from PIL import Image

from ..opik_integration.tracing import close_span, open_llm_span
from ..langfuse_integration.tracing import close_span, open_llm_span
from ..utils.json_strict import parse_strict


Expand Down Expand Up @@ -203,7 +203,7 @@ def run(
sample, # PerceivedSample
plan: dict,
vision_parsed: dict,
opik_trace: Any = None,
lf_trace: Any = None,
) -> Tuple[str, dict, bool, str]:
"""
Critically audit a draft answer using a single VLM call.
Expand All @@ -216,7 +216,7 @@ def run(
The inspection plan used by the previous agent.
vision_parsed : dict
The draft answer and explanation to audit.
opik_trace : Any, optional
lf_trace : Any, optional
Tracing object for observability.

Returns
Expand Down Expand Up @@ -250,7 +250,7 @@ def run(
)

span = open_llm_span(
opik_trace,
lf_trace,
name="verifier",
input_data={"prompt": prompt, "draft_answer": draft_answer},
model=self.model,
Expand Down
Loading
Loading