VectorInstitute · aravind-3105 · Mar 19, 2026 · Mar 16, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/README.md b/README.md
@@ -72,6 +72,18 @@ recent research, with fully reproducible notebooks and evaluation pipelines.
    uv run jupyter lab
    ```
 
+5. Run integration tests to validate that your API keys are set up correctly:
+
+   ```bash
+   uv run --env-file .env pytest -sv tests/test_integration.py
+   ```
+
+   > **Note:** If your `.env` file is incomplete or needs to be updated, you can re-run onboarding manually from inside your Coder workspace (from the repo root):
+   >
+   > ```bash
+   > onboard --bootcamp-name "llm-interpretability-bootcamp" --output-dir "." --test-script "./aieng-llm-interp/tests/test_integration.py" --env-example "./.env.example" --test-marker "integration_test" --force
+   > ```
+
 ## License
 
 This project is licensed under the terms of the [LICENSE](LICENSE.md) file in the root directory.

diff --git a/implementations/agentic_vqa_eval/README.md b/implementations/agentic_vqa_eval/README.md
diff --git a/implementations/agentic_vqa_eval/analysis.ipynb b/implementations/agentic_vqa_eval/analysis.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "id": "7fb27b941602401d91542211134fc71a",
    "metadata": {},
-   "source": "# ChartQAPro Evaluation — Analysis Walkthrough\n\nThis notebook walks through the full evaluation artifact stack produced by the\nagentic ChartQAPro framework. By the end you will be able to:\n\n- Load and inspect **MEPs** (Model Evaluation Packets) directly\n- Plot **accuracy by question type** from `metrics.jsonl`\n- Visualise the **verifier revision rate** and its effect on accuracy\n- Chart the **failure taxonomy** breakdown from `taxonomy.jsonl`\n- Browse individual samples — question, plan, vision answer, verifier verdict, chart image\n\n**Prerequisites:** Run these commands first:\n```bash\nuv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps --n 25 --config gemini_gemini\nuv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs  --mep_dir meps/gemini_gemini/chartqapro/test --out output/metrics.jsonl\nuv run --env-file .env -m agentic_chartqapro_eval.eval.error_taxonomy --mep_dir meps/gemini_gemini/chartqapro/test --metrics_file output/metrics.jsonl --out output/taxonomy.jsonl\n```"
+   "source": "# ChartQAPro Evaluation — Analysis Walkthrough\n\nThis notebook walks through the full evaluation artifact stack produced by the\nagentic ChartQAPro framework. By the end you will be able to:\n\n- Load and inspect **MEPs** (Model Evaluation Packets) directly\n- Plot **accuracy by question type** from `metrics.jsonl`\n- Visualise the **verifier revision rate** and its effect on accuracy\n- Chart the **failure taxonomy** breakdown from `taxonomy.jsonl`\n- Browse individual samples — question, plan, vision answer, verifier verdict, chart image\n\n**Prerequisites:** Run these commands first (from any directory in the repo):\n```bash\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.runner.run_generate_meps --n 25 --config gemini_gemini\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.eval.eval_outputs  --mep_dir meps/gemini_gemini/chartqapro/test --out output/metrics.jsonl\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.eval.error_taxonomy --mep_dir meps/gemini_gemini/chartqapro/test --metrics_file output/metrics.jsonl --out output/taxonomy.jsonl\n```"
   },
   {
    "cell_type": "markdown",
@@ -272,7 +272,11 @@
     "\n",
     "    wrong = tax_df[tax_df[\"failure_type\"] != \"correct\"]\n",
     "    print(f\"\\nTotal wrong: {len(wrong)} / {len(tax_df)}\")\n",
-    "    print(f\"Most common failure: {counts[counts.index != 'correct'].idxmax()}\")"
+    "    failure_counts = counts[counts.index != \"correct\"]\n",
+    "    if failure_counts.empty:\n",
+    "        print(\"Most common failure: none (all samples correct)\")\n",
+    "    else:\n",
+    "        print(f\"Most common failure: {failure_counts.idxmax()}\")"
    ]
   },
   {

diff --git a/implementations/agentic_vqa_eval/run_pipeline.ipynb b/implementations/agentic_vqa_eval/run_pipeline.ipynb
@@ -16,7 +16,7 @@
     "|---|---|\n",
     "| 1 — Configuration | All tunable parameters in one place |\n",
     "| 2 — Environment | Check API keys, install path, imports |\n",
-    "| 2.5 — Opik health check | Verify Opik stack is reachable and API-responsive before running |\n",
+    "| 2.5 — Langfuse health check | Verify Langfuse credentials are configured before running |\n",
     "| 3 — Load dataset | Pull samples from HuggingFace |\n",
     "| 4 — Instantiate agents | Build Planner, OCR, Vision, Verifier |\n",
     "| 5 — Run pipeline | Generate MEPs (Plan → OCR → Vision → Verify) |\n",
@@ -119,7 +119,7 @@
     "    val = os.environ.get(var, \"\")\n",
     "    needed = needed_for in CONFIG\n",
     "    if val and not val.startswith(\"your_\"):\n",
-    "        print(f\"  ok  {var}  ({val[:12]}...)\")\n",
+    "        print(f\"  ok  {var}  ({val[:3]}...)\")\n",
     "    elif needed:\n",
     "        print(f\"  MISSING  {var}  <- required for {CONFIG}\")\n",
     "        missing.append(var)\n",
@@ -141,8 +141,8 @@
     "from agentic_chartqapro_eval.eval.eval_outputs import evaluate_mep  # noqa: E402\n",
     "from agentic_chartqapro_eval.eval.eval_traces import evaluate_trace  # noqa: E402\n",
     "from agentic_chartqapro_eval.eval.summarize import summarize, write_csv  # noqa: E402\n",
+    "from agentic_chartqapro_eval.langfuse_integration.client import get_client  # noqa: E402\n",
     "from agentic_chartqapro_eval.mep.writer import iter_meps  # noqa: E402\n",
-    "from agentic_chartqapro_eval.opik_integration.client import get_client  # noqa: E402\n",
     "from agentic_chartqapro_eval.runner.run_generate_meps import (  # noqa: E402\n",
     "    BACKEND_CONFIGS,\n",
     "    process_sample,\n",
@@ -159,19 +159,17 @@
    "id": "cell-opik-hdr",
    "metadata": {},
    "source": [
-    "## 2.5 — Opik Health Check\n",
+    "## 2.5 — Langfuse Health Check\n",
     "\n",
-    "Verifies that the self-hosted Opik stack is **fully operational** before the pipeline runs.\n",
-    "Three checks are run in sequence:\n",
+    "Verifies that Langfuse credentials are configured before the pipeline runs.\n",
     "\n",
     "| Check | What it tests |\n",
     "|---|---|\n",
-    "| HTTP reachable | TCP connection to `OPIK_URL_OVERRIDE` succeeds within 5 s |\n",
-    "| Client init | `opik.Opik()` initialises without error |\n",
-    "| API read test | A lightweight `search_traces` call returns a valid response |\n",
+    "| Env vars present | `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set in the environment (e.g. loaded from `.env`) |\n",
+    "| Client init | `Langfuse()` initialises without error |\n",
     "\n",
-    "If `OPIK_URL_OVERRIDE` is not set the cell prints a skip notice and continues — Opik is optional.\n",
-    "If any check fails the pipeline still runs; only tracing is affected."
+    "If either key is absent the cell prints a skip notice and continues — Langfuse is optional.\n",
+    "The pipeline produces identical MEPs with or without it; tracing is purely additive."
    ]
   },
   {
@@ -181,107 +179,61 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import urllib.error\n",
-    "import urllib.request\n",
-    "\n",
-    "# Force re-initialisation so re-running this cell after starting Docker works correctly\n",
-    "from agentic_chartqapro_eval.opik_integration.client import reset_client\n",
+    "from agentic_chartqapro_eval.langfuse_integration.client import reset_client\n",
     "\n",
     "\n",
+    "# Force re-initialisation so re-running this cell picks up any .env changes\n",
     "reset_client()\n",
     "\n",
-    "OPIK_URL = os.environ.get(\"OPIK_URL_OVERRIDE\", \"\")\n",
+    "lf_public = os.environ.get(\"LANGFUSE_PUBLIC_KEY\", \"\")\n",
+    "lf_secret = os.environ.get(\"LANGFUSE_SECRET_KEY\", \"\")\n",
     "\n",
-    "if not OPIK_URL:\n",
-    "    print(\"[skip] OPIK_URL_OVERRIDE is not set.\")\n",
-    "    print(\"       Opik tracing is disabled. Pipeline will run fine without it.\")\n",
+    "if not lf_public or not lf_secret:\n",
+    "    print(\"[skip] LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are not set.\")\n",
+    "    print(\"       Langfuse tracing is disabled. Pipeline will run fine without it.\")\n",
     "    print()\n",
-    "    print(\"To enable Opik tracing:\")\n",
-    "    print(\"  1. Start the Docker stack:\")\n",
-    "    print(\"       cd /path/to/opik/deployment/docker-compose\")\n",
-    "    print(\"       docker compose --profile opik up -d\")\n",
-    "    print(\"  2. Add to .env:  OPIK_URL_OVERRIDE=http://localhost:5173/api\")\n",
-    "    print(\"  3. Re-run this cell.\")\n",
+    "    print(\"To enable Langfuse tracing, add to .env:\")\n",
+    "    print(\"  LANGFUSE_PUBLIC_KEY=pk-lf-...\")\n",
+    "    print(\"  LANGFUSE_SECRET_KEY=sk-lf-...\")\n",
+    "    print(\"  # LANGFUSE_HOST=https://cloud.langfuse.com  (default; change for self-hosted)\")\n",
     "else:\n",
     "    results = {}\n",
     "\n",
-    "    # -- Check 1: HTTP reachability (any response = server is up) --\n",
-    "    try:\n",
-    "        with urllib.request.urlopen(OPIK_URL, timeout=5) as r:\n",
-    "            results[\"http\"] = (\"ok\", f\"HTTP {r.status}\")\n",
-    "    except urllib.error.HTTPError as e:\n",
-    "        # HTTPError means server responded -- it is up, just returned a non-200\n",
-    "        results[\"http\"] = (\"ok\", f\"HTTP {e.code} (server responded)\")\n",
-    "    except Exception as e:\n",
-    "        results[\"http\"] = (\"fail\", str(e))\n",
+    "    # -- Check 1: Env vars present --\n",
+    "    results[\"env\"] = (\"ok\", f\"pk={lf_public[:3]}...\")\n",
     "\n",
-    "    # -- Check 2: Opik Python client initialises --\n",
-    "    _opik_hc = None\n",
+    "    # -- Check 2: Client initialises --\n",
     "    try:\n",
-    "        from agentic_chartqapro_eval.opik_integration.client import get_client\n",
-    "\n",
-    "        _opik_hc = get_client()\n",
-    "        if _opik_hc is not None:\n",
-    "            results[\"client\"] = (\"ok\", \"opik.Opik() ready\")\n",
+    "        _lf_hc = get_client()\n",
+    "        if _lf_hc is not None:\n",
+    "            results[\"client\"] = (\"ok\", \"Langfuse() ready\")\n",
     "        else:\n",
     "            results[\"client\"] = (\"fail\", \"get_client() returned None\")\n",
     "    except Exception as e:\n",
     "        results[\"client\"] = (\"fail\", str(e))\n",
     "\n",
-    "    # -- Check 3: API actually responds to a lightweight read --\n",
-    "    if results.get(\"client\", (\"\",))[0] == \"ok\" and _opik_hc is not None:\n",
-    "        try:\n",
-    "            traces = _opik_hc.search_traces(max_results=1)\n",
-    "            results[\"api\"] = (\"ok\", f\"search_traces returned {len(traces)} result(s)\")\n",
-    "        except Exception as e:\n",
-    "            err_str = str(e)\n",
-    "            hint = \"\"\n",
-    "            if \"readonly\" in err_str.lower() or \"500\" in err_str:\n",
-    "                hint = \" [ClickHouse replica may be read-only -- run SYSTEM RESTORE REPLICA]\"\n",
-    "            results[\"api\"] = (\"fail\", err_str[:120] + hint)\n",
-    "    else:\n",
-    "        results[\"api\"] = (\"skip\", \"client unavailable\")\n",
-    "\n",
     "    # -- Report --\n",
-    "    print(f\"Opik URL : {OPIK_URL}\")\n",
-    "    print()\n",
     "    labels = [\n",
-    "        (\"http\", \"HTTP reachable \"),\n",
-    "        (\"client\", \"Client init    \"),\n",
-    "        (\"api\", \"API read test  \"),\n",
+    "        (\"env\", \"Env vars present\"),\n",
+    "        (\"client\", \"Client init     \"),\n",
     "    ]\n",
     "    all_ok = True\n",
     "    for key, label in labels:\n",
     "        status, detail = results.get(key, (\"skip\", \"\"))\n",
-    "        if status == \"ok\":\n",
-    "            marker = \"✓ OK  \"\n",
-    "        elif status == \"skip\":\n",
-    "            marker = \"⊘ skip\"\n",
-    "        else:\n",
-    "            marker = \"✗ FAIL\"\n",
+    "        marker = \"✓ OK  \" if status == \"ok\" else (\"⊘ skip\" if status == \"skip\" else \"✗ FAIL\")\n",
+    "        if status not in (\"ok\", \"skip\"):\n",
     "            all_ok = False\n",
     "        print(f\"  {marker}  {label}  {detail}\")\n",
     "\n",
     "    print()\n",
     "    if all_ok:\n",
-    "        dashboard_url = OPIK_URL.rstrip(\"/\").removesuffix(\"/api\")\n",
-    "        print(\"✓ Opik is fully operational.\")\n",
-    "        print(f\"Dashboard : {dashboard_url}\")\n",
+    "        lf_host = os.environ.get(\"LANGFUSE_HOST\") or os.environ.get(\"LANGFUSE_BASE_URL\") or \"https://cloud.langfuse.com\"\n",
+    "        print(\"✓ Langfuse is configured.\")\n",
+    "        print(f\"Host      : {lf_host}\")\n",
     "        print(\"Traces and scores will be recorded automatically during the pipeline run.\")\n",
     "    else:\n",
-    "        print(\"⚠ WARNING: One or more Opik checks failed.\")\n",
-    "        print(\"The pipeline will still run; Opik tracing may not work correctly.\")\n",
-    "        if results.get(\"http\", (\"\",))[0] == \"fail\":\n",
-    "            print()\n",
-    "            print(\"  Docker stack appears to be down. To start it:\")\n",
-    "            print(\"    cd /path/to/opik/deployment/docker-compose\")\n",
-    "            print(\"    docker compose --profile opik up -d\")\n",
-    "        if results.get(\"api\", (\"\",))[0] == \"fail\":\n",
-    "            print()\n",
-    "            print(\"  API is reachable but not responding correctly.\")\n",
-    "            print(\"  Check ClickHouse replica state:\")\n",
-    "            print(\"    docker exec opik-clickhouse-1 clickhouse-client --query \\\\\")\n",
-    "            print(\"      \\\"SELECT database,table,is_readonly FROM system.replicas WHERE database='opik'\\\"\")"
+    "        print(\"⚠ WARNING: Langfuse client failed to initialise.\")\n",
+    "        print(\"The pipeline will still run; tracing will be skipped.\")"
    ]
   },
   {
@@ -376,10 +328,10 @@
     "else:\n",
     "    print(\"OcrReaderTool  : disabled (USE_OCR=False)\")\n",
     "\n",
-    "# Opik observability (no-op if OPIK_URL_OVERRIDE not set)\n",
-    "opik_client = get_client()\n",
-    "opik_status = \"enabled\" if opik_client else \"not configured\"\n",
-    "print(f\"Opik           : {opik_status}\")"
+    "# Langfuse observability (no-op if keys not set)\n",
+    "lf_client = get_client()\n",
+    "lf_status = \"enabled\" if lf_client else \"not configured\"\n",
+    "print(f\"Langfuse       : {lf_status}\")"
    ]
   },
   {
@@ -421,7 +373,7 @@
     "        config,\n",
     "        RUN_ID,\n",
     "        OUT_DIR,\n",
-    "        opik_client=opik_client,\n",
+    "        lf_client=lf_client,\n",
     "        verifier_agent=verifier,\n",
     "        ocr_tool=ocr,\n",
     "    )\n",
@@ -459,8 +411,8 @@
     "## 6 — Inspect First MEP\n",
     "\n",
     "MEPs are self-contained JSON files. Every field you see here is what the agent actually\n",
-    "produced — no post-processing. The `opik_trace_id` links this MEP back to the live trace\n",
-    "in the Opik dashboard if Opik is configured."
+    "produced — no post-processing. The `lf_trace_id` links this MEP back to the live trace\n",
+    "in the Langfuse dashboard if Langfuse is configured."
    ]
   },
   {
@@ -501,8 +453,8 @@
     "    print(\"Timestamps (ms):\")\n",
     "    for k in [\"planner_ms\", \"ocr_ms\", \"vision_ms\", \"verifier_ms\"]:\n",
     "        print(f\"  {k:<16} {ts.get(k, 0):.0f}\")\n",
-    "    if mep.get(\"opik_trace_id\"):\n",
-    "        print(f\"Opik trace ID: {mep['opik_trace_id']}\")\n",
+    "    if mep.get(\"lf_trace_id\"):\n",
+    "        print(f\"Langfuse trace ID: {mep['lf_trace_id']}\")\n",
     "    print(\"=\" * 64)\n",
     "\n",
     "    img_path = s.get(\"image_ref\", {}).get(\"path\", \"\")\n",
@@ -609,7 +561,7 @@
     "                config,\n",
     "                RUN_ID_NO_OCR,\n",
     "                OUT_DIR_NO_OCR,\n",
-    "                opik_client=opik_client,\n",
+    "                lf_client=lf_client,\n",
     "                verifier_agent=verifier,\n",
     "                ocr_tool=None,  # <-- OCR disabled\n",
     "            )\n",

diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
@@ -11,7 +11,7 @@
 from crewai import LLM, Agent, Crew, Task
 
 from ..datasets.perceived_sample import PerceivedSample
-from ..opik_integration.tracing import close_span, open_llm_span
+from ..langfuse_integration.tracing import close_span, open_llm_span
 from ..utils.json_strict import parse_strict
 
 
@@ -137,7 +137,7 @@ def __init__(
         self.api_key = api_key
         self._llm = _build_llm(backend, model, api_key)
 
-    def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dict, bool, str]:
+    def run(self, sample: PerceivedSample, lf_trace: Any = None) -> Tuple[str, dict, bool, str]:
         """
         Execute the planning phase for a new question.
 
@@ -148,7 +148,7 @@ def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dic
         ----------
         sample : PerceivedSample
             The question and context to plan for.
-        opik_trace : Any, optional
+        lf_trace : Any, optional
             Observability object for logging.
 
         Returns
@@ -165,7 +165,7 @@ def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dic
         prompt = build_planner_prompt(sample)
 
         span = open_llm_span(
-            opik_trace,
+            lf_trace,
             name="planner",
             input_data={"prompt": prompt},
             model=self.model,

diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py
@@ -23,7 +23,7 @@
 from openai import OpenAI
 from PIL import Image
 
-from ..opik_integration.tracing import close_span, open_llm_span
+from ..langfuse_integration.tracing import close_span, open_llm_span
 from ..utils.json_strict import parse_strict
 
 
@@ -203,7 +203,7 @@ def run(
         sample,  # PerceivedSample
         plan: dict,
         vision_parsed: dict,
-        opik_trace: Any = None,
+        lf_trace: Any = None,
     ) -> Tuple[str, dict, bool, str]:
         """
         Critically audit a draft answer using a single VLM call.
@@ -216,7 +216,7 @@ def run(
             The inspection plan used by the previous agent.
         vision_parsed : dict
             The draft answer and explanation to audit.
-        opik_trace : Any, optional
+        lf_trace : Any, optional
             Tracing object for observability.
 
         Returns
@@ -250,7 +250,7 @@ def run(
         )
 
         span = open_llm_span(
-            opik_trace,
+            lf_trace,
             name="verifier",
             input_data={"prompt": prompt, "draft_answer": draft_answer},
             model=self.model,