From 60e2bd1d9c8afffa16e2e029f16ef1990edd76fd Mon Sep 17 00:00:00 2001
From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com>
Date: Wed, 27 May 2026 14:48:01 +0300
Subject: [PATCH 1/7] bench: add MCP-transport sibling of the code_graph track
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a second bench track that exercises code-graph through the
exact transport real-world agents use (Claude Code, Cursor, …) —
JSON-RPC over stdio to a spawned `cgraph-mcp` server — instead of
HTTP to the FastAPI service.

Files:

- bench/agents/code_graph_mcp_adapter.py — sync Python adapter that
  spawns cgraph-mcp per call via the official MCP Python SDK. Knows
  the 8-tool MCP surface (search_code, get_callers, get_callees,
  get_dependencies, impact_analysis, find_path, index_repo, ask).
- bench/cli/cg-mcp + cg_mcp.py — bash-callable CLI shim mirroring
  the existing `cg` shim. mini-swe-agent only does bash, so each
  "tool" is one CLI invocation.
- bench/tools/code_graph_mcp/{tools.yaml,system_preamble.md} —
  agent config for the MCP track. Mirrors code_graph; same Q2
  decision to exclude `ask` (no nested LLM in the benchmarked tool
  set).
- tests/bench/test_cg_mcp_adapter.py — 5 unit + 1 e2e test
  (FalkorDB-gated AND MCP-server-gated so it skips cleanly until
  the MCP stack lands on staging).

Heavy e2e validated against the api/ subgraph (~6.3k nodes) over
real stdio: search_code -> get_callers -> impact_analysis returned
expected payloads.

Depends on the MCP stack (PRs #675–#683) for cgraph-mcp itself.
Lands cleanly once that stack merges.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 bench/agents/code_graph_mcp_adapter.py        | 163 ++++++++++++++++
 bench/cli/cg-mcp                              |   5 +
 bench/cli/cg_mcp.py                           | 140 ++++++++++++++
 bench/tools/code_graph_mcp/system_preamble.md |  72 +++++++
 bench/tools/code_graph_mcp/tools.yaml         |  39 ++++
 tests/bench/__init__.py                       |   0
 tests/bench/test_cg_mcp_adapter.py            | 178 ++++++++++++++++++
 7 files changed, 597 insertions(+)
 create mode 100644 bench/agents/code_graph_mcp_adapter.py
 create mode 100755 bench/cli/cg-mcp
 create mode 100644 bench/cli/cg_mcp.py
 create mode 100644 bench/tools/code_graph_mcp/system_preamble.md
 create mode 100644 bench/tools/code_graph_mcp/tools.yaml
 create mode 100644 tests/bench/__init__.py
 create mode 100644 tests/bench/test_cg_mcp_adapter.py

diff --git a/bench/agents/code_graph_mcp_adapter.py b/bench/agents/code_graph_mcp_adapter.py
new file mode 100644
index 00000000..9a6347bd
--- /dev/null
+++ b/bench/agents/code_graph_mcp_adapter.py
@@ -0,0 +1,163 @@
+"""MCP-transport adapter to cgraph-mcp for the benchmark.
+
+Sibling of `code_graph_adapter.py` (HTTP). Where the HTTP adapter talks
+to the host FastAPI service over the network, this one spawns the
+`cgraph-mcp` stdio MCP server as a child process via the official MCP
+Python SDK and dispatches tool calls over JSON-RPC.
+
+This gives us a second, real-world benchmark track that exercises the
+exact same transport agents (Claude Code, Cursor, …) will use in
+production. Tool names match the 8-tool MCP surface
+(`index_repo`, `search_code`, `get_callers`, `get_callees`,
+`get_dependencies`, `impact_analysis`, `find_path`, `ask`).
+
+Each call spawns a fresh server, runs the call, and exits. That's
+~0.5-1s overhead per call but keeps the model trivially safe to call
+from a bash shim (one process per invocation, no shared state).
+A future optimisation could persist the server across calls via a
+side-channel daemon, but per-call spawn matches how external agents
+actually use MCP servers today.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+from typing import Any
+
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+
+DEFAULT_TIMEOUT_SEC = 60.0
+
+
+def _env_for_mcp() -> dict[str, str]:
+    """Build the env for the spawned cgraph-mcp process.
+
+    Pass through everything from the caller but make sure the FalkorDB
+    coordinates are present — the runner usually sets them to point at
+    the host FalkorDB container.
+    """
+    env = dict(os.environ)
+    env.setdefault("FALKORDB_HOST", os.environ.get("FALKORDB_HOST", "127.0.0.1"))
+    env.setdefault("FALKORDB_PORT", os.environ.get("FALKORDB_PORT", "6379"))
+    return env
+
+
+def _extract(result: Any) -> Any:
+    """Normalize a CallToolResult into a JSON-serialisable Python value.
+
+    The MCP spec lets servers put the payload in `structuredContent`
+    and/or echo it as a JSON text chunk. Our 8 tools do both; agents
+    have historically preferred the text payload. We mirror that:
+    return the parsed text chunk when present, otherwise fall back to
+    structuredContent (unwrapping the spec's `{"result": ...}` wrapper
+    for collection-returning tools).
+    """
+    for chunk in result.content:
+        if hasattr(chunk, "text") and chunk.text:
+            try:
+                return json.loads(chunk.text)
+            except json.JSONDecodeError:
+                return chunk.text
+    struct = getattr(result, "structuredContent", None)
+    if isinstance(struct, dict) and set(struct.keys()) == {"result"}:
+        return struct["result"]
+    return struct
+
+
+async def _call_tool_async(name: str, arguments: dict[str, Any], timeout: float) -> Any:
+    params = StdioServerParameters(command="cgraph-mcp", args=[], env=_env_for_mcp())
+    async with stdio_client(params) as (read, write):
+        async with ClientSession(read, write) as session:
+            await asyncio.wait_for(session.initialize(), timeout=timeout)
+            result = await asyncio.wait_for(
+                session.call_tool(name, arguments), timeout=timeout
+            )
+            payload = _extract(result)
+            if getattr(result, "isError", False):
+                return {"error": payload}
+            return payload
+
+
+def call_tool(name: str, arguments: dict[str, Any], *, timeout: float = DEFAULT_TIMEOUT_SEC) -> Any:
+    """Sync entry point for the bash shim. One spawn per call."""
+    return asyncio.run(_call_tool_async(name, arguments, timeout))
+
+
+# ── Top-level convenience wrappers ─────────────────────────────────────
+# Names map 1:1 onto MCP tool names (and onto bench/tools/code_graph_mcp/
+# tools.yaml entries). Kwargs mirror each tool's MCP arg schema.
+
+
+def index_repo(path_or_url: str, branch: str | None = None, ignore: list[str] | None = None) -> dict[str, Any]:
+    args: dict[str, Any] = {"path_or_url": path_or_url}
+    if branch is not None:
+        args["branch"] = branch
+    if ignore is not None:
+        args["ignore"] = ignore
+    return call_tool("index_repo", args)
+
+
+def search_code(prefix: str, project: str, branch: str | None = None, limit: int = 10) -> Any:
+    args: dict[str, Any] = {"prefix": prefix, "project": project, "limit": limit}
+    if branch is not None:
+        args["branch"] = branch
+    return call_tool("search_code", args)
+
+
+def _neighbors(tool: str, symbol_id: int, project: str, branch: str | None, limit: int) -> Any:
+    args: dict[str, Any] = {"symbol_id": symbol_id, "project": project, "limit": limit}
+    if branch is not None:
+        args["branch"] = branch
+    return call_tool(tool, args)
+
+
+def get_callers(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
+    return _neighbors("get_callers", symbol_id, project, branch, limit)
+
+
+def get_callees(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
+    return _neighbors("get_callees", symbol_id, project, branch, limit)
+
+
+def get_dependencies(symbol_id: int, project: str, branch: str | None = None, limit: int = 50) -> Any:
+    return _neighbors("get_dependencies", symbol_id, project, branch, limit)
+
+
+def impact_analysis(
+    symbol_id: int,
+    project: str,
+    branch: str | None = None,
+    direction: str = "IN",
+    depth: int = 3,
+) -> Any:
+    args: dict[str, Any] = {
+        "symbol_id": symbol_id,
+        "project": project,
+        "direction": direction,
+        "depth": depth,
+    }
+    if branch is not None:
+        args["branch"] = branch
+    return call_tool("impact_analysis", args)
+
+
+def find_path(source_id: int, dest_id: int, project: str, branch: str | None = None) -> Any:
+    args: dict[str, Any] = {
+        "source_id": source_id,
+        "dest_id": dest_id,
+        "project": project,
+    }
+    if branch is not None:
+        args["branch"] = branch
+    return call_tool("find_path", args)
+
+
+def ask(question: str, project: str, branch: str | None = None) -> Any:
+    args: dict[str, Any] = {"question": question, "project": project}
+    if branch is not None:
+        args["branch"] = branch
+    return call_tool("ask", args)
diff --git a/bench/cli/cg-mcp b/bench/cli/cg-mcp
new file mode 100755
index 00000000..be6c09bb
--- /dev/null
+++ b/bench/cli/cg-mcp
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+# Bash-callable entry point for the code-graph MCP CLI. Mirrors `cg`
+# but speaks JSON-RPC over stdio to a spawned `cgraph-mcp` server
+# instead of HTTP to the FastAPI service. Runner adds bench/cli to PATH.
+exec "${BENCH_PYTHON:-python3}" -m bench.cli.cg_mcp "$@"
diff --git a/bench/cli/cg_mcp.py b/bench/cli/cg_mcp.py
new file mode 100644
index 00000000..95c91390
--- /dev/null
+++ b/bench/cli/cg_mcp.py
@@ -0,0 +1,140 @@
+"""`cg-mcp` — bash-callable CLI exposing code-graph's 8 MCP tools.
+
+This is the MCP-transport sibling of `cg`. Where `cg` calls the host
+FastAPI service over HTTP, `cg-mcp` spawns the `cgraph-mcp` stdio
+server (via the official MCP Python SDK) for every invocation and
+dispatches one tool call.
+
+The MCP track is what external agents (Claude Code, Cursor, …) use
+in production; benchmarking through it tells us how the *real-world*
+integration behaves under SWE-bench, not just the in-process FastAPI
+adapter.
+
+Subcommands mirror the MCP tool names:
+
+  cg-mcp index_repo       --path-or-url . [--branch B] [--ignore PAT ...]
+  cg-mcp search_code      --project P --prefix STR [--branch B] [--limit N]
+  cg-mcp get_callers      --project P --symbol-id ID [--branch B] [--limit N]
+  cg-mcp get_callees      --project P --symbol-id ID [--branch B] [--limit N]
+  cg-mcp get_dependencies --project P --symbol-id ID [--branch B] [--limit N]
+  cg-mcp impact_analysis  --project P --symbol-id ID [--direction IN|OUT] [--depth N]
+  cg-mcp find_path        --project P --source-id ID --dest-id ID [--branch B]
+  cg-mcp ask              --project P --question "..." [--branch B]
+
+Output: one JSON document per call on stdout. Transport failures print to
+stderr and exit non-zero; tool-level errors are `{"error": ...}` on stdout (exit 0).
+
+Env: FALKORDB_HOST / FALKORDB_PORT are passed through to the spawned
+server. Optionally set CGRAPH_MCP_TIMEOUT_SEC to override the
+default 60s timeout.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from typing import Any
+
+from bench.agents import code_graph_mcp_adapter as cgm
+
+
+def _print(obj: Any) -> None:
+    json.dump(obj, sys.stdout, indent=2, sort_keys=True, default=str)
+    sys.stdout.write("\n")
+
+
+def _timeout() -> float:
+    try:
+        return float(os.getenv("CGRAPH_MCP_TIMEOUT_SEC", "60"))
+    except ValueError:
+        return 60.0
+
+
+def _add_project(p: argparse.ArgumentParser) -> None:
+    p.add_argument("--project", required=True)
+    p.add_argument("--branch", default=None)
+
+
+def _add_symbol(p: argparse.ArgumentParser) -> None:
+    p.add_argument("--symbol-id", type=int, required=True, dest="symbol_id")
+    p.add_argument("--limit", type=int, default=50)
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(prog="cg-mcp", description=__doc__)
+    sub = parser.add_subparsers(dest="cmd", required=True)
+
+    ir = sub.add_parser("index_repo")
+    ir.add_argument("--path-or-url", required=True, dest="path_or_url")
+    ir.add_argument("--branch", default=None)
+    ir.add_argument("--ignore", nargs="*", default=None)
+
+    sc = sub.add_parser("search_code")
+    _add_project(sc)
+    sc.add_argument("--prefix", required=True)
+    sc.add_argument("--limit", type=int, default=10)
+
+    for name in ("get_callers", "get_callees", "get_dependencies"):
+        p = sub.add_parser(name)
+        _add_project(p)
+        _add_symbol(p)
+
+    ia = sub.add_parser("impact_analysis")
+    _add_project(ia)
+    ia.add_argument("--symbol-id", type=int, required=True, dest="symbol_id")
+    ia.add_argument("--direction", choices=["IN", "OUT"], default="IN")
+    ia.add_argument("--depth", type=int, default=3)
+
+    fp = sub.add_parser("find_path")
+    _add_project(fp)
+    fp.add_argument("--source-id", type=int, required=True, dest="source_id")
+    fp.add_argument("--dest-id", type=int, required=True, dest="dest_id")
+
+    aq = sub.add_parser("ask")
+    _add_project(aq)
+    aq.add_argument("--question", required=True)
+
+    args = parser.parse_args(argv)
+    timeout = _timeout()
+
+    # Inject timeout for adapter calls.
+    cgm.DEFAULT_TIMEOUT_SEC = timeout
+
+    try:
+        if args.cmd == "index_repo":
+            _print(cgm.index_repo(args.path_or_url, branch=args.branch, ignore=args.ignore))
+        elif args.cmd == "search_code":
+            _print(cgm.search_code(args.prefix, args.project, branch=args.branch, limit=args.limit))
+        elif args.cmd == "get_callers":
+            _print(cgm.get_callers(args.symbol_id, args.project, branch=args.branch, limit=args.limit))
+        elif args.cmd == "get_callees":
+            _print(cgm.get_callees(args.symbol_id, args.project, branch=args.branch, limit=args.limit))
+        elif args.cmd == "get_dependencies":
+            _print(cgm.get_dependencies(args.symbol_id, args.project, branch=args.branch, limit=args.limit))
+        elif args.cmd == "impact_analysis":
+            _print(
+                cgm.impact_analysis(
+                    args.symbol_id,
+                    args.project,
+                    branch=args.branch,
+                    direction=args.direction,
+                    depth=args.depth,
+                )
+            )
+        elif args.cmd == "find_path":
+            _print(cgm.find_path(args.source_id, args.dest_id, args.project, branch=args.branch))
+        elif args.cmd == "ask":
+            _print(cgm.ask(args.question, args.project, branch=args.branch))
+        else:  # pragma: no cover — argparse already enforces this
+            parser.error(f"unknown subcommand: {args.cmd}")
+    except Exception as e:  # noqa: BLE001 — surface everything to the agent
+        print(f"cg-mcp error: {e}", file=sys.stderr)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bench/tools/code_graph_mcp/system_preamble.md b/bench/tools/code_graph_mcp/system_preamble.md
new file mode 100644
index 00000000..bf5af4a1
--- /dev/null
+++ b/bench/tools/code_graph_mcp/system_preamble.md
@@ -0,0 +1,72 @@
+# code-graph (MCP) preamble
+
+You are an autonomous coding agent solving a software-engineering task.
+Your sole tool is bash: every action you take is a shell command that
+is executed in the repository's working directory.
+
+## Code-navigation workflow — use this BEFORE grep/find
+
+A code-graph **MCP server** (`cgraph-mcp`) is available for this repo.
+**Before reading or editing code, locate the relevant symbols through
+`cg-mcp` rather than grepping the file tree** — it's faster, returns
+precise `{id, file, line}` records, and reveals caller / callee /
+impact relationships you would otherwise reconstruct by hand. Fall
+back to bash only when `cg-mcp` cannot answer the question.
+
+`$PROJECT_NAME` and `$BRANCH` are exported for you (do not guess).
+The graph is already indexed against the current commit.
+
+Typical loop:
+
+1. `cg-mcp search_code --project "$PROJECT_NAME" --prefix <name>` —
+   locate a function/class by name. Pick the `id` of the best hit.
+2. `cg-mcp get_callers --project "$PROJECT_NAME" --symbol-id <id>` —
+   "who calls this?" before refactoring.
+3. `cg-mcp impact_analysis --project "$PROJECT_NAME" --symbol-id <id>
+   --depth 3` — full transitive blast radius. Use this BEFORE
+   non-trivial edits.
+4. Read the implicated file(s) with `sed -n` / `cat`, then edit.
+
+## Available `cg-mcp` sub-commands
+
+- `cg-mcp search_code      --project P --prefix STR [--limit N]` —
+  prefix search; returns `[{id, name, label, file, line}, ...]`.
+- `cg-mcp get_callers      --project P --symbol-id ID [--limit N]` —
+  incoming CALLS edges (who calls X).
+- `cg-mcp get_callees      --project P --symbol-id ID [--limit N]` —
+  outgoing CALLS edges (what X calls).
+- `cg-mcp get_dependencies --project P --symbol-id ID [--limit N]` —
+  all outgoing edges (CALLS + IMPORTS + DEFINES).
+- `cg-mcp impact_analysis  --project P --symbol-id ID
+                          [--direction IN|OUT] [--depth N]` —
+  transitive blast radius (default IN, depth 3).
+- `cg-mcp find_path        --project P --source-id ID --dest-id ID` —
+  the call chain(s) between two symbols.
+- `cg-mcp index_repo       --path-or-url PATH [--branch B]` —
+  (re)index a folder or git URL. Only needed for repos that aren't
+  pre-indexed.
+
+You also have the usual Unix tools (`cat`, `grep`/`rg`, `find`, `sed`)
+for cases the graph can't answer.
+
+## Rules of thumb
+
+1. **Always run `search_code` first** to turn a name into an `id`.
+2. **`impact_analysis` before any non-trivial edit.** Even when you
+   think you know the answer — the transitive closure often surprises
+   you.
+3. **Don't `grep` for callers.** `get_callers` is one cheap Cypher
+   hop; grep over a large repo costs tens of thousands of tokens.
+
+## Submission
+
+When you believe the task is complete, run a bash command whose first
+line of stdout is exactly:
+
+```
+COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT
+```
+
+followed by your final answer or summary on subsequent lines. The
+runner reads the working-tree `git diff` automatically; you do not
+need to commit.
diff --git a/bench/tools/code_graph_mcp/tools.yaml b/bench/tools/code_graph_mcp/tools.yaml
new file mode 100644
index 00000000..3b676977
--- /dev/null
+++ b/bench/tools/code_graph_mcp/tools.yaml
@@ -0,0 +1,39 @@
+# SWE-agent tool bundle: code-graph MCP-transport config.
+#
+# This is the MCP-transport sibling of bench/tools/code_graph/tools.yaml.
+# Same backend graph; different transport. Where `code_graph` calls the
+# host FastAPI service over HTTP, `code_graph_mcp` spawns the
+# `cgraph-mcp` stdio server for each tool call — the exact transport
+# Claude Code / Cursor / Cline use in production.
+#
+# Tool names mirror the 8 MCP tools registered in api/mcp/tools/
+# (search_code, get_callers, get_callees, get_dependencies,
+# impact_analysis, find_path, index_repo, ask). The bash agent calls
+# them through the `cg-mcp <tool> ...` shim (see bench/cli/cg-mcp).
+#
+# IMPORTANT: `ask` (GraphRAG) is intentionally NOT in the tool list.
+# Including it would double-count tokens (nested LLM agent). Same Q2
+# decision as the HTTP code_graph config — we benchmark the *graph*,
+# not GraphRAG.
+
+extends: ../baseline/tools.yaml
+
+tools:
+  - index_repo            # (path_or_url, branch?) -> indexing stats
+  - search_code           # (project, prefix) -> [symbol]
+  - get_callers           # (project, symbol_id) -> [caller]
+  - get_callees           # (project, symbol_id) -> [callee]
+  - get_dependencies      # (project, symbol_id) -> [dep]
+  - impact_analysis       # (project, symbol_id, direction, depth) -> [impacted]
+  - find_path             # (project, source_id, dest_id) -> [path]
+
+backend:
+  transport: mcp_stdio
+  command: cgraph-mcp
+  # Container has cgraph-mcp on PATH via `pip install -e .` against this
+  # repo. FALKORDB_HOST/PORT are passed through to the spawned MCP
+  # server, pointing at the same host FalkorDB the HTTP config uses.
+  env_passthrough:
+    - FALKORDB_HOST
+    - FALKORDB_PORT
+    - MODEL_NAME
diff --git a/tests/bench/__init__.py b/tests/bench/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/bench/test_cg_mcp_adapter.py b/tests/bench/test_cg_mcp_adapter.py
new file mode 100644
index 00000000..7d6f9274
--- /dev/null
+++ b/tests/bench/test_cg_mcp_adapter.py
@@ -0,0 +1,178 @@
+"""Tests for the MCP-transport bench adapter (`cg-mcp`).
+
+Heavy end-to-end test (talks to real cgraph-mcp + FalkorDB) is gated
+behind the same `_falkordb_reachable` check as the existing MCP tests.
+Light tests run unconditionally and validate the argparse surface and
+`_extract` shape handling.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import socket
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+from bench.agents import code_graph_mcp_adapter as cgm
+from bench.cli import cg_mcp
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+def _mcp_server_available() -> bool:
+    """The benchmark MCP adapter requires the in-repo `cgraph-mcp` server.
+
+    On branches that pre-date the MCP stack (e.g. this branch's base,
+    `fix-find-symbol-nested-name`), `api.mcp.server` is absent. The
+    end-to-end test must skip there; it will run on staging once the
+    MCP stack lands.
+    """
+    try:
+        import api.mcp.server  # noqa: F401
+        return True
+    except ImportError:
+        return False
+
+
+def _falkordb_reachable() -> bool:
+    host = os.environ.get("FALKORDB_HOST", "127.0.0.1")
+    port = int(os.environ.get("FALKORDB_PORT", "6390"))
+    try:
+        with socket.create_connection((host, port), timeout=1):
+            return True
+    except OSError:
+        return False
+
+
+# ── light unit tests ──────────────────────────────────────────────────
+
+
+class _FakeChunk:
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+
+class _FakeResult:
+    def __init__(self, content, structured=None, is_error=False):
+        self.content = content
+        self.structuredContent = structured
+        self.isError = is_error
+
+
+def test_extract_prefers_text_chunk_json():
+    r = _FakeResult([_FakeChunk('{"id": 7, "name": "foo"}')])
+    assert cgm._extract(r) == {"id": 7, "name": "foo"}
+
+
+def test_extract_falls_back_to_structured_result_wrapper():
+    r = _FakeResult(content=[], structured={"result": [1, 2, 3]})
+    assert cgm._extract(r) == [1, 2, 3]
+
+
+def test_extract_returns_raw_text_when_not_json():
+    r = _FakeResult([_FakeChunk("not json at all")])
+    assert cgm._extract(r) == "not json at all"
+
+
+def test_cli_rejects_unknown_subcommand(capsys):
+    with pytest.raises(SystemExit):
+        cg_mcp.main(["totally_bogus"])
+
+
+def test_cli_index_repo_parses_ignore_list(monkeypatch):
+    captured: dict = {}
+
+    def fake_index_repo(path_or_url, branch=None, ignore=None):
+        captured.update(path_or_url=path_or_url, branch=branch, ignore=ignore)
+        return {"ok": True, **captured}
+
+    monkeypatch.setattr(cgm, "index_repo", fake_index_repo)
+    rc = cg_mcp.main(
+        [
+            "index_repo",
+            "--path-or-url",
+            "/tmp/x",
+            "--branch",
+            "main",
+            "--ignore",
+            ".venv",
+            "node_modules",
+        ]
+    )
+    assert rc == 0
+    assert captured["path_or_url"] == "/tmp/x"
+    assert captured["branch"] == "main"
+    assert captured["ignore"] == [".venv", "node_modules"]
+
+
+# ── heavy end-to-end test ─────────────────────────────────────────────
+
+
+@pytest.mark.skipif(
+    not _mcp_server_available(),
+    reason="api.mcp.server not present — requires MCP stack to be merged",
+)
+@pytest.mark.skipif(not _falkordb_reachable(), reason="FalkorDB unreachable")
+def test_cg_mcp_search_code_end_to_end(tmp_path):
+    """Spawn the actual cg-mcp shim against a freshly-indexed fixture."""
+    fixture = REPO_ROOT / "tests" / "mcp" / "fixtures" / "sample_project"
+    if not fixture.exists():
+        pytest.skip("MCP sample fixture not present")
+
+    env = os.environ.copy()
+    env["FALKORDB_HOST"] = os.environ.get("FALKORDB_HOST", "127.0.0.1")
+    env["FALKORDB_PORT"] = os.environ.get("FALKORDB_PORT", "6390")
+    env["BENCH_PYTHON"] = sys.executable
+    # Ensure cgraph-mcp is on PATH for the spawned subprocess.
+    venv_bin = str(Path(sys.executable).parent)
+    env["PATH"] = f"{venv_bin}:{env.get('PATH', '')}"
+
+    # Index the fixture under a deterministic project/branch.
+    project = "sample_project"
+    branch = f"benchmcp-{os.getpid()}"
+    idx = subprocess.run(
+        [
+            str(REPO_ROOT / "bench" / "cli" / "cg-mcp"),
+            "index_repo",
+            "--path-or-url",
+            str(fixture),
+            "--branch",
+            branch,
+        ],
+        env=env,
+        capture_output=True,
+        text=True,
+        timeout=120,
+    )
+    assert idx.returncode == 0, idx.stderr
+    idx_payload = json.loads(idx.stdout)
+    assert "graph_name" in idx_payload
+    assert idx_payload["num_nodes"] > 0
+
+    # Then search for any known symbol from the fixture.
+    sr = subprocess.run(
+        [
+            str(REPO_ROOT / "bench" / "cli" / "cg-mcp"),
+            "search_code",
+            "--project",
+            project,
+            "--branch",
+            branch,
+            "--prefix",
+            "a",  # broad prefix to match something in the fixture
+            "--limit",
+            "3",
+        ],
+        env=env,
+        capture_output=True,
+        text=True,
+        timeout=60,
+    )
+    assert sr.returncode == 0, sr.stderr
+    out = json.loads(sr.stdout)
+    assert out is not None

From f17d437b3abf856b96494890965ea63e5a169d83 Mon Sep 17 00:00:00 2001
From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com>
Date: Wed, 27 May 2026 14:51:38 +0300
Subject: [PATCH 2/7] bench: wire code_graph_mcp into mini_runner dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The adapter and shim from the previous commit were inert from the
runner's perspective — VALID_CONFIGS only knew baseline/lsp/code_graph.
This commit makes `--config code_graph_mcp` a first-class track.

Changes in bench/runners/mini_runner.py:

- VALID_CONFIGS gains "code_graph_mcp" (passes argparse + help string).
- New INSTANCE_TEMPLATE_CODE_GRAPH_MCP: mirrors the HTTP code_graph
  template but tells the agent to call `cg-mcp` with $PROJECT_NAME +
  $BRANCH, and to use impact_analysis before non-trivial edits.
- load_instance_template dispatches the new template.
- config_env("code_graph_mcp", ...) prepends venv bin to PATH (so
  cgraph-mcp is callable from the agent's bash), passes FALKORDB_*
  through to the spawned MCP server, and exports PROJECT_NAME +
  BRANCH which the preamble references.
- New _ensure_indexed_mcp() mirrors _ensure_indexed but goes through
  the bench MCP adapter instead of HTTP. Skip-if-present probe hits
  FalkorDB's GRAPH.LIST directly (one trip, no MCP spawn).
- Per-instance loop now dispatches to _ensure_indexed_mcp for the
  new config.

Smoke-verified that:
- VALID_CONFIGS == ('baseline','lsp','code_graph','code_graph_mcp')
- load_instance_template('code_graph_mcp') contains 'cg-mcp'
- config_env populates PROJECT_NAME/BRANCH/FALKORDB_HOST

Unit tests for the adapter still pass (5 passed, 1 skipped — heavy
e2e double-gated on FalkorDB + api.mcp.server availability).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 bench/runners/mini_runner.py | 87 +++++++++++++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 2 deletions(-)

diff --git a/bench/runners/mini_runner.py b/bench/runners/mini_runner.py
index 3689c0aa..17b0cbaa 100644
--- a/bench/runners/mini_runner.py
+++ b/bench/runners/mini_runner.py
@@ -49,7 +49,7 @@
 DEFAULT_CACHE_DIR = BENCH_DIR / "cache"
 DEFAULT_RESULTS = DEFAULT_CACHE_DIR / "results.jsonl"
 
-VALID_CONFIGS = ("baseline", "lsp", "code_graph")
+VALID_CONFIGS = ("baseline", "lsp", "code_graph", "code_graph_mcp")
 
 
 # ---------------------------------------------------------------------------
@@ -155,11 +155,40 @@ class Task:
 """
 
 
+INSTANCE_TEMPLATE_CODE_GRAPH_MCP = """\
+You are working in the repository at {{cwd}}.
+The code-graph MCP server has already indexed this repository under the
+project name `$PROJECT_NAME` on branch `$BRANCH` (use the env vars
+literally).
+
+The task to solve:
+
+{{task}}
+
+**Required workflow.** Before reading or editing any file, your first
+bash command MUST be:
+
+  `cg-mcp search_code --project "$PROJECT_NAME" --branch "$BRANCH" --prefix <a symbol named in the task description>`
+
+Then use `cg-mcp get_callers --project "$PROJECT_NAME" --branch "$BRANCH" --symbol-id <id>`
+to expand relationships before doing any textual search. Use
+`cg-mcp impact_analysis ... --symbol-id <id> --depth 3` before
+non-trivial edits.
+
+When you believe the task is complete, finish your turn with a final
+message that contains a unified diff of your changes inside a fenced
+``` block, then exit. Do not commit; the harness reads the diff via
+`git diff`.
+"""
+
+
 def load_instance_template(config: str) -> str:
     if config == "lsp":
         return INSTANCE_TEMPLATE_LSP
     if config == "code_graph":
         return INSTANCE_TEMPLATE_CODE_GRAPH
+    if config == "code_graph_mcp":
+        return INSTANCE_TEMPLATE_CODE_GRAPH_MCP
     return INSTANCE_TEMPLATE
 
 
@@ -210,6 +239,23 @@ def config_env(config: str, repo_path: Path) -> dict[str, str]:
         # The agent's preamble references $REPO_NAME — set it to the
         # worktree dirname, which is what analyze_folder used as the id.
         env["REPO_NAME"] = repo_path.name
+    elif config == "code_graph_mcp":
+        # MCP transport: agent calls `cg-mcp …` which spawns the
+        # `cgraph-mcp` stdio server per call. FalkorDB coordinates
+        # are passed through verbatim.
+        env.setdefault("FALKORDB_HOST", os.environ.get("FALKORDB_HOST", "127.0.0.1"))
+        env.setdefault("FALKORDB_PORT", os.environ.get("FALKORDB_PORT", "6379"))
+        # `cgraph-mcp` must be on PATH; the runner installs the
+        # falkordb-code-graph package into the same interpreter, so
+        # prepending the venv bin gives us the entry point.
+        venv_bin = str(Path(sys.executable).parent)
+        env["PATH"] = f"{venv_bin}:{env['PATH']}"
+        # The preamble references $PROJECT_NAME and $BRANCH; project
+        # name matches what `index_repo` derives from the folder
+        # (= worktree dirname), and branch is the per-instance tag we
+        # used when indexing.
+        env["PROJECT_NAME"] = repo_path.name
+        env["BRANCH"] = os.environ.get("CGRAPH_MCP_BRANCH", "_default")
     return env
 
 
@@ -248,6 +294,41 @@ def _ensure_indexed(repo_path: Path) -> None:
         print(f"[index] WARN failed to index {repo_name}: {exc!r}")
 
 
+def _ensure_indexed_mcp(repo_path: Path) -> None:
+    """MCP-track equivalent of _ensure_indexed.
+
+    Drives the `index_repo` MCP tool through the bench adapter, which
+    spawns a one-shot cgraph-mcp for the call. Skips when GRAPH.LIST on
+    FalkorDB already lists the expected graph; failures only log a WARN.
+    NOTE(review): runs under the adapter's 60s default timeout — confirm.
+    """
+    from bench.agents import code_graph_mcp_adapter as cgm
+    import redis
+
+    repo_name = repo_path.name
+    branch = os.environ.get("CGRAPH_MCP_BRANCH", "_default")
+    host = os.environ.get("FALKORDB_HOST", "127.0.0.1")
+    port = int(os.environ.get("FALKORDB_PORT", "6379"))
+    expected_graph = f"code:{repo_name}:{branch}"
+    try:
+        r = redis.Redis(host=host, port=port, decode_responses=True, socket_timeout=2)
+        if expected_graph in (r.execute_command("GRAPH.LIST") or []):
+            print(f"[index-mcp] {expected_graph} already indexed; skip")
+            return
+    except Exception as exc:  # noqa: BLE001
+        print(f"[index-mcp] WARN list_graphs failed ({exc!r}); will attempt index anyway")
+
+    print(f"[index-mcp] indexing {repo_path} as {expected_graph} ...")
+    try:
+        payload = cgm.index_repo(str(repo_path), branch=branch)
+        if isinstance(payload, dict) and payload.get("error"):
+            print(f"[index-mcp] WARN index_repo error: {payload['error']!r}")
+        else:
+            print(f"[index-mcp] indexed: {payload}")
+    except Exception as exc:  # noqa: BLE001
+        print(f"[index-mcp] WARN failed to index {repo_name}: {exc!r}")
+
+
 # ---------------------------------------------------------------------------
 # Dry-run stub model
 # ---------------------------------------------------------------------------
@@ -569,7 +650,7 @@ def main(argv: list[str] | None = None) -> int:
 
     p = argparse.ArgumentParser(description="code-graph benchmark runner")
     p.add_argument("--config", choices=VALID_CONFIGS, action="append",
-                   help="one of baseline / lsp / code_graph; repeatable. "
+                   help="one of baseline / lsp / code_graph / code_graph_mcp; repeatable. "
                         "Default: all three.")
     mode = p.add_mutually_exclusive_group(required=True)
     mode.add_argument("--dry-run", action="store_true",
@@ -640,6 +721,8 @@ def main(argv: list[str] | None = None) -> int:
                 # call returns nothing and the agent abandons the tool.
                 if cfg == "code_graph":
                     _ensure_indexed(cfg_wt)
+                elif cfg == "code_graph_mcp":
+                    _ensure_indexed_mcp(cfg_wt)
                 cfg_rows = run_batch(
                     [task],
                     [cfg],

From b14432b9448c2b9cc3483b0f93a887bbc55e3813 Mon Sep 17 00:00:00 2001
From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com>
Date: Wed, 27 May 2026 16:28:47 +0300
Subject: [PATCH 3/7] bench: fail loudly on indexing errors + bump
 analyze_folder timeout

The mini_runner previously printed a [index] WARN line on
analyze_folder errors and continued. This meant SWE-bench instances
whose path falls outside ALLOWED_ANALYSIS_DIR (e.g. when the API
server is started from a sibling worktree) would silently run the
agent against a missing code-graph project. The agent's first cg
call returns 400 'Missing project ...', the agent falls back to
grep/sed, and we get a token count that looks bad for the
code_graph track but actually reflects 'tool unavailable'.

Two changes:

* analyze_folder errors and httpx exceptions now raise RuntimeError
  with the offending path. This stops the run and surfaces the
  ALLOWED_ANALYSIS_DIR misconfiguration immediately.
* analyze_folder timeout bumped 600s -> 1800s. The 600s default
  was tight for sympy (~5 MB of Python, ~5000 functions) and
  caused a timeout during indexing.

This was discovered while running the first real 3-way SWE-bench
smoke. With the fix and a corrected ALLOWED_ANALYSIS_DIR, the
code_graph track produces sensible numbers (-11% input vs baseline
across the smoke sample vs the prior bogus +4.7%).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 bench/runners/mini_runner.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/bench/runners/mini_runner.py b/bench/runners/mini_runner.py
index 17b0cbaa..cb6b72a0 100644
--- a/bench/runners/mini_runner.py
+++ b/bench/runners/mini_runner.py
@@ -281,17 +281,19 @@ def _ensure_indexed(repo_path: Path) -> None:
                 print(f"[index] {repo_name} already indexed; skip")
                 return
         print(f"[index] analyzing {repo_path} ...")
-        with httpx.Client(timeout=600.0, headers=headers) as c:
+        with httpx.Client(timeout=1800.0, headers=headers) as c:
             r = c.post(
                 f"{base}/api/analyze_folder",
                 json={"path": str(repo_path), "ignore": []},
             )
             if r.status_code != 200:
-                print(f"[index] WARN analyze_folder returned {r.status_code}: {r.text[:200]}")
-            else:
-                print(f"[index] indexed {repo_name}")
-    except Exception as exc:  # noqa: BLE001
-        print(f"[index] WARN failed to index {repo_name}: {exc!r}")
+                raise RuntimeError(
+                    f"analyze_folder returned {r.status_code}: {r.text[:300]}. "
+                    f"Check ALLOWED_ANALYSIS_DIR on the API server covers {repo_path}."
+                )
+            print(f"[index] indexed {repo_name}")
+    except Exception as exc:
+        raise RuntimeError(f"failed to index {repo_name} at {repo_path}: {exc}") from exc
 
 
 def _ensure_indexed_mcp(repo_path: Path) -> None:

From 532d84957763cf1114681c5bdd75fc6c44889bfe Mon Sep 17 00:00:00 2001
From: dvirdukhan <dvir@falkordb.com>
Date: Wed, 27 May 2026 16:51:42 +0300
Subject: [PATCH 4/7] fix(analyzer): resolve LSP CALLS edges on repos without a
 venv
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Python analyzer hardcoded `environment_path={path}/venv` when starting
jedi-language-server via multilspy. When the repo had no venv (the common
case for cloned codebases like sphinx, sympy, anything from SWE-bench),
jedi raised `InvalidPythonEnvironment` on every `request_definition()`
call. analyzer.resolve() then swallowed the exception silently and the
indexer produced a graph with DEFINES edges only — zero CALLS, zero
EXTENDS. Benchmark validation showed sphinx (5K functions) and sympy
(41K functions) had no resolved cross-references at all.

Fix:
- source_analyzer.py: prefer {repo}/venv, then {repo}/.venv, then fall
  back to the host interpreter's environment (sys.executable's prefix)
  so jedi always has a valid Python to introspect.
- analyzer.py: log resolve() failures at WARN with file/line context
  instead of swallowing them silently, so the next regression is loud.

Verified: re-indexed sphinx-doc/sphinx-9230 with the fix:
  DEFINES: 5640, CALLS: 4931, EXTENDS: 484 (was DEFINES-only).

Fixes #685.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 api/analyzers/analyzer.py        |  5 +++++
 api/analyzers/source_analyzer.py | 22 +++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/api/analyzers/analyzer.py b/api/analyzers/analyzer.py
index 64d49004..0564606b 100644
--- a/api/analyzers/analyzer.py
+++ b/api/analyzers/analyzer.py
@@ -57,6 +57,11 @@ def resolve(self, files: dict[Path, File], lsp: SyncLanguageServer, file_path: P
             locations = lsp.request_definition(str(file_path), node.start_point.row, node.start_point.column)
             return [(files[Path(self.resolve_path(location['absolutePath'], path))], files[Path(self.resolve_path(location['absolutePath'], path))].tree.root_node.descendant_for_point_range(Point(location['range']['start']['line'], location['range']['start']['character']), Point(location['range']['end']['line'], location['range']['end']['character']))) for location in locations if location and Path(self.resolve_path(location['absolutePath'], path)) in files]
         except Exception as e:
+            import logging
+            logging.getLogger(__name__).warning(
+                "resolve() failed for %s @%d:%d: %s",
+                file_path, node.start_point.row, node.start_point.column, e,
+            )
             return []
         
     @abstractmethod
diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py
index 4186f358..1b8f85b1 100644
--- a/api/analyzers/source_analyzer.py
+++ b/api/analyzers/source_analyzer.py
@@ -134,7 +134,27 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None:
         else:
             lsps[".java"] = NullLanguageServer()
         if any(path.rglob('*.py')):
-            config = MultilspyConfig.from_dict({"code_language": "python", "environment_path": f"{path}/venv"})
+            import sys
+            py_venv = path / "venv"
+            py_dotvenv = path / ".venv"
+            if py_venv.is_dir() and (py_venv / "bin" / "python").exists():
+                env_path = str(py_venv)
+            elif py_dotvenv.is_dir() and (py_dotvenv / "bin" / "python").exists():
+                env_path = str(py_dotvenv)
+            else:
+                # Fall back to the host's Python environment so jedi has a
+                # valid interpreter to introspect; otherwise every
+                # request_definition() raises InvalidPythonEnvironment and
+                # we'd silently produce a graph with zero CALLS edges.
+                env_path = str(Path(sys.executable).resolve().parent.parent)
+                logging.info(
+                    "No venv at %s; falling back to host env %s for jedi LSP",
+                    path, env_path,
+                )
+            config = MultilspyConfig.from_dict({
+                "code_language": "python",
+                "environment_path": env_path,
+            })
             lsps[".py"] = SyncLanguageServer.create(config, logger, str(path))
         else:
             lsps[".py"] = NullLanguageServer()

From 476bc73d4a3b029ee437e057e82bf05a64874230 Mon Sep 17 00:00:00 2001
From: dvirdukhan <dvir@falkordb.com>
Date: Wed, 27 May 2026 20:25:26 +0300
Subject: [PATCH 5/7] bench: add resume support + ignore sympy rubi rules

Two production-quality fixes from the calibration run that crashed at
14/30 trajectories:

1. Resume support: skip (instance, cfg) pairs whose trajectory file
   already exists. Lets us recover from crashes/kills without re-running
   completed work (avoids ~$3 of wasted compute on this run).
2. Ignore pathological files at index time: sympy/integrals/rubi/rules
   contains auto-generated 3000-line files with hundreds of unresolvable
   symbols per line. jedi spends hours and never makes progress. Adding
   it to the default ignore list unblocks sympy-19040 (and other sympy
   instances) without affecting graph quality.

Also expanded default ignore set: __pycache__, build, dist, .tox, .eggs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 bench/runners/mini_runner.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/bench/runners/mini_runner.py b/bench/runners/mini_runner.py
index cb6b72a0..03e32e7b 100644
--- a/bench/runners/mini_runner.py
+++ b/bench/runners/mini_runner.py
@@ -281,10 +281,19 @@ def _ensure_indexed(repo_path: Path) -> None:
                 print(f"[index] {repo_name} already indexed; skip")
                 return
         print(f"[index] analyzing {repo_path} ...")
+        # Default ignore set: auto-generated / vendored / pathological dirs
+        # that either contain no useful symbols or send jedi into a
+        # multi-hour resolve loop (e.g. sympy/integrals/rubi/rules has
+        # 3000-line files with hundreds of unresolvable symbols per line).
+        default_ignore = [
+            ".git", "venv", ".venv", "node_modules", "__pycache__",
+            "rubi/rules",  # sympy: blocks indexing for ~hours otherwise
+            "build", "dist", ".tox", ".eggs",
+        ]
         with httpx.Client(timeout=1800.0, headers=headers) as c:
             r = c.post(
                 f"{base}/api/analyze_folder",
-                json={"path": str(repo_path), "ignore": []},
+                json={"path": str(repo_path), "ignore": default_ignore},
             )
             if r.status_code != 200:
                 raise RuntimeError(
@@ -708,6 +717,13 @@ def main(argv: list[str] | None = None) -> int:
               f"x {len(configs)} configs = {len(insts) * len(configs)} trajectories")
         for inst in insts:
             for cfg in configs:
+                # Resume support: if a trajectory file for this (instance, cfg)
+                # already exists, skip the run entirely. Lets us recover from
+                # crashes / kills without re-spending tokens on completed work.
+                existing_traj = args.trajectories / f"{inst.instance_id}__{cfg}.json"
+                if existing_traj.exists():
+                    print(f"[resume] {inst.instance_id}/{cfg}: trajectory exists, skip")
+                    continue
                 # Fresh worktree per (instance, config) to avoid cross-talk.
                 wt = prepare_worktree(inst)
                 # Rename so each cfg gets a distinct path.

From d23ef79f80df790515309a28c15bf1590d9ef05f Mon Sep 17 00:00:00 2001
From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com>
Date: Thu, 28 May 2026 07:29:12 +0300
Subject: [PATCH 6/7] fix(analyzer): defensive skip when second_pass references
 untracked file

In source_analyzer.second_pass, the list of files we iterate can include
paths that first_pass did not add to self.files (e.g. parse errors,
LSP-induced timeouts, or rare edge cases where a candidate file is
present in the input list but never makes it into the files map).
Previously this raised KeyError and aborted the entire index. Hit on
sympy/polys/distributedmodules.py during bench calibration of sympy-12481.

Skip with a WARN log instead so a single bad file no longer takes down
the whole index.

Also bump mini_runner httpx timeout 1800s -> 7200s; observed sympy-12481
index taking >30 min in the field, so the API server went on to finish
indexing successfully while the runner had already timed out and given up.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 api/analyzers/source_analyzer.py | 11 ++++++++++-
 bench/runners/mini_runner.py     |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/api/analyzers/source_analyzer.py b/api/analyzers/source_analyzer.py
index 1b8f85b1..ead8707a 100644
--- a/api/analyzers/source_analyzer.py
+++ b/api/analyzers/source_analyzer.py
@@ -166,7 +166,16 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None:
         with lsps[".java"].start_server(), lsps[".py"].start_server(), lsps[".cs"].start_server():
             files_len = len(self.files)
             for i, file_path in enumerate(files):
-                file = self.files[file_path]
+                file = self.files.get(file_path)
+                if file is None:
+                    # first_pass skipped this file (e.g. parse error, empty,
+                    # or ignored after entering the candidate list). Skip
+                    # in second_pass too instead of crashing the whole index.
+                    logging.warning(
+                        "second_pass: %s not in files map (first_pass skipped it); skipping",
+                        file_path,
+                    )
+                    continue
                 logging.info(f'Processing file ({i + 1}/{files_len}): {file_path}')
                 for _, entity in file.entities.items():
                     entity.resolved_symbol(lambda key, symbol, fp=file_path: analyzers[fp.suffix].resolve_symbol(self.files, lsps[fp.suffix], fp, path, key, symbol))
diff --git a/bench/runners/mini_runner.py b/bench/runners/mini_runner.py
index 03e32e7b..3081bd6e 100644
--- a/bench/runners/mini_runner.py
+++ b/bench/runners/mini_runner.py
@@ -290,7 +290,7 @@ def _ensure_indexed(repo_path: Path) -> None:
             "rubi/rules",  # sympy: blocks indexing for ~hours otherwise
             "build", "dist", ".tox", ".eggs",
         ]
-        with httpx.Client(timeout=1800.0, headers=headers) as c:
+        with httpx.Client(timeout=7200.0, headers=headers) as c:
             r = c.post(
                 f"{base}/api/analyze_folder",
                 json={"path": str(repo_path), "ignore": default_ignore},

From ec7fac6cc80e9bbdbfd6dc20696696fbb87fe35c Mon Sep 17 00:00:00 2001
From: Dvir Dukhan <12258836+DvirDukhan@users.noreply.github.com>
Date: Thu, 28 May 2026 09:19:47 +0300
Subject: [PATCH 7/7] bench: add start-api.sh helper enabling tree-sitter fast
 resolver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After T18 (#691) + query-cache (#692), code_graph indexing on
pytest-6202 drops from 247s to 3.7s — but only if the API server is
launched with CODE_GRAPH_PY_RESOLVER=tree_sitter. This helper bakes
in that env plus the public/permissive flags the bench harness
expects, so calibration runs hit the fast path without manual setup.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 bench/scripts/start-api.sh | 44 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100755 bench/scripts/start-api.sh

diff --git a/bench/scripts/start-api.sh b/bench/scripts/start-api.sh
new file mode 100755
index 00000000..4e55f673
--- /dev/null
+++ b/bench/scripts/start-api.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Launch the code-graph API server with the fast tree-sitter Python
+# resolver enabled (PR #691 + #692). This is what the bench harness
+# expects to talk to at 127.0.0.1:5000.
+#
+# Usage:
+#   bench/scripts/start-api.sh                  # default port 5000
+#   bench/scripts/start-api.sh --port 5001
+#
+# Prereqs:
+#   - FalkorDB running. For native falkordb on 6380 set
+#     FALKORDB_HOST=127.0.0.1 FALKORDB_PORT=6380 before invoking.
+#   - uv on PATH.
+#   - cwd must be a code-graph worktree containing api/ with PR #691
+#     and PR #692 applied (i.e. the dvirdukhan/query-cache branch tip
+#     or staging once those are merged).
+
+set -euo pipefail
+
+PORT=5000
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --port) PORT="${2:?--port requires a value}"; shift 2 ;;
+        *) echo "Unknown arg: $1" >&2; exit 1 ;;
+    esac
+done
+
+# Tree-sitter static resolver — turns Python indexing from minutes to
+# seconds. Default is still jedi, so callers must opt in explicitly.
+export CODE_GRAPH_PY_RESOLVER="${CODE_GRAPH_PY_RESOLVER:-tree_sitter}"
+
+# Allow the bench harness to analyze any folder (the bench worktrees
+# live under bench/cache/worktrees). Override to restrict the scope.
+export ALLOWED_ANALYSIS_DIR="${ALLOWED_ANALYSIS_DIR:-/}"
+
+# Public mode: bench harness does not bother with bearer tokens.
+export CODE_GRAPH_PUBLIC="${CODE_GRAPH_PUBLIC:-1}"
+
+echo "[start-api] CODE_GRAPH_PY_RESOLVER=$CODE_GRAPH_PY_RESOLVER"
+echo "[start-api] CODE_GRAPH_PUBLIC=$CODE_GRAPH_PUBLIC"
+echo "[start-api] FALKORDB_HOST=${FALKORDB_HOST:-127.0.0.1} FALKORDB_PORT=${FALKORDB_PORT:-6379}"
+echo "[start-api] Listening on 127.0.0.1:$PORT"
+
+exec uv run uvicorn api.index:app --host 127.0.0.1 --port "$PORT"