Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions packages/agent/src/adapters/claude/claude-agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1656,12 +1656,24 @@ export class ClaudeAcpAgent extends BaseAcpAgent {
// needs so the session doesn't pin the whole meta object.
const baseBranch = meta?.baseBranch;
const environment = meta?.environment;
// Snapshot the external MCP servers (before the in-process local server is
// mixed in below) so run_mcp_script can dial them with inherited auth.
const externalMcpServers: Record<string, McpServerConfig> =
supportsMcpInjection(earlyModelId)
? parseMcpServers(params, this.logger)
: {};
const buildInProcessMcpServers = (): Record<
string,
McpSdkServerConfigWithInstance
> => {
const server = createLocalToolsMcpServer(
{ cwd, token: resolveGithubToken(), taskId, baseBranch },
{
cwd,
token: resolveGithubToken(),
taskId,
baseBranch,
scriptableMcpServers: externalMcpServers,
},
{ environment },
);
return server ? { [LOCAL_TOOLS_MCP_NAME]: server } : {};
Expand All @@ -1676,9 +1688,7 @@ export class ClaudeAcpAgent extends BaseAcpAgent {
}

const mcpServers: Record<string, McpServerConfig> = {
...(supportsMcpInjection(earlyModelId)
? parseMcpServers(params, this.logger)
: {}),
...externalMcpServers,
...initialInProcess,
};

Expand Down
14 changes: 9 additions & 5 deletions packages/agent/src/adapters/codex/codex-agent.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -424,16 +424,20 @@ describe("CodexAcpAgent", () => {
_meta: { systemPrompt: string };
};

// Existing MCP server is preserved; ours is appended.
expect(forwarded.mcpServers).toHaveLength(2);
// Existing MCP server is preserved; the structured-output server is
// appended. The local-tools server is also present because the existing
// server makes run_mcp_script/list_mcp_tools available.
expect(forwarded.mcpServers[0].name).toBe("existing");
expect(forwarded.mcpServers[1].name).toBe("posthog_output");
expect(forwarded.mcpServers[1].command).toBe(process.execPath);
const outputServer = forwarded.mcpServers.find(
(s) => s.name === "posthog_output",
);
expect(outputServer).toBeDefined();
expect(outputServer?.command).toBe(process.execPath);

// The schema is forwarded base64-encoded so codex-acp doesn't have
// to escape it through a shell.
const envEntry = (
forwarded.mcpServers[1].env as Array<{ name: string; value: string }>
outputServer?.env as Array<{ name: string; value: string }>
).find((e) => e.name === "POSTHOG_OUTPUT_SCHEMA");
expect(envEntry).toBeDefined();
const decoded = JSON.parse(
Expand Down
5 changes: 5 additions & 0 deletions packages/agent/src/adapters/codex/codex-agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ import {
emptyBaseline,
estimateTokens,
} from "../claude/context-breakdown";
import { parseMcpServers } from "../claude/session/mcp-config";
import { classifyAgentError } from "../error-classification";
import {
enabledLocalTools,
Expand Down Expand Up @@ -646,6 +647,10 @@ export class CodexAcpAgent extends BaseAcpAgent {
token: resolveGithubToken(),
taskId: resolveTaskId(meta),
baseBranch: meta?.baseBranch,
// Reuse the ACP MCP servers so run_mcp_script can dial them (auth inherited).
scriptableMcpServers: parseMcpServers({
mcpServers: request.mcpServers ?? [],
}),
};
const tools = enabledLocalTools(ctx, meta);
if (tools.length === 0) {
Expand Down
3 changes: 3 additions & 0 deletions packages/agent/src/adapters/local-tools/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { listMcpToolsTool, runMcpScriptTool } from "../../mcp-scripting/tools";
import type { LocalTool, LocalToolCtx, LocalToolGateMeta } from "./registry";
import { signedCommitTool } from "./tools/signed-commit";
import { signedMergeTool } from "./tools/signed-merge";
Expand All @@ -17,6 +18,8 @@ export const LOCAL_TOOLS: LocalTool[] = [
signedCommitTool,
signedMergeTool,
signedRewriteTool,
runMcpScriptTool,
listMcpToolsTool,
];

/** Tools whose gate passes for the given context — the set to actually expose. */
Expand Down
10 changes: 10 additions & 0 deletions packages/agent/src/adapters/local-tools/registry.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import type { McpServerConfig } from "@anthropic-ai/claude-agent-sdk";
import type { z } from "zod";

/**
Expand All @@ -20,6 +21,15 @@ export interface LocalToolCtx {
* back to origin/HEAD detection when unset.
*/
baseBranch?: string;
/**
* The session's external MCP servers, keyed by name — the same
* `McpServerConfig` map handed to the Claude SDK `query()`. The MCP-scripting
* tools (`run_mcp_script` / `list_mcp_tools`) open their own clients against
* these configs, inheriting auth (stdio `env`, http/sse `headers`). In-process
* `sdk` servers are skipped — they have no transport to dial. Absent or empty
* means scripting tools self-disable.
*/
scriptableMcpServers?: Record<string, McpServerConfig>;
}

/** Minimal session-meta shape needed to gate tools (e.g. cloud-only). */
Expand Down
111 changes: 111 additions & 0 deletions packages/agent/src/mcp-scripting/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# MCP tools as scripts

Lets the agent write **one JavaScript script** that calls the connected MCP
tools as ordinary async functions, instead of orchestrating them one tool call
at a time. The classic pain it removes: a workflow that needs 100 sequential tool
calls (list, then act per item) becomes a single script with a loop.

```js
const issues = await tools.linear.listIssues({ teamId })
const stale = issues.filter((i) => i.status === "backlog")
for (const i of stale) {
await tools.linear.createComment({ issueId: i.id, body: "bump" })
}
return { bumped: stale.length }
```

Exposed to the model as two local tools (registered in
`../adapters/local-tools/index.ts`):

- **`list_mcp_tools`** — returns `.d.ts`-style signatures for every
`tools.<server>.<tool>(args)` call available, generated from each tool's MCP
input schema. Call it first to discover what to call.
- **`run_mcp_script`** — takes `{ script, timeoutMs? }`, runs the script with
`tools` injected, returns `{ result, logs, error? }`.

## Pieces

| File | Responsibility |
| --- | --- |
| `client-pool.ts` | Opens/caches one MCP `Client` per server from the session's `McpServerConfig` map; `listTools` / `callTool`. |
| `proxy.ts` | Builds the lazy `tools.<server>.<tool>(args)` proxy that forwards to the pool. |
| `runner.ts` | Runs the script in a constrained `node:vm` context with a wall-clock timeout. |
| `signatures.ts` | Renders connected tools as TypeScript-style signatures. |
| `tools.ts` | The `run_mcp_script` / `list_mcp_tools` local-tool definitions. |

## Credential flow — no new auth path

The proxy dials the **exact same MCP server configs** the agent's own MCP tools
use, so authentication is inherited verbatim. The chain:

1. The ACP client sends MCP servers in the `newSession` params. `parseMcpServers`
(`../adapters/claude/session/mcp-config.ts`) turns them into a
`Record<string, McpServerConfig>` — **stdio** entries carry `env`, **http/sse**
entries carry `headers`. This map is the single credential source.
2. Both adapters snapshot that map into `LocalToolCtx.scriptableMcpServers`:
`claude-agent.ts` passes it *before* the in-process local-tools server is mixed
in (so scripts never try to dial an in-process `sdk` server — those have no
transport), and `codex-agent.ts` derives it from the same ACP `mcpServers` via
`parseMcpServers`. The scripting tools self-disable when no external servers
are present.
3. On a `run_mcp_script` / `list_mcp_tools` call, `McpClientPool` reads a config
and constructs the matching MCP SDK transport:
- `stdio` → `StdioClientTransport` with `command`/`args`/`env` (the session env
is inherited too, so stdio servers keep ambient credentials).
- `http` → `StreamableHTTPClientTransport` with `requestInit.headers`.
- `sse` → `SSEClientTransport` with `requestInit.headers`.

There is no separate token store, no re-auth, and nothing the model can set: a
script can only reach servers the session was already authorized for, with the
same credentials those tools already had.

## Sandbox model

`runner.ts` executes the script in a `node:vm` context whose globals are an
explicit allowlist:

- **Granted:** `tools`, a captured `console`, and pure stateless helpers
(`JSON`, `Math`, `Date`, `Array`/`Object`/`Map`/`Set`/…, `structuredClone`,
`TextEncoder`/`TextDecoder`, `URL`/`URLSearchParams`, `setTimeout`/`clearTimeout`).
- **Denied:** `require`, `import`, `process`, ambient authority via
`global`/`globalThis`, `Buffer`, `fetch`, and filesystem access — so the
**only** way out is `tools.*`.
- **No dynamic code:** the context is created with
`codeGeneration: { strings: false, wasm: false }`, so `new Function(...)` /
`eval` throw — closing the most common `vm` escape via the `Function`
constructor.
- **Wall-clock timeout:** default 30s, capped at 120s. `node:vm` can't interrupt a
pending Promise (e.g. a hung tool call), so the timeout *races* script
completion to bound total time; the per-server MCP tool timeout still applies to
each individual call.

`node:vm` is **not** a hard security boundary against hostile code sharing the
process — but here the script author is the same agent that already calls these
tools directly. The goal is to **remove ambient authority** (fs/net/env) and
funnel every side effect through the audited `tools` path, not to contain an
adversary. Cloud runs additionally execute the whole agent inside a sandbox,
which is the real isolation layer.

## Adopt vs build

We researched the "code mode for MCP" ecosystem (Cloudflare *Code Mode*,
`@utcp/code-mode` / `code-mode-mcp`, `mcpac`). They all run as a **separate MCP
server or standalone process** that connects to MCP clients via its own config
(or target Cloudflare `workerd` isolates), and several add a second abstraction
(UTCP) on top of MCP. None reuse an existing in-process `McpServerConfig` map
with already-resolved credentials — which is the entire integration we need.

Adopting one would mean standing up another process, re-plumbing auth into it,
and taking a heavier dependency (some are MPL-2.0-licensed) for what is a ~5-file thin layer
over the MCP SDK `Client` we already depend on. **Decision: build.** The layer is
small, has no new runtime dependencies (only `@modelcontextprotocol/sdk` and
`zod`, both already present), and inherits auth for free.

## Tests

`mcp-scripting.test.ts` covers proxy generation, a script calling a tool,
looping/batching, timeout enforcement, error surfacing, signature rendering, and
sandbox-escape attempts (`require`/`process`/`global`/`Buffer`/`fetch`/`new
Function` denied). `client-pool.integration.test.ts` spins up a real stdio MCP
server (`fixtures/echo-mcp-server.mjs`) and drives it end-to-end through a
script, including asserting that stdio `env` reaches the server (the credential
path).
121 changes: 121 additions & 0 deletions packages/agent/src/mcp-scripting/client-pool.integration.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import { fileURLToPath } from "node:url";
import type { McpServerConfig } from "@anthropic-ai/claude-agent-sdk";
import { afterEach, describe, expect, it } from "vitest";
import { McpClientPool } from "./client-pool";
import { buildToolsProxy } from "./proxy";
import { runScript } from "./runner";
import { listMcpToolsTool, runMcpScriptTool } from "./tools";

// Filesystem path of the echo MCP server fixture script, resolved relative to
// this module's URL and converted from a file: URL to a plain path so it can
// be passed as a command-line argument to the spawned node process.
const ECHO_SERVER = fileURLToPath(
new URL("./fixtures/echo-mcp-server.mjs", import.meta.url),
);

// Integration tests for McpClientPool that spawn the echo fixture as a real
// stdio MCP server (run with the current node binary) instead of mocking the
// transport.
describe("McpClientPool (real stdio MCP server)", () => {
  // The pool opened by the currently running test, closed again in afterEach.
  let activePool: McpClientPool | undefined;

  // Fresh stdio config that launches the echo fixture with the current node binary.
  const echoServer = () => ({
    type: "stdio" as const,
    command: process.execPath,
    args: [ECHO_SERVER],
  });

  // Creates a pool and records it so afterEach can close it.
  const openPool = (
    servers: ConstructorParameters<typeof McpClientPool>[0],
  ): McpClientPool => {
    activePool = new McpClientPool(servers);
    return activePool;
  };

  afterEach(async () => {
    await activePool?.close();
    activePool = undefined;
  });

  it("lists tools and calls them over a real stdio transport", async () => {
    const echoPool = openPool({ echo: echoServer() });

    const listed = await echoPool.listTools("echo");
    const toolNames = listed.map((t) => t.name);
    toolNames.sort();
    expect(toolNames).toEqual(["add", "whoami"]);

    const added = await echoPool.callTool("echo", "add", { a: 2, b: 3 });
    expect(added.isError).toBe(false);
    expect(added.value).toEqual({ sum: 5 });
  });

  it("inherits stdio env as the credential path", async () => {
    const echoPool = openPool({
      echo: { ...echoServer(), env: { ECHO_SECRET: "s3cr3t-token" } },
    });

    // The test expects `whoami` to return the value passed in through `env`.
    const identity = await echoPool.callTool("echo", "whoami", {});
    expect(identity.value).toBe("s3cr3t-token");
  });

  it(
    "drives the real server end-to-end through a script",
    async () => {
      const echoPool = openPool({ echo: echoServer() });
      const toolsProxy = buildToolsProxy(echoPool, echoPool.serverNames());

      // Accumulates 1 + 2 + 3 through three sequential `add` tool calls.
      const accumulateScript = [
        "",
        "let total = 0",
        "for (let i = 1; i <= 3; i++) {",
        "const r = await tools.echo.add({ a: total, b: i })",
        "total = r.sum",
        "}",
        "return total",
        "",
      ].join("\n");

      const outcome = await runScript({
        tools: toolsProxy,
        script: accumulateScript,
      });

      expect(outcome.error).toBeUndefined();
      expect(outcome.result).toBe(6);
    },
    15_000,
  );

  it("excludes in-process sdk servers from serverNames", () => {
    const mixedPool = openPool({
      echo: echoServer(),
      // sdk servers have no dialable transport; cast to satisfy the union.
      inproc: { type: "sdk", name: "inproc" } as never,
    });

    expect(mixedPool.serverNames()).toEqual(["echo"]);
  });
});

// Exercises the `run_mcp_script` / `list_mcp_tools` local-tool definitions
// against the echo fixture spawned as a real stdio MCP server.
describe("scripting local tools (real stdio MCP server)", () => {
  // Server map shared by the tests below: just the echo fixture over stdio.
  const echoOnly: Record<string, McpServerConfig> = {
    echo: { type: "stdio", command: process.execPath, args: [ECHO_SERVER] },
  };

  it("run_mcp_script gates on having scriptable servers", () => {
    const enabledWithoutServers = runMcpScriptTool.isEnabled(
      { cwd: "/r" },
      undefined,
    );
    expect(enabledWithoutServers).toBe(false);

    const enabledWithServers = runMcpScriptTool.isEnabled(
      { cwd: "/r", scriptableMcpServers: echoOnly },
      undefined,
    );
    expect(enabledWithServers).toBe(true);
  });

  it(
    "list_mcp_tools renders real signatures and notes unreachable servers",
    async () => {
      // `broken` points at a binary that does not exist, so it cannot be dialed.
      const servers: Record<string, McpServerConfig> = {
        ...echoOnly,
        broken: { type: "stdio", command: "definitely-not-a-real-binary" },
      };

      const listing = await listMcpToolsTool.handler(
        { cwd: "/r", scriptableMcpServers: servers },
        {},
      );

      const rendered = listing.content[0].text;
      for (const fragment of [
        "echo",
        "add(args: {",
        "Unreachable servers",
        "broken",
      ]) {
        expect(rendered).toContain(fragment);
      }
    },
    15_000,
  );

  it(
    "run_mcp_script executes against the real server end-to-end",
    async () => {
      const ctx = { cwd: "/r", scriptableMcpServers: echoOnly };
      const args = {
        script: "return (await tools.echo.add({ a: 40, b: 2 })).sum",
      };

      const outcome = await runMcpScriptTool.handler(ctx, args);

      expect(outcome.isError).toBeUndefined();
      expect(outcome.content[0].text).toContain('"result": 42');
    },
    15_000,
  );
});
Loading
Loading