From 1b36da82ea1f056ea6931d507c7c6a2e2e77ffbf Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Tue, 3 Mar 2026 17:25:50 -0500
Subject: [PATCH 1/2] feat: Add bun test runner integration

---
 .github/workflows/bun-test.yaml               |  43 +++
 CLAUDE.md                                     |   8 +-
 js/Makefile                                   |  14 +-
 js/examples/bun-test/bun-test-example.test.ts | 244 +++++++++++++
 js/package.json                               |   5 +-
 js/src/exports.ts                             |   1 +
 js/src/wrappers/bun-test/README.md            | 206 +++++++++++
 .../bun-test/bun-test-example.test.ts         | 134 +++++++
 .../wrappers/bun-test/bun-test-span.test.ts   | 127 +++++++
 js/src/wrappers/bun-test/bun-test.test.ts     | 329 ++++++++++++++++++
 js/src/wrappers/bun-test/index.ts             |  10 +
 js/src/wrappers/bun-test/suite.ts             | 208 +++++++++++
 js/src/wrappers/bun-test/test-helpers.ts      |  36 ++
 js/src/wrappers/bun-test/tsconfig.json        |  13 +
 js/src/wrappers/bun-test/types.ts             | 169 +++++++++
 js/tsconfig.json                              |   3 +-
 js/vitest.config.js                           |   2 +
 mise.toml                                     |   1 +
 pnpm-lock.yaml                                |  17 +
 19 files changed, 1563 insertions(+), 7 deletions(-)
 create mode 100644 .github/workflows/bun-test.yaml
 create mode 100644 js/examples/bun-test/bun-test-example.test.ts
 create mode 100644 js/src/wrappers/bun-test/README.md
 create mode 100644 js/src/wrappers/bun-test/bun-test-example.test.ts
 create mode 100644 js/src/wrappers/bun-test/bun-test-span.test.ts
 create mode 100644 js/src/wrappers/bun-test/bun-test.test.ts
 create mode 100644 js/src/wrappers/bun-test/index.ts
 create mode 100644 js/src/wrappers/bun-test/suite.ts
 create mode 100644 js/src/wrappers/bun-test/test-helpers.ts
 create mode 100644 js/src/wrappers/bun-test/tsconfig.json
 create mode 100644 js/src/wrappers/bun-test/types.ts

diff --git a/.github/workflows/bun-test.yaml b/.github/workflows/bun-test.yaml
new file mode 100644
index 000000000..37b60e494
--- /dev/null
+++ b/.github/workflows/bun-test.yaml
@@ -0,0 +1,43 @@
+name: bun-test
+
+on:
+  pull_request:
+    paths:
+      - "js/**"
+      - ".github/workflows/bun-test.yaml"
+      - "pnpm-lock.yaml"
+  push:
+    branches: [main]
+
+jobs:
+  bun-test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - uses: pnpm/action-setup@v4
+
+      - uses: oven-sh/setup-bun@v2
+
+      - name: Install dependencies
+        run: pnpm install --frozen-lockfile
+
+      - name: Build
+        run: pnpm run build
+
+      # Unit tests (bun test)
+      - name: Run unit tests
+        run: cd js && bun test src/wrappers/bun-test/
+
+      # Integration tests (bun test) - need bun runtime + API keys
+      - name: Run bun integration tests
+        env:
+          BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: cd js/examples/bun-test && bun test
diff --git a/CLAUDE.md b/CLAUDE.md
index a73257713..25d7f422e 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -2,7 +2,7 @@
 
 JavaScript client for Braintrust, plus wrapper libraries for OpenAI, Anthropic, and other AI providers.
 
-This repo uses `pnpm` as it's package manager.
+This repo uses `pnpm` as its package manager and [mise](https://mise.jdx.dev/) to manage tool versions.
 
 ## Structure
 
@@ -23,7 +23,13 @@ sdk/
 
 ## Setup
 
+This repo uses [mise](https://mise.jdx.dev/) to manage tool versions (e.g. `pnpm`). The root `mise.toml` pins versions and runs `pnpm install` automatically on `mise install`.
+
 ```bash
+mise install      # Install tools and dependencies (recommended)
+# or manually:
 pnpm install      # Install dependencies
 pnpm run build    # Build all packages
 ```
+
+mise also auto-loads a `.env` file if present — see `.env.example` to configure API keys.
diff --git a/js/Makefile b/js/Makefile
index 0e1a9b396..d90b59323 100644
--- a/js/Makefile
+++ b/js/Makefile
@@ -17,6 +17,7 @@ help:
 	@echo "  make test-ai-sdk-v6     - Run AI SDK v6 wrapper tests"
 	@echo "  make test-claude-agent-sdk - Run Claude Agent SDK wrapper tests"
 	@echo "  make test-vitest       - Run Vitest wrapper tests"
+	@echo "  make test-bun-test     - Run Bun test runner wrapper tests"
 	@echo "  make test-api-compat    - Run API compatibility tests"
 	@echo "  make bench              - Run queue performance benchmarks"
 	@echo "  make test-latest        - Run core + latest versions of wrappers"
@@ -28,7 +29,7 @@ help:
 	@echo ""
 	@echo "See smoke/README.md for details on smoke test infrastructure"
 
-.PHONY: help bench build clean test test-core test-openai test-anthropic test-google-genai test-ai-sdk test-ai-sdk-v5 test-ai-sdk-v6 test-claude-agent-sdk test-vitest test-latest install-optional-deps publish-beta-local test-smoke
+.PHONY: help bench build clean test test-core test-openai test-anthropic test-google-genai test-ai-sdk test-ai-sdk-v5 test-ai-sdk-v6 test-claude-agent-sdk test-vitest test-bun-test test-latest install-optional-deps publish-beta-local test-smoke
 
 # -------------------------------------------------------------------------------------------------	#
 # Anthropic testing
@@ -82,6 +83,13 @@ test-claude-agent-sdk:
 test-vitest:
 	cd src/wrappers/vitest && pnpm install && pnpm test
 
+# -------------------------------------------------------------------------------------------------
+# Bun test runner testing
+# -------------------------------------------------------------------------------------------------
+
+test-bun-test:
+	bun test src/wrappers/bun-test/
+
 # -------------------------------------------------------------------------------------------------
 # OpenAI testing
 # -------------------------------------------------------------------------------------------------
@@ -122,10 +130,10 @@ test-api-compat:
 	pnpm test:api-compat
 
 # Test everything
-test: test-core test-openai test-anthropic test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk
+test: test-core test-openai test-anthropic test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk test-bun-test
 
 # Test the core and the latest versions of wrappers.
-test-latest: test-core test-anthropic-latest test-openai-latest test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk
+test-latest: test-core test-anthropic-latest test-openai-latest test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk test-bun-test
 
 
 prune:
diff --git a/js/examples/bun-test/bun-test-example.test.ts b/js/examples/bun-test/bun-test-example.test.ts
new file mode 100644
index 000000000..318ae8c37
--- /dev/null
+++ b/js/examples/bun-test/bun-test-example.test.ts
@@ -0,0 +1,244 @@
+/**
+ * Bun Test Runner + Braintrust Example
+ *
+ * Demonstrates using initBunTestSuite to track test results as
+ * Braintrust experiments using the Bun test runner.
+ *
+ * Run with: bun test
+ * Requires: BRAINTRUST_API_KEY and OPENAI_API_KEY environment variables
+ */
+
+import { test, describe, afterAll } from "bun:test";
+import { configureNode } from "../../src/node";
+import { initBunTestSuite } from "../../src/wrappers/bun-test/index";
+import { _exportsForTestingOnly, login, currentSpan } from "../../src/logger";
+import { wrapOpenAI } from "../../src/wrappers/oai";
+import OpenAI from "openai";
+
+configureNode();
+
+_exportsForTestingOnly.setInitialTestState();
+await login({ apiKey: process.env.BRAINTRUST_API_KEY });
+
+if (!process.env.OPENAI_API_KEY) {
+  throw new Error(
+    "OPENAI_API_KEY environment variable must be set to run examples/bun-test/bun-test-example.test.ts",
+  );
+}
+
+const openai = wrapOpenAI(new OpenAI({ apiKey: process.env.OPENAI_API_KEY }));
+
+// ============================================================
+// Basic Usage — scorers, data expansion, logging
+// ============================================================
+
+describe("Translation Evaluation", () => {
+  const suite = initBunTestSuite({
+    projectName: "example-bun-test",
+    afterAll,
+    test,
+  });
+
+  // --- Single test with input/expected and a scorer ---
+
+  suite.test(
+    "basic translation test",
+    {
+      input: { text: "Hello", targetLang: "Spanish" },
+      expected: "Hola",
+      metadata: { difficulty: "easy" },
+      tags: ["translation", "spanish"],
+      scorers: [
+        ({ output, expected }) => ({
+          name: "exact_match",
+          score:
+            String(output).toLowerCase().trim() ===
+            String(expected).toLowerCase().trim()
+              ? 1
+              : 0,
+        }),
+      ],
+    },
+    async ({ input }) => {
+      const { text, targetLang } = input as {
+        text: string;
+        targetLang: string;
+      };
+      const response = await openai.chat.completions.create({
+        model: "gpt-3.5-turbo",
+        messages: [
+          {
+            role: "user",
+            content: `Translate "${text}" to ${targetLang}. Respond with ONLY the translation.`,
+          },
+        ],
+        temperature: 0,
+      });
+      return response.choices[0]?.message?.content?.trim() || "";
+    },
+  );
+
+  // --- Data expansion with a loop ---
+
+  const translationCases = [
+    {
+      input: { text: "Good morning", targetLang: "Spanish" },
+      expected: "Buenos días",
+    },
+    {
+      input: { text: "Thank you very much", targetLang: "Spanish" },
+      expected: "Muchas gracias",
+    },
+    {
+      input: { text: "Goodbye", targetLang: "French" },
+      expected: "Au revoir",
+    },
+  ];
+
+  for (const [i, record] of translationCases.entries()) {
+    suite.test(
+      `translation [${i}]: "${record.input.text}" → ${record.input.targetLang}`,
+      {
+        ...record,
+        scorers: [
+          ({ output, expected }) => {
+            const outputStr = String(output).toLowerCase().trim();
+            const expectedStr = String(expected).toLowerCase().trim();
+            const outputWords = new Set(outputStr.split(" "));
+            const expectedWords = expectedStr.split(" ");
+            const matches = expectedWords.filter((w) =>
+              outputWords.has(w),
+            ).length;
+            return {
+              name: "word_overlap",
+              score: matches / expectedWords.length,
+              metadata: { matches, total: expectedWords.length },
+            };
+          },
+        ],
+      },
+      async ({ input }) => {
+        const { text, targetLang } = input as {
+          text: string;
+          targetLang: string;
+        };
+        const response = await openai.chat.completions.create({
+          model: "gpt-3.5-turbo",
+          messages: [
+            {
+              role: "user",
+              content: `Translate "${text}" to ${targetLang}. Respond with ONLY the translation.`,
+            },
+          ],
+          temperature: 0,
+        });
+        return response.choices[0]?.message?.content?.trim() || "";
+      },
+    );
+  }
+
+  // --- currentSpan() for custom logging ---
+
+  suite.test(
+    "translation with extra logging",
+    {
+      input: { text: "How are you?", targetLang: "Spanish" },
+      expected: "¿Cómo estás?",
+    },
+    async ({ input }) => {
+      const { text, targetLang } = input as {
+        text: string;
+        targetLang: string;
+      };
+      const response = await openai.chat.completions.create({
+        model: "gpt-3.5-turbo",
+        messages: [
+          {
+            role: "user",
+            content: `Translate "${text}" to ${targetLang}. Respond with ONLY the translation.`,
+          },
+        ],
+        temperature: 0,
+      });
+
+      const result = response.choices[0]?.message?.content?.trim() || "";
+
+      currentSpan().log({
+        output: { tokens: response.usage, model: response.model },
+        scores: { human_quality: 0.95 },
+        metadata: { evaluator: "example" },
+      });
+
+      return result;
+    },
+  );
+});
+
+// ============================================================
+// Multiple Scorers
+// ============================================================
+
+describe("Multiple Scorers", () => {
+  const suite = initBunTestSuite({
+    projectName: "example-bun-test",
+    afterAll,
+    test,
+  });
+
+  suite.test(
+    "translation with multiple custom scorers",
+    {
+      input: { text: "Hello world", targetLang: "Spanish" },
+      expected: "Hola mundo",
+      scorers: [
+        ({ output, expected }) => ({
+          name: "exact_match",
+          score:
+            String(output).toLowerCase().trim() ===
+            String(expected).toLowerCase().trim()
+              ? 1
+              : 0,
+        }),
+        ({ output, expected }) => {
+          const outputWords = new Set(
+            String(output).toLowerCase().trim().split(" "),
+          );
+          const expectedWords = String(expected)
+            .toLowerCase()
+            .trim()
+            .split(" ");
+          const matches = expectedWords.filter((w) =>
+            outputWords.has(w),
+          ).length;
+          return {
+            name: "word_overlap",
+            score: matches / expectedWords.length,
+            metadata: { matches, total: expectedWords.length },
+          };
+        },
+        ({ output }) => ({
+          name: "conciseness",
+          score: String(output).length < 20 ? 1 : 0.7,
+          metadata: { length: String(output).length },
+        }),
+      ],
+    },
+    async ({ input }) => {
+      const { text, targetLang } = input as {
+        text: string;
+        targetLang: string;
+      };
+      const response = await openai.chat.completions.create({
+        model: "gpt-3.5-turbo",
+        messages: [
+          {
+            role: "user",
+            content: `Translate "${text}" to ${targetLang}. Respond with ONLY the translation.`,
+          },
+        ],
+        temperature: 0,
+      });
+      return response.choices[0]?.message?.content?.trim() || "";
+    },
+  );
+});
diff --git a/js/package.json b/js/package.json
index 96ca6635d..81a81bc8e 100644
--- a/js/package.json
+++ b/js/package.json
@@ -140,6 +140,7 @@
     "@openai/agents": "^0.0.14",
     "@types/argparse": "^2.0.14",
     "@types/async": "^3.2.24",
+    "@types/bun": "^1.3.10",
     "@types/cli-progress": "^3.11.5",
     "@types/cors": "^2.8.17",
     "@types/express": "^5.0.0",
@@ -160,8 +161,6 @@
     "jiti": "^2.6.1",
     "openapi-zod-client": "^1.18.3",
     "rollup": "^4.28.1",
-    "vite": "^5.4.14",
-    "webpack": "^5.97.1",
     "tar": "^7.5.2",
     "tinybench": "^4.0.1",
     "ts-jest": "^29.1.4",
@@ -170,8 +169,10 @@
     "typedoc": "^0.25.13",
     "typedoc-plugin-markdown": "^3.17.1",
     "typescript": "5.4.4",
+    "vite": "^5.4.14",
     "vite-tsconfig-paths": "^4.3.2",
     "vitest": "^2.1.9",
+    "webpack": "^5.97.1",
     "zod": "^3.25.34"
   },
   "dependencies": {
diff --git a/js/src/exports.ts b/js/src/exports.ts
index 6eb2b5c1f..81bfdc7f4 100644
--- a/js/src/exports.ts
+++ b/js/src/exports.ts
@@ -175,6 +175,7 @@ export { wrapClaudeAgentSDK } from "./wrappers/claude-agent-sdk/claude-agent-sdk
 export { wrapGoogleGenAI } from "./wrappers/google-genai";
 export { wrapVitest } from "./wrappers/vitest";
 export { initNodeTestSuite } from "./wrappers/node-test";
+export { initBunTestSuite } from "./wrappers/bun-test";
 
 export * as graph from "./graph-framework";
 
diff --git a/js/src/wrappers/bun-test/README.md b/js/src/wrappers/bun-test/README.md
new file mode 100644
index 000000000..afe4623b0
--- /dev/null
+++ b/js/src/wrappers/bun-test/README.md
@@ -0,0 +1,206 @@
+# Braintrust Bun Test Runner Integration
+
+Track your Bun test results as [Braintrust](https://braintrust.dev) experiments using [bun:test](https://bun.sh/docs/test/writing).
+
+## Quick Start
+
+```typescript
+import { test, describe, afterAll, expect } from "bun:test";
+import { initBunTestSuite } from "braintrust";
+
+describe("My Evaluation Suite", () => {
+  const suite = initBunTestSuite({
+    projectName: "my-project",
+    afterAll, // Auto-flush results after all tests
+    test, // Required: bun:test's test function
+  });
+
+  suite.test(
+    "evaluates output",
+    {
+      input: { text: "hello" },
+      expected: "hola",
+      scorers: [
+        ({ output, expected }) => ({
+          name: "exact_match",
+          score: output === expected ? 1 : 0,
+        }),
+      ],
+    },
+    async ({ input }) => {
+      return await translate((input as { text: string }).text);
+    },
+  );
+
+  // Untracked tests use regular test() as normal
+  test("sanity check", () => {
+    expect(1 + 1).toBe(2);
+  });
+});
+```
+
+## Core Features
+
+- **Composable**: `suite.test()` wraps `bun:test`'s `test()` — use `test()` directly for untracked tests
+- **Experiment tracking**: Each test case creates a span with input, output, expected, and scores
+- **Automatic scoring**: Attach scorer functions to evaluate outputs
+- **Data expansion**: Use `for` loops for parameterized tests
+- **Auto-flush**: Pass `afterAll` to automatically flush results when tests finish
+
+## API Reference
+
+### `initBunTestSuite(config)`
+
+Creates a new test suite with Braintrust experiment tracking.
+
+**Config:**
+
+| Field            | Type       | Required | Description                                      |
+| ---------------- | ---------- | -------- | ------------------------------------------------ |
+| `projectName`    | `string`   | Yes      | Braintrust project name                          |
+| `test`           | `Function` | Yes      | `test` from `bun:test`                           |
+| `experimentName` | `string`   | No       | Custom experiment name (default: auto-generated) |
+| `displaySummary` | `boolean`  | No       | Show summary after flush (default: `true`)       |
+| `afterAll`       | `Function` | No       | `afterAll` from `bun:test` for auto-flush        |
+| `onProgress`     | `Function` | No       | Callback for progress events                     |
+
+**Returns:** `BunTestSuite` with `test`, `it`, and `flush()`.
+
+### `suite.test(name, config, fn)`
+
+Creates a traced test case.
+
+**Parameters:**
+
+| Parameter | Type         | Description                           |
+| --------- | ------------ | ------------------------------------- |
+| `name`    | `string`     | Test name (used as span name)         |
+| `config`  | `EvalConfig` | Input, expected, scorers, etc.        |
+| `fn`      | `Function`   | Test function receiving `EvalContext` |
+
+**EvalConfig:**
+
+| Field      | Type                      | Description                         |
+| ---------- | ------------------------- | ----------------------------------- |
+| `input`    | `unknown`                 | Test input data                     |
+| `expected` | `unknown`                 | Expected output (passed to scorers) |
+| `metadata` | `Record<string, unknown>` | Custom metadata                     |
+| `tags`     | `string[]`                | Tags for organizing test cases      |
+| `scorers`  | `ScorerFunction[]`        | Scorer functions                    |
+| `name`     | `string`                  | Override span name                  |
+
+### Test Modifiers
+
+The following modifiers from `bun:test` are supported:
+
+```typescript
+suite.test.skip("skipped test", config, fn);
+suite.test.only("focused test", config, fn);
+suite.test.todo("planned test");
+suite.test.failing("expected failure", config, fn);
+suite.test.concurrent("parallel test", config, fn);
+suite.test.serial("sequential test", config, fn);
+
+// Conditional modifiers
+suite.test.if(condition)("conditional test", config, fn);
+suite.test.skipIf(condition)("skip-if test", config, fn);
+suite.test.todoIf(condition)("todo-if test", config, fn);
+```
+
+`suite.it` is an alias for `suite.test`.
+
+## Using Scorers
+
+```typescript
+// Basic scorer
+suite.test(
+  "my test",
+  {
+    input: "hello",
+    expected: "HELLO",
+    scorers: [
+      ({ output, expected }) => ({
+        name: "exact_match",
+        score: output === expected ? 1 : 0,
+      }),
+    ],
+  },
+  async ({ input }) => (input as string).toUpperCase(),
+);
+
+// Multiple scorers
+suite.test(
+  "multi-scored",
+  {
+    input: "hello world",
+    expected: "Hola mundo",
+    scorers: [
+      ({ output, expected }) => ({
+        name: "exact_match",
+        score: String(output) === String(expected) ? 1 : 0,
+      }),
+      ({ output }) => ({
+        name: "not_empty",
+        score: String(output).length > 0 ? 1 : 0,
+      }),
+    ],
+  },
+  async ({ input }) => await translate(input as string),
+);
+```
+
+## Data Expansion
+
+Use `for` loops instead of `test.each` for parameterized tests:
+
+```typescript
+const cases = [
+  { input: "hello", expected: "hola" },
+  { input: "goodbye", expected: "adiós" },
+];
+
+for (const [i, record] of cases.entries()) {
+  suite.test(
+    `translation [${i}]: ${record.input}`,
+    {
+      input: record.input,
+      expected: record.expected,
+      scorers: [myScorer],
+    },
+    async ({ input }) => await translate(input as string),
+  );
+}
+```
+
+## Custom Logging with `currentSpan()`
+
+```typescript
+import { currentSpan } from "braintrust";
+
+suite.test(
+  "with custom logging",
+  { input: { query: "test" } },
+  async ({ input }) => {
+    const result = await myFunction(input);
+
+    currentSpan().log({
+      output: { tokens: result.usage, model: result.model },
+      scores: { human_quality: 0.95 },
+      metadata: { evaluator: "example" },
+    });
+
+    return result.text;
+  },
+);
+```
+
+## Running
+
+```bash
+bun test
+```
+
+## Additional Resources
+
+- [Braintrust Documentation](https://www.braintrust.dev/docs)
+- [Bun Test Runner](https://bun.sh/docs/test/writing)
diff --git a/js/src/wrappers/bun-test/bun-test-example.test.ts b/js/src/wrappers/bun-test/bun-test-example.test.ts
new file mode 100644
index 000000000..a1b1a9184
--- /dev/null
+++ b/js/src/wrappers/bun-test/bun-test-example.test.ts
@@ -0,0 +1,134 @@
+import { test, describe, afterAll, beforeAll } from "bun:test";
+import { currentSpan } from "../../logger";
+import { initBunTestSuite } from "./suite";
+import {
+  setupBunTestEnv,
+  teardownBunTestEnv,
+  createTestInitExperiment,
+} from "./test-helpers";
+
+let moduleBackgroundLogger: Awaited<ReturnType<typeof setupBunTestEnv>>;
+beforeAll(async () => {
+  moduleBackgroundLogger = await setupBunTestEnv();
+});
+
+describe("Bun Test Suite Example", () => {
+  const suite = initBunTestSuite({
+    projectName: "bun-test-example",
+    displaySummary: false,
+    afterAll,
+    test,
+    _initExperiment: createTestInitExperiment(),
+    onProgress: (event) => {
+      if (event.type === "test_complete") {
+        console.log(
+          `  ${event.testName} (${event.duration.toFixed(2)}ms) - ${event.passed ? "PASSED" : "FAILED"}`,
+        );
+      }
+    },
+  });
+
+  // Basic test with suite.test()
+  suite.test(
+    "basic addition",
+    { input: { a: 2, b: 2 }, expected: 4 },
+    async ({ input }) => {
+      const { a, b } = input as { a: number; b: number };
+      return a + b;
+    },
+  );
+
+  // Test with metadata and tags
+  suite.test(
+    "multiplication with metadata",
+    {
+      input: { a: 3, b: 4 },
+      expected: 12,
+      metadata: { category: "arithmetic", difficulty: "easy" },
+      tags: ["math", "multiplication"],
+    },
+    async ({ input }) => {
+      const { a, b } = input as { a: number; b: number };
+      return a * b;
+    },
+  );
+
+  // Test with scorers
+  suite.test(
+    "string transformation with scorers",
+    {
+      input: "hello world",
+      expected: "HELLO WORLD",
+      scorers: [
+        ({ output, expected }) => ({
+          name: "exact_match",
+          score: output === expected ? 1 : 0,
+        }),
+        ({ output }) => ({
+          name: "is_uppercase",
+          score:
+            typeof output === "string" && output === output.toUpperCase()
+              ? 1
+              : 0,
+        }),
+      ],
+    },
+    async ({ input }) => {
+      return (input as string).toUpperCase();
+    },
+  );
+
+  // Data expansion: register one traced test per record via a plain loop
+  const translationData = [
+    { input: { text: "hello" }, expected: "hola" },
+    { input: { text: "goodbye" }, expected: "adiós" },
+    { input: { text: "thanks" }, expected: "gracias" },
+  ];
+
+  for (const [i, record] of translationData.entries()) {
+    suite.test(
+      `translation [${i}]`,
+      {
+        input: record.input,
+        expected: record.expected,
+        scorers: [
+          ({ output, expected }) => ({
+            name: "exact_match",
+            score: output === expected ? 1 : 0,
+          }),
+        ],
+      },
+      async ({ input }) => {
+        const translations: Record<string, string> = {
+          hello: "hola",
+          goodbye: "adiós",
+          thanks: "gracias",
+        };
+        return translations[(input as { text: string }).text] || "unknown";
+      },
+    );
+  }
+
+  // Test using currentSpan() to log extra output, scores, and metadata
+  suite.test(
+    "custom outputs and feedback",
+    { input: { query: "test query" } },
+    async ({ input }) => {
+      const result = `processed: ${(input as { query: string }).query}`;
+      currentSpan().log({
+        output: { processed_query: result, model: "test-model" },
+        scores: { relevance: 0.9 },
+        metadata: { evaluator: "human" },
+      });
+      return result;
+    },
+  );
+});
+
+afterAll(async () => {
+  await moduleBackgroundLogger.flush();
+  const spans = await moduleBackgroundLogger.drain();
+  console.log(`  Example tests captured ${spans.length} spans`);
+
+  teardownBunTestEnv();
+});
diff --git a/js/src/wrappers/bun-test/bun-test-span.test.ts b/js/src/wrappers/bun-test/bun-test-span.test.ts
new file mode 100644
index 000000000..35e7d1a63
--- /dev/null
+++ b/js/src/wrappers/bun-test/bun-test-span.test.ts
@@ -0,0 +1,127 @@
+import { test, describe, afterAll, expect, beforeAll } from "bun:test";
+import { currentSpan } from "../../logger";
+import { initBunTestSuite } from "./suite";
+import {
+  setupBunTestEnv,
+  teardownBunTestEnv,
+  createTestInitExperiment,
+} from "./test-helpers";
+
+let moduleBackgroundLogger: Awaited<ReturnType<typeof setupBunTestEnv>>;
+beforeAll(async () => {
+  moduleBackgroundLogger = await setupBunTestEnv();
+});
+
+describe("Bun Test Suite Span Creation Integration", () => {
+  const suite = initBunTestSuite({
+    projectName: "bun-test-span-integration",
+    displaySummary: false,
+    test,
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  suite.test(
+    "creates span with input and expected",
+    {
+      input: { value: 5 },
+      expected: 10,
+      metadata: { operation: "multiply" },
+      tags: ["math"],
+    },
+    async ({ input }) => {
+      return (input as { value: number }).value * 2; // 5 * 2 === expected
+    },
+  );
+
+  suite.test(
+    "creates span with custom outputs and feedback",
+    { input: "test-data" },
+    async () => {
+      currentSpan().log({
+        output: { step1: "started", step2: "completed" },
+        scores: { quality: 0.95 },
+      });
+      return "final result";
+    },
+  );
+
+  suite.test(
+    "creates span with scorer results",
+    {
+      input: "hello",
+      expected: "HELLO",
+      scorers: [
+        ({ output, expected }) => ({
+          name: "case_match",
+          score: output === expected ? 1 : 0,
+        }),
+      ],
+    },
+    async ({ input }) => {
+      return (input as string).toUpperCase();
+    },
+  );
+
+  suite.test(
+    "creates span for passing test with pass score",
+    { input: "simple" },
+    async () => {
+      return "result";
+    },
+  );
+});
+
+afterAll(async () => {
+  await moduleBackgroundLogger.flush();
+  const spans = await moduleBackgroundLogger.drain();
+
+  // Verify spans were created
+  expect(spans.length).toBeGreaterThan(0);
+
+  // Verify task type spans exist
+  const taskSpans = spans.filter(
+    (s: any) => s.span_attributes?.type === "task",
+  );
+  expect(taskSpans.length).toBeGreaterThan(0);
+
+  // Verify pass scores exist
+  const spansWithPassScore = spans.filter(
+    (s: any) => s.scores?.pass !== undefined,
+  );
+  expect(spansWithPassScore.length).toBeGreaterThan(0);
+
+  // Verify passing tests
+  const passingTests = spans.filter((s: any) => s.scores?.pass === 1);
+  expect(passingTests.length).toBeGreaterThan(0);
+
+  // Verify spans have output
+  const spansWithOutputs = spans.filter((s: any) => s.output);
+  expect(spansWithOutputs.length).toBeGreaterThan(0);
+
+  // Verify spans have input
+  const spansWithInput = spans.filter((s: any) => s.input !== undefined);
+  expect(spansWithInput.length).toBeGreaterThan(0);
+
+  // Verify spans have expected
+  const spansWithExpected = spans.filter((s: any) => s.expected !== undefined);
+  expect(spansWithExpected.length).toBeGreaterThan(0);
+
+  // Verify spans have metadata
+  const spansWithMetadata = spans.filter(
+    (s: any) => s.metadata && Object.keys(s.metadata).length > 0,
+  );
+  expect(spansWithMetadata.length).toBeGreaterThan(0);
+
+  // Verify spans have tags
+  const spansWithTags = spans.filter((s: any) => s.tags && s.tags.length > 0);
+  expect(spansWithTags.length).toBeGreaterThan(0);
+
+  // Verify custom scores (from scorers)
+  const spansWithCustomScores = spans.filter((s: any) => {
+    const scores = s.scores || {};
+    return Object.keys(scores).some((key) => key !== "pass");
+  });
+  expect(spansWithCustomScores.length).toBeGreaterThan(0);
+
+  teardownBunTestEnv();
+});
diff --git a/js/src/wrappers/bun-test/bun-test.test.ts b/js/src/wrappers/bun-test/bun-test.test.ts
new file mode 100644
index 000000000..ba316e95f
--- /dev/null
+++ b/js/src/wrappers/bun-test/bun-test.test.ts
@@ -0,0 +1,329 @@
+import { test, describe, afterAll, expect, beforeAll } from "bun:test";
+import { initBunTestSuite } from "./suite";
+import {
+  setupBunTestEnv,
+  teardownBunTestEnv,
+  createTestInitExperiment,
+} from "./test-helpers";
+
+beforeAll(async () => {
+  await setupBunTestEnv();
+});
+
+// ---------------------------------------------------------------------------
+// API surface
+// ---------------------------------------------------------------------------
+
+describe("initBunTestSuite API surface", () => {
+  const suite = initBunTestSuite({
+    projectName: "api-surface",
+    test,
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  test("suite has test, it, and flush", () => {
+    expect(suite.test).toBeDefined();
+    expect(typeof suite.test).toBe("function");
+    expect(suite.it).toBeDefined();
+    expect(suite.it).toBe(suite.test);
+    expect(suite.flush).toBeDefined();
+    expect(typeof suite.flush).toBe("function");
+  });
+
+  test("suite.test has all modifier properties", () => {
+    expect(typeof suite.test.skip).toBe("function");
+    expect(typeof suite.test.only).toBe("function");
+    expect(typeof suite.test.todo).toBe("function");
+    expect(typeof suite.test.failing).toBe("function");
+    expect(typeof suite.test.concurrent).toBe("function");
+    expect(typeof suite.test.serial).toBe("function");
+    expect(typeof suite.test.if).toBe("function");
+    expect(typeof suite.test.skipIf).toBe("function");
+    expect(typeof suite.test.todoIf).toBe("function");
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Basic traced eval
+// ---------------------------------------------------------------------------
+
+describe("basic traced eval", () => {
+  let result: unknown;
+
+  const suite = initBunTestSuite({
+    projectName: "basic-eval",
+    test,
+    displaySummary: false,
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  suite.test(
+    "runs traced eval and returns output",
+    { input: "hello" },
+    async ({ input }) => {
+      result = `processed: ${input}`;
+      return result;
+    },
+  );
+
+  test("traced eval produced correct result", () => {
+    expect(result).toBe("processed: hello");
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Scorer invocation
+// ---------------------------------------------------------------------------
+
+describe("scorer invocation", () => {
+  let scorerCallArgs: any = null;
+
+  const suite = initBunTestSuite({
+    projectName: "scorer-test",
+    test,
+    displaySummary: false,
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  suite.test(
+    "test with scorer",
+    {
+      input: { text: "hello" },
+      expected: "world",
+      metadata: { lang: "en" },
+      scorers: [
+        (args) => {
+          scorerCallArgs = args;
+          return { name: "test_score", score: 1 };
+        },
+      ],
+    },
+    async () => "output-value",
+  );
+
+  test("scorer received correct arguments", () => {
+    expect(scorerCallArgs).toEqual({
+      output: "output-value",
+      expected: "world",
+      input: { text: "hello" },
+      metadata: { lang: "en" },
+    });
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Scorers run on error
+// ---------------------------------------------------------------------------
+
+describe("scorers run even when test function throws", () => {
+  let scorerCalled = false;
+
+  const suite = initBunTestSuite({
+    projectName: "scorer-error-test",
+    test,
+    displaySummary: false,
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  // test.failing expects the test to throw — bun marks it as passed
+  suite.test.failing(
+    "error test",
+    {
+      input: "hello",
+      scorers: [
+        () => {
+          scorerCalled = true;
+          return { name: "post_error_score", score: 0 };
+        },
+      ],
+    },
+    async () => {
+      throw new Error("test failure");
+    },
+  );
+
+  test("scorer was still called", () => {
+    expect(scorerCalled).toBe(true);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Return value logged as output
+// ---------------------------------------------------------------------------
+
+describe("return value", () => {
+  let captured: unknown;
+
+  const suite = initBunTestSuite({
+    projectName: "output-test",
+    test,
+    displaySummary: false,
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  suite.test("output test", { input: "hello" }, async () => {
+    captured = { result: "some output" };
+    return captured;
+  });
+
+  test("return value was captured", () => {
+    expect(captured).toEqual({ result: "some output" });
+  });
+});
+
+// ---------------------------------------------------------------------------
+// afterAll registration
+// ---------------------------------------------------------------------------
+
+describe("afterAll registration", () => {
+  test("afterAll is called with a flush function", () => {
+    const fns: Array<() => void | Promise<void>> = [];
+    initBunTestSuite({
+      projectName: "after-test",
+      test,
+      afterAll: (fn) => fns.push(fn),
+      _initExperiment: createTestInitExperiment(),
+    });
+    // The hook is registered synchronously, exactly once, during init.
+    expect(fns).toHaveLength(1);
+    expect(typeof fns[0]).toBe("function");
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Flush behavior
+// ---------------------------------------------------------------------------
+
+describe("flush", () => {
+  test("flush is a no-op when no experiment was created", async () => {
+    const suite = initBunTestSuite({
+      projectName: "no-experiment",
+      test,
+      displaySummary: false,
+      _initExperiment: createTestInitExperiment(),
+    });
+
+    // Should not throw even though no eval was called
+    await suite.flush();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Span naming
+// ---------------------------------------------------------------------------
+
+describe("span naming via progress events", () => {
+  const events: any[] = [];
+
+  const suite = initBunTestSuite({
+    projectName: "name-test",
+    test,
+    displaySummary: false,
+    onProgress: (event) => events.push(event),
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  suite.test(
+    "original-name",
+    { input: "hello", name: "custom-span-name" },
+    async () => "result",
+  );
+
+  suite.test("test-name-used", { input: "hello" }, async () => "result");
+
+  test("evalConfig.name overrides test name for span", () => {
+    const starts = events.filter((e) => e.type === "test_start");
+    expect(starts[0].testName).toBe("custom-span-name");
+  });
+
+  test("test name is used when evalConfig.name is not set", () => {
+    const starts = events.filter((e) => e.type === "test_start");
+    expect(starts[1].testName).toBe("test-name-used");
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Progress events
+// ---------------------------------------------------------------------------
+
+describe("onProgress events", () => {
+  const events: any[] = [];
+
+  const suite = initBunTestSuite({
+    projectName: "progress-test",
+    test,
+    displaySummary: false,
+    onProgress: (event) => events.push(event),
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  suite.test("progress-test", { input: "hello" }, async () => "result");
+
+  test("receives test_start and test_complete events", () => {
+    expect(events).toEqual([
+      { type: "test_start", testName: "progress-test" },
+      expect.objectContaining({
+        type: "test_complete",
+        testName: "progress-test",
+        passed: true,
+      }),
+    ]);
+    expect(typeof events[1].duration).toBe("number");
+  });
+});
+
+describe("onProgress on failure", () => {
+  const events: any[] = [];
+
+  const suite = initBunTestSuite({
+    projectName: "fail-progress-test",
+    test,
+    displaySummary: false,
+    onProgress: (event) => events.push(event),
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  // Use test.failing so bun expects the throw
+  suite.test.failing("fail-test", { input: "hello" }, async () => {
+    throw new Error("intentional failure");
+  });
+
+  test("reports passed=false when test throws", () => {
+    const complete = events.find((e) => e.type === "test_complete");
+    expect(complete.passed).toBe(false);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Modifiers — verify they register without error
+// ---------------------------------------------------------------------------
+
+describe("test modifiers", () => {
+  const suite = initBunTestSuite({
+    projectName: "modifier-test",
+    test,
+    displaySummary: false,
+    _initExperiment: createTestInitExperiment(),
+  });
+
+  suite.test.skip("skipped test", { input: "x" }, async () => "y");
+  suite.test.todo("todo test");
+  suite.test.skipIf(true)("skipIf-true test", { input: "x" }, async () => "y");
+  suite.test.todoIf(true)("todoIf-true test", { input: "x" }, async () => "y");
+
+  // If(false) should not run the test
+  suite.test.if(false)("if-false test", { input: "x" }, async () => {
+    throw new Error("should not run");
+  });
+
+  // If(true) should run the test
+  suite.test.if(true)("if-true test", { input: "x" }, async () => "y");
+});
+
+// ---------------------------------------------------------------------------
+// Cleanup
+// ---------------------------------------------------------------------------
+
+afterAll(async () => {
+  teardownBunTestEnv();
+});
diff --git a/js/src/wrappers/bun-test/index.ts b/js/src/wrappers/bun-test/index.ts
new file mode 100644
index 000000000..6e9bee17f
--- /dev/null
+++ b/js/src/wrappers/bun-test/index.ts
@@ -0,0 +1,10 @@
+export { initBunTestSuite } from "./suite";
+export type {
+  BunTestSuiteConfig,
+  BunTestProgressEvent,
+  EvalConfig,
+  EvalContext,
+  SuiteTestFunction,
+  BunTestSuite,
+  ScorerFunction,
+} from "./types";
diff --git a/js/src/wrappers/bun-test/suite.ts b/js/src/wrappers/bun-test/suite.ts
new file mode 100644
index 000000000..a70b73737
--- /dev/null
+++ b/js/src/wrappers/bun-test/suite.ts
@@ -0,0 +1,208 @@
+import { initExperiment, type Experiment } from "../../logger";
+import { runTracedEval } from "../shared/traced-eval";
+import { summarizeAndFlush } from "../shared/flush";
+import type {
+  BunTestSuite,
+  EvalConfig,
+  EvalContext,
+  SuiteTestFunction,
+} from "./types";
+import type { BunTestSuiteConfig } from "./types";
+
+type TestFn = (...args: any[]) => any;
+
+/** The shape we expect from bun:test's `test` at runtime. */
+interface ValidatedTestFunction extends TestFn {
+  skip: TestFn;
+  only: TestFn;
+  todo: TestFn;
+  failing: TestFn;
+  concurrent: TestFn;
+  serial: TestFn;
+  if: (condition: boolean) => TestFn;
+  skipIf: (condition: boolean) => TestFn;
+  todoIf: (condition: boolean) => TestFn;
+}
+
+const DIRECT_MODIFIERS = [
+  "skip",
+  "only",
+  "todo",
+  "failing",
+  "concurrent",
+  "serial",
+] as const;
+const CONDITIONAL_MODIFIERS = ["if", "skipIf", "todoIf"] as const;
+
+function validateTestFunction(test: unknown): ValidatedTestFunction {
+  if (typeof test !== "function") {
+    throw new Error(
+      `initBunTestSuite: "test" must be a function (got ${typeof test}). ` +
+        `Pass the "test" export from bun:test.`,
+    );
+  }
+  const t = test as unknown as Record<string, unknown>;
+  for (const mod of [...DIRECT_MODIFIERS, ...CONDITIONAL_MODIFIERS]) {
+    if (typeof t[mod] !== "function") {
+      throw new Error(
+        `initBunTestSuite: "test.${mod}" must be a function (got ${typeof t[mod]}). ` +
+          `Make sure you are passing the "test" export from bun:test.`,
+      );
+    }
+  }
+  return test as ValidatedTestFunction;
+}
+
+/**
+ * Creates a new Bun test suite with Braintrust experiment tracking.
+ *
+ * @example
+ * ```typescript
+ * import { test, describe, afterAll } from 'bun:test';
+ * import { initBunTestSuite } from 'braintrust';
+ *
+ * describe('My Tests', () => {
+ *   const suite = initBunTestSuite({
+ *     projectName: 'my-project',
+ *     afterAll,
+ *     test,
+ *   });
+ *
+ *   suite.test('my eval', {
+ *     input: 'hello',
+ *     expected: 'world',
+ *     scorers: [myScorer],
+ *   }, async ({ input }) => {
+ *     return await myFunction(input);
+ *   });
+ * });
+ * ```
+ */
+export function initBunTestSuite<TTest extends (...args: any[]) => any>(
+  config: BunTestSuiteConfig<TTest>,
+): BunTestSuite {
+  let experiment: Experiment | undefined;
+
+  // Lazily creates the experiment on the first call and returns the cached
+  // instance afterwards; `flush()` clears the cache. The name falls back to
+  // `<projectName>-<ISO timestamp>` when `config.experimentName` is falsy.
+  const getOrCreateExperiment = (): Experiment => {
+    if (!experiment) {
+      const createExperiment = config._initExperiment ?? initExperiment;
+      experiment = createExperiment(config.projectName, {
+        experiment:
+          config.experimentName ||
+          `${config.projectName}-${new Date().toISOString()}`,
+      }) as Experiment;
+    }
+    return experiment;
+  };
+
+  /**
+   * Builds the callback registered with bun's `test` for one eval: runs `fn`
+   * via `runTracedEval` and reports progress through `config.onProgress`.
+   */
+  function createTracedFn(
+    name: string,
+    evalConfig: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ): () => Promise<void> {
+    return async () => {
+      const activeExperiment = getOrCreateExperiment();
+      // An explicit `evalConfig.name` overrides the test name for the span.
+      const spanName = evalConfig.name ?? name;
+
+      config.onProgress?.({ type: "test_start", testName: spanName });
+
+      const startedAt = performance.now();
+      let passed = false;
+
+      // No `catch`: a rejection propagates to bun unchanged while `passed`
+      // keeps its initial `false`, and `finally` still reports completion.
+      try {
+        await runTracedEval({
+          experiment: activeExperiment,
+          spanName,
+          input: evalConfig.input,
+          expected: evalConfig.expected,
+          metadata: evalConfig.metadata,
+          tags: evalConfig.tags,
+          scorers: evalConfig.scorers,
+          fn: () =>
+            fn({
+              input: evalConfig.input,
+              expected: evalConfig.expected,
+              metadata: evalConfig.metadata,
+            }),
+        });
+        passed = true;
+      } finally {
+        config.onProgress?.({
+          type: "test_complete",
+          testName: spanName,
+          passed,
+          duration: performance.now() - startedAt,
+        });
+      }
+    };
+  }
+
+  function wrapTestVariant(
+    testFn: (name: string, fn: () => void | Promise<void>) => void,
+  ) {
+    return (
+      name: string,
+      evalConfig: EvalConfig,
+      fn: (context: EvalContext) => unknown | Promise<unknown>,
+    ) => {
+      testFn(name, createTracedFn(name, evalConfig, fn));
+    };
+  }
+
+  function wrapConditional(
+    modifier: (
+      condition: boolean,
+    ) => (name: string, fn: () => void | Promise<void>) => void,
+  ) {
+    return (condition: boolean) => {
+      return wrapTestVariant(modifier(condition));
+    };
+  }
+
+  const t = validateTestFunction(config.test);
+  const suiteTest = Object.assign(wrapTestVariant(t), {
+    skip: wrapTestVariant(t.skip.bind(t)),
+    only: wrapTestVariant(t.only.bind(t)),
+    todo: wrapTestVariant(t.todo.bind(t)),
+    failing: wrapTestVariant(t.failing.bind(t)),
+    concurrent: wrapTestVariant(t.concurrent.bind(t)),
+    serial: wrapTestVariant(t.serial.bind(t)),
+    if: wrapConditional(t.if.bind(t)),
+    skipIf: wrapConditional(t.skipIf.bind(t)),
+    todoIf: wrapConditional(t.todoIf.bind(t)),
+  }) as SuiteTestFunction;
+
+  async function flush(): Promise<void> {
+    if (!experiment) {
+      return;
+    }
+
+    await summarizeAndFlush(experiment, {
+      displaySummary: config.displaySummary,
+    });
+    experiment = undefined;
+  }
+
+  const suite: BunTestSuite = {
+    test: suiteTest,
+    it: suiteTest,
+    flush,
+  };
+
+  // Auto-register flush hook if afterAll() was provided
+  if (config.afterAll) {
+    config.afterAll(() => suite.flush());
+  }
+
+  return suite;
+}
diff --git a/js/src/wrappers/bun-test/test-helpers.ts b/js/src/wrappers/bun-test/test-helpers.ts
new file mode 100644
index 000000000..32138e1b1
--- /dev/null
+++ b/js/src/wrappers/bun-test/test-helpers.ts
@@ -0,0 +1,36 @@
+import { configureNode } from "../../node/config";
+import {
+  _exportsForTestingOnly,
+  type TestBackgroundLogger,
+} from "../../logger";
+
+/**
+ * Sets up the test environment for bun-test wrapper tests.
+ * Async: await it (e.g. in a `beforeAll` hook) before any suite test runs.
+ *
+ * @returns The test background logger, for span verification.
+ */
+export async function setupBunTestEnv(): Promise<TestBackgroundLogger> {
+  configureNode();
+  _exportsForTestingOnly.setInitialTestState();
+  await _exportsForTestingOnly.simulateLoginForTests();
+  return _exportsForTestingOnly.useTestBackgroundLogger();
+}
+
+export function teardownBunTestEnv(): void {
+  _exportsForTestingOnly.clearTestBackgroundLogger();
+  _exportsForTestingOnly.simulateLogoutForTests();
+}
+
+/**
+ * Builds a stand-in for `initExperiment` that tests pass to
+ * `initBunTestSuite` as `_initExperiment`. Experiments come from the
+ * in-memory test logger instead of real API calls, and the experiment name
+ * defaults to "test-experiment" when none (or an empty one) is supplied.
+ */
+export function createTestInitExperiment() {
+  return (projectName: string, options?: { experiment?: string }) => {
+    const expName = options?.experiment || "test-experiment";
+    return _exportsForTestingOnly.initTestExperiment(expName, projectName);
+  };
+}
diff --git a/js/src/wrappers/bun-test/tsconfig.json b/js/src/wrappers/bun-test/tsconfig.json
new file mode 100644
index 000000000..dafec4174
--- /dev/null
+++ b/js/src/wrappers/bun-test/tsconfig.json
@@ -0,0 +1,13 @@
+{
+  "extends": "../../../tsconfig.json",
+  "compilerOptions": {
+    "types": ["@types/bun"]
+  },
+  "include": ["./**/*.ts", "../../../src/**/*.ts"],
+  "exclude": [
+    "node_modules/**",
+    "dist/**",
+    "../../../vendor/**",
+    "**/vendor/**"
+  ]
+}
diff --git a/js/src/wrappers/bun-test/types.ts b/js/src/wrappers/bun-test/types.ts
new file mode 100644
index 000000000..512e7dcaf
--- /dev/null
+++ b/js/src/wrappers/bun-test/types.ts
@@ -0,0 +1,169 @@
+import type { ScorerFunction } from "../shared/types";
+
+export type { ScorerFunction } from "../shared/types";
+
+/** Progress events emitted by the bun-test integration. */
+export type BunTestProgressEvent =
+  | { type: "test_start"; testName: string }
+  | {
+      type: "test_complete";
+      testName: string;
+      passed: boolean;
+      duration: number;
+    };
+
+/**
+ * Configuration for a single eval test case.
+ */
+export interface EvalConfig {
+  /** Test input data, logged to the span. */
+  input?: unknown;
+  /** Expected output, passed to scorers. */
+  expected?: unknown;
+  /** Custom metadata, logged to the span. */
+  metadata?: Record<string, unknown>;
+  /** Tags for organizing test cases. */
+  tags?: string[];
+  /** Scorer functions to evaluate the output. */
+  scorers?: ScorerFunction[];
+  /** Override span name (defaults to the test name). */
+  name?: string;
+}
+
+/**
+ * Context passed to the eval test function.
+ */
+export interface EvalContext {
+  input: unknown;
+  expected?: unknown;
+  metadata?: Record<string, unknown>;
+}
+
+/**
+ * The wrapped test function with `(name, config, fn)` signature.
+ */
+export interface SuiteTestFunction {
+  (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ): void;
+  skip: (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+  only: (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+  todo: (
+    name: string,
+    config?: EvalConfig,
+    fn?: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+  failing: (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+  concurrent: (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+  serial: (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+  if: (
+    condition: boolean,
+  ) => (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+  skipIf: (
+    condition: boolean,
+  ) => (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+  todoIf: (
+    condition: boolean,
+  ) => (
+    name: string,
+    config: EvalConfig,
+    fn: (context: EvalContext) => unknown | Promise<unknown>,
+  ) => void;
+}
+
+/**
+ * Configuration for `initBunTestSuite()`.
+ *
+ * The `TTest` generic forwards the type of your `test` function
+ * (e.g. `Test<[]>` from `bun:test`) without re-declaring it.
+ */
+export interface BunTestSuiteConfig<
+  TTest extends (...args: any[]) => any = (...args: any[]) => any,
+> {
+  /** Project name for the Braintrust experiment. */
+  projectName: string;
+  /** Optional experiment name. Defaults to a timestamp-based name. */
+  experimentName?: string;
+  /**
+   * If true, displays a formatted experiment summary after flushing.
+   * Defaults to true.
+   */
+  displaySummary?: boolean;
+  /**
+   * The `test` function from `bun:test`. Required.
+   * The exact type is forwarded via the `TTest` generic so no
+   * wrapper interface is needed.
+   */
+  test: TTest;
+  /**
+   * Pass `afterAll` from `bun:test` to auto-register a flush hook.
+   * When provided, `suite.flush()` is called automatically after all tests.
+   */
+  afterAll?: (fn: () => void | Promise<void>) => void;
+  /**
+   * Callback for real-time progress events.
+   * Emits `test_start` and `test_complete` events.
+   */
+  onProgress?: (event: BunTestProgressEvent) => void;
+  /**
+   * @internal For testing only. Override the experiment initialization function.
+   */
+  _initExperiment?: (
+    projectName: string,
+    options?: { experiment?: string },
+  ) => any;
+}
+
+/**
+ * The public API surface returned by `initBunTestSuite()`.
+ */
+export interface BunTestSuite {
+  /**
+   * Wrapped test function that creates a traced eval.
+   * Call as `suite.test(name, config, fn)`.
+   * Supports modifiers: `.skip`, `.only`, `.todo`, `.failing`,
+   * `.concurrent`, `.serial`, `.if`, `.skipIf`, `.todoIf`.
+   */
+  test: SuiteTestFunction;
+
+  /**
+   * Alias for `suite.test` (Jest/Vitest convention).
+   */
+  it: SuiteTestFunction;
+
+  /**
+   * Flush the experiment: summarize results and send data to Braintrust.
+   * Called automatically if `afterAll` was provided in the config.
+   */
+  flush(): Promise<void>;
+}
diff --git a/js/tsconfig.json b/js/tsconfig.json
index bdfdf2f12..a4fefc812 100644
--- a/js/tsconfig.json
+++ b/js/tsconfig.json
@@ -7,7 +7,8 @@
     "moduleResolution": "node",
     "strict": true,
     "esModuleInterop": true,
-    "skipLibCheck": true
+    "skipLibCheck": true,
+    "types": ["node"]
   },
   "include": ["."],
   "exclude": [
diff --git a/js/vitest.config.js b/js/vitest.config.js
index 089f86712..061736b4f 100644
--- a/js/vitest.config.js
+++ b/js/vitest.config.js
@@ -40,12 +40,14 @@ const config = {
       "./vendor/**",
       // Exclude subdirectories with their own test configs
       "src/wrappers/ai-sdk/**",
+      "src/wrappers/bun-test/**",
       "src/wrappers/claude-agent-sdk/**",
       "src/wrappers/vitest/**",
       "smoke/**",
       // Exclude example tests (require API keys and make real API calls)
       "examples/vitest/**",
       "examples/node-test/**",
+      "examples/bun-test/**",
     ],
     // Additional test environment configuration
     watchExclude: [
diff --git a/mise.toml b/mise.toml
index f328d72fe..c3f30fb3f 100644
--- a/mise.toml
+++ b/mise.toml
@@ -9,6 +9,7 @@ _.file = ".env"
 
 [tools]
 pnpm = "10.26.2"
+bun = "latest"
 
 [hooks]
 postinstall = "pnpm install"
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 78f01d233..c72c5ff1e 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -361,6 +361,9 @@ importers:
       '@types/async':
         specifier: ^3.2.24
         version: 3.2.24
+      '@types/bun':
+        specifier: ^1.3.10
+        version: 1.3.10
       '@types/cli-progress':
         specifier: ^3.11.5
         version: 3.11.5
@@ -2386,6 +2389,9 @@ packages:
   '@types/body-parser@1.19.5':
     resolution: {integrity: sha512-fB3Zu92ucau0iQ0JMCFQE7b/dv8Ot07NI3KaZIkIUNXq82k4eBAqUaneXfleGY9JWskeS9y+u0nXMyspcuQrCg==}
 
+  '@types/bun@1.3.10':
+    resolution: {integrity: sha512-0+rlrUrOrTSskibryHbvQkDOWRJwJZqZlxrUs1u4oOoTln8+WIXBPmAuCF35SWB2z4Zl3E84Nl/D0P7803nigQ==}
+
   '@types/cli-progress@3.11.5':
     resolution: {integrity: sha512-D4PbNRbviKyppS5ivBGyFO29POlySLmA2HyUFE4p5QGazAMM3CwkKWcvTl8gvElSuxRh6FPKL8XmidX873ou4g==}
 
@@ -2994,6 +3000,9 @@ packages:
   buffer-from@1.1.2:
     resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==}
 
+  bun-types@1.3.10:
+    resolution: {integrity: sha512-tcpfCCl6XWo6nCVnpcVrxQ+9AYN1iqMIzgrSKYMB/fjLtV2eyAVEg7AxQJuCq/26R6HpKWykQXuSOq/21RYcbg==}
+
   bundle-require@5.1.0:
     resolution: {integrity: sha512-3WrrOuZiyaaZPWiEt4G3+IffISVC9HYlWueJEBWED4ZH4aIAC2PnkdnuRrR94M+w6yGWn4AglWtJtBI8YqvgoA==}
     engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
@@ -8061,6 +8070,10 @@ snapshots:
       '@types/connect': 3.4.38
       '@types/node': 22.19.1
 
+  '@types/bun@1.3.10':
+    dependencies:
+      bun-types: 1.3.10
+
   '@types/cli-progress@3.11.5':
     dependencies:
       '@types/node': 22.19.1
@@ -8961,6 +8974,10 @@ snapshots:
 
   buffer-from@1.1.2: {}
 
+  bun-types@1.3.10:
+    dependencies:
+      '@types/node': 22.19.1
+
   bundle-require@5.1.0(esbuild@0.24.2):
     dependencies:
       esbuild: 0.24.2

From 13afd208bf314704cd443a76bb5051f9288872da Mon Sep 17 00:00:00 2001
From: Abhijeet Prasad <abhijeet@braintrustdata.com>
Date: Wed, 4 Mar 2026 13:53:30 -0500
Subject: [PATCH 2/2] test.only fixes for bun

---
 js/Makefile                        |  4 +-
 js/src/wrappers/bun-test/suite.ts  | 51 ++++++++++++----
 js/src/wrappers/bun-test/types.ts  | 98 ++++++------------------------
 js/src/wrappers/node-test/types.ts | 31 +---------
 js/src/wrappers/shared/types.ts    | 29 +++++++++
 5 files changed, 91 insertions(+), 122 deletions(-)

diff --git a/js/Makefile b/js/Makefile
index d90b59323..49fb408b3 100644
--- a/js/Makefile
+++ b/js/Makefile
@@ -130,10 +130,10 @@ test-api-compat:
 	pnpm test:api-compat
 
 # Test everything
-test: test-core test-openai test-anthropic test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk test-bun-test
+test: test-core test-openai test-anthropic test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk
 
 # Test the core and the latest versions of wrappers.
-test-latest: test-core test-anthropic-latest test-openai-latest test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk test-bun-test
+test-latest: test-core test-anthropic-latest test-openai-latest test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk
 
 
 prune:
diff --git a/js/src/wrappers/bun-test/suite.ts b/js/src/wrappers/bun-test/suite.ts
index a70b73737..0536db082 100644
--- a/js/src/wrappers/bun-test/suite.ts
+++ b/js/src/wrappers/bun-test/suite.ts
@@ -34,6 +34,20 @@ const DIRECT_MODIFIERS = [
 ] as const;
 const CONDITIONAL_MODIFIERS = ["if", "skipIf", "todoIf"] as const;
 
+// Modifiers safe to validate eagerly. "only" is excluded because bun's CI
+// mode throws when test.only is even *accessed* (property read) to prevent
+// accidentally focusing tests. The .only wrapper defers access to call time.
+const VALIDATED_MODIFIERS = [
+  "skip",
+  "todo",
+  "failing",
+  "concurrent",
+  "serial",
+  "if",
+  "skipIf",
+  "todoIf",
+] as const;
+
 function validateTestFunction(test: unknown): ValidatedTestFunction {
   if (typeof test !== "function") {
     throw new Error(
@@ -42,7 +56,7 @@ function validateTestFunction(test: unknown): ValidatedTestFunction {
     );
   }
   const t = test as unknown as Record<string, unknown>;
-  for (const mod of [...DIRECT_MODIFIERS, ...CONDITIONAL_MODIFIERS]) {
+  for (const mod of VALIDATED_MODIFIERS) {
     if (typeof t[mod] !== "function") {
       throw new Error(
         `initBunTestSuite: "test.${mod}" must be a function (got ${typeof t[mod]}). ` +
@@ -170,17 +184,30 @@ export function initBunTestSuite<TTest extends (...args: any[]) => any>(
   }
 
   const t = validateTestFunction(config.test);
-  const suiteTest = Object.assign(wrapTestVariant(t), {
-    skip: wrapTestVariant(t.skip.bind(t)),
-    only: wrapTestVariant(t.only.bind(t)),
-    todo: wrapTestVariant(t.todo.bind(t)),
-    failing: wrapTestVariant(t.failing.bind(t)),
-    concurrent: wrapTestVariant(t.concurrent.bind(t)),
-    serial: wrapTestVariant(t.serial.bind(t)),
-    if: wrapConditional(t.if.bind(t)),
-    skipIf: wrapConditional(t.skipIf.bind(t)),
-    todoIf: wrapConditional(t.todoIf.bind(t)),
-  }) as SuiteTestFunction;
+
+  // Build modifier wrappers lazily — calling .bind() on modifiers like
+  // test.only at construction time triggers bun's CI guard which disables
+  // .only when CI=true. By deferring the .bind() to invocation time, we
+  // avoid the error when the modifier is never actually called.
+  const modifiers: Partial<Omit<SuiteTestFunction, never>> = {};
+  for (const mod of DIRECT_MODIFIERS) {
+    (modifiers as any)[mod] = (
+      name: string,
+      evalConfig: EvalConfig,
+      fn: (context: EvalContext) => unknown | Promise<unknown>,
+    ) => {
+      wrapTestVariant(t[mod].bind(t))(name, evalConfig, fn);
+    };
+  }
+  for (const mod of CONDITIONAL_MODIFIERS) {
+    (modifiers as any)[mod] = (condition: boolean) => {
+      return wrapConditional(t[mod].bind(t))(condition);
+    };
+  }
+  const suiteTest = Object.assign(
+    wrapTestVariant(t),
+    modifiers,
+  ) as unknown as SuiteTestFunction;
 
   async function flush(): Promise<void> {
     if (!experiment) {
diff --git a/js/src/wrappers/bun-test/types.ts b/js/src/wrappers/bun-test/types.ts
index 512e7dcaf..e6bc9965a 100644
--- a/js/src/wrappers/bun-test/types.ts
+++ b/js/src/wrappers/bun-test/types.ts
@@ -1,6 +1,6 @@
-import type { ScorerFunction } from "../shared/types";
+export type { ScorerFunction, EvalConfig, EvalContext } from "../shared/types";
 
-export type { ScorerFunction } from "../shared/types";
+import type { EvalConfig, EvalContext } from "../shared/types";
 
 /** Progress events emitted by the bun-test integration. */
 export type BunTestProgressEvent =
@@ -12,93 +12,33 @@ export type BunTestProgressEvent =
       duration: number;
     };
 
-/**
- * Configuration for a single eval test case.
- */
-export interface EvalConfig {
-  /** Test input data, logged to the span. */
-  input?: unknown;
-  /** Expected output, passed to scorers. */
-  expected?: unknown;
-  /** Custom metadata, logged to the span. */
-  metadata?: Record<string, unknown>;
-  /** Tags for organizing test cases. */
-  tags?: string[];
-  /** Scorer functions to evaluate the output. */
-  scorers?: ScorerFunction[];
-  /** Override span name (defaults to the test name). */
-  name?: string;
-}
+/** Signature shared by the traced eval test function and its direct modifiers. */
+type EvalTestFn = (
+  name: string,
+  config: EvalConfig,
+  fn: (context: EvalContext) => unknown | Promise<unknown>,
+) => void;
 
-/**
- * Context passed to the eval test function.
- */
-export interface EvalContext {
-  input: unknown;
-  expected?: unknown;
-  metadata?: Record<string, unknown>;
-}
+/** Conditional modifier (`if`, `skipIf`, `todoIf`): takes a boolean and returns an EvalTestFn. */
+type ConditionalEvalTestFn = (condition: boolean) => EvalTestFn;
 
 /**
  * The wrapped test function with `(name, config, fn)` signature.
  */
-export interface SuiteTestFunction {
-  (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ): void;
-  skip: (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ) => void;
-  only: (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ) => void;
+export interface SuiteTestFunction extends EvalTestFn {
+  skip: EvalTestFn;
+  only: EvalTestFn;
   todo: (
     name: string,
     config?: EvalConfig,
     fn?: (context: EvalContext) => unknown | Promise<unknown>,
   ) => void;
-  failing: (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ) => void;
-  concurrent: (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ) => void;
-  serial: (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ) => void;
-  if: (
-    condition: boolean,
-  ) => (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ) => void;
-  skipIf: (
-    condition: boolean,
-  ) => (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ) => void;
-  todoIf: (
-    condition: boolean,
-  ) => (
-    name: string,
-    config: EvalConfig,
-    fn: (context: EvalContext) => unknown | Promise<unknown>,
-  ) => void;
+  failing: EvalTestFn;
+  concurrent: EvalTestFn;
+  serial: EvalTestFn;
+  if: ConditionalEvalTestFn;
+  skipIf: ConditionalEvalTestFn;
+  todoIf: ConditionalEvalTestFn;
 }
 
 /**
diff --git a/js/src/wrappers/node-test/types.ts b/js/src/wrappers/node-test/types.ts
index 7cf36f2da..0f41deaf0 100644
--- a/js/src/wrappers/node-test/types.ts
+++ b/js/src/wrappers/node-test/types.ts
@@ -1,6 +1,6 @@
-import type { ScorerFunction } from "../shared/types";
+import type { EvalConfig, EvalContext } from "../shared/types";
 
-export type { ScorerFunction } from "../shared/types";
+export type { ScorerFunction, EvalConfig, EvalContext } from "../shared/types";
 
 /** Progress events emitted by the node-test integration. */
 export type NodeTestProgressEvent =
@@ -46,33 +46,6 @@ export interface NodeTestSuiteConfig {
   onProgress?: (event: NodeTestProgressEvent) => void;
 }
 
-/**
- * Configuration for a single eval test case.
- */
-export interface EvalConfig {
-  /** Test input data, logged to the span. */
-  input?: unknown;
-  /** Expected output, passed to scorers. */
-  expected?: unknown;
-  /** Custom metadata, logged to the span. */
-  metadata?: Record<string, unknown>;
-  /** Tags for organizing test cases. */
-  tags?: string[];
-  /** Scorer functions to evaluate the output. */
-  scorers?: ScorerFunction[];
-  /** Override span name (defaults to `t.name`, then `"unnamed test"`). */
-  name?: string;
-}
-
-/**
- * Context passed to the eval test function.
- */
-export interface EvalContext {
-  input: unknown;
-  expected?: unknown;
-  metadata?: Record<string, unknown>;
-}
-
 /**
  * The public API surface returned by `initNodeTestSuite()`.
  */
diff --git a/js/src/wrappers/shared/types.ts b/js/src/wrappers/shared/types.ts
index ba87847a2..7cb5ba553 100644
--- a/js/src/wrappers/shared/types.ts
+++ b/js/src/wrappers/shared/types.ts
@@ -8,6 +8,35 @@ export type ScorerFunction<Output = unknown> = (args: {
   metadata?: Record<string, unknown>;
 }) => Score | Promise<Score> | number | null | Array<Score>;
 
+/**
+ * Configuration for a single eval test case.
+ * Shared across test runner integrations (node-test, bun-test, etc.).
+ */
+export interface EvalConfig {
+  /** Test input data, logged to the span. */
+  input?: unknown;
+  /** Expected output, passed to scorers. */
+  expected?: unknown;
+  /** Custom metadata, logged to the span. */
+  metadata?: Record<string, unknown>;
+  /** Tags for organizing test cases. */
+  tags?: string[];
+  /** Scorer functions to evaluate the output. */
+  scorers?: ScorerFunction[];
+  /** Override span name (defaults to the test name; any further fallback is runner-specific). */
+  name?: string;
+}
+
+/**
+ * Context object the eval test callback receives (`fn(context)` in bun-test).
+ * Shared across test runner integrations (node-test, bun-test, etc.).
+ */
+export interface EvalContext {
+  input: unknown;
+  expected?: unknown;
+  metadata?: Record<string, unknown>;
+}
+
 // Progress event types for real-time test reporting
 export type ProgressEvent =
   | { type: "suite_start"; suiteName: string }