From 1b36da82ea1f056ea6931d507c7c6a2e2e77ffbf Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Tue, 3 Mar 2026 17:25:50 -0500 Subject: [PATCH 1/2] feat: Add bun test runner integration --- .github/workflows/bun-test.yaml | 43 +++ CLAUDE.md | 8 +- js/Makefile | 14 +- js/examples/bun-test/bun-test-example.test.ts | 244 +++++++++++++ js/package.json | 5 +- js/src/exports.ts | 1 + js/src/wrappers/bun-test/README.md | 206 +++++++++++ .../bun-test/bun-test-example.test.ts | 134 +++++++ .../wrappers/bun-test/bun-test-span.test.ts | 127 +++++++ js/src/wrappers/bun-test/bun-test.test.ts | 329 ++++++++++++++++++ js/src/wrappers/bun-test/index.ts | 10 + js/src/wrappers/bun-test/suite.ts | 208 +++++++++++ js/src/wrappers/bun-test/test-helpers.ts | 36 ++ js/src/wrappers/bun-test/tsconfig.json | 13 + js/src/wrappers/bun-test/types.ts | 169 +++++++++ js/tsconfig.json | 3 +- js/vitest.config.js | 2 + mise.toml | 1 + pnpm-lock.yaml | 17 + 19 files changed, 1563 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/bun-test.yaml create mode 100644 js/examples/bun-test/bun-test-example.test.ts create mode 100644 js/src/wrappers/bun-test/README.md create mode 100644 js/src/wrappers/bun-test/bun-test-example.test.ts create mode 100644 js/src/wrappers/bun-test/bun-test-span.test.ts create mode 100644 js/src/wrappers/bun-test/bun-test.test.ts create mode 100644 js/src/wrappers/bun-test/index.ts create mode 100644 js/src/wrappers/bun-test/suite.ts create mode 100644 js/src/wrappers/bun-test/test-helpers.ts create mode 100644 js/src/wrappers/bun-test/tsconfig.json create mode 100644 js/src/wrappers/bun-test/types.ts diff --git a/.github/workflows/bun-test.yaml b/.github/workflows/bun-test.yaml new file mode 100644 index 000000000..37b60e494 --- /dev/null +++ b/.github/workflows/bun-test.yaml @@ -0,0 +1,43 @@ +name: bun-test + +on: + pull_request: + paths: + - "js/**" + - ".github/workflows/bun-test.yaml" + - "pnpm-lock.yaml" + push: + branches: [main] + +jobs: + bun-test: 
+ runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - uses: pnpm/action-setup@v4 + + - uses: oven-sh/setup-bun@v2 + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Build + run: pnpm run build + + # Unit tests (bun test) + - name: Run unit tests + run: cd js && bun test src/wrappers/bun-test/ + + # Integration tests (bun test) - need bun runtime + API keys + - name: Run bun integration tests + env: + BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: cd js/examples/bun-test && bun test diff --git a/CLAUDE.md b/CLAUDE.md index a73257713..25d7f422e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ JavaScript client for Braintrust, plus wrapper libraries for OpenAI, Anthropic, and other AI providers. -This repo uses `pnpm` as it's package manager. +This repo uses `pnpm` as its package manager and [mise](https://mise.jdx.dev/) to manage tool versions. ## Structure @@ -23,7 +23,13 @@ sdk/ ## Setup +This repo uses [mise](https://mise.jdx.dev/) to manage tool versions (e.g. `pnpm`). The root `mise.toml` pins versions and runs `pnpm install` automatically on `mise install`. + ```bash +mise install # Install tools and dependencies (recommended) +# or manually: pnpm install # Install dependencies pnpm run build # Build all packages ``` + +mise also auto-loads a `.env` file if present — see `.env.example` to configure API keys. 
diff --git a/js/Makefile b/js/Makefile index 0e1a9b396..d90b59323 100644 --- a/js/Makefile +++ b/js/Makefile @@ -17,6 +17,7 @@ help: @echo " make test-ai-sdk-v6 - Run AI SDK v6 wrapper tests" @echo " make test-claude-agent-sdk - Run Claude Agent SDK wrapper tests" @echo " make test-vitest - Run Vitest wrapper tests" + @echo " make test-bun-test - Run Bun test runner wrapper tests" @echo " make test-api-compat - Run API compatibility tests" @echo " make bench - Run queue performance benchmarks" @echo " make test-latest - Run core + latest versions of wrappers" @@ -28,7 +29,7 @@ help: @echo "" @echo "See smoke/README.md for details on smoke test infrastructure" -.PHONY: help bench build clean test test-core test-openai test-anthropic test-google-genai test-ai-sdk test-ai-sdk-v5 test-ai-sdk-v6 test-claude-agent-sdk test-vitest test-latest install-optional-deps publish-beta-local test-smoke +.PHONY: help bench build clean test test-core test-openai test-anthropic test-google-genai test-ai-sdk test-ai-sdk-v5 test-ai-sdk-v6 test-claude-agent-sdk test-vitest test-bun-test test-latest install-optional-deps publish-beta-local test-smoke # ------------------------------------------------------------------------------------------------- # # Anthropic testing @@ -82,6 +83,13 @@ test-claude-agent-sdk: test-vitest: cd src/wrappers/vitest && pnpm install && pnpm test +# ------------------------------------------------------------------------------------------------- +# Bun test runner testing +# ------------------------------------------------------------------------------------------------- + +test-bun-test: + bun test src/wrappers/bun-test/ + # ------------------------------------------------------------------------------------------------- # OpenAI testing # ------------------------------------------------------------------------------------------------- @@ -122,10 +130,10 @@ test-api-compat: pnpm test:api-compat # Test everything -test: test-core test-openai test-anthropic 
test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk +test: test-core test-openai test-anthropic test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk test-bun-test # Test the core and the latest versions of wrappers. -test-latest: test-core test-anthropic-latest test-openai-latest test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk +test-latest: test-core test-anthropic-latest test-openai-latest test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk test-bun-test prune: diff --git a/js/examples/bun-test/bun-test-example.test.ts b/js/examples/bun-test/bun-test-example.test.ts new file mode 100644 index 000000000..318ae8c37 --- /dev/null +++ b/js/examples/bun-test/bun-test-example.test.ts @@ -0,0 +1,244 @@ +/** + * Bun Test Runner + Braintrust Example + * + * Demonstrates using initBunTestSuite to track test results as + * Braintrust experiments using the Bun test runner. + * + * Run with: bun test + * Requires: BRAINTRUST_API_KEY and OPENAI_API_KEY environment variables + */ + +import { test, describe, afterAll } from "bun:test"; +import { configureNode } from "../../src/node"; +import { initBunTestSuite } from "../../src/wrappers/bun-test/index"; +import { _exportsForTestingOnly, login, currentSpan } from "../../src/logger"; +import { wrapOpenAI } from "../../src/wrappers/oai"; +import OpenAI from "openai"; + +configureNode(); + +_exportsForTestingOnly.setInitialTestState(); +await login({ apiKey: process.env.BRAINTRUST_API_KEY }); + +if (!process.env.OPENAI_API_KEY) { + throw new Error( + "OPENAI_API_KEY environment variable must be set to run examples/bun-test/bun-test-example.test.ts", + ); +} + +const openai = wrapOpenAI(new OpenAI({ apiKey: process.env.OPENAI_API_KEY })); + +// ============================================================ +// Basic Usage — scorers, data expansion, logging +// ============================================================ + +describe("Translation Evaluation", () => { + const suite = 
initBunTestSuite({ + projectName: "example-bun-test", + afterAll, + test, + }); + + // --- Single test with input/expected and a scorer --- + + suite.test( + "basic translation test", + { + input: { text: "Hello", targetLang: "Spanish" }, + expected: "Hola", + metadata: { difficulty: "easy" }, + tags: ["translation", "spanish"], + scorers: [ + ({ output, expected }) => ({ + name: "exact_match", + score: + String(output).toLowerCase().trim() === + String(expected).toLowerCase().trim() + ? 1 + : 0, + }), + ], + }, + async ({ input }) => { + const { text, targetLang } = input as { + text: string; + targetLang: string; + }; + const response = await openai.chat.completions.create({ + model: "gpt-3.5-turbo", + messages: [ + { + role: "user", + content: `Translate "${text}" to ${targetLang}. Respond with ONLY the translation.`, + }, + ], + temperature: 0, + }); + return response.choices[0]?.message?.content?.trim() || ""; + }, + ); + + // --- Data expansion with a loop --- + + const translationCases = [ + { + input: { text: "Good morning", targetLang: "Spanish" }, + expected: "Buenos días", + }, + { + input: { text: "Thank you very much", targetLang: "Spanish" }, + expected: "Muchas gracias", + }, + { + input: { text: "Goodbye", targetLang: "French" }, + expected: "Au revoir", + }, + ]; + + for (const [i, record] of translationCases.entries()) { + suite.test( + `translation [${i}]: "${record.input.text}" → ${record.input.targetLang}`, + { + ...record, + scorers: [ + ({ output, expected }) => { + const outputStr = String(output).toLowerCase().trim(); + const expectedStr = String(expected).toLowerCase().trim(); + const outputWords = new Set(outputStr.split(" ")); + const expectedWords = expectedStr.split(" "); + const matches = expectedWords.filter((w) => + outputWords.has(w), + ).length; + return { + name: "word_overlap", + score: matches / expectedWords.length, + metadata: { matches, total: expectedWords.length }, + }; + }, + ], + }, + async ({ input }) => { + const { 
text, targetLang } = input as { + text: string; + targetLang: string; + }; + const response = await openai.chat.completions.create({ + model: "gpt-3.5-turbo", + messages: [ + { + role: "user", + content: `Translate "${text}" to ${targetLang}. Respond with ONLY the translation.`, + }, + ], + temperature: 0, + }); + return response.choices[0]?.message?.content?.trim() || ""; + }, + ); + } + + // --- currentSpan() for custom logging --- + + suite.test( + "translation with extra logging", + { + input: { text: "How are you?", targetLang: "Spanish" }, + expected: "¿Cómo estás?", + }, + async ({ input }) => { + const { text, targetLang } = input as { + text: string; + targetLang: string; + }; + const response = await openai.chat.completions.create({ + model: "gpt-3.5-turbo", + messages: [ + { + role: "user", + content: `Translate "${text}" to ${targetLang}. Respond with ONLY the translation.`, + }, + ], + temperature: 0, + }); + + const result = response.choices[0]?.message?.content?.trim() || ""; + + currentSpan().log({ + output: { tokens: response.usage, model: response.model }, + scores: { human_quality: 0.95 }, + metadata: { evaluator: "example" }, + }); + + return result; + }, + ); +}); + +// ============================================================ +// Multiple Scorers +// ============================================================ + +describe("Multiple Scorers", () => { + const suite = initBunTestSuite({ + projectName: "example-bun-test", + afterAll, + test, + }); + + suite.test( + "translation with multiple custom scorers", + { + input: { text: "Hello world", targetLang: "Spanish" }, + expected: "Hola mundo", + scorers: [ + ({ output, expected }) => ({ + name: "exact_match", + score: + String(output).toLowerCase().trim() === + String(expected).toLowerCase().trim() + ? 
1 + : 0, + }), + ({ output, expected }) => { + const outputWords = new Set( + String(output).toLowerCase().trim().split(" "), + ); + const expectedWords = String(expected) + .toLowerCase() + .trim() + .split(" "); + const matches = expectedWords.filter((w) => + outputWords.has(w), + ).length; + return { + name: "word_overlap", + score: matches / expectedWords.length, + metadata: { matches, total: expectedWords.length }, + }; + }, + ({ output }) => ({ + name: "conciseness", + score: String(output).length < 20 ? 1 : 0.7, + metadata: { length: String(output).length }, + }), + ], + }, + async ({ input }) => { + const { text, targetLang } = input as { + text: string; + targetLang: string; + }; + const response = await openai.chat.completions.create({ + model: "gpt-3.5-turbo", + messages: [ + { + role: "user", + content: `Translate "${text}" to ${targetLang}. Respond with ONLY the translation.`, + }, + ], + temperature: 0, + }); + return response.choices[0]?.message?.content?.trim() || ""; + }, + ); +}); diff --git a/js/package.json b/js/package.json index 96ca6635d..81a81bc8e 100644 --- a/js/package.json +++ b/js/package.json @@ -140,6 +140,7 @@ "@openai/agents": "^0.0.14", "@types/argparse": "^2.0.14", "@types/async": "^3.2.24", + "@types/bun": "^1.3.10", "@types/cli-progress": "^3.11.5", "@types/cors": "^2.8.17", "@types/express": "^5.0.0", @@ -160,8 +161,6 @@ "jiti": "^2.6.1", "openapi-zod-client": "^1.18.3", "rollup": "^4.28.1", - "vite": "^5.4.14", - "webpack": "^5.97.1", "tar": "^7.5.2", "tinybench": "^4.0.1", "ts-jest": "^29.1.4", @@ -170,8 +169,10 @@ "typedoc": "^0.25.13", "typedoc-plugin-markdown": "^3.17.1", "typescript": "5.4.4", + "vite": "^5.4.14", "vite-tsconfig-paths": "^4.3.2", "vitest": "^2.1.9", + "webpack": "^5.97.1", "zod": "^3.25.34" }, "dependencies": { diff --git a/js/src/exports.ts b/js/src/exports.ts index 6eb2b5c1f..81bfdc7f4 100644 --- a/js/src/exports.ts +++ b/js/src/exports.ts @@ -175,6 +175,7 @@ export { wrapClaudeAgentSDK } from 
"./wrappers/claude-agent-sdk/claude-agent-sdk export { wrapGoogleGenAI } from "./wrappers/google-genai"; export { wrapVitest } from "./wrappers/vitest"; export { initNodeTestSuite } from "./wrappers/node-test"; +export { initBunTestSuite } from "./wrappers/bun-test"; export * as graph from "./graph-framework"; diff --git a/js/src/wrappers/bun-test/README.md b/js/src/wrappers/bun-test/README.md new file mode 100644 index 000000000..afe4623b0 --- /dev/null +++ b/js/src/wrappers/bun-test/README.md @@ -0,0 +1,206 @@ +# Braintrust Bun Test Runner Integration + +Track your Bun test results as [Braintrust](https://braintrust.dev) experiments using [bun:test](https://bun.sh/docs/test/writing). + +## Quick Start + +```typescript +import { test, describe, afterAll } from "bun:test"; +import { initBunTestSuite } from "braintrust"; + +describe("My Evaluation Suite", () => { + const suite = initBunTestSuite({ + projectName: "my-project", + afterAll, // Auto-flush results after all tests + test, // Required: bun:test's test function + }); + + suite.test( + "evaluates output", + { + input: { text: "hello" }, + expected: "hola", + scorers: [ + ({ output, expected }) => ({ + name: "exact_match", + score: output === expected ? 
1 : 0, + }), + ], + }, + async ({ input }) => { + return await translate(input.text); + }, + ); + + // Untracked tests use regular test() as normal + test("sanity check", () => { + expect(1 + 1).toBe(2); + }); +}); +``` + +## Core Features + +- **Composable**: `suite.test()` wraps `bun:test`'s `test()` — use `test()` directly for untracked tests +- **Experiment tracking**: Each test case creates a span with input, output, expected, and scores +- **Automatic scoring**: Attach scorer functions to evaluate outputs +- **Data expansion**: Use `for` loops for parameterized tests +- **Auto-flush**: Pass `afterAll` to automatically flush results when tests finish + +## API Reference + +### `initBunTestSuite(config)` + +Creates a new test suite with Braintrust experiment tracking. + +**Config:** + +| Field | Type | Required | Description | +| ---------------- | ---------- | -------- | ------------------------------------------------ | +| `projectName` | `string` | Yes | Braintrust project name | +| `test` | `Function` | Yes | `test` from `bun:test` | +| `experimentName` | `string` | No | Custom experiment name (default: auto-generated) | +| `displaySummary` | `boolean` | No | Show summary after flush (default: `true`) | +| `afterAll` | `Function` | No | `afterAll` from `bun:test` for auto-flush | +| `onProgress` | `Function` | No | Callback for progress events | + +**Returns:** `BunTestSuite` with `test`, `it`, and `flush()`. + +### `suite.test(name, config, fn)` + +Creates a traced test case. + +**Parameters:** + +| Parameter | Type | Description | +| --------- | ------------ | ------------------------------------- | +| `name` | `string` | Test name (used as span name) | +| `config` | `EvalConfig` | Input, expected, scorers, etc. 
| +| `fn` | `Function` | Test function receiving `EvalContext` | + +**EvalConfig:** + +| Field | Type | Description | +| ---------- | ------------------------- | ----------------------------------- | +| `input` | `unknown` | Test input data | +| `expected` | `unknown` | Expected output (passed to scorers) | +| `metadata` | `Record<string, unknown>` | Custom metadata | +| `tags` | `string[]` | Tags for organizing test cases | +| `scorers` | `ScorerFunction[]` | Scorer functions | +| `name` | `string` | Override span name | + +### Test Modifiers + +All modifiers from `bun:test` are supported: + +```typescript +suite.test.skip("skipped test", config, fn); +suite.test.only("focused test", config, fn); +suite.test.todo("planned test"); +suite.test.failing("expected failure", config, fn); +suite.test.concurrent("parallel test", config, fn); +suite.test.serial("sequential test", config, fn); + +// Conditional modifiers +suite.test.if(condition)("conditional test", config, fn); +suite.test.skipIf(condition)("skip-if test", config, fn); +suite.test.todoIf(condition)("todo-if test", config, fn); +``` + +`suite.it` is an alias for `suite.test`. + +## Using Scorers + +```typescript +// Basic scorer +suite.test( + "my test", + { + input: "hello", + expected: "HELLO", + scorers: [ + ({ output, expected }) => ({ + name: "exact_match", + score: output === expected ? 1 : 0, + }), + ], + }, + async ({ input }) => (input as string).toUpperCase(), +); + +// Multiple scorers +suite.test( + "multi-scored", + { + input: "hello world", + expected: "Hola mundo", + scorers: [ + ({ output, expected }) => ({ + name: "exact_match", + score: String(output) === String(expected) ? 1 : 0, + }), + ({ output }) => ({ + name: "not_empty", + score: String(output).length > 0 ? 
1 : 0, + }), + ], + }, + async ({ input }) => await translate(input), +); +``` + +## Data Expansion + +Use `for` loops instead of `test.each` for parameterized tests: + +```typescript +const cases = [ + { input: "hello", expected: "hola" }, + { input: "goodbye", expected: "adiós" }, +]; + +for (const [i, record] of cases.entries()) { + suite.test( + `translation [${i}]: ${record.input}`, + { + input: record.input, + expected: record.expected, + scorers: [myScorer], + }, + async ({ input }) => await translate(input as string), + ); +} +``` + +## Custom Logging with `currentSpan()` + +```typescript +import { currentSpan } from "braintrust"; + +suite.test( + "with custom logging", + { input: { query: "test" } }, + async ({ input }) => { + const result = await myFunction(input); + + currentSpan().log({ + output: { tokens: result.usage, model: result.model }, + scores: { human_quality: 0.95 }, + metadata: { evaluator: "example" }, + }); + + return result.text; + }, +); +``` + +## Running + +```bash +bun test +``` + +## Additional Resources + +- [Braintrust Documentation](https://www.braintrust.dev/docs) +- [Bun Test Runner](https://bun.sh/docs/test/writing) diff --git a/js/src/wrappers/bun-test/bun-test-example.test.ts b/js/src/wrappers/bun-test/bun-test-example.test.ts new file mode 100644 index 000000000..a1b1a9184 --- /dev/null +++ b/js/src/wrappers/bun-test/bun-test-example.test.ts @@ -0,0 +1,134 @@ +import { test, describe, afterAll, beforeAll } from "bun:test"; +import { currentSpan } from "../../logger"; +import { initBunTestSuite } from "./suite"; +import { + setupBunTestEnv, + teardownBunTestEnv, + createTestInitExperiment, +} from "./test-helpers"; + +let moduleBackgroundLogger: Awaited<ReturnType<typeof setupBunTestEnv>>; +beforeAll(async () => { + moduleBackgroundLogger = await setupBunTestEnv(); +}); + +describe("Bun Test Suite Example", () => { + const suite = initBunTestSuite({ + projectName: "bun-test-example", + displaySummary: false, + afterAll, + test, + _initExperiment: 
createTestInitExperiment(), + onProgress: (event) => { + if (event.type === "test_complete") { + console.log( + ` ${event.testName} (${event.duration.toFixed(2)}ms) - ${event.passed ? "PASSED" : "FAILED"}`, + ); + } + }, + }); + + // Basic test with suite.test() + suite.test( + "basic addition", + { input: { a: 2, b: 2 }, expected: 4 }, + async ({ input }) => { + const { a, b } = input as { a: number; b: number }; + return a + b; + }, + ); + + // Test with metadata and tags + suite.test( + "multiplication with metadata", + { + input: { a: 3, b: 4 }, + expected: 12, + metadata: { category: "arithmetic", difficulty: "easy" }, + tags: ["math", "multiplication"], + }, + async ({ input }) => { + const { a, b } = input as { a: number; b: number }; + return a * b; + }, + ); + + // Test with scorers + suite.test( + "string transformation with scorers", + { + input: "hello world", + expected: "HELLO WORLD", + scorers: [ + ({ output, expected }) => ({ + name: "exact_match", + score: output === expected ? 1 : 0, + }), + ({ output }) => ({ + name: "is_uppercase", + score: + typeof output === "string" && output === output.toUpperCase() + ? 1 + : 0, + }), + ], + }, + async ({ input }) => { + return (input as string).toUpperCase(); + }, + ); + + // Data expansion with a loop + const translationData = [ + { input: { text: "hello" }, expected: "hola" }, + { input: { text: "goodbye" }, expected: "adiós" }, + { input: { text: "thanks" }, expected: "gracias" }, + ]; + + for (const [i, record] of translationData.entries()) { + suite.test( + `translation [${i}]`, + { + input: record.input, + expected: record.expected, + scorers: [ + ({ output, expected }) => ({ + name: "exact_match", + score: output === expected ? 
1 : 0, + }), + ], + }, + async ({ input }) => { + const translations: Record<string, string> = { + hello: "hola", + goodbye: "adiós", + thanks: "gracias", + }; + return translations[(input as any).text] || "unknown"; + }, + ); + } + + // Test using currentSpan() for custom logging + suite.test( + "custom outputs and feedback", + { input: { query: "test query" } }, + async ({ input }) => { + const result = `processed: ${(input as any).query}`; + currentSpan().log({ + output: { processed_query: result, model: "test-model" }, + scores: { relevance: 0.9 }, + metadata: { evaluator: "human" }, + }); + return result; + }, + ); +}); + +afterAll(async () => { + await moduleBackgroundLogger.flush(); + const spans = await moduleBackgroundLogger.drain(); + console.log(` Example tests captured ${spans.length} spans`); + + teardownBunTestEnv(); +}); diff --git a/js/src/wrappers/bun-test/bun-test-span.test.ts b/js/src/wrappers/bun-test/bun-test-span.test.ts new file mode 100644 index 000000000..35e7d1a63 --- /dev/null +++ b/js/src/wrappers/bun-test/bun-test-span.test.ts @@ -0,0 +1,127 @@ +import { test, describe, afterAll, expect, beforeAll } from "bun:test"; +import { currentSpan } from "../../logger"; +import { initBunTestSuite } from "./suite"; +import { + setupBunTestEnv, + teardownBunTestEnv, + createTestInitExperiment, +} from "./test-helpers"; + +let moduleBackgroundLogger: Awaited<ReturnType<typeof setupBunTestEnv>>; +beforeAll(async () => { + moduleBackgroundLogger = await setupBunTestEnv(); +}); + +describe("Bun Test Suite Span Creation Integration", () => { + const suite = initBunTestSuite({ + projectName: "bun-test-span-integration", + displaySummary: false, + test, + _initExperiment: createTestInitExperiment(), + }); + + suite.test( + "creates span with input and expected", + { + input: { value: 5 }, + expected: 10, + metadata: { operation: "multiply" }, + tags: ["math"], + }, + async ({ input }) => { + return (input as any).value * 2; + }, + ); + + suite.test( + "creates span with custom outputs and feedback", + { 
input: "test-data" }, + async () => { + currentSpan().log({ + output: { step1: "started", step2: "completed" }, + scores: { quality: 0.95 }, + }); + return "final result"; + }, + ); + + suite.test( + "creates span with scorer results", + { + input: "hello", + expected: "HELLO", + scorers: [ + ({ output, expected }) => ({ + name: "case_match", + score: output === expected ? 1 : 0, + }), + ], + }, + async ({ input }) => { + return (input as string).toUpperCase(); + }, + ); + + suite.test( + "creates span for passing test with pass score", + { input: "simple" }, + async () => { + return "result"; + }, + ); +}); + +afterAll(async () => { + await moduleBackgroundLogger.flush(); + const spans = await moduleBackgroundLogger.drain(); + + // Verify spans were created + expect(spans.length).toBeGreaterThan(0); + + // Verify task type spans exist + const taskSpans = spans.filter( + (s: any) => s.span_attributes?.type === "task", + ); + expect(taskSpans.length).toBeGreaterThan(0); + + // Verify pass scores exist + const spansWithPassScore = spans.filter( + (s: any) => s.scores?.pass !== undefined, + ); + expect(spansWithPassScore.length).toBeGreaterThan(0); + + // Verify passing tests + const passingTests = spans.filter((s: any) => s.scores?.pass === 1); + expect(passingTests.length).toBeGreaterThan(0); + + // Verify spans have output + const spansWithOutputs = spans.filter((s: any) => s.output); + expect(spansWithOutputs.length).toBeGreaterThan(0); + + // Verify spans have input + const spansWithInput = spans.filter((s: any) => s.input !== undefined); + expect(spansWithInput.length).toBeGreaterThan(0); + + // Verify spans have expected + const spansWithExpected = spans.filter((s: any) => s.expected !== undefined); + expect(spansWithExpected.length).toBeGreaterThan(0); + + // Verify spans have metadata + const spansWithMetadata = spans.filter( + (s: any) => s.metadata && Object.keys(s.metadata).length > 0, + ); + expect(spansWithMetadata.length).toBeGreaterThan(0); + + // 
Verify spans have tags + const spansWithTags = spans.filter((s: any) => s.tags && s.tags.length > 0); + expect(spansWithTags.length).toBeGreaterThan(0); + + // Verify custom scores (from scorers) + const spansWithCustomScores = spans.filter((s: any) => { + const scores = s.scores || {}; + return Object.keys(scores).some((key) => key !== "pass"); + }); + expect(spansWithCustomScores.length).toBeGreaterThan(0); + + teardownBunTestEnv(); +}); diff --git a/js/src/wrappers/bun-test/bun-test.test.ts b/js/src/wrappers/bun-test/bun-test.test.ts new file mode 100644 index 000000000..ba316e95f --- /dev/null +++ b/js/src/wrappers/bun-test/bun-test.test.ts @@ -0,0 +1,329 @@ +import { test, describe, afterAll, expect, beforeAll } from "bun:test"; +import { initBunTestSuite } from "./suite"; +import { + setupBunTestEnv, + teardownBunTestEnv, + createTestInitExperiment, +} from "./test-helpers"; + +beforeAll(async () => { + await setupBunTestEnv(); +}); + +// --------------------------------------------------------------------------- +// API surface +// --------------------------------------------------------------------------- + +describe("initBunTestSuite API surface", () => { + const suite = initBunTestSuite({ + projectName: "api-surface", + test, + _initExperiment: createTestInitExperiment(), + }); + + test("suite has test, it, and flush", () => { + expect(suite.test).toBeDefined(); + expect(typeof suite.test).toBe("function"); + expect(suite.it).toBeDefined(); + expect(suite.it).toBe(suite.test); + expect(suite.flush).toBeDefined(); + expect(typeof suite.flush).toBe("function"); + }); + + test("suite.test has all modifier properties", () => { + expect(typeof suite.test.skip).toBe("function"); + expect(typeof suite.test.only).toBe("function"); + expect(typeof suite.test.todo).toBe("function"); + expect(typeof suite.test.failing).toBe("function"); + expect(typeof suite.test.concurrent).toBe("function"); + expect(typeof suite.test.serial).toBe("function"); + expect(typeof 
suite.test.if).toBe("function"); + expect(typeof suite.test.skipIf).toBe("function"); + expect(typeof suite.test.todoIf).toBe("function"); + }); +}); + +// --------------------------------------------------------------------------- +// Basic traced eval +// --------------------------------------------------------------------------- + +describe("basic traced eval", () => { + let result: unknown; + + const suite = initBunTestSuite({ + projectName: "basic-eval", + test, + displaySummary: false, + _initExperiment: createTestInitExperiment(), + }); + + suite.test( + "runs traced eval and returns output", + { input: "hello" }, + async ({ input }) => { + result = `processed: ${input}`; + return result; + }, + ); + + test("traced eval produced correct result", () => { + expect(result).toBe("processed: hello"); + }); +}); + +// --------------------------------------------------------------------------- +// Scorer invocation +// --------------------------------------------------------------------------- + +describe("scorer invocation", () => { + let scorerCallArgs: any = null; + + const suite = initBunTestSuite({ + projectName: "scorer-test", + test, + displaySummary: false, + _initExperiment: createTestInitExperiment(), + }); + + suite.test( + "test with scorer", + { + input: { text: "hello" }, + expected: "world", + metadata: { lang: "en" }, + scorers: [ + (args) => { + scorerCallArgs = args; + return { name: "test_score", score: 1 }; + }, + ], + }, + async () => "output-value", + ); + + test("scorer received correct arguments", () => { + expect(scorerCallArgs).toEqual({ + output: "output-value", + expected: "world", + input: { text: "hello" }, + metadata: { lang: "en" }, + }); + }); +}); + +// --------------------------------------------------------------------------- +// Scorers run on error +// --------------------------------------------------------------------------- + +describe("scorers run even when test function throws", () => { + let scorerCalled = false; + + 
const suite = initBunTestSuite({ + projectName: "scorer-error-test", + test, + displaySummary: false, + _initExperiment: createTestInitExperiment(), + }); + + // test.failing expects the test to throw — bun marks it as passed + suite.test.failing( + "error test", + { + input: "hello", + scorers: [ + () => { + scorerCalled = true; + return { name: "post_error_score", score: 0 }; + }, + ], + }, + async () => { + throw new Error("test failure"); + }, + ); + + test("scorer was still called", () => { + expect(scorerCalled).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Return value logged as output +// --------------------------------------------------------------------------- + +describe("return value", () => { + let captured: unknown; + + const suite = initBunTestSuite({ + projectName: "output-test", + test, + displaySummary: false, + _initExperiment: createTestInitExperiment(), + }); + + suite.test("output test", { input: "hello" }, async () => { + captured = { result: "some output" }; + return captured; + }); + + test("return value was captured", () => { + expect(captured).toEqual({ result: "some output" }); + }); +}); + +// --------------------------------------------------------------------------- +// afterAll registration +// --------------------------------------------------------------------------- + +describe("afterAll registration", () => { + test("afterAll is called with a flush function", () => { + const fns: Function[] = []; + initBunTestSuite({ + projectName: "after-test", + test, + afterAll: (fn) => fns.push(fn), + _initExperiment: createTestInitExperiment(), + }); + + expect(fns).toHaveLength(1); + expect(typeof fns[0]).toBe("function"); + }); +}); + +// --------------------------------------------------------------------------- +// Flush behavior +// --------------------------------------------------------------------------- + +describe("flush", () => { + test("flush is a no-op when no 
experiment was created", async () => { + const suite = initBunTestSuite({ + projectName: "no-experiment", + test, + displaySummary: false, + _initExperiment: createTestInitExperiment(), + }); + + // Should not throw even though no eval was called + await suite.flush(); + }); +}); + +// --------------------------------------------------------------------------- +// Span naming +// --------------------------------------------------------------------------- + +describe("span naming via progress events", () => { + const events: any[] = []; + + const suite = initBunTestSuite({ + projectName: "name-test", + test, + displaySummary: false, + onProgress: (event) => events.push(event), + _initExperiment: createTestInitExperiment(), + }); + + suite.test( + "original-name", + { input: "hello", name: "custom-span-name" }, + async () => "result", + ); + + suite.test("test-name-used", { input: "hello" }, async () => "result"); + + test("evalConfig.name overrides test name for span", () => { + const starts = events.filter((e) => e.type === "test_start"); + expect(starts[0].testName).toBe("custom-span-name"); + }); + + test("test name is used when evalConfig.name is not set", () => { + const starts = events.filter((e) => e.type === "test_start"); + expect(starts[1].testName).toBe("test-name-used"); + }); +}); + +// --------------------------------------------------------------------------- +// Progress events +// --------------------------------------------------------------------------- + +describe("onProgress events", () => { + const events: any[] = []; + + const suite = initBunTestSuite({ + projectName: "progress-test", + test, + displaySummary: false, + onProgress: (event) => events.push(event), + _initExperiment: createTestInitExperiment(), + }); + + suite.test("progress-test", { input: "hello" }, async () => "result"); + + test("receives test_start and test_complete events", () => { + expect(events).toEqual([ + { type: "test_start", testName: "progress-test" }, + 
expect.objectContaining({ + type: "test_complete", + testName: "progress-test", + passed: true, + }), + ]); + expect(typeof events[1].duration).toBe("number"); + }); +}); + +describe("onProgress on failure", () => { + const events: any[] = []; + + const suite = initBunTestSuite({ + projectName: "fail-progress-test", + test, + displaySummary: false, + onProgress: (event) => events.push(event), + _initExperiment: createTestInitExperiment(), + }); + + // Use test.failing so bun expects the throw + suite.test.failing("fail-test", { input: "hello" }, async () => { + throw new Error("intentional failure"); + }); + + test("reports passed=false when test throws", () => { + const complete = events.find((e) => e.type === "test_complete"); + expect(complete.passed).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Modifiers — verify they register without error +// --------------------------------------------------------------------------- + +describe("test modifiers", () => { + const suite = initBunTestSuite({ + projectName: "modifier-test", + test, + displaySummary: false, + _initExperiment: createTestInitExperiment(), + }); + + suite.test.skip("skipped test", { input: "x" }, async () => "y"); + suite.test.todo("todo test"); + suite.test.skipIf(true)("skipIf-true test", { input: "x" }, async () => "y"); + suite.test.todoIf(true)("todoIf-true test", { input: "x" }, async () => "y"); + + // If(false) should not run the test + suite.test.if(false)("if-false test", { input: "x" }, async () => { + throw new Error("should not run"); + }); + + // If(true) should run the test + suite.test.if(true)("if-true test", { input: "x" }, async () => "y"); +}); + +// --------------------------------------------------------------------------- +// Cleanup +// --------------------------------------------------------------------------- + +afterAll(async () => { + teardownBunTestEnv(); +}); diff --git a/js/src/wrappers/bun-test/index.ts 
b/js/src/wrappers/bun-test/index.ts new file mode 100644 index 000000000..6e9bee17f --- /dev/null +++ b/js/src/wrappers/bun-test/index.ts @@ -0,0 +1,10 @@ +export { initBunTestSuite } from "./suite"; +export type { + BunTestSuiteConfig, + BunTestProgressEvent, + EvalConfig, + EvalContext, + SuiteTestFunction, + BunTestSuite, + ScorerFunction, +} from "./types"; diff --git a/js/src/wrappers/bun-test/suite.ts b/js/src/wrappers/bun-test/suite.ts new file mode 100644 index 000000000..a70b73737 --- /dev/null +++ b/js/src/wrappers/bun-test/suite.ts @@ -0,0 +1,208 @@ +import { initExperiment, type Experiment } from "../../logger"; +import { runTracedEval } from "../shared/traced-eval"; +import { summarizeAndFlush } from "../shared/flush"; +import type { + BunTestSuite, + EvalConfig, + EvalContext, + SuiteTestFunction, +} from "./types"; +import type { BunTestSuiteConfig } from "./types"; + +type TestFn = (...args: any[]) => any; + +/** The shape we expect from bun:test's `test` at runtime. */ +interface ValidatedTestFunction extends TestFn { + skip: TestFn; + only: TestFn; + todo: TestFn; + failing: TestFn; + concurrent: TestFn; + serial: TestFn; + if: (condition: boolean) => TestFn; + skipIf: (condition: boolean) => TestFn; + todoIf: (condition: boolean) => TestFn; +} + +const DIRECT_MODIFIERS = [ + "skip", + "only", + "todo", + "failing", + "concurrent", + "serial", +] as const; +const CONDITIONAL_MODIFIERS = ["if", "skipIf", "todoIf"] as const; + +function validateTestFunction(test: unknown): ValidatedTestFunction { + if (typeof test !== "function") { + throw new Error( + `initBunTestSuite: "test" must be a function (got ${typeof test}). ` + + `Pass the "test" export from bun:test.`, + ); + } + const t = test as unknown as Record; + for (const mod of [...DIRECT_MODIFIERS, ...CONDITIONAL_MODIFIERS]) { + if (typeof t[mod] !== "function") { + throw new Error( + `initBunTestSuite: "test.${mod}" must be a function (got ${typeof t[mod]}). 
` + + `Make sure you are passing the "test" export from bun:test.`, + ); + } + } + return test as ValidatedTestFunction; +} + +/** + * Creates a new Bun test suite with Braintrust experiment tracking. + * + * @example + * ```typescript + * import { test, describe, afterAll } from 'bun:test'; + * import { initBunTestSuite } from 'braintrust'; + * + * describe('My Tests', () => { + * const suite = initBunTestSuite({ + * projectName: 'my-project', + * afterAll, + * test, + * }); + * + * suite.test('my eval', { + * input: 'hello', + * expected: 'world', + * scorers: [myScorer], + * }, async ({ input }) => { + * return await myFunction(input); + * }); + * }); + * ``` + */ +export function initBunTestSuite any>( + config: BunTestSuiteConfig, +): BunTestSuite { + let experiment: Experiment | undefined; + + const getOrCreateExperiment = (): Experiment => { + if (experiment) { + return experiment; + } + + const experimentName = + config.experimentName || + `${config.projectName}-${new Date().toISOString()}`; + const initExp = config._initExperiment ?? initExperiment; + experiment = initExp(config.projectName, { + experiment: experimentName, + }) as Experiment; + return experiment; + }; + + function createTracedFn( + name: string, + evalConfig: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ): () => Promise { + return async () => { + const exp = getOrCreateExperiment(); + const spanName = evalConfig.name ?? 
name; + + if (config.onProgress) { + config.onProgress({ type: "test_start", testName: spanName }); + } + + const startTime = performance.now(); + let passed = false; + + try { + await runTracedEval({ + experiment: exp, + spanName, + input: evalConfig.input, + expected: evalConfig.expected, + metadata: evalConfig.metadata, + tags: evalConfig.tags, + scorers: evalConfig.scorers, + fn: () => + fn({ + input: evalConfig.input, + expected: evalConfig.expected, + metadata: evalConfig.metadata, + }), + }); + passed = true; + } catch (error) { + passed = false; + throw error; + } finally { + if (config.onProgress) { + config.onProgress({ + type: "test_complete", + testName: spanName, + passed, + duration: performance.now() - startTime, + }); + } + } + }; + } + + function wrapTestVariant( + testFn: (name: string, fn: () => void | Promise) => void, + ) { + return ( + name: string, + evalConfig: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => { + testFn(name, createTracedFn(name, evalConfig, fn)); + }; + } + + function wrapConditional( + modifier: ( + condition: boolean, + ) => (name: string, fn: () => void | Promise) => void, + ) { + return (condition: boolean) => { + return wrapTestVariant(modifier(condition)); + }; + } + + const t = validateTestFunction(config.test); + const suiteTest = Object.assign(wrapTestVariant(t), { + skip: wrapTestVariant(t.skip.bind(t)), + only: wrapTestVariant(t.only.bind(t)), + todo: wrapTestVariant(t.todo.bind(t)), + failing: wrapTestVariant(t.failing.bind(t)), + concurrent: wrapTestVariant(t.concurrent.bind(t)), + serial: wrapTestVariant(t.serial.bind(t)), + if: wrapConditional(t.if.bind(t)), + skipIf: wrapConditional(t.skipIf.bind(t)), + todoIf: wrapConditional(t.todoIf.bind(t)), + }) as SuiteTestFunction; + + async function flush(): Promise { + if (!experiment) { + return; + } + + await summarizeAndFlush(experiment, { + displaySummary: config.displaySummary, + }); + experiment = undefined; + } + + const suite: 
BunTestSuite = { + test: suiteTest, + it: suiteTest, + flush, + }; + + // Auto-register flush hook if afterAll() was provided + if (config.afterAll) { + config.afterAll(() => suite.flush()); + } + + return suite; +} diff --git a/js/src/wrappers/bun-test/test-helpers.ts b/js/src/wrappers/bun-test/test-helpers.ts new file mode 100644 index 000000000..32138e1b1 --- /dev/null +++ b/js/src/wrappers/bun-test/test-helpers.ts @@ -0,0 +1,36 @@ +import { configureNode } from "../../node/config"; +import { + _exportsForTestingOnly, + type TestBackgroundLogger, +} from "../../logger"; + +/** + * Sets up the test environment for bun-test wrapper tests. + * Must be called in beforeAll (top-level await). + * + * Returns the background logger for span verification. + */ +export async function setupBunTestEnv(): Promise { + configureNode(); + _exportsForTestingOnly.setInitialTestState(); + await _exportsForTestingOnly.simulateLoginForTests(); + return _exportsForTestingOnly.useTestBackgroundLogger(); +} + +export function teardownBunTestEnv(): void { + _exportsForTestingOnly.clearTestBackgroundLogger(); + _exportsForTestingOnly.simulateLogoutForTests(); +} + +/** + * Creates a test-only initExperiment function that uses the in-memory + * test logger instead of making real API calls. 
+ */ +export function createTestInitExperiment() { + return (projectName: string, options?: { experiment?: string }) => { + return _exportsForTestingOnly.initTestExperiment( + options?.experiment || "test-experiment", + projectName, + ); + }; +} diff --git a/js/src/wrappers/bun-test/tsconfig.json b/js/src/wrappers/bun-test/tsconfig.json new file mode 100644 index 000000000..dafec4174 --- /dev/null +++ b/js/src/wrappers/bun-test/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../../tsconfig.json", + "compilerOptions": { + "types": ["@types/bun"] + }, + "include": ["./**/*.ts", "../../../src/**/*.ts"], + "exclude": [ + "node_modules/**", + "dist/**", + "../../../vendor/**", + "**/vendor/**" + ] +} diff --git a/js/src/wrappers/bun-test/types.ts b/js/src/wrappers/bun-test/types.ts new file mode 100644 index 000000000..512e7dcaf --- /dev/null +++ b/js/src/wrappers/bun-test/types.ts @@ -0,0 +1,169 @@ +import type { ScorerFunction } from "../shared/types"; + +export type { ScorerFunction } from "../shared/types"; + +/** Progress events emitted by the bun-test integration. */ +export type BunTestProgressEvent = + | { type: "test_start"; testName: string } + | { + type: "test_complete"; + testName: string; + passed: boolean; + duration: number; + }; + +/** + * Configuration for a single eval test case. + */ +export interface EvalConfig { + /** Test input data, logged to the span. */ + input?: unknown; + /** Expected output, passed to scorers. */ + expected?: unknown; + /** Custom metadata, logged to the span. */ + metadata?: Record; + /** Tags for organizing test cases. */ + tags?: string[]; + /** Scorer functions to evaluate the output. */ + scorers?: ScorerFunction[]; + /** Override span name (defaults to the test name). */ + name?: string; +} + +/** + * Context passed to the eval test function. + */ +export interface EvalContext { + input: unknown; + expected?: unknown; + metadata?: Record; +} + +/** + * The wrapped test function with `(name, config, fn)` signature. 
+ */ +export interface SuiteTestFunction { + ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ): void; + skip: ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => void; + only: ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => void; + todo: ( + name: string, + config?: EvalConfig, + fn?: (context: EvalContext) => unknown | Promise, + ) => void; + failing: ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => void; + concurrent: ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => void; + serial: ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => void; + if: ( + condition: boolean, + ) => ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => void; + skipIf: ( + condition: boolean, + ) => ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => void; + todoIf: ( + condition: boolean, + ) => ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => void; +} + +/** + * Configuration for `initBunTestSuite()`. + * + * The `TTest` generic forwards the type of your `test` function + * (e.g. `Test<[]>` from `bun:test`) without re-declaring it. + */ +export interface BunTestSuiteConfig< + TTest extends (...args: any[]) => any = (...args: any[]) => any, +> { + /** Project name for the Braintrust experiment. */ + projectName: string; + /** Optional experiment name. Defaults to a timestamp-based name. */ + experimentName?: string; + /** + * If true, displays a formatted experiment summary after flushing. + * Defaults to true. + */ + displaySummary?: boolean; + /** + * The `test` function from `bun:test`. Required. 
+ * The exact type is forwarded via the `TTest` generic so no + * wrapper interface is needed. + */ + test: TTest; + /** + * Pass `afterAll` from `bun:test` to auto-register a flush hook. + * When provided, `suite.flush()` is called automatically after all tests. + */ + afterAll?: (fn: () => void | Promise) => void; + /** + * Callback for real-time progress events. + * Emits `test_start` and `test_complete` events. + */ + onProgress?: (event: BunTestProgressEvent) => void; + /** + * @internal For testing only. Override the experiment initialization function. + */ + _initExperiment?: ( + projectName: string, + options?: { experiment?: string }, + ) => any; +} + +/** + * The public API surface returned by `initBunTestSuite()`. + */ +export interface BunTestSuite { + /** + * Wrapped test function that creates a traced eval. + * Call as `suite.test(name, config, fn)`. + * Supports modifiers: `.skip`, `.only`, `.todo`, `.failing`, + * `.concurrent`, `.serial`, `.if`, `.skipIf`, `.todoIf`. + */ + test: SuiteTestFunction; + + /** + * Alias for `suite.test` (Jest/Vitest convention). + */ + it: SuiteTestFunction; + + /** + * Flush the experiment: summarize results and send data to Braintrust. + * Called automatically if `afterAll` was provided in the config. 
+ */ + flush(): Promise; +} diff --git a/js/tsconfig.json b/js/tsconfig.json index bdfdf2f12..a4fefc812 100644 --- a/js/tsconfig.json +++ b/js/tsconfig.json @@ -7,7 +7,8 @@ "moduleResolution": "node", "strict": true, "esModuleInterop": true, - "skipLibCheck": true + "skipLibCheck": true, + "types": ["node"] }, "include": ["."], "exclude": [ diff --git a/js/vitest.config.js b/js/vitest.config.js index 089f86712..061736b4f 100644 --- a/js/vitest.config.js +++ b/js/vitest.config.js @@ -40,12 +40,14 @@ const config = { "./vendor/**", // Exclude subdirectories with their own test configs "src/wrappers/ai-sdk/**", + "src/wrappers/bun-test/**", "src/wrappers/claude-agent-sdk/**", "src/wrappers/vitest/**", "smoke/**", // Exclude example tests (require API keys and make real API calls) "examples/vitest/**", "examples/node-test/**", + "examples/bun-test/**", ], // Additional test environment configuration watchExclude: [ diff --git a/mise.toml b/mise.toml index f328d72fe..c3f30fb3f 100644 --- a/mise.toml +++ b/mise.toml @@ -9,6 +9,7 @@ _.file = ".env" [tools] pnpm = "10.26.2" +bun = "latest" [hooks] postinstall = "pnpm install" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 78f01d233..c72c5ff1e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -361,6 +361,9 @@ importers: '@types/async': specifier: ^3.2.24 version: 3.2.24 + '@types/bun': + specifier: ^1.3.10 + version: 1.3.10 '@types/cli-progress': specifier: ^3.11.5 version: 3.11.5 @@ -2386,6 +2389,9 @@ packages: '@types/body-parser@1.19.5': resolution: {integrity: sha512-fB3Zu92ucau0iQ0JMCFQE7b/dv8Ot07NI3KaZIkIUNXq82k4eBAqUaneXfleGY9JWskeS9y+u0nXMyspcuQrCg==} + '@types/bun@1.3.10': + resolution: {integrity: sha512-0+rlrUrOrTSskibryHbvQkDOWRJwJZqZlxrUs1u4oOoTln8+WIXBPmAuCF35SWB2z4Zl3E84Nl/D0P7803nigQ==} + '@types/cli-progress@3.11.5': resolution: {integrity: sha512-D4PbNRbviKyppS5ivBGyFO29POlySLmA2HyUFE4p5QGazAMM3CwkKWcvTl8gvElSuxRh6FPKL8XmidX873ou4g==} @@ -2994,6 +3000,9 @@ packages: buffer-from@1.1.2: resolution: 
{integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==} + bun-types@1.3.10: + resolution: {integrity: sha512-tcpfCCl6XWo6nCVnpcVrxQ+9AYN1iqMIzgrSKYMB/fjLtV2eyAVEg7AxQJuCq/26R6HpKWykQXuSOq/21RYcbg==} + bundle-require@5.1.0: resolution: {integrity: sha512-3WrrOuZiyaaZPWiEt4G3+IffISVC9HYlWueJEBWED4ZH4aIAC2PnkdnuRrR94M+w6yGWn4AglWtJtBI8YqvgoA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} @@ -8061,6 +8070,10 @@ snapshots: '@types/connect': 3.4.38 '@types/node': 22.19.1 + '@types/bun@1.3.10': + dependencies: + bun-types: 1.3.10 + '@types/cli-progress@3.11.5': dependencies: '@types/node': 22.19.1 @@ -8961,6 +8974,10 @@ snapshots: buffer-from@1.1.2: {} + bun-types@1.3.10: + dependencies: + '@types/node': 22.19.1 + bundle-require@5.1.0(esbuild@0.24.2): dependencies: esbuild: 0.24.2 From 13afd208bf314704cd443a76bb5051f9288872da Mon Sep 17 00:00:00 2001 From: Abhijeet Prasad Date: Wed, 4 Mar 2026 13:53:30 -0500 Subject: [PATCH 2/2] test.only fixes for bun --- js/Makefile | 4 +- js/src/wrappers/bun-test/suite.ts | 51 ++++++++++++---- js/src/wrappers/bun-test/types.ts | 98 ++++++------------------------ js/src/wrappers/node-test/types.ts | 31 +--------- js/src/wrappers/shared/types.ts | 29 +++++++++ 5 files changed, 91 insertions(+), 122 deletions(-) diff --git a/js/Makefile b/js/Makefile index d90b59323..49fb408b3 100644 --- a/js/Makefile +++ b/js/Makefile @@ -130,10 +130,10 @@ test-api-compat: pnpm test:api-compat # Test everything -test: test-core test-openai test-anthropic test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk test-bun-test +test: test-core test-openai test-anthropic test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk # Test the core and the latest versions of wrappers. 
-test-latest: test-core test-anthropic-latest test-openai-latest test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk test-bun-test +test-latest: test-core test-anthropic-latest test-openai-latest test-google-genai test-ai-sdk test-vitest test-claude-agent-sdk prune: diff --git a/js/src/wrappers/bun-test/suite.ts b/js/src/wrappers/bun-test/suite.ts index a70b73737..0536db082 100644 --- a/js/src/wrappers/bun-test/suite.ts +++ b/js/src/wrappers/bun-test/suite.ts @@ -34,6 +34,20 @@ const DIRECT_MODIFIERS = [ ] as const; const CONDITIONAL_MODIFIERS = ["if", "skipIf", "todoIf"] as const; +// Modifiers safe to validate eagerly. "only" is excluded because bun's CI +// mode throws when test.only is even *accessed* (property read) to prevent +// accidentally focusing tests. The .only wrapper defers access to call time. +const VALIDATED_MODIFIERS = [ + "skip", + "todo", + "failing", + "concurrent", + "serial", + "if", + "skipIf", + "todoIf", +] as const; + function validateTestFunction(test: unknown): ValidatedTestFunction { if (typeof test !== "function") { throw new Error( @@ -42,7 +56,7 @@ function validateTestFunction(test: unknown): ValidatedTestFunction { ); } const t = test as unknown as Record; - for (const mod of [...DIRECT_MODIFIERS, ...CONDITIONAL_MODIFIERS]) { + for (const mod of VALIDATED_MODIFIERS) { if (typeof t[mod] !== "function") { throw new Error( `initBunTestSuite: "test.${mod}" must be a function (got ${typeof t[mod]}). 
` + @@ -170,17 +184,30 @@ export function initBunTestSuite any>( } const t = validateTestFunction(config.test); - const suiteTest = Object.assign(wrapTestVariant(t), { - skip: wrapTestVariant(t.skip.bind(t)), - only: wrapTestVariant(t.only.bind(t)), - todo: wrapTestVariant(t.todo.bind(t)), - failing: wrapTestVariant(t.failing.bind(t)), - concurrent: wrapTestVariant(t.concurrent.bind(t)), - serial: wrapTestVariant(t.serial.bind(t)), - if: wrapConditional(t.if.bind(t)), - skipIf: wrapConditional(t.skipIf.bind(t)), - todoIf: wrapConditional(t.todoIf.bind(t)), - }) as SuiteTestFunction; + + // Build modifier wrappers lazily — calling .bind() on modifiers like + // test.only at construction time triggers bun's CI guard which disables + // .only when CI=true. By deferring the .bind() to invocation time, we + // avoid the error when the modifier is never actually called. + const modifiers: Partial> = {}; + for (const mod of DIRECT_MODIFIERS) { + (modifiers as any)[mod] = ( + name: string, + evalConfig: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, + ) => { + wrapTestVariant(t[mod].bind(t))(name, evalConfig, fn); + }; + } + for (const mod of CONDITIONAL_MODIFIERS) { + (modifiers as any)[mod] = (condition: boolean) => { + return wrapConditional(t[mod].bind(t))(condition); + }; + } + const suiteTest = Object.assign( + wrapTestVariant(t), + modifiers, + ) as unknown as SuiteTestFunction; async function flush(): Promise { if (!experiment) { diff --git a/js/src/wrappers/bun-test/types.ts b/js/src/wrappers/bun-test/types.ts index 512e7dcaf..e6bc9965a 100644 --- a/js/src/wrappers/bun-test/types.ts +++ b/js/src/wrappers/bun-test/types.ts @@ -1,6 +1,6 @@ -import type { ScorerFunction } from "../shared/types"; +export type { ScorerFunction, EvalConfig, EvalContext } from "../shared/types"; -export type { ScorerFunction } from "../shared/types"; +import type { EvalConfig, EvalContext } from "../shared/types"; /** Progress events emitted by the bun-test integration. 
*/ export type BunTestProgressEvent = @@ -12,93 +12,33 @@ export type BunTestProgressEvent = duration: number; }; -/** - * Configuration for a single eval test case. - */ -export interface EvalConfig { - /** Test input data, logged to the span. */ - input?: unknown; - /** Expected output, passed to scorers. */ - expected?: unknown; - /** Custom metadata, logged to the span. */ - metadata?: Record; - /** Tags for organizing test cases. */ - tags?: string[]; - /** Scorer functions to evaluate the output. */ - scorers?: ScorerFunction[]; - /** Override span name (defaults to the test name). */ - name?: string; -} +/** A single traced eval test function signature. */ +type EvalTestFn = ( + name: string, + config: EvalConfig, + fn: (context: EvalContext) => unknown | Promise, +) => void; -/** - * Context passed to the eval test function. - */ -export interface EvalContext { - input: unknown; - expected?: unknown; - metadata?: Record; -} +/** Conditional modifier: returns an EvalTestFn based on a boolean condition. */ +type ConditionalEvalTestFn = (condition: boolean) => EvalTestFn; /** * The wrapped test function with `(name, config, fn)` signature. 
*/ -export interface SuiteTestFunction { - ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ): void; - skip: ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ) => void; - only: ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ) => void; +export interface SuiteTestFunction extends EvalTestFn { + skip: EvalTestFn; + only: EvalTestFn; todo: ( name: string, config?: EvalConfig, fn?: (context: EvalContext) => unknown | Promise, ) => void; - failing: ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ) => void; - concurrent: ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ) => void; - serial: ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ) => void; - if: ( - condition: boolean, - ) => ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ) => void; - skipIf: ( - condition: boolean, - ) => ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ) => void; - todoIf: ( - condition: boolean, - ) => ( - name: string, - config: EvalConfig, - fn: (context: EvalContext) => unknown | Promise, - ) => void; + failing: EvalTestFn; + concurrent: EvalTestFn; + serial: EvalTestFn; + if: ConditionalEvalTestFn; + skipIf: ConditionalEvalTestFn; + todoIf: ConditionalEvalTestFn; } /** diff --git a/js/src/wrappers/node-test/types.ts b/js/src/wrappers/node-test/types.ts index 7cf36f2da..0f41deaf0 100644 --- a/js/src/wrappers/node-test/types.ts +++ b/js/src/wrappers/node-test/types.ts @@ -1,6 +1,6 @@ -import type { ScorerFunction } from "../shared/types"; +import type { EvalConfig, EvalContext } from "../shared/types"; -export type { ScorerFunction } from "../shared/types"; +export type { ScorerFunction, EvalConfig, EvalContext } from "../shared/types"; /** 
Progress events emitted by the node-test integration. */ export type NodeTestProgressEvent = @@ -46,33 +46,6 @@ export interface NodeTestSuiteConfig { onProgress?: (event: NodeTestProgressEvent) => void; } -/** - * Configuration for a single eval test case. - */ -export interface EvalConfig { - /** Test input data, logged to the span. */ - input?: unknown; - /** Expected output, passed to scorers. */ - expected?: unknown; - /** Custom metadata, logged to the span. */ - metadata?: Record; - /** Tags for organizing test cases. */ - tags?: string[]; - /** Scorer functions to evaluate the output. */ - scorers?: ScorerFunction[]; - /** Override span name (defaults to `t.name`, then `"unnamed test"`). */ - name?: string; -} - -/** - * Context passed to the eval test function. - */ -export interface EvalContext { - input: unknown; - expected?: unknown; - metadata?: Record; -} - /** * The public API surface returned by `initNodeTestSuite()`. */ diff --git a/js/src/wrappers/shared/types.ts b/js/src/wrappers/shared/types.ts index ba87847a2..7cb5ba553 100644 --- a/js/src/wrappers/shared/types.ts +++ b/js/src/wrappers/shared/types.ts @@ -8,6 +8,35 @@ export type ScorerFunction = (args: { metadata?: Record; }) => Score | Promise | number | null | Array; +/** + * Configuration for a single eval test case. + * Shared across test runner integrations (node-test, bun-test, etc.). + */ +export interface EvalConfig { + /** Test input data, logged to the span. */ + input?: unknown; + /** Expected output, passed to scorers. */ + expected?: unknown; + /** Custom metadata, logged to the span. */ + metadata?: Record; + /** Tags for organizing test cases. */ + tags?: string[]; + /** Scorer functions to evaluate the output. */ + scorers?: ScorerFunction[]; + /** Override span name (defaults to the test name). */ + name?: string; +} + +/** + * Context passed to the eval test function. + * Shared across test runner integrations (node-test, bun-test, etc.). 
+ */ +export interface EvalContext { + input: unknown; + expected?: unknown; + metadata?: Record; +} + // Progress event types for real-time test reporting export type ProgressEvent = | { type: "suite_start"; suiteName: string }