Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions btx/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
plugins {
    id 'java'
}

// Compile and run against Java 17 regardless of the JDK launching Gradle.
java {
    toolchain {
        languageVersion = JavaLanguageVersion.of(17)
    }
}

repositories {
    mavenCentral()
    // NOTE(review): mavenLocal() makes resolution depend on the developer's
    // local repository state; presumably kept for testing locally published
    // SDK snapshots — confirm it is intentional.
    mavenLocal()
}

dependencies {
    // Braintrust SDK (local project dependencies)
    testImplementation project(':braintrust-sdk')
    testImplementation project(':braintrust-sdk:instrumentation:openai_2_8_0')
    testImplementation project(':braintrust-sdk:instrumentation:anthropic_2_2_0')
    testImplementation project(':braintrust-sdk:instrumentation:genai_1_18_0')
    testImplementation project(':braintrust-sdk:instrumentation:langchain_1_8_0')

    // Jackson for JSON processing
    testImplementation 'com.fasterxml.jackson.core:jackson-databind:2.16.1'

    // OpenAI SDK
    testImplementation 'com.openai:openai-java:2.8.1'

    // Anthropic SDK
    testImplementation 'com.anthropic:anthropic-java:2.10.0'

    // Gemini SDK
    testImplementation 'org.springframework.ai:spring-ai-google-genai:1.1.0'

    // LangChain4j
    testImplementation 'dev.langchain4j:langchain4j:1.9.1'
    testImplementation 'dev.langchain4j:langchain4j-http-client:1.9.1'
    testImplementation 'dev.langchain4j:langchain4j-open-ai:1.9.1'

    // OpenTelemetry — use the shared version from rootProject.ext so the API
    // and SDK artifacts (declared below) cannot drift apart. This file
    // previously hard-coded opentelemetry-api:1.54.1 while opentelemetry-sdk
    // used ${rootProject.ext.otelVersion}; they must resolve to the same
    // version.
    testImplementation "io.opentelemetry:opentelemetry-api:${rootProject.ext.otelVersion}"

    // YAML parsing for spec files
    testImplementation 'org.yaml:snakeyaml:2.3'

    // Test framework
    testImplementation(testFixtures(project(":test-harness")))
    testImplementation "org.junit.jupiter:junit-jupiter:${rootProject.ext.junitVersion}"
    testImplementation "org.junit.jupiter:junit-jupiter-params:${rootProject.ext.junitVersion}"
    testImplementation "io.opentelemetry:opentelemetry-sdk:${rootProject.ext.otelVersion}"
    testRuntimeOnly 'org.slf4j:slf4j-simple:2.0.17'
    testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
}

test {
    useJUnitPlatform()
    // Run tests from the repository root so spec files resolve by relative path.
    workingDir = rootProject.projectDir
    testLogging {
        events "passed", "skipped", "failed"
        showStandardStreams = true
        exceptionFormat "full"
    }
}
9 changes: 9 additions & 0 deletions btx/spec/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Braintrust Spec

Cross-language specs for implementing a Braintrust SDK.

Contains:

- Markdown files describing complex features
- YAML files describing end-to-end tests and assertions
- YAML files describing cross-language constants (environment variables, string attributes)
3 changes: 3 additions & 0 deletions btx/spec/llm_span/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# llm span end-to-end tests

TODO: document this
34 changes: 34 additions & 0 deletions btx/spec/llm_span/anthropic/messages.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# End-to-end llm-span test: a single Anthropic Messages API call with a system
# prompt, asserting the resulting Brainstore span's metrics, metadata, and
# input/output attributes.
name: messages
type: llm_span_test
provider: anthropic
endpoint: /v1/messages
enabled_runners: ["python", "typescript", "java", "csharp"]
requests:
  # One deterministic request (temperature 0) so the answer text is stable.
  - model: claude-haiku-4-5-20251001
    temperature: 0.0
    max_tokens: 128
    system: "You are a helpful assistant."
    messages:
      - role: user
        content: What is the capital of France?
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      model: claude-haiku-4-5-20251001
      provider: anthropic
    span_attributes:
      name: anthropic.messages.create
      type: llm
    # The system prompt is expected to be surfaced as a message in the span
    # input, after the user message.
    input:
      - content: What is the capital of France?
        role: user
      - content: "You are a helpful assistant."
        role: system
    output:
      content:
        - text: The capital of France is Paris.
          type: text
      role: assistant
44 changes: 44 additions & 0 deletions btx/spec/llm_span/google/attachments.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# End-to-end llm-span test: Google Gemini generateContent with an inline image,
# asserting the image is re-uploaded as a Braintrust attachment in the span
# input.
# NOTE(review): request keys mix snake_case (inline_data, mime_type) with
# camelCase (generationConfig) — confirm the runners normalize both forms.
name: attachments
type: llm_span_test
provider: google
endpoint: /v1/models/gemini-2.0-flash:generateContent
enabled_runners: ["python", "typescript", "java", "go"]
requests:
  - contents:
      - role: user
        parts:
          - text: What color is this image?
          - inline_data:
              mime_type: image/png
              # 1x1 red pixel
              data: iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==
    generationConfig:
      temperature: 0.0
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      model: gemini-2.0-flash
    span_attributes:
      name: generate_content
      type: llm
    input:
      model: gemini-2.0-flash
      contents:
        - role: user
          parts:
            - text: What color is this image?
            # The inline image is expected to be replaced by a reference to an
            # uploaded Braintrust attachment.
            - image_url:
                url:
                  content_type: image/png
                  filename: !fn is_non_empty_string
                  key: !fn is_non_empty_string
                  type: braintrust_attachment
    # Only the shape of the answer is asserted — the model's wording may vary.
    output:
      candidates:
        - content:
            parts:
              - text: !fn is_non_empty_string
            role: model
34 changes: 34 additions & 0 deletions btx/spec/llm_span/google/generate_content.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# End-to-end llm-span test: a plain text Google Gemini generateContent call,
# asserting span metrics, metadata, and input/output attributes.
name: generate_content
type: llm_span_test
provider: google
endpoint: /v1/models/gemini-2.5-flash:generateContent
enabled_runners: ["python", "typescript", "java", "go"]
requests:
  # One deterministic request (temperature 0).
  - contents:
      - role: user
        parts:
          - text: What is the capital of France?
    generationConfig:
      temperature: 0.0
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      model: gemini-2.5-flash
    span_attributes:
      name: generate_content
      type: llm
    input:
      model: gemini-2.5-flash
      contents:
        - role: user
          parts:
            - text: What is the capital of France?
    # Only the shape of the answer is asserted — the model's wording may vary.
    output:
      candidates:
        - content:
            parts:
              - text: !fn is_non_empty_string
            role: model
52 changes: 52 additions & 0 deletions btx/spec/llm_span/openai/attachments.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# End-to-end llm-span test: OpenAI chat completion with an inline data-URL
# image, asserting the image is re-uploaded as a Braintrust attachment in the
# span input.
# NOTE(review): unlike the anthropic/google specs, no enabled_runners key is
# present — presumably the runner defaults to all languages; confirm.
name: attachments
type: llm_span_test
provider: openai
endpoint: /v1/chat/completions
requests:
  - model: gpt-4o-mini
    temperature: 0.0
    messages:
      - role: system
        content: you are a helpful assistant
      - role: user
        content:
          - type: text
            text: What color is this image?
          - type: image_url
            image_url:
              # 1x1 red pixel
              url: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      # The API may report a dated snapshot (e.g. gpt-4o-mini-2024-...), so
      # only the prefix is asserted.
      model: !starts_with "gpt-4o-mini"
      provider: openai
    span_attributes:
      name: Chat Completion
      type: llm
    input:
      - role: system
        content: you are a helpful assistant
      - role: user
        content:
          - text: What color is this image?
            type: text
          # The data URL is expected to be replaced by a reference to an
          # uploaded Braintrust attachment.
          - image_url:
              url:
                content_type: image/png
                filename: !fn is_non_empty_string
                key: !fn is_non_empty_string
                type: braintrust_attachment
            type: image_url
    # !or accepts either the full choice object or a bare message shape —
    # SDKs differ in how much of the response they record.
    output:
      - !or
        - finish_reason: stop
          index: 0
          message:
            role: assistant
            content: !fn is_non_empty_string
        - role: assistant
          content: !fn is_non_empty_string
34 changes: 34 additions & 0 deletions btx/spec/llm_span/openai/completions.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# End-to-end llm-span test: a plain OpenAI chat completion with a system
# prompt, asserting span metrics, metadata, and input/output attributes.
# NOTE(review): no enabled_runners key — presumably defaults to all runners;
# confirm against the spec runner.
name: completions
type: llm_span_test
provider: openai
endpoint: /v1/chat/completions
requests:
  # One deterministic request (temperature 0) so the answer text is stable.
  - model: gpt-4o-mini
    temperature: 0.0
    messages:
      - role: system
        content: you are a helpful assistant
      - role: user
        content: What is the capital of France?
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      # The API may report a dated snapshot (e.g. gpt-4o-mini-2024-...), so
      # only the prefix is asserted.
      model: !starts_with "gpt-4o-mini"
      provider: openai
    span_attributes:
      name: Chat Completion
      type: llm
    input:
      - role: system
        content: you are a helpful assistant
      - role: user
        content: What is the capital of France?
    output:
      - finish_reason: stop
        index: 0
        message:
          role: assistant
          content: The capital of France is Paris.
84 changes: 84 additions & 0 deletions btx/spec/llm_span/openai/reasoning.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# End-to-end llm-span test: two chained OpenAI Responses API calls with a
# reasoning model (o4-mini), asserting reasoning summaries and reasoning-token
# metrics appear on both spans.
name: reasoning
type: llm_span_test
provider: openai
endpoint: /v1/responses
# NOTE(review): this comment previously read "java instrumentation doesn't
# support responses yet", yet "java" IS listed in enabled_runners below —
# one of the two is stale; confirm which and fix the other.
enabled_runners: ["python", "typescript", "java"]
requests:
  # Turn 1: ask for the pattern in 2, 6, 12, 20, 30 (n(n+1)).
  - model: o4-mini
    reasoning:
      effort: high
      summary: detailed
    input:
      - role: user
        content: >
          Look at this sequence: 2, 6, 12, 20, 30. What is the pattern and what
          would be the formula for the nth term?
  # Turn 2: follow-up that depends on turn 1's answer.
  - model: o4-mini
    reasoning:
      effort: high
      summary: detailed
    input:
      # Full context from the first response is inserted by the runner
      - role: user
        content: Using the pattern you discovered, what would be the 10th term? And can you find the sum of the first 10 terms?
expected_brainstore_spans:
  # Span for turn 1.
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
      completion_reasoning_tokens: !fn is_non_negative_number
    metadata:
      model: !starts_with "o4-mini"
      provider: openai
    span_attributes:
      type: llm
    input:
      - role: user
        content: >
          Look at this sequence: 2, 6, 12, 20, 30. What is the pattern and what
          would be the formula for the nth term?
    output:
      - type: reasoning
        summary: !fn is_reasoning_message
      - type: message
        role: assistant
        status: completed
        content:
          - type: output_text
            annotations: []
            logprobs: []
            text: !fn is_non_empty_string
  # Span for turn 2 (input includes the inserted turn-1 context).
  - metrics:
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
      completion_reasoning_tokens: !fn is_non_negative_number
    metadata:
      model: !starts_with "o4-mini"
      provider: openai
    span_attributes:
      type: llm
    input:
      # Light assertions since most of the context is duplicated/inserted
      - role: user
      - type: reasoning
        summary: !fn is_reasoning_message
      - type: message
        role: assistant
        status: completed
        content:
          - type: output_text
      - role: user
        content: Using the pattern you discovered, what would be the 10th term? And can you find the sum of the first 10 terms?
    output:
      - type: reasoning
        summary: !fn is_reasoning_message
      - type: message
        role: assistant
        status: completed
        content:
          - type: output_text
            annotations: []
            logprobs: []
            # Sum of the first 10 terms of n(n+1) is 440; assert it appears.
            text: !fn "lambda value: \"440\" in value"
Loading
Loading