Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions btx/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
plugins {
    id 'java'
}

// Compile and run against Java 17 regardless of the JDK launching Gradle.
java {
    toolchain {
        languageVersion = JavaLanguageVersion.of(17)
    }
}

repositories {
    mavenCentral()
    // NOTE(review): mavenLocal() makes resolution depend on the developer's
    // local repository state; presumably kept for testing locally published
    // SDK snapshots — confirm it is intentional.
    mavenLocal()
}

dependencies {
    // Braintrust SDK (local project dependencies)
    testImplementation project(':braintrust-sdk')
    testImplementation project(':braintrust-sdk:instrumentation:openai_2_8_0')
    testImplementation project(':braintrust-sdk:instrumentation:anthropic_2_2_0')
    testImplementation project(':braintrust-sdk:instrumentation:genai_1_18_0')
    testImplementation project(':braintrust-sdk:instrumentation:langchain_1_8_0')

    // Jackson for JSON processing
    testImplementation 'com.fasterxml.jackson.core:jackson-databind:2.16.1'

    // OpenAI SDK
    testImplementation 'com.openai:openai-java:2.8.1'

    // Anthropic SDK
    testImplementation 'com.anthropic:anthropic-java:2.10.0'

    // Gemini SDK
    testImplementation 'org.springframework.ai:spring-ai-google-genai:1.1.0'

    // LangChain4j
    testImplementation 'dev.langchain4j:langchain4j:1.9.1'
    testImplementation 'dev.langchain4j:langchain4j-http-client:1.9.1'
    testImplementation 'dev.langchain4j:langchain4j-open-ai:1.9.1'

    // OpenTelemetry — use the shared version from rootProject.ext so the API
    // and SDK artifacts (declared below) cannot drift apart. This file
    // previously hard-coded opentelemetry-api:1.54.1 while opentelemetry-sdk
    // used ${rootProject.ext.otelVersion}; they must resolve to the same
    // version.
    testImplementation "io.opentelemetry:opentelemetry-api:${rootProject.ext.otelVersion}"

    // YAML parsing for spec files
    testImplementation 'org.yaml:snakeyaml:2.3'

    // Test framework
    testImplementation(testFixtures(project(":test-harness")))
    testImplementation "org.junit.jupiter:junit-jupiter:${rootProject.ext.junitVersion}"
    testImplementation "org.junit.jupiter:junit-jupiter-params:${rootProject.ext.junitVersion}"
    testImplementation "io.opentelemetry:opentelemetry-sdk:${rootProject.ext.otelVersion}"
    testRuntimeOnly 'org.slf4j:slf4j-simple:2.0.17'
    testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
}

test {
    useJUnitPlatform()
    // Run tests from the repository root so spec files resolve by relative path.
    workingDir = rootProject.projectDir
    testLogging {
        events "passed", "skipped", "failed"
        showStandardStreams = true
        exceptionFormat "full"
    }
}
9 changes: 9 additions & 0 deletions btx/spec/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Braintrust Spec

Cross-language specs for implementing a Braintrust SDK.

Contains:

- Markdown files describing complex features
- YAML files describing end-to-end tests and assertions
- YAML files describing cross-language constants (environment variables, string attributes)
3 changes: 3 additions & 0 deletions btx/spec/llm_span/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# llm span end-to-end tests

TODO: document this
34 changes: 34 additions & 0 deletions btx/spec/llm_span/anthropic/messages.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# End-to-end llm-span test: a single Anthropic Messages API call with a system
# prompt, asserting the resulting Brainstore span's metrics, metadata, and
# input/output attributes.
name: messages
type: llm_span_test
provider: anthropic
endpoint: /v1/messages
enabled_runners: ["python", "typescript", "java", "csharp"]
requests:
  # One deterministic request (temperature 0) so the answer text is stable.
  - model: claude-haiku-4-5-20251001
    temperature: 0.0
    max_tokens: 128
    system: "You are a helpful assistant."
    messages:
      - role: user
        content: What is the capital of France?
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      model: claude-haiku-4-5-20251001
      provider: anthropic
    span_attributes:
      name: anthropic.messages.create
      type: llm
    # The system prompt is expected to be surfaced as a message in the span
    # input, after the user message.
    input:
      - content: What is the capital of France?
        role: user
      - content: "You are a helpful assistant."
        role: system
    output:
      content:
        - text: The capital of France is Paris.
          type: text
      role: assistant
44 changes: 44 additions & 0 deletions btx/spec/llm_span/google/attachments.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# End-to-end llm-span test: Google Gemini generateContent with an inline image,
# asserting the image is re-uploaded as a Braintrust attachment in the span
# input.
# NOTE(review): request keys mix snake_case (inline_data, mime_type) with
# camelCase (generationConfig) — confirm the runners normalize both forms.
name: attachments
type: llm_span_test
provider: google
endpoint: /v1/models/gemini-2.0-flash:generateContent
enabled_runners: ["python", "typescript", "java", "go"]
requests:
  - contents:
      - role: user
        parts:
          - text: What color is this image?
          - inline_data:
              mime_type: image/png
              # 1x1 red pixel
              data: iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==
    generationConfig:
      temperature: 0.0
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      model: gemini-2.0-flash
    span_attributes:
      name: generate_content
      type: llm
    input:
      model: gemini-2.0-flash
      contents:
        - role: user
          parts:
            - text: What color is this image?
            # The inline image is expected to be replaced by a reference to an
            # uploaded Braintrust attachment.
            - image_url:
                url:
                  content_type: image/png
                  filename: !fn is_non_empty_string
                  key: !fn is_non_empty_string
                  type: braintrust_attachment
    # Only the shape of the answer is asserted — the model's wording may vary.
    output:
      candidates:
        - content:
            parts:
              - text: !fn is_non_empty_string
            role: model
34 changes: 34 additions & 0 deletions btx/spec/llm_span/google/generate_content.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# End-to-end llm-span test: a plain text Google Gemini generateContent call,
# asserting span metrics, metadata, and input/output attributes.
name: generate_content
type: llm_span_test
provider: google
endpoint: /v1/models/gemini-2.5-flash:generateContent
enabled_runners: ["python", "typescript", "java", "go"]
requests:
  # One deterministic request (temperature 0).
  - contents:
      - role: user
        parts:
          - text: What is the capital of France?
    generationConfig:
      temperature: 0.0
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      model: gemini-2.5-flash
    span_attributes:
      name: generate_content
      type: llm
    input:
      model: gemini-2.5-flash
      contents:
        - role: user
          parts:
            - text: What is the capital of France?
    # Only the shape of the answer is asserted — the model's wording may vary.
    output:
      candidates:
        - content:
            parts:
              - text: !fn is_non_empty_string
            role: model
52 changes: 52 additions & 0 deletions btx/spec/llm_span/openai/attachments.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# End-to-end llm-span test: OpenAI chat completion with an inline data-URL
# image, asserting the image is re-uploaded as a Braintrust attachment in the
# span input.
# NOTE(review): unlike the anthropic/google specs, no enabled_runners key is
# present — presumably the runner defaults to all languages; confirm.
name: attachments
type: llm_span_test
provider: openai
endpoint: /v1/chat/completions
requests:
  - model: gpt-4o-mini
    temperature: 0.0
    messages:
      - role: system
        content: you are a helpful assistant
      - role: user
        content:
          - type: text
            text: What color is this image?
          - type: image_url
            image_url:
              # 1x1 red pixel
              url: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      # The API may report a dated snapshot (e.g. gpt-4o-mini-2024-...), so
      # only the prefix is asserted.
      model: !starts_with "gpt-4o-mini"
      provider: openai
    span_attributes:
      name: Chat Completion
      type: llm
    input:
      - role: system
        content: you are a helpful assistant
      - role: user
        content:
          - text: What color is this image?
            type: text
          # The data URL is expected to be replaced by a reference to an
          # uploaded Braintrust attachment.
          - image_url:
              url:
                content_type: image/png
                filename: !fn is_non_empty_string
                key: !fn is_non_empty_string
                type: braintrust_attachment
            type: image_url
    # !or accepts either the full choice object or a bare message shape —
    # SDKs differ in how much of the response they record.
    output:
      - !or
        - finish_reason: stop
          index: 0
          message:
            role: assistant
            content: !fn is_non_empty_string
        - role: assistant
          content: !fn is_non_empty_string
34 changes: 34 additions & 0 deletions btx/spec/llm_span/openai/completions.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# End-to-end llm-span test: a plain OpenAI chat completion with a system
# prompt, asserting span metrics, metadata, and input/output attributes.
# NOTE(review): no enabled_runners key — presumably defaults to all runners;
# confirm against the spec runner.
name: completions
type: llm_span_test
provider: openai
endpoint: /v1/chat/completions
requests:
  # One deterministic request (temperature 0) so the answer text is stable.
  - model: gpt-4o-mini
    temperature: 0.0
    messages:
      - role: system
        content: you are a helpful assistant
      - role: user
        content: What is the capital of France?
expected_brainstore_spans:
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
    metadata:
      # The API may report a dated snapshot (e.g. gpt-4o-mini-2024-...), so
      # only the prefix is asserted.
      model: !starts_with "gpt-4o-mini"
      provider: openai
    span_attributes:
      name: Chat Completion
      type: llm
    input:
      - role: system
        content: you are a helpful assistant
      - role: user
        content: What is the capital of France?
    output:
      - finish_reason: stop
        index: 0
        message:
          role: assistant
          content: The capital of France is Paris.
84 changes: 84 additions & 0 deletions btx/spec/llm_span/openai/reasoning.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# End-to-end llm-span test: two chained OpenAI Responses API calls with a
# reasoning model (o4-mini), asserting reasoning summaries and reasoning-token
# metrics appear on both spans.
name: reasoning
type: llm_span_test
provider: openai
endpoint: /v1/responses
# NOTE(review): this comment previously read "java instrumentation doesn't
# support responses yet", yet "java" IS listed in enabled_runners below —
# one of the two is stale; confirm which and fix the other.
enabled_runners: ["python", "typescript", "java"]
requests:
  # Turn 1: ask for the pattern in 2, 6, 12, 20, 30 (n(n+1)).
  - model: o4-mini
    reasoning:
      effort: high
      summary: detailed
    input:
      - role: user
        content: >
          Look at this sequence: 2, 6, 12, 20, 30. What is the pattern and what
          would be the formula for the nth term?
  # Turn 2: follow-up that depends on turn 1's answer.
  - model: o4-mini
    reasoning:
      effort: high
      summary: detailed
    input:
      # Full context from the first response is inserted by the runner
      - role: user
        content: Using the pattern you discovered, what would be the 10th term? And can you find the sum of the first 10 terms?
expected_brainstore_spans:
  # Span for turn 1.
  - metrics:
      # !fn tags name matcher functions evaluated by the spec runner.
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
      completion_reasoning_tokens: !fn is_non_negative_number
    metadata:
      model: !starts_with "o4-mini"
      provider: openai
    span_attributes:
      type: llm
    input:
      - role: user
        content: >
          Look at this sequence: 2, 6, 12, 20, 30. What is the pattern and what
          would be the formula for the nth term?
    output:
      - type: reasoning
        summary: !fn is_reasoning_message
      - type: message
        role: assistant
        status: completed
        content:
          - type: output_text
            annotations: []
            logprobs: []
            text: !fn is_non_empty_string
  # Span for turn 2 (input includes the inserted turn-1 context).
  - metrics:
      tokens: !fn is_non_negative_number
      prompt_tokens: !fn is_non_negative_number
      completion_tokens: !fn is_non_negative_number
      completion_reasoning_tokens: !fn is_non_negative_number
    metadata:
      model: !starts_with "o4-mini"
      provider: openai
    span_attributes:
      type: llm
    input:
      # Light assertions since most of the context is duplicated/inserted
      - role: user
      - type: reasoning
        summary: !fn is_reasoning_message
      - type: message
        role: assistant
        status: completed
        content:
          - type: output_text
      - role: user
        content: Using the pattern you discovered, what would be the 10th term? And can you find the sum of the first 10 terms?
    output:
      - type: reasoning
        summary: !fn is_reasoning_message
      - type: message
        role: assistant
        status: completed
        content:
          - type: output_text
            annotations: []
            logprobs: []
            # Sum of the first 10 terms of n(n+1) is 440; assert it appears.
            text: !fn "lambda value: \"440\" in value"
Loading
Loading