Skip to content

Commit d2d51de

Browse files
committed
SDK spec llm span test runner
1 parent 328dbdd commit d2d51de

23 files changed

Lines changed: 2130 additions & 0 deletions

btx/build.gradle

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
plugins {
2+
id 'java'
3+
}
4+
5+
java {
6+
toolchain {
7+
languageVersion = JavaLanguageVersion.of(17)
8+
}
9+
}
10+
11+
repositories {
12+
mavenCentral()
13+
mavenLocal()
14+
}
15+
16+
dependencies {
17+
// Braintrust SDK (local project dependencies)
18+
testImplementation project(':braintrust-sdk')
19+
testImplementation project(':braintrust-sdk:instrumentation:openai_2_8_0')
20+
testImplementation project(':braintrust-sdk:instrumentation:anthropic_2_2_0')
21+
testImplementation project(':braintrust-sdk:instrumentation:genai_1_18_0')
22+
testImplementation project(':braintrust-sdk:instrumentation:langchain_1_8_0')
23+
24+
// Jackson for JSON processing
25+
testImplementation 'com.fasterxml.jackson.core:jackson-databind:2.16.1'
26+
27+
// OpenAI SDK
28+
testImplementation 'com.openai:openai-java:2.8.1'
29+
30+
// Anthropic SDK
31+
testImplementation 'com.anthropic:anthropic-java:2.10.0'
32+
33+
// Gemini SDK
34+
testImplementation 'org.springframework.ai:spring-ai-google-genai:1.1.0'
35+
36+
// LangChain4j
37+
testImplementation 'dev.langchain4j:langchain4j:1.9.1'
38+
testImplementation 'dev.langchain4j:langchain4j-http-client:1.9.1'
39+
testImplementation 'dev.langchain4j:langchain4j-open-ai:1.9.1'
40+
41+
// OpenTelemetry
42+
testImplementation 'io.opentelemetry:opentelemetry-api:1.54.1'
43+
44+
// YAML parsing for spec files
45+
testImplementation 'org.yaml:snakeyaml:2.3'
46+
47+
// Test framework
48+
testImplementation(testFixtures(project(":test-harness")))
49+
testImplementation "org.junit.jupiter:junit-jupiter:${rootProject.ext.junitVersion}"
50+
testImplementation "org.junit.jupiter:junit-jupiter-params:${rootProject.ext.junitVersion}"
51+
testImplementation "io.opentelemetry:opentelemetry-sdk:${rootProject.ext.otelVersion}"
52+
testRuntimeOnly 'org.slf4j:slf4j-simple:2.0.17'
53+
testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
54+
}
55+
56+
test {
57+
useJUnitPlatform()
58+
workingDir = rootProject.projectDir
59+
testLogging {
60+
events "passed", "skipped", "failed"
61+
showStandardStreams = true
62+
exceptionFormat "full"
63+
}
64+
}

btx/health-check-btx-server

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env bash
set -euo pipefail

# BTX Server Health Check Script
# Usage: ./scripts/health-check-btx-server <runner-directory>
# Example: ./scripts/health-check-btx-server ./src/btx/runners/java
#
# Runs "<runner-directory>/start.sh <port>" in the background on a free port,
# then polls http://localhost:<port>/health until it answers successfully,
# the server process dies, or MAX_WAIT seconds have passed.
#
# Exits with:
#   0 - Server started successfully and health check passed
#   1 - Server failed to start or health check failed

if [ $# -ne 1 ]; then
    echo "Usage: $0 <runner-directory>" >&2
    echo "Example: $0 ./src/btx/runners/java" >&2
    exit 1
fi

RUNNER_DIR="$1"
START_SCRIPT="$RUNNER_DIR/start.sh"

# Validate runner directory
if [ ! -d "$RUNNER_DIR" ]; then
    echo "Error: Runner directory does not exist: $RUNNER_DIR" >&2
    exit 1
fi

if [ ! -f "$START_SCRIPT" ]; then
    echo "Error: start.sh not found in $RUNNER_DIR" >&2
    exit 1
fi

if [ ! -x "$START_SCRIPT" ]; then
    echo "Error: start.sh is not executable: $START_SCRIPT" >&2
    exit 1
fi

# Fail with the documented exit code (1) and a clear message when a tool this
# script depends on is missing, instead of a raw "command not found" (127).
for tool in python3 curl; do
    if ! command -v "$tool" > /dev/null 2>&1; then
        echo "Error: required command not found: $tool" >&2
        exit 1
    fi
done

# Ask the OS for a free ephemeral port by binding to port 0.
# The socket is closed before the server starts, so another process could
# grab the port in between; this is a best-effort pick, not a reservation.
find_open_port() {
    python3 -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()'
}

PORT=$(find_open_port)
HEALTH_URL="http://localhost:$PORT/health"
MAX_WAIT=30          # Maximum seconds to wait for health check
POLL_INTERVAL=0.5    # Seconds between health check attempts
CURL_TIMEOUT=5       # Seconds a single health request may take before it is abandoned
PROGRESS_INTERVAL=5  # Seconds between "Still waiting..." messages

echo "Starting BTX server on port $PORT..."

# Scratch directory for the server log.
# Deliberately NOT named TMPDIR: that is the standard temp-dir environment
# variable, and when it is already exported, reassigning it would hand the
# server a temp directory that cleanup() deletes.
WORK_DIR=$(mktemp -d)
LOG_FILE="$WORK_DIR/btx-server.log"
SERVER_PID=""

# Stop the server (if it was started and is still running) and remove the
# scratch directory. Runs on every exit path via the EXIT trap.
# NOTE(review): only the start.sh process itself is signalled; if start.sh
# spawns the server as a child without exec, confirm the child also exits.
cleanup() {
    if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "Stopping server (PID: $SERVER_PID)..."
        kill "$SERVER_PID" 2>/dev/null || true
        # Give it a moment to shut down gracefully
        sleep 0.5
        # Force kill if still running
        if kill -0 "$SERVER_PID" 2>/dev/null; then
            kill -9 "$SERVER_PID" 2>/dev/null || true
        fi
    fi
    rm -rf "$WORK_DIR"
}

trap cleanup EXIT

# Start the server in the background, capturing stdout and stderr in the log
"$START_SCRIPT" "$PORT" > "$LOG_FILE" 2>&1 &
SERVER_PID=$!

echo "Server started with PID: $SERVER_PID"
echo "Log file: $LOG_FILE"

# Wait for server to be healthy
echo "Waiting for health check at $HEALTH_URL..."

# SECONDS is bash's built-in elapsed-time counter. Measuring wall-clock time
# keeps MAX_WAIT accurate regardless of POLL_INTERVAL or how long curl takes.
START_TIME=$SECONDS
NEXT_REPORT=$PROGRESS_INTERVAL

while [ $((SECONDS - START_TIME)) -lt "$MAX_WAIT" ]; do
    # Check if process is still alive
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "Error: Server process died unexpectedly" >&2
        echo "Last 20 lines of log:" >&2
        tail -n 20 "$LOG_FILE" >&2
        exit 1
    fi

    # Try health check. One request both tests for success (-f fails on HTTP
    # errors) and captures the body, so there is no second, unguarded request.
    if RESPONSE=$(curl -sf --max-time "$CURL_TIMEOUT" "$HEALTH_URL" 2>/dev/null); then
        echo "Health check passed!"
        echo "Response: $RESPONSE"
        exit 0
    fi

    sleep "$POLL_INTERVAL"

    # Show progress roughly every PROGRESS_INTERVAL seconds
    ELAPSED=$((SECONDS - START_TIME))
    if [ "$ELAPSED" -ge "$NEXT_REPORT" ]; then
        echo "Still waiting... (${ELAPSED}s elapsed)"
        NEXT_REPORT=$((ELAPSED + PROGRESS_INTERVAL))
    fi
done

# Timeout reached
echo "Error: Health check timed out after ${MAX_WAIT}s" >&2
echo "Last 20 lines of log:" >&2
tail -n 20 "$LOG_FILE" >&2
exit 1

btx/spec/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Braintrust Spec
2+
3+
Cross-language specs for implementing a Braintrust SDK.
4+
5+
Contains:
6+
7+
- markdown files describing complex features
8+
- yaml describing end-to-end tests and assertions
9+
- YAML describing cross-language constants (environment variables, string attributes)

btx/spec/llm_span/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# LLM span end-to-end tests
2+
3+
TODO: document this
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
name: messages
2+
type: llm_span_test
3+
provider: anthropic
4+
endpoint: /v1/messages
5+
enabled_runners: ["python", "typescript", "java", "csharp"]
6+
requests:
7+
- model: claude-haiku-4-5-20251001
8+
temperature: 0.0
9+
max_tokens: 128
10+
system: "You are a helpful assistant."
11+
messages:
12+
- role: user
13+
content: What is the capital of France?
14+
expected_brainstore_spans:
15+
- metrics:
16+
tokens: !fn is_non_negative_number
17+
prompt_tokens: !fn is_non_negative_number
18+
completion_tokens: !fn is_non_negative_number
19+
metadata:
20+
model: claude-haiku-4-5-20251001
21+
provider: anthropic
22+
span_attributes:
23+
name: anthropic.messages.create
24+
type: llm
25+
input:
26+
- content: What is the capital of France?
27+
role: user
28+
- content: "You are a helpful assistant."
29+
role: system
30+
output:
31+
content:
32+
- text: The capital of France is Paris.
33+
type: text
34+
role: assistant
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
name: attachments
2+
type: llm_span_test
3+
provider: google
4+
endpoint: /v1/models/gemini-2.0-flash:generateContent
5+
enabled_runners: ["python", "typescript", "java", "go"]
6+
requests:
7+
- contents:
8+
- role: user
9+
parts:
10+
- text: What color is this image?
11+
- inline_data:
12+
mime_type: image/png
13+
# 1x1 red pixel
14+
data: iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==
15+
generationConfig:
16+
temperature: 0.0
17+
expected_brainstore_spans:
18+
- metrics:
19+
tokens: !fn is_non_negative_number
20+
prompt_tokens: !fn is_non_negative_number
21+
completion_tokens: !fn is_non_negative_number
22+
metadata:
23+
model: gemini-2.0-flash
24+
span_attributes:
25+
name: generate_content
26+
type: llm
27+
input:
28+
model: gemini-2.0-flash
29+
contents:
30+
- role: user
31+
parts:
32+
- text: What color is this image?
33+
- image_url:
34+
url:
35+
content_type: image/png
36+
filename: !fn is_non_empty_string
37+
key: !fn is_non_empty_string
38+
type: braintrust_attachment
39+
output:
40+
candidates:
41+
- content:
42+
parts:
43+
- text: !fn is_non_empty_string
44+
role: model
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
name: generate_content
2+
type: llm_span_test
3+
provider: google
4+
endpoint: /v1/models/gemini-2.5-flash:generateContent
5+
enabled_runners: ["python", "typescript", "java", "go"]
6+
requests:
7+
- contents:
8+
- role: user
9+
parts:
10+
- text: What is the capital of France?
11+
generationConfig:
12+
temperature: 0.0
13+
expected_brainstore_spans:
14+
- metrics:
15+
tokens: !fn is_non_negative_number
16+
prompt_tokens: !fn is_non_negative_number
17+
completion_tokens: !fn is_non_negative_number
18+
metadata:
19+
model: gemini-2.5-flash
20+
span_attributes:
21+
name: generate_content
22+
type: llm
23+
input:
24+
model: gemini-2.5-flash
25+
contents:
26+
- role: user
27+
parts:
28+
- text: What is the capital of France?
29+
output:
30+
candidates:
31+
- content:
32+
parts:
33+
- text: !fn is_non_empty_string
34+
role: model
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
name: attachments
2+
type: llm_span_test
3+
provider: openai
4+
endpoint: /v1/chat/completions
5+
requests:
6+
- model: gpt-4o-mini
7+
temperature: 0.0
8+
messages:
9+
- role: system
10+
content: you are a helpful assistant
11+
- role: user
12+
content:
13+
- type: text
14+
text: What color is this image?
15+
- type: image_url
16+
image_url:
17+
# 1x1 red pixel
18+
url: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==
19+
expected_brainstore_spans:
20+
- metrics:
21+
tokens: !fn is_non_negative_number
22+
prompt_tokens: !fn is_non_negative_number
23+
completion_tokens: !fn is_non_negative_number
24+
metadata:
25+
model: !starts_with "gpt-4o-mini"
26+
provider: openai
27+
span_attributes:
28+
name: Chat Completion
29+
type: llm
30+
input:
31+
- role: system
32+
content: you are a helpful assistant
33+
- role: user
34+
content:
35+
- text: What color is this image?
36+
type: text
37+
- image_url:
38+
url:
39+
content_type: image/png
40+
filename: !fn is_non_empty_string
41+
key: !fn is_non_empty_string
42+
type: braintrust_attachment
43+
type: image_url
44+
output:
45+
- !or
46+
- finish_reason: stop
47+
index: 0
48+
message:
49+
role: assistant
50+
content: !fn is_non_empty_string
51+
- role: assistant
52+
content: !fn is_non_empty_string

0 commit comments

Comments
 (0)