From 536fba4d8b88eaa29eb1dd2a48f5bf63f1435c91 Mon Sep 17 00:00:00 2001 From: Parker Henderson Date: Tue, 24 Mar 2026 16:56:21 -0700 Subject: [PATCH 1/2] feat: add cross-platform SSE listener support --- src/eval.rs | 195 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 163 insertions(+), 32 deletions(-) diff --git a/src/eval.rs b/src/eval.rs index d108c61..8244479 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -28,6 +28,9 @@ use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use strip_ansi_escapes::strip; use tokio::io::{AsyncBufReadExt, BufReader}; +#[cfg(not(unix))] +use tokio::net::TcpListener; +#[cfg(unix)] use tokio::net::UnixListener; use tokio::process::Command; use tokio::sync::mpsc; @@ -71,7 +74,7 @@ struct EvalRunnerProcess { rx: mpsc::UnboundedReceiver, sse_task: tokio::task::JoinHandle<()>, sse_connected: Arc, - _socket_cleanup_guard: SocketCleanupGuard, + _socket_cleanup_guard: Option, } struct EvalProcessOutput { @@ -210,6 +213,20 @@ impl Drop for SocketCleanupGuard { } } +enum SseListener { + #[cfg(unix)] + Unix(UnixListener), + #[cfg(not(unix))] + Tcp(TcpListener), +} + +struct BoundSseListener { + listener: SseListener, + env_key: &'static str, + env_value: String, + socket_cleanup_guard: Option, +} + #[derive(Debug, Copy, Clone, Eq, PartialEq, ValueEnum)] pub enum EvalLanguage { #[value(alias = "js")] @@ -659,32 +676,60 @@ async fn spawn_eval_runner( let (js_runner, py_runner) = prepare_eval_runners()?; let force_esm = matches!(js_mode, JsMode::ForceEsm); - let (listener, socket_path, socket_cleanup_guard) = bind_sse_listener()?; + let BoundSseListener { + listener, + env_key, + env_value, + socket_cleanup_guard, + } = bind_sse_listener()?; let (tx, rx) = mpsc::unbounded_channel(); let sse_connected = Arc::new(AtomicBool::new(false)); let tx_sse = tx.clone(); let sse_connected_for_task = Arc::clone(&sse_connected); let sse_task = tokio::spawn(async move { - match listener.accept().await { - Ok((stream, _)) => { - sse_connected_for_task.store(true, Ordering::Relaxed); - if let Err(err) = read_sse_stream(stream, tx_sse.clone()).await { + match listener { + #[cfg(unix)] + SseListener::Unix(listener) => match listener.accept().await { + Ok((stream, _)) => { + sse_connected_for_task.store(true, Ordering::Relaxed); + if let Err(err) = read_sse_stream(stream, tx_sse.clone()).await { + let _ = tx_sse.send(EvalEvent::Error { + message: format!("SSE stream error: {err}"), + stack: None, + status: None, + }); + } + } + Err(err) => { let _ = tx_sse.send(EvalEvent::Error { - message: format!("SSE stream error: {err}"), + message: format!("Failed to accept SSE connection: {err}"), stack: None, status: None, }); } - } - Err(err) => { - let _ = tx_sse.send(EvalEvent::Error { - message: format!("Failed to accept SSE connection: {err}"), - stack: None, - status: None, - }); - } - }; + }, + #[cfg(not(unix))] + SseListener::Tcp(listener) => match listener.accept().await { + Ok((stream, _)) => { + sse_connected_for_task.store(true, Ordering::Relaxed); + if let Err(err) = read_sse_stream(stream, tx_sse.clone()).await { + let _ = tx_sse.send(EvalEvent::Error { + message: format!("SSE stream error: {err}"), + stack: None, + status: None, + }); + } + } + Err(err) => { + let _ = tx_sse.send(EvalEvent::Error { + message: format!("Failed to accept SSE connection: {err}"), + stack: None, + status: None, + }); + } + }, + } }); let (mut cmd, runner_kind) = match language { @@ -752,10 +797,7 @@ async fn spawn_eval_runner( serde_json::to_string(&options.extra_args).context("failed to serialize extra args")?; cmd.env("BT_EVAL_EXTRA_ARGS_JSON", serialized); } - cmd.env( - "BT_EVAL_SSE_SOCK", - socket_path.to_string_lossy().to_string(), - ); + cmd.env(env_key, env_value); cmd.stdout(Stdio::piped()); cmd.stderr(Stdio::piped()); @@ -2303,7 +2345,11 @@ fn prepare_js_runner_in_cwd() -> Result { fn runner_bin_name(runner_command: &Path) -> Option { let name = runner_command.file_name()?.to_str()?.to_ascii_lowercase(); - Some(name.strip_suffix(".cmd").unwrap_or(&name).to_string()) + let trimmed = name + .strip_suffix(".cmd") + .or_else(|| name.strip_suffix(".exe")) + .unwrap_or(&name); + Some(trimmed.to_string()) } fn runner_kind_for_bin(runner_command: &Path) -> RunnerKind { @@ -2341,9 +2387,15 @@ fn is_ts_node_runner(runner_command: &Path) -> bool { fn find_python_binary() -> Option { if let Some(venv_root) = std::env::var_os("VIRTUAL_ENV") { - let candidate = PathBuf::from(venv_root).join("bin").join("python"); - if candidate.is_file() { - return Some(candidate); + let venv_root = PathBuf::from(venv_root); + let unix = venv_root.join("bin").join("python"); + if unix.is_file() { + return Some(unix); + } + + let windows = venv_root.join("Scripts").join("python.exe"); + if windows.is_file() { + return Some(windows); } } find_binary_in_path(&["python3", "python"]) @@ -2357,9 +2409,10 @@ fn find_node_module_bin(binary: &str, start: &Path) -> Option { return Some(base); } if cfg!(windows) { - let cmd = base.with_extension("cmd"); - if cmd.is_file() { - return Some(cmd); + for candidate in with_windows_extensions(&base) { + if candidate.is_file() { + return Some(candidate); + } } } current = dir.parent(); @@ -2376,9 +2429,10 @@ fn find_binary_in_path(candidates: &[&str]) -> Option { return Some(path); } if cfg!(windows) { - let cmd = path.with_extension("cmd"); - if cmd.is_file() { - return Some(cmd); + for candidate_path in with_windows_extensions(&path) { + if candidate_path.is_file() { + return Some(candidate_path); + } } } } @@ -2386,6 +2440,16 @@ fn find_binary_in_path(candidates: &[&str]) -> Option { None } +#[cfg(windows)] +fn with_windows_extensions(path: &Path) -> [PathBuf; 2] { + [path.with_extension("exe"), path.with_extension("cmd")] +} + +#[cfg(not(windows))] +fn with_windows_extensions(_path: &Path) -> [PathBuf; 0] { + [] +} + fn build_sse_socket_path() -> Result { let pid = std::process::id(); let serial = SSE_SOCKET_COUNTER.fetch_add(1, Ordering::Relaxed); @@ -2396,14 +2460,22 @@ fn build_sse_socket_path() -> Result { Ok(std::env::temp_dir().join(format!("bt-eval-{pid}-{now}-{serial}.sock"))) } -fn bind_sse_listener() -> Result<(UnixListener, PathBuf, SocketCleanupGuard)> { +#[cfg(unix)] +fn bind_sse_listener() -> Result { let mut last_bind_err: Option = None; for _ in 0..SSE_SOCKET_BIND_MAX_ATTEMPTS { let socket_path = build_sse_socket_path()?; let socket_cleanup_guard = SocketCleanupGuard::new(socket_path.clone()); let _ = std::fs::remove_file(&socket_path); match UnixListener::bind(&socket_path) { - Ok(listener) => return Ok((listener, socket_path, socket_cleanup_guard)), + Ok(listener) => { + return Ok(BoundSseListener { + listener: SseListener::Unix(listener), + env_key: "BT_EVAL_SSE_SOCK", + env_value: socket_path.to_string_lossy().to_string(), + socket_cleanup_guard: Some(socket_cleanup_guard), + }) + } Err(err) if matches!( err.kind(), @@ -2429,6 +2501,32 @@ fn bind_sse_listener() -> Result<(UnixListener, PathBuf, SocketCleanupGuard)> { )) } +#[cfg(not(unix))] +fn bind_sse_listener() -> Result { + bind_sse_tcp_listener() +} + +#[cfg(not(unix))] +fn bind_sse_tcp_listener() -> Result { + let std_listener = std::net::TcpListener::bind((std::net::Ipv4Addr::LOCALHOST, 0)) + .context("failed to bind SSE TCP listener")?; + let addr = std_listener + .local_addr() + .context("failed to resolve SSE TCP listener address")?; + std_listener + .set_nonblocking(true) + .context("failed to set SSE TCP listener non-blocking mode")?; + let listener = + TcpListener::from_std(std_listener).context("failed to create tokio SSE TCP listener")?; + + Ok(BoundSseListener { + listener: SseListener::Tcp(listener), + env_key: "BT_EVAL_SSE_ADDR", + env_value: format!("127.0.0.1:{}", addr.port()), + socket_cleanup_guard: None, + }) +} + fn eval_runner_cache_dir() -> PathBuf { let root = std::env::var_os("XDG_CACHE_HOME") .map(PathBuf::from) @@ -3952,10 +4050,43 @@ mod tests { #[test] fn runner_kind_for_bin_detects_bun() { assert_eq!(runner_kind_for_bin(Path::new("bun")), RunnerKind::Bun); + assert_eq!(runner_kind_for_bin(Path::new("bun.exe")), RunnerKind::Bun); assert_eq!(runner_kind_for_bin(Path::new("bunx")), RunnerKind::Bun); + assert_eq!(runner_kind_for_bin(Path::new("bunx.cmd")), RunnerKind::Bun); assert_eq!(runner_kind_for_bin(Path::new("deno")), RunnerKind::Other); } + #[test] + fn find_python_binary_uses_virtual_env_scripts_python_exe() { + let _guard = env_test_lock() + .lock() + .unwrap_or_else(|poisoned| poisoned.into_inner()); + let dir = make_temp_dir("venv-scripts-python"); + let scripts_dir = dir.join("Scripts"); + fs::create_dir_all(&scripts_dir).expect("create scripts directory"); + let scripts_python = scripts_dir.join("python.exe"); + fs::write(&scripts_python, "").expect("write scripts python"); + + let previous = set_env_var("VIRTUAL_ENV", &dir.to_string_lossy()); + let found = find_python_binary(); + restore_env_var("VIRTUAL_ENV", previous); + + assert_eq!(found, Some(scripts_python)); + let _ = fs::remove_dir_all(&dir); + } + + #[cfg(not(unix))] + #[test] + fn bind_sse_tcp_listener_sets_sse_addr_env() { + let bound = bind_sse_tcp_listener().expect("bind tcp sse listener"); + assert_eq!(bound.env_key, "BT_EVAL_SSE_ADDR"); + assert!( + bound.env_value.starts_with("127.0.0.1:"), + "expected loopback host in SSE address env" + ); + assert!(bound.socket_cleanup_guard.is_none()); + } + #[test] fn set_node_heap_size_env_sets_default_when_absent() { let _guard = env_test_lock() From 0471015df2808e062bc6008f30635584c4daf7c2 Mon Sep 17 00:00:00 2001 From: Parker Henderson Date: Wed, 25 Mar 2026 11:13:35 -0700 Subject: [PATCH 2/2] WIP --- README.md | 6 +- plan.md | 157 +++++++++++++++++++++++++++++++++ research.md | 249 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/eval.rs | 6 ++ 4 files changed, 415 insertions(+), 3 deletions(-) create mode 100644 plan.md create mode 100644 research.md diff --git a/README.md b/README.md index 92a972e..6ea6a0f 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Current Limitations -- `bt eval` is currently Unix-only (Linux/macOS). Windows support is planned. +- `bt eval` supports Linux, macOS, and Windows. ## Install @@ -114,7 +114,7 @@ Remove-Item -Recurse -Force (Join-Path $env:APPDATA "bt") -ErrorAction SilentlyC | `bt auth` | Authenticate with Braintrust | | `bt switch` | Switch org and project context | | `bt status` | Show current org and project context | -| `bt eval` | Run eval files (Unix only) | +| `bt eval` | Run eval files | | `bt sql` | Run SQL queries against Braintrust | | `bt view` | View logs, traces, and spans | | `bt projects` | Manage projects (list, create, view, delete) | @@ -326,5 +326,5 @@ Skill smoke-test harness: - Add richer channel controls for self-update (for example pinned/branch canary selection). - Expand release verification and smoke tests for installer flows across more architectures/environments. -- Add `bt eval` support on Windows (today, `bt eval` is Unix-only due to Unix socket usage). +- Expand `bt eval` cross-platform CI coverage, especially on Windows. - Add signed artifact verification guidance (signature flow) in install and upgrade docs. diff --git a/plan.md b/plan.md new file mode 100644 index 0000000..8f846e5 --- /dev/null +++ b/plan.md @@ -0,0 +1,157 @@ +# Plan: Make `bt eval` Work Natively on Windows (CLI + SDK) + +## Goal + +Ship reliable native Windows support for `bt eval` (no WSL), with deterministic SSE transport, passing Windows build/tests, and no regressions on macOS/Linux. + +## Non-Goals + +- No protocol redesign (keep existing SSE event protocol). +- No new ad-hoc runtime env configuration outside existing CLI-owned `BT_EVAL_*` contract. +- No behavior changes to unrelated commands. + +## Current Failures to Address + +1. **SDK compile break on Windows** +- `braintrust-sdk-rust` references `std::os::unix::net::UnixStream` unguarded in `src/eval/bt_runner.rs`. +- Windows build fails at compile time before TCP fallback can run. + +2. **Transport ambiguity from inherited env** +- Runners check `BT_EVAL_SSE_SOCK` before `BT_EVAL_SSE_ADDR`. +- If both are present, runner may choose wrong transport and fail to stream. + +## Workstream A: `braintrust-sdk-rust` (must land first) + +Owner: SDK repo (`../braintrust-sdk-rust`) + +### A1. Make SSE writer platform-safe + +- Refactor `create_sse_writer()` in `src/eval/bt_runner.rs`: + - Unix-only Unix socket connect path behind `#[cfg(unix)]`. + - Keep TCP connect path available on all platforms. + - Ensure non-Unix builds never reference `std::os::unix::*`. + +### A2. Keep deterministic transport semantics + +- On Unix: + - Preserve current preference order (`BT_EVAL_SSE_SOCK`, then `BT_EVAL_SSE_ADDR`). +- On non-Unix: + - Ignore `BT_EVAL_SSE_SOCK` (optionally log unsupported warning) and use `BT_EVAL_SSE_ADDR`. + +### A3. Add SDK tests + +- Unit tests for: + - env-driven selection logic by platform + - valid/invalid `BT_EVAL_SSE_ADDR` handling +- Add at least one CI check that compiles SDK eval on Windows. + +### A4. Merge and pin + +- Merge SDK fix. +- Record commit SHA to consume from `btcli`. + +## Workstream B: `btcli` Runtime Wiring + +Owner: CLI repo (`btcli`) + +### B1. Keep platform listener ownership in CLI + +- Retain current listener model in `src/eval.rs`: + - Unix binds local Unix socket and exports `BT_EVAL_SSE_SOCK`. + - Non-Unix binds localhost TCP and exports `BT_EVAL_SSE_ADDR`. + +### B2. Enforce one transport env in child process + +- Before spawn, set selected SSE env key and remove the conflicting key. +- This avoids inherited-env collisions across Rust/JS/Python runners. + +Status: implemented in `src/eval.rs` (keep and test). + +### B3. Add CLI tests for transport env determinism + +- Add unit/integration coverage that verifies: + - chosen key is present in child env + - opposite key is removed + - Windows path uses `BT_EVAL_SSE_ADDR` + - Unix path uses `BT_EVAL_SSE_SOCK` + +## Workstream C: JS/Python Runner Hardening (recommended) + +Owner: CLI repo runner scripts + +### C1. Keep current compatibility behavior + +- JS and Python runners may keep env priority as-is, since CLI now sanitizes env. + +### C2. Optional defensive hardening + +- On non-Unix, if `BT_EVAL_SSE_SOCK` is present: + - skip Unix socket attempt (or emit explicit warning) + - continue with `BT_EVAL_SSE_ADDR` if provided + +This reduces dependency on perfect parent env hygiene and improves diagnostics. + +## Workstream D: Dependency Update in `btcli` + +Owner: CLI repo + +### D1. Bump SDK revision + +- Update `braintrust-sdk-rust` `rev` in `Cargo.toml`. +- Refresh `Cargo.lock`. + +### D2. Verify compile and behavior + +- Ensure `cargo check` passes locally. +- Ensure release-canary Windows artifact build compiles with new SDK rev. + +## Workstream E: CI + Validation Matrix + +Owner: CLI + SDK repos + +### E1. Required validation + +1. **Windows build path** +- `x86_64-pc-windows-msvc` build succeeds in release-canary/release matrix. + +2. **Runtime eval path (Windows)** +- JS eval run emits SSE (`start`, `summary`, `done`) and exits correctly. +- Python eval run emits SSE and exits correctly. + +3. **Conflict env regression** +- Simulate inherited `BT_EVAL_SSE_SOCK` while Windows listener selects TCP. +- Verify child still uses TCP via `BT_EVAL_SSE_ADDR`. + +4. **Unix regression check** +- Unix eval path continues to use Unix sockets with no behavior regressions. + +### E2. Nice-to-have validation + +- Add a dedicated Windows eval integration job (JS + Python smoke tests). + +## Sequencing (Execution Order) + +1. Implement and merge **Workstream A** in `braintrust-sdk-rust`. +2. Land/keep **Workstream B** in `btcli` with tests. +3. Apply **Workstream D** SDK rev bump in `btcli`. +4. Run **Workstream E** validation gates. +5. Optional **Workstream C** hardening if diagnostics still weak. + +## Risks and Mitigations + +1. Risk: SDK change lands but CLI still has env collisions. +- Mitigation: keep CLI opposite-key removal as a hard requirement. + +2. Risk: CI only proves compile, not runtime SSE correctness. +- Mitigation: add Windows eval smoke tests that assert SSE event flow. + +3. Risk: Unix regressions from transport refactor. +- Mitigation: keep Unix ordering unchanged and run existing Unix eval tests. + +## Definition of Done + +- Windows artifact build succeeds without `std::os::unix` compile errors. +- `bt eval` runs natively on Windows for JS and Python with working SSE progress/summary. +- Conflicting inherited SSE env vars do not break transport selection. +- macOS/Linux behavior remains unchanged. +- `btcli` is pinned to an SDK commit that contains the Windows-safe eval transport fix. diff --git a/research.md b/research.md new file mode 100644 index 0000000..de3e961 --- /dev/null +++ b/research.md @@ -0,0 +1,249 @@ +# Eval SSE Transport Research (btcli + braintrust-sdk-rust) + +## Scope + +This document explains how eval execution and SSE event transport currently work across: + +- `btcli` (parent process and UI) +- JavaScript runner script (`scripts/eval-runner.ts`) +- Python runner script (`scripts/eval-runner.py`) +- Rust SDK runner (`../braintrust-sdk-rust/src/eval/bt_runner.rs`) + +It focuses on the Windows-native issues we are fixing: + +1. incorrect/unsafe Unix-socket handling on Windows +2. nondeterministic transport selection when both `BT_EVAL_SSE_SOCK` and `BT_EVAL_SSE_ADDR` are present + +## High-Level Architecture + +### Parent/child split + +`btcli` is the parent orchestrator. It: + +1. starts a local SSE listener +2. chooses transport (`unix socket` or `tcp`) +3. passes the selected endpoint to the child runner via env vars +4. reads SSE events and maps them into `EvalEvent` for UI/output + +Child runners (JS/Python/Rust SDK) do not host the listener. They connect to what `btcli` exposes and emit SSE events. + +## btcli Flow + +### Listener binding and transport selection + +Entry path: + +- `spawn_eval_runner(...)` in `src/eval.rs` +- calls `bind_sse_listener()` + +Transport chosen by platform in `src/eval.rs`: + +- Unix: `bind_sse_listener()` -> `UnixListener::bind(...)`, exports `BT_EVAL_SSE_SOCK` +- Non-Unix (Windows): `bind_sse_listener()` -> `bind_sse_tcp_listener()`, exports `BT_EVAL_SSE_ADDR` with `127.0.0.1:` + +Relevant code: + +- `src/eval.rs:2471` (unix binding) +- `src/eval.rs:2512` (non-unix binding) +- `src/eval.rs:2531` (env key for non-unix is `BT_EVAL_SSE_ADDR`) + +### Child env wiring + +After constructing the child command, `btcli` sets eval env vars and now clears the opposite SSE key: + +- sets chosen key/value +- removes conflicting key (`BT_EVAL_SSE_SOCK` or `BT_EVAL_SSE_ADDR`) + +Relevant code: + +- `src/eval.rs:801` +- `src/eval.rs:807` + +This makes transport selection deterministic for all runners. + +### SSE parsing and event mapping + +`btcli` reads SSE framing (`event:` / `data:` blocks) and maps events: + +- `start` +- `summary` +- `processing` +- `progress` +- `console` +- `error` +- `dependencies` +- `done` + +Relevant code: + +- `read_sse_stream(...)` in `src/eval.rs:2715` +- `handle_sse_event(...)` in `src/eval.rs:2748` + +## Child Runner Implementations + +### JavaScript runner (`scripts/eval-runner.ts`) + +Config/env reader: + +- `readRunnerConfig()` consumes `BT_EVAL_*` flags (`BT_EVAL_JSONL`, `BT_EVAL_LIST`, etc.) + +SSE transport connection order in `createSseWriter()`: + +1. if `BT_EVAL_SSE_SOCK` is set, connect with `node:net` path mode +2. else if `BT_EVAL_SSE_ADDR` is set, connect host/port TCP + +Relevant code: + +- `scripts/eval-runner.ts:256` +- `scripts/eval-runner.ts:678` +- `scripts/eval-runner.ts:687` +- `scripts/eval-runner.ts:721` + +Event emission: + +- sends `processing`, `summary`, `error`, `dependencies`, `done`, and progress/console events +- finalization sends `dependencies` + `done` then closes socket + +Relevant code: + +- `scripts/eval-runner.ts:1908` +- `scripts/eval-runner.ts:1958` + +### Python runner (`scripts/eval-runner.py`) + +Config/env reader: + +- `read_runner_config()` consumes `BT_EVAL_*` flags + +SSE transport connection order in `create_sse_writer()`: + +1. if `BT_EVAL_SSE_SOCK` is set, connect using `socket.AF_UNIX` +2. else if `BT_EVAL_SSE_ADDR` is set, parse and connect TCP + +Relevant code: + +- `scripts/eval-runner.py:145` +- `scripts/eval-runner.py:90` +- `scripts/eval-runner.py:97` + +Event emission: + +- sends `dependencies` and `done` before exit, closes socket in `finally` + +Relevant code: + +- `scripts/eval-runner.py:958` +- `scripts/eval-runner.py:985` + +## Rust SDK Eval Runner (`braintrust-sdk-rust`) + +### Core entry points + +- `BtEvalRunner::from_env()` reads config from env and initializes SSE writer +- `eval(...)` runs evaluator, emits `start` and `summary` or `error` +- `finish(...)` handles list mode output and returns pass/fail aggregate + +Relevant code: + +- `../braintrust-sdk-rust/src/eval/bt_runner.rs:223` +- `../braintrust-sdk-rust/src/eval/bt_runner.rs:287` +- `../braintrust-sdk-rust/src/eval/bt_runner.rs:364` + +### Current transport logic and Windows break + +In `create_sse_writer()`: + +1. checks `BT_EVAL_SSE_SOCK` first +2. tries `std::os::unix::net::UnixStream::connect(...)` +3. falls back to `BT_EVAL_SSE_ADDR` TCP + +Relevant code: + +- `../braintrust-sdk-rust/src/eval/bt_runner.rs:127` +- `../braintrust-sdk-rust/src/eval/bt_runner.rs:131` +- `../braintrust-sdk-rust/src/eval/bt_runner.rs:145` + +Problem: `std::os::unix` is not available on Windows. Because that symbol is referenced unguarded, Windows compile fails before runtime fallback is possible. + +Observed CI error pattern: + +- compiler error: `could not find unix in os` +- location: `src/eval/bt_runner.rs:131` +- downstream `cargo-dist` exits with `127` after failing to produce `bt.exe` + +## Why We Still Need Platform-Aware Logic in btcli + +Even if SDK/runners become fully cross-platform, `btcli` must still: + +- bind listener socket/port +- choose endpoint type +- pass endpoint env to child +- parse SSE for UI/progress integration + +So the right boundary is: + +- `btcli`: transport host/binding + env provisioning + event consumption +- SDK/runners: transport client/connection + event emission + +## Root Causes for the Current Regression Class + +### 1) Compile-time portability gap in Rust SDK + +`bt_runner.rs` references Unix APIs without `cfg(unix)` guards. + +### 2) Runtime transport ambiguity from inherited env + +All child runners currently check `BT_EVAL_SSE_SOCK` before `BT_EVAL_SSE_ADDR`. +If both are present (for example inherited parent env), the runner can choose the wrong transport. + +This is why clearing the opposite key in `btcli` is necessary even after SDK fixes. + +## Fix Strategy for Native Windows Support (No WSL) + +### In `braintrust-sdk-rust` + +Refactor `create_sse_writer()` to be platform-safe and deterministic: + +1. Gate Unix socket connect behind `#[cfg(unix)]`. +2. On non-Unix, never reference `std::os::unix`. +3. Keep TCP path (`BT_EVAL_SSE_ADDR`) available on all platforms. +4. Optional: on Unix, still prefer socket then TCP; on non-Unix, ignore `BT_EVAL_SSE_SOCK` entirely or log that it is unsupported. + +### In `btcli` + +Keep the opposite-key clearing behavior in `spawn_eval_runner()`: + +- set selected env key +- remove the conflicting one + +This guarantees deterministic behavior across SDK, JS runner, and Python runner. + +### In JS/Python runners + +Current behavior is acceptable with CLI env cleanup, but optional hardening: + +- On non-Unix, ignore or warn on `BT_EVAL_SSE_SOCK` before attempting Unix connect. + +## Validation Matrix + +### Windows + +1. Build `bt` and confirm no compile error from `std::os::unix`. +2. Run at least one JS eval and one Python eval via `bt eval`. +3. Verify progress/summary stream appears (SSE connected through TCP). +4. Verify behavior when parent env includes stale `BT_EVAL_SSE_SOCK`: + - child should still use `BT_EVAL_SSE_ADDR`. + +### Unix (macOS/Linux) + +1. Run eval and verify Unix socket path still works. +2. Confirm no regression in progress/summary/error event handling. + +## Key Takeaway + +Native Windows support is achieved by using TCP SSE end-to-end on Windows, not by disabling eval features. +The minimal robust model is: + +- `btcli` chooses and exports exactly one transport key +- SDK/runners connect using platform-safe transport code +- SSE event protocol remains unchanged diff --git a/src/eval.rs b/src/eval.rs index 8244479..24f5edb 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -797,7 +797,13 @@ async fn spawn_eval_runner( serde_json::to_string(&options.extra_args).context("failed to serialize extra args")?; cmd.env("BT_EVAL_EXTRA_ARGS_JSON", serialized); } + let conflicting_sse_env_key = if env_key == "BT_EVAL_SSE_SOCK" { + "BT_EVAL_SSE_ADDR" + } else { + "BT_EVAL_SSE_SOCK" + }; cmd.env(env_key, env_value); + cmd.env_remove(conflicting_sse_env_key); cmd.stdout(Stdio::piped()); cmd.stderr(Stdio::piped());