LightconeResearch · cailmdaley · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/evals/local-demo.yaml b/evals/local-demo.yaml
@@ -0,0 +1,24 @@
+# Full local demo: the with/without-Lightcone A/B across three harnesses on the
+# snae task, run in local Docker. The matrix is {claude, codex, pi} x
+# {with-skills, without-skills}. The report's "Δ lift" column is the headline —
+# how much the Lightcone layer moved each harness's score.
+#
+# Fair, comparable model tier across harnesses (codex Spark was retired): codex
+# and pi both run gpt-5.4-mini, claude runs haiku. pi reaches gpt-5.4-mini via
+# Cail's GitHub Copilot subscription using pi's native `provider/model` string.
+# claude needs CLAUDE_CODE_OAUTH_TOKEN in the host env (loaded from .env).
+id: local-demo
+backend: local_docker
+tasks:
+  - snae
+harnesses:
+  - { name: claude, model: haiku }
+  - { name: codex, model: gpt-5.4-mini }
+  - { name: pi, model: github-copilot/gpt-5.4-mini }
+skill_variants: [true, false]
+num_trials: 1
+max_concurrency: 3
+# No max_turns here — agents run to completion (claude uses the task's own
+# max_turns=200). trial_timeout is only a safety ceiling against a hung agent.
+trial_timeout: 1800
+output_dir: eval-results
diff --git a/evals/local-smoke.yaml b/evals/local-smoke.yaml
@@ -0,0 +1,22 @@
+# Cheap plumbing smoke for the local-Docker multi-harness path. Trials are cut
+# short (`max_turns` for claude, a short `trial_timeout` for codex and pi):
+# the build won't complete (graders score low), but the whole path — container
+# build, auth copy-in, headless invoke, output parse, grading, teardown,
+# scorecard — is exercised end to end across all three harnesses. Run this
+# first to shake out orchestration/auth bugs before spending on a full build.
+id: local-smoke
+backend: local_docker
+tasks:
+  - snae
+harnesses:
+  - claude
+  - codex
+  - pi
+skill_variants: [true]
+num_trials: 1
+max_concurrency: 1
+# max_turns only bounds claude; codex and pi have no max-turns flag, so a SHORT
+# trial_timeout is what keeps a smoke cheap for them.
+max_turns: 5
+trial_timeout: 180
+output_dir: eval-results
diff --git a/evals/tasks/snae/astra.yaml b/evals/tasks/snae/astra.yaml
@@ -1,13 +1,31 @@
 # ASTRA Analysis Specification
 # Documentation: https://github.com/LightconeResearch/ASTRA
 
+id: snae
 version: "1.0"
 name: "snae"
-description: |
-  Fit the Union2.1 Type Ia supernova distance modulus vs redshift data
-  to a flat LCDM cosmological model with two free parameters (H0, Omega_L)
-  using maximum-likelihood (MAP) point estimation. This provides best-fit
-  cosmological parameters as a building block for a larger analysis.
+
+narrative:
+  summary: |
+    Fit the Union2.1 Type Ia supernova distance modulus vs redshift data
+    to a flat LCDM cosmological model with two free parameters (H0, Omega_L)
+    using maximum-likelihood (MAP) point estimation. This provides best-fit
+    cosmological parameters as a building block for a larger analysis.
+  inputs: |
+    The single input is the [Union2.1 compilation](#inputs.union21): 580
+    Type Ia supernovae with redshift, distance modulus, and uncertainties.
+  methods: |
+    The fit minimizes a chi-squared between the observed distance moduli and
+    the flat-LCDM prediction, varying H0 and Omega_L. Three decisions shape
+    the fit: the [optimizer](#decisions.optimizer) used for the minimization,
+    the [error model](#decisions.error_model) (statistical-only vs.
+    statistical+systematic uncertainties), and a [low-redshift cut](#decisions.redshift_cut)
+    that optionally removes peculiar-velocity-dominated supernovae.
+  outputs: |
+    Three outputs: the [best-fit parameters](#outputs.best_fit) (H0, Omega_L,
+    reduced chi-squared), a [Hubble diagram](#outputs.hubble_diagram) with the
+    best-fit model overlaid on the data, and a [residuals plot](#outputs.residuals)
+    of data minus model versus redshift.
 
 container: Containerfile
 
@@ -21,22 +39,38 @@ outputs:
   - id: best_fit
     type: metric
     description: "Best-fit H0 and Omega_L from chi-squared minimization, with reduced chi-squared"
+    inputs: [union21]
+    decisions: [optimizer, error_model, redshift_cut]
     recipe:
-      command: python scripts/fit.py
+      command: >-
+        python scripts/fit.py
+        --union21 {inputs.union21}
+        --optimizer {decisions.optimizer}
+        --error-model {decisions.error_model}
+        --redshift-cut {decisions.redshift_cut}
+        --out {output}
 
   - id: hubble_diagram
     type: figure
     description: "Hubble diagram: distance modulus vs redshift with best-fit model overlay"
+    inputs: [union21, best_fit]
     recipe:
-      command: python scripts/plot_hubble.py
-      inputs: [best_fit]
+      command: >-
+        python scripts/plot_hubble.py
+        --union21 {inputs.union21}
+        --best-fit {inputs.best_fit}
+        --out {output}
 
   - id: residuals
     type: figure
     description: "Residuals plot: data minus best-fit model vs redshift"
+    inputs: [union21, best_fit]
     recipe:
-      command: python scripts/plot_residuals.py
-      inputs: [best_fit]
+      command: >-
+        python scripts/plot_residuals.py
+        --union21 {inputs.union21}
+        --best-fit {inputs.best_fit}
+        --out {output}
 
 decisions:
   optimizer:

diff --git a/src/lightcone/eval/backends/__init__.py b/src/lightcone/eval/backends/__init__.py
@@ -0,0 +1,21 @@
+"""Sandbox backends for the eval harness.
+
+A backend is the execution substrate one trial runs inside. All backends share
+the :class:`Sandbox` surface, so a harness drives any of them unchanged.
+
+  - :class:`LocalDockerSandbox` — a local Docker container per trial. The
+    counterpart of the Daytona :class:`lightcone.eval.sandbox.EvalSandbox`, for
+    running the suite on a developer/CI host with Docker rather than a Daytona
+    account.
+"""
+
+from __future__ import annotations
+
+from lightcone.eval.backends.base import ExecuteResult, Sandbox
+from lightcone.eval.backends.local_docker import LocalDockerSandbox
+
+__all__ = [
+    "ExecuteResult",
+    "LocalDockerSandbox",
+    "Sandbox",
+]
diff --git a/src/lightcone/eval/backends/base.py b/src/lightcone/eval/backends/base.py
@@ -0,0 +1,72 @@
+"""Sandbox backend abstraction for eval trials.
+
+A ``Sandbox`` is the execution substrate one eval trial runs inside. It mirrors
+the public surface of the original :class:`lightcone.eval.sandbox.EvalSandbox`
+(the Daytona backend) so harnesses — which depend only on the
+``SandboxLike`` protocol (``WORK_DIR``, ``exec``, ``exec_async_poll``,
+``upload_file``) — drive any backend unchanged.
+
+Backends:
+  - ``EvalSandbox`` (sandbox.py)        — ephemeral Daytona cloud sandbox
+  - ``LocalDockerSandbox`` (this pkg)   — a local Docker container per trial
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass
+class ExecuteResult:
+    """Result from running a command in a sandbox."""
+
+    exit_code: int
+    output: str
+
+
+class Sandbox(ABC):
+    """One ephemeral execution substrate for a single eval trial.
+
+    The lifecycle is ``create() → setup() → (exec/exec_async_poll/upload_file)* →
+    teardown()``. Subclasses provide the concrete substrate (Daytona cloud
+    sandbox, local Docker container, …); the abstract surface here is exactly
+    what the trial loop and the harness layer consume.
+    """
+
+    #: Project root inside the sandbox — where ``lc init`` scaffolds and the
+    #: agent runs. Mirrors ``EvalSandbox.WORK_DIR``.
+    WORK_DIR = "/home/evaluser/project"
+
+    @abstractmethod
+    def create(self) -> None:
+        """Provision the substrate (build image if needed, start the sandbox)."""
+
+    @abstractmethod
+    def setup(
+        self,
+        seed_dir: Path,
+        universe: str,
+        loop_prompt_template: str,
+        wheels: list[Path] | None = None,
+    ) -> None:
+        """Scaffold the project via ``lc init`` and overlay task seed files."""
+
+    @abstractmethod
+    def exec(self, cmd: str, timeout: int = 300, cwd: str | None = None) -> ExecuteResult:
+        """Run a command in the sandbox, returning its exit code and output."""
+
+    @abstractmethod
+    def exec_async_poll(
+        self, cmd: str, timeout: int = 600, poll_interval: int = 10
+    ) -> ExecuteResult:
+        """Run a long-running command, tolerant of gateway timeouts."""
+
+    @abstractmethod
+    def upload_file(self, remote_path: str, content: bytes) -> None:
+        """Upload a file into the sandbox at ``remote_path``."""
+
+    @abstractmethod
+    def teardown(self) -> None:
+        """Destroy the substrate. Idempotent."""