braintrustdata
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎scripts/eval-runner.py‎
Lines changed: 15 additions & 0 deletions b/‎scripts/eval-runner.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎scripts/functions-bundler.ts‎
Lines changed: 4 additions & 1 deletion b/‎scripts/functions-bundler.ts‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎scripts/functions-runner.py‎
Lines changed: 130 additions & 10 deletions b/‎scripts/functions-runner.py‎
Lines changed: 130 additions & 10 deletions
diff --git a/‎scripts/functions-runner.ts‎
Lines changed: 109 additions & 0 deletions b/‎scripts/functions-runner.ts‎
Lines changed: 109 additions & 0 deletions
@@ -23,3 +23,4 @@ tests/evals/js/eval-bun/test-data.txt
 __pycache__
 
 bt-sync
+*.env
@@ -435,6 +435,21 @@ def load_evaluators(files: list[str]) -> tuple[list[EvaluatorInstance], dict[str
     cwd = os.getcwd()
     if cwd not in sys.path:
         sys.path.insert(0, cwd)
+
+    # Add the project root inferred from input files to sys.path so that
+    # sibling-package imports work when files live outside CWD (e.g.
+    # sandbox bundles extracted to a temp directory).  Walk up from each
+    # file's directory looking for a register.py (bundle marker) or the
+    # filesystem root, whichever comes first.
+    for f in files:
+        d = os.path.dirname(os.path.abspath(f))
+        while d and d != os.path.dirname(d):
+            if os.path.isfile(os.path.join(d, "register.py")):
+                if d not in sys.path:
+                    sys.path.insert(0, d)
+                break
+            d = os.path.dirname(d)
+
     unique_files: set[str] = set()
     for file_path in files:
         for candidate in collect_files(file_path):
 
@@ -275,7 +275,10 @@ async function main(): Promise<void> {
   const externalPackages = parseExternalPackages(
     process.env.BT_FUNCTIONS_PUSH_EXTERNAL_PACKAGES,
   );
-  const external = buildExternalPackagePatterns(externalPackages);
+  const selfContained = process.env.BT_FUNCTIONS_PUSH_SELF_CONTAINED === "1";
+  const external = selfContained
+    ? ["fsevents", "chokidar"]
+    : buildExternalPackagePatterns(externalPackages);
   const tsconfig = loadTsconfigPath();
 
   const outputDir = path.dirname(outputFile);
 
@@ -3,6 +3,7 @@
 import inspect
 import json
 import os
+import re
 import sys
 from contextlib import nullcontext
 from typing import Any
@@ -28,9 +29,9 @@ def to_json_value(value: Any) -> Any:
         return [to_json_value(item) for item in value]
     if isinstance(value, dict):
         return {str(key): to_json_value(val) for key, val in value.items()}
-    if hasattr(value, "model_dump"):
+    if hasattr(value, "model_dump") and not isinstance(value, type):
         return to_json_value(value.model_dump())
-    if hasattr(value, "dict"):
+    if hasattr(value, "dict") and not isinstance(value, type):
         return to_json_value(value.dict())
     if hasattr(value, "__dict__"):
         result: dict[str, Any] = {}
@@ -42,21 +43,31 @@ def to_json_value(value: Any) -> Any:
     return str(value)
 
 
-def load_framework_globals() -> tuple[Any, Any, Any]:
+def load_framework_globals() -> tuple[Any, Any, Any, Any]:
+    """Import the Braintrust SDK registries and the lazy-load hook.
+
+    Returns:
+        A 4-tuple ``(functions, prompts, lazy, _evals)``.  ``_evals`` is
+        ``None`` when it cannot be imported from ``braintrust.framework``.
+    """
     # Prefer current SDK layout first:
     # - braintrust.framework2 exposes module-level `global_`
     # - braintrust.framework exposes `_set_lazy_load`
     try:
         from braintrust.framework import _set_lazy_load as lazy
         from braintrust.framework2 import global_ as global_state
 
-        return global_state.functions, global_state.prompts, lazy
+        # `_evals` is optional: fall back to None rather than failing the
+        # whole lookup when the import is unavailable.
+        try:
+            from braintrust.framework import _evals
+        except (ImportError, ModuleNotFoundError):
+            _evals = None
+
+        return global_state.functions, global_state.prompts, lazy, _evals
     except (ImportError, ModuleNotFoundError):
         # Backward compatibility with older SDK layout.
         from braintrust.framework2.global_ import functions, prompts
         from braintrust.framework2.lazy_load import _set_lazy_load as lazy
 
-        return functions, prompts, lazy
+        # Same optional `_evals` lookup as in the primary branch above.
+        try:
+            from braintrust.framework import _evals
+        except (ImportError, ModuleNotFoundError):
+            _evals = None
+
+        return functions, prompts, lazy, _evals
 
 
 def normalize_project_selector(project: Any) -> tuple[str | None, str | None]:
@@ -277,16 +288,113 @@ async def collect_function_event_entries(prompts_registry: Any) -> list[dict[str
     return entries
 
 
+def slugify(text: str) -> str:
+    """Convert *text* into a lowercase, hyphen-separated slug.
+
+    After lowercasing, every run of characters outside ``a-z0-9`` becomes a
+    single ``-``; a hyphen left at the start or end is then removed.
+    """
+    hyphenated = re.sub(r"[^a-z0-9]+", "-", text.lower())
+    return re.sub(r"^-|-$", "", hyphenated)
+
+
+def _score_name(score: Any, index: int) -> str:
+    """Return the scorer's ``__name__``, or ``scorer_<index>`` if unusable."""
+    name = getattr(score, "__name__", None)
+    # Mirror extractScoreName in functions-runner.ts: a missing, empty or
+    # non-string name falls back to the positional placeholder.
+    if isinstance(name, str) and name:
+        return name
+    return f"scorer_{index}"
+
+
+def _serialize_evaluator_parameters(raw_params: Any) -> Any:
+    """Build the ``parameters`` value of an evaluator definition.
+
+    Returns ``None`` when there is nothing to record.
+    """
+    if raw_params is None:
+        return None
+
+    if getattr(raw_params, "__braintrust_parameters_marker", None) is True:
+        return {
+            "type": "braintrust.parameters",
+            "schema": getattr(raw_params, "schema", None),
+            "source": {
+                "parametersId": getattr(raw_params, "id", None),
+                "slug": getattr(raw_params, "slug", None),
+                "name": getattr(raw_params, "name", None),
+                "projectId": getattr(raw_params, "projectId", None),
+                "version": getattr(raw_params, "version", None),
+            },
+        }
+
+    # Use the braintrust SDK's parameters_to_json_schema when
+    # available so that Pydantic model classes are converted to
+    # proper staticParametersSchema entries (type: "data" with a
+    # JSON Schema) that the UI can parse.
+    try:
+        from braintrust.parameters import parameters_to_json_schema
+
+        return parameters_to_json_schema(raw_params)
+    except Exception:
+        # Deliberate best-effort: if the helper is missing or rejects the
+        # value, fall back to the generic serializer.
+        return to_json_value(raw_params)
+
+
+def collect_evaluator_entries(evals_registry: Any, source_file: str) -> list[dict[str, Any]]:
+    """Build one sandbox manifest entry per evaluator in *evals_registry*.
+
+    Args:
+        evals_registry: Object whose ``evaluators`` attribute is a dict of
+            eval name to an instance carrying an ``evaluator`` attribute.
+            ``None``, a missing/empty ``evaluators`` or a non-dict value
+            yields no entries.
+        source_file: Path of the file the evaluators came from.  Its
+            basename, minus the extension and a trailing ``.eval``, is used
+            as the slug prefix and sandbox group name; its path relative to
+            the current working directory is recorded as the entrypoint.
+
+    Returns:
+        A list of ``kind: "code"`` entries with ``function_type: "sandbox"``.
+        Registry values that are ``None`` or have no ``evaluator`` are skipped.
+    """
+    if evals_registry is None:
+        return []
+
+    evaluators = getattr(evals_registry, "evaluators", None)
+    if not evaluators or not isinstance(evaluators, dict):
+        return []
+
+    entries: list[dict[str, Any]] = []
+    stem_base, _ = os.path.splitext(os.path.basename(source_file))
+    stem = re.sub(r"\.eval$", "", stem_base)
+
+    for eval_name, instance in evaluators.items():
+        if instance is None:
+            continue
+        evaluator = getattr(instance, "evaluator", None)
+        if evaluator is None:
+            continue
+
+        project_name = getattr(evaluator, "project_name", None)
+        project_id, proj_name = normalize_project_selector(
+            {"project_name": project_name} if isinstance(project_name, str) else None
+        )
+
+        scores = getattr(evaluator, "scores", []) or []
+        evaluator_definition: dict[str, Any] = {
+            "scores": [{"name": _score_name(score, i)} for i, score in enumerate(scores)],
+        }
+
+        parameters = _serialize_evaluator_parameters(getattr(evaluator, "parameters", None))
+        if parameters is not None:
+            evaluator_definition["parameters"] = parameters
+
+        base_entry: dict[str, Any] = {"kind": "code"}
+        if project_id:
+            base_entry["project_id"] = project_id
+        if proj_name:
+            base_entry["project_name"] = proj_name
+
+        # Sandbox entry only — task and scorer entries are pushed separately
+        # when the eval is actually run, matching the Python SDK behavior.
+        sandbox_entry = {
+            **base_entry,
+            "name": f"Eval {eval_name} sandbox",
+            "slug": slugify(f"{stem}-{eval_name}-sandbox"),
+            "function_type": "sandbox",
+            "location": {
+                "type": "sandbox",
+                "sandbox_spec": {"provider": "lambda"},
+                "entrypoints": [os.path.relpath(source_file)],
+                "eval_name": eval_name,
+                "evaluator_definition": evaluator_definition,
+            },
+            "metadata": {"_bt_sandbox_group_name": stem},
+        }
+        entries.append(sandbox_entry)
+
+    return entries
+
+
 async def process_file(file_path: str) -> dict[str, Any]:
     abs_path = os.path.abspath(file_path)
     cwd = os.getcwd()
     if cwd not in sys.path:
         sys.path.insert(0, cwd)
 
-    purge_local_modules(cwd, preserve_modules={__name__, "python_runner_common"})
-    functions_registry, prompts_registry, lazy_loader = load_framework_globals()
+    functions_registry, prompts_registry, lazy_loader, evals_registry = load_framework_globals()
     clear_registry(functions_registry)
     clear_registry(prompts_registry)
+    if (
+        evals_registry is not None
+        and hasattr(evals_registry, "evaluators")
+        and isinstance(evals_registry.evaluators, dict)
+    ):
+        evals_registry.evaluators.clear()
+    purge_local_modules(cwd, preserve_modules={__name__, "python_runner_common"})
 
     module_name = import_module_name_from_cwd(cwd, abs_path)
     if module_name is None:
@@ -298,12 +406,13 @@ async def process_file(file_path: str) -> dict[str, Any]:
         import_file(module_name, abs_path, extra_paths)
         code_entries = collect_code_entries(functions_registry)
         event_entries = await collect_function_event_entries(prompts_registry)
-        entries = [*code_entries, *event_entries]
+        evaluator_entries = collect_evaluator_entries(evals_registry, abs_path)
+        entries = [*code_entries, *event_entries, *evaluator_entries]
         file_manifest: dict[str, Any] = {
             "source_file": abs_path,
             "entries": entries,
         }
-        if code_entries:
+        if code_entries or evaluator_entries:
             runner_root = os.path.dirname(os.path.abspath(__file__))
             project_root = os.path.abspath(cwd)
             path_rest: list[str] = []
@@ -350,13 +459,24 @@ async def process_file(file_path: str) -> dict[str, Any]:
                     continue
                 seen_sources.add(init_source)
                 bundled_sources.append(init_source)
+            # Compute entry_module as a CWD-relative dotted path so that the
+            # archive root inferred by push.rs walks back to CWD, matching
+            # the Python SDK behavior and allowing sibling-package imports.
+            rel_path = os.path.relpath(abs_path, cwd)
+            archive_module = re.sub(r"\.py$", "", rel_path).replace("-", "_").replace(os.sep, ".")
             file_manifest["python_bundle"] = {
-                "entry_module": module_name,
+                "entry_module": archive_module,
                 "sources": bundled_sources,
             }
 
     clear_registry(functions_registry)
     clear_registry(prompts_registry)
+    if (
+        evals_registry is not None
+        and hasattr(evals_registry, "evaluators")
+        and isinstance(evals_registry.evaluators, dict)
+    ):
+        evals_registry.evaluators.clear()
     return file_manifest
 
 
 
@@ -88,6 +88,20 @@ type Manifest = {
   files: ManifestFile[];
 };
 
+/**
+ * Turn `input` into a lowercase, hyphen-separated slug: every run of
+ * characters outside `a-z0-9` becomes one `-`, and a hyphen left at the
+ * start or end is removed.
+ */
+function slugify(input: string): string {
+  const hyphenated = input.toLowerCase().replace(/[^a-z0-9]+/g, "-");
+  return hyphenated.replace(/^-|-$/g, "");
+}
+
+/**
+ * Name to report for a scorer: the function's own non-empty `name`, or the
+ * positional placeholder `scorer_<idx>` for anything else.
+ */
+function extractScoreName(score: unknown, idx: number): string {
+  const fallback = `scorer_${idx}`;
+  if (typeof score !== "function") {
+    return fallback;
+  }
+  const declaredName = score.name;
+  return typeof declaredName === "string" && declaredName
+    ? declaredName
+    : fallback;
+}
+
 type EvalRegistry = NonNullable<typeof globalThis._evals>;
 type ZodToJsonSchemaFn = (schema: unknown) => unknown;
 type ZodV4ToJsonSchemaFn = (
@@ -558,6 +572,97 @@ function collectCodeEntries(items: CodeRegistryItem[]): CodeEntry[] {
   return entries;
 }
 
+/**
+ * Build one sandbox manifest entry per registered evaluator.
+ *
+ * Registry values that are not objects, or whose `evaluator` property is
+ * not an object, are skipped. The source file's basename (minus extension
+ * and a trailing `.eval`) is used as the slug prefix and sandbox group name.
+ */
+function collectEvaluatorEntries(
+  evaluators: Record<string, unknown>,
+  sourceFilePath: string,
+): CodeEntry[] {
+  const stem = path
+    .basename(sourceFilePath, path.extname(sourceFilePath))
+    .replace(/\.eval$/, "");
+  const sandboxEntries: CodeEntry[] = [];
+
+  for (const [evalName, registered] of Object.entries(evaluators)) {
+    if (!registered || typeof registered !== "object") {
+      continue;
+    }
+    const candidate = (registered as Record<string, unknown>).evaluator;
+    if (!candidate || typeof candidate !== "object") {
+      continue;
+    }
+    const definition = candidate as Record<string, unknown>;
+
+    // Resolve the project reference from the evaluator's project name, if any.
+    const rawProjectName = definition.project_name;
+    const selector = asProjectSelector(
+      typeof rawProjectName === "string" ? { name: rawProjectName } : undefined,
+    );
+    const projectId =
+      typeof selector.project_id === "string" ? selector.project_id : undefined;
+    const selectorProjectName =
+      typeof selector.project_name === "string"
+        ? selector.project_name
+        : undefined;
+
+    const scoreList: unknown[] = Array.isArray(definition.scores)
+      ? definition.scores
+      : [];
+    const evaluatorDefinition: JsonObject = {
+      scores: scoreList.map((scorer, position) => ({
+        name: extractScoreName(scorer, position),
+      })) as JsonValue,
+    };
+
+    const rawParams = definition.parameters;
+    if (rawParams !== undefined && rawParams !== null) {
+      const hasMarker =
+        typeof rawParams === "object" &&
+        (rawParams as Record<string, unknown>)
+          .__braintrust_parameters_marker === true;
+      if (hasMarker) {
+        const paramObj = rawParams as Record<string, unknown>;
+        evaluatorDefinition.parameters = toJsonValue({
+          type: "braintrust.parameters",
+          schema: paramObj.schema,
+          source: {
+            parametersId: paramObj.id,
+            slug: paramObj.slug,
+            name: paramObj.name,
+            projectId: paramObj.projectId,
+            version: paramObj.version,
+          },
+        } as JsonValue);
+      } else {
+        const serialized = toJsonValue(rawParams as JsonValue);
+        if (serialized !== undefined) {
+          evaluatorDefinition.parameters = serialized;
+        }
+      }
+    }
+
+    // Sandbox entry only — task and scorer entries are pushed separately
+    // when the eval is actually run, matching the Python SDK behavior.
+    sandboxEntries.push({
+      kind: "code",
+      project_id: projectId,
+      project_name: selectorProjectName,
+      name: `Eval ${evalName} sandbox`,
+      slug: slugify(`${stem}-${evalName}-sandbox`),
+      function_type: "sandbox",
+      location: {
+        type: "sandbox",
+        sandbox_spec: { provider: "lambda" },
+        entrypoints: [path.relative(process.cwd(), sourceFilePath)],
+        eval_name: evalName,
+        evaluator_definition: evaluatorDefinition as JsonValue,
+      } as JsonValue,
+      metadata: { _bt_sandbox_group_name: stem },
+    });
+  }
+
+  return sandboxEntries;
+}
+
 async function processFile(filePath: string): Promise<ManifestFile> {
   const absolutePath = path.resolve(process.cwd(), filePath);
   const fallbackRegistry = freshRegistry();
@@ -577,6 +682,10 @@ async function processFile(filePath: string): Promise<ManifestFile> {
       registry.parameters as EventRegistryItem[],
       false,
     )),
+    ...collectEvaluatorEntries(
+      registry.evaluators as Record<string, unknown>,
+      absolutePath,
+    ),
   ];
 
   return {
Original file line number	Diff line number	Diff line change
`@@ -23,3 +23,4 @@ tests/evals/js/eval-bun/test-data.txt`
`23`	`23`	`__pycache__`
`24`	`24`
`25`	`25`	`bt-sync`
	`26`	`+*.env`