From 4e75faf9e27595ffc8011c9a94aae40eef313455 Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Thu, 12 Mar 2026 12:47:42 -0700 Subject: [PATCH 1/8] feat: Introduce a dedicated `yaml_config.md` for detailed configuration fields, refactoring `configuration_reference.md` to link to it and updating `index.md` and `custom.css`. --- docs/_static/custom.css | 19 + docs/reference/configuration_reference.md | 242 +------- docs/reference/index.md | 1 + docs/reference/yaml_config.md | 691 ++++++++++++++++++++++ 4 files changed, 718 insertions(+), 235 deletions(-) create mode 100644 docs/reference/yaml_config.md diff --git a/docs/_static/custom.css b/docs/_static/custom.css index 9243a6b..a0d57ff 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -418,3 +418,22 @@ html[data-theme="dark"] .sig > span.pre:not(:first-child) { html[data-theme="dark"] .sig-paren { color: #888888; } + + +/* ============================================ + COLLAPSIBLE SIDEBARS ON WIDE SCREENS + ============================================ */ + +.bd-sidebar-primary { + padding-right: 30px; + width: auto !important; +} + + +.bd-sidebar-secondary { + width: auto !important; +} + +.bd-article-container { + max-width: none !important; +} \ No newline at end of file diff --git a/docs/reference/configuration_reference.md b/docs/reference/configuration_reference.md index deb2193..a1f4d68 100644 --- a/docs/reference/configuration_reference.md +++ b/docs/reference/configuration_reference.md @@ -12,6 +12,8 @@ The evaluation framework uses the `eval/` directory as its entry point. It conta Configuration is parsed and resolved by the Dart `dataset_config_dart` package, which produces an EvalSet JSON manifest consumed by the Python `dash_evals`. +> **See also:** [YAML Configuration Fields](yaml_config.md) for a complete field-by-field reference with Dart and Python cross-references. + ## Directory Structure ``` @@ -84,77 +86,7 @@ samples: The fix should handle the disposed controller properly. 
``` -### Task-Level Fields - -#### Core Fields - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `func` | string | Yes | Name of the `@task` function (resolved dynamically via `importlib`) | -| `description` | string | No | Human-readable description | -| `samples` | object | Yes | Samples config with `inline` and/or `paths` keys | -| `allowed_variants` | list | No | Whitelist of variant names this task accepts (omit to accept all) | -| `system_message` | string | No | Custom system prompt for this task | -| `workspace` | object | No | Default workspace for all samples | -| `tests` | object | No | Default test files for all samples | - -#### Inspect AI Task Parameters - -These map directly to [Inspect AI's `Task` constructor](https://inspect.aisi.org.uk/reference/inspect_ai.html#task). All are optional and override any `task_defaults` set in the job file. - -| Field | Type | Description | -|-------|------|-------------| -| `model` | string | Default model for this task (overrides the eval model) | -| `config` | object | Model generation config (e.g., `{temperature: 0.2, max_tokens: 4096}`) | -| `model_roles` | object | Named roles for use in `get_model()` | -| `sandbox` | string/object | Sandbox environment type or `[type, config_path]` | -| `approval` | string/object | Tool use approval policies | -| `epochs` | int/object | Number of times to repeat each sample (optionally with score reducer) | -| `fail_on_error` | number/bool | `true` = fail on first error, `0.0–1.0` = fail if proportion exceeds threshold | -| `continue_on_fail` | bool | Continue running if `fail_on_error` condition is met | -| `message_limit` | int | Max total messages per sample | -| `token_limit` | int | Max total tokens per sample | -| `time_limit` | int | Max clock time (seconds) per sample | -| `working_limit` | int | Max working time (seconds) per sample (excludes wait time) | -| `cost_limit` | float | Max cost (dollars) per sample | -| 
`early_stopping` | string/object | Early stopping callbacks | -| `display_name` | string | Task display name (e.g., for plotting) | -| `version` | int | Version of task spec (to distinguish evolutions) | -| `metadata` | object | Additional metadata to associate with the task | - -### Samples Object - -| Field | Type | Description | -|-------|------|-------------| -| `inline` | list | Inline sample definitions | -| `paths` | list | Glob patterns for external sample YAML files (relative to task dir) | - -### Sample Fields (inline in task.yaml) - -#### Core Fields - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `id` | string | Yes | Unique sample identifier | -| `input` | string | Yes | The prompt given to the model | -| `target` | string | Yes | Expected output or grading criteria | -| `difficulty` | string | No | `easy`, `medium`, or `hard` | -| `tags` | list | No | Categories for filtering | -| `system_message` | string | No | Override system prompt for this sample | -| `metadata` | object | No | Arbitrary metadata | -| `workspace` | object | No | Override task-level workspace | -| `tests` | object | No | Override task-level tests | - -#### Inspect AI Sample Parameters - -These map directly to [Inspect AI's `Sample`](https://inspect.aisi.org.uk/reference/inspect_ai.dataset.html#sample). - -| Field | Type | Description | -|-------|------|-------------| -| `choices` | list | Answer choices for multiple-choice evaluations | -| `sandbox` | string/object | Override sandbox environment for this sample | -| `files` | object | Files to copy into the sandbox (`{destination: source}`) | -| `setup` | string | Setup script to run in the sandbox before evaluation | +For the complete list of task fields (including Inspect AI `Task` parameters), see the [Task fields table](yaml_config.md#task). 
### Workspace/Tests References @@ -201,34 +133,7 @@ samples: category: language_fundamentals ``` ---- - -### Core Fields - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `id` | string | Yes | Unique sample identifier | -| `input` | string | Yes | The prompt given to the model | -| `target` | string | Yes | Expected output or grading criteria | -| `difficulty` | string | No | `easy`, `medium`, or `hard` | -| `tags` | list | No | Categories for filtering | -| `system_message` | string | No | Override system prompt for this sample | -| `metadata` | object | No | Arbitrary metadata | -| `workspace` | object | No | Override task-level workspace | -| `tests` | object | No | Override task-level tests | - ---- - -### Inspect AI Sample Parameters - -These map directly to [Inspect AI's `Sample`](https://inspect.aisi.org.uk/reference/inspect_ai.dataset.html#sample). - -| Field | Type | Description | -|-------|------|-------------| -| `choices` | list | Answer choices for multiple-choice evaluations | -| `sandbox` | string/object | Override sandbox environment for this sample | -| `files` | object | Files to copy into the sandbox (`{destination: source}`) | -| `setup` | string | Setup script to run in the sandbox before evaluation | +For the complete list of sample fields, see the [Sample fields table](yaml_config.md#sample). ### Multiple Choice Example @@ -256,33 +161,6 @@ These map directly to [Inspect AI's `Sample`](https://inspect.aisi.org.uk/refere setup: "cd /workspace && flutter pub get" ``` ---- - -### Workspace & Tests References - -Workspaces and test paths can be specified at task level (inherited by all samples) or per-sample (overrides task level). 
- -```yaml -# Reference a reusable template -workspace: - template: flutter_app - -# Reference a path relative to task directory -workspace: - path: ./project - -# Clone from git -workspace: - git: https://github.com/example/repo.git - -# Shorthand (equivalent to path:) -workspace: ./project -``` - -> [!NOTE] -> Paths in `workspace` and `tests` are resolved **relative to the task directory** (e.g., `tasks/flutter_bug_fix/`). - - --- ## Job files @@ -330,110 +208,13 @@ task_defaults: # log_images: true ``` - -### Core Job Fields - -| Field | Type | Description | -|-------|------|-------------| -| `logs_dir` | string | Override logs directory (default: `../logs`) | -| `sandbox_type` | string | Sandbox type: `local`, `docker`, or `podman` (default: `local`) | -| `max_connections` | int | Max concurrent API connections (default: `10`) | -| `max_retries` | int | Max retry attempts for failed samples (default: `3`) | -| `save_examples` | bool | If `true`, copies the agent's final workspace to `//examples/` after each sample. (default: `false`) | -| `models` | list | Filter to specific models — omit to run all | -| `variants` | map | Named variant definitions (see Variants section) — omit to run all defined in task files | -| `tasks` | object | Task discovery and overrides (see below) | - -### Inspect AI eval_set() Parameters - -All [Inspect AI `eval_set()` parameters](https://inspect.aisi.org.uk/reference/inspect_ai.html#eval_set) are available as top-level keys in the job file. These control retry behavior, concurrency, logging, and more. 
- -#### Retry & Error Handling - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `retry_attempts` | int | `10` | Max retry attempts before giving up | -| `retry_wait` | float | `60` | Seconds between retries (exponential backoff) | -| `retry_connections` | float | `0.5` | Reduce max_connections at this rate per retry | -| `retry_cleanup` | bool | `true` | Cleanup failed log files after retries | -| `retry_on_error` | int | — | Retry samples on error (per-sample) | -| `fail_on_error` | float | `0.05` | Fail if error proportion exceeds threshold | -| `continue_on_fail` | bool | — | Continue running even if fail_on_error is met | -| `debug_errors` | bool | `false` | Raise task errors for debugging | - -#### Concurrency - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `max_samples` | int | `max_connections` | Max concurrent samples per task | -| `max_tasks` | int | `max(4, models)` | Max tasks to run in parallel | -| `max_subprocesses` | int | `cpu_count` | Max subprocesses in parallel | -| `max_sandboxes` | int | — | Max sandboxes per-provider in parallel | - -#### Logging - -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `log_level` | string | `info` | Console log level (`debug`, `info`, `warning`, `error`) | -| `log_level_transcript` | string | `info` | Log file level | -| `log_format` | string | `json` | Log format (`eval` or `json`) | -| `log_samples` | bool | `true` | Log detailed samples and scores | -| `log_realtime` | bool | `true` | Log events in realtime | -| `log_images` | bool | `false` | Log base64-encoded images | -| `log_buffer` | int | — | Samples to buffer before log write | -| `log_shared` | int | — | Sync sample events for realtime viewing | -| `log_dir_allow_dirty` | bool | `false` | Allow log dir with unrelated logs | - -#### Model Configuration - -| Field | Type | Description | -|-------|------|-------------| -| `model_base_url` | 
string | Base URL for the model API | -| `model_args` | object | Model creation arguments | -| `model_roles` | object | Named roles for `get_model()` | -| `task_args` | object | Task creation arguments | -| `model_cost_config` | object | Model prices for cost tracking | - -#### Sample Control - -| Field | Type | Description | -|-------|------|-------------| -| `limit` | int/list | Limit samples (count or `[start, end]` range) | -| `sample_id` | string/list | Evaluate specific sample(s) | -| `sample_shuffle` | bool/int | Shuffle samples (pass seed for deterministic order) | -| `epochs` | int/object | Repeat samples and optional score reducer | - -#### Limits (Applied to All Samples) - -| Field | Type | Description | -|-------|------|-------------| -| `message_limit` | int | Max messages per sample | -| `token_limit` | int | Max tokens per sample | -| `time_limit` | int | Max clock time (seconds) per sample | -| `working_limit` | int | Max working time (seconds) per sample | -| `cost_limit` | float | Max cost (dollars) per sample | - -#### Miscellaneous - -| Field | Type | Description | -|-------|------|-------------| -| `tags` | list | Tags for this evaluation run | -| `metadata` | object | Metadata for this evaluation run | -| `trace` | bool | Trace model interactions to terminal | -| `display` | string | Task display type (default: `full`) | -| `score` | bool | Score output (default: `true`) | -| `approval` | string/object | Tool use approval policies | -| `solver` | string/object | Alternative solver(s) | -| `sandbox_cleanup` | bool | Cleanup sandbox after task (default: `true`) | -| `bundle_dir` | string | Directory for bundled logs + viewer | -| `bundle_overwrite` | bool | Overwrite files in bundle_dir | -| `eval_set_id` | string | Custom ID for the eval set | +For the complete list of job fields (including all Inspect AI `eval_set()` parameters), see the [Job fields table](yaml_config.md#job). 
### Pass-Through Sections #### `task_defaults` -Default [Task parameters](#inspect-ai-task-parameters) applied to **every task** in this job. Per-task overrides from `task.yaml` take precedence. +Default [Task parameters](yaml_config.md#task) applied to **every task** in this job. Per-task overrides from `task.yaml` take precedence. ```yaml task_defaults: @@ -467,11 +248,6 @@ tasks: exclude-samples: [slow_test] # Exclude these samples ``` -| Field | Type | Description | -|-------|------|-------------| -| `paths` | list | Glob patterns for discovering task directories | -| `inline` | object | Per-task configuration overrides | - --- ## Variants @@ -486,11 +262,7 @@ variants: full: { context_files: [./context_files/flutter.md], mcp_servers: [dart] } ``` -| Field | Type | Default | Description | -|-------|------|---------|-------------| -| `context_files` | list | `[]` | Paths or glob patterns to context files (relative to task dir) | -| `skills` | list | `[]` | Paths or glob patterns to skill directories (relative to task dir) | -| `mcp_servers` | list | `[]` | MCP server identifiers | +Variant sub-fields (`context_files`, `mcp_servers`, `skills`, `flutter_channel`) are documented in the [Job fields table](yaml_config.md#job). Tasks can optionally restrict which variants apply to them via `allowed_variants` in their `task.yaml`: diff --git a/docs/reference/index.md b/docs/reference/index.md index 1576729..86879cb 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -8,6 +8,7 @@ API documentation, CLI usage, and other reference material. 
glossary cli configuration_reference +yaml_config ``` ```{toctree} diff --git a/docs/reference/yaml_config.md b/docs/reference/yaml_config.md new file mode 100644 index 0000000..8e6e632 --- /dev/null +++ b/docs/reference/yaml_config.md @@ -0,0 +1,691 @@ +# YAML Configuration Fields + +This page provides a complete field-by-field reference for each YAML configuration file type, cross-referenced with the corresponding Dart and Python object field names. + +## Job + +Job files define runtime settings for an evaluation run, including sandbox configuration, rate limits, model selection, variant definitions, and pass-through parameters for Inspect AI's `eval_set()` and `Task` constructors. Located in `eval/jobs/`. + +```{list-table} +:header-rows: 1 +:widths: 20 8 5 12 12 43 + +* - Field name + - YAML type + - Optional + - Dart field + - Python field + - Description +* - `log_dir` + - string + - N + - `logDir` + - `log_dir` + - Directory to write evaluation logs to +* - `sandbox_type` + - string + - Y + - `sandboxType` + - `sandbox_type` + - Sandbox type: `local`, `docker`, or `podman` (default: `local`) +* - `max_connections` + - int + - Y + - `maxConnections` + - `max_connections` + - Maximum concurrent API connections (default: `10`) +* - `models` + - list + - Y + - `models` + - `models` + - Filter to specific models — omit to use defaults +* - `variants` + - map + - Y + - `variants` + - `variants` + - Named variant definitions (keys are names, values are config maps) +* - `variants`\ +   `.`\ +   `.context_files` + - list + - Y + - + - + - Paths or glob patterns to context files +* - `variants`\ +   `.`\ +   `.mcp_servers` + - list + - Y + - + - + - MCP server identifiers +* - `variants`\ +   `.`\ +   `.skills` + - list + - Y + - + - + - Paths or glob patterns to skill directories +* - `variants`\ +   `.`\ +   `.flutter_channel` + - string + - Y + - + - + - Flutter SDK channel (`stable`, `beta`, `main`) +* - `task_paths` + - list + - Y + - `taskPaths` + - 
`task_paths` + - Glob patterns for discovering task directories (relative to dataset root) +* - `tasks` + - object + - Y + - `tasks` + - `tasks` + - Per-task configurations with inline overrides +* - `tasks`\ +   `.`\ +   `.include-samples` + - list + - Y + - `JobTask.includeSamples` + - `JobTask.include_samples` + - Only run these sample IDs +* - `tasks`\ +   `.`\ +   `.exclude-samples` + - list + - Y + - `JobTask.excludeSamples` + - `JobTask.exclude_samples` + - Exclude these sample IDs +* - `tasks`\ +   `.`\ +   `.system_message` + - string + - Y + - `JobTask.systemMessage` + - `JobTask.system_message` + - Override system message for this task +* - `save_examples` + - bool + - Y + - `saveExamples` + - `save_examples` + - Copy final workspace to `/examples/` after each sample (default: `false`) +* - `retry_attempts` + - int + - Y + - `retryAttempts` + - `retry_attempts` + - Max retry attempts before giving up +* - `max_retries` + - int + - Y + - `maxRetries` + - `max_retries` + - Max retry attempts for failed samples +* - `retry_wait` + - float + - Y + - `retryWait` + - `retry_wait` + - Seconds between retries (exponential backoff) +* - `retry_connections` + - float + - Y + - `retryConnections` + - `retry_connections` + - Reduce `max_connections` at this rate per retry +* - `retry_cleanup` + - bool + - Y + - `retryCleanup` + - `retry_cleanup` + - Cleanup failed log files after retries +* - `fail_on_error` + - float + - Y + - `failOnError` + - `fail_on_error` + - Fail if error proportion exceeds threshold (`0.0–1.0`) +* - `continue_on_fail` + - bool + - Y + - `continueOnFail` + - `continue_on_fail` + - Continue running even if `fail_on_error` condition is met +* - `retry_on_error` + - int + - Y + - `retryOnError` + - `retry_on_error` + - Retry samples on error (per-sample) +* - `debug_errors` + - bool + - Y + - `debugErrors` + - `debug_errors` + - Raise task errors for debugging +* - `max_samples` + - int + - Y + - `maxSamples` + - `max_samples` + - Max concurrent 
samples per task +* - `max_tasks` + - int + - Y + - `maxTasks` + - `max_tasks` + - Max tasks to run in parallel +* - `max_subprocesses` + - int + - Y + - `maxSubprocesses` + - `max_subprocesses` + - Max subprocesses in parallel +* - `max_sandboxes` + - int + - Y + - `maxSandboxes` + - `max_sandboxes` + - Max sandboxes (per-provider) in parallel +* - `log_level` + - string + - Y + - `logLevel` + - `log_level` + - Console log level (`debug`, `info`, `warning`, `error`) +* - `log_level_transcript` + - string + - Y + - `logLevelTranscript` + - `log_level_transcript` + - Log file level +* - `log_format` + - string + - Y + - `logFormat` + - `log_format` + - Log format (`eval` or `json`) +* - `log_samples` + - bool + - Y + - `logSamples` + - `log_samples` + - Log detailed samples and scores +* - `log_realtime` + - bool + - Y + - `logRealtime` + - `log_realtime` + - Log events in realtime +* - `log_images` + - bool + - Y + - `logImages` + - `log_images` + - Log base64-encoded images +* - `log_buffer` + - int + - Y + - `logBuffer` + - `log_buffer` + - Samples to buffer before log write +* - `log_shared` + - int + - Y + - `logShared` + - `log_shared` + - Sync sample events for realtime viewing +* - `log_dir_allow_dirty` + - bool + - Y + - `logDirAllowDirty` + - `log_dir_allow_dirty` + - Allow log dir with unrelated logs +* - `model_base_url` + - string + - Y + - `modelBaseUrl` + - `model_base_url` + - Base URL for the model API +* - `model_args` + - object + - Y + - `modelArgs` + - `model_args` + - Model creation arguments +* - `model_roles` + - object + - Y + - `modelRoles` + - `model_roles` + - Named roles for `get_model()` +* - `task_args` + - object + - Y + - `taskArgs` + - `task_args` + - Task creation arguments +* - `model_cost_config` + - object + - Y + - `modelCostConfig` + - `model_cost_config` + - Model prices for cost tracking +* - `limit` + - int/list + - Y + - `limit` + - `limit` + - Limit samples (count or `[start, end]` range) +* - `sample_id` + - string/list 
+ - Y + - `sampleId` + - `sample_id` + - Evaluate specific sample(s) +* - `sample_shuffle` + - bool/int + - Y + - `sampleShuffle` + - `sample_shuffle` + - Shuffle samples (pass seed for deterministic order) +* - `epochs` + - int/object + - Y + - `epochs` + - `epochs` + - Repeat samples and optional score reducer +* - `message_limit` + - int + - Y + - `messageLimit` + - `message_limit` + - Max messages per sample +* - `token_limit` + - int + - Y + - `tokenLimit` + - `token_limit` + - Max tokens per sample +* - `time_limit` + - int + - Y + - `timeLimit` + - `time_limit` + - Max clock time (seconds) per sample +* - `working_limit` + - int + - Y + - `workingLimit` + - `working_limit` + - Max working time (seconds) per sample +* - `cost_limit` + - float + - Y + - `costLimit` + - `cost_limit` + - Max cost (dollars) per sample +* - `tags` + - list + - Y + - `tags` + - `tags` + - Tags for this evaluation run +* - `metadata` + - object + - Y + - `metadata` + - `metadata` + - Metadata for this evaluation run +* - `trace` + - bool + - Y + - `trace` + - `trace` + - Trace model interactions to terminal +* - `display` + - string + - Y + - `display` + - `display` + - Task display type (default: `full`) +* - `score` + - bool + - Y + - `score` + - `score` + - Score output (default: `true`) +* - `approval` + - string/object + - Y + - `approval` + - `approval` + - Tool use approval policies +* - `solver` + - string/object + - Y + - `solver` + - `solver` + - Alternative solver(s) +* - `sandbox_cleanup` + - bool + - Y + - `sandboxCleanup` + - `sandbox_cleanup` + - Cleanup sandbox after task +* - `bundle_dir` + - string + - Y + - `bundleDir` + - `bundle_dir` + - Directory for bundled logs + viewer +* - `bundle_overwrite` + - bool + - Y + - `bundleOverwrite` + - `bundle_overwrite` + - Overwrite files in `bundle_dir` +* - `eval_set_id` + - string + - Y + - `evalSetId` + - `eval_set_id` + - Custom ID for the eval set +* - `eval_set_overrides` + - object + - Y + - `evalSetOverrides` + - 
`eval_set_overrides`
+  - Additional `eval_set()` kwargs not covered by top-level fields
+* - `task_defaults`
+  - object
+  - Y
+  - `taskDefaults`
+  - `task_defaults`
+  - Default `Task` kwargs applied to every task in this job
+```
+
+## Task
+
+Task files define a single evaluation task with its samples, prompt configuration, and optional Inspect AI `Task` parameter overrides. Located in `eval/tasks/<task_dir>/task.yaml`.
+
+```{list-table}
+:header-rows: 1
+:widths: 20 8 5 12 12 43
+
+* - Field name
+  - YAML type
+  - Optional
+  - Dart field
+  - Python field
+  - Description
+* - `func`
+  - string
+  - Y
+  -
+  -
+  - Name of the `@task` function (defaults to directory name)
+* - `id`
+  - string
+  - Y
+  -
+  -
+  - Task identifier (defaults to directory name)
+* - `description`
+  - string
+  - Y
+  -
+  -
+  - Human-readable description
+* - `system_message`
+  - string
+  - Y
+  -
+  -
+  - Custom system prompt for this task
+* - `samples`
+  - object
+  - N
+  -
+  -
+  - Samples config with `inline` and/or `paths` keys
+* - `samples`\
+   `.inline`
+  - list
+  - Y
+  -
+  -
+  - Inline sample definitions (list of sample objects)
+* - `samples`\
+   `.paths`
+  - list
+  - Y
+  -
+  -
+  - Glob patterns for external sample YAML files (relative to task dir)
+* - `allowed_variants`
+  - list
+  - Y
+  -
+  -
+  - Whitelist of variant names this task accepts
+* - `workspace`
+  - string/object
+  - Y
+  -
+  -
+  - Default workspace for all samples
+* - `tests`
+  - string/object
+  - Y
+  -
+  -
+  - Default test files for all samples
+* - `model`
+  - string
+  - Y
+  - `model`
+  - `model`
+  - Default model for this task
+* - `config`
+  - object
+  - Y
+  - `config`
+  - `config`
+  - Model generation config (e.g. 
`{temperature: 0.2}`) +* - `model_roles` + - object + - Y + - `modelRoles` + - `model_roles` + - Named roles for `get_model()` +* - `sandbox` + - string/object + - Y + - `sandbox` + - `sandbox` + - Sandbox environment type or config +* - `approval` + - string/object + - Y + - `approval` + - `approval` + - Tool use approval policies +* - `epochs` + - int/object + - Y + - `epochs` + - `epochs` + - Number of times to repeat each sample +* - `fail_on_error` + - number/bool + - Y + - `failOnError` + - `fail_on_error` + - Fail threshold for sample errors +* - `continue_on_fail` + - bool + - Y + - `continueOnFail` + - `continue_on_fail` + - Continue running if `fail_on_error` condition is met +* - `message_limit` + - int + - Y + - `messageLimit` + - `message_limit` + - Max total messages per sample +* - `token_limit` + - int + - Y + - `tokenLimit` + - `token_limit` + - Max total tokens per sample +* - `time_limit` + - int + - Y + - `timeLimit` + - `time_limit` + - Max clock time (seconds) per sample +* - `working_limit` + - int + - Y + - `workingLimit` + - `working_limit` + - Max working time (seconds) per sample +* - `cost_limit` + - float + - Y + - `costLimit` + - `cost_limit` + - Max cost (dollars) per sample +* - `early_stopping` + - string/object + - Y + - `earlyStopping` + - `early_stopping` + - Early stopping callbacks +* - `display_name` + - string + - Y + - `displayName` + - `display_name` + - Task display name (e.g. for plotting) +* - `version` + - int + - Y + - `version` + - `version` + - Version of task spec +* - `metadata` + - object + - Y + - `metadata` + - `metadata` + - Additional metadata to associate with the task +``` + +## Sample + +Samples are individual test cases defined either inline in `task.yaml` under `samples.inline`, or in external YAML files referenced via `samples.paths`. Fields like `difficulty`, `tags`, `workspace`, and `tests` are parsed from YAML and stored inside the sample's `metadata` dict. 
+ +```{list-table} +:header-rows: 1 +:widths: 20 8 5 12 12 43 + +* - Field name + - YAML type + - Optional + - Dart field + - Python field + - Description +* - `id` + - string + - N + - `id` + - `id` + - Unique sample identifier +* - `input` + - string + - N + - `input` + - `input` + - The prompt given to the model +* - `target` + - string + - N + - `target` + - `target` + - Expected output or grading criteria +* - `difficulty` + - string + - Y + - + - + - `easy`, `medium`, or `hard` (stored in `metadata["difficulty"]`) +* - `tags` + - list + - Y + - + - + - Categories for filtering (stored in `metadata["tags"]`) +* - `system_message` + - string + - Y + - + - + - Override system prompt for this sample (stored in `metadata`) +* - `workspace` + - string/object + - Y + - + - + - Override task-level workspace (resolved path stored in `metadata["workspace"]`) +* - `tests` + - string/object + - Y + - + - + - Override task-level tests (resolved path stored in `metadata["tests"]`) +* - `choices` + - list + - Y + - `choices` + - `choices` + - Answer choices for multiple-choice evaluations +* - `metadata` + - object + - Y + - `metadata` + - `metadata` + - Arbitrary metadata +* - `sandbox` + - string/object + - Y + - `sandbox` + - `sandbox` + - Override sandbox environment for this sample +* - `files` + - object + - Y + - `files` + - `files` + - Files to copy into sandbox (`{destination: source}`) +* - `setup` + - string + - Y + - `setup` + - `setup` + - Setup script to run in sandbox before evaluation +``` From 3cce70806c9aef115f2a51bbd44181c7c55c8b93 Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Fri, 13 Mar 2026 15:56:27 -0700 Subject: [PATCH 2/8] updates in flight --- CHANGELOG.md | 105 ++++++++++++++++++ docs/reference/yaml_config.md | 84 +++++++++++--- .../src/dataset_config_python/models/job.py | 2 +- 3 files changed, 177 insertions(+), 14 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 
0000000..13b3b70 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,105 @@ +# Changelog + +## Unreleased + +### New + +- **`Job.description`.** Optional human-readable description field on Job. + +- **`Job.imagePrefix` / `Job.image_prefix`.** Registry URL prefix prepended to image names during sandbox resolution. Enables switching between local images and remote registries (e.g. Artifact Registry on GKE) without duplicating job YAML files. + +- **Tag-based filtering.** New `TagFilter` model with `include_tags` and `exclude_tags`, used at three levels: + - `Job.taskFilters` / `Job.task_filters` — select tasks by metadata tags + - `Job.sampleFilters` / `Job.sample_filters` — select samples by metadata tags + - `variant_filters` on task YAML — restrict which variants apply to a task (supplements `allowed_variants`) + +- **`JobTask.args`.** Per-task argument overrides. Allows a job to pass task-specific arguments (e.g. `base_url`, `dataset_path`) to individual tasks. + +- **`Task.systemMessage` / `Task.system_message`.** System prompt override at the task level. Previously only available as a job-level override via `JobTask`. + +- **`Task.sandboxParameters` / `Task.sandbox_parameters`.** Pass-through dictionary for sandbox plugin configuration. + +- **`module:task` syntax.** Task function references can now use `module.path:function_name` format for Python tasks. + +### Breaking Changes + +- **`Task.taskFunc` → `Task.func`.** Renamed model field to match the YAML key name. JSON serialization key changes from `"task_func"` to `"func"`. Both Dart and Python packages must update in lockstep. + +- **Sandbox registry is now configurable.** The hardcoded `kSandboxRegistry` and `kSdkChannels` maps are extracted from `eval_set_resolver.dart` and made data-driven, allowing non-Flutter projects to define their own sandbox configurations. 
+ +- **Workspace resolution uses native Inspect fields.** The `workspace` YAML key remains as parser-level sugar but resolves into Inspect AI's native `Sample.files` and `Sample.setup` fields. The `Sample.setup` command is no longer hardcoded to `cd /workspace && flutter pub get`; it is configurable or omitted for non-Flutter tasks. + +### Documentation + +- Updated `docs/reference/yaml_config.md` with all new fields and updated descriptions. +- Updated `docs/guides/config.md` (pending — after implementation). + +## 11 March, 2025 + +### New + +- **`dataset_config_python` package.** Python port of the Dart config package (`dataset_config_dart`), providing full parity for YAML parsing, resolution, and JSON output. Includes Pydantic models for `Job`, `Task`, `Sample`, `EvalSet`, `Variant`, `Dataset`, and `ContextFile`. Exposes `resolve()` and `write_eval_sets()` as the public API. No Dart SDK or Inspect AI dependency required — can be installed standalone by any team that needs to parse eval config YAML. + +### Breaking Changes + +- **Renamed `dataset_config` → `dataset_config_dart`.** The Dart config package was renamed for clarity alongside the new Python package. + +- **Renamed `dash_evals_config` → `dataset_config_python`.** The Python config package was renamed from its original name for consistency with the Dart package. + +## 28 February, 2025 + +### New + +- **`eval_config` Dart package.** New package with a layered Parser → Resolver → Writer architecture that converts dataset YAML into EvalSet JSON for the Python runner. Provides `ConfigResolver` facade plus direct access to `YamlParser`, `JsonParser`, `EvalSetResolver`, and `EvalSetWriter`. 
+
+- **Dual-mode eval runner.** The Python runner now supports two invocation modes:
+  - `run-evals --json ./eval_set.json` — consume a JSON manifest produced by the Dart CLI
+  - `run-evals --task <task_name> --model <model_name>` — run a single task directly from CLI arguments
+
+- **Generalized task functions.** Task implementations are now language-agnostic by default. Flutter-specific tasks (`flutter_bug_fix`, `flutter_code_gen`) are thin wrappers around the generic `bug_fix` and `code_gen` tasks. New tasks: `analyze_codebase`, `mcp_tool`, `skill_test`.
+
+- **New Dart domain models.** `EvalSet`, `Task`, `Sample`, `Variant`, and `TaskInfo` models in the `models` package map directly to the Inspect AI evaluation structure.
+
+### Breaking Changes
+
+- **Removed Python `registries.py`.** Task/model/sandbox registries are removed. Task functions are now discovered dynamically via `importlib` (short names like `"flutter_code_gen"` resolve automatically).
+
+- **Removed `TaskConfig` and `SampleConfig`.** Replaced by `ParsedTask` (intermediate parsing type in `eval_config`) and `Sample` (Inspect AI domain model).
+
+- **Removed legacy Python config parsing.** The `config/parsers/` directory, `load_yaml` utility, and associated model definitions have been removed from `eval_runner`. Configuration is now handled by the Dart `eval_config` package.
+
+- **Models package reorganized.** Report-app models (used by the Flutter results viewer) moved to `models/lib/src/report_app/`. The top-level `models/lib/src/` now contains inspect-domain models.
+
+- **Dataset utilities moved.** `DatasetReader`, `filesystem_utils`, and discovery helpers moved from `eval_config` to `eval_cli`.
+
+## 25 February, 2025
+
+### Breaking Changes
+
+- **Variant format changed from list to named map.** Job YAML files now define variants as a named map instead of a list. Tasks can optionally restrict applicable variants via `allowed_variants` in their `task.yaml`.
+ + **Before (list format):** + ```yaml + variants: + - baseline + - { mcp_servers: [dart] } + ``` + + **After (named map format):** + ```yaml + # job.yaml + variants: + baseline: {} + mcp_only: { mcp_servers: [dart] } + context_only: { context_files: [./context_files/flutter.md] } + full: { context_files: [./context_files/flutter.md], mcp_servers: [dart] } + ``` + + ```yaml + # task.yaml (optional — omit to accept all job variants) + allowed_variants: [baseline, mcp_only] + ``` + +- **Removed `DEFAULT_VARIANTS` registry.** Variants are no longer defined globally in `registries.py`. Each job file defines its own variants. + +- **Removed `variants` from `JobTask`.** Per-task variant overrides (`job.tasks..variants`) are replaced by task-level `allowed_variants` whitelists. \ No newline at end of file diff --git a/docs/reference/yaml_config.md b/docs/reference/yaml_config.md index 8e6e632..05d63b8 100644 --- a/docs/reference/yaml_config.md +++ b/docs/reference/yaml_config.md @@ -4,7 +4,7 @@ This page provides a complete field-by-field reference for each YAML configurati ## Job -Job files define runtime settings for an evaluation run, including sandbox configuration, rate limits, model selection, variant definitions, and pass-through parameters for Inspect AI's `eval_set()` and `Task` constructors. Located in `eval/jobs/`. +Job files define runtime settings for an evaluation run, including sandbox configuration, rate limits, model selection, variant definitions, tag-based filtering, and pass-through parameters for Inspect AI's `eval_set()` and `Task` constructors. Located in `eval/jobs/`. 
```{list-table} :header-rows: 1 @@ -16,6 +16,12 @@ Job files define runtime settings for an evaluation run, including sandbox confi - Dart field - Python field - Description +* - `description` + - string + - Y + - `description` + - `description` + - Human-readable description of the job * - `log_dir` - string - N @@ -28,6 +34,12 @@ Job files define runtime settings for an evaluation run, including sandbox confi - `sandboxType` - `sandbox_type` - Sandbox type: `local`, `docker`, or `podman` (default: `local`) +* - `image_prefix` + - string + - Y + - `imagePrefix` + - `image_prefix` + - Registry prefix prepended to image names during sandbox resolution (e.g. `us-central1-docker.pkg.dev/project/repo/`) * - `max_connections` - int - Y @@ -78,6 +90,32 @@ Job files define runtime settings for an evaluation run, including sandbox confi - - - Flutter SDK channel (`stable`, `beta`, `main`) +* - `task_filters` + - object + - Y + - `taskFilters` + - `task_filters` + - Tag-based task selection filter +* - `task_filters`\ +   `.include_tags` + - list + - Y + - `TagFilter.includeTags` + - `TagFilter.include_tags` + - Only run tasks whose metadata tags include **all** of these +* - `task_filters`\ +   `.exclude_tags` + - list + - Y + - `TagFilter.excludeTags` + - `TagFilter.exclude_tags` + - Exclude tasks whose metadata tags include **any** of these +* - `sample_filters` + - object + - Y + - `sampleFilters` + - `sample_filters` + - Tag-based sample selection filter (same schema as `task_filters`) * - `task_paths` - list - Y @@ -114,6 +152,14 @@ Job files define runtime settings for an evaluation run, including sandbox confi - `JobTask.systemMessage` - `JobTask.system_message` - Override system message for this task +* - `tasks`\ +   `.`\ +   `.args` + - object + - Y + - `JobTask.args` + - `JobTask.args` + - Per-task argument overrides passed to the task function * - `save_examples` - bool - Y @@ -433,9 +479,9 @@ Task files define a single evaluation task with its samples, prompt 
configuratio * - `func` - string - Y - - - - - - Name of the `@task` function (defaults to directory name) + - `func` + - `func` + - Name of the `@task` function or `module:function` reference (defaults to directory name) * - `id` - string - Y @@ -445,15 +491,9 @@ Task files define a single evaluation task with its samples, prompt configuratio * - `description` - string - Y - - - - + - `description` + - `description` - Human-readable description -* - `system_message` - - string - - Y - - - - - - Custom system prompt for this task * - `samples` - object - N @@ -480,12 +520,30 @@ Task files define a single evaluation task with its samples, prompt configuratio - - - Whitelist of variant names this task accepts +* - `variant_filters` + - object + - Y + - + - + - Tag-based variant filter (same schema as job-level `task_filters`) +* - `system_message` + - string + - Y + - `systemMessage` + - `system_message` + - Custom system prompt for this task +* - `sandbox_parameters` + - object + - Y + - `sandboxParameters` + - `sandbox_parameters` + - Pass-through parameters for sandbox plugin configuration * - `workspace` - string/object - Y - - - - Default workspace for all samples + - Default workspace for all samples (resolved into `Sample.files` and `Sample.setup`) * - `tests` - string/object - Y diff --git a/packages/dataset_config_python/src/dataset_config_python/models/job.py b/packages/dataset_config_python/src/dataset_config_python/models/job.py index c82ccc1..683e09f 100644 --- a/packages/dataset_config_python/src/dataset_config_python/models/job.py +++ b/packages/dataset_config_python/src/dataset_config_python/models/job.py @@ -4,7 +4,7 @@ from typing import Any -from pydantic import BaseModel, Field +from pydantic import BaseModel class JobTask(BaseModel): From b9af6a29d79286497daa533da5abdbb55c861c4f Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Fri, 13 Mar 2026 16:45:25 -0700 Subject: [PATCH 3/8] rename func --- IMPLEMENTATION_PLAN.md | 315 ++++++++++++++++++ 
.../src/dash_evals/runner/json_runner.py | 4 +- .../lib/src/models/context_file.g.dart | 2 +- .../lib/src/models/dataset.g.dart | 2 +- .../lib/src/models/eval_log.g.dart | 76 ++--- .../lib/src/models/eval_set.g.dart | 2 +- .../lib/src/models/job.dart | 23 ++ .../lib/src/models/job.freezed.dart | 186 ++++++++--- .../lib/src/models/job.g.dart | 16 +- .../lib/src/models/models.dart | 1 + .../lib/src/models/tag_filter.dart | 33 ++ .../lib/src/models/tag_filter.freezed.dart | 290 ++++++++++++++++ .../lib/src/models/tag_filter.g.dart | 22 ++ .../lib/src/models/task.dart | 14 +- .../lib/src/models/task.freezed.dart | 68 ++-- .../lib/src/models/task.g.dart | 10 +- .../lib/src/models/variant.g.dart | 2 +- .../lib/src/parsed_task.dart | 14 +- .../lib/src/parsers/json_parser.dart | 4 +- .../lib/src/parsers/yaml_parser.dart | 4 +- .../lib/src/resolvers/eval_set_resolver.dart | 2 +- packages/dataset_config_dart/pubspec.yaml | 3 + .../test/eval_set_resolver_test.dart | 10 +- .../test/eval_set_writer_test.dart | 2 +- .../test/json_parser_test.dart | 4 +- .../test/parsed_task_test.dart | 16 +- .../dataset_config_python/models/__init__.py | 3 + .../src/dataset_config_python/models/job.py | 12 + .../models/tag_filter.py | 30 ++ .../src/dataset_config_python/models/task.py | 8 +- .../src/dataset_config_python/parser.py | 10 +- .../src/dataset_config_python/resolver.py | 2 +- .../tests/test_config.py | 6 +- .../devals_cli/lib/src/dataset/dry_run.dart | 4 +- tool/config_parity/pubspec.lock | 108 ------ 35 files changed, 1038 insertions(+), 270 deletions(-) create mode 100644 IMPLEMENTATION_PLAN.md create mode 100644 packages/dataset_config_dart/lib/src/models/tag_filter.dart create mode 100644 packages/dataset_config_dart/lib/src/models/tag_filter.freezed.dart create mode 100644 packages/dataset_config_dart/lib/src/models/tag_filter.g.dart create mode 100644 packages/dataset_config_python/src/dataset_config_python/models/tag_filter.py delete mode 100644 tool/config_parity/pubspec.lock 
diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..74441ea --- /dev/null +++ b/IMPLEMENTATION_PLAN.md @@ -0,0 +1,315 @@ +# Config Improvements — Implementation Plan + +This document details the implementation steps for all decided config improvements. Each section includes the specific files to modify in both Dart and Python packages, what to change, and relevant context. + +> **Branch:** `yardstick-config-updates` +> **Related docs:** `CHANGELOG.md`, `docs/reference/yaml_config.md` +> **Design analysis:** The original design doc (`config_improvements.md`) has been deleted. The finalized decisions are captured in `CHANGELOG.md`. + +--- + +## Table of Contents + +1. [Model Changes](#1-model-changes) +2. [Parser/Resolver Changes](#2-parserresolver-changes) +3. [Tag-Based Filtering](#3-tag-based-filtering) +4. [File Index](#4-file-index) +5. [Verification](#5-verification) + +--- + +## 1. Model Changes + +### 1.1 Add `description` to Job + +Simple optional string field. + +**Dart** — `packages/dataset_config_dart/lib/src/models/job.dart` +```dart +String? description, // Add to Job freezed class +``` + +**Python** — `packages/dataset_config_python/src/dataset_config_python/models/job.py` +```python +description: str | None = None +``` + +**Parser** — `packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart` +```dart +final description = data['description'] as String?; +// Pass to Job constructor +``` + +--- + +### 1.2 Add `image_prefix` to Job + +Registry URL prefix prepended to image names during sandbox resolution (e.g. `us-central1-docker.pkg.dev/project/repo/`). + +**Dart** — `packages/dataset_config_dart/lib/src/models/job.dart` +```dart +String? imagePrefix, +``` + +**Python** — `packages/dataset_config_python/src/dataset_config_python/models/job.py` +```python +image_prefix: str | None = None +``` + +**Parser** — read `image_prefix` from YAML, pass to Job. 
+ +**Resolver** — `packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart` +- In `_resolveSandbox()`, prepend `job.imagePrefix` to image names when constructing sandbox specs. + +--- + +### 1.3 Add `args` to JobTask + +Per-task argument overrides passed to the task function. + +**Dart** — `packages/dataset_config_dart/lib/src/models/job.dart` (on `JobTask` class) +```dart +@JsonKey(name: 'args') Map? args, +``` + +**Python** — `packages/dataset_config_python/src/dataset_config_python/models/job.py` (on `JobTask` class) +```python +args: dict[str, Any] | None = None +``` + +**Parser** — In `JobTask.fromYaml()` (both Dart and Python), read `args` from the per-task map. + +--- + +### 1.4 Add `system_message` to Task model + +Currently exists on `ParsedTask` but not the output `Task` model. Promote it. + +**Dart** — `packages/dataset_config_dart/lib/src/models/task.dart` +```dart +@JsonKey(name: 'system_message') String? systemMessage, +``` + +**Python** — `packages/dataset_config_python/src/dataset_config_python/models/task.py` +```python +system_message: str | None = None +``` + +**Resolver** — `eval_set_resolver.dart` already puts `system_message` into Task metadata. After this change, set it as a first-class field on the Task object instead. + +--- + +### 1.5 Add `sandbox_parameters` to Task + +Pass-through dict for sandbox plugin configuration. + +**Dart** — `packages/dataset_config_dart/lib/src/models/task.dart` +```dart +@JsonKey(name: 'sandbox_parameters') Map? sandboxParameters, +``` + +**Python** — `packages/dataset_config_python/src/dataset_config_python/models/task.py` +```python +sandbox_parameters: dict[str, Any] | None = None +``` + +**Parser** — read `sandbox_parameters` from task.yaml. + +--- + +### 1.6 Rename `task_func` → `func` + +The YAML parser already aliases `func` → `task_func`. This renames the model field to match. 
+ +**Dart** — `packages/dataset_config_dart/lib/src/models/task.dart` +- Rename `taskFunc` → `func` +- Update `@JsonKey(name: 'task_func')` → `@JsonKey(name: 'func')` +- Regenerate `.freezed.dart` / `.g.dart` + +**Python** — `packages/dataset_config_python/src/dataset_config_python/models/task.py` +- Rename `task_func` → `func` + +**Other files to update:** +- `packages/dataset_config_dart/lib/src/parsed_task.dart` — `taskFunc` field and `copyWith` +- `packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart` — variable names referencing `taskFunc` +- `packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart` — `tc.taskFunc` +- `packages/devals_cli/lib/src/dataset/dry_run.dart` — references `task_func` +- `packages/dash_evals/src/dash_evals/runner/json_runner.py` — `task_def.get("task_func")` +- `packages/dataset_config_python/tests/test_config.py` — Task construction with `task_func=` +- `tool/config_parity/` — both `resolve_dart.dart` and `resolve_python.py` + +--- + +## 2. Parser/Resolver Changes + +### 2.1 Support `module:task` syntax + +Task function references can use `module.path:function_name` format. + +**Python** — `packages/dash_evals/src/dash_evals/runner/json_runner.py` +- Update `_resolve_task_func()` to split on `:` and import the module, then get the function by attribute name. + +**Dart parser** — `yaml_parser.dart` L53 already reads `func` as a string. No Dart change needed — the module resolution happens in the Python runner. + +--- + +### 2.2 Make sandbox registry configurable + +The hardcoded `kSandboxRegistry` and `kSdkChannels` in `eval_set_resolver.dart` (lines 25-42) need to become data-driven. + +**Approach:** +1. Move `kSandboxRegistry` and `kSdkChannels` out of the resolver +2. Add an optional `sandbox_registry` parameter to `EvalSetResolver.resolve()`, or make it a field on the resolver +3. The consuming project (dash_evals CLI) passes its sandbox registry when calling the resolver +4. 
Default to an empty registry if none provided (no sandbox resolution) + +**Files:** +- `packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart` — extract constants, add parameter +- `packages/devals_cli/` — pass the Flutter-specific registry when calling the resolver +- Python resolver (`packages/dataset_config_python/src/dataset_config_python/resolver.py`) — mirror the same approach + +--- + +### 2.3 Workspace: use native Inspect fields + +The `workspace` YAML key stays as parser sugar but resolves into Inspect's native `Sample.files` and `Sample.setup`. + +**Current behavior** (`eval_set_resolver.dart` L132-141): +```dart +if (workspace != null && isContainer) { + files = {...?files, '/workspace': workspace}; + setup = setup ?? 'cd /workspace && flutter pub get'; + enriched['workspace'] = '/workspace'; +} +``` + +**Change:** +- Make the auto-generated `setup` command configurable. Options: + - Add a `workspace_setup` field to Task YAML (e.g. `workspace_setup: "cd /workspace && npm install"`) + - Or: only auto-generate setup for tasks that have a Flutter-specific tag/metadata + - Or: remove auto-generation entirely; require the task author to specify `setup` if needed +- The resolver should still map `workspace` → `Sample.files['/workspace']`, but not assume Flutter. + +**Files:** +- `packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart` — update workspace → files mapping +- `packages/dataset_config_python/src/dataset_config_python/resolver.py` — mirror + +--- + +## 3. Tag-Based Filtering + +### 3.1 New `TagFilter` model + +**Dart** — new file `packages/dataset_config_dart/lib/src/models/tag_filter.dart` +```dart +@freezed +sealed class TagFilter with _$TagFilter { + const factory TagFilter({ + @JsonKey(name: 'include_tags') List? includeTags, + @JsonKey(name: 'exclude_tags') List? 
excludeTags, + }) = _TagFilter; + + factory TagFilter.fromJson(Map json) => + _$TagFilterFromJson(json); +} +``` + +**Python** — new file or add to `packages/dataset_config_python/src/dataset_config_python/models/tag_filter.py` +```python +class TagFilter(BaseModel): + include_tags: list[str] | None = None + exclude_tags: list[str] | None = None +``` + +**Shared matching function** (add to both languages): +```python +def matches_filter(item_tags: list[str], filter: TagFilter) -> bool: + if filter.include_tags and not all(t in item_tags for t in filter.include_tags): + return False + if filter.exclude_tags and any(t in item_tags for t in filter.exclude_tags): + return False + return True +``` + +### 3.2 Add filters to Job and Task + +**Job model:** +- `taskFilters: TagFilter?` / `task_filters: TagFilter | None` +- `sampleFilters: TagFilter?` / `sample_filters: TagFilter | None` + +**Task YAML (parser-level, not model):** +- `variant_filters: TagFilter?` — parsed from task.yaml, stored on `ParsedTask` + +### 3.3 Apply filters in resolver + +In `_expandTaskConfigs()` (`eval_set_resolver.dart` L418-493), add filtering steps: + +1. **Task filtering** (after L431): if `job.taskFilters` is set, check `taskConfig.metadata['tags']` against the filter +2. **Sample filtering** (after L460): if `job.sampleFilters` is set, filter samples by `sample.metadata['tags']` +3. **Variant filtering** (after L440): if `taskConfig.variantFilters` is set, check variant metadata tags + +These run alongside (not replacing) the existing ID-based filters. + +--- + +## 4. 
File Index + +All files that need modification, grouped by package: + +### `dataset_config_dart` +| File | Changes | +|---|---| +| `lib/src/models/job.dart` | Add `description`, `imagePrefix`, `taskFilters`, `sampleFilters` | +| `lib/src/models/job.dart` (JobTask) | Add `args` | +| `lib/src/models/task.dart` | Rename `taskFunc` → `func`, add `systemMessage`, `sandboxParameters` | +| `lib/src/models/tag_filter.dart` | **New file** — `TagFilter` model | +| `lib/src/models/models.dart` | Export `tag_filter.dart` | +| `lib/src/parsed_task.dart` | Rename `taskFunc` → `func`, add `variantFilters` | +| `lib/src/parsers/yaml_parser.dart` | Read new fields from YAML | +| `lib/src/resolvers/eval_set_resolver.dart` | Configurable sandbox registry, tag filtering, workspace setup | +| `test/` | Update tests for renamed fields and new features | + +### `dataset_config_python` +| File | Changes | +|---|---| +| `models/job.py` | Add `description`, `image_prefix`, `task_filters`, `sample_filters` | +| `models/job.py` (JobTask) | Add `args` | +| `models/task.py` | Rename `task_func` → `func`, add `system_message`, `sandbox_parameters` | +| `models/tag_filter.py` | **New file** — `TagFilter` model | +| `models/__init__.py` | Export `TagFilter` | +| `parser.py` | Read new fields from YAML | +| `resolver.py` | Configurable sandbox registry, tag filtering, workspace setup | +| `tests/test_config.py` | Update tests | + +### `dash_evals` (Python runner) +| File | Changes | +|---|---| +| `runner/json_runner.py` | `task_func` → `func`, `module:task` syntax support | + +### `devals_cli` (Dart CLI) +| File | Changes | +|---|---| +| `lib/src/dataset/dry_run.dart` | `task_func` → `func` references | + +### Other +| File | Changes | +|---|---| +| `tool/config_parity/` | Update both resolve scripts for renamed fields | +| `docs/reference/yaml_config.md` | Already updated | +| `CHANGELOG.md` | Already updated | +| `docs/guides/config.md` | Update after implementation | + +--- + +## 5. 
Verification + +### Automated +- Run `dart test` in `dataset_config_dart` +- Run `pytest` in `dataset_config_python` +- Run `tool/config_parity` to verify Dart/Python output parity +- Run `dart analyze` across workspace + +### Manual +- Verify `make html` in `docs/` builds without new errors +- Verify a sample job YAML with the new fields parses correctly +- Verify tag filtering produces expected task/sample subsets diff --git a/packages/dash_evals/src/dash_evals/runner/json_runner.py b/packages/dash_evals/src/dash_evals/runner/json_runner.py index a5d7a5b..7db7e89 100644 --- a/packages/dash_evals/src/dash_evals/runner/json_runner.py +++ b/packages/dash_evals/src/dash_evals/runner/json_runner.py @@ -146,13 +146,13 @@ def _run_single_manifest(manifest: dict) -> bool: task_instances: list[inspect_ai.Task] = [] for task_def in task_defs: - task_func_name = task_def.get("task_func") + task_func_name = task_def.get("func") task_name = task_def.get("name", task_func_name or "(unknown)") if not task_func_name: # Mode 2: hydrate directly from JSON (future) job_logger.warning( - f" ⚠ {task_name}: no task_func — Mode 2 hydration not yet supported" + f" ⚠ {task_name}: no func — Mode 2 hydration not yet supported" ) continue diff --git a/packages/dataset_config_dart/lib/src/models/context_file.g.dart b/packages/dataset_config_dart/lib/src/models/context_file.g.dart index fcea90e..7489275 100644 --- a/packages/dataset_config_dart/lib/src/models/context_file.g.dart +++ b/packages/dataset_config_dart/lib/src/models/context_file.g.dart @@ -37,7 +37,7 @@ _ContextFile _$ContextFileFromJson(Map json) => _ContextFile( Map _$ContextFileToJson(_ContextFile instance) => { - 'metadata': instance.metadata.toJson(), + 'metadata': instance.metadata, 'content': instance.content, 'file_path': instance.filePath, }; diff --git a/packages/dataset_config_dart/lib/src/models/dataset.g.dart b/packages/dataset_config_dart/lib/src/models/dataset.g.dart index 0b281d8..a3c87a3 100644 --- 
a/packages/dataset_config_dart/lib/src/models/dataset.g.dart +++ b/packages/dataset_config_dart/lib/src/models/dataset.g.dart @@ -18,7 +18,7 @@ _Dataset _$DatasetFromJson(Map json) => _Dataset( ); Map _$DatasetToJson(_Dataset instance) => { - 'samples': instance.samples.map((e) => e.toJson()).toList(), + 'samples': instance.samples, 'name': instance.name, 'location': instance.location, 'shuffled': instance.shuffled, diff --git a/packages/dataset_config_dart/lib/src/models/eval_log.g.dart b/packages/dataset_config_dart/lib/src/models/eval_log.g.dart index f6fa452..d55efb0 100644 --- a/packages/dataset_config_dart/lib/src/models/eval_log.g.dart +++ b/packages/dataset_config_dart/lib/src/models/eval_log.g.dart @@ -39,17 +39,17 @@ _EvalLog _$EvalLogFromJson(Map json) => _EvalLog( Map _$EvalLogToJson(_EvalLog instance) => { 'version': instance.version, 'status': instance.status, - 'eval': instance.eval.toJson(), - 'plan': instance.plan?.toJson(), - 'results': instance.results?.toJson(), - 'stats': instance.stats?.toJson(), - 'error': instance.error?.toJson(), + 'eval': instance.eval, + 'plan': instance.plan, + 'results': instance.results, + 'stats': instance.stats, + 'error': instance.error, 'invalidated': instance.invalidated, - 'samples': instance.samples?.map((e) => e.toJson()).toList(), - 'reductions': instance.reductions?.map((e) => e.toJson()).toList(), + 'samples': instance.samples, + 'reductions': instance.reductions, 'location': instance.location, 'etag': instance.etag, - 'eval_set_info': instance.evalSetInfo?.toJson(), + 'eval_set_info': instance.evalSetInfo, }; _EvalSpec _$EvalSpecFromJson(Map json) => _EvalSpec( @@ -125,15 +125,15 @@ Map _$EvalSpecToJson(_EvalSpec instance) => { 'solver_args': instance.solverArgs, 'solver_args_passed': instance.solverArgsPassed, 'tags': instance.tags, - 'dataset': instance.dataset?.toJson(), + 'dataset': instance.dataset, 'sandbox': instance.sandbox, 'model': instance.model, - 'model_generate_config': 
instance.modelGenerateConfig?.toJson(), + 'model_generate_config': instance.modelGenerateConfig, 'model_base_url': instance.modelBaseUrl, 'model_args': instance.modelArgs, 'model_roles': instance.modelRoles, - 'config': instance.config.toJson(), - 'revision': instance.revision?.toJson(), + 'config': instance.config, + 'revision': instance.revision, 'packages': instance.packages, 'metadata': instance.metadata, 'scorers': instance.scorers, @@ -249,9 +249,9 @@ _EvalPlan _$EvalPlanFromJson(Map json) => _EvalPlan( Map _$EvalPlanToJson(_EvalPlan instance) => { 'name': instance.name, - 'steps': instance.steps.map((e) => e.toJson()).toList(), - 'finish': instance.finish?.toJson(), - 'config': instance.config.toJson(), + 'steps': instance.steps, + 'finish': instance.finish, + 'config': instance.config, }; _EvalPlanStep _$EvalPlanStepFromJson(Map json) => @@ -291,12 +291,10 @@ Map _$EvalResultsToJson(_EvalResults instance) => { 'total_samples': instance.totalSamples, 'completed_samples': instance.completedSamples, - 'early_stopping': instance.earlyStopping?.toJson(), - 'scores': instance.scores.map((e) => e.toJson()).toList(), + 'early_stopping': instance.earlyStopping, + 'scores': instance.scores, 'metadata': instance.metadata, - 'sample_reductions': instance.sampleReductions - ?.map((e) => e.toJson()) - .toList(), + 'sample_reductions': instance.sampleReductions, }; _EarlyStoppingSummary _$EarlyStoppingSummaryFromJson( @@ -338,7 +336,7 @@ Map _$EvalScoreToJson(_EvalScore instance) => 'scored_samples': instance.scoredSamples, 'unscored_samples': instance.unscoredSamples, 'params': instance.params, - 'metrics': instance.metrics.map((e) => e.toJson()).toList(), + 'metrics': instance.metrics, 'metadata': instance.metadata, }; @@ -372,7 +370,7 @@ Map _$EvalSampleReductionsToJson( ) => { 'scorer': instance.scorer, 'reducer': instance.reducer, - 'samples': instance.samples.map((e) => e.toJson()).toList(), + 'samples': instance.samples, }; _EvalStats _$EvalStatsFromJson(Map json) 
=> _EvalStats( @@ -389,7 +387,7 @@ Map _$EvalStatsToJson(_EvalStats instance) => { 'started_at': instance.startedAt, 'completed_at': instance.completedAt, - 'model_usage': instance.modelUsage.map((k, e) => MapEntry(k, e.toJson())), + 'model_usage': instance.modelUsage, }; _EvalError _$EvalErrorFromJson(Map json) => _EvalError( @@ -470,22 +468,22 @@ Map _$EvalSampleToJson(_EvalSample instance) => 'sandbox': instance.sandbox, 'files': instance.files, 'setup': instance.setup, - 'messages': instance.messages.map((e) => e.toJson()).toList(), - 'output': instance.output.toJson(), - 'scores': instance.scores?.map((k, e) => MapEntry(k, e.toJson())), + 'messages': instance.messages, + 'output': instance.output, + 'scores': instance.scores, 'store': instance.store, 'events': instance.events, - 'model_usage': instance.modelUsage.map((k, e) => MapEntry(k, e.toJson())), + 'model_usage': instance.modelUsage, 'started_at': instance.startedAt, 'completed_at': instance.completedAt, 'total_time': instance.totalTime, 'working_time': instance.workingTime, 'uuid': instance.uuid, - 'invalidation': instance.invalidation?.toJson(), - 'error': instance.error?.toJson(), - 'error_retries': instance.errorRetries?.map((e) => e.toJson()).toList(), + 'invalidation': instance.invalidation, + 'error': instance.error, + 'error_retries': instance.errorRetries, 'attachments': instance.attachments, - 'limit': instance.limit?.toJson(), + 'limit': instance.limit, }; _ModelOutput _$ModelOutputFromJson(Map json) => _ModelOutput( @@ -511,14 +509,14 @@ _ModelOutput _$ModelOutputFromJson(Map json) => _ModelOutput( Map _$ModelOutputToJson(_ModelOutput instance) => { 'model': instance.model, - 'choices': instance.choices.map((e) => e.toJson()).toList(), - 'usage': instance.usage?.toJson(), + 'choices': instance.choices, + 'usage': instance.usage, 'completion': instance.completion, 'stop_reason': instance.stopReason, 'time': instance.time, 'metadata': instance.metadata, 'error': instance.error, - 'message': 
instance.message?.toJson(), + 'message': instance.message, }; _ChatCompletionChoice _$ChatCompletionChoiceFromJson( @@ -536,9 +534,9 @@ _ChatCompletionChoice _$ChatCompletionChoiceFromJson( Map _$ChatCompletionChoiceToJson( _ChatCompletionChoice instance, ) => { - 'message': instance.message.toJson(), + 'message': instance.message, 'stop_reason': instance.stopReason, - 'logprobs': instance.logprobs?.toJson(), + 'logprobs': instance.logprobs, }; _ModelUsage _$ModelUsageFromJson(Map json) => _ModelUsage( @@ -620,7 +618,7 @@ Map _$ChatMessageAssistantToJson( 'source': instance.source, 'metadata': instance.metadata, 'role': instance.role, - 'tool_calls': instance.toolCalls?.map((e) => e.toJson()).toList(), + 'tool_calls': instance.toolCalls, 'model': instance.model, }; @@ -647,7 +645,7 @@ Map _$ChatMessageToolToJson(ChatMessageTool instance) => 'role': instance.role, 'tool_call_id': instance.toolCallId, 'function': instance.function, - 'error': instance.error?.toJson(), + 'error': instance.error, }; ContentText _$ContentTextFromJson(Map json) => ContentText( @@ -932,7 +930,7 @@ _EvalSetInfo _$EvalSetInfoFromJson(Map json) => _EvalSetInfo( Map _$EvalSetInfoToJson(_EvalSetInfo instance) => { 'eval_set_id': instance.evalSetId, - 'tasks': instance.tasks.map((e) => e.toJson()).toList(), + 'tasks': instance.tasks, }; _EvalSetTask _$EvalSetTaskFromJson(Map json) => _EvalSetTask( diff --git a/packages/dataset_config_dart/lib/src/models/eval_set.g.dart b/packages/dataset_config_dart/lib/src/models/eval_set.g.dart index 7b0db55..4e91dab 100644 --- a/packages/dataset_config_dart/lib/src/models/eval_set.g.dart +++ b/packages/dataset_config_dart/lib/src/models/eval_set.g.dart @@ -64,7 +64,7 @@ _EvalSet _$EvalSetFromJson(Map json) => _EvalSet( ); Map _$EvalSetToJson(_EvalSet instance) => { - 'tasks': instance.tasks.map((e) => e.toJson()).toList(), + 'tasks': instance.tasks, 'log_dir': instance.logDir, 'retry_attempts': instance.retryAttempts, 'retry_wait': instance.retryWait, diff 
--git a/packages/dataset_config_dart/lib/src/models/job.dart b/packages/dataset_config_dart/lib/src/models/job.dart index 800f19c..0d8f49d 100644 --- a/packages/dataset_config_dart/lib/src/models/job.dart +++ b/packages/dataset_config_dart/lib/src/models/job.dart @@ -1,4 +1,5 @@ import 'package:freezed_annotation/freezed_annotation.dart'; +import 'tag_filter.dart'; part 'job.freezed.dart'; part 'job.g.dart'; @@ -45,6 +46,14 @@ sealed class Job with _$Job { // Core job settings // ------------------------------------------------------------------ + /// Human-readable description of this job. + String? description, + + /// Registry URL prefix prepended to image names during sandbox resolution. + /// + /// Example: `us-central1-docker.pkg.dev/project/repo/` + @JsonKey(name: 'image_prefix') String? imagePrefix, + /// Directory to write evaluation logs to. @JsonKey(name: 'log_dir') required String logDir, @@ -233,6 +242,16 @@ sealed class Job with _$Job { /// /// Per-task overrides (from `task.yaml`) take precedence. @JsonKey(name: 'task_defaults') Map? taskDefaults, + + // ------------------------------------------------------------------ + // Tag-based filtering + // ------------------------------------------------------------------ + + /// Tag filters applied to tasks. + @JsonKey(name: 'task_filters') TagFilter? taskFilters, + + /// Tag filters applied to samples. + @JsonKey(name: 'sample_filters') TagFilter? sampleFilters, }) = _Job; factory Job.fromJson(Map json) => _$JobFromJson(json); @@ -256,6 +275,9 @@ sealed class JobTask with _$JobTask { /// Override system message for this task. @JsonKey(name: 'system_message') String? systemMessage, + + /// Per-task argument overrides passed to the task function. + @JsonKey(name: 'args') Map? 
args, }) = _JobTask; factory JobTask.fromJson(Map json) => @@ -274,6 +296,7 @@ sealed class JobTask with _$JobTask { includeSamples: (data['include-samples'] as List?)?.cast(), excludeSamples: (data['exclude-samples'] as List?)?.cast(), systemMessage: data['system_message'] as String?, + args: (data['args'] as Map?)?.cast(), ); } } diff --git a/packages/dataset_config_dart/lib/src/models/job.freezed.dart b/packages/dataset_config_dart/lib/src/models/job.freezed.dart index e249877..4b955bd 100644 --- a/packages/dataset_config_dart/lib/src/models/job.freezed.dart +++ b/packages/dataset_config_dart/lib/src/models/job.freezed.dart @@ -18,7 +18,11 @@ mixin _$Job { // ------------------------------------------------------------------ // Core job settings // ------------------------------------------------------------------ -/// Directory to write evaluation logs to. +/// Human-readable description of this job. + String? get description;/// Registry URL prefix prepended to image names during sandbox resolution. +/// +/// Example: `us-central1-docker.pkg.dev/project/repo/` +@JsonKey(name: 'image_prefix') String? get imagePrefix;/// Directory to write evaluation logs to. @JsonKey(name: 'log_dir') String get logDir;/// Sandbox type: `'local'`, `'docker'`, or `'podman'`. @JsonKey(name: 'sandbox_type') String get sandboxType;/// Maximum concurrent API connections. @JsonKey(name: 'max_connections') int get maxConnections;/// Models to run. `null` means use defaults from registries. @@ -91,7 +95,12 @@ mixin _$Job { @JsonKey(name: 'eval_set_overrides') Map? get evalSetOverrides;/// Default `Task` kwargs applied to every task in this job. /// /// Per-task overrides (from `task.yaml`) take precedence. -@JsonKey(name: 'task_defaults') Map? get taskDefaults; +@JsonKey(name: 'task_defaults') Map? 
get taskDefaults;// ------------------------------------------------------------------ +// Tag-based filtering +// ------------------------------------------------------------------ +/// Tag filters applied to tasks. +@JsonKey(name: 'task_filters') TagFilter? get taskFilters;/// Tag filters applied to samples. +@JsonKey(name: 'sample_filters') TagFilter? get sampleFilters; /// Create a copy of Job /// with the given fields replaced by the non-null parameter values. @JsonKey(includeFromJson: false, includeToJson: false) @@ -104,16 +113,16 @@ $JobCopyWith get copyWith => _$JobCopyWithImpl(this as Job, _$identity @override bool operator ==(Object other) { - return identical(this, other) || (other.runtimeType == runtimeType&&other is Job&&(identical(other.logDir, logDir) || other.logDir == logDir)&&(identical(other.sandboxType, sandboxType) || other.sandboxType == sandboxType)&&(identical(other.maxConnections, maxConnections) || other.maxConnections == maxConnections)&&const DeepCollectionEquality().equals(other.models, models)&&const DeepCollectionEquality().equals(other.variants, variants)&&const DeepCollectionEquality().equals(other.taskPaths, taskPaths)&&const DeepCollectionEquality().equals(other.tasks, tasks)&&(identical(other.saveExamples, saveExamples) || other.saveExamples == saveExamples)&&(identical(other.retryAttempts, retryAttempts) || other.retryAttempts == retryAttempts)&&(identical(other.maxRetries, maxRetries) || other.maxRetries == maxRetries)&&(identical(other.retryWait, retryWait) || other.retryWait == retryWait)&&(identical(other.retryConnections, retryConnections) || other.retryConnections == retryConnections)&&(identical(other.retryCleanup, retryCleanup) || other.retryCleanup == retryCleanup)&&(identical(other.failOnError, failOnError) || other.failOnError == failOnError)&&(identical(other.continueOnFail, continueOnFail) || other.continueOnFail == continueOnFail)&&(identical(other.retryOnError, retryOnError) || other.retryOnError == 
retryOnError)&&(identical(other.debugErrors, debugErrors) || other.debugErrors == debugErrors)&&(identical(other.maxSamples, maxSamples) || other.maxSamples == maxSamples)&&(identical(other.maxTasks, maxTasks) || other.maxTasks == maxTasks)&&(identical(other.maxSubprocesses, maxSubprocesses) || other.maxSubprocesses == maxSubprocesses)&&(identical(other.maxSandboxes, maxSandboxes) || other.maxSandboxes == maxSandboxes)&&(identical(other.logLevel, logLevel) || other.logLevel == logLevel)&&(identical(other.logLevelTranscript, logLevelTranscript) || other.logLevelTranscript == logLevelTranscript)&&(identical(other.logFormat, logFormat) || other.logFormat == logFormat)&&const DeepCollectionEquality().equals(other.tags, tags)&&const DeepCollectionEquality().equals(other.metadata, metadata)&&(identical(other.trace, trace) || other.trace == trace)&&(identical(other.display, display) || other.display == display)&&(identical(other.score, score) || other.score == score)&&const DeepCollectionEquality().equals(other.limit, limit)&&const DeepCollectionEquality().equals(other.sampleId, sampleId)&&const DeepCollectionEquality().equals(other.sampleShuffle, sampleShuffle)&&const DeepCollectionEquality().equals(other.epochs, epochs)&&const DeepCollectionEquality().equals(other.approval, approval)&&const DeepCollectionEquality().equals(other.solver, solver)&&(identical(other.sandboxCleanup, sandboxCleanup) || other.sandboxCleanup == sandboxCleanup)&&(identical(other.modelBaseUrl, modelBaseUrl) || other.modelBaseUrl == modelBaseUrl)&&const DeepCollectionEquality().equals(other.modelArgs, modelArgs)&&const DeepCollectionEquality().equals(other.modelRoles, modelRoles)&&const DeepCollectionEquality().equals(other.taskArgs, taskArgs)&&(identical(other.messageLimit, messageLimit) || other.messageLimit == messageLimit)&&(identical(other.tokenLimit, tokenLimit) || other.tokenLimit == tokenLimit)&&(identical(other.timeLimit, timeLimit) || other.timeLimit == 
timeLimit)&&(identical(other.workingLimit, workingLimit) || other.workingLimit == workingLimit)&&(identical(other.costLimit, costLimit) || other.costLimit == costLimit)&&const DeepCollectionEquality().equals(other.modelCostConfig, modelCostConfig)&&(identical(other.logSamples, logSamples) || other.logSamples == logSamples)&&(identical(other.logRealtime, logRealtime) || other.logRealtime == logRealtime)&&(identical(other.logImages, logImages) || other.logImages == logImages)&&(identical(other.logBuffer, logBuffer) || other.logBuffer == logBuffer)&&(identical(other.logShared, logShared) || other.logShared == logShared)&&(identical(other.bundleDir, bundleDir) || other.bundleDir == bundleDir)&&(identical(other.bundleOverwrite, bundleOverwrite) || other.bundleOverwrite == bundleOverwrite)&&(identical(other.logDirAllowDirty, logDirAllowDirty) || other.logDirAllowDirty == logDirAllowDirty)&&(identical(other.evalSetId, evalSetId) || other.evalSetId == evalSetId)&&const DeepCollectionEquality().equals(other.evalSetOverrides, evalSetOverrides)&&const DeepCollectionEquality().equals(other.taskDefaults, taskDefaults)); + return identical(this, other) || (other.runtimeType == runtimeType&&other is Job&&(identical(other.description, description) || other.description == description)&&(identical(other.imagePrefix, imagePrefix) || other.imagePrefix == imagePrefix)&&(identical(other.logDir, logDir) || other.logDir == logDir)&&(identical(other.sandboxType, sandboxType) || other.sandboxType == sandboxType)&&(identical(other.maxConnections, maxConnections) || other.maxConnections == maxConnections)&&const DeepCollectionEquality().equals(other.models, models)&&const DeepCollectionEquality().equals(other.variants, variants)&&const DeepCollectionEquality().equals(other.taskPaths, taskPaths)&&const DeepCollectionEquality().equals(other.tasks, tasks)&&(identical(other.saveExamples, saveExamples) || other.saveExamples == saveExamples)&&(identical(other.retryAttempts, retryAttempts) || 
other.retryAttempts == retryAttempts)&&(identical(other.maxRetries, maxRetries) || other.maxRetries == maxRetries)&&(identical(other.retryWait, retryWait) || other.retryWait == retryWait)&&(identical(other.retryConnections, retryConnections) || other.retryConnections == retryConnections)&&(identical(other.retryCleanup, retryCleanup) || other.retryCleanup == retryCleanup)&&(identical(other.failOnError, failOnError) || other.failOnError == failOnError)&&(identical(other.continueOnFail, continueOnFail) || other.continueOnFail == continueOnFail)&&(identical(other.retryOnError, retryOnError) || other.retryOnError == retryOnError)&&(identical(other.debugErrors, debugErrors) || other.debugErrors == debugErrors)&&(identical(other.maxSamples, maxSamples) || other.maxSamples == maxSamples)&&(identical(other.maxTasks, maxTasks) || other.maxTasks == maxTasks)&&(identical(other.maxSubprocesses, maxSubprocesses) || other.maxSubprocesses == maxSubprocesses)&&(identical(other.maxSandboxes, maxSandboxes) || other.maxSandboxes == maxSandboxes)&&(identical(other.logLevel, logLevel) || other.logLevel == logLevel)&&(identical(other.logLevelTranscript, logLevelTranscript) || other.logLevelTranscript == logLevelTranscript)&&(identical(other.logFormat, logFormat) || other.logFormat == logFormat)&&const DeepCollectionEquality().equals(other.tags, tags)&&const DeepCollectionEquality().equals(other.metadata, metadata)&&(identical(other.trace, trace) || other.trace == trace)&&(identical(other.display, display) || other.display == display)&&(identical(other.score, score) || other.score == score)&&const DeepCollectionEquality().equals(other.limit, limit)&&const DeepCollectionEquality().equals(other.sampleId, sampleId)&&const DeepCollectionEquality().equals(other.sampleShuffle, sampleShuffle)&&const DeepCollectionEquality().equals(other.epochs, epochs)&&const DeepCollectionEquality().equals(other.approval, approval)&&const DeepCollectionEquality().equals(other.solver, 
solver)&&(identical(other.sandboxCleanup, sandboxCleanup) || other.sandboxCleanup == sandboxCleanup)&&(identical(other.modelBaseUrl, modelBaseUrl) || other.modelBaseUrl == modelBaseUrl)&&const DeepCollectionEquality().equals(other.modelArgs, modelArgs)&&const DeepCollectionEquality().equals(other.modelRoles, modelRoles)&&const DeepCollectionEquality().equals(other.taskArgs, taskArgs)&&(identical(other.messageLimit, messageLimit) || other.messageLimit == messageLimit)&&(identical(other.tokenLimit, tokenLimit) || other.tokenLimit == tokenLimit)&&(identical(other.timeLimit, timeLimit) || other.timeLimit == timeLimit)&&(identical(other.workingLimit, workingLimit) || other.workingLimit == workingLimit)&&(identical(other.costLimit, costLimit) || other.costLimit == costLimit)&&const DeepCollectionEquality().equals(other.modelCostConfig, modelCostConfig)&&(identical(other.logSamples, logSamples) || other.logSamples == logSamples)&&(identical(other.logRealtime, logRealtime) || other.logRealtime == logRealtime)&&(identical(other.logImages, logImages) || other.logImages == logImages)&&(identical(other.logBuffer, logBuffer) || other.logBuffer == logBuffer)&&(identical(other.logShared, logShared) || other.logShared == logShared)&&(identical(other.bundleDir, bundleDir) || other.bundleDir == bundleDir)&&(identical(other.bundleOverwrite, bundleOverwrite) || other.bundleOverwrite == bundleOverwrite)&&(identical(other.logDirAllowDirty, logDirAllowDirty) || other.logDirAllowDirty == logDirAllowDirty)&&(identical(other.evalSetId, evalSetId) || other.evalSetId == evalSetId)&&const DeepCollectionEquality().equals(other.evalSetOverrides, evalSetOverrides)&&const DeepCollectionEquality().equals(other.taskDefaults, taskDefaults)&&(identical(other.taskFilters, taskFilters) || other.taskFilters == taskFilters)&&(identical(other.sampleFilters, sampleFilters) || other.sampleFilters == sampleFilters)); } @JsonKey(includeFromJson: false, includeToJson: false) @override -int get hashCode => 
Object.hashAll([runtimeType,logDir,sandboxType,maxConnections,const DeepCollectionEquality().hash(models),const DeepCollectionEquality().hash(variants),const DeepCollectionEquality().hash(taskPaths),const DeepCollectionEquality().hash(tasks),saveExamples,retryAttempts,maxRetries,retryWait,retryConnections,retryCleanup,failOnError,continueOnFail,retryOnError,debugErrors,maxSamples,maxTasks,maxSubprocesses,maxSandboxes,logLevel,logLevelTranscript,logFormat,const DeepCollectionEquality().hash(tags),const DeepCollectionEquality().hash(metadata),trace,display,score,const DeepCollectionEquality().hash(limit),const DeepCollectionEquality().hash(sampleId),const DeepCollectionEquality().hash(sampleShuffle),const DeepCollectionEquality().hash(epochs),const DeepCollectionEquality().hash(approval),const DeepCollectionEquality().hash(solver),sandboxCleanup,modelBaseUrl,const DeepCollectionEquality().hash(modelArgs),const DeepCollectionEquality().hash(modelRoles),const DeepCollectionEquality().hash(taskArgs),messageLimit,tokenLimit,timeLimit,workingLimit,costLimit,const DeepCollectionEquality().hash(modelCostConfig),logSamples,logRealtime,logImages,logBuffer,logShared,bundleDir,bundleOverwrite,logDirAllowDirty,evalSetId,const DeepCollectionEquality().hash(evalSetOverrides),const DeepCollectionEquality().hash(taskDefaults)]); +int get hashCode => Object.hashAll([runtimeType,description,imagePrefix,logDir,sandboxType,maxConnections,const DeepCollectionEquality().hash(models),const DeepCollectionEquality().hash(variants),const DeepCollectionEquality().hash(taskPaths),const DeepCollectionEquality().hash(tasks),saveExamples,retryAttempts,maxRetries,retryWait,retryConnections,retryCleanup,failOnError,continueOnFail,retryOnError,debugErrors,maxSamples,maxTasks,maxSubprocesses,maxSandboxes,logLevel,logLevelTranscript,logFormat,const DeepCollectionEquality().hash(tags),const DeepCollectionEquality().hash(metadata),trace,display,score,const DeepCollectionEquality().hash(limit),const 
DeepCollectionEquality().hash(sampleId),const DeepCollectionEquality().hash(sampleShuffle),const DeepCollectionEquality().hash(epochs),const DeepCollectionEquality().hash(approval),const DeepCollectionEquality().hash(solver),sandboxCleanup,modelBaseUrl,const DeepCollectionEquality().hash(modelArgs),const DeepCollectionEquality().hash(modelRoles),const DeepCollectionEquality().hash(taskArgs),messageLimit,tokenLimit,timeLimit,workingLimit,costLimit,const DeepCollectionEquality().hash(modelCostConfig),logSamples,logRealtime,logImages,logBuffer,logShared,bundleDir,bundleOverwrite,logDirAllowDirty,evalSetId,const DeepCollectionEquality().hash(evalSetOverrides),const DeepCollectionEquality().hash(taskDefaults),taskFilters,sampleFilters]); @override String toString() { - return 'Job(logDir: $logDir, sandboxType: $sandboxType, maxConnections: $maxConnections, models: $models, variants: $variants, taskPaths: $taskPaths, tasks: $tasks, saveExamples: $saveExamples, retryAttempts: $retryAttempts, maxRetries: $maxRetries, retryWait: $retryWait, retryConnections: $retryConnections, retryCleanup: $retryCleanup, failOnError: $failOnError, continueOnFail: $continueOnFail, retryOnError: $retryOnError, debugErrors: $debugErrors, maxSamples: $maxSamples, maxTasks: $maxTasks, maxSubprocesses: $maxSubprocesses, maxSandboxes: $maxSandboxes, logLevel: $logLevel, logLevelTranscript: $logLevelTranscript, logFormat: $logFormat, tags: $tags, metadata: $metadata, trace: $trace, display: $display, score: $score, limit: $limit, sampleId: $sampleId, sampleShuffle: $sampleShuffle, epochs: $epochs, approval: $approval, solver: $solver, sandboxCleanup: $sandboxCleanup, modelBaseUrl: $modelBaseUrl, modelArgs: $modelArgs, modelRoles: $modelRoles, taskArgs: $taskArgs, messageLimit: $messageLimit, tokenLimit: $tokenLimit, timeLimit: $timeLimit, workingLimit: $workingLimit, costLimit: $costLimit, modelCostConfig: $modelCostConfig, logSamples: $logSamples, logRealtime: $logRealtime, logImages: $logImages, 
logBuffer: $logBuffer, logShared: $logShared, bundleDir: $bundleDir, bundleOverwrite: $bundleOverwrite, logDirAllowDirty: $logDirAllowDirty, evalSetId: $evalSetId, evalSetOverrides: $evalSetOverrides, taskDefaults: $taskDefaults)'; + return 'Job(description: $description, imagePrefix: $imagePrefix, logDir: $logDir, sandboxType: $sandboxType, maxConnections: $maxConnections, models: $models, variants: $variants, taskPaths: $taskPaths, tasks: $tasks, saveExamples: $saveExamples, retryAttempts: $retryAttempts, maxRetries: $maxRetries, retryWait: $retryWait, retryConnections: $retryConnections, retryCleanup: $retryCleanup, failOnError: $failOnError, continueOnFail: $continueOnFail, retryOnError: $retryOnError, debugErrors: $debugErrors, maxSamples: $maxSamples, maxTasks: $maxTasks, maxSubprocesses: $maxSubprocesses, maxSandboxes: $maxSandboxes, logLevel: $logLevel, logLevelTranscript: $logLevelTranscript, logFormat: $logFormat, tags: $tags, metadata: $metadata, trace: $trace, display: $display, score: $score, limit: $limit, sampleId: $sampleId, sampleShuffle: $sampleShuffle, epochs: $epochs, approval: $approval, solver: $solver, sandboxCleanup: $sandboxCleanup, modelBaseUrl: $modelBaseUrl, modelArgs: $modelArgs, modelRoles: $modelRoles, taskArgs: $taskArgs, messageLimit: $messageLimit, tokenLimit: $tokenLimit, timeLimit: $timeLimit, workingLimit: $workingLimit, costLimit: $costLimit, modelCostConfig: $modelCostConfig, logSamples: $logSamples, logRealtime: $logRealtime, logImages: $logImages, logBuffer: $logBuffer, logShared: $logShared, bundleDir: $bundleDir, bundleOverwrite: $bundleOverwrite, logDirAllowDirty: $logDirAllowDirty, evalSetId: $evalSetId, evalSetOverrides: $evalSetOverrides, taskDefaults: $taskDefaults, taskFilters: $taskFilters, sampleFilters: $sampleFilters)'; } @@ -124,11 +133,11 @@ abstract mixin class $JobCopyWith<$Res> { factory $JobCopyWith(Job value, $Res Function(Job) _then) = _$JobCopyWithImpl; @useResult $Res call({ -@JsonKey(name: 'log_dir') 
String logDir,@JsonKey(name: 'sandbox_type') String sandboxType,@JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants,@JsonKey(name: 'task_paths') List? taskPaths, Map? tasks,@JsonKey(name: 'save_examples') bool saveExamples,@JsonKey(name: 'retry_attempts') int? retryAttempts,@JsonKey(name: 'max_retries') int? maxRetries,@JsonKey(name: 'retry_wait') double? retryWait,@JsonKey(name: 'retry_connections') double? retryConnections,@JsonKey(name: 'retry_cleanup') bool? retryCleanup,@JsonKey(name: 'fail_on_error') double? failOnError,@JsonKey(name: 'continue_on_fail') bool? continueOnFail,@JsonKey(name: 'retry_on_error') int? retryOnError,@JsonKey(name: 'debug_errors') bool? debugErrors,@JsonKey(name: 'max_samples') int? maxSamples,@JsonKey(name: 'max_tasks') int? maxTasks,@JsonKey(name: 'max_subprocesses') int? maxSubprocesses,@JsonKey(name: 'max_sandboxes') int? maxSandboxes,@JsonKey(name: 'log_level') String? logLevel,@JsonKey(name: 'log_level_transcript') String? logLevelTranscript,@JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit,@JsonKey(name: 'sample_id') Object? sampleId,@JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver,@JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup,@JsonKey(name: 'model_base_url') String? modelBaseUrl,@JsonKey(name: 'model_args') Map? modelArgs,@JsonKey(name: 'model_roles') Map? modelRoles,@JsonKey(name: 'task_args') Map? taskArgs,@JsonKey(name: 'message_limit') int? messageLimit,@JsonKey(name: 'token_limit') int? tokenLimit,@JsonKey(name: 'time_limit') int? timeLimit,@JsonKey(name: 'working_limit') int? workingLimit,@JsonKey(name: 'cost_limit') double? costLimit,@JsonKey(name: 'model_cost_config') Map? modelCostConfig,@JsonKey(name: 'log_samples') bool? logSamples,@JsonKey(name: 'log_realtime') bool? logRealtime,@JsonKey(name: 'log_images') bool? 
logImages,@JsonKey(name: 'log_buffer') int? logBuffer,@JsonKey(name: 'log_shared') int? logShared,@JsonKey(name: 'bundle_dir') String? bundleDir,@JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite,@JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty,@JsonKey(name: 'eval_set_id') String? evalSetId,@JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides,@JsonKey(name: 'task_defaults') Map? taskDefaults + String? description,@JsonKey(name: 'image_prefix') String? imagePrefix,@JsonKey(name: 'log_dir') String logDir,@JsonKey(name: 'sandbox_type') String sandboxType,@JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants,@JsonKey(name: 'task_paths') List? taskPaths, Map? tasks,@JsonKey(name: 'save_examples') bool saveExamples,@JsonKey(name: 'retry_attempts') int? retryAttempts,@JsonKey(name: 'max_retries') int? maxRetries,@JsonKey(name: 'retry_wait') double? retryWait,@JsonKey(name: 'retry_connections') double? retryConnections,@JsonKey(name: 'retry_cleanup') bool? retryCleanup,@JsonKey(name: 'fail_on_error') double? failOnError,@JsonKey(name: 'continue_on_fail') bool? continueOnFail,@JsonKey(name: 'retry_on_error') int? retryOnError,@JsonKey(name: 'debug_errors') bool? debugErrors,@JsonKey(name: 'max_samples') int? maxSamples,@JsonKey(name: 'max_tasks') int? maxTasks,@JsonKey(name: 'max_subprocesses') int? maxSubprocesses,@JsonKey(name: 'max_sandboxes') int? maxSandboxes,@JsonKey(name: 'log_level') String? logLevel,@JsonKey(name: 'log_level_transcript') String? logLevelTranscript,@JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit,@JsonKey(name: 'sample_id') Object? sampleId,@JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver,@JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup,@JsonKey(name: 'model_base_url') String? modelBaseUrl,@JsonKey(name: 'model_args') Map? 
modelArgs,@JsonKey(name: 'model_roles') Map? modelRoles,@JsonKey(name: 'task_args') Map? taskArgs,@JsonKey(name: 'message_limit') int? messageLimit,@JsonKey(name: 'token_limit') int? tokenLimit,@JsonKey(name: 'time_limit') int? timeLimit,@JsonKey(name: 'working_limit') int? workingLimit,@JsonKey(name: 'cost_limit') double? costLimit,@JsonKey(name: 'model_cost_config') Map? modelCostConfig,@JsonKey(name: 'log_samples') bool? logSamples,@JsonKey(name: 'log_realtime') bool? logRealtime,@JsonKey(name: 'log_images') bool? logImages,@JsonKey(name: 'log_buffer') int? logBuffer,@JsonKey(name: 'log_shared') int? logShared,@JsonKey(name: 'bundle_dir') String? bundleDir,@JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite,@JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty,@JsonKey(name: 'eval_set_id') String? evalSetId,@JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides,@JsonKey(name: 'task_defaults') Map? taskDefaults,@JsonKey(name: 'task_filters') TagFilter? taskFilters,@JsonKey(name: 'sample_filters') TagFilter? sampleFilters }); - +$TagFilterCopyWith<$Res>? get taskFilters;$TagFilterCopyWith<$Res>? get sampleFilters; } /// @nodoc @@ -141,9 +150,11 @@ class _$JobCopyWithImpl<$Res> /// Create a copy of Job /// with the given fields replaced by the non-null parameter values. -@pragma('vm:prefer-inline') @override $Res call({Object? logDir = null,Object? sandboxType = null,Object? maxConnections = null,Object? models = freezed,Object? variants = freezed,Object? taskPaths = freezed,Object? tasks = freezed,Object? saveExamples = null,Object? retryAttempts = freezed,Object? maxRetries = freezed,Object? retryWait = freezed,Object? retryConnections = freezed,Object? retryCleanup = freezed,Object? failOnError = freezed,Object? continueOnFail = freezed,Object? retryOnError = freezed,Object? debugErrors = freezed,Object? maxSamples = freezed,Object? maxTasks = freezed,Object? maxSubprocesses = freezed,Object? maxSandboxes = freezed,Object? 
logLevel = freezed,Object? logLevelTranscript = freezed,Object? logFormat = freezed,Object? tags = freezed,Object? metadata = freezed,Object? trace = freezed,Object? display = freezed,Object? score = freezed,Object? limit = freezed,Object? sampleId = freezed,Object? sampleShuffle = freezed,Object? epochs = freezed,Object? approval = freezed,Object? solver = freezed,Object? sandboxCleanup = freezed,Object? modelBaseUrl = freezed,Object? modelArgs = freezed,Object? modelRoles = freezed,Object? taskArgs = freezed,Object? messageLimit = freezed,Object? tokenLimit = freezed,Object? timeLimit = freezed,Object? workingLimit = freezed,Object? costLimit = freezed,Object? modelCostConfig = freezed,Object? logSamples = freezed,Object? logRealtime = freezed,Object? logImages = freezed,Object? logBuffer = freezed,Object? logShared = freezed,Object? bundleDir = freezed,Object? bundleOverwrite = freezed,Object? logDirAllowDirty = freezed,Object? evalSetId = freezed,Object? evalSetOverrides = freezed,Object? taskDefaults = freezed,}) { +@pragma('vm:prefer-inline') @override $Res call({Object? description = freezed,Object? imagePrefix = freezed,Object? logDir = null,Object? sandboxType = null,Object? maxConnections = null,Object? models = freezed,Object? variants = freezed,Object? taskPaths = freezed,Object? tasks = freezed,Object? saveExamples = null,Object? retryAttempts = freezed,Object? maxRetries = freezed,Object? retryWait = freezed,Object? retryConnections = freezed,Object? retryCleanup = freezed,Object? failOnError = freezed,Object? continueOnFail = freezed,Object? retryOnError = freezed,Object? debugErrors = freezed,Object? maxSamples = freezed,Object? maxTasks = freezed,Object? maxSubprocesses = freezed,Object? maxSandboxes = freezed,Object? logLevel = freezed,Object? logLevelTranscript = freezed,Object? logFormat = freezed,Object? tags = freezed,Object? metadata = freezed,Object? trace = freezed,Object? display = freezed,Object? score = freezed,Object? 
limit = freezed,Object? sampleId = freezed,Object? sampleShuffle = freezed,Object? epochs = freezed,Object? approval = freezed,Object? solver = freezed,Object? sandboxCleanup = freezed,Object? modelBaseUrl = freezed,Object? modelArgs = freezed,Object? modelRoles = freezed,Object? taskArgs = freezed,Object? messageLimit = freezed,Object? tokenLimit = freezed,Object? timeLimit = freezed,Object? workingLimit = freezed,Object? costLimit = freezed,Object? modelCostConfig = freezed,Object? logSamples = freezed,Object? logRealtime = freezed,Object? logImages = freezed,Object? logBuffer = freezed,Object? logShared = freezed,Object? bundleDir = freezed,Object? bundleOverwrite = freezed,Object? logDirAllowDirty = freezed,Object? evalSetId = freezed,Object? evalSetOverrides = freezed,Object? taskDefaults = freezed,Object? taskFilters = freezed,Object? sampleFilters = freezed,}) { return _then(_self.copyWith( -logDir: null == logDir ? _self.logDir : logDir // ignore: cast_nullable_to_non_nullable +description: freezed == description ? _self.description : description // ignore: cast_nullable_to_non_nullable +as String?,imagePrefix: freezed == imagePrefix ? _self.imagePrefix : imagePrefix // ignore: cast_nullable_to_non_nullable +as String?,logDir: null == logDir ? _self.logDir : logDir // ignore: cast_nullable_to_non_nullable as String,sandboxType: null == sandboxType ? _self.sandboxType : sandboxType // ignore: cast_nullable_to_non_nullable as String,maxConnections: null == maxConnections ? _self.maxConnections : maxConnections // ignore: cast_nullable_to_non_nullable as int,models: freezed == models ? _self.models : models // ignore: cast_nullable_to_non_nullable @@ -194,10 +205,36 @@ as bool?,logDirAllowDirty: freezed == logDirAllowDirty ? _self.logDirAllowDirty as bool?,evalSetId: freezed == evalSetId ? _self.evalSetId : evalSetId // ignore: cast_nullable_to_non_nullable as String?,evalSetOverrides: freezed == evalSetOverrides ? 
_self.evalSetOverrides : evalSetOverrides // ignore: cast_nullable_to_non_nullable as Map?,taskDefaults: freezed == taskDefaults ? _self.taskDefaults : taskDefaults // ignore: cast_nullable_to_non_nullable -as Map?, +as Map?,taskFilters: freezed == taskFilters ? _self.taskFilters : taskFilters // ignore: cast_nullable_to_non_nullable +as TagFilter?,sampleFilters: freezed == sampleFilters ? _self.sampleFilters : sampleFilters // ignore: cast_nullable_to_non_nullable +as TagFilter?, )); } - +/// Create a copy of Job +/// with the given fields replaced by the non-null parameter values. +@override +@pragma('vm:prefer-inline') +$TagFilterCopyWith<$Res>? get taskFilters { + if (_self.taskFilters == null) { + return null; + } + + return $TagFilterCopyWith<$Res>(_self.taskFilters!, (value) { + return _then(_self.copyWith(taskFilters: value)); + }); +}/// Create a copy of Job +/// with the given fields replaced by the non-null parameter values. +@override +@pragma('vm:prefer-inline') +$TagFilterCopyWith<$Res>? get sampleFilters { + if (_self.sampleFilters == null) { + return null; + } + + return $TagFilterCopyWith<$Res>(_self.sampleFilters!, (value) { + return _then(_self.copyWith(sampleFilters: value)); + }); +} } @@ -276,10 +313,10 @@ return $default(_that);case _: /// } /// ``` -@optionalTypeArgs TResult maybeWhen(TResult Function(@JsonKey(name: 'log_dir') String logDir, @JsonKey(name: 'sandbox_type') String sandboxType, @JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants, @JsonKey(name: 'task_paths') List? taskPaths, Map? tasks, @JsonKey(name: 'save_examples') bool saveExamples, @JsonKey(name: 'retry_attempts') int? retryAttempts, @JsonKey(name: 'max_retries') int? maxRetries, @JsonKey(name: 'retry_wait') double? retryWait, @JsonKey(name: 'retry_connections') double? retryConnections, @JsonKey(name: 'retry_cleanup') bool? retryCleanup, @JsonKey(name: 'fail_on_error') double? failOnError, @JsonKey(name: 'continue_on_fail') bool? 
continueOnFail, @JsonKey(name: 'retry_on_error') int? retryOnError, @JsonKey(name: 'debug_errors') bool? debugErrors, @JsonKey(name: 'max_samples') int? maxSamples, @JsonKey(name: 'max_tasks') int? maxTasks, @JsonKey(name: 'max_subprocesses') int? maxSubprocesses, @JsonKey(name: 'max_sandboxes') int? maxSandboxes, @JsonKey(name: 'log_level') String? logLevel, @JsonKey(name: 'log_level_transcript') String? logLevelTranscript, @JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit, @JsonKey(name: 'sample_id') Object? sampleId, @JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver, @JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup, @JsonKey(name: 'model_base_url') String? modelBaseUrl, @JsonKey(name: 'model_args') Map? modelArgs, @JsonKey(name: 'model_roles') Map? modelRoles, @JsonKey(name: 'task_args') Map? taskArgs, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'model_cost_config') Map? modelCostConfig, @JsonKey(name: 'log_samples') bool? logSamples, @JsonKey(name: 'log_realtime') bool? logRealtime, @JsonKey(name: 'log_images') bool? logImages, @JsonKey(name: 'log_buffer') int? logBuffer, @JsonKey(name: 'log_shared') int? logShared, @JsonKey(name: 'bundle_dir') String? bundleDir, @JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite, @JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty, @JsonKey(name: 'eval_set_id') String? evalSetId, @JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides, @JsonKey(name: 'task_defaults') Map? taskDefaults)? $default,{required TResult orElse(),}) {final _that = this; +@optionalTypeArgs TResult maybeWhen(TResult Function( String? description, @JsonKey(name: 'image_prefix') String? 
imagePrefix, @JsonKey(name: 'log_dir') String logDir, @JsonKey(name: 'sandbox_type') String sandboxType, @JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants, @JsonKey(name: 'task_paths') List? taskPaths, Map? tasks, @JsonKey(name: 'save_examples') bool saveExamples, @JsonKey(name: 'retry_attempts') int? retryAttempts, @JsonKey(name: 'max_retries') int? maxRetries, @JsonKey(name: 'retry_wait') double? retryWait, @JsonKey(name: 'retry_connections') double? retryConnections, @JsonKey(name: 'retry_cleanup') bool? retryCleanup, @JsonKey(name: 'fail_on_error') double? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'retry_on_error') int? retryOnError, @JsonKey(name: 'debug_errors') bool? debugErrors, @JsonKey(name: 'max_samples') int? maxSamples, @JsonKey(name: 'max_tasks') int? maxTasks, @JsonKey(name: 'max_subprocesses') int? maxSubprocesses, @JsonKey(name: 'max_sandboxes') int? maxSandboxes, @JsonKey(name: 'log_level') String? logLevel, @JsonKey(name: 'log_level_transcript') String? logLevelTranscript, @JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit, @JsonKey(name: 'sample_id') Object? sampleId, @JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver, @JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup, @JsonKey(name: 'model_base_url') String? modelBaseUrl, @JsonKey(name: 'model_args') Map? modelArgs, @JsonKey(name: 'model_roles') Map? modelRoles, @JsonKey(name: 'task_args') Map? taskArgs, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'model_cost_config') Map? modelCostConfig, @JsonKey(name: 'log_samples') bool? logSamples, @JsonKey(name: 'log_realtime') bool? 
logRealtime, @JsonKey(name: 'log_images') bool? logImages, @JsonKey(name: 'log_buffer') int? logBuffer, @JsonKey(name: 'log_shared') int? logShared, @JsonKey(name: 'bundle_dir') String? bundleDir, @JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite, @JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty, @JsonKey(name: 'eval_set_id') String? evalSetId, @JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides, @JsonKey(name: 'task_defaults') Map? taskDefaults, @JsonKey(name: 'task_filters') TagFilter? taskFilters, @JsonKey(name: 'sample_filters') TagFilter? sampleFilters)? $default,{required TResult orElse(),}) {final _that = this; switch (_that) { case _Job() when $default != null: -return $default(_that.logDir,_that.sandboxType,_that.maxConnections,_that.models,_that.variants,_that.taskPaths,_that.tasks,_that.saveExamples,_that.retryAttempts,_that.maxRetries,_that.retryWait,_that.retryConnections,_that.retryCleanup,_that.failOnError,_that.continueOnFail,_that.retryOnError,_that.debugErrors,_that.maxSamples,_that.maxTasks,_that.maxSubprocesses,_that.maxSandboxes,_that.logLevel,_that.logLevelTranscript,_that.logFormat,_that.tags,_that.metadata,_that.trace,_that.display,_that.score,_that.limit,_that.sampleId,_that.sampleShuffle,_that.epochs,_that.approval,_that.solver,_that.sandboxCleanup,_that.modelBaseUrl,_that.modelArgs,_that.modelRoles,_that.taskArgs,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.modelCostConfig,_that.logSamples,_that.logRealtime,_that.logImages,_that.logBuffer,_that.logShared,_that.bundleDir,_that.bundleOverwrite,_that.logDirAllowDirty,_that.evalSetId,_that.evalSetOverrides,_that.taskDefaults);case _: +return 
$default(_that.description,_that.imagePrefix,_that.logDir,_that.sandboxType,_that.maxConnections,_that.models,_that.variants,_that.taskPaths,_that.tasks,_that.saveExamples,_that.retryAttempts,_that.maxRetries,_that.retryWait,_that.retryConnections,_that.retryCleanup,_that.failOnError,_that.continueOnFail,_that.retryOnError,_that.debugErrors,_that.maxSamples,_that.maxTasks,_that.maxSubprocesses,_that.maxSandboxes,_that.logLevel,_that.logLevelTranscript,_that.logFormat,_that.tags,_that.metadata,_that.trace,_that.display,_that.score,_that.limit,_that.sampleId,_that.sampleShuffle,_that.epochs,_that.approval,_that.solver,_that.sandboxCleanup,_that.modelBaseUrl,_that.modelArgs,_that.modelRoles,_that.taskArgs,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.modelCostConfig,_that.logSamples,_that.logRealtime,_that.logImages,_that.logBuffer,_that.logShared,_that.bundleDir,_that.bundleOverwrite,_that.logDirAllowDirty,_that.evalSetId,_that.evalSetOverrides,_that.taskDefaults,_that.taskFilters,_that.sampleFilters);case _: return orElse(); } @@ -297,10 +334,10 @@ return $default(_that.logDir,_that.sandboxType,_that.maxConnections,_that.models /// } /// ``` -@optionalTypeArgs TResult when(TResult Function(@JsonKey(name: 'log_dir') String logDir, @JsonKey(name: 'sandbox_type') String sandboxType, @JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants, @JsonKey(name: 'task_paths') List? taskPaths, Map? tasks, @JsonKey(name: 'save_examples') bool saveExamples, @JsonKey(name: 'retry_attempts') int? retryAttempts, @JsonKey(name: 'max_retries') int? maxRetries, @JsonKey(name: 'retry_wait') double? retryWait, @JsonKey(name: 'retry_connections') double? retryConnections, @JsonKey(name: 'retry_cleanup') bool? retryCleanup, @JsonKey(name: 'fail_on_error') double? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'retry_on_error') int? 
retryOnError, @JsonKey(name: 'debug_errors') bool? debugErrors, @JsonKey(name: 'max_samples') int? maxSamples, @JsonKey(name: 'max_tasks') int? maxTasks, @JsonKey(name: 'max_subprocesses') int? maxSubprocesses, @JsonKey(name: 'max_sandboxes') int? maxSandboxes, @JsonKey(name: 'log_level') String? logLevel, @JsonKey(name: 'log_level_transcript') String? logLevelTranscript, @JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit, @JsonKey(name: 'sample_id') Object? sampleId, @JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver, @JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup, @JsonKey(name: 'model_base_url') String? modelBaseUrl, @JsonKey(name: 'model_args') Map? modelArgs, @JsonKey(name: 'model_roles') Map? modelRoles, @JsonKey(name: 'task_args') Map? taskArgs, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'model_cost_config') Map? modelCostConfig, @JsonKey(name: 'log_samples') bool? logSamples, @JsonKey(name: 'log_realtime') bool? logRealtime, @JsonKey(name: 'log_images') bool? logImages, @JsonKey(name: 'log_buffer') int? logBuffer, @JsonKey(name: 'log_shared') int? logShared, @JsonKey(name: 'bundle_dir') String? bundleDir, @JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite, @JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty, @JsonKey(name: 'eval_set_id') String? evalSetId, @JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides, @JsonKey(name: 'task_defaults') Map? taskDefaults) $default,) {final _that = this; +@optionalTypeArgs TResult when(TResult Function( String? description, @JsonKey(name: 'image_prefix') String? 
imagePrefix, @JsonKey(name: 'log_dir') String logDir, @JsonKey(name: 'sandbox_type') String sandboxType, @JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants, @JsonKey(name: 'task_paths') List? taskPaths, Map? tasks, @JsonKey(name: 'save_examples') bool saveExamples, @JsonKey(name: 'retry_attempts') int? retryAttempts, @JsonKey(name: 'max_retries') int? maxRetries, @JsonKey(name: 'retry_wait') double? retryWait, @JsonKey(name: 'retry_connections') double? retryConnections, @JsonKey(name: 'retry_cleanup') bool? retryCleanup, @JsonKey(name: 'fail_on_error') double? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'retry_on_error') int? retryOnError, @JsonKey(name: 'debug_errors') bool? debugErrors, @JsonKey(name: 'max_samples') int? maxSamples, @JsonKey(name: 'max_tasks') int? maxTasks, @JsonKey(name: 'max_subprocesses') int? maxSubprocesses, @JsonKey(name: 'max_sandboxes') int? maxSandboxes, @JsonKey(name: 'log_level') String? logLevel, @JsonKey(name: 'log_level_transcript') String? logLevelTranscript, @JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit, @JsonKey(name: 'sample_id') Object? sampleId, @JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver, @JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup, @JsonKey(name: 'model_base_url') String? modelBaseUrl, @JsonKey(name: 'model_args') Map? modelArgs, @JsonKey(name: 'model_roles') Map? modelRoles, @JsonKey(name: 'task_args') Map? taskArgs, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'model_cost_config') Map? modelCostConfig, @JsonKey(name: 'log_samples') bool? logSamples, @JsonKey(name: 'log_realtime') bool? 
logRealtime, @JsonKey(name: 'log_images') bool? logImages, @JsonKey(name: 'log_buffer') int? logBuffer, @JsonKey(name: 'log_shared') int? logShared, @JsonKey(name: 'bundle_dir') String? bundleDir, @JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite, @JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty, @JsonKey(name: 'eval_set_id') String? evalSetId, @JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides, @JsonKey(name: 'task_defaults') Map? taskDefaults, @JsonKey(name: 'task_filters') TagFilter? taskFilters, @JsonKey(name: 'sample_filters') TagFilter? sampleFilters) $default,) {final _that = this; switch (_that) { case _Job(): -return $default(_that.logDir,_that.sandboxType,_that.maxConnections,_that.models,_that.variants,_that.taskPaths,_that.tasks,_that.saveExamples,_that.retryAttempts,_that.maxRetries,_that.retryWait,_that.retryConnections,_that.retryCleanup,_that.failOnError,_that.continueOnFail,_that.retryOnError,_that.debugErrors,_that.maxSamples,_that.maxTasks,_that.maxSubprocesses,_that.maxSandboxes,_that.logLevel,_that.logLevelTranscript,_that.logFormat,_that.tags,_that.metadata,_that.trace,_that.display,_that.score,_that.limit,_that.sampleId,_that.sampleShuffle,_that.epochs,_that.approval,_that.solver,_that.sandboxCleanup,_that.modelBaseUrl,_that.modelArgs,_that.modelRoles,_that.taskArgs,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.modelCostConfig,_that.logSamples,_that.logRealtime,_that.logImages,_that.logBuffer,_that.logShared,_that.bundleDir,_that.bundleOverwrite,_that.logDirAllowDirty,_that.evalSetId,_that.evalSetOverrides,_that.taskDefaults);} +return 
$default(_that.description,_that.imagePrefix,_that.logDir,_that.sandboxType,_that.maxConnections,_that.models,_that.variants,_that.taskPaths,_that.tasks,_that.saveExamples,_that.retryAttempts,_that.maxRetries,_that.retryWait,_that.retryConnections,_that.retryCleanup,_that.failOnError,_that.continueOnFail,_that.retryOnError,_that.debugErrors,_that.maxSamples,_that.maxTasks,_that.maxSubprocesses,_that.maxSandboxes,_that.logLevel,_that.logLevelTranscript,_that.logFormat,_that.tags,_that.metadata,_that.trace,_that.display,_that.score,_that.limit,_that.sampleId,_that.sampleShuffle,_that.epochs,_that.approval,_that.solver,_that.sandboxCleanup,_that.modelBaseUrl,_that.modelArgs,_that.modelRoles,_that.taskArgs,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.modelCostConfig,_that.logSamples,_that.logRealtime,_that.logImages,_that.logBuffer,_that.logShared,_that.bundleDir,_that.bundleOverwrite,_that.logDirAllowDirty,_that.evalSetId,_that.evalSetOverrides,_that.taskDefaults,_that.taskFilters,_that.sampleFilters);} } /// A variant of `when` that fallback to returning `null` /// @@ -314,10 +351,10 @@ return $default(_that.logDir,_that.sandboxType,_that.maxConnections,_that.models /// } /// ``` -@optionalTypeArgs TResult? whenOrNull(TResult? Function(@JsonKey(name: 'log_dir') String logDir, @JsonKey(name: 'sandbox_type') String sandboxType, @JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants, @JsonKey(name: 'task_paths') List? taskPaths, Map? tasks, @JsonKey(name: 'save_examples') bool saveExamples, @JsonKey(name: 'retry_attempts') int? retryAttempts, @JsonKey(name: 'max_retries') int? maxRetries, @JsonKey(name: 'retry_wait') double? retryWait, @JsonKey(name: 'retry_connections') double? retryConnections, @JsonKey(name: 'retry_cleanup') bool? retryCleanup, @JsonKey(name: 'fail_on_error') double? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'retry_on_error') int? 
retryOnError, @JsonKey(name: 'debug_errors') bool? debugErrors, @JsonKey(name: 'max_samples') int? maxSamples, @JsonKey(name: 'max_tasks') int? maxTasks, @JsonKey(name: 'max_subprocesses') int? maxSubprocesses, @JsonKey(name: 'max_sandboxes') int? maxSandboxes, @JsonKey(name: 'log_level') String? logLevel, @JsonKey(name: 'log_level_transcript') String? logLevelTranscript, @JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit, @JsonKey(name: 'sample_id') Object? sampleId, @JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver, @JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup, @JsonKey(name: 'model_base_url') String? modelBaseUrl, @JsonKey(name: 'model_args') Map? modelArgs, @JsonKey(name: 'model_roles') Map? modelRoles, @JsonKey(name: 'task_args') Map? taskArgs, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'model_cost_config') Map? modelCostConfig, @JsonKey(name: 'log_samples') bool? logSamples, @JsonKey(name: 'log_realtime') bool? logRealtime, @JsonKey(name: 'log_images') bool? logImages, @JsonKey(name: 'log_buffer') int? logBuffer, @JsonKey(name: 'log_shared') int? logShared, @JsonKey(name: 'bundle_dir') String? bundleDir, @JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite, @JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty, @JsonKey(name: 'eval_set_id') String? evalSetId, @JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides, @JsonKey(name: 'task_defaults') Map? taskDefaults)? $default,) {final _that = this; +@optionalTypeArgs TResult? whenOrNull(TResult? Function( String? description, @JsonKey(name: 'image_prefix') String? 
imagePrefix, @JsonKey(name: 'log_dir') String logDir, @JsonKey(name: 'sandbox_type') String sandboxType, @JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants, @JsonKey(name: 'task_paths') List? taskPaths, Map? tasks, @JsonKey(name: 'save_examples') bool saveExamples, @JsonKey(name: 'retry_attempts') int? retryAttempts, @JsonKey(name: 'max_retries') int? maxRetries, @JsonKey(name: 'retry_wait') double? retryWait, @JsonKey(name: 'retry_connections') double? retryConnections, @JsonKey(name: 'retry_cleanup') bool? retryCleanup, @JsonKey(name: 'fail_on_error') double? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'retry_on_error') int? retryOnError, @JsonKey(name: 'debug_errors') bool? debugErrors, @JsonKey(name: 'max_samples') int? maxSamples, @JsonKey(name: 'max_tasks') int? maxTasks, @JsonKey(name: 'max_subprocesses') int? maxSubprocesses, @JsonKey(name: 'max_sandboxes') int? maxSandboxes, @JsonKey(name: 'log_level') String? logLevel, @JsonKey(name: 'log_level_transcript') String? logLevelTranscript, @JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit, @JsonKey(name: 'sample_id') Object? sampleId, @JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver, @JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup, @JsonKey(name: 'model_base_url') String? modelBaseUrl, @JsonKey(name: 'model_args') Map? modelArgs, @JsonKey(name: 'model_roles') Map? modelRoles, @JsonKey(name: 'task_args') Map? taskArgs, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'model_cost_config') Map? modelCostConfig, @JsonKey(name: 'log_samples') bool? logSamples, @JsonKey(name: 'log_realtime') bool? 
logRealtime, @JsonKey(name: 'log_images') bool? logImages, @JsonKey(name: 'log_buffer') int? logBuffer, @JsonKey(name: 'log_shared') int? logShared, @JsonKey(name: 'bundle_dir') String? bundleDir, @JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite, @JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty, @JsonKey(name: 'eval_set_id') String? evalSetId, @JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides, @JsonKey(name: 'task_defaults') Map? taskDefaults, @JsonKey(name: 'task_filters') TagFilter? taskFilters, @JsonKey(name: 'sample_filters') TagFilter? sampleFilters)? $default,) {final _that = this; switch (_that) { case _Job() when $default != null: -return $default(_that.logDir,_that.sandboxType,_that.maxConnections,_that.models,_that.variants,_that.taskPaths,_that.tasks,_that.saveExamples,_that.retryAttempts,_that.maxRetries,_that.retryWait,_that.retryConnections,_that.retryCleanup,_that.failOnError,_that.continueOnFail,_that.retryOnError,_that.debugErrors,_that.maxSamples,_that.maxTasks,_that.maxSubprocesses,_that.maxSandboxes,_that.logLevel,_that.logLevelTranscript,_that.logFormat,_that.tags,_that.metadata,_that.trace,_that.display,_that.score,_that.limit,_that.sampleId,_that.sampleShuffle,_that.epochs,_that.approval,_that.solver,_that.sandboxCleanup,_that.modelBaseUrl,_that.modelArgs,_that.modelRoles,_that.taskArgs,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.modelCostConfig,_that.logSamples,_that.logRealtime,_that.logImages,_that.logBuffer,_that.logShared,_that.bundleDir,_that.bundleOverwrite,_that.logDirAllowDirty,_that.evalSetId,_that.evalSetOverrides,_that.taskDefaults);case _: +return 
$default(_that.description,_that.imagePrefix,_that.logDir,_that.sandboxType,_that.maxConnections,_that.models,_that.variants,_that.taskPaths,_that.tasks,_that.saveExamples,_that.retryAttempts,_that.maxRetries,_that.retryWait,_that.retryConnections,_that.retryCleanup,_that.failOnError,_that.continueOnFail,_that.retryOnError,_that.debugErrors,_that.maxSamples,_that.maxTasks,_that.maxSubprocesses,_that.maxSandboxes,_that.logLevel,_that.logLevelTranscript,_that.logFormat,_that.tags,_that.metadata,_that.trace,_that.display,_that.score,_that.limit,_that.sampleId,_that.sampleShuffle,_that.epochs,_that.approval,_that.solver,_that.sandboxCleanup,_that.modelBaseUrl,_that.modelArgs,_that.modelRoles,_that.taskArgs,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.modelCostConfig,_that.logSamples,_that.logRealtime,_that.logImages,_that.logBuffer,_that.logShared,_that.bundleDir,_that.bundleOverwrite,_that.logDirAllowDirty,_that.evalSetId,_that.evalSetOverrides,_that.taskDefaults,_that.taskFilters,_that.sampleFilters);case _: return null; } @@ -329,12 +366,18 @@ return $default(_that.logDir,_that.sandboxType,_that.maxConnections,_that.models @JsonSerializable() class _Job implements Job { - const _Job({@JsonKey(name: 'log_dir') required this.logDir, @JsonKey(name: 'sandbox_type') this.sandboxType = 'local', @JsonKey(name: 'max_connections') this.maxConnections = 10, final List? models, final Map>? variants, @JsonKey(name: 'task_paths') final List? taskPaths, final Map? 
tasks, @JsonKey(name: 'save_examples') this.saveExamples = false, @JsonKey(name: 'retry_attempts') this.retryAttempts, @JsonKey(name: 'max_retries') this.maxRetries, @JsonKey(name: 'retry_wait') this.retryWait, @JsonKey(name: 'retry_connections') this.retryConnections, @JsonKey(name: 'retry_cleanup') this.retryCleanup, @JsonKey(name: 'fail_on_error') this.failOnError, @JsonKey(name: 'continue_on_fail') this.continueOnFail, @JsonKey(name: 'retry_on_error') this.retryOnError, @JsonKey(name: 'debug_errors') this.debugErrors, @JsonKey(name: 'max_samples') this.maxSamples, @JsonKey(name: 'max_tasks') this.maxTasks, @JsonKey(name: 'max_subprocesses') this.maxSubprocesses, @JsonKey(name: 'max_sandboxes') this.maxSandboxes, @JsonKey(name: 'log_level') this.logLevel, @JsonKey(name: 'log_level_transcript') this.logLevelTranscript, @JsonKey(name: 'log_format') this.logFormat, final List? tags, final Map? metadata, this.trace, this.display, this.score, this.limit, @JsonKey(name: 'sample_id') this.sampleId, @JsonKey(name: 'sample_shuffle') this.sampleShuffle, this.epochs, this.approval, this.solver, @JsonKey(name: 'sandbox_cleanup') this.sandboxCleanup, @JsonKey(name: 'model_base_url') this.modelBaseUrl, @JsonKey(name: 'model_args') final Map? modelArgs, @JsonKey(name: 'model_roles') final Map? modelRoles, @JsonKey(name: 'task_args') final Map? taskArgs, @JsonKey(name: 'message_limit') this.messageLimit, @JsonKey(name: 'token_limit') this.tokenLimit, @JsonKey(name: 'time_limit') this.timeLimit, @JsonKey(name: 'working_limit') this.workingLimit, @JsonKey(name: 'cost_limit') this.costLimit, @JsonKey(name: 'model_cost_config') final Map? 
modelCostConfig, @JsonKey(name: 'log_samples') this.logSamples, @JsonKey(name: 'log_realtime') this.logRealtime, @JsonKey(name: 'log_images') this.logImages, @JsonKey(name: 'log_buffer') this.logBuffer, @JsonKey(name: 'log_shared') this.logShared, @JsonKey(name: 'bundle_dir') this.bundleDir, @JsonKey(name: 'bundle_overwrite') this.bundleOverwrite, @JsonKey(name: 'log_dir_allow_dirty') this.logDirAllowDirty, @JsonKey(name: 'eval_set_id') this.evalSetId, @JsonKey(name: 'eval_set_overrides') final Map? evalSetOverrides, @JsonKey(name: 'task_defaults') final Map? taskDefaults}): _models = models,_variants = variants,_taskPaths = taskPaths,_tasks = tasks,_tags = tags,_metadata = metadata,_modelArgs = modelArgs,_modelRoles = modelRoles,_taskArgs = taskArgs,_modelCostConfig = modelCostConfig,_evalSetOverrides = evalSetOverrides,_taskDefaults = taskDefaults; + const _Job({this.description, @JsonKey(name: 'image_prefix') this.imagePrefix, @JsonKey(name: 'log_dir') required this.logDir, @JsonKey(name: 'sandbox_type') this.sandboxType = 'local', @JsonKey(name: 'max_connections') this.maxConnections = 10, final List? models, final Map>? variants, @JsonKey(name: 'task_paths') final List? taskPaths, final Map? 
tasks, @JsonKey(name: 'save_examples') this.saveExamples = false, @JsonKey(name: 'retry_attempts') this.retryAttempts, @JsonKey(name: 'max_retries') this.maxRetries, @JsonKey(name: 'retry_wait') this.retryWait, @JsonKey(name: 'retry_connections') this.retryConnections, @JsonKey(name: 'retry_cleanup') this.retryCleanup, @JsonKey(name: 'fail_on_error') this.failOnError, @JsonKey(name: 'continue_on_fail') this.continueOnFail, @JsonKey(name: 'retry_on_error') this.retryOnError, @JsonKey(name: 'debug_errors') this.debugErrors, @JsonKey(name: 'max_samples') this.maxSamples, @JsonKey(name: 'max_tasks') this.maxTasks, @JsonKey(name: 'max_subprocesses') this.maxSubprocesses, @JsonKey(name: 'max_sandboxes') this.maxSandboxes, @JsonKey(name: 'log_level') this.logLevel, @JsonKey(name: 'log_level_transcript') this.logLevelTranscript, @JsonKey(name: 'log_format') this.logFormat, final List? tags, final Map? metadata, this.trace, this.display, this.score, this.limit, @JsonKey(name: 'sample_id') this.sampleId, @JsonKey(name: 'sample_shuffle') this.sampleShuffle, this.epochs, this.approval, this.solver, @JsonKey(name: 'sandbox_cleanup') this.sandboxCleanup, @JsonKey(name: 'model_base_url') this.modelBaseUrl, @JsonKey(name: 'model_args') final Map? modelArgs, @JsonKey(name: 'model_roles') final Map? modelRoles, @JsonKey(name: 'task_args') final Map? taskArgs, @JsonKey(name: 'message_limit') this.messageLimit, @JsonKey(name: 'token_limit') this.tokenLimit, @JsonKey(name: 'time_limit') this.timeLimit, @JsonKey(name: 'working_limit') this.workingLimit, @JsonKey(name: 'cost_limit') this.costLimit, @JsonKey(name: 'model_cost_config') final Map? 
modelCostConfig, @JsonKey(name: 'log_samples') this.logSamples, @JsonKey(name: 'log_realtime') this.logRealtime, @JsonKey(name: 'log_images') this.logImages, @JsonKey(name: 'log_buffer') this.logBuffer, @JsonKey(name: 'log_shared') this.logShared, @JsonKey(name: 'bundle_dir') this.bundleDir, @JsonKey(name: 'bundle_overwrite') this.bundleOverwrite, @JsonKey(name: 'log_dir_allow_dirty') this.logDirAllowDirty, @JsonKey(name: 'eval_set_id') this.evalSetId, @JsonKey(name: 'eval_set_overrides') final Map? evalSetOverrides, @JsonKey(name: 'task_defaults') final Map? taskDefaults, @JsonKey(name: 'task_filters') this.taskFilters, @JsonKey(name: 'sample_filters') this.sampleFilters}): _models = models,_variants = variants,_taskPaths = taskPaths,_tasks = tasks,_tags = tags,_metadata = metadata,_modelArgs = modelArgs,_modelRoles = modelRoles,_taskArgs = taskArgs,_modelCostConfig = modelCostConfig,_evalSetOverrides = evalSetOverrides,_taskDefaults = taskDefaults; factory _Job.fromJson(Map json) => _$JobFromJson(json); // ------------------------------------------------------------------ // Core job settings // ------------------------------------------------------------------ +/// Human-readable description of this job. +@override final String? description; +/// Registry URL prefix prepended to image names during sandbox resolution. +/// +/// Example: `us-central1-docker.pkg.dev/project/repo/` +@override@JsonKey(name: 'image_prefix') final String? imagePrefix; /// Directory to write evaluation logs to. @override@JsonKey(name: 'log_dir') final String logDir; /// Sandbox type: `'local'`, `'docker'`, or `'podman'`. @@ -583,6 +626,13 @@ class _Job implements Job { return EqualUnmodifiableMapView(value); } +// ------------------------------------------------------------------ +// Tag-based filtering +// ------------------------------------------------------------------ +/// Tag filters applied to tasks. +@override@JsonKey(name: 'task_filters') final TagFilter? 
taskFilters; +/// Tag filters applied to samples. +@override@JsonKey(name: 'sample_filters') final TagFilter? sampleFilters; /// Create a copy of Job /// with the given fields replaced by the non-null parameter values. @@ -597,16 +647,16 @@ Map toJson() { @override bool operator ==(Object other) { - return identical(this, other) || (other.runtimeType == runtimeType&&other is _Job&&(identical(other.logDir, logDir) || other.logDir == logDir)&&(identical(other.sandboxType, sandboxType) || other.sandboxType == sandboxType)&&(identical(other.maxConnections, maxConnections) || other.maxConnections == maxConnections)&&const DeepCollectionEquality().equals(other._models, _models)&&const DeepCollectionEquality().equals(other._variants, _variants)&&const DeepCollectionEquality().equals(other._taskPaths, _taskPaths)&&const DeepCollectionEquality().equals(other._tasks, _tasks)&&(identical(other.saveExamples, saveExamples) || other.saveExamples == saveExamples)&&(identical(other.retryAttempts, retryAttempts) || other.retryAttempts == retryAttempts)&&(identical(other.maxRetries, maxRetries) || other.maxRetries == maxRetries)&&(identical(other.retryWait, retryWait) || other.retryWait == retryWait)&&(identical(other.retryConnections, retryConnections) || other.retryConnections == retryConnections)&&(identical(other.retryCleanup, retryCleanup) || other.retryCleanup == retryCleanup)&&(identical(other.failOnError, failOnError) || other.failOnError == failOnError)&&(identical(other.continueOnFail, continueOnFail) || other.continueOnFail == continueOnFail)&&(identical(other.retryOnError, retryOnError) || other.retryOnError == retryOnError)&&(identical(other.debugErrors, debugErrors) || other.debugErrors == debugErrors)&&(identical(other.maxSamples, maxSamples) || other.maxSamples == maxSamples)&&(identical(other.maxTasks, maxTasks) || other.maxTasks == maxTasks)&&(identical(other.maxSubprocesses, maxSubprocesses) || other.maxSubprocesses == 
maxSubprocesses)&&(identical(other.maxSandboxes, maxSandboxes) || other.maxSandboxes == maxSandboxes)&&(identical(other.logLevel, logLevel) || other.logLevel == logLevel)&&(identical(other.logLevelTranscript, logLevelTranscript) || other.logLevelTranscript == logLevelTranscript)&&(identical(other.logFormat, logFormat) || other.logFormat == logFormat)&&const DeepCollectionEquality().equals(other._tags, _tags)&&const DeepCollectionEquality().equals(other._metadata, _metadata)&&(identical(other.trace, trace) || other.trace == trace)&&(identical(other.display, display) || other.display == display)&&(identical(other.score, score) || other.score == score)&&const DeepCollectionEquality().equals(other.limit, limit)&&const DeepCollectionEquality().equals(other.sampleId, sampleId)&&const DeepCollectionEquality().equals(other.sampleShuffle, sampleShuffle)&&const DeepCollectionEquality().equals(other.epochs, epochs)&&const DeepCollectionEquality().equals(other.approval, approval)&&const DeepCollectionEquality().equals(other.solver, solver)&&(identical(other.sandboxCleanup, sandboxCleanup) || other.sandboxCleanup == sandboxCleanup)&&(identical(other.modelBaseUrl, modelBaseUrl) || other.modelBaseUrl == modelBaseUrl)&&const DeepCollectionEquality().equals(other._modelArgs, _modelArgs)&&const DeepCollectionEquality().equals(other._modelRoles, _modelRoles)&&const DeepCollectionEquality().equals(other._taskArgs, _taskArgs)&&(identical(other.messageLimit, messageLimit) || other.messageLimit == messageLimit)&&(identical(other.tokenLimit, tokenLimit) || other.tokenLimit == tokenLimit)&&(identical(other.timeLimit, timeLimit) || other.timeLimit == timeLimit)&&(identical(other.workingLimit, workingLimit) || other.workingLimit == workingLimit)&&(identical(other.costLimit, costLimit) || other.costLimit == costLimit)&&const DeepCollectionEquality().equals(other._modelCostConfig, _modelCostConfig)&&(identical(other.logSamples, logSamples) || other.logSamples == 
logSamples)&&(identical(other.logRealtime, logRealtime) || other.logRealtime == logRealtime)&&(identical(other.logImages, logImages) || other.logImages == logImages)&&(identical(other.logBuffer, logBuffer) || other.logBuffer == logBuffer)&&(identical(other.logShared, logShared) || other.logShared == logShared)&&(identical(other.bundleDir, bundleDir) || other.bundleDir == bundleDir)&&(identical(other.bundleOverwrite, bundleOverwrite) || other.bundleOverwrite == bundleOverwrite)&&(identical(other.logDirAllowDirty, logDirAllowDirty) || other.logDirAllowDirty == logDirAllowDirty)&&(identical(other.evalSetId, evalSetId) || other.evalSetId == evalSetId)&&const DeepCollectionEquality().equals(other._evalSetOverrides, _evalSetOverrides)&&const DeepCollectionEquality().equals(other._taskDefaults, _taskDefaults)); + return identical(this, other) || (other.runtimeType == runtimeType&&other is _Job&&(identical(other.description, description) || other.description == description)&&(identical(other.imagePrefix, imagePrefix) || other.imagePrefix == imagePrefix)&&(identical(other.logDir, logDir) || other.logDir == logDir)&&(identical(other.sandboxType, sandboxType) || other.sandboxType == sandboxType)&&(identical(other.maxConnections, maxConnections) || other.maxConnections == maxConnections)&&const DeepCollectionEquality().equals(other._models, _models)&&const DeepCollectionEquality().equals(other._variants, _variants)&&const DeepCollectionEquality().equals(other._taskPaths, _taskPaths)&&const DeepCollectionEquality().equals(other._tasks, _tasks)&&(identical(other.saveExamples, saveExamples) || other.saveExamples == saveExamples)&&(identical(other.retryAttempts, retryAttempts) || other.retryAttempts == retryAttempts)&&(identical(other.maxRetries, maxRetries) || other.maxRetries == maxRetries)&&(identical(other.retryWait, retryWait) || other.retryWait == retryWait)&&(identical(other.retryConnections, retryConnections) || other.retryConnections == 
retryConnections)&&(identical(other.retryCleanup, retryCleanup) || other.retryCleanup == retryCleanup)&&(identical(other.failOnError, failOnError) || other.failOnError == failOnError)&&(identical(other.continueOnFail, continueOnFail) || other.continueOnFail == continueOnFail)&&(identical(other.retryOnError, retryOnError) || other.retryOnError == retryOnError)&&(identical(other.debugErrors, debugErrors) || other.debugErrors == debugErrors)&&(identical(other.maxSamples, maxSamples) || other.maxSamples == maxSamples)&&(identical(other.maxTasks, maxTasks) || other.maxTasks == maxTasks)&&(identical(other.maxSubprocesses, maxSubprocesses) || other.maxSubprocesses == maxSubprocesses)&&(identical(other.maxSandboxes, maxSandboxes) || other.maxSandboxes == maxSandboxes)&&(identical(other.logLevel, logLevel) || other.logLevel == logLevel)&&(identical(other.logLevelTranscript, logLevelTranscript) || other.logLevelTranscript == logLevelTranscript)&&(identical(other.logFormat, logFormat) || other.logFormat == logFormat)&&const DeepCollectionEquality().equals(other._tags, _tags)&&const DeepCollectionEquality().equals(other._metadata, _metadata)&&(identical(other.trace, trace) || other.trace == trace)&&(identical(other.display, display) || other.display == display)&&(identical(other.score, score) || other.score == score)&&const DeepCollectionEquality().equals(other.limit, limit)&&const DeepCollectionEquality().equals(other.sampleId, sampleId)&&const DeepCollectionEquality().equals(other.sampleShuffle, sampleShuffle)&&const DeepCollectionEquality().equals(other.epochs, epochs)&&const DeepCollectionEquality().equals(other.approval, approval)&&const DeepCollectionEquality().equals(other.solver, solver)&&(identical(other.sandboxCleanup, sandboxCleanup) || other.sandboxCleanup == sandboxCleanup)&&(identical(other.modelBaseUrl, modelBaseUrl) || other.modelBaseUrl == modelBaseUrl)&&const DeepCollectionEquality().equals(other._modelArgs, _modelArgs)&&const 
DeepCollectionEquality().equals(other._modelRoles, _modelRoles)&&const DeepCollectionEquality().equals(other._taskArgs, _taskArgs)&&(identical(other.messageLimit, messageLimit) || other.messageLimit == messageLimit)&&(identical(other.tokenLimit, tokenLimit) || other.tokenLimit == tokenLimit)&&(identical(other.timeLimit, timeLimit) || other.timeLimit == timeLimit)&&(identical(other.workingLimit, workingLimit) || other.workingLimit == workingLimit)&&(identical(other.costLimit, costLimit) || other.costLimit == costLimit)&&const DeepCollectionEquality().equals(other._modelCostConfig, _modelCostConfig)&&(identical(other.logSamples, logSamples) || other.logSamples == logSamples)&&(identical(other.logRealtime, logRealtime) || other.logRealtime == logRealtime)&&(identical(other.logImages, logImages) || other.logImages == logImages)&&(identical(other.logBuffer, logBuffer) || other.logBuffer == logBuffer)&&(identical(other.logShared, logShared) || other.logShared == logShared)&&(identical(other.bundleDir, bundleDir) || other.bundleDir == bundleDir)&&(identical(other.bundleOverwrite, bundleOverwrite) || other.bundleOverwrite == bundleOverwrite)&&(identical(other.logDirAllowDirty, logDirAllowDirty) || other.logDirAllowDirty == logDirAllowDirty)&&(identical(other.evalSetId, evalSetId) || other.evalSetId == evalSetId)&&const DeepCollectionEquality().equals(other._evalSetOverrides, _evalSetOverrides)&&const DeepCollectionEquality().equals(other._taskDefaults, _taskDefaults)&&(identical(other.taskFilters, taskFilters) || other.taskFilters == taskFilters)&&(identical(other.sampleFilters, sampleFilters) || other.sampleFilters == sampleFilters)); } @JsonKey(includeFromJson: false, includeToJson: false) @override -int get hashCode => Object.hashAll([runtimeType,logDir,sandboxType,maxConnections,const DeepCollectionEquality().hash(_models),const DeepCollectionEquality().hash(_variants),const DeepCollectionEquality().hash(_taskPaths),const 
DeepCollectionEquality().hash(_tasks),saveExamples,retryAttempts,maxRetries,retryWait,retryConnections,retryCleanup,failOnError,continueOnFail,retryOnError,debugErrors,maxSamples,maxTasks,maxSubprocesses,maxSandboxes,logLevel,logLevelTranscript,logFormat,const DeepCollectionEquality().hash(_tags),const DeepCollectionEquality().hash(_metadata),trace,display,score,const DeepCollectionEquality().hash(limit),const DeepCollectionEquality().hash(sampleId),const DeepCollectionEquality().hash(sampleShuffle),const DeepCollectionEquality().hash(epochs),const DeepCollectionEquality().hash(approval),const DeepCollectionEquality().hash(solver),sandboxCleanup,modelBaseUrl,const DeepCollectionEquality().hash(_modelArgs),const DeepCollectionEquality().hash(_modelRoles),const DeepCollectionEquality().hash(_taskArgs),messageLimit,tokenLimit,timeLimit,workingLimit,costLimit,const DeepCollectionEquality().hash(_modelCostConfig),logSamples,logRealtime,logImages,logBuffer,logShared,bundleDir,bundleOverwrite,logDirAllowDirty,evalSetId,const DeepCollectionEquality().hash(_evalSetOverrides),const DeepCollectionEquality().hash(_taskDefaults)]); +int get hashCode => Object.hashAll([runtimeType,description,imagePrefix,logDir,sandboxType,maxConnections,const DeepCollectionEquality().hash(_models),const DeepCollectionEquality().hash(_variants),const DeepCollectionEquality().hash(_taskPaths),const DeepCollectionEquality().hash(_tasks),saveExamples,retryAttempts,maxRetries,retryWait,retryConnections,retryCleanup,failOnError,continueOnFail,retryOnError,debugErrors,maxSamples,maxTasks,maxSubprocesses,maxSandboxes,logLevel,logLevelTranscript,logFormat,const DeepCollectionEquality().hash(_tags),const DeepCollectionEquality().hash(_metadata),trace,display,score,const DeepCollectionEquality().hash(limit),const DeepCollectionEquality().hash(sampleId),const DeepCollectionEquality().hash(sampleShuffle),const DeepCollectionEquality().hash(epochs),const DeepCollectionEquality().hash(approval),const 
DeepCollectionEquality().hash(solver),sandboxCleanup,modelBaseUrl,const DeepCollectionEquality().hash(_modelArgs),const DeepCollectionEquality().hash(_modelRoles),const DeepCollectionEquality().hash(_taskArgs),messageLimit,tokenLimit,timeLimit,workingLimit,costLimit,const DeepCollectionEquality().hash(_modelCostConfig),logSamples,logRealtime,logImages,logBuffer,logShared,bundleDir,bundleOverwrite,logDirAllowDirty,evalSetId,const DeepCollectionEquality().hash(_evalSetOverrides),const DeepCollectionEquality().hash(_taskDefaults),taskFilters,sampleFilters]); @override String toString() { - return 'Job(logDir: $logDir, sandboxType: $sandboxType, maxConnections: $maxConnections, models: $models, variants: $variants, taskPaths: $taskPaths, tasks: $tasks, saveExamples: $saveExamples, retryAttempts: $retryAttempts, maxRetries: $maxRetries, retryWait: $retryWait, retryConnections: $retryConnections, retryCleanup: $retryCleanup, failOnError: $failOnError, continueOnFail: $continueOnFail, retryOnError: $retryOnError, debugErrors: $debugErrors, maxSamples: $maxSamples, maxTasks: $maxTasks, maxSubprocesses: $maxSubprocesses, maxSandboxes: $maxSandboxes, logLevel: $logLevel, logLevelTranscript: $logLevelTranscript, logFormat: $logFormat, tags: $tags, metadata: $metadata, trace: $trace, display: $display, score: $score, limit: $limit, sampleId: $sampleId, sampleShuffle: $sampleShuffle, epochs: $epochs, approval: $approval, solver: $solver, sandboxCleanup: $sandboxCleanup, modelBaseUrl: $modelBaseUrl, modelArgs: $modelArgs, modelRoles: $modelRoles, taskArgs: $taskArgs, messageLimit: $messageLimit, tokenLimit: $tokenLimit, timeLimit: $timeLimit, workingLimit: $workingLimit, costLimit: $costLimit, modelCostConfig: $modelCostConfig, logSamples: $logSamples, logRealtime: $logRealtime, logImages: $logImages, logBuffer: $logBuffer, logShared: $logShared, bundleDir: $bundleDir, bundleOverwrite: $bundleOverwrite, logDirAllowDirty: $logDirAllowDirty, evalSetId: $evalSetId, 
evalSetOverrides: $evalSetOverrides, taskDefaults: $taskDefaults)'; + return 'Job(description: $description, imagePrefix: $imagePrefix, logDir: $logDir, sandboxType: $sandboxType, maxConnections: $maxConnections, models: $models, variants: $variants, taskPaths: $taskPaths, tasks: $tasks, saveExamples: $saveExamples, retryAttempts: $retryAttempts, maxRetries: $maxRetries, retryWait: $retryWait, retryConnections: $retryConnections, retryCleanup: $retryCleanup, failOnError: $failOnError, continueOnFail: $continueOnFail, retryOnError: $retryOnError, debugErrors: $debugErrors, maxSamples: $maxSamples, maxTasks: $maxTasks, maxSubprocesses: $maxSubprocesses, maxSandboxes: $maxSandboxes, logLevel: $logLevel, logLevelTranscript: $logLevelTranscript, logFormat: $logFormat, tags: $tags, metadata: $metadata, trace: $trace, display: $display, score: $score, limit: $limit, sampleId: $sampleId, sampleShuffle: $sampleShuffle, epochs: $epochs, approval: $approval, solver: $solver, sandboxCleanup: $sandboxCleanup, modelBaseUrl: $modelBaseUrl, modelArgs: $modelArgs, modelRoles: $modelRoles, taskArgs: $taskArgs, messageLimit: $messageLimit, tokenLimit: $tokenLimit, timeLimit: $timeLimit, workingLimit: $workingLimit, costLimit: $costLimit, modelCostConfig: $modelCostConfig, logSamples: $logSamples, logRealtime: $logRealtime, logImages: $logImages, logBuffer: $logBuffer, logShared: $logShared, bundleDir: $bundleDir, bundleOverwrite: $bundleOverwrite, logDirAllowDirty: $logDirAllowDirty, evalSetId: $evalSetId, evalSetOverrides: $evalSetOverrides, taskDefaults: $taskDefaults, taskFilters: $taskFilters, sampleFilters: $sampleFilters)'; } @@ -617,11 +667,11 @@ abstract mixin class _$JobCopyWith<$Res> implements $JobCopyWith<$Res> { factory _$JobCopyWith(_Job value, $Res Function(_Job) _then) = __$JobCopyWithImpl; @override @useResult $Res call({ -@JsonKey(name: 'log_dir') String logDir,@JsonKey(name: 'sandbox_type') String sandboxType,@JsonKey(name: 'max_connections') int maxConnections, 
List? models, Map>? variants,@JsonKey(name: 'task_paths') List? taskPaths, Map? tasks,@JsonKey(name: 'save_examples') bool saveExamples,@JsonKey(name: 'retry_attempts') int? retryAttempts,@JsonKey(name: 'max_retries') int? maxRetries,@JsonKey(name: 'retry_wait') double? retryWait,@JsonKey(name: 'retry_connections') double? retryConnections,@JsonKey(name: 'retry_cleanup') bool? retryCleanup,@JsonKey(name: 'fail_on_error') double? failOnError,@JsonKey(name: 'continue_on_fail') bool? continueOnFail,@JsonKey(name: 'retry_on_error') int? retryOnError,@JsonKey(name: 'debug_errors') bool? debugErrors,@JsonKey(name: 'max_samples') int? maxSamples,@JsonKey(name: 'max_tasks') int? maxTasks,@JsonKey(name: 'max_subprocesses') int? maxSubprocesses,@JsonKey(name: 'max_sandboxes') int? maxSandboxes,@JsonKey(name: 'log_level') String? logLevel,@JsonKey(name: 'log_level_transcript') String? logLevelTranscript,@JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit,@JsonKey(name: 'sample_id') Object? sampleId,@JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver,@JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup,@JsonKey(name: 'model_base_url') String? modelBaseUrl,@JsonKey(name: 'model_args') Map? modelArgs,@JsonKey(name: 'model_roles') Map? modelRoles,@JsonKey(name: 'task_args') Map? taskArgs,@JsonKey(name: 'message_limit') int? messageLimit,@JsonKey(name: 'token_limit') int? tokenLimit,@JsonKey(name: 'time_limit') int? timeLimit,@JsonKey(name: 'working_limit') int? workingLimit,@JsonKey(name: 'cost_limit') double? costLimit,@JsonKey(name: 'model_cost_config') Map? modelCostConfig,@JsonKey(name: 'log_samples') bool? logSamples,@JsonKey(name: 'log_realtime') bool? logRealtime,@JsonKey(name: 'log_images') bool? logImages,@JsonKey(name: 'log_buffer') int? logBuffer,@JsonKey(name: 'log_shared') int? logShared,@JsonKey(name: 'bundle_dir') String? 
bundleDir,@JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite,@JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty,@JsonKey(name: 'eval_set_id') String? evalSetId,@JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides,@JsonKey(name: 'task_defaults') Map? taskDefaults + String? description,@JsonKey(name: 'image_prefix') String? imagePrefix,@JsonKey(name: 'log_dir') String logDir,@JsonKey(name: 'sandbox_type') String sandboxType,@JsonKey(name: 'max_connections') int maxConnections, List? models, Map>? variants,@JsonKey(name: 'task_paths') List? taskPaths, Map? tasks,@JsonKey(name: 'save_examples') bool saveExamples,@JsonKey(name: 'retry_attempts') int? retryAttempts,@JsonKey(name: 'max_retries') int? maxRetries,@JsonKey(name: 'retry_wait') double? retryWait,@JsonKey(name: 'retry_connections') double? retryConnections,@JsonKey(name: 'retry_cleanup') bool? retryCleanup,@JsonKey(name: 'fail_on_error') double? failOnError,@JsonKey(name: 'continue_on_fail') bool? continueOnFail,@JsonKey(name: 'retry_on_error') int? retryOnError,@JsonKey(name: 'debug_errors') bool? debugErrors,@JsonKey(name: 'max_samples') int? maxSamples,@JsonKey(name: 'max_tasks') int? maxTasks,@JsonKey(name: 'max_subprocesses') int? maxSubprocesses,@JsonKey(name: 'max_sandboxes') int? maxSandboxes,@JsonKey(name: 'log_level') String? logLevel,@JsonKey(name: 'log_level_transcript') String? logLevelTranscript,@JsonKey(name: 'log_format') String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit,@JsonKey(name: 'sample_id') Object? sampleId,@JsonKey(name: 'sample_shuffle') Object? sampleShuffle, Object? epochs, Object? approval, Object? solver,@JsonKey(name: 'sandbox_cleanup') bool? sandboxCleanup,@JsonKey(name: 'model_base_url') String? modelBaseUrl,@JsonKey(name: 'model_args') Map? modelArgs,@JsonKey(name: 'model_roles') Map? modelRoles,@JsonKey(name: 'task_args') Map? taskArgs,@JsonKey(name: 'message_limit') int? 
messageLimit,@JsonKey(name: 'token_limit') int? tokenLimit,@JsonKey(name: 'time_limit') int? timeLimit,@JsonKey(name: 'working_limit') int? workingLimit,@JsonKey(name: 'cost_limit') double? costLimit,@JsonKey(name: 'model_cost_config') Map? modelCostConfig,@JsonKey(name: 'log_samples') bool? logSamples,@JsonKey(name: 'log_realtime') bool? logRealtime,@JsonKey(name: 'log_images') bool? logImages,@JsonKey(name: 'log_buffer') int? logBuffer,@JsonKey(name: 'log_shared') int? logShared,@JsonKey(name: 'bundle_dir') String? bundleDir,@JsonKey(name: 'bundle_overwrite') bool? bundleOverwrite,@JsonKey(name: 'log_dir_allow_dirty') bool? logDirAllowDirty,@JsonKey(name: 'eval_set_id') String? evalSetId,@JsonKey(name: 'eval_set_overrides') Map? evalSetOverrides,@JsonKey(name: 'task_defaults') Map? taskDefaults,@JsonKey(name: 'task_filters') TagFilter? taskFilters,@JsonKey(name: 'sample_filters') TagFilter? sampleFilters }); - +@override $TagFilterCopyWith<$Res>? get taskFilters;@override $TagFilterCopyWith<$Res>? get sampleFilters; } /// @nodoc @@ -634,9 +684,11 @@ class __$JobCopyWithImpl<$Res> /// Create a copy of Job /// with the given fields replaced by the non-null parameter values. -@override @pragma('vm:prefer-inline') $Res call({Object? logDir = null,Object? sandboxType = null,Object? maxConnections = null,Object? models = freezed,Object? variants = freezed,Object? taskPaths = freezed,Object? tasks = freezed,Object? saveExamples = null,Object? retryAttempts = freezed,Object? maxRetries = freezed,Object? retryWait = freezed,Object? retryConnections = freezed,Object? retryCleanup = freezed,Object? failOnError = freezed,Object? continueOnFail = freezed,Object? retryOnError = freezed,Object? debugErrors = freezed,Object? maxSamples = freezed,Object? maxTasks = freezed,Object? maxSubprocesses = freezed,Object? maxSandboxes = freezed,Object? logLevel = freezed,Object? logLevelTranscript = freezed,Object? logFormat = freezed,Object? tags = freezed,Object? 
metadata = freezed,Object? trace = freezed,Object? display = freezed,Object? score = freezed,Object? limit = freezed,Object? sampleId = freezed,Object? sampleShuffle = freezed,Object? epochs = freezed,Object? approval = freezed,Object? solver = freezed,Object? sandboxCleanup = freezed,Object? modelBaseUrl = freezed,Object? modelArgs = freezed,Object? modelRoles = freezed,Object? taskArgs = freezed,Object? messageLimit = freezed,Object? tokenLimit = freezed,Object? timeLimit = freezed,Object? workingLimit = freezed,Object? costLimit = freezed,Object? modelCostConfig = freezed,Object? logSamples = freezed,Object? logRealtime = freezed,Object? logImages = freezed,Object? logBuffer = freezed,Object? logShared = freezed,Object? bundleDir = freezed,Object? bundleOverwrite = freezed,Object? logDirAllowDirty = freezed,Object? evalSetId = freezed,Object? evalSetOverrides = freezed,Object? taskDefaults = freezed,}) { +@override @pragma('vm:prefer-inline') $Res call({Object? description = freezed,Object? imagePrefix = freezed,Object? logDir = null,Object? sandboxType = null,Object? maxConnections = null,Object? models = freezed,Object? variants = freezed,Object? taskPaths = freezed,Object? tasks = freezed,Object? saveExamples = null,Object? retryAttempts = freezed,Object? maxRetries = freezed,Object? retryWait = freezed,Object? retryConnections = freezed,Object? retryCleanup = freezed,Object? failOnError = freezed,Object? continueOnFail = freezed,Object? retryOnError = freezed,Object? debugErrors = freezed,Object? maxSamples = freezed,Object? maxTasks = freezed,Object? maxSubprocesses = freezed,Object? maxSandboxes = freezed,Object? logLevel = freezed,Object? logLevelTranscript = freezed,Object? logFormat = freezed,Object? tags = freezed,Object? metadata = freezed,Object? trace = freezed,Object? display = freezed,Object? score = freezed,Object? limit = freezed,Object? sampleId = freezed,Object? sampleShuffle = freezed,Object? epochs = freezed,Object? 
approval = freezed,Object? solver = freezed,Object? sandboxCleanup = freezed,Object? modelBaseUrl = freezed,Object? modelArgs = freezed,Object? modelRoles = freezed,Object? taskArgs = freezed,Object? messageLimit = freezed,Object? tokenLimit = freezed,Object? timeLimit = freezed,Object? workingLimit = freezed,Object? costLimit = freezed,Object? modelCostConfig = freezed,Object? logSamples = freezed,Object? logRealtime = freezed,Object? logImages = freezed,Object? logBuffer = freezed,Object? logShared = freezed,Object? bundleDir = freezed,Object? bundleOverwrite = freezed,Object? logDirAllowDirty = freezed,Object? evalSetId = freezed,Object? evalSetOverrides = freezed,Object? taskDefaults = freezed,Object? taskFilters = freezed,Object? sampleFilters = freezed,}) { return _then(_Job( -logDir: null == logDir ? _self.logDir : logDir // ignore: cast_nullable_to_non_nullable +description: freezed == description ? _self.description : description // ignore: cast_nullable_to_non_nullable +as String?,imagePrefix: freezed == imagePrefix ? _self.imagePrefix : imagePrefix // ignore: cast_nullable_to_non_nullable +as String?,logDir: null == logDir ? _self.logDir : logDir // ignore: cast_nullable_to_non_nullable as String,sandboxType: null == sandboxType ? _self.sandboxType : sandboxType // ignore: cast_nullable_to_non_nullable as String,maxConnections: null == maxConnections ? _self.maxConnections : maxConnections // ignore: cast_nullable_to_non_nullable as int,models: freezed == models ? _self._models : models // ignore: cast_nullable_to_non_nullable @@ -687,11 +739,37 @@ as bool?,logDirAllowDirty: freezed == logDirAllowDirty ? _self.logDirAllowDirty as bool?,evalSetId: freezed == evalSetId ? _self.evalSetId : evalSetId // ignore: cast_nullable_to_non_nullable as String?,evalSetOverrides: freezed == evalSetOverrides ? _self._evalSetOverrides : evalSetOverrides // ignore: cast_nullable_to_non_nullable as Map?,taskDefaults: freezed == taskDefaults ? 
_self._taskDefaults : taskDefaults // ignore: cast_nullable_to_non_nullable -as Map?, +as Map?,taskFilters: freezed == taskFilters ? _self.taskFilters : taskFilters // ignore: cast_nullable_to_non_nullable +as TagFilter?,sampleFilters: freezed == sampleFilters ? _self.sampleFilters : sampleFilters // ignore: cast_nullable_to_non_nullable +as TagFilter?, )); } - +/// Create a copy of Job +/// with the given fields replaced by the non-null parameter values. +@override +@pragma('vm:prefer-inline') +$TagFilterCopyWith<$Res>? get taskFilters { + if (_self.taskFilters == null) { + return null; + } + + return $TagFilterCopyWith<$Res>(_self.taskFilters!, (value) { + return _then(_self.copyWith(taskFilters: value)); + }); +}/// Create a copy of Job +/// with the given fields replaced by the non-null parameter values. +@override +@pragma('vm:prefer-inline') +$TagFilterCopyWith<$Res>? get sampleFilters { + if (_self.sampleFilters == null) { + return null; + } + + return $TagFilterCopyWith<$Res>(_self.sampleFilters!, (value) { + return _then(_self.copyWith(sampleFilters: value)); + }); +} } @@ -702,7 +780,8 @@ mixin _$JobTask { String get id;/// Only run these sample IDs. Mutually exclusive with [excludeSamples]. @JsonKey(name: 'include_samples') List? get includeSamples;/// Exclude these sample IDs. Mutually exclusive with [includeSamples]. @JsonKey(name: 'exclude_samples') List? get excludeSamples;/// Override system message for this task. -@JsonKey(name: 'system_message') String? get systemMessage; +@JsonKey(name: 'system_message') String? get systemMessage;/// Per-task argument overrides passed to the task function. +@JsonKey(name: 'args') Map? get args; /// Create a copy of JobTask /// with the given fields replaced by the non-null parameter values. 
@JsonKey(includeFromJson: false, includeToJson: false) @@ -715,16 +794,16 @@ $JobTaskCopyWith get copyWith => _$JobTaskCopyWithImpl(this as @override bool operator ==(Object other) { - return identical(this, other) || (other.runtimeType == runtimeType&&other is JobTask&&(identical(other.id, id) || other.id == id)&&const DeepCollectionEquality().equals(other.includeSamples, includeSamples)&&const DeepCollectionEquality().equals(other.excludeSamples, excludeSamples)&&(identical(other.systemMessage, systemMessage) || other.systemMessage == systemMessage)); + return identical(this, other) || (other.runtimeType == runtimeType&&other is JobTask&&(identical(other.id, id) || other.id == id)&&const DeepCollectionEquality().equals(other.includeSamples, includeSamples)&&const DeepCollectionEquality().equals(other.excludeSamples, excludeSamples)&&(identical(other.systemMessage, systemMessage) || other.systemMessage == systemMessage)&&const DeepCollectionEquality().equals(other.args, args)); } @JsonKey(includeFromJson: false, includeToJson: false) @override -int get hashCode => Object.hash(runtimeType,id,const DeepCollectionEquality().hash(includeSamples),const DeepCollectionEquality().hash(excludeSamples),systemMessage); +int get hashCode => Object.hash(runtimeType,id,const DeepCollectionEquality().hash(includeSamples),const DeepCollectionEquality().hash(excludeSamples),systemMessage,const DeepCollectionEquality().hash(args)); @override String toString() { - return 'JobTask(id: $id, includeSamples: $includeSamples, excludeSamples: $excludeSamples, systemMessage: $systemMessage)'; + return 'JobTask(id: $id, includeSamples: $includeSamples, excludeSamples: $excludeSamples, systemMessage: $systemMessage, args: $args)'; } @@ -735,7 +814,7 @@ abstract mixin class $JobTaskCopyWith<$Res> { factory $JobTaskCopyWith(JobTask value, $Res Function(JobTask) _then) = _$JobTaskCopyWithImpl; @useResult $Res call({ - String id,@JsonKey(name: 'include_samples') List? 
includeSamples,@JsonKey(name: 'exclude_samples') List? excludeSamples,@JsonKey(name: 'system_message') String? systemMessage + String id,@JsonKey(name: 'include_samples') List? includeSamples,@JsonKey(name: 'exclude_samples') List? excludeSamples,@JsonKey(name: 'system_message') String? systemMessage,@JsonKey(name: 'args') Map? args }); @@ -752,13 +831,14 @@ class _$JobTaskCopyWithImpl<$Res> /// Create a copy of JobTask /// with the given fields replaced by the non-null parameter values. -@pragma('vm:prefer-inline') @override $Res call({Object? id = null,Object? includeSamples = freezed,Object? excludeSamples = freezed,Object? systemMessage = freezed,}) { +@pragma('vm:prefer-inline') @override $Res call({Object? id = null,Object? includeSamples = freezed,Object? excludeSamples = freezed,Object? systemMessage = freezed,Object? args = freezed,}) { return _then(_self.copyWith( id: null == id ? _self.id : id // ignore: cast_nullable_to_non_nullable as String,includeSamples: freezed == includeSamples ? _self.includeSamples : includeSamples // ignore: cast_nullable_to_non_nullable as List?,excludeSamples: freezed == excludeSamples ? _self.excludeSamples : excludeSamples // ignore: cast_nullable_to_non_nullable as List?,systemMessage: freezed == systemMessage ? _self.systemMessage : systemMessage // ignore: cast_nullable_to_non_nullable -as String?, +as String?,args: freezed == args ? _self.args : args // ignore: cast_nullable_to_non_nullable +as Map?, )); } @@ -840,10 +920,10 @@ return $default(_that);case _: /// } /// ``` -@optionalTypeArgs TResult maybeWhen(TResult Function( String id, @JsonKey(name: 'include_samples') List? includeSamples, @JsonKey(name: 'exclude_samples') List? excludeSamples, @JsonKey(name: 'system_message') String? systemMessage)? $default,{required TResult orElse(),}) {final _that = this; +@optionalTypeArgs TResult maybeWhen(TResult Function( String id, @JsonKey(name: 'include_samples') List? includeSamples, @JsonKey(name: 'exclude_samples') List? 
excludeSamples, @JsonKey(name: 'system_message') String? systemMessage, @JsonKey(name: 'args') Map? args)? $default,{required TResult orElse(),}) {final _that = this; switch (_that) { case _JobTask() when $default != null: -return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemMessage);case _: +return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemMessage,_that.args);case _: return orElse(); } @@ -861,10 +941,10 @@ return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemM /// } /// ``` -@optionalTypeArgs TResult when(TResult Function( String id, @JsonKey(name: 'include_samples') List? includeSamples, @JsonKey(name: 'exclude_samples') List? excludeSamples, @JsonKey(name: 'system_message') String? systemMessage) $default,) {final _that = this; +@optionalTypeArgs TResult when(TResult Function( String id, @JsonKey(name: 'include_samples') List? includeSamples, @JsonKey(name: 'exclude_samples') List? excludeSamples, @JsonKey(name: 'system_message') String? systemMessage, @JsonKey(name: 'args') Map? args) $default,) {final _that = this; switch (_that) { case _JobTask(): -return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemMessage);} +return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemMessage,_that.args);} } /// A variant of `when` that fallback to returning `null` /// @@ -878,10 +958,10 @@ return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemM /// } /// ``` -@optionalTypeArgs TResult? whenOrNull(TResult? Function( String id, @JsonKey(name: 'include_samples') List? includeSamples, @JsonKey(name: 'exclude_samples') List? excludeSamples, @JsonKey(name: 'system_message') String? systemMessage)? $default,) {final _that = this; +@optionalTypeArgs TResult? whenOrNull(TResult? Function( String id, @JsonKey(name: 'include_samples') List? includeSamples, @JsonKey(name: 'exclude_samples') List? 
excludeSamples, @JsonKey(name: 'system_message') String? systemMessage, @JsonKey(name: 'args') Map? args)? $default,) {final _that = this; switch (_that) { case _JobTask() when $default != null: -return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemMessage);case _: +return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemMessage,_that.args);case _: return null; } @@ -893,7 +973,7 @@ return $default(_that.id,_that.includeSamples,_that.excludeSamples,_that.systemM @JsonSerializable() class _JobTask implements JobTask { - const _JobTask({required this.id, @JsonKey(name: 'include_samples') final List? includeSamples, @JsonKey(name: 'exclude_samples') final List? excludeSamples, @JsonKey(name: 'system_message') this.systemMessage}): _includeSamples = includeSamples,_excludeSamples = excludeSamples; + const _JobTask({required this.id, @JsonKey(name: 'include_samples') final List? includeSamples, @JsonKey(name: 'exclude_samples') final List? excludeSamples, @JsonKey(name: 'system_message') this.systemMessage, @JsonKey(name: 'args') final Map? args}): _includeSamples = includeSamples,_excludeSamples = excludeSamples,_args = args; factory _JobTask.fromJson(Map json) => _$JobTaskFromJson(json); /// Task identifier matching a task directory name in `tasks/`. @@ -922,6 +1002,17 @@ class _JobTask implements JobTask { /// Override system message for this task. @override@JsonKey(name: 'system_message') final String? systemMessage; +/// Per-task argument overrides passed to the task function. + final Map? _args; +/// Per-task argument overrides passed to the task function. +@override@JsonKey(name: 'args') Map? get args { + final value = _args; + if (value == null) return null; + if (_args is EqualUnmodifiableMapView) return _args; + // ignore: implicit_dynamic_type + return EqualUnmodifiableMapView(value); +} + /// Create a copy of JobTask /// with the given fields replaced by the non-null parameter values. 
@@ -936,16 +1027,16 @@ Map toJson() { @override bool operator ==(Object other) { - return identical(this, other) || (other.runtimeType == runtimeType&&other is _JobTask&&(identical(other.id, id) || other.id == id)&&const DeepCollectionEquality().equals(other._includeSamples, _includeSamples)&&const DeepCollectionEquality().equals(other._excludeSamples, _excludeSamples)&&(identical(other.systemMessage, systemMessage) || other.systemMessage == systemMessage)); + return identical(this, other) || (other.runtimeType == runtimeType&&other is _JobTask&&(identical(other.id, id) || other.id == id)&&const DeepCollectionEquality().equals(other._includeSamples, _includeSamples)&&const DeepCollectionEquality().equals(other._excludeSamples, _excludeSamples)&&(identical(other.systemMessage, systemMessage) || other.systemMessage == systemMessage)&&const DeepCollectionEquality().equals(other._args, _args)); } @JsonKey(includeFromJson: false, includeToJson: false) @override -int get hashCode => Object.hash(runtimeType,id,const DeepCollectionEquality().hash(_includeSamples),const DeepCollectionEquality().hash(_excludeSamples),systemMessage); +int get hashCode => Object.hash(runtimeType,id,const DeepCollectionEquality().hash(_includeSamples),const DeepCollectionEquality().hash(_excludeSamples),systemMessage,const DeepCollectionEquality().hash(_args)); @override String toString() { - return 'JobTask(id: $id, includeSamples: $includeSamples, excludeSamples: $excludeSamples, systemMessage: $systemMessage)'; + return 'JobTask(id: $id, includeSamples: $includeSamples, excludeSamples: $excludeSamples, systemMessage: $systemMessage, args: $args)'; } @@ -956,7 +1047,7 @@ abstract mixin class _$JobTaskCopyWith<$Res> implements $JobTaskCopyWith<$Res> { factory _$JobTaskCopyWith(_JobTask value, $Res Function(_JobTask) _then) = __$JobTaskCopyWithImpl; @override @useResult $Res call({ - String id,@JsonKey(name: 'include_samples') List? includeSamples,@JsonKey(name: 'exclude_samples') List? 
excludeSamples,@JsonKey(name: 'system_message') String? systemMessage + String id,@JsonKey(name: 'include_samples') List? includeSamples,@JsonKey(name: 'exclude_samples') List? excludeSamples,@JsonKey(name: 'system_message') String? systemMessage,@JsonKey(name: 'args') Map? args }); @@ -973,13 +1064,14 @@ class __$JobTaskCopyWithImpl<$Res> /// Create a copy of JobTask /// with the given fields replaced by the non-null parameter values. -@override @pragma('vm:prefer-inline') $Res call({Object? id = null,Object? includeSamples = freezed,Object? excludeSamples = freezed,Object? systemMessage = freezed,}) { +@override @pragma('vm:prefer-inline') $Res call({Object? id = null,Object? includeSamples = freezed,Object? excludeSamples = freezed,Object? systemMessage = freezed,Object? args = freezed,}) { return _then(_JobTask( id: null == id ? _self.id : id // ignore: cast_nullable_to_non_nullable as String,includeSamples: freezed == includeSamples ? _self._includeSamples : includeSamples // ignore: cast_nullable_to_non_nullable as List?,excludeSamples: freezed == excludeSamples ? _self._excludeSamples : excludeSamples // ignore: cast_nullable_to_non_nullable as List?,systemMessage: freezed == systemMessage ? _self.systemMessage : systemMessage // ignore: cast_nullable_to_non_nullable -as String?, +as String?,args: freezed == args ? 
_self._args : args // ignore: cast_nullable_to_non_nullable +as Map?, )); } diff --git a/packages/dataset_config_dart/lib/src/models/job.g.dart b/packages/dataset_config_dart/lib/src/models/job.g.dart index f62e5b3..a3abef1 100644 --- a/packages/dataset_config_dart/lib/src/models/job.g.dart +++ b/packages/dataset_config_dart/lib/src/models/job.g.dart @@ -7,6 +7,8 @@ part of 'job.dart'; // ************************************************************************** _Job _$JobFromJson(Map json) => _Job( + description: json['description'] as String?, + imagePrefix: json['image_prefix'] as String?, logDir: json['log_dir'] as String, sandboxType: json['sandbox_type'] as String? ?? 'local', maxConnections: (json['max_connections'] as num?)?.toInt() ?? 10, @@ -72,16 +74,24 @@ _Job _$JobFromJson(Map json) => _Job( evalSetId: json['eval_set_id'] as String?, evalSetOverrides: json['eval_set_overrides'] as Map?, taskDefaults: json['task_defaults'] as Map?, + taskFilters: json['task_filters'] == null + ? null + : TagFilter.fromJson(json['task_filters'] as Map), + sampleFilters: json['sample_filters'] == null + ? 
null + : TagFilter.fromJson(json['sample_filters'] as Map), ); Map _$JobToJson(_Job instance) => { + 'description': instance.description, + 'image_prefix': instance.imagePrefix, 'log_dir': instance.logDir, 'sandbox_type': instance.sandboxType, 'max_connections': instance.maxConnections, 'models': instance.models, 'variants': instance.variants, 'task_paths': instance.taskPaths, - 'tasks': instance.tasks?.map((k, e) => MapEntry(k, e.toJson())), + 'tasks': instance.tasks, 'save_examples': instance.saveExamples, 'retry_attempts': instance.retryAttempts, 'max_retries': instance.maxRetries, @@ -132,6 +142,8 @@ Map _$JobToJson(_Job instance) => { 'eval_set_id': instance.evalSetId, 'eval_set_overrides': instance.evalSetOverrides, 'task_defaults': instance.taskDefaults, + 'task_filters': instance.taskFilters, + 'sample_filters': instance.sampleFilters, }; _JobTask _$JobTaskFromJson(Map json) => _JobTask( @@ -143,6 +155,7 @@ _JobTask _$JobTaskFromJson(Map json) => _JobTask( ?.map((e) => e as String) .toList(), systemMessage: json['system_message'] as String?, + args: json['args'] as Map?, ); Map _$JobTaskToJson(_JobTask instance) => { @@ -150,4 +163,5 @@ Map _$JobTaskToJson(_JobTask instance) => { 'include_samples': instance.includeSamples, 'exclude_samples': instance.excludeSamples, 'system_message': instance.systemMessage, + 'args': instance.args, }; diff --git a/packages/dataset_config_dart/lib/src/models/models.dart b/packages/dataset_config_dart/lib/src/models/models.dart index 5b590fb..4fba25c 100644 --- a/packages/dataset_config_dart/lib/src/models/models.dart +++ b/packages/dataset_config_dart/lib/src/models/models.dart @@ -1,6 +1,7 @@ // Config models (eval runner input configuration) export 'context_file.dart'; export 'job.dart'; +export 'tag_filter.dart'; export 'variant.dart'; // Inspect AI models (mirrors the Python Inspect AI API types) diff --git a/packages/dataset_config_dart/lib/src/models/tag_filter.dart 
b/packages/dataset_config_dart/lib/src/models/tag_filter.dart new file mode 100644 index 0000000..f5a4ec1 --- /dev/null +++ b/packages/dataset_config_dart/lib/src/models/tag_filter.dart @@ -0,0 +1,33 @@ +import 'package:freezed_annotation/freezed_annotation.dart'; + +part 'tag_filter.freezed.dart'; +part 'tag_filter.g.dart'; + +/// Tag-based filter for including/excluding items by their tags. +@freezed +sealed class TagFilter with _$TagFilter { + const factory TagFilter({ + @JsonKey(name: 'include_tags') List? includeTags, + @JsonKey(name: 'exclude_tags') List? excludeTags, + }) = _TagFilter; + + factory TagFilter.fromJson(Map json) => + _$TagFilterFromJson(json); +} + +/// Check whether a set of [itemTags] matches the given [filter]. +/// +/// Returns `true` if: +/// - All include_tags (if any) are present in [itemTags] +/// - No exclude_tags (if any) are present in [itemTags] +bool matchesTagFilter(List itemTags, TagFilter filter) { + if (filter.includeTags != null && + !filter.includeTags!.every((t) => itemTags.contains(t))) { + return false; + } + if (filter.excludeTags != null && + filter.excludeTags!.any((t) => itemTags.contains(t))) { + return false; + } + return true; +} diff --git a/packages/dataset_config_dart/lib/src/models/tag_filter.freezed.dart b/packages/dataset_config_dart/lib/src/models/tag_filter.freezed.dart new file mode 100644 index 0000000..5df78eb --- /dev/null +++ b/packages/dataset_config_dart/lib/src/models/tag_filter.freezed.dart @@ -0,0 +1,290 @@ +// GENERATED CODE - DO NOT MODIFY BY HAND +// coverage:ignore-file +// ignore_for_file: type=lint +// ignore_for_file: unused_element, deprecated_member_use, deprecated_member_use_from_same_package, use_function_type_syntax_for_parameters, unnecessary_const, avoid_init_to_null, invalid_override_different_default_values_named, prefer_expression_function_bodies, annotate_overrides, invalid_annotation_target, unnecessary_question_mark + +part of 'tag_filter.dart'; + +// 
************************************************************************** +// FreezedGenerator +// ************************************************************************** + +// dart format off +T _$identity(T value) => value; + +/// @nodoc +mixin _$TagFilter { + +@JsonKey(name: 'include_tags') List? get includeTags;@JsonKey(name: 'exclude_tags') List? get excludeTags; +/// Create a copy of TagFilter +/// with the given fields replaced by the non-null parameter values. +@JsonKey(includeFromJson: false, includeToJson: false) +@pragma('vm:prefer-inline') +$TagFilterCopyWith get copyWith => _$TagFilterCopyWithImpl(this as TagFilter, _$identity); + + /// Serializes this TagFilter to a JSON map. + Map toJson(); + + +@override +bool operator ==(Object other) { + return identical(this, other) || (other.runtimeType == runtimeType&&other is TagFilter&&const DeepCollectionEquality().equals(other.includeTags, includeTags)&&const DeepCollectionEquality().equals(other.excludeTags, excludeTags)); +} + +@JsonKey(includeFromJson: false, includeToJson: false) +@override +int get hashCode => Object.hash(runtimeType,const DeepCollectionEquality().hash(includeTags),const DeepCollectionEquality().hash(excludeTags)); + +@override +String toString() { + return 'TagFilter(includeTags: $includeTags, excludeTags: $excludeTags)'; +} + + +} + +/// @nodoc +abstract mixin class $TagFilterCopyWith<$Res> { + factory $TagFilterCopyWith(TagFilter value, $Res Function(TagFilter) _then) = _$TagFilterCopyWithImpl; +@useResult +$Res call({ +@JsonKey(name: 'include_tags') List? includeTags,@JsonKey(name: 'exclude_tags') List? excludeTags +}); + + + + +} +/// @nodoc +class _$TagFilterCopyWithImpl<$Res> + implements $TagFilterCopyWith<$Res> { + _$TagFilterCopyWithImpl(this._self, this._then); + + final TagFilter _self; + final $Res Function(TagFilter) _then; + +/// Create a copy of TagFilter +/// with the given fields replaced by the non-null parameter values. 
+@pragma('vm:prefer-inline') @override $Res call({Object? includeTags = freezed,Object? excludeTags = freezed,}) { + return _then(_self.copyWith( +includeTags: freezed == includeTags ? _self.includeTags : includeTags // ignore: cast_nullable_to_non_nullable +as List?,excludeTags: freezed == excludeTags ? _self.excludeTags : excludeTags // ignore: cast_nullable_to_non_nullable +as List?, + )); +} + +} + + +/// Adds pattern-matching-related methods to [TagFilter]. +extension TagFilterPatterns on TagFilter { +/// A variant of `map` that fallback to returning `orElse`. +/// +/// It is equivalent to doing: +/// ```dart +/// switch (sealedClass) { +/// case final Subclass value: +/// return ...; +/// case _: +/// return orElse(); +/// } +/// ``` + +@optionalTypeArgs TResult maybeMap(TResult Function( _TagFilter value)? $default,{required TResult orElse(),}){ +final _that = this; +switch (_that) { +case _TagFilter() when $default != null: +return $default(_that);case _: + return orElse(); + +} +} +/// A `switch`-like method, using callbacks. +/// +/// Callbacks receives the raw object, upcasted. +/// It is equivalent to doing: +/// ```dart +/// switch (sealedClass) { +/// case final Subclass value: +/// return ...; +/// case final Subclass2 value: +/// return ...; +/// } +/// ``` + +@optionalTypeArgs TResult map(TResult Function( _TagFilter value) $default,){ +final _that = this; +switch (_that) { +case _TagFilter(): +return $default(_that);} +} +/// A variant of `map` that fallback to returning `null`. +/// +/// It is equivalent to doing: +/// ```dart +/// switch (sealedClass) { +/// case final Subclass value: +/// return ...; +/// case _: +/// return null; +/// } +/// ``` + +@optionalTypeArgs TResult? mapOrNull(TResult? Function( _TagFilter value)? $default,){ +final _that = this; +switch (_that) { +case _TagFilter() when $default != null: +return $default(_that);case _: + return null; + +} +} +/// A variant of `when` that fallback to an `orElse` callback. 
+/// +/// It is equivalent to doing: +/// ```dart +/// switch (sealedClass) { +/// case Subclass(:final field): +/// return ...; +/// case _: +/// return orElse(); +/// } +/// ``` + +@optionalTypeArgs TResult maybeWhen(TResult Function(@JsonKey(name: 'include_tags') List? includeTags, @JsonKey(name: 'exclude_tags') List? excludeTags)? $default,{required TResult orElse(),}) {final _that = this; +switch (_that) { +case _TagFilter() when $default != null: +return $default(_that.includeTags,_that.excludeTags);case _: + return orElse(); + +} +} +/// A `switch`-like method, using callbacks. +/// +/// As opposed to `map`, this offers destructuring. +/// It is equivalent to doing: +/// ```dart +/// switch (sealedClass) { +/// case Subclass(:final field): +/// return ...; +/// case Subclass2(:final field2): +/// return ...; +/// } +/// ``` + +@optionalTypeArgs TResult when(TResult Function(@JsonKey(name: 'include_tags') List? includeTags, @JsonKey(name: 'exclude_tags') List? excludeTags) $default,) {final _that = this; +switch (_that) { +case _TagFilter(): +return $default(_that.includeTags,_that.excludeTags);} +} +/// A variant of `when` that fallback to returning `null` +/// +/// It is equivalent to doing: +/// ```dart +/// switch (sealedClass) { +/// case Subclass(:final field): +/// return ...; +/// case _: +/// return null; +/// } +/// ``` + +@optionalTypeArgs TResult? whenOrNull(TResult? Function(@JsonKey(name: 'include_tags') List? includeTags, @JsonKey(name: 'exclude_tags') List? excludeTags)? $default,) {final _that = this; +switch (_that) { +case _TagFilter() when $default != null: +return $default(_that.includeTags,_that.excludeTags);case _: + return null; + +} +} + +} + +/// @nodoc +@JsonSerializable() + +class _TagFilter implements TagFilter { + const _TagFilter({@JsonKey(name: 'include_tags') final List? includeTags, @JsonKey(name: 'exclude_tags') final List? 
excludeTags}): _includeTags = includeTags,_excludeTags = excludeTags; + factory _TagFilter.fromJson(Map json) => _$TagFilterFromJson(json); + + final List? _includeTags; +@override@JsonKey(name: 'include_tags') List? get includeTags { + final value = _includeTags; + if (value == null) return null; + if (_includeTags is EqualUnmodifiableListView) return _includeTags; + // ignore: implicit_dynamic_type + return EqualUnmodifiableListView(value); +} + + final List? _excludeTags; +@override@JsonKey(name: 'exclude_tags') List? get excludeTags { + final value = _excludeTags; + if (value == null) return null; + if (_excludeTags is EqualUnmodifiableListView) return _excludeTags; + // ignore: implicit_dynamic_type + return EqualUnmodifiableListView(value); +} + + +/// Create a copy of TagFilter +/// with the given fields replaced by the non-null parameter values. +@override @JsonKey(includeFromJson: false, includeToJson: false) +@pragma('vm:prefer-inline') +_$TagFilterCopyWith<_TagFilter> get copyWith => __$TagFilterCopyWithImpl<_TagFilter>(this, _$identity); + +@override +Map toJson() { + return _$TagFilterToJson(this, ); +} + +@override +bool operator ==(Object other) { + return identical(this, other) || (other.runtimeType == runtimeType&&other is _TagFilter&&const DeepCollectionEquality().equals(other._includeTags, _includeTags)&&const DeepCollectionEquality().equals(other._excludeTags, _excludeTags)); +} + +@JsonKey(includeFromJson: false, includeToJson: false) +@override +int get hashCode => Object.hash(runtimeType,const DeepCollectionEquality().hash(_includeTags),const DeepCollectionEquality().hash(_excludeTags)); + +@override +String toString() { + return 'TagFilter(includeTags: $includeTags, excludeTags: $excludeTags)'; +} + + +} + +/// @nodoc +abstract mixin class _$TagFilterCopyWith<$Res> implements $TagFilterCopyWith<$Res> { + factory _$TagFilterCopyWith(_TagFilter value, $Res Function(_TagFilter) _then) = __$TagFilterCopyWithImpl; +@override @useResult +$Res 
call({ +@JsonKey(name: 'include_tags') List? includeTags,@JsonKey(name: 'exclude_tags') List? excludeTags +}); + + + + +} +/// @nodoc +class __$TagFilterCopyWithImpl<$Res> + implements _$TagFilterCopyWith<$Res> { + __$TagFilterCopyWithImpl(this._self, this._then); + + final _TagFilter _self; + final $Res Function(_TagFilter) _then; + +/// Create a copy of TagFilter +/// with the given fields replaced by the non-null parameter values. +@override @pragma('vm:prefer-inline') $Res call({Object? includeTags = freezed,Object? excludeTags = freezed,}) { + return _then(_TagFilter( +includeTags: freezed == includeTags ? _self._includeTags : includeTags // ignore: cast_nullable_to_non_nullable +as List?,excludeTags: freezed == excludeTags ? _self._excludeTags : excludeTags // ignore: cast_nullable_to_non_nullable +as List?, + )); +} + + +} + +// dart format on diff --git a/packages/dataset_config_dart/lib/src/models/tag_filter.g.dart b/packages/dataset_config_dart/lib/src/models/tag_filter.g.dart new file mode 100644 index 0000000..db8553c --- /dev/null +++ b/packages/dataset_config_dart/lib/src/models/tag_filter.g.dart @@ -0,0 +1,22 @@ +// GENERATED CODE - DO NOT MODIFY BY HAND + +part of 'tag_filter.dart'; + +// ************************************************************************** +// JsonSerializableGenerator +// ************************************************************************** + +_TagFilter _$TagFilterFromJson(Map json) => _TagFilter( + includeTags: (json['include_tags'] as List?) + ?.map((e) => e as String) + .toList(), + excludeTags: (json['exclude_tags'] as List?) 
+ ?.map((e) => e as String) + .toList(), +); + +Map _$TagFilterToJson(_TagFilter instance) => + { + 'include_tags': instance.includeTags, + 'exclude_tags': instance.excludeTags, + }; diff --git a/packages/dataset_config_dart/lib/src/models/task.dart b/packages/dataset_config_dart/lib/src/models/task.dart index ccb568b..19e4f02 100644 --- a/packages/dataset_config_dart/lib/src/models/task.dart +++ b/packages/dataset_config_dart/lib/src/models/task.dart @@ -95,7 +95,13 @@ sealed class Task with _$Task { /// `@task` function (e.g. `"flutter_code_gen"` or /// `"dash_evals.runner.tasks.flutter_code_gen"`). /// When absent, the runner hydrates directly from JSON (Mode 2 — future). - @JsonKey(name: 'task_func') String? taskFunc, + @JsonKey(name: 'func') String? func, + + /// System message override for this task. + @JsonKey(name: 'system_message') String? systemMessage, + + /// Pass-through dict for sandbox plugin configuration. + @JsonKey(name: 'sandbox_parameters') Map? sandboxParameters, /// Task name. /// @@ -113,14 +119,14 @@ sealed class Task with _$Task { } class TaskMetadata { - final String taskFunc; + final String func; final Map additional; - TaskMetadata(this.taskFunc, this.additional); + TaskMetadata(this.func, this.additional); Map toJson() { return { - 'taskFunc': taskFunc, + 'func': func, }; } } diff --git a/packages/dataset_config_dart/lib/src/models/task.freezed.dart b/packages/dataset_config_dart/lib/src/models/task.freezed.dart index 94a4a37..b38f7d9 100644 --- a/packages/dataset_config_dart/lib/src/models/task.freezed.dart +++ b/packages/dataset_config_dart/lib/src/models/task.freezed.dart @@ -50,14 +50,15 @@ mixin _$Task { @JsonKey(name: 'early_stopping') Object? get earlyStopping;/// Task display name (e.g. for plotting). /// /// Defaults to the registered task name. -@JsonKey(name: 'display_name') String? get displayName; -/// Task function identifier for Mode 1 hydration. +@JsonKey(name: 'display_name') String? 
get displayName;/// Task function identifier for Mode 1 hydration. /// /// When present, the Python runner uses this to look up a pre-built /// `@task` function (e.g. `"flutter_code_gen"` or /// `"dash_evals.runner.tasks.flutter_code_gen"`). /// When absent, the runner hydrates directly from JSON (Mode 2 — future). -@JsonKey(name: 'task_func') String? get taskFunc;/// Task name. +@JsonKey(name: 'func') String? get func;/// System message override for this task. +@JsonKey(name: 'system_message') String? get systemMessage;/// Pass-through dict for sandbox plugin configuration. +@JsonKey(name: 'sandbox_parameters') Map? get sandboxParameters;/// Task name. /// /// Automatically determined based on the registered name if not specified. String? get name;/// Version of task (to distinguish evolutions of the task spec). @@ -75,16 +76,16 @@ $TaskCopyWith get copyWith => _$TaskCopyWithImpl(this as Task, _$ide @override bool operator ==(Object other) { - return identical(this, other) || (other.runtimeType == runtimeType&&other is Task&&(identical(other.dataset, dataset) || other.dataset == dataset)&&const DeepCollectionEquality().equals(other.setup, setup)&&const DeepCollectionEquality().equals(other.solver, solver)&&const DeepCollectionEquality().equals(other.cleanup, cleanup)&&const DeepCollectionEquality().equals(other.scorer, scorer)&&const DeepCollectionEquality().equals(other.metrics, metrics)&&(identical(other.model, model) || other.model == model)&&const DeepCollectionEquality().equals(other.config, config)&&const DeepCollectionEquality().equals(other.modelRoles, modelRoles)&&const DeepCollectionEquality().equals(other.sandbox, sandbox)&&const DeepCollectionEquality().equals(other.approval, approval)&&const DeepCollectionEquality().equals(other.epochs, epochs)&&const DeepCollectionEquality().equals(other.failOnError, failOnError)&&(identical(other.continueOnFail, continueOnFail) || other.continueOnFail == continueOnFail)&&(identical(other.messageLimit, messageLimit) 
|| other.messageLimit == messageLimit)&&(identical(other.tokenLimit, tokenLimit) || other.tokenLimit == tokenLimit)&&(identical(other.timeLimit, timeLimit) || other.timeLimit == timeLimit)&&(identical(other.workingLimit, workingLimit) || other.workingLimit == workingLimit)&&(identical(other.costLimit, costLimit) || other.costLimit == costLimit)&&const DeepCollectionEquality().equals(other.earlyStopping, earlyStopping)&&(identical(other.displayName, displayName) || other.displayName == displayName)&&(identical(other.taskFunc, taskFunc) || other.taskFunc == taskFunc)&&(identical(other.name, name) || other.name == name)&&const DeepCollectionEquality().equals(other.version, version)&&const DeepCollectionEquality().equals(other.metadata, metadata)); + return identical(this, other) || (other.runtimeType == runtimeType&&other is Task&&(identical(other.dataset, dataset) || other.dataset == dataset)&&const DeepCollectionEquality().equals(other.setup, setup)&&const DeepCollectionEquality().equals(other.solver, solver)&&const DeepCollectionEquality().equals(other.cleanup, cleanup)&&const DeepCollectionEquality().equals(other.scorer, scorer)&&const DeepCollectionEquality().equals(other.metrics, metrics)&&(identical(other.model, model) || other.model == model)&&const DeepCollectionEquality().equals(other.config, config)&&const DeepCollectionEquality().equals(other.modelRoles, modelRoles)&&const DeepCollectionEquality().equals(other.sandbox, sandbox)&&const DeepCollectionEquality().equals(other.approval, approval)&&const DeepCollectionEquality().equals(other.epochs, epochs)&&const DeepCollectionEquality().equals(other.failOnError, failOnError)&&(identical(other.continueOnFail, continueOnFail) || other.continueOnFail == continueOnFail)&&(identical(other.messageLimit, messageLimit) || other.messageLimit == messageLimit)&&(identical(other.tokenLimit, tokenLimit) || other.tokenLimit == tokenLimit)&&(identical(other.timeLimit, timeLimit) || other.timeLimit == 
timeLimit)&&(identical(other.workingLimit, workingLimit) || other.workingLimit == workingLimit)&&(identical(other.costLimit, costLimit) || other.costLimit == costLimit)&&const DeepCollectionEquality().equals(other.earlyStopping, earlyStopping)&&(identical(other.displayName, displayName) || other.displayName == displayName)&&(identical(other.func, func) || other.func == func)&&(identical(other.systemMessage, systemMessage) || other.systemMessage == systemMessage)&&const DeepCollectionEquality().equals(other.sandboxParameters, sandboxParameters)&&(identical(other.name, name) || other.name == name)&&const DeepCollectionEquality().equals(other.version, version)&&const DeepCollectionEquality().equals(other.metadata, metadata)); } @JsonKey(includeFromJson: false, includeToJson: false) @override -int get hashCode => Object.hashAll([runtimeType,dataset,const DeepCollectionEquality().hash(setup),const DeepCollectionEquality().hash(solver),const DeepCollectionEquality().hash(cleanup),const DeepCollectionEquality().hash(scorer),const DeepCollectionEquality().hash(metrics),model,const DeepCollectionEquality().hash(config),const DeepCollectionEquality().hash(modelRoles),const DeepCollectionEquality().hash(sandbox),const DeepCollectionEquality().hash(approval),const DeepCollectionEquality().hash(epochs),const DeepCollectionEquality().hash(failOnError),continueOnFail,messageLimit,tokenLimit,timeLimit,workingLimit,costLimit,const DeepCollectionEquality().hash(earlyStopping),displayName,taskFunc,name,const DeepCollectionEquality().hash(version),const DeepCollectionEquality().hash(metadata)]); +int get hashCode => Object.hashAll([runtimeType,dataset,const DeepCollectionEquality().hash(setup),const DeepCollectionEquality().hash(solver),const DeepCollectionEquality().hash(cleanup),const DeepCollectionEquality().hash(scorer),const DeepCollectionEquality().hash(metrics),model,const DeepCollectionEquality().hash(config),const DeepCollectionEquality().hash(modelRoles),const 
DeepCollectionEquality().hash(sandbox),const DeepCollectionEquality().hash(approval),const DeepCollectionEquality().hash(epochs),const DeepCollectionEquality().hash(failOnError),continueOnFail,messageLimit,tokenLimit,timeLimit,workingLimit,costLimit,const DeepCollectionEquality().hash(earlyStopping),displayName,func,systemMessage,const DeepCollectionEquality().hash(sandboxParameters),name,const DeepCollectionEquality().hash(version),const DeepCollectionEquality().hash(metadata)]); @override String toString() { - return 'Task(dataset: $dataset, setup: $setup, solver: $solver, cleanup: $cleanup, scorer: $scorer, metrics: $metrics, model: $model, config: $config, modelRoles: $modelRoles, sandbox: $sandbox, approval: $approval, epochs: $epochs, failOnError: $failOnError, continueOnFail: $continueOnFail, messageLimit: $messageLimit, tokenLimit: $tokenLimit, timeLimit: $timeLimit, workingLimit: $workingLimit, costLimit: $costLimit, earlyStopping: $earlyStopping, displayName: $displayName, taskFunc: $taskFunc, name: $name, version: $version, metadata: $metadata)'; + return 'Task(dataset: $dataset, setup: $setup, solver: $solver, cleanup: $cleanup, scorer: $scorer, metrics: $metrics, model: $model, config: $config, modelRoles: $modelRoles, sandbox: $sandbox, approval: $approval, epochs: $epochs, failOnError: $failOnError, continueOnFail: $continueOnFail, messageLimit: $messageLimit, tokenLimit: $tokenLimit, timeLimit: $timeLimit, workingLimit: $workingLimit, costLimit: $costLimit, earlyStopping: $earlyStopping, displayName: $displayName, func: $func, systemMessage: $systemMessage, sandboxParameters: $sandboxParameters, name: $name, version: $version, metadata: $metadata)'; } @@ -95,7 +96,7 @@ abstract mixin class $TaskCopyWith<$Res> { factory $TaskCopyWith(Task value, $Res Function(Task) _then) = _$TaskCopyWithImpl; @useResult $Res call({ - Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? 
config,@JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs,@JsonKey(name: 'fail_on_error') Object? failOnError,@JsonKey(name: 'continue_on_fail') bool? continueOnFail,@JsonKey(name: 'message_limit') int? messageLimit,@JsonKey(name: 'token_limit') int? tokenLimit,@JsonKey(name: 'time_limit') int? timeLimit,@JsonKey(name: 'working_limit') int? workingLimit,@JsonKey(name: 'cost_limit') double? costLimit,@JsonKey(name: 'early_stopping') Object? earlyStopping,@JsonKey(name: 'display_name') String? displayName,@JsonKey(name: 'task_func') String? taskFunc, String? name, Object version, Map? metadata + Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config,@JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs,@JsonKey(name: 'fail_on_error') Object? failOnError,@JsonKey(name: 'continue_on_fail') bool? continueOnFail,@JsonKey(name: 'message_limit') int? messageLimit,@JsonKey(name: 'token_limit') int? tokenLimit,@JsonKey(name: 'time_limit') int? timeLimit,@JsonKey(name: 'working_limit') int? workingLimit,@JsonKey(name: 'cost_limit') double? costLimit,@JsonKey(name: 'early_stopping') Object? earlyStopping,@JsonKey(name: 'display_name') String? displayName,@JsonKey(name: 'func') String? func,@JsonKey(name: 'system_message') String? systemMessage,@JsonKey(name: 'sandbox_parameters') Map? sandboxParameters, String? name, Object version, Map? metadata }); @@ -112,7 +113,7 @@ class _$TaskCopyWithImpl<$Res> /// Create a copy of Task /// with the given fields replaced by the non-null parameter values. -@pragma('vm:prefer-inline') @override $Res call({Object? dataset = freezed,Object? setup = freezed,Object? solver = freezed,Object? cleanup = freezed,Object? scorer = freezed,Object? metrics = freezed,Object? model = freezed,Object? config = freezed,Object? modelRoles = freezed,Object? sandbox = freezed,Object? 
approval = freezed,Object? epochs = freezed,Object? failOnError = freezed,Object? continueOnFail = freezed,Object? messageLimit = freezed,Object? tokenLimit = freezed,Object? timeLimit = freezed,Object? workingLimit = freezed,Object? costLimit = freezed,Object? earlyStopping = freezed,Object? displayName = freezed,Object? taskFunc = freezed,Object? name = freezed,Object? version = null,Object? metadata = freezed,}) { +@pragma('vm:prefer-inline') @override $Res call({Object? dataset = freezed,Object? setup = freezed,Object? solver = freezed,Object? cleanup = freezed,Object? scorer = freezed,Object? metrics = freezed,Object? model = freezed,Object? config = freezed,Object? modelRoles = freezed,Object? sandbox = freezed,Object? approval = freezed,Object? epochs = freezed,Object? failOnError = freezed,Object? continueOnFail = freezed,Object? messageLimit = freezed,Object? tokenLimit = freezed,Object? timeLimit = freezed,Object? workingLimit = freezed,Object? costLimit = freezed,Object? earlyStopping = freezed,Object? displayName = freezed,Object? func = freezed,Object? systemMessage = freezed,Object? sandboxParameters = freezed,Object? name = freezed,Object? version = null,Object? metadata = freezed,}) { return _then(_self.copyWith( dataset: freezed == dataset ? _self.dataset : dataset // ignore: cast_nullable_to_non_nullable as Dataset?,setup: freezed == setup ? _self.setup : setup ,solver: freezed == solver ? _self.solver : solver ,cleanup: freezed == cleanup ? _self.cleanup : cleanup ,scorer: freezed == scorer ? _self.scorer : scorer ,metrics: freezed == metrics ? _self.metrics : metrics ,model: freezed == model ? _self.model : model // ignore: cast_nullable_to_non_nullable @@ -124,8 +125,10 @@ as int?,timeLimit: freezed == timeLimit ? _self.timeLimit : timeLimit // ignore: as int?,workingLimit: freezed == workingLimit ? _self.workingLimit : workingLimit // ignore: cast_nullable_to_non_nullable as int?,costLimit: freezed == costLimit ? 
_self.costLimit : costLimit // ignore: cast_nullable_to_non_nullable as double?,earlyStopping: freezed == earlyStopping ? _self.earlyStopping : earlyStopping ,displayName: freezed == displayName ? _self.displayName : displayName // ignore: cast_nullable_to_non_nullable -as String?,taskFunc: freezed == taskFunc ? _self.taskFunc : taskFunc // ignore: cast_nullable_to_non_nullable -as String?,name: freezed == name ? _self.name : name // ignore: cast_nullable_to_non_nullable +as String?,func: freezed == func ? _self.func : func // ignore: cast_nullable_to_non_nullable +as String?,systemMessage: freezed == systemMessage ? _self.systemMessage : systemMessage // ignore: cast_nullable_to_non_nullable +as String?,sandboxParameters: freezed == sandboxParameters ? _self.sandboxParameters : sandboxParameters // ignore: cast_nullable_to_non_nullable +as Map?,name: freezed == name ? _self.name : name // ignore: cast_nullable_to_non_nullable as String?,version: null == version ? _self.version : version ,metadata: freezed == metadata ? _self.metadata : metadata // ignore: cast_nullable_to_non_nullable as Map?, )); @@ -221,10 +224,10 @@ return $default(_that);case _: /// } /// ``` -@optionalTypeArgs TResult maybeWhen(TResult Function( Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config, @JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, @JsonKey(name: 'fail_on_error') Object? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'early_stopping') Object? earlyStopping, @JsonKey(name: 'display_name') String? displayName, @JsonKey(name: 'task_func') String? taskFunc, String? 
name, Object version, Map? metadata)? $default,{required TResult orElse(),}) {final _that = this; +@optionalTypeArgs TResult maybeWhen(TResult Function( Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config, @JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, @JsonKey(name: 'fail_on_error') Object? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'early_stopping') Object? earlyStopping, @JsonKey(name: 'display_name') String? displayName, @JsonKey(name: 'func') String? func, @JsonKey(name: 'system_message') String? systemMessage, @JsonKey(name: 'sandbox_parameters') Map? sandboxParameters, String? name, Object version, Map? metadata)? 
$default,{required TResult orElse(),}) {final _that = this; switch (_that) { case _Task() when $default != null: -return $default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.scorer,_that.metrics,_that.model,_that.config,_that.modelRoles,_that.sandbox,_that.approval,_that.epochs,_that.failOnError,_that.continueOnFail,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.earlyStopping,_that.displayName,_that.taskFunc,_that.name,_that.version,_that.metadata);case _: +return $default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.scorer,_that.metrics,_that.model,_that.config,_that.modelRoles,_that.sandbox,_that.approval,_that.epochs,_that.failOnError,_that.continueOnFail,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.earlyStopping,_that.displayName,_that.func,_that.systemMessage,_that.sandboxParameters,_that.name,_that.version,_that.metadata);case _: return orElse(); } @@ -242,10 +245,10 @@ return $default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.score /// } /// ``` -@optionalTypeArgs TResult when(TResult Function( Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config, @JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, @JsonKey(name: 'fail_on_error') Object? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'early_stopping') Object? earlyStopping, @JsonKey(name: 'display_name') String? displayName, @JsonKey(name: 'task_func') String? taskFunc, String? name, Object version, Map? 
metadata) $default,) {final _that = this; +@optionalTypeArgs TResult when(TResult Function( Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config, @JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, @JsonKey(name: 'fail_on_error') Object? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'early_stopping') Object? earlyStopping, @JsonKey(name: 'display_name') String? displayName, @JsonKey(name: 'func') String? func, @JsonKey(name: 'system_message') String? systemMessage, @JsonKey(name: 'sandbox_parameters') Map? sandboxParameters, String? name, Object version, Map? metadata) $default,) {final _that = this; switch (_that) { case _Task(): -return $default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.scorer,_that.metrics,_that.model,_that.config,_that.modelRoles,_that.sandbox,_that.approval,_that.epochs,_that.failOnError,_that.continueOnFail,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.earlyStopping,_that.displayName,_that.taskFunc,_that.name,_that.version,_that.metadata);} +return $default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.scorer,_that.metrics,_that.model,_that.config,_that.modelRoles,_that.sandbox,_that.approval,_that.epochs,_that.failOnError,_that.continueOnFail,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.earlyStopping,_that.displayName,_that.func,_that.systemMessage,_that.sandboxParameters,_that.name,_that.version,_that.metadata);} } /// A variant of `when` that fallback to returning `null` /// @@ -259,10 +262,10 @@ return 
$default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.score /// } /// ``` -@optionalTypeArgs TResult? whenOrNull(TResult? Function( Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config, @JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, @JsonKey(name: 'fail_on_error') Object? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'early_stopping') Object? earlyStopping, @JsonKey(name: 'display_name') String? displayName, @JsonKey(name: 'task_func') String? taskFunc, String? name, Object version, Map? metadata)? $default,) {final _that = this; +@optionalTypeArgs TResult? whenOrNull(TResult? Function( Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config, @JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, @JsonKey(name: 'fail_on_error') Object? failOnError, @JsonKey(name: 'continue_on_fail') bool? continueOnFail, @JsonKey(name: 'message_limit') int? messageLimit, @JsonKey(name: 'token_limit') int? tokenLimit, @JsonKey(name: 'time_limit') int? timeLimit, @JsonKey(name: 'working_limit') int? workingLimit, @JsonKey(name: 'cost_limit') double? costLimit, @JsonKey(name: 'early_stopping') Object? earlyStopping, @JsonKey(name: 'display_name') String? displayName, @JsonKey(name: 'func') String? func, @JsonKey(name: 'system_message') String? systemMessage, @JsonKey(name: 'sandbox_parameters') Map? sandboxParameters, String? name, Object version, Map? metadata)? 
$default,) {final _that = this; switch (_that) { case _Task() when $default != null: -return $default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.scorer,_that.metrics,_that.model,_that.config,_that.modelRoles,_that.sandbox,_that.approval,_that.epochs,_that.failOnError,_that.continueOnFail,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.earlyStopping,_that.displayName,_that.taskFunc,_that.name,_that.version,_that.metadata);case _: +return $default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.scorer,_that.metrics,_that.model,_that.config,_that.modelRoles,_that.sandbox,_that.approval,_that.epochs,_that.failOnError,_that.continueOnFail,_that.messageLimit,_that.tokenLimit,_that.timeLimit,_that.workingLimit,_that.costLimit,_that.earlyStopping,_that.displayName,_that.func,_that.systemMessage,_that.sandboxParameters,_that.name,_that.version,_that.metadata);case _: return null; } @@ -274,7 +277,7 @@ return $default(_that.dataset,_that.setup,_that.solver,_that.cleanup,_that.score @JsonSerializable() class _Task implements Task { - const _Task({this.dataset, this.setup, this.solver, this.cleanup, this.scorer, this.metrics, this.model, this.config, @JsonKey(name: 'model_roles') final Map? modelRoles, this.sandbox, this.approval, this.epochs, @JsonKey(name: 'fail_on_error') this.failOnError, @JsonKey(name: 'continue_on_fail') this.continueOnFail, @JsonKey(name: 'message_limit') this.messageLimit, @JsonKey(name: 'token_limit') this.tokenLimit, @JsonKey(name: 'time_limit') this.timeLimit, @JsonKey(name: 'working_limit') this.workingLimit, @JsonKey(name: 'cost_limit') this.costLimit, @JsonKey(name: 'early_stopping') this.earlyStopping, @JsonKey(name: 'display_name') this.displayName, @JsonKey(name: 'task_func') this.taskFunc, this.name, this.version = 0, final Map? 
metadata}): _modelRoles = modelRoles,_metadata = metadata; + const _Task({this.dataset, this.setup, this.solver, this.cleanup, this.scorer, this.metrics, this.model, this.config, @JsonKey(name: 'model_roles') final Map? modelRoles, this.sandbox, this.approval, this.epochs, @JsonKey(name: 'fail_on_error') this.failOnError, @JsonKey(name: 'continue_on_fail') this.continueOnFail, @JsonKey(name: 'message_limit') this.messageLimit, @JsonKey(name: 'token_limit') this.tokenLimit, @JsonKey(name: 'time_limit') this.timeLimit, @JsonKey(name: 'working_limit') this.workingLimit, @JsonKey(name: 'cost_limit') this.costLimit, @JsonKey(name: 'early_stopping') this.earlyStopping, @JsonKey(name: 'display_name') this.displayName, @JsonKey(name: 'func') this.func, @JsonKey(name: 'system_message') this.systemMessage, @JsonKey(name: 'sandbox_parameters') final Map? sandboxParameters, this.name, this.version = 0, final Map? metadata}): _modelRoles = modelRoles,_sandboxParameters = sandboxParameters,_metadata = metadata; factory _Task.fromJson(Map json) => _$TaskFromJson(json); /// Dataset to evaluate. @@ -348,7 +351,20 @@ class _Task implements Task { /// `@task` function (e.g. `"flutter_code_gen"` or /// `"dash_evals.runner.tasks.flutter_code_gen"`). /// When absent, the runner hydrates directly from JSON (Mode 2 — future). -@override@JsonKey(name: 'task_func') final String? taskFunc; +@override@JsonKey(name: 'func') final String? func; +/// System message override for this task. +@override@JsonKey(name: 'system_message') final String? systemMessage; +/// Pass-through dict for sandbox plugin configuration. + final Map? _sandboxParameters; +/// Pass-through dict for sandbox plugin configuration. +@override@JsonKey(name: 'sandbox_parameters') Map? 
get sandboxParameters { + final value = _sandboxParameters; + if (value == null) return null; + if (_sandboxParameters is EqualUnmodifiableMapView) return _sandboxParameters; + // ignore: implicit_dynamic_type + return EqualUnmodifiableMapView(value); +} + /// Task name. /// /// Automatically determined based on the registered name if not specified. @@ -380,16 +396,16 @@ Map toJson() { @override bool operator ==(Object other) { - return identical(this, other) || (other.runtimeType == runtimeType&&other is _Task&&(identical(other.dataset, dataset) || other.dataset == dataset)&&const DeepCollectionEquality().equals(other.setup, setup)&&const DeepCollectionEquality().equals(other.solver, solver)&&const DeepCollectionEquality().equals(other.cleanup, cleanup)&&const DeepCollectionEquality().equals(other.scorer, scorer)&&const DeepCollectionEquality().equals(other.metrics, metrics)&&(identical(other.model, model) || other.model == model)&&const DeepCollectionEquality().equals(other.config, config)&&const DeepCollectionEquality().equals(other._modelRoles, _modelRoles)&&const DeepCollectionEquality().equals(other.sandbox, sandbox)&&const DeepCollectionEquality().equals(other.approval, approval)&&const DeepCollectionEquality().equals(other.epochs, epochs)&&const DeepCollectionEquality().equals(other.failOnError, failOnError)&&(identical(other.continueOnFail, continueOnFail) || other.continueOnFail == continueOnFail)&&(identical(other.messageLimit, messageLimit) || other.messageLimit == messageLimit)&&(identical(other.tokenLimit, tokenLimit) || other.tokenLimit == tokenLimit)&&(identical(other.timeLimit, timeLimit) || other.timeLimit == timeLimit)&&(identical(other.workingLimit, workingLimit) || other.workingLimit == workingLimit)&&(identical(other.costLimit, costLimit) || other.costLimit == costLimit)&&const DeepCollectionEquality().equals(other.earlyStopping, earlyStopping)&&(identical(other.displayName, displayName) || other.displayName == 
displayName)&&(identical(other.taskFunc, taskFunc) || other.taskFunc == taskFunc)&&(identical(other.name, name) || other.name == name)&&const DeepCollectionEquality().equals(other.version, version)&&const DeepCollectionEquality().equals(other._metadata, _metadata)); + return identical(this, other) || (other.runtimeType == runtimeType&&other is _Task&&(identical(other.dataset, dataset) || other.dataset == dataset)&&const DeepCollectionEquality().equals(other.setup, setup)&&const DeepCollectionEquality().equals(other.solver, solver)&&const DeepCollectionEquality().equals(other.cleanup, cleanup)&&const DeepCollectionEquality().equals(other.scorer, scorer)&&const DeepCollectionEquality().equals(other.metrics, metrics)&&(identical(other.model, model) || other.model == model)&&const DeepCollectionEquality().equals(other.config, config)&&const DeepCollectionEquality().equals(other._modelRoles, _modelRoles)&&const DeepCollectionEquality().equals(other.sandbox, sandbox)&&const DeepCollectionEquality().equals(other.approval, approval)&&const DeepCollectionEquality().equals(other.epochs, epochs)&&const DeepCollectionEquality().equals(other.failOnError, failOnError)&&(identical(other.continueOnFail, continueOnFail) || other.continueOnFail == continueOnFail)&&(identical(other.messageLimit, messageLimit) || other.messageLimit == messageLimit)&&(identical(other.tokenLimit, tokenLimit) || other.tokenLimit == tokenLimit)&&(identical(other.timeLimit, timeLimit) || other.timeLimit == timeLimit)&&(identical(other.workingLimit, workingLimit) || other.workingLimit == workingLimit)&&(identical(other.costLimit, costLimit) || other.costLimit == costLimit)&&const DeepCollectionEquality().equals(other.earlyStopping, earlyStopping)&&(identical(other.displayName, displayName) || other.displayName == displayName)&&(identical(other.func, func) || other.func == func)&&(identical(other.systemMessage, systemMessage) || other.systemMessage == systemMessage)&&const 
DeepCollectionEquality().equals(other._sandboxParameters, _sandboxParameters)&&(identical(other.name, name) || other.name == name)&&const DeepCollectionEquality().equals(other.version, version)&&const DeepCollectionEquality().equals(other._metadata, _metadata)); } @JsonKey(includeFromJson: false, includeToJson: false) @override -int get hashCode => Object.hashAll([runtimeType,dataset,const DeepCollectionEquality().hash(setup),const DeepCollectionEquality().hash(solver),const DeepCollectionEquality().hash(cleanup),const DeepCollectionEquality().hash(scorer),const DeepCollectionEquality().hash(metrics),model,const DeepCollectionEquality().hash(config),const DeepCollectionEquality().hash(_modelRoles),const DeepCollectionEquality().hash(sandbox),const DeepCollectionEquality().hash(approval),const DeepCollectionEquality().hash(epochs),const DeepCollectionEquality().hash(failOnError),continueOnFail,messageLimit,tokenLimit,timeLimit,workingLimit,costLimit,const DeepCollectionEquality().hash(earlyStopping),displayName,taskFunc,name,const DeepCollectionEquality().hash(version),const DeepCollectionEquality().hash(_metadata)]); +int get hashCode => Object.hashAll([runtimeType,dataset,const DeepCollectionEquality().hash(setup),const DeepCollectionEquality().hash(solver),const DeepCollectionEquality().hash(cleanup),const DeepCollectionEquality().hash(scorer),const DeepCollectionEquality().hash(metrics),model,const DeepCollectionEquality().hash(config),const DeepCollectionEquality().hash(_modelRoles),const DeepCollectionEquality().hash(sandbox),const DeepCollectionEquality().hash(approval),const DeepCollectionEquality().hash(epochs),const DeepCollectionEquality().hash(failOnError),continueOnFail,messageLimit,tokenLimit,timeLimit,workingLimit,costLimit,const DeepCollectionEquality().hash(earlyStopping),displayName,func,systemMessage,const DeepCollectionEquality().hash(_sandboxParameters),name,const DeepCollectionEquality().hash(version),const 
DeepCollectionEquality().hash(_metadata)]); @override String toString() { - return 'Task(dataset: $dataset, setup: $setup, solver: $solver, cleanup: $cleanup, scorer: $scorer, metrics: $metrics, model: $model, config: $config, modelRoles: $modelRoles, sandbox: $sandbox, approval: $approval, epochs: $epochs, failOnError: $failOnError, continueOnFail: $continueOnFail, messageLimit: $messageLimit, tokenLimit: $tokenLimit, timeLimit: $timeLimit, workingLimit: $workingLimit, costLimit: $costLimit, earlyStopping: $earlyStopping, displayName: $displayName, taskFunc: $taskFunc, name: $name, version: $version, metadata: $metadata)'; + return 'Task(dataset: $dataset, setup: $setup, solver: $solver, cleanup: $cleanup, scorer: $scorer, metrics: $metrics, model: $model, config: $config, modelRoles: $modelRoles, sandbox: $sandbox, approval: $approval, epochs: $epochs, failOnError: $failOnError, continueOnFail: $continueOnFail, messageLimit: $messageLimit, tokenLimit: $tokenLimit, timeLimit: $timeLimit, workingLimit: $workingLimit, costLimit: $costLimit, earlyStopping: $earlyStopping, displayName: $displayName, func: $func, systemMessage: $systemMessage, sandboxParameters: $sandboxParameters, name: $name, version: $version, metadata: $metadata)'; } @@ -400,7 +416,7 @@ abstract mixin class _$TaskCopyWith<$Res> implements $TaskCopyWith<$Res> { factory _$TaskCopyWith(_Task value, $Res Function(_Task) _then) = __$TaskCopyWithImpl; @override @useResult $Res call({ - Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config,@JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs,@JsonKey(name: 'fail_on_error') Object? failOnError,@JsonKey(name: 'continue_on_fail') bool? continueOnFail,@JsonKey(name: 'message_limit') int? messageLimit,@JsonKey(name: 'token_limit') int? tokenLimit,@JsonKey(name: 'time_limit') int? timeLimit,@JsonKey(name: 'working_limit') int? 
workingLimit,@JsonKey(name: 'cost_limit') double? costLimit,@JsonKey(name: 'early_stopping') Object? earlyStopping,@JsonKey(name: 'display_name') String? displayName,@JsonKey(name: 'task_func') String? taskFunc, String? name, Object version, Map? metadata + Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config,@JsonKey(name: 'model_roles') Map? modelRoles, Object? sandbox, Object? approval, Object? epochs,@JsonKey(name: 'fail_on_error') Object? failOnError,@JsonKey(name: 'continue_on_fail') bool? continueOnFail,@JsonKey(name: 'message_limit') int? messageLimit,@JsonKey(name: 'token_limit') int? tokenLimit,@JsonKey(name: 'time_limit') int? timeLimit,@JsonKey(name: 'working_limit') int? workingLimit,@JsonKey(name: 'cost_limit') double? costLimit,@JsonKey(name: 'early_stopping') Object? earlyStopping,@JsonKey(name: 'display_name') String? displayName,@JsonKey(name: 'func') String? func,@JsonKey(name: 'system_message') String? systemMessage,@JsonKey(name: 'sandbox_parameters') Map? sandboxParameters, String? name, Object version, Map? metadata }); @@ -417,7 +433,7 @@ class __$TaskCopyWithImpl<$Res> /// Create a copy of Task /// with the given fields replaced by the non-null parameter values. -@override @pragma('vm:prefer-inline') $Res call({Object? dataset = freezed,Object? setup = freezed,Object? solver = freezed,Object? cleanup = freezed,Object? scorer = freezed,Object? metrics = freezed,Object? model = freezed,Object? config = freezed,Object? modelRoles = freezed,Object? sandbox = freezed,Object? approval = freezed,Object? epochs = freezed,Object? failOnError = freezed,Object? continueOnFail = freezed,Object? messageLimit = freezed,Object? tokenLimit = freezed,Object? timeLimit = freezed,Object? workingLimit = freezed,Object? costLimit = freezed,Object? earlyStopping = freezed,Object? displayName = freezed,Object? taskFunc = freezed,Object? name = freezed,Object? version = null,Object? 
metadata = freezed,}) { +@override @pragma('vm:prefer-inline') $Res call({Object? dataset = freezed,Object? setup = freezed,Object? solver = freezed,Object? cleanup = freezed,Object? scorer = freezed,Object? metrics = freezed,Object? model = freezed,Object? config = freezed,Object? modelRoles = freezed,Object? sandbox = freezed,Object? approval = freezed,Object? epochs = freezed,Object? failOnError = freezed,Object? continueOnFail = freezed,Object? messageLimit = freezed,Object? tokenLimit = freezed,Object? timeLimit = freezed,Object? workingLimit = freezed,Object? costLimit = freezed,Object? earlyStopping = freezed,Object? displayName = freezed,Object? func = freezed,Object? systemMessage = freezed,Object? sandboxParameters = freezed,Object? name = freezed,Object? version = null,Object? metadata = freezed,}) { return _then(_Task( dataset: freezed == dataset ? _self.dataset : dataset // ignore: cast_nullable_to_non_nullable as Dataset?,setup: freezed == setup ? _self.setup : setup ,solver: freezed == solver ? _self.solver : solver ,cleanup: freezed == cleanup ? _self.cleanup : cleanup ,scorer: freezed == scorer ? _self.scorer : scorer ,metrics: freezed == metrics ? _self.metrics : metrics ,model: freezed == model ? _self.model : model // ignore: cast_nullable_to_non_nullable @@ -429,8 +445,10 @@ as int?,timeLimit: freezed == timeLimit ? _self.timeLimit : timeLimit // ignore: as int?,workingLimit: freezed == workingLimit ? _self.workingLimit : workingLimit // ignore: cast_nullable_to_non_nullable as int?,costLimit: freezed == costLimit ? _self.costLimit : costLimit // ignore: cast_nullable_to_non_nullable as double?,earlyStopping: freezed == earlyStopping ? _self.earlyStopping : earlyStopping ,displayName: freezed == displayName ? _self.displayName : displayName // ignore: cast_nullable_to_non_nullable -as String?,taskFunc: freezed == taskFunc ? _self.taskFunc : taskFunc // ignore: cast_nullable_to_non_nullable -as String?,name: freezed == name ? 
_self.name : name // ignore: cast_nullable_to_non_nullable +as String?,func: freezed == func ? _self.func : func // ignore: cast_nullable_to_non_nullable +as String?,systemMessage: freezed == systemMessage ? _self.systemMessage : systemMessage // ignore: cast_nullable_to_non_nullable +as String?,sandboxParameters: freezed == sandboxParameters ? _self._sandboxParameters : sandboxParameters // ignore: cast_nullable_to_non_nullable +as Map?,name: freezed == name ? _self.name : name // ignore: cast_nullable_to_non_nullable as String?,version: null == version ? _self.version : version ,metadata: freezed == metadata ? _self._metadata : metadata // ignore: cast_nullable_to_non_nullable as Map?, )); diff --git a/packages/dataset_config_dart/lib/src/models/task.g.dart b/packages/dataset_config_dart/lib/src/models/task.g.dart index 9906b3a..7752223 100644 --- a/packages/dataset_config_dart/lib/src/models/task.g.dart +++ b/packages/dataset_config_dart/lib/src/models/task.g.dart @@ -32,14 +32,16 @@ _Task _$TaskFromJson(Map json) => _Task( costLimit: (json['cost_limit'] as num?)?.toDouble(), earlyStopping: json['early_stopping'], displayName: json['display_name'] as String?, - taskFunc: json['task_func'] as String?, + func: json['func'] as String?, + systemMessage: json['system_message'] as String?, + sandboxParameters: json['sandbox_parameters'] as Map?, name: json['name'] as String?, version: json['version'] as Object? ?? 
0, metadata: json['metadata'] as Map?, ); Map _$TaskToJson(_Task instance) => { - 'dataset': instance.dataset?.toJson(), + 'dataset': instance.dataset, 'setup': instance.setup, 'solver': instance.solver, 'cleanup': instance.cleanup, @@ -60,7 +62,9 @@ Map _$TaskToJson(_Task instance) => { 'cost_limit': instance.costLimit, 'early_stopping': instance.earlyStopping, 'display_name': instance.displayName, - 'task_func': instance.taskFunc, + 'func': instance.func, + 'system_message': instance.systemMessage, + 'sandbox_parameters': instance.sandboxParameters, 'name': instance.name, 'version': instance.version, 'metadata': instance.metadata, diff --git a/packages/dataset_config_dart/lib/src/models/variant.g.dart b/packages/dataset_config_dart/lib/src/models/variant.g.dart index 3ed7ff4..a9a6d25 100644 --- a/packages/dataset_config_dart/lib/src/models/variant.g.dart +++ b/packages/dataset_config_dart/lib/src/models/variant.g.dart @@ -28,7 +28,7 @@ _Variant _$VariantFromJson(Map json) => _Variant( Map _$VariantToJson(_Variant instance) => { 'name': instance.name, - 'context_files': instance.contextFiles.map((e) => e.toJson()).toList(), + 'context_files': instance.contextFiles, 'mcp_servers': instance.mcpServers, 'skill_paths': instance.skillPaths, 'flutter_channel': instance.flutterChannel, diff --git a/packages/dataset_config_dart/lib/src/parsed_task.dart b/packages/dataset_config_dart/lib/src/parsed_task.dart index 21ce5e3..ef74d5e 100644 --- a/packages/dataset_config_dart/lib/src/parsed_task.dart +++ b/packages/dataset_config_dart/lib/src/parsed_task.dart @@ -13,7 +13,7 @@ const kDefaultSystemMessage = /// former `TaskConfig` model-package class. class ParsedTask { final String id; - final String taskFunc; + final String func; final List samples; final Variant variant; final String sandboxType; @@ -22,6 +22,9 @@ class ParsedTask { final bool saveExamples; final String? examplesDir; + /// Tag filter for variant selection. + final TagFilter? 
variantFilters; + // ------------------------------------------------------------------ // Task-level settings (from task.yaml) // ------------------------------------------------------------------ @@ -79,7 +82,7 @@ class ParsedTask { const ParsedTask({ required this.id, - required this.taskFunc, + required this.func, required this.samples, required this.variant, this.sandboxType = 'local', @@ -87,6 +90,7 @@ class ParsedTask { this.allowedVariants, this.saveExamples = false, this.examplesDir, + this.variantFilters, // Task-level settings this.model, this.config, @@ -110,7 +114,7 @@ class ParsedTask { /// Create a copy with overrides. ParsedTask copyWith({ String? id, - String? taskFunc, + String? func, List? samples, Variant? variant, String? sandboxType, @@ -118,6 +122,7 @@ class ParsedTask { List? allowedVariants, bool? saveExamples, String? examplesDir, + TagFilter? variantFilters, String? model, Map? config, Map? modelRoles, @@ -138,7 +143,7 @@ class ParsedTask { }) { return ParsedTask( id: id ?? this.id, - taskFunc: taskFunc ?? this.taskFunc, + func: func ?? this.func, samples: samples ?? this.samples, variant: variant ?? this.variant, sandboxType: sandboxType ?? this.sandboxType, @@ -146,6 +151,7 @@ class ParsedTask { allowedVariants: allowedVariants ?? this.allowedVariants, saveExamples: saveExamples ?? this.saveExamples, examplesDir: examplesDir ?? this.examplesDir, + variantFilters: variantFilters ?? this.variantFilters, model: model ?? this.model, config: config ?? this.config, modelRoles: modelRoles ?? 
this.modelRoles, diff --git a/packages/dataset_config_dart/lib/src/parsers/json_parser.dart b/packages/dataset_config_dart/lib/src/parsers/json_parser.dart index 89d9668..3175ffd 100644 --- a/packages/dataset_config_dart/lib/src/parsers/json_parser.dart +++ b/packages/dataset_config_dart/lib/src/parsers/json_parser.dart @@ -21,7 +21,7 @@ class JsonParser extends Parser { List parseTasksFromMaps(List> taskMaps) { return taskMaps.map((data) { final taskId = data['id'] as String; - final taskFunc = (data['func'] as String?) ?? taskId; + final func = (data['func'] as String?) ?? taskId; final systemMessage = data['system_message'] as String?; final allowedVariants = (data['allowed_variants'] as List?) ?.cast(); @@ -113,7 +113,7 @@ class JsonParser extends Parser { return ParsedTask( id: taskId, - taskFunc: taskFunc, + func: func, variant: const Variant(), samples: samples, systemMessage: systemMessage, diff --git a/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart b/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart index 3ea236c..edd4b03 100644 --- a/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart +++ b/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart @@ -50,7 +50,7 @@ class YamlParser extends Parser { final taskDir = p.dirname(taskPath); final taskId = (data['id'] as String?) ?? p.basename(taskDir); - final taskFunc = (data['func'] as String?) ?? taskId; + final func = (data['func'] as String?) ?? 
taskId; final taskWorkspaceRaw = data['workspace']; final taskTestsRaw = data['tests']; @@ -102,7 +102,7 @@ class YamlParser extends Parser { return [ ParsedTask( id: taskId, - taskFunc: taskFunc, + func: func, variant: const Variant(), // placeholder baseline samples: samples, systemMessage: systemMessage, diff --git a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart index d308d68..fe4861f 100644 --- a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart +++ b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart @@ -239,7 +239,7 @@ class EvalSetResolver { inspectTasks.add( Task( name: '${tc.id}:${tc.variant.name}', - taskFunc: tc.taskFunc, + func: tc.func, dataset: dataset, sandbox: taskSandbox, metadata: metadata, diff --git a/packages/dataset_config_dart/pubspec.yaml b/packages/dataset_config_dart/pubspec.yaml index cc76a7a..61a386b 100644 --- a/packages/dataset_config_dart/pubspec.yaml +++ b/packages/dataset_config_dart/pubspec.yaml @@ -15,5 +15,8 @@ dependencies: yaml: ^3.1.0 dev_dependencies: + build_runner: ^2.12.2 + freezed: ^3.2.5 + json_serializable: ^6.13.0 lints: ^6.0.0 test: any diff --git a/packages/dataset_config_dart/test/eval_set_resolver_test.dart b/packages/dataset_config_dart/test/eval_set_resolver_test.dart index d982b58..de32e88 100644 --- a/packages/dataset_config_dart/test/eval_set_resolver_test.dart +++ b/packages/dataset_config_dart/test/eval_set_resolver_test.dart @@ -7,7 +7,7 @@ void main() { /// Helper to create a minimal [ParsedTask] for testing. ParsedTask makeTask({ String id = 'test_task', - String taskFunc = 'question_answer', + String func = 'question_answer', List? samples, Variant? variant, List? allowedVariants, @@ -18,7 +18,7 @@ void main() { }) { return ParsedTask( id: id, - taskFunc: taskFunc, + func: func, samples: samples ?? 
[ @@ -278,14 +278,14 @@ void main() { expect(taskNames.first, contains('included')); }); - test('taskFunc is propagated to output Task', () { + test('func is propagated to output Task', () { final results = resolver.resolve( - [makeTask(taskFunc: 'flutter_code_gen')], + [makeTask(func: 'flutter_code_gen')], makeJob(models: ['m']), '/tmp/dataset', ); - expect(results.first.tasks.first.taskFunc, 'flutter_code_gen'); + expect(results.first.tasks.first.func, 'flutter_code_gen'); }); test('system_message appears in task metadata', () { diff --git a/packages/dataset_config_dart/test/eval_set_writer_test.dart b/packages/dataset_config_dart/test/eval_set_writer_test.dart index 2ef58e5..ef377e6 100644 --- a/packages/dataset_config_dart/test/eval_set_writer_test.dart +++ b/packages/dataset_config_dart/test/eval_set_writer_test.dart @@ -25,7 +25,7 @@ void main() { taskCount, (i) => Task( name: 'task_$i:baseline', - taskFunc: 'func_$i', + func: 'func_$i', dataset: Dataset( samples: [ Sample(id: 's$i', input: 'input $i', target: 'target $i'), diff --git a/packages/dataset_config_dart/test/json_parser_test.dart b/packages/dataset_config_dart/test/json_parser_test.dart index f09520c..3763af6 100644 --- a/packages/dataset_config_dart/test/json_parser_test.dart +++ b/packages/dataset_config_dart/test/json_parser_test.dart @@ -24,7 +24,7 @@ void main() { expect(tasks, hasLength(1)); expect(tasks.first.id, 'my_task'); - expect(tasks.first.taskFunc, 'question_answer'); + expect(tasks.first.func, 'question_answer'); expect(tasks.first.samples, hasLength(1)); expect(tasks.first.samples.first.id, 's1'); expect(tasks.first.samples.first.input, 'What is Dart?'); @@ -39,7 +39,7 @@ void main() { }, ]); - expect(tasks.first.taskFunc, 'dart_qa'); + expect(tasks.first.func, 'dart_qa'); }); test('throws FormatException when sample missing required field', () { diff --git a/packages/dataset_config_dart/test/parsed_task_test.dart b/packages/dataset_config_dart/test/parsed_task_test.dart index 
4921e30..b6fb7c5 100644 --- a/packages/dataset_config_dart/test/parsed_task_test.dart +++ b/packages/dataset_config_dart/test/parsed_task_test.dart @@ -6,7 +6,7 @@ void main() { test('has correct defaults', () { const task = ParsedTask( id: 'test', - taskFunc: 'question_answer', + func: 'question_answer', samples: [], variant: Variant(), ); @@ -27,7 +27,7 @@ void main() { test('stores all constructor fields', () { const task = ParsedTask( id: 'my_task', - taskFunc: 'flutter_code_gen', + func: 'flutter_code_gen', samples: [Sample(id: 's1', input: 'q', target: 'a')], variant: Variant(name: 'full'), sandboxType: 'podman', @@ -49,7 +49,7 @@ void main() { ); expect(task.id, 'my_task'); - expect(task.taskFunc, 'flutter_code_gen'); + expect(task.func, 'flutter_code_gen'); expect(task.samples, hasLength(1)); expect(task.variant.name, 'full'); expect(task.sandboxType, 'podman'); @@ -75,7 +75,7 @@ void main() { test('overrides specified fields', () { const original = ParsedTask( id: 'original', - taskFunc: 'func_a', + func: 'func_a', samples: [], variant: Variant(name: 'baseline'), timeLimit: 100, @@ -93,7 +93,7 @@ void main() { test('preserves fields not overridden', () { const original = ParsedTask( id: 'task', - taskFunc: 'func', + func: 'func', samples: [], variant: Variant(name: 'full'), sandboxType: 'podman', @@ -103,7 +103,7 @@ void main() { final copy = original.copyWith(id: 'new_id'); - expect(copy.taskFunc, 'func'); + expect(copy.func, 'func'); expect(copy.variant.name, 'full'); expect(copy.sandboxType, 'podman'); expect(copy.systemMessage, 'Be helpful'); @@ -113,7 +113,7 @@ void main() { test('returns a new instance (not the same object)', () { const original = ParsedTask( id: 'a', - taskFunc: 'f', + func: 'f', samples: [], variant: Variant(), ); @@ -128,7 +128,7 @@ void main() { test('can override samples list', () { const original = ParsedTask( id: 'task', - taskFunc: 'func', + func: 'func', samples: [Sample(id: 's1', input: 'q', target: 'a')], variant: 
Variant(), ); diff --git a/packages/dataset_config_python/src/dataset_config_python/models/__init__.py b/packages/dataset_config_python/src/dataset_config_python/models/__init__.py index a90aaad..f42caca 100644 --- a/packages/dataset_config_python/src/dataset_config_python/models/__init__.py +++ b/packages/dataset_config_python/src/dataset_config_python/models/__init__.py @@ -5,6 +5,7 @@ from dataset_config_python.models.eval_set import EvalSet from dataset_config_python.models.job import Job, JobTask from dataset_config_python.models.sample import Sample +from dataset_config_python.models.tag_filter import TagFilter, matches_tag_filter from dataset_config_python.models.task import Task from dataset_config_python.models.variant import Variant @@ -16,6 +17,8 @@ "Job", "JobTask", "Sample", + "TagFilter", "Task", "Variant", + "matches_tag_filter", ] diff --git a/packages/dataset_config_python/src/dataset_config_python/models/job.py b/packages/dataset_config_python/src/dataset_config_python/models/job.py index 683e09f..b259ed1 100644 --- a/packages/dataset_config_python/src/dataset_config_python/models/job.py +++ b/packages/dataset_config_python/src/dataset_config_python/models/job.py @@ -6,6 +6,8 @@ from pydantic import BaseModel +from dataset_config_python.models.tag_filter import TagFilter + class JobTask(BaseModel): """Per-task configuration within a job.""" @@ -22,6 +24,9 @@ class JobTask(BaseModel): system_message: str | None = None """Override system message for this task.""" + args: dict[str, Any] | None = None + """Per-task argument overrides passed to the task function.""" + @staticmethod def from_yaml(task_id: str, data: dict[str, Any] | None) -> JobTask: """Create from parsed YAML data.""" @@ -32,6 +37,7 @@ def from_yaml(task_id: str, data: dict[str, Any] | None) -> JobTask: include_samples=data.get("include-samples"), exclude_samples=data.get("exclude-samples"), system_message=data.get("system_message"), + args=data.get("args"), ) @@ -39,6 +45,8 @@ class 
Job(BaseModel): """A job configuration defining what to run and how to run it.""" # Core settings + description: str | None = None + image_prefix: str | None = None log_dir: str sandbox_type: str = "local" max_connections: int = 10 @@ -100,3 +108,7 @@ class Job(BaseModel): # Pass-through overrides eval_set_overrides: dict[str, Any] | None = None task_defaults: dict[str, Any] | None = None + + # Tag-based filtering + task_filters: TagFilter | None = None + sample_filters: TagFilter | None = None diff --git a/packages/dataset_config_python/src/dataset_config_python/models/tag_filter.py b/packages/dataset_config_python/src/dataset_config_python/models/tag_filter.py new file mode 100644 index 0000000..5d298e2 --- /dev/null +++ b/packages/dataset_config_python/src/dataset_config_python/models/tag_filter.py @@ -0,0 +1,30 @@ +"""Tag-based filter for including/excluding items by their tags.""" + +from __future__ import annotations + +from pydantic import BaseModel + + +class TagFilter(BaseModel): + """Tag-based filter for including/excluding items.""" + + include_tags: list[str] | None = None + exclude_tags: list[str] | None = None + + +def matches_tag_filter(item_tags: list[str], tag_filter: TagFilter) -> bool: + """Check whether a set of item_tags matches the given filter. 
+ + Returns True if: + - All include_tags (if any) are present in item_tags + - No exclude_tags (if any) are present in item_tags + """ + if tag_filter.include_tags and not all( + t in item_tags for t in tag_filter.include_tags + ): + return False + if tag_filter.exclude_tags and any( + t in item_tags for t in tag_filter.exclude_tags + ): + return False + return True diff --git a/packages/dataset_config_python/src/dataset_config_python/models/task.py b/packages/dataset_config_python/src/dataset_config_python/models/task.py index cafbbe3..5623ab3 100644 --- a/packages/dataset_config_python/src/dataset_config_python/models/task.py +++ b/packages/dataset_config_python/src/dataset_config_python/models/task.py @@ -19,9 +19,15 @@ class Task(BaseModel): name: str = "" """Task name (e.g. ``"dart_qa:baseline"``).""" - task_func: str | None = None + func: str | None = None """Task function identifier for hydration (e.g. ``"question_answer"``).""" + system_message: str | None = None + """System message override for this task.""" + + sandbox_parameters: dict[str, Any] | None = None + """Pass-through dict for sandbox plugin configuration.""" + dataset: Dataset | None = None """Inline dataset with samples.""" diff --git a/packages/dataset_config_python/src/dataset_config_python/parser.py b/packages/dataset_config_python/src/dataset_config_python/parser.py index 0e9fc12..43a646e 100644 --- a/packages/dataset_config_python/src/dataset_config_python/parser.py +++ b/packages/dataset_config_python/src/dataset_config_python/parser.py @@ -29,7 +29,7 @@ def __init__( self, *, id: str, - task_func: str, + func: str, samples: list[Sample], variant: Variant | None = None, sandbox_type: str = "local", @@ -57,7 +57,7 @@ def __init__( metadata: dict[str, Any] | None = None, ): self.id = id - self.task_func = task_func + self.func = func self.samples = samples self.variant = variant or Variant() self.sandbox_type = sandbox_type @@ -89,7 +89,7 @@ def copy_with( self, *, id: str | None = _UNSET, 
- task_func: str | None = _UNSET, + func: str | None = _UNSET, samples: list[Sample] | None = _UNSET, variant: Variant | None = _UNSET, sandbox_type: str | None = _UNSET, @@ -119,7 +119,7 @@ def copy_with( _U = ParsedTask._UNSET return ParsedTask( id=self.id if id is _U else id, # type: ignore[arg-type] - task_func=self.task_func if task_func is _U else task_func, # type: ignore[arg-type] + func=self.func if func is _U else func, # type: ignore[arg-type] samples=self.samples if samples is _U else samples, # type: ignore[arg-type] variant=self.variant if variant is _U else variant, sandbox_type=self.sandbox_type if sandbox_type is _U else sandbox_type, # type: ignore[arg-type] @@ -260,7 +260,7 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: return [ ParsedTask( id=task_id, - task_func=task_func, + func=task_func, variant=Variant(), samples=samples, system_message=system_message, diff --git a/packages/dataset_config_python/src/dataset_config_python/resolver.py b/packages/dataset_config_python/src/dataset_config_python/resolver.py index 0801b3c..9f5589a 100644 --- a/packages/dataset_config_python/src/dataset_config_python/resolver.py +++ b/packages/dataset_config_python/src/dataset_config_python/resolver.py @@ -213,7 +213,7 @@ def _build_eval_set( inspect_tasks.append( Task( name=f"{tc.id}:{tc.variant.name}", - task_func=tc.task_func, + func=tc.func, dataset=dataset, sandbox=task_sandbox, metadata=task_metadata, diff --git a/packages/dataset_config_python/tests/test_config.py b/packages/dataset_config_python/tests/test_config.py index b79f9a9..20890c3 100644 --- a/packages/dataset_config_python/tests/test_config.py +++ b/packages/dataset_config_python/tests/test_config.py @@ -178,7 +178,7 @@ def test_job_task_from_yaml_with_data(self): def test_eval_set_serialization(self): es = EvalSet( - tasks=[Task(name="test:baseline", task_func="qa")], + tasks=[Task(name="test:baseline", func="qa")], log_dir="/tmp/logs", 
model=["google/gemini-2.5-flash"], ) @@ -312,11 +312,11 @@ def test_write_single(self, dataset_dir, tmp_path): def test_write_multiple(self, tmp_path): es1 = EvalSet( - tasks=[Task(name="t1:baseline", task_func="qa")], + tasks=[Task(name="t1:baseline", func="qa")], log_dir="/tmp/logs1", ) es2 = EvalSet( - tasks=[Task(name="t2:baseline", task_func="qa")], + tasks=[Task(name="t2:baseline", func="qa")], log_dir="/tmp/logs2", ) output_dir = str(tmp_path / "output") diff --git a/packages/devals_cli/lib/src/dataset/dry_run.dart b/packages/devals_cli/lib/src/dataset/dry_run.dart index 891f700..856e172 100644 --- a/packages/devals_cli/lib/src/dataset/dry_run.dart +++ b/packages/devals_cli/lib/src/dataset/dry_run.dart @@ -32,9 +32,9 @@ bool _validateConfig(EvalSet config) { final taskSummaries = {}; for (final task in config.tasks) { - final name = task.name ?? task.taskFunc ?? '(unknown)'; + final name = task.name ?? task.func ?? '(unknown)'; - if (task.taskFunc == null) { + if (task.func == null) { warnings.add( 'Task "$name" has no task_func — Mode 2 hydration required', ); diff --git a/tool/config_parity/pubspec.lock b/tool/config_parity/pubspec.lock deleted file mode 100644 index dd2733b..0000000 --- a/tool/config_parity/pubspec.lock +++ /dev/null @@ -1,108 +0,0 @@ -# Generated by pub -# See https://dart.dev/tools/pub/glossary#lockfile -packages: - async: - dependency: transitive - description: - name: async - sha256: "758e6d74e971c3e5aceb4110bfd6698efc7f501675bcfe0c775459a8140750eb" - url: "https://pub.dev" - source: hosted - version: "2.13.0" - collection: - dependency: transitive - description: - name: collection - sha256: "2f5709ae4d3d59dd8f7cd309b4e023046b57d8a6c82130785d2b0e5868084e76" - url: "https://pub.dev" - source: hosted - version: "1.19.1" - dataset_config_dart: - dependency: "direct main" - description: - path: "../../packages/dataset_config_dart" - relative: true - source: path - version: "0.0.1" - file: - dependency: transitive - description: - name: 
file - sha256: a3b4f84adafef897088c160faf7dfffb7696046cb13ae90b508c2cbc95d3b8d4 - url: "https://pub.dev" - source: hosted - version: "7.0.1" - freezed_annotation: - dependency: transitive - description: - name: freezed_annotation - sha256: "7294967ff0a6d98638e7acb774aac3af2550777accd8149c90af5b014e6d44d8" - url: "https://pub.dev" - source: hosted - version: "3.1.0" - glob: - dependency: transitive - description: - name: glob - sha256: c3f1ee72c96f8f78935e18aa8cecced9ab132419e8625dc187e1c2408efc20de - url: "https://pub.dev" - source: hosted - version: "2.1.3" - json_annotation: - dependency: transitive - description: - name: json_annotation - sha256: cb09e7dac6210041fad964ed7fbee004f14258b4eca4040f72d1234062ace4c8 - url: "https://pub.dev" - source: hosted - version: "4.11.0" - meta: - dependency: transitive - description: - name: meta - sha256: "9f29b9bcc8ee287b1a31e0d01be0eae99a930dbffdaecf04b3f3d82a969f296f" - url: "https://pub.dev" - source: hosted - version: "1.18.1" - path: - dependency: "direct main" - description: - name: path - sha256: "75cca69d1490965be98c73ceaea117e8a04dd21217b37b292c9ddbec0d955bc5" - url: "https://pub.dev" - source: hosted - version: "1.9.1" - source_span: - dependency: transitive - description: - name: source_span - sha256: "56a02f1f4cd1a2d96303c0144c93bd6d909eea6bee6bf5a0e0b685edbd4c47ab" - url: "https://pub.dev" - source: hosted - version: "1.10.2" - string_scanner: - dependency: transitive - description: - name: string_scanner - sha256: "921cd31725b72fe181906c6a94d987c78e3b98c2e205b397ea399d4054872b43" - url: "https://pub.dev" - source: hosted - version: "1.4.1" - term_glyph: - dependency: transitive - description: - name: term_glyph - sha256: "7f554798625ea768a7518313e58f83891c7f5024f88e46e7182a4558850a4b8e" - url: "https://pub.dev" - source: hosted - version: "1.2.2" - yaml: - dependency: transitive - description: - name: yaml - sha256: b9da305ac7c39faa3f030eccd175340f968459dae4af175130b3fc47e40d76ce - url: "https://pub.dev" - 
source: hosted - version: "3.1.3" -sdks: - dart: ">=3.10.0 <4.0.0" From 28fba88f9990ab2b245d74ee6354b0b6744a6f95 Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Fri, 13 Mar 2026 16:51:08 -0700 Subject: [PATCH 4/8] adds task level fields and updates parser --- .../lib/src/parsed_task.dart | 6 ++++++ .../lib/src/parsers/yaml_parser.dart | 16 ++++++++++++++++ .../lib/src/resolvers/eval_set_resolver.dart | 18 +++++++++++++++++- .../src/dataset_config_python/parser.py | 10 ++++++++++ .../src/dataset_config_python/resolver.py | 18 +++++++++++++++++- 5 files changed, 66 insertions(+), 2 deletions(-) diff --git a/packages/dataset_config_dart/lib/src/parsed_task.dart b/packages/dataset_config_dart/lib/src/parsed_task.dart index ef74d5e..c5afaf7 100644 --- a/packages/dataset_config_dart/lib/src/parsed_task.dart +++ b/packages/dataset_config_dart/lib/src/parsed_task.dart @@ -25,6 +25,9 @@ class ParsedTask { /// Tag filter for variant selection. final TagFilter? variantFilters; + /// Pass-through dict for sandbox plugin configuration. + final Map? sandboxParameters; + // ------------------------------------------------------------------ // Task-level settings (from task.yaml) // ------------------------------------------------------------------ @@ -91,6 +94,7 @@ class ParsedTask { this.saveExamples = false, this.examplesDir, this.variantFilters, + this.sandboxParameters, // Task-level settings this.model, this.config, @@ -123,6 +127,7 @@ class ParsedTask { bool? saveExamples, String? examplesDir, TagFilter? variantFilters, + Map? sandboxParameters, String? model, Map? config, Map? modelRoles, @@ -152,6 +157,7 @@ class ParsedTask { saveExamples: saveExamples ?? this.saveExamples, examplesDir: examplesDir ?? this.examplesDir, variantFilters: variantFilters ?? this.variantFilters, + sandboxParameters: sandboxParameters ?? this.sandboxParameters, model: model ?? this.model, config: config ?? this.config, modelRoles: modelRoles ?? 
this.modelRoles, diff --git a/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart b/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart index edd4b03..e8f0b69 100644 --- a/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart +++ b/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart @@ -98,6 +98,7 @@ class YamlParser extends Parser { final displayName = data['display_name'] as String?; final version = data['version']; final taskMetadata = _asMap(data['metadata']); + final sandboxParameters = _asMap(data['sandbox_parameters']); return [ ParsedTask( @@ -125,6 +126,7 @@ class YamlParser extends Parser { displayName: displayName, version: version, metadata: taskMetadata, + sandboxParameters: sandboxParameters, ), ]; } @@ -370,14 +372,28 @@ class YamlParser extends Parser { } } + // Parse tag filters + final taskFiltersRaw = data['task_filters']; + final sampleFiltersRaw = data['sample_filters']; + final TagFilter? taskFilters = taskFiltersRaw is Map + ? TagFilter.fromJson(Map.from(taskFiltersRaw)) + : null; + final TagFilter? sampleFilters = sampleFiltersRaw is Map + ? 
TagFilter.fromJson(Map.from(sampleFiltersRaw)) + : null; + return Job( logDir: logDir, sandboxType: sandboxType, maxConnections: maxConnections, + description: data['description'] as String?, + imagePrefix: data['image_prefix'] as String?, models: (data['models'] as List?)?.cast(), variants: variants, taskPaths: taskPaths, tasks: tasks, + taskFilters: taskFilters, + sampleFilters: sampleFilters, saveExamples: data['save_examples'] == true, // Promoted eval_set() fields retryAttempts: data['retry_attempts'] as int?, diff --git a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart index fe4861f..ba5dddf 100644 --- a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart +++ b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart @@ -243,6 +243,8 @@ class EvalSetResolver { dataset: dataset, sandbox: taskSandbox, metadata: metadata, + systemMessage: tc.systemMessage, + sandboxParameters: tc.sandboxParameters, model: resolvedModel, config: resolvedConfig, modelRoles: resolvedModelRoles, @@ -427,9 +429,15 @@ class EvalSetResolver { for (final taskConfig in datasetTasks) { final taskId = taskConfig.id; - // Filter by job.tasks + // Filter by job.tasks (ID-based) if (job.tasks != null && !job.tasks!.containsKey(taskId)) continue; + // Filter by job.taskFilters (tag-based) + if (job.taskFilters != null) { + final taskTags = (taskConfig.metadata?['tags'] as List?)?.cast() ?? []; + if (!matchesTagFilter(taskTags, job.taskFilters!)) continue; + } + // Determine effective variants (intersection) final effectiveVariants = >{}; for (final entry in jobVariants.entries) { @@ -459,6 +467,14 @@ class EvalSetResolver { } } + // Apply sample tag filtering (job-level) + if (job.sampleFilters != null) { + samples = samples.where((s) { + final sampleTags = (s.metadata?['tags'] as List?)?.cast() ?? 
[]; + return matchesTagFilter(sampleTags, job.sampleFilters!); + }).toList(); + } + // Apply system_message override var systemMessage = taskConfig.systemMessage; if (jobTask?.systemMessage != null) { diff --git a/packages/dataset_config_python/src/dataset_config_python/parser.py b/packages/dataset_config_python/src/dataset_config_python/parser.py index 43a646e..56dc89f 100644 --- a/packages/dataset_config_python/src/dataset_config_python/parser.py +++ b/packages/dataset_config_python/src/dataset_config_python/parser.py @@ -12,6 +12,7 @@ from dataset_config_python.models.job import Job, JobTask from dataset_config_python.models.sample import Sample +from dataset_config_python.models.tag_filter import TagFilter from dataset_config_python.models.variant import Variant # Default log directory (relative to dataset root). @@ -55,6 +56,7 @@ def __init__( display_name: str | None = None, version: Any | None = None, metadata: dict[str, Any] | None = None, + sandbox_parameters: dict[str, Any] | None = None, ): self.id = id self.func = func @@ -82,6 +84,7 @@ def __init__( self.display_name = display_name self.version = version self.metadata = metadata + self.sandbox_parameters = sandbox_parameters _UNSET: Any = object() @@ -97,6 +100,7 @@ def copy_with( allowed_variants: list[str] | None = _UNSET, save_examples: bool | None = _UNSET, examples_dir: str | None = _UNSET, + sandbox_parameters: dict[str, Any] | None = _UNSET, model: str | None = _UNSET, config: dict[str, Any] | None = _UNSET, model_roles: dict[str, str] | None = _UNSET, @@ -127,6 +131,7 @@ def copy_with( allowed_variants=self.allowed_variants if allowed_variants is _U else allowed_variants, save_examples=self.save_examples if save_examples is _U else save_examples, # type: ignore[arg-type] examples_dir=self.examples_dir if examples_dir is _U else examples_dir, + sandbox_parameters=self.sandbox_parameters if sandbox_parameters is _U else sandbox_parameters, model=self.model if model is _U else model, 
config=self.config if config is _U else config, model_roles=self.model_roles if model_roles is _U else model_roles, @@ -282,6 +287,7 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: display_name=data.get("display_name"), version=data.get("version"), metadata=data.get("metadata") if isinstance(data.get("metadata"), dict) else None, + sandbox_parameters=data.get("sandbox_parameters") if isinstance(data.get("sandbox_parameters"), dict) else None, ) ] @@ -542,6 +548,10 @@ def parse_job(job_path: str, dataset_root: str) -> Job: task_defaults=( data.get("task_defaults") if isinstance(data.get("task_defaults"), dict) else None ), + description=data.get("description"), + image_prefix=data.get("image_prefix"), + task_filters=TagFilter(**data["task_filters"]) if isinstance(data.get("task_filters"), dict) else None, + sample_filters=TagFilter(**data["sample_filters"]) if isinstance(data.get("sample_filters"), dict) else None, ) diff --git a/packages/dataset_config_python/src/dataset_config_python/resolver.py b/packages/dataset_config_python/src/dataset_config_python/resolver.py index 9f5589a..1415e2d 100644 --- a/packages/dataset_config_python/src/dataset_config_python/resolver.py +++ b/packages/dataset_config_python/src/dataset_config_python/resolver.py @@ -10,6 +10,7 @@ from dataset_config_python.models.dataset import Dataset from dataset_config_python.models.eval_set import EvalSet from dataset_config_python.models.sample import Sample +from dataset_config_python.models.tag_filter import matches_tag_filter from dataset_config_python.models.task import Task from dataset_config_python.models.variant import Variant from dataset_config_python.parser import ParsedTask, find_job_file, parse_job, parse_tasks @@ -217,6 +218,8 @@ def _build_eval_set( dataset=dataset, sandbox=task_sandbox, metadata=task_metadata, + system_message=tc.system_message, + sandbox_parameters=tc.sandbox_parameters, model=tc.model or task_defaults.get("model"), config=tc.config 
or task_defaults.get("config"), model_roles=tc.model_roles or task_defaults.get("model_roles"), @@ -372,10 +375,16 @@ def _expand_task_configs( for tc in dataset_tasks: task_id = tc.id - # Filter by job.tasks + # Filter by job.tasks (ID-based) if job.tasks is not None and task_id not in job.tasks: continue + # Filter by job.task_filters (tag-based) + if job.task_filters is not None: + task_tags = (tc.metadata or {}).get("tags", []) + if not matches_tag_filter(task_tags, job.task_filters): + continue + # Determine effective variants (intersection) effective_variants: dict[str, dict[str, Any]] = {} for vname, vdef in job_variants.items(): @@ -393,6 +402,13 @@ def _expand_task_configs( if job_task.exclude_samples: samples = [s for s in samples if s.id not in job_task.exclude_samples] + # Apply sample tag filtering (job-level) + if job.sample_filters is not None: + samples = [ + s for s in samples + if matches_tag_filter((s.metadata or {}).get("tags", []), job.sample_filters) + ] + # Apply system_message override system_message = tc.system_message if job_task and job_task.system_message is not None: From fe24d910f9536313d043350250eff41f910b07e3 Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Fri, 13 Mar 2026 16:58:14 -0700 Subject: [PATCH 5/8] feat: allow configurable sandbox and SDK channel mappings in dataset resolvers and support colon syntax for task function resolution. 
--- .../src/dash_evals/runner/json_runner.py | 16 ++++++++ .../lib/src/resolvers/eval_set_resolver.dart | 41 ++++++++++++++----- .../src/dataset_config_python/resolver.py | 37 +++++++++++------ 3 files changed, 70 insertions(+), 24 deletions(-) diff --git a/packages/dash_evals/src/dash_evals/runner/json_runner.py b/packages/dash_evals/src/dash_evals/runner/json_runner.py index 7db7e89..89e4eee 100644 --- a/packages/dash_evals/src/dash_evals/runner/json_runner.py +++ b/packages/dash_evals/src/dash_evals/runner/json_runner.py @@ -27,6 +27,7 @@ def _resolve_task_func(name: str): Supports: - Short names: "flutter_code_gen" → dash_evals.runner.tasks.flutter_code_gen + - Colon syntax: "my_package.tasks:my_task" → import my_package.tasks, get my_task - Dotted paths: "dash_evals.runner.tasks.flutter_code_gen.flutter_code_gen" For short names, first tries to import a module with the same name. @@ -36,6 +37,21 @@ def _resolve_task_func(name: str): Returns the callable task function. """ + # Colon syntax: "module.path:function_name" + if ":" in name: + module_path, func_name = name.split(":", 1) + try: + module = importlib.import_module(module_path) + except ModuleNotFoundError: + raise ValueError( + f"Could not find module '{module_path}' for task function '{name}'. " + f"Check that the module exists and is importable." + ) + func = getattr(module, func_name, None) + if func is None: + raise ValueError(f"Module '{module_path}' does not have a function '{func_name}'.") + return func + if "." 
not in name: # Short name: try module with the same name first module_path = f"dash_evals.runner.tasks.{name}" diff --git a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart index ba5dddf..871e5e9 100644 --- a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart +++ b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart @@ -21,8 +21,10 @@ const List kDefaultModels = [ 'openai/gpt-5-pro', ]; -/// Available sandbox configurations. -const Map> kSandboxRegistry = { +/// Default sandbox configurations for Flutter evaluations. +/// +/// Consumers can pass these to [EvalSetResolver] or provide their own. +const Map> kDefaultSandboxRegistry = { 'podman': {'name': 'podman', 'path': './sandboxes/podman/compose.yaml'}, 'podman-beta': { 'name': 'podman', @@ -34,8 +36,10 @@ const Map> kSandboxRegistry = { }, }; -/// Maps Flutter SDK channel names to sandbox registry keys. -const Map kSdkChannels = { +/// Default Flutter SDK channel → sandbox registry key mapping. +/// +/// Consumers can pass these to [EvalSetResolver] or provide their own. +const Map kDefaultSdkChannels = { 'stable': 'podman', 'beta': 'podman-beta', 'main': 'podman-main', @@ -50,6 +54,22 @@ const Map kSdkChannels = { /// 3. Groups by flutter_channel (one [EvalSet] per group) /// 4. Propagates job-level and task-level settings to the output class EvalSetResolver { + /// Creates a resolver with optional sandbox configuration. + /// + /// If [sandboxRegistry] or [sdkChannels] are not provided, they default + /// to empty maps (no sandbox resolution). Pass [kDefaultSandboxRegistry] + /// and [kDefaultSdkChannels] for the Flutter-specific sandbox setup. + const EvalSetResolver({ + this.sandboxRegistry = const {}, + this.sdkChannels = const {}, + }); + + /// Named sandbox configurations (e.g. `'podman'` → compose file path). 
+ final Map> sandboxRegistry; + + /// SDK channel → sandbox registry key mapping. + final Map sdkChannels; + /// Resolve task configs and job into [EvalSet] objects. /// /// Groups by flutter_channel so each gets its own sandbox. @@ -136,7 +156,6 @@ class EvalSetResolver { if (workspace != null && isContainer) { files = {...?files, '/workspace': workspace}; - setup = setup ?? 'cd /workspace && flutter pub get'; enriched['workspace'] = '/workspace'; } if (workspaceGit != null) { @@ -387,10 +406,10 @@ class EvalSetResolver { if (sandboxType.isEmpty || sandboxType == 'local') return 'local'; // Channel override → look up channel-specific sandbox - if (flutterChannel != null && kSdkChannels.containsKey(flutterChannel)) { - final registryKey = kSdkChannels[flutterChannel]!; - if (kSandboxRegistry.containsKey(registryKey)) { - final def = kSandboxRegistry[registryKey]!; + if (flutterChannel != null && sdkChannels.containsKey(flutterChannel)) { + final registryKey = sdkChannels[flutterChannel]!; + if (sandboxRegistry.containsKey(registryKey)) { + final def = sandboxRegistry[registryKey]!; var sandboxPath = def['path']!; if (!p.isAbsolute(sandboxPath)) { sandboxPath = p.normalize(p.join(datasetRoot, sandboxPath)); @@ -400,8 +419,8 @@ class EvalSetResolver { } // Named sandbox from registry - if (kSandboxRegistry.containsKey(sandboxType)) { - final def = kSandboxRegistry[sandboxType]!; + if (sandboxRegistry.containsKey(sandboxType)) { + final def = sandboxRegistry[sandboxType]!; var sandboxPath = def['path']!; if (!p.isAbsolute(sandboxPath)) { sandboxPath = p.normalize(p.join(datasetRoot, sandboxPath)); diff --git a/packages/dataset_config_python/src/dataset_config_python/resolver.py b/packages/dataset_config_python/src/dataset_config_python/resolver.py index 1415e2d..6c9116c 100644 --- a/packages/dataset_config_python/src/dataset_config_python/resolver.py +++ b/packages/dataset_config_python/src/dataset_config_python/resolver.py @@ -29,15 +29,16 @@ "openai/gpt-5-pro", ] -# 
Available sandbox configurations. -SANDBOX_REGISTRY: dict[str, dict[str, str]] = { +# Default sandbox configurations for Flutter evaluations. +# Consumers can pass these to resolve() or provide their own. +DEFAULT_SANDBOX_REGISTRY: dict[str, dict[str, str]] = { "podman": {"name": "podman", "path": "./sandboxes/podman/compose.yaml"}, "podman-beta": {"name": "podman", "path": "./sandboxes/podman/compose-beta.yaml"}, "podman-main": {"name": "podman", "path": "./sandboxes/podman/compose-main.yaml"}, } -# Maps Flutter SDK channel names to sandbox registry keys. -SDK_CHANNELS: dict[str, str] = { +# Default Flutter SDK channel → sandbox registry key mapping. +DEFAULT_SDK_CHANNELS: dict[str, str] = { "stable": "podman", "beta": "podman-beta", "main": "podman-main", @@ -51,6 +52,9 @@ def _is_glob(pattern: str) -> bool: def resolve( dataset_path: str, job_names: list[str], + *, + sandbox_registry: dict[str, dict[str, str]] | None = None, + sdk_channels: dict[str, str] | None = None, ) -> list[EvalSet]: """Resolve dataset + job(s) into EvalSet objects. @@ -59,17 +63,21 @@ def resolve( Args: dataset_path: Root directory containing ``tasks/`` and ``jobs/``. job_names: Job names (looked up in ``jobs/``) or paths. + sandbox_registry: Named sandbox configurations. Defaults to empty. + sdk_channels: SDK channel → sandbox registry key mapping. Defaults to empty. Returns: A list of EvalSet objects ready for JSON serialization. 
""" + registry = sandbox_registry or {} + channels = sdk_channels or {} task_configs = parse_tasks(dataset_path) results: list[EvalSet] = [] for job_name in job_names: job_path = find_job_file(dataset_path, job_name) job = parse_job(job_path, dataset_path) - results.extend(_resolve_job(task_configs, job, dataset_path)) + results.extend(_resolve_job(task_configs, job, dataset_path, registry, channels)) return results @@ -78,6 +86,8 @@ def _resolve_job( dataset_tasks: list[ParsedTask], job: Any, dataset_root: str, + sandbox_registry: dict[str, dict[str, str]], + sdk_channels: dict[str, str], ) -> list[EvalSet]: """Resolve task configs and job into EvalSet objects.""" models = job.models if job.models else list(DEFAULT_MODELS) @@ -96,7 +106,7 @@ def _resolve_job( task_configs=group, log_dir=job.log_dir, models=models, - sandbox=_resolve_sandbox(dataset_root, job, flutter_channel=channel), + sandbox=_resolve_sandbox(dataset_root, job, sandbox_registry, sdk_channels, flutter_channel=channel), job=job, ) for channel, group in groups.items() @@ -142,7 +152,6 @@ def _build_eval_set( if workspace is not None and is_container: files = {**(files or {}), "/workspace": workspace} - setup = setup or "cd /workspace && flutter pub get" enriched["workspace"] = "/workspace" if workspace_git is not None: enriched["workspace_git"] = workspace_git @@ -328,6 +337,8 @@ def _resolve_models(job: Any) -> list[str]: def _resolve_sandbox( dataset_root: str, job: Any, + sandbox_registry: dict[str, dict[str, str]], + sdk_channels: dict[str, str], *, flutter_channel: str | None = None, ) -> Any: @@ -337,18 +348,18 @@ def _resolve_sandbox( return "local" # Channel override - if flutter_channel and flutter_channel in SDK_CHANNELS: - registry_key = SDK_CHANNELS[flutter_channel] - if registry_key in SANDBOX_REGISTRY: - defn = SANDBOX_REGISTRY[registry_key] + if flutter_channel and flutter_channel in sdk_channels: + registry_key = sdk_channels[flutter_channel] + if registry_key in sandbox_registry: + 
defn = sandbox_registry[registry_key] sandbox_path = defn["path"] if not os.path.isabs(sandbox_path): sandbox_path = os.path.normpath(os.path.join(dataset_root, sandbox_path)) return {"type": defn["name"], "path": sandbox_path} # Named sandbox from registry - if sandbox_type in SANDBOX_REGISTRY: - defn = SANDBOX_REGISTRY[sandbox_type] + if sandbox_type in sandbox_registry: + defn = sandbox_registry[sandbox_type] sandbox_path = defn["path"] if not os.path.isabs(sandbox_path): sandbox_path = os.path.normpath(os.path.join(dataset_root, sandbox_path)) From 2eb11048b9512b82fb14e553e3811ae2fa1b17aa Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Fri, 13 Mar 2026 17:05:16 -0700 Subject: [PATCH 6/8] feat: Introduce tag-based filtering, refined task function references, and expanded sandbox configuration options in documentation and API. --- docs/guides/config.md | 48 +++++++++- .../dataset_config_dart.md | 87 ++++++++++++++++--- 2 files changed, 124 insertions(+), 11 deletions(-) diff --git a/docs/guides/config.md b/docs/guides/config.md index aef6aba..f3889b9 100644 --- a/docs/guides/config.md +++ b/docs/guides/config.md @@ -39,5 +39,51 @@ In evals, the definition of dataset is expanded to include all fixtures of runni This means you care about job files and task files. Job files might look like this: - job/main.yaml (runs the whole thing) -- job/ci.yaml (a job that runs as part of ci) +- job/ci.yaml (a job that is run as part of ci) - job/local_dev.yaml (a job that is .gitignored, used for quick iteration) + +## Tag-based filtering + +Jobs can filter which tasks and samples run using tags. 
Tag filters work alongside ID-based filtering (`tasks.<id>.include-samples` / `exclude-samples`).
EvalSetResolver({Map<String, Map<String, String>> sandboxRegistry, Map<String, String> sdkChannels})
logShared, String? bundleDir, bool? bundleOverwrite, bool? logDirAllowDirty, String? evalSetId, Map? evalSetOverrides, Map? taskDefaults}) +Job({String? description, String? imagePrefix, required String logDir, String sandboxType, int maxConnections, List? models, Map>? variants, List? taskPaths, Map? tasks, bool saveExamples, int? retryAttempts, int? maxRetries, double? retryWait, double? retryConnections, bool? retryCleanup, double? failOnError, bool? continueOnFail, int? retryOnError, bool? debugErrors, int? maxSamples, int? maxTasks, int? maxSubprocesses, int? maxSandboxes, String? logLevel, String? logLevelTranscript, String? logFormat, List? tags, Map? metadata, bool? trace, String? display, bool? score, Object? limit, Object? sampleId, Object? sampleShuffle, Object? epochs, Object? approval, Object? solver, bool? sandboxCleanup, String? modelBaseUrl, Map? modelArgs, Map? modelRoles, Map? taskArgs, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Map? modelCostConfig, bool? logSamples, bool? logRealtime, bool? logImages, int? logBuffer, int? logShared, String? bundleDir, bool? bundleOverwrite, bool? logDirAllowDirty, String? evalSetId, Map? evalSetOverrides, Map? taskDefaults, TagFilter? taskFilters, TagFilter? sampleFilters}) ``` #### `Job.fromJson` @@ -1025,7 +1041,7 @@ a custom system message. #### `JobTask` ```dart -JobTask({required String id, List? includeSamples, List? excludeSamples, String? systemMessage}) +JobTask({required String id, List? includeSamples, List? excludeSamples, String? systemMessage, Map? args}) ``` #### `JobTask.fromJson` @@ -1200,14 +1216,14 @@ former `TaskConfig` model-package class. #### `ParsedTask` ```dart -ParsedTask({required String id, required String taskFunc, required List samples, required Variant variant, String sandboxType, String? systemMessage, List? allowedVariants, bool saveExamples, String? examplesDir, String? model, Map? config, Map? modelRoles, Object? sandbox, Object? 
approval, Object? epochs, Object? failOnError, bool? continueOnFail, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Object? earlyStopping, String? displayName, Object? version, Map? metadata}) +ParsedTask({required String id, required String func, required List samples, required Variant variant, String sandboxType, String? systemMessage, List? allowedVariants, bool saveExamples, String? examplesDir, TagFilter? variantFilters, Map? sandboxParameters, String? model, Map? config, Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, Object? failOnError, bool? continueOnFail, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Object? earlyStopping, String? displayName, Object? version, Map? metadata}) ``` ### Properties - **`id`** → `String` *(final)* -- **`taskFunc`** → `String` *(final)* +- **`func`** → `String` *(final)* - **`samples`** → `List` *(final)* @@ -1223,6 +1239,14 @@ ParsedTask({required String id, required String taskFunc, required List - **`examplesDir`** → `String?` *(final)* +- **`variantFilters`** → `TagFilter?` *(final)* + + Tag filter for variant selection. + +- **`sandboxParameters`** → `Map?` *(final)* + + Pass-through dict for sandbox plugin configuration. + - **`model`** → `String?` *(final)* Default model for this task. @@ -1296,7 +1320,7 @@ ParsedTask({required String id, required String taskFunc, required List #### `copyWith` ```dart -ParsedTask copyWith({String? id, String? taskFunc, List? samples, Variant? variant, String? sandboxType, String? systemMessage, List? allowedVariants, bool? saveExamples, String? examplesDir, String? model, Map? config, Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, Object? failOnError, bool? continueOnFail, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Object? earlyStopping, String? displayName, Object? version, Map? 
metadata}) +ParsedTask copyWith({String? id, String? func, List? samples, Variant? variant, String? sandboxType, String? systemMessage, List? allowedVariants, bool? saveExamples, String? examplesDir, TagFilter? variantFilters, Map? sandboxParameters, String? model, Map? config, Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, Object? failOnError, bool? continueOnFail, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Object? earlyStopping, String? displayName, Object? version, Map? metadata}) ``` Create a copy with overrides. @@ -1304,7 +1328,7 @@ Create a copy with overrides. **Parameters:** - `id` (`String?`) -- `taskFunc` (`String?`) +- `func` (`String?`) - `samples` (`List?`) - `variant` (`Variant?`) - `sandboxType` (`String?`) @@ -1312,6 +1336,8 @@ Create a copy with overrides. - `allowedVariants` (`List?`) - `saveExamples` (`bool?`) - `examplesDir` (`String?`) +- `variantFilters` (`TagFilter?`) +- `sandboxParameters` (`Map?`) - `model` (`String?`) - `config` (`Map?`) - `modelRoles` (`Map?`) @@ -1460,6 +1486,28 @@ Score.fromJson(Map json) --- +## abstract class `TagFilter` + +**Mixins:** `_$TagFilter` + +Tag-based filter for including/excluding items by their tags. + +### Constructors + +#### `TagFilter` + +```dart +TagFilter({List? includeTags, List? excludeTags}) +``` + +#### `TagFilter.fromJson` + +```dart +TagFilter.fromJson(Map json) +``` + +--- + ## abstract class `Task` **Mixins:** `_$Task` @@ -1475,7 +1523,7 @@ constructor. #### `Task` ```dart -Task({Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config, Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, Object? failOnError, bool? continueOnFail, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Object? earlyStopping, String? displayName, String? taskFunc, String? name, Object version, Map? 
metadata}) +Task({Dataset? dataset, Object? setup, Object? solver, Object? cleanup, Object? scorer, Object? metrics, String? model, Object? config, Map? modelRoles, Object? sandbox, Object? approval, Object? epochs, Object? failOnError, bool? continueOnFail, int? messageLimit, int? tokenLimit, int? timeLimit, int? workingLimit, double? costLimit, Object? earlyStopping, String? displayName, String? func, String? systemMessage, Map? sandboxParameters, String? name, Object version, Map? metadata}) ``` #### `Task.fromJson` @@ -1519,12 +1567,12 @@ TaskInfo.fromJson(Map json) #### `TaskMetadata` ```dart -TaskMetadata(String taskFunc, Map additional) +TaskMetadata(String func, Map additional) ``` ### Properties -- **`taskFunc`** → `String` *(final)* +- **`func`** → `String` *(final)* - **`additional`** → `Map` *(final)* @@ -1721,6 +1769,25 @@ Throws [FileSystemException] if the job file is not found. --- +## `matchesTagFilter` + +```dart +bool matchesTagFilter(List itemTags, TagFilter filter) +``` + +Check whether a set of [itemTags] matches the given [filter]. + +Returns `true` if: +- All include_tags (if any) are present in [itemTags] +- No exclude_tags (if any) are present in [itemTags] + +**Parameters:** + +- `itemTags` (`List`) *(required)* +- `filter` (`TagFilter`) *(required)* + +--- + ## `readYamlFile` ```dart From 757acb7178fffe1c9283d04a559dc2299983b358 Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Fri, 13 Mar 2026 17:26:36 -0700 Subject: [PATCH 7/8] feat: Add variant filtering and propagate image prefix and job task arguments to resolved task metadata, updating the config parity tool. 
--- .../lib/src/parsers/yaml_parser.dart | 7 ++ .../lib/src/resolvers/eval_set_resolver.dart | 17 +++++ .../test/eval_set_resolver_test.dart | 69 +++++++++++++++++++ .../src/dataset_config_python/parser.py | 9 +++ .../src/dataset_config_python/resolver.py | 17 +++++ ...{config_partiy.dart => config_parity.dart} | 0 6 files changed, 119 insertions(+) rename tool/config_parity/bin/{config_partiy.dart => config_parity.dart} (100%) diff --git a/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart b/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart index e8f0b69..d7c0b36 100644 --- a/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart +++ b/packages/dataset_config_dart/lib/src/parsers/yaml_parser.dart @@ -100,6 +100,12 @@ class YamlParser extends Parser { final taskMetadata = _asMap(data['metadata']); final sandboxParameters = _asMap(data['sandbox_parameters']); + // Parse variant_filters (tag-based variant restriction) + final variantFiltersRaw = _asMap(data['variant_filters']); + final variantFilters = variantFiltersRaw != null + ? 
TagFilter.fromJson(variantFiltersRaw) + : null; + return [ ParsedTask( id: taskId, @@ -127,6 +133,7 @@ class YamlParser extends Parser { version: version, metadata: taskMetadata, sandboxParameters: sandboxParameters, + variantFilters: variantFilters, ), ]; } diff --git a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart index 871e5e9..928a908 100644 --- a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart +++ b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart @@ -211,6 +211,9 @@ class EvalSetResolver { if (tc.systemMessage != null) 'system_message': tc.systemMessage, if (tc.saveExamples) 'save_examples': true, if (tc.examplesDir != null) 'examples_dir': tc.examplesDir, + // Propagate image_prefix from job for container image resolution + if (job.imagePrefix != null && job.imagePrefix!.isNotEmpty) + 'image_prefix': job.imagePrefix, // Merge any task-level metadata from YAML ...?tc.metadata, }; @@ -466,6 +469,13 @@ class EvalSetResolver { } } + // Filter by task-level variant_filters (tag-based) + if (taskConfig.variantFilters != null) { + effectiveVariants.removeWhere((name, _) { + return !matchesTagFilter([name], taskConfig.variantFilters!); + }); + } + // Get job-level task overrides final jobTask = (job.tasks != null && job.tasks!.containsKey(taskId)) ? job.tasks![taskId] @@ -500,6 +510,12 @@ class EvalSetResolver { systemMessage = jobTask!.systemMessage; } + // Merge job-task args into metadata + Map? 
mergedMetadata = taskConfig.metadata; + if (jobTask?.args != null && jobTask!.args!.isNotEmpty) { + mergedMetadata = {...?mergedMetadata, 'args': jobTask.args}; + } + // Create one ParsedTask per effective variant for (final entry in effectiveVariants.entries) { final variant = _resolveVariant(entry.key, entry.value, datasetRoot); @@ -519,6 +535,7 @@ class EvalSetResolver { allowedVariants: null, saveExamples: job.saveExamples, examplesDir: examplesDir, + metadata: mergedMetadata, ), ); } diff --git a/packages/dataset_config_dart/test/eval_set_resolver_test.dart b/packages/dataset_config_dart/test/eval_set_resolver_test.dart index de32e88..48d4c6d 100644 --- a/packages/dataset_config_dart/test/eval_set_resolver_test.dart +++ b/packages/dataset_config_dart/test/eval_set_resolver_test.dart @@ -15,6 +15,8 @@ void main() { String? model, int? timeLimit, int? messageLimit, + TagFilter? variantFilters, + Map? metadata, }) { return ParsedTask( id: id, @@ -35,6 +37,8 @@ void main() { model: model, timeLimit: timeLimit, messageLimit: messageLimit, + variantFilters: variantFilters, + metadata: metadata, ); } @@ -47,6 +51,7 @@ void main() { Map? tasks, bool saveExamples = false, Map? taskDefaults, + String? 
imagePrefix, }) { return Job( logDir: logDir, @@ -56,6 +61,7 @@ void main() { tasks: tasks, saveExamples: saveExamples, taskDefaults: taskDefaults, + imagePrefix: imagePrefix, ); } @@ -366,5 +372,68 @@ void main() { final dataset = results.first.tasks.first.dataset!; expect(dataset.name, 'my_eval:baseline'); }); + + test('variant_filters restricts effective variants', () { + final results = resolver.resolve( + [ + makeTask( + variantFilters: const TagFilter( + includeTags: ['baseline'], + ), + ), + ], + makeJob( + models: ['m'], + variants: {'baseline': {}, 'full': {}, 'mcp_only': {}}, + ), + '/tmp/dataset', + ); + + final taskNames = results + .expand((e) => e.tasks) + .map((t) => t.name) + .toList(); + expect(taskNames, ['test_task:baseline']); + expect(taskNames, isNot(contains('test_task:full'))); + expect(taskNames, isNot(contains('test_task:mcp_only'))); + }); + + test('image_prefix from job appears in task metadata', () { + final results = resolver.resolve( + [makeTask()], + makeJob( + models: ['m'], + imagePrefix: 'us-central1-docker.pkg.dev/my-project/repo/', + ), + '/tmp/dataset', + ); + + final metadata = results.first.tasks.first.metadata!; + expect( + metadata['image_prefix'], + 'us-central1-docker.pkg.dev/my-project/repo/', + ); + }); + + test('JobTask.args appears in task metadata', () { + final results = resolver.resolve( + [makeTask(id: 'my_task')], + makeJob( + models: ['m'], + tasks: { + 'my_task': const JobTask( + id: 'my_task', + args: {'base_url': 'http://localhost', 'timeout': 30}, + ), + }, + ), + '/tmp/dataset', + ); + + final metadata = results.first.tasks.first.metadata!; + expect(metadata['args'], isA()); + expect(metadata['args']['base_url'], 'http://localhost'); + expect(metadata['args']['timeout'], 30); + }); }); } diff --git a/packages/dataset_config_python/src/dataset_config_python/parser.py b/packages/dataset_config_python/src/dataset_config_python/parser.py index 56dc89f..4148e11 100644 --- 
a/packages/dataset_config_python/src/dataset_config_python/parser.py +++ b/packages/dataset_config_python/src/dataset_config_python/parser.py @@ -57,6 +57,7 @@ def __init__( version: Any | None = None, metadata: dict[str, Any] | None = None, sandbox_parameters: dict[str, Any] | None = None, + variant_filters: TagFilter | None = None, ): self.id = id self.func = func @@ -85,6 +86,7 @@ def __init__( self.version = version self.metadata = metadata self.sandbox_parameters = sandbox_parameters + self.variant_filters = variant_filters _UNSET: Any = object() @@ -118,6 +120,7 @@ def copy_with( display_name: str | None = _UNSET, version: Any = _UNSET, metadata: dict[str, Any] | None = _UNSET, + variant_filters: TagFilter | None = _UNSET, ) -> ParsedTask: """Create a copy with overrides.""" _U = ParsedTask._UNSET @@ -149,6 +152,7 @@ def copy_with( display_name=self.display_name if display_name is _U else display_name, version=self.version if version is _U else version, metadata=self.metadata if metadata is _U else metadata, + variant_filters=self.variant_filters if variant_filters is _U else variant_filters, ) @@ -262,6 +266,10 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: ) samples = _load_samples_section(samples_raw, dataset_root, task_workspace, task_tests, task_dir) + # Parse variant_filters (tag-based variant restriction) + variant_filters_raw = data.get("variant_filters") + variant_filters = TagFilter(**variant_filters_raw) if isinstance(variant_filters_raw, dict) else None + return [ ParsedTask( id=task_id, @@ -288,6 +296,7 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: version=data.get("version"), metadata=data.get("metadata") if isinstance(data.get("metadata"), dict) else None, sandbox_parameters=data.get("sandbox_parameters") if isinstance(data.get("sandbox_parameters"), dict) else None, + variant_filters=variant_filters, ) ] diff --git 
a/packages/dataset_config_python/src/dataset_config_python/resolver.py b/packages/dataset_config_python/src/dataset_config_python/resolver.py index 6c9116c..9ecd4e7 100644 --- a/packages/dataset_config_python/src/dataset_config_python/resolver.py +++ b/packages/dataset_config_python/src/dataset_config_python/resolver.py @@ -202,6 +202,9 @@ def _build_eval_set( task_metadata["save_examples"] = True if tc.examples_dir is not None: task_metadata["examples_dir"] = tc.examples_dir + # Propagate image_prefix from job for container image resolution + if job.image_prefix: + task_metadata["image_prefix"] = job.image_prefix if tc.metadata: task_metadata.update(tc.metadata) @@ -402,6 +405,14 @@ def _expand_task_configs( if tc.allowed_variants is None or vname in tc.allowed_variants: effective_variants[vname] = vdef + # Filter by task-level variant_filters (tag-based) + if tc.variant_filters is not None: + effective_variants = { + vname: vdef + for vname, vdef in effective_variants.items() + if matches_tag_filter([vname], tc.variant_filters) + } + # Get job-level task overrides job_task = job.tasks.get(task_id) if job.tasks else None @@ -425,6 +436,11 @@ def _expand_task_configs( if job_task and job_task.system_message is not None: system_message = job_task.system_message + # Merge job-task args into metadata + merged_metadata = dict(tc.metadata) if tc.metadata else None + if job_task and job_task.args: + merged_metadata = {**(merged_metadata or {}), "args": job_task.args} + # Create one ParsedTask per effective variant for vname, vdef in effective_variants.items(): variant = _resolve_variant(vname, vdef, dataset_root) @@ -442,6 +458,7 @@ def _expand_task_configs( allowed_variants=None, save_examples=job.save_examples, examples_dir=examples_dir, + metadata=merged_metadata, ) ) diff --git a/tool/config_parity/bin/config_partiy.dart b/tool/config_parity/bin/config_parity.dart similarity index 100% rename from tool/config_parity/bin/config_partiy.dart rename to 
tool/config_parity/bin/config_parity.dart From 9c52fd1d26e623592e1621b8b3947bf8b4a72e9d Mon Sep 17 00:00:00 2001 From: Eric Windmill Date: Fri, 13 Mar 2026 17:55:49 -0700 Subject: [PATCH 8/8] feat: Generalize SDK channel to 'branch', consolidate sandbox configuration, and refine tag filtering logic. --- .../lib/src/models/tag_filter.dart | 2 + .../lib/src/models/variant.dart | 6 +-- .../lib/src/models/variant.freezed.dart | 50 +++++++++---------- .../lib/src/models/variant.g.dart | 4 +- .../lib/src/resolvers/eval_set_resolver.dart | 35 ++++++------- .../src/dataset_config_python/__init__.py | 4 +- .../dataset_config_python/models/variant.py | 4 +- .../src/dataset_config_python/parser.py | 8 +-- .../src/dataset_config_python/resolver.py | 43 ++++++++++------ .../tests/test_config.py | 2 +- .../devals_cli/lib/src/dataset/dry_run.dart | 2 +- 11 files changed, 87 insertions(+), 73 deletions(-) diff --git a/packages/dataset_config_dart/lib/src/models/tag_filter.dart b/packages/dataset_config_dart/lib/src/models/tag_filter.dart index f5a4ec1..3e112f4 100644 --- a/packages/dataset_config_dart/lib/src/models/tag_filter.dart +++ b/packages/dataset_config_dart/lib/src/models/tag_filter.dart @@ -22,10 +22,12 @@ sealed class TagFilter with _$TagFilter { /// - No exclude_tags (if any) are present in [itemTags] bool matchesTagFilter(List itemTags, TagFilter filter) { if (filter.includeTags != null && + filter.includeTags!.isNotEmpty && !filter.includeTags!.every((t) => itemTags.contains(t))) { return false; } if (filter.excludeTags != null && + filter.excludeTags!.isNotEmpty && filter.excludeTags!.any((t) => itemTags.contains(t))) { return false; } diff --git a/packages/dataset_config_dart/lib/src/models/variant.dart b/packages/dataset_config_dart/lib/src/models/variant.dart index 82afa37..bfa1542 100644 --- a/packages/dataset_config_dart/lib/src/models/variant.dart +++ b/packages/dataset_config_dart/lib/src/models/variant.dart @@ -43,9 +43,9 @@ sealed class Variant with 
_$Variant { /// Each directory must contain a `SKILL.md` file. @JsonKey(name: 'skill_paths') @Default([]) List skillPaths, - /// Flutter SDK channel to use (e.g., `'stable'`, `'beta'`, `'main'`). - /// `null` means use the default (stable) image from the job's sandbox. - @JsonKey(name: 'flutter_channel') String? flutterChannel, + /// SDK branch/channel to use (e.g., `'stable'`, `'beta'`, `'main'`). + /// `null` means use the default image from the job's sandbox. + @JsonKey(name: 'branch') String? branch, }) = _Variant; const Variant._(); diff --git a/packages/dataset_config_dart/lib/src/models/variant.freezed.dart b/packages/dataset_config_dart/lib/src/models/variant.freezed.dart index 9fe224c..5389724 100644 --- a/packages/dataset_config_dart/lib/src/models/variant.freezed.dart +++ b/packages/dataset_config_dart/lib/src/models/variant.freezed.dart @@ -20,9 +20,9 @@ mixin _$Variant { @JsonKey(name: 'context_files') List get contextFiles;/// MCP server keys to enable (e.g., `['dart']`). @JsonKey(name: 'mcp_servers') List get mcpServers;/// Resolved paths to agent skill directories. /// Each directory must contain a `SKILL.md` file. -@JsonKey(name: 'skill_paths') List get skillPaths;/// Flutter SDK channel to use (e.g., `'stable'`, `'beta'`, `'main'`). -/// `null` means use the default (stable) image from the job's sandbox. -@JsonKey(name: 'flutter_channel') String? get flutterChannel; +@JsonKey(name: 'skill_paths') List get skillPaths;/// SDK branch/channel to use (e.g., `'stable'`, `'beta'`, `'main'`). +/// `null` means use the default image from the job's sandbox. +@JsonKey(name: 'branch') String? get branch; /// Create a copy of Variant /// with the given fields replaced by the non-null parameter values. 
@JsonKey(includeFromJson: false, includeToJson: false) @@ -35,16 +35,16 @@ $VariantCopyWith get copyWith => _$VariantCopyWithImpl(this as @override bool operator ==(Object other) { - return identical(this, other) || (other.runtimeType == runtimeType&&other is Variant&&(identical(other.name, name) || other.name == name)&&const DeepCollectionEquality().equals(other.contextFiles, contextFiles)&&const DeepCollectionEquality().equals(other.mcpServers, mcpServers)&&const DeepCollectionEquality().equals(other.skillPaths, skillPaths)&&(identical(other.flutterChannel, flutterChannel) || other.flutterChannel == flutterChannel)); + return identical(this, other) || (other.runtimeType == runtimeType&&other is Variant&&(identical(other.name, name) || other.name == name)&&const DeepCollectionEquality().equals(other.contextFiles, contextFiles)&&const DeepCollectionEquality().equals(other.mcpServers, mcpServers)&&const DeepCollectionEquality().equals(other.skillPaths, skillPaths)&&(identical(other.branch, branch) || other.branch == branch)); } @JsonKey(includeFromJson: false, includeToJson: false) @override -int get hashCode => Object.hash(runtimeType,name,const DeepCollectionEquality().hash(contextFiles),const DeepCollectionEquality().hash(mcpServers),const DeepCollectionEquality().hash(skillPaths),flutterChannel); +int get hashCode => Object.hash(runtimeType,name,const DeepCollectionEquality().hash(contextFiles),const DeepCollectionEquality().hash(mcpServers),const DeepCollectionEquality().hash(skillPaths),branch); @override String toString() { - return 'Variant(name: $name, contextFiles: $contextFiles, mcpServers: $mcpServers, skillPaths: $skillPaths, flutterChannel: $flutterChannel)'; + return 'Variant(name: $name, contextFiles: $contextFiles, mcpServers: $mcpServers, skillPaths: $skillPaths, branch: $branch)'; } @@ -55,7 +55,7 @@ abstract mixin class $VariantCopyWith<$Res> { factory $VariantCopyWith(Variant value, $Res Function(Variant) _then) = _$VariantCopyWithImpl; 
@useResult $Res call({ - String name,@JsonKey(name: 'context_files') List contextFiles,@JsonKey(name: 'mcp_servers') List mcpServers,@JsonKey(name: 'skill_paths') List skillPaths,@JsonKey(name: 'flutter_channel') String? flutterChannel + String name,@JsonKey(name: 'context_files') List contextFiles,@JsonKey(name: 'mcp_servers') List mcpServers,@JsonKey(name: 'skill_paths') List skillPaths,@JsonKey(name: 'branch') String? branch }); @@ -72,13 +72,13 @@ class _$VariantCopyWithImpl<$Res> /// Create a copy of Variant /// with the given fields replaced by the non-null parameter values. -@pragma('vm:prefer-inline') @override $Res call({Object? name = null,Object? contextFiles = null,Object? mcpServers = null,Object? skillPaths = null,Object? flutterChannel = freezed,}) { +@pragma('vm:prefer-inline') @override $Res call({Object? name = null,Object? contextFiles = null,Object? mcpServers = null,Object? skillPaths = null,Object? branch = freezed,}) { return _then(_self.copyWith( name: null == name ? _self.name : name // ignore: cast_nullable_to_non_nullable as String,contextFiles: null == contextFiles ? _self.contextFiles : contextFiles // ignore: cast_nullable_to_non_nullable as List,mcpServers: null == mcpServers ? _self.mcpServers : mcpServers // ignore: cast_nullable_to_non_nullable as List,skillPaths: null == skillPaths ? _self.skillPaths : skillPaths // ignore: cast_nullable_to_non_nullable -as List,flutterChannel: freezed == flutterChannel ? _self.flutterChannel : flutterChannel // ignore: cast_nullable_to_non_nullable +as List,branch: freezed == branch ? _self.branch : branch // ignore: cast_nullable_to_non_nullable as String?, )); } @@ -161,10 +161,10 @@ return $default(_that);case _: /// } /// ``` -@optionalTypeArgs TResult maybeWhen(TResult Function( String name, @JsonKey(name: 'context_files') List contextFiles, @JsonKey(name: 'mcp_servers') List mcpServers, @JsonKey(name: 'skill_paths') List skillPaths, @JsonKey(name: 'flutter_channel') String? flutterChannel)? 
$default,{required TResult orElse(),}) {final _that = this; +@optionalTypeArgs TResult maybeWhen(TResult Function( String name, @JsonKey(name: 'context_files') List contextFiles, @JsonKey(name: 'mcp_servers') List mcpServers, @JsonKey(name: 'skill_paths') List skillPaths, @JsonKey(name: 'branch') String? branch)? $default,{required TResult orElse(),}) {final _that = this; switch (_that) { case _Variant() when $default != null: -return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths,_that.flutterChannel);case _: +return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths,_that.branch);case _: return orElse(); } @@ -182,10 +182,10 @@ return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths, /// } /// ``` -@optionalTypeArgs TResult when(TResult Function( String name, @JsonKey(name: 'context_files') List contextFiles, @JsonKey(name: 'mcp_servers') List mcpServers, @JsonKey(name: 'skill_paths') List skillPaths, @JsonKey(name: 'flutter_channel') String? flutterChannel) $default,) {final _that = this; +@optionalTypeArgs TResult when(TResult Function( String name, @JsonKey(name: 'context_files') List contextFiles, @JsonKey(name: 'mcp_servers') List mcpServers, @JsonKey(name: 'skill_paths') List skillPaths, @JsonKey(name: 'branch') String? branch) $default,) {final _that = this; switch (_that) { case _Variant(): -return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths,_that.flutterChannel);} +return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths,_that.branch);} } /// A variant of `when` that fallback to returning `null` /// @@ -199,10 +199,10 @@ return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths, /// } /// ``` -@optionalTypeArgs TResult? whenOrNull(TResult? 
Function( String name, @JsonKey(name: 'context_files') List contextFiles, @JsonKey(name: 'mcp_servers') List mcpServers, @JsonKey(name: 'skill_paths') List skillPaths, @JsonKey(name: 'flutter_channel') String? flutterChannel)? $default,) {final _that = this; +@optionalTypeArgs TResult? whenOrNull(TResult? Function( String name, @JsonKey(name: 'context_files') List contextFiles, @JsonKey(name: 'mcp_servers') List mcpServers, @JsonKey(name: 'skill_paths') List skillPaths, @JsonKey(name: 'branch') String? branch)? $default,) {final _that = this; switch (_that) { case _Variant() when $default != null: -return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths,_that.flutterChannel);case _: +return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths,_that.branch);case _: return null; } @@ -214,7 +214,7 @@ return $default(_that.name,_that.contextFiles,_that.mcpServers,_that.skillPaths, @JsonSerializable() class _Variant extends Variant { - const _Variant({this.name = 'baseline', @JsonKey(name: 'context_files') final List contextFiles = const [], @JsonKey(name: 'mcp_servers') final List mcpServers = const [], @JsonKey(name: 'skill_paths') final List skillPaths = const [], @JsonKey(name: 'flutter_channel') this.flutterChannel}): _contextFiles = contextFiles,_mcpServers = mcpServers,_skillPaths = skillPaths,super._(); + const _Variant({this.name = 'baseline', @JsonKey(name: 'context_files') final List contextFiles = const [], @JsonKey(name: 'mcp_servers') final List mcpServers = const [], @JsonKey(name: 'skill_paths') final List skillPaths = const [], @JsonKey(name: 'branch') this.branch}): _contextFiles = contextFiles,_mcpServers = mcpServers,_skillPaths = skillPaths,super._(); factory _Variant.fromJson(Map json) => _$VariantFromJson(json); /// User-defined variant name from the job file. 
@@ -248,9 +248,9 @@ class _Variant extends Variant { return EqualUnmodifiableListView(_skillPaths); } -/// Flutter SDK channel to use (e.g., `'stable'`, `'beta'`, `'main'`). -/// `null` means use the default (stable) image from the job's sandbox. -@override@JsonKey(name: 'flutter_channel') final String? flutterChannel; +/// SDK branch/channel to use (e.g., `'stable'`, `'beta'`, `'main'`). +/// `null` means use the default image from the job's sandbox. +@override@JsonKey(name: 'branch') final String? branch; /// Create a copy of Variant /// with the given fields replaced by the non-null parameter values. @@ -265,16 +265,16 @@ Map toJson() { @override bool operator ==(Object other) { - return identical(this, other) || (other.runtimeType == runtimeType&&other is _Variant&&(identical(other.name, name) || other.name == name)&&const DeepCollectionEquality().equals(other._contextFiles, _contextFiles)&&const DeepCollectionEquality().equals(other._mcpServers, _mcpServers)&&const DeepCollectionEquality().equals(other._skillPaths, _skillPaths)&&(identical(other.flutterChannel, flutterChannel) || other.flutterChannel == flutterChannel)); + return identical(this, other) || (other.runtimeType == runtimeType&&other is _Variant&&(identical(other.name, name) || other.name == name)&&const DeepCollectionEquality().equals(other._contextFiles, _contextFiles)&&const DeepCollectionEquality().equals(other._mcpServers, _mcpServers)&&const DeepCollectionEquality().equals(other._skillPaths, _skillPaths)&&(identical(other.branch, branch) || other.branch == branch)); } @JsonKey(includeFromJson: false, includeToJson: false) @override -int get hashCode => Object.hash(runtimeType,name,const DeepCollectionEquality().hash(_contextFiles),const DeepCollectionEquality().hash(_mcpServers),const DeepCollectionEquality().hash(_skillPaths),flutterChannel); +int get hashCode => Object.hash(runtimeType,name,const DeepCollectionEquality().hash(_contextFiles),const 
DeepCollectionEquality().hash(_mcpServers),const DeepCollectionEquality().hash(_skillPaths),branch); @override String toString() { - return 'Variant(name: $name, contextFiles: $contextFiles, mcpServers: $mcpServers, skillPaths: $skillPaths, flutterChannel: $flutterChannel)'; + return 'Variant(name: $name, contextFiles: $contextFiles, mcpServers: $mcpServers, skillPaths: $skillPaths, branch: $branch)'; } @@ -285,7 +285,7 @@ abstract mixin class _$VariantCopyWith<$Res> implements $VariantCopyWith<$Res> { factory _$VariantCopyWith(_Variant value, $Res Function(_Variant) _then) = __$VariantCopyWithImpl; @override @useResult $Res call({ - String name,@JsonKey(name: 'context_files') List contextFiles,@JsonKey(name: 'mcp_servers') List mcpServers,@JsonKey(name: 'skill_paths') List skillPaths,@JsonKey(name: 'flutter_channel') String? flutterChannel + String name,@JsonKey(name: 'context_files') List contextFiles,@JsonKey(name: 'mcp_servers') List mcpServers,@JsonKey(name: 'skill_paths') List skillPaths,@JsonKey(name: 'branch') String? branch }); @@ -302,13 +302,13 @@ class __$VariantCopyWithImpl<$Res> /// Create a copy of Variant /// with the given fields replaced by the non-null parameter values. -@override @pragma('vm:prefer-inline') $Res call({Object? name = null,Object? contextFiles = null,Object? mcpServers = null,Object? skillPaths = null,Object? flutterChannel = freezed,}) { +@override @pragma('vm:prefer-inline') $Res call({Object? name = null,Object? contextFiles = null,Object? mcpServers = null,Object? skillPaths = null,Object? branch = freezed,}) { return _then(_Variant( name: null == name ? _self.name : name // ignore: cast_nullable_to_non_nullable as String,contextFiles: null == contextFiles ? _self._contextFiles : contextFiles // ignore: cast_nullable_to_non_nullable as List,mcpServers: null == mcpServers ? _self._mcpServers : mcpServers // ignore: cast_nullable_to_non_nullable as List,skillPaths: null == skillPaths ? 
_self._skillPaths : skillPaths // ignore: cast_nullable_to_non_nullable -as List,flutterChannel: freezed == flutterChannel ? _self.flutterChannel : flutterChannel // ignore: cast_nullable_to_non_nullable +as List,branch: freezed == branch ? _self.branch : branch // ignore: cast_nullable_to_non_nullable as String?, )); } diff --git a/packages/dataset_config_dart/lib/src/models/variant.g.dart b/packages/dataset_config_dart/lib/src/models/variant.g.dart index a9a6d25..09277ff 100644 --- a/packages/dataset_config_dart/lib/src/models/variant.g.dart +++ b/packages/dataset_config_dart/lib/src/models/variant.g.dart @@ -23,7 +23,7 @@ _Variant _$VariantFromJson(Map json) => _Variant( ?.map((e) => e as String) .toList() ?? const [], - flutterChannel: json['flutter_channel'] as String?, + branch: json['branch'] as String?, ); Map _$VariantToJson(_Variant instance) => { @@ -31,5 +31,5 @@ Map _$VariantToJson(_Variant instance) => { 'context_files': instance.contextFiles, 'mcp_servers': instance.mcpServers, 'skill_paths': instance.skillPaths, - 'flutter_channel': instance.flutterChannel, + 'branch': instance.branch, }; diff --git a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart index 928a908..53d6b47 100644 --- a/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart +++ b/packages/dataset_config_dart/lib/src/resolvers/eval_set_resolver.dart @@ -36,10 +36,10 @@ const Map> kDefaultSandboxRegistry = { }, }; -/// Default Flutter SDK channel → sandbox registry key mapping. +/// Default SDK branch → sandbox registry key mapping. /// /// Consumers can pass these to [EvalSetResolver] or provide their own. -const Map kDefaultSdkChannels = { +const Map kDefaultBranchChannels = { 'stable': 'podman', 'beta': 'podman-beta', 'main': 'podman-main', @@ -51,28 +51,28 @@ const Map kDefaultSdkChannels = { /// This is the resolution engine. It: /// 1. 
Resolves models, sandboxes, and variants /// 2. Expands task × variant combinations into [Task] entries -/// 3. Groups by flutter_channel (one [EvalSet] per group) +/// 3. Groups by branch (one [EvalSet] per group) /// 4. Propagates job-level and task-level settings to the output class EvalSetResolver { /// Creates a resolver with optional sandbox configuration. /// - /// If [sandboxRegistry] or [sdkChannels] are not provided, they default + /// If [sandboxRegistry] or [branchChannels] are not provided, they default /// to empty maps (no sandbox resolution). Pass [kDefaultSandboxRegistry] - /// and [kDefaultSdkChannels] for the Flutter-specific sandbox setup. + /// and [kDefaultBranchChannels] for the Flutter-specific sandbox setup. const EvalSetResolver({ this.sandboxRegistry = const {}, - this.sdkChannels = const {}, + this.branchChannels = const {}, }); /// Named sandbox configurations (e.g. `'podman'` → compose file path). final Map> sandboxRegistry; - /// SDK channel → sandbox registry key mapping. - final Map sdkChannels; + /// SDK branch → sandbox registry key mapping. + final Map branchChannels; /// Resolve task configs and job into [EvalSet] objects. /// - /// Groups by flutter_channel so each gets its own sandbox. + /// Groups by branch so each gets its own sandbox. 
List resolve( List datasetTasks, Job job, @@ -87,10 +87,10 @@ class EvalSetResolver { datasetRoot, ); - // Group by flutter channel + // Group by branch final groups = >{}; for (final tc in expandedTasks) { - final key = tc.variant.flutterChannel; + final key = tc.variant.branch; (groups[key] ??= []).add(tc); } @@ -103,7 +103,7 @@ class EvalSetResolver { sandbox: _resolveSandbox( datasetRoot, job, - flutterChannel: entry.key, + branch: entry.key, ), job: job, ), @@ -156,6 +156,7 @@ class EvalSetResolver { if (workspace != null && isContainer) { files = {...?files, '/workspace': workspace}; + setup ??= 'cd /workspace && flutter pub get'; enriched['workspace'] = '/workspace'; } if (workspaceGit != null) { @@ -403,14 +404,14 @@ class EvalSetResolver { Object _resolveSandbox( String datasetRoot, Job job, { - String? flutterChannel, + String? branch, }) { final sandboxType = job.sandboxType; if (sandboxType.isEmpty || sandboxType == 'local') return 'local'; - // Channel override → look up channel-specific sandbox - if (flutterChannel != null && sdkChannels.containsKey(flutterChannel)) { - final registryKey = sdkChannels[flutterChannel]!; + // Branch override → look up branch-specific sandbox + if (branch != null && branchChannels.containsKey(branch)) { + final registryKey = branchChannels[branch]!; if (sandboxRegistry.containsKey(registryKey)) { final def = sandboxRegistry[registryKey]!; var sandboxPath = def['path']!; @@ -614,7 +615,7 @@ class EvalSetResolver { contextFiles: contextFiles, mcpServers: (vDef['mcp_servers'] as List?)?.cast() ?? 
[], skillPaths: skillPaths, - flutterChannel: vDef['flutter_channel'] as String?, + branch: vDef['branch'] as String?, ); } diff --git a/packages/dataset_config_python/src/dataset_config_python/__init__.py b/packages/dataset_config_python/src/dataset_config_python/__init__.py index 135b4cb..3a47fd0 100644 --- a/packages/dataset_config_python/src/dataset_config_python/__init__.py +++ b/packages/dataset_config_python/src/dataset_config_python/__init__.py @@ -6,7 +6,7 @@ No Dart SDK or Inspect AI dependency required. """ -from dataset_config_python.resolver import resolve +from dataset_config_python.resolver import DEFAULT_BRANCH_CHANNELS, DEFAULT_SANDBOX_REGISTRY, SandboxConfig, resolve from dataset_config_python.writer import write_eval_sets -__all__ = ["resolve", "write_eval_sets"] +__all__ = ["DEFAULT_BRANCH_CHANNELS", "DEFAULT_SANDBOX_REGISTRY", "SandboxConfig", "resolve", "write_eval_sets"] diff --git a/packages/dataset_config_python/src/dataset_config_python/models/variant.py b/packages/dataset_config_python/src/dataset_config_python/models/variant.py index 690e675..4fa39d6 100644 --- a/packages/dataset_config_python/src/dataset_config_python/models/variant.py +++ b/packages/dataset_config_python/src/dataset_config_python/models/variant.py @@ -32,5 +32,5 @@ class Variant(BaseModel): skill_paths: list[str] = Field(default_factory=list) """Resolved paths to agent skill directories.""" - flutter_channel: str | None = None - """Flutter SDK channel to use (e.g. 'stable', 'beta', 'main').""" + branch: str | None = None + """SDK branch/channel to use (e.g. 
'stable', 'beta', 'main').""" diff --git a/packages/dataset_config_python/src/dataset_config_python/parser.py b/packages/dataset_config_python/src/dataset_config_python/parser.py index 4148e11..dd8ed1d 100644 --- a/packages/dataset_config_python/src/dataset_config_python/parser.py +++ b/packages/dataset_config_python/src/dataset_config_python/parser.py @@ -246,7 +246,7 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: task_dir = os.path.dirname(task_path) task_id = data.get("id") or os.path.basename(task_dir) - task_func = data.get("func") or task_id + func_name = data.get("func") or task_id task_workspace_raw = data.get("workspace") task_tests_raw = data.get("tests") @@ -273,7 +273,7 @@ def _load_task_file(task_path: str, dataset_root: str) -> list[ParsedTask]: return [ ParsedTask( id=task_id, - func=task_func, + func=func_name, variant=Variant(), samples=samples, system_message=system_message, @@ -559,8 +559,8 @@ def parse_job(job_path: str, dataset_root: str) -> Job: ), description=data.get("description"), image_prefix=data.get("image_prefix"), - task_filters=TagFilter(**data["task_filters"]) if isinstance(data.get("task_filters"), dict) else None, - sample_filters=TagFilter(**data["sample_filters"]) if isinstance(data.get("sample_filters"), dict) else None, + task_filters=data.get("task_filters"), + sample_filters=data.get("sample_filters"), ) diff --git a/packages/dataset_config_python/src/dataset_config_python/resolver.py b/packages/dataset_config_python/src/dataset_config_python/resolver.py index 9ecd4e7..dffd95e 100644 --- a/packages/dataset_config_python/src/dataset_config_python/resolver.py +++ b/packages/dataset_config_python/src/dataset_config_python/resolver.py @@ -4,6 +4,7 @@ import glob as globmod import os +from dataclasses import dataclass, field from typing import Any from dataset_config_python.models.context_file import ContextFile @@ -37,14 +38,22 @@ "podman-main": {"name": "podman", "path": 
"./sandboxes/podman/compose-main.yaml"}, } -# Default Flutter SDK channel → sandbox registry key mapping. -DEFAULT_SDK_CHANNELS: dict[str, str] = { +# Default SDK branch → sandbox registry key mapping. +DEFAULT_BRANCH_CHANNELS: dict[str, str] = { "stable": "podman", "beta": "podman-beta", "main": "podman-main", } +@dataclass +class SandboxConfig: + """Sandbox registry and branch-channel mapping.""" + + registry: dict[str, dict[str, str]] = field(default_factory=dict) + branch_channels: dict[str, str] = field(default_factory=dict) + + def _is_glob(pattern: str) -> bool: return "*" in pattern or "?" in pattern or "[" in pattern @@ -53,8 +62,7 @@ def resolve( dataset_path: str, job_names: list[str], *, - sandbox_registry: dict[str, dict[str, str]] | None = None, - sdk_channels: dict[str, str] | None = None, + sandbox_config: SandboxConfig | None = None, ) -> list[EvalSet]: """Resolve dataset + job(s) into EvalSet objects. @@ -64,13 +72,14 @@ def resolve( dataset_path: Root directory containing ``tasks/`` and ``jobs/``. job_names: Job names (looked up in ``jobs/``) or paths. sandbox_registry: Named sandbox configurations. Defaults to empty. - sdk_channels: SDK channel → sandbox registry key mapping. Defaults to empty. + branch_channels: SDK branch → sandbox registry key mapping. Defaults to empty. Returns: A list of EvalSet objects ready for JSON serialization. 
""" - registry = sandbox_registry or {} - channels = sdk_channels or {} + sandbox_cfg = sandbox_config or SandboxConfig() + registry = sandbox_cfg.registry + channels = sandbox_cfg.branch_channels task_configs = parse_tasks(dataset_path) results: list[EvalSet] = [] @@ -87,7 +96,7 @@ def _resolve_job( job: Any, dataset_root: str, sandbox_registry: dict[str, dict[str, str]], - sdk_channels: dict[str, str], + branch_channels: dict[str, str], ) -> list[EvalSet]: """Resolve task configs and job into EvalSet objects.""" models = job.models if job.models else list(DEFAULT_MODELS) @@ -95,10 +104,10 @@ def _resolve_job( expanded_tasks = _expand_task_configs(dataset_tasks, job, sandbox_type_str, dataset_root) - # Group by flutter channel + # Group by branch groups: dict[str | None, list[ParsedTask]] = {} for tc in expanded_tasks: - key = tc.variant.flutter_channel + key = tc.variant.branch groups.setdefault(key, []).append(tc) return [ @@ -106,7 +115,7 @@ def _resolve_job( task_configs=group, log_dir=job.log_dir, models=models, - sandbox=_resolve_sandbox(dataset_root, job, sandbox_registry, sdk_channels, flutter_channel=channel), + sandbox=_resolve_sandbox(dataset_root, job, sandbox_registry, branch_channels, branch=channel), job=job, ) for channel, group in groups.items() @@ -152,6 +161,7 @@ def _build_eval_set( if workspace is not None and is_container: files = {**(files or {}), "/workspace": workspace} + setup = setup or "cd /workspace && flutter pub get" enriched["workspace"] = "/workspace" if workspace_git is not None: enriched["workspace_git"] = workspace_git @@ -341,9 +351,9 @@ def _resolve_sandbox( dataset_root: str, job: Any, sandbox_registry: dict[str, dict[str, str]], - sdk_channels: dict[str, str], + branch_channels: dict[str, str], *, - flutter_channel: str | None = None, + branch: str | None = None, ) -> Any: """Resolve sandbox spec for a given config.""" sandbox_type = job.sandbox_type @@ -351,8 +361,9 @@ def _resolve_sandbox( return "local" # Channel override 
- if flutter_channel and flutter_channel in sdk_channels: - registry_key = sdk_channels[flutter_channel] + # Branch override → look up branch-specific sandbox + if branch and branch in branch_channels: + registry_key = branch_channels[branch] if registry_key in sandbox_registry: defn = sandbox_registry[registry_key] sandbox_path = defn["path"] @@ -529,7 +540,7 @@ def _resolve_variant( context_files=context_files, mcp_servers=vdef.get("mcp_servers") or [], skill_paths=skill_paths, - flutter_channel=vdef.get("flutter_channel"), + branch=vdef.get("branch"), ) diff --git a/packages/dataset_config_python/tests/test_config.py b/packages/dataset_config_python/tests/test_config.py index 20890c3..89ea89c 100644 --- a/packages/dataset_config_python/tests/test_config.py +++ b/packages/dataset_config_python/tests/test_config.py @@ -165,7 +165,7 @@ def test_variant_defaults(self): assert v.context_files == [] assert v.mcp_servers == [] assert v.skill_paths == [] - assert v.flutter_channel is None + assert v.branch is None def test_job_task_from_yaml_none(self): jt = JobTask.from_yaml("my_task", None) diff --git a/packages/devals_cli/lib/src/dataset/dry_run.dart b/packages/devals_cli/lib/src/dataset/dry_run.dart index 856e172..1a61dcc 100644 --- a/packages/devals_cli/lib/src/dataset/dry_run.dart +++ b/packages/devals_cli/lib/src/dataset/dry_run.dart @@ -36,7 +36,7 @@ bool _validateConfig(EvalSet config) { if (task.func == null) { warnings.add( - 'Task "$name" has no task_func — Mode 2 hydration required', + 'Task "$name" has no func — Mode 2 hydration required', ); }